#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <xen/cpu.h>
#include <asm/processor.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/mce.h>
#include <asm/apic.h>
#include "mce.h"
#include "x86_mca.h"
#include "barrier.h"
#include "util.h"
#include "vmce.h"
#include "mcaction.h"

static DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_banks_owned);
bool __read_mostly cmci_support;
static bool __read_mostly ser_support;
static bool __read_mostly mce_force_broadcast;
boolean_param("mce_fb", mce_force_broadcast);

static int __read_mostly nr_intel_ext_msrs;

/* If mce_force_broadcast == 1, lmce_support will be disabled forcibly. */
bool __read_mostly lmce_support;

/* The Intel SDM defines bits 15:0 of IA32_MCi_STATUS as the MC error code. */
#define INTEL_MCCOD_MASK 0xFFFF

/*
 * Currently the Intel SDM defines 2 kinds of SRAO errors:
 * 1) Memory scrubbing error, error code = 0xC0 ~ 0xCF
 * 2) L3 explicit writeback error, error code = 0x17A
 */
#define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
#define INTEL_SRAO_L3_EWB    0x17A
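/*
 * Note: INTEL_SRAO_MEM_SCRUB expands to a case range (0xC0 ... 0xCF) and
 * relies on the GCC case-range extension, so it is only usable as a case
 * label, as in intel_srao_dhandler() below.
 */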

/*
 * Currently the Intel SDM defines 2 kinds of SRAR errors:
 * 1) Data Load error, error code = 0x134
 * 2) Instruction Fetch error, error code = 0x150
 */
#define INTEL_SRAR_DATA_LOAD    0x134
#define INTEL_SRAR_INSTR_FETCH  0x150

#ifdef CONFIG_X86_MCE_THERMAL
#define MCE_RING                0x1
static DEFINE_PER_CPU(int, last_state);

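/*
 * Thermal event interrupt handler: rate-limited to one report per 5 seconds
 * per CPU, and only logs when the THERM_STATUS status bit has changed since
 * the last observed state.
 */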
static void intel_thermal_interrupt(struct cpu_user_regs *regs)
{
    uint64_t msr_content;
    unsigned int cpu = smp_processor_id();
    static DEFINE_PER_CPU(s_time_t, next);
    int *this_last_state;

    ack_APIC_irq();

    if ( NOW() < per_cpu(next, cpu) )
        return;

    per_cpu(next, cpu) = NOW() + MILLISECS(5000);
    rdmsrl(MSR_IA32_THERM_STATUS, msr_content);
    this_last_state = &per_cpu(last_state, cpu);
    if ( *this_last_state == (msr_content & MCE_RING) )
        return;
    *this_last_state = msr_content & MCE_RING;
    if ( msr_content & MCE_RING )
    {
        printk(KERN_EMERG "CPU%u: Temperature above threshold\n", cpu);
        printk(KERN_EMERG "CPU%u: Running in modulated clock mode\n", cpu);
        add_taint(TAINT_MACHINE_CHECK);
    } else
        printk(KERN_INFO "CPU%u: Temperature/speed normal\n", cpu);
}

/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static bool intel_thermal_supported(struct cpuinfo_x86 *c)
{
    if ( !cpu_has_apic )
        return false;
    if ( !cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_TM1) )
        return false;
    return true;
}

static u32 __read_mostly lvtthmr_init;

static void __init mcheck_intel_therm_init(void)
{
    /*
     * This function is called only on the boot CPU. Save the initial thermal
     * LVT value on the BSP and use that value later to restore the thermal
     * LVT entry that the BIOS programmed on the APs.
     */
    if ( intel_thermal_supported(&boot_cpu_data) )
        lvtthmr_init = apic_read(APIC_LVTTHMR);
}

/* P4/Xeon Thermal regulation detect and init */
static void intel_init_thermal(struct cpuinfo_x86 *c)
{
    uint64_t msr_content;
    uint32_t val;
    int tm2 = 0;
    unsigned int cpu = smp_processor_id();
    static uint8_t thermal_apic_vector;

    if ( !intel_thermal_supported(c) )
        return; /* -ENODEV */

    /*
     * First check if it's enabled already, in which case there might
     * be some SMM goo which handles it, so we can't even put a handler
     * since it might be delivered via SMI already -zwanem.
     */
    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    val = lvtthmr_init;
    /*
     * The initial value of thermal LVT entries on all APs always reads
     * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
     * sequence to them and LVT registers are reset to 0s except for
     * the mask bits which are set to 1s when APs receive INIT IPI.
     * If BIOS takes over the thermal interrupt and sets its interrupt
     * delivery mode to SMI (not fixed), it restores the value that the
     * BIOS has programmed on AP based on BSP's info we saved (since BIOS
     * is required to set the same value for all threads/cores).
     */
    if ( (val & APIC_MODE_MASK) != APIC_DM_FIXED
         || (val & APIC_VECTOR_MASK) > 0xf )
        apic_write(APIC_LVTTHMR, val);

    if ( (msr_content & (1ULL<<3))
         && (val & APIC_MODE_MASK) == APIC_DM_SMI )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal monitoring handled by SMI\n");
        return; /* -EBUSY */
    }

    if ( cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13)) )
        tm2 = 1;

    /* Check whether a vector already exists, temporarily masked? */
    if ( val & APIC_VECTOR_MASK )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal LVT vector (%#x) already installed\n",
                   val & APIC_VECTOR_MASK);
        return; /* -EBUSY */
    }

    alloc_direct_apic_vector(&thermal_apic_vector, intel_thermal_interrupt);

    /* The temperature transition interrupt handler setup */
    val = thermal_apic_vector;    /* our delivery vector */
    val |= (APIC_DM_FIXED | APIC_LVT_MASKED);  /* we'll mask till we're ready */
    apic_write(APIC_LVTTHMR, val);

    rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content);
    wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03);

    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL<<3));

    apic_write(APIC_LVTTHMR, val & ~APIC_LVT_MASKED);
    if ( opt_cpu_info )
        printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n",
               cpu, tm2 ? "TM2" : "TM1");
    return;
}
#endif /* CONFIG_X86_MCE_THERMAL */

/* Intel MCE handler */
static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
{
    if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
         && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs )
    {
        ext->mc_msr[ext->mc_msrs].reg = msr;
        rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
        ++ext->mc_msrs;
    }
}


struct mcinfo_extended *
intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi)
{
    struct mcinfo_extended *mc_ext;
    int i;

    /*
     * According to the spec, processors supporting 64-bit will always
     * have MSRs beyond IA32_MCG_MISC.
     */
    if ( !mi || !mig || nr_intel_ext_msrs == 0 ||
         !(mig->mc_gstatus & MCG_STATUS_EIPV) )
        return NULL;

    mc_ext = x86_mcinfo_reserve(mi, sizeof(*mc_ext), MC_TYPE_EXTENDED);
    if ( !mc_ext )
    {
        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
        return NULL;
    }

    for ( i = MSR_IA32_MCG_EAX; i <= MSR_IA32_MCG_MISC; i++ )
        intel_get_extended_msr(mc_ext, i);

    for ( i = MSR_IA32_MCG_R8; i <= MSR_IA32_MCG_R15; i++ )
        intel_get_extended_msr(mc_ext, i);

    return mc_ext;
}

enum intel_mce_type
{
    intel_mce_invalid,
    intel_mce_fatal,
    intel_mce_corrected,
    intel_mce_ucr_ucna,
    intel_mce_ucr_srao,
    intel_mce_ucr_srar,
};

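/*
 * Decode IA32_MCi_STATUS into one of the categories above: invalid if VAL
 * is clear, fatal on PCC, corrected when UC is clear; with SER support the
 * S/AR/OVER bits further distinguish UCNA, SRAO and SRAR, while without SER
 * any uncorrected error is treated as fatal.
 */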
static enum intel_mce_type intel_check_mce_type(uint64_t status)
{
    if ( !(status & MCi_STATUS_VAL) )
        return intel_mce_invalid;

    if ( status & MCi_STATUS_PCC )
        return intel_mce_fatal;

    /* Corrected error? */
    if ( !(status & MCi_STATUS_UC) )
        return intel_mce_corrected;

    if ( !ser_support )
        return intel_mce_fatal;

    if ( status & MCi_STATUS_S )
    {
        if ( status & MCi_STATUS_AR )
        {
            if ( status & MCi_STATUS_OVER )
                return intel_mce_fatal;
            else
                return intel_mce_ucr_srar;
        } else
            return intel_mce_ucr_srao;
    }
    else
        return intel_mce_ucr_ucna;

    /* Any type not included above? */
    return intel_mce_fatal;
}

static void intel_memerr_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
    mc_memerr_dhandler(binfo, result, regs);
}

static bool intel_srar_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srar);
}

static bool intel_checkaddr(uint64_t status, uint64_t misc, int addrtype)
{
    if ( !(status & MCi_STATUS_ADDRV) ||
         !(status & MCi_STATUS_MISCV) ||
         ((misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
        /* addr is virtual */
        return (addrtype == MC_ADDR_VIRTUAL);

    return (addrtype == MC_ADDR_PHYSICAL);
}

static void intel_srar_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown SRAR error codes, reset the system */
    *result = MCER_RESET;

    switch ( status & INTEL_MCCOD_MASK )
    {
    case INTEL_SRAR_DATA_LOAD:
    case INTEL_SRAR_INSTR_FETCH:
        intel_memerr_dhandler(binfo, result, regs);
        break;
    }
}

static bool intel_srao_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srao);
}

static void intel_srao_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown SRAO error codes, no action is required */
    *result = MCER_CONTINUE;

    if ( status & MCi_STATUS_VAL )
    {
        switch ( status & INTEL_MCCOD_MASK )
        {
        case INTEL_SRAO_MEM_SCRUB:
        case INTEL_SRAO_L3_EWB:
            intel_memerr_dhandler(binfo, result, regs);
            break;
        }
    }
}

static bool intel_default_check(uint64_t status)
{
    return true;
}

static void intel_default_mce_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    if ( type == intel_mce_fatal )
        *result = MCER_RESET;
    else
        *result = MCER_CONTINUE;
}

static const struct mca_error_handler intel_mce_dhandlers[] = {
    {intel_srao_check, intel_srao_dhandler},
    {intel_srar_check, intel_srar_dhandler},
    {intel_default_check, intel_default_mce_dhandler}
};

static void intel_default_mce_uhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    switch ( type )
    {
    case intel_mce_fatal:
        *result = MCER_RESET;
        break;

    default:
        *result = MCER_CONTINUE;
        break;
    }
}

static const struct mca_error_handler intel_mce_uhandlers[] = {
    {intel_default_check, intel_default_mce_uhandler}
};

/*
 * According to the MCA OS writer guide, the CMCI handler needs to clear the
 * bank when
 * 1) CE (UC = 0)
 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
 * 3) ser_support = 1, UCNA, OVER = 0, S = 1, AR = 0, PCC = 0, [UC = 1, EN = 1]
 * The MCA handler needs to clear the bank when
 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
 */

static bool intel_need_clearbank_scan(enum mca_source who, u64 status)
{
    if ( who == MCA_CMCI_HANDLER )
    {
        /* CMCI needs to clear the bank */
        if ( !(status & MCi_STATUS_UC) )
            return true;
        /* Spurious errors need the bank cleared */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_EN) )
            return true;
        /* UCNA with OVER = 0 needs the bank cleared */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
                  && !(status & MCi_STATUS_AR) )
            return true;
        /* Only log, no clear */
        else return false;
    }
    else if ( who == MCA_MCE_SCAN )
    {
        if ( !ser_support )
            return false;
        /*
         * Fatal errors shouldn't be cleared, so that the sticky bank
         * has a chance to be handled after reboot by polling.
         */
        if ( (status & MCi_STATUS_UC) && (status & MCi_STATUS_PCC) )
            return false;
        /* Spurious errors need the bank cleared */
        else if ( !(status & MCi_STATUS_OVER)
                  && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN) )
            return true;
        /* SRAR with OVER = 0 clears the bank; OVER = 1 has caused a reset */
        else if ( (status & MCi_STATUS_UC)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR)
                  && !(status & MCi_STATUS_OVER) )
            return true;
        /* SRAO needs the bank cleared */
        else if ( !(status & MCi_STATUS_AR)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC) )
            return true;
        else
            return false;
    }

    return true;
}

/*
 * MCE continues/is recoverable when
 * 1) CE UC = 0
 * 2) Spurious ser_support = 1, OVER = 0, EN = 0 [UC = 1]
 * 3) SRAR ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
 * 4) SRAO ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
 * 5) UCNA ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0, [UC = 1]
 */
static bool intel_recoverable_scan(uint64_t status)
{

    if ( !(status & MCi_STATUS_UC) )
        return true;
    else if ( ser_support && !(status & MCi_STATUS_EN)
              && !(status & MCi_STATUS_OVER) )
        return true;
    /* SRAR error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
              && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) )
        return true;
    /* SRAO error */
    else if ( ser_support && !(status & MCi_STATUS_PCC)
              && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
              && (status & MCi_STATUS_EN) )
        return true;
    /* UCNA error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
              && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR) )
        return true;
    return false;
}

/* CMCI */
static DEFINE_SPINLOCK(cmci_discover_lock);

/*
 * Discover bank sharing using the algorithm recommended in the SDM.
 */
static int do_cmci_discover(int i)
{
    unsigned msr = MSR_IA32_MCx_CTL2(i);
    u64 val;
    unsigned int threshold, max_threshold;
    static unsigned int cmci_threshold = 2;
    integer_param("cmci-threshold", cmci_threshold);

    rdmsrl(msr, val);
    /* Some other CPU already owns this bank. */
    if ( val & CMCI_EN )
    {
        mcabanks_clear(i, __get_cpu_var(mce_banks_owned));
        goto out;
    }

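    /*
     * Probe for CMCI support on this bank: try to set CMCI_EN together with
     * the full threshold mask and read the MSR back.  If CMCI_EN does not
     * stick, the bank has no CMCI capability and is left to the polling
     * timer instead.
     */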
    if ( cmci_threshold )
    {
        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
        rdmsrl(msr, val);
    }

    if ( !(val & CMCI_EN) )
    {
        /* This bank does not support CMCI. Polling timer has to handle it. */
        mcabanks_set(i, __get_cpu_var(no_cmci_banks));
        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
        return 0;
    }
    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
    threshold = cmci_threshold;
    if ( threshold > max_threshold )
    {
        mce_printk(MCE_QUIET,
                   "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
                   threshold, smp_processor_id(), i, max_threshold);
        threshold = max_threshold;
    }
    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
    mcabanks_set(i, __get_cpu_var(mce_banks_owned));
out:
    mcabanks_clear(i, __get_cpu_var(no_cmci_banks));
    return 1;
}

static void cmci_discover(void)
{
    unsigned long flags;
    int i;
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%d\n", smp_processor_id());

    spin_lock_irqsave(&cmci_discover_lock, flags);

    for ( i = 0; i < nr_mce_banks; i++ )
        if ( !mcabanks_test(i, __get_cpu_var(mce_banks_owned)) )
            do_cmci_discover(i);

    spin_unlock_irqrestore(&cmci_discover_lock, flags);

    /*
     * In case a CMCI arrived while the owner change was in progress:
     * if a CMCI happened but was not processed immediately, MCi_STATUS
     * (error count, bits 52:38) is not cleared and the CMCI interrupt
     * will never be triggered again.
     */

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);

    mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
               smp_processor_id(),
               *((unsigned long *)__get_cpu_var(mce_banks_owned)->bank_map),
               *((unsigned long *)__get_cpu_var(no_cmci_banks)->bank_map));
}

/*
 * Define an owner for each bank. Banks can be shared between CPUs
 * and to avoid reporting events multiple times always set up one
 * CPU as owner.
 *
 * The assignment has to be redone when CPUs go offline and
 * any of the owners goes away. Also pollers run in parallel so we
 * have to be careful to update the banks in a way that doesn't
 * lose or duplicate events.
 */

static void mce_set_owner(void)
{
    if ( !cmci_support || !opt_mce )
        return;

    cmci_discover();
}

static void __cpu_mcheck_distribute_cmci(void *unused)
{
    cmci_discover();
}

static void cpu_mcheck_distribute_cmci(void)
{
    if ( cmci_support && opt_mce )
        on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
}

static void clear_cmci(void)
{
    int i;

    if ( !cmci_support || !opt_mce )
        return;

    mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%d\n",
               smp_processor_id());

    for ( i = 0; i < nr_mce_banks; i++ )
    {
        unsigned msr = MSR_IA32_MCx_CTL2(i);
        u64 val;
        if ( !mcabanks_test(i, __get_cpu_var(mce_banks_owned)) )
            continue;
        rdmsrl(msr, val);
        if ( val & (CMCI_EN|CMCI_THRESHOLD_MASK) )
            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
        mcabanks_clear(i, __get_cpu_var(mce_banks_owned));
    }
}

static void cpu_mcheck_disable(void)
{
    clear_in_cr4(X86_CR4_MCE);

    if ( cmci_support && opt_mce )
        clear_cmci();
}

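/*
 * CMCI interrupt handler: log all banks owned by this CPU and either hand
 * the telemetry to Dom0 via VIRQ_MCA (when vMCE is enabled for it) or dump
 * and dismiss it.
 */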
static void cmci_interrupt(struct cpu_user_regs *regs)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    ack_APIC_irq();

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, __get_cpu_var(mce_banks_owned), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);
}

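/*
 * Program the CMCI LVT entry on this CPU (unless some other agent has
 * already installed a vector there) and then claim ownership of the
 * CMCI-capable banks via mce_set_owner().
 */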
static void intel_init_cmci(struct cpuinfo_x86 *c)
{
    u32 l, apic;
    int cpu = smp_processor_id();

    if ( !mce_available(c) || !cmci_support )
    {
        if ( opt_cpu_info )
            mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
        return;
    }

    apic = apic_read(APIC_CMCI);
    if ( apic & APIC_VECTOR_MASK )
    {
        mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
                   cpu, ( apic & APIC_VECTOR_MASK ));
        return;
    }

    alloc_direct_apic_vector(&cmci_apic_vector, cmci_interrupt);

    apic = cmci_apic_vector;
    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
    apic_write(APIC_CMCI, apic);

    l = apic_read(APIC_CMCI);
    apic_write(APIC_CMCI, l & ~APIC_LVT_MASKED);

    mce_set_owner();
}

/* MCA */

static bool mce_is_broadcast(struct cpuinfo_x86 *c)
{
    if ( mce_force_broadcast )
        return true;

    /*
     * According to the Intel SDM (Dec 2009), section 15.10.4.1, for
     * processors with a DisplayFamily_DisplayModel encoding of 06H_EH
     * and above, an MCA signal is broadcast to all logical processors
     * in the system.
     */
    if ( c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
         c->x86_model >= 0xe )
        return true;
    return false;
}

static bool intel_enable_lmce(void)
{
    uint64_t msr_content;

    /*
     * Section "Enabling Local Machine Check" in Intel SDM Vol 3
     * requires that software ensure the LOCK bit and LMCE_ON bit
     * of MSR_IA32_FEATURE_CONTROL are set before setting
     * MSR_IA32_MCG_EXT_CTL.LMCE_EN.
     */

    if ( rdmsr_safe(MSR_IA32_FEATURE_CONTROL, msr_content) )
        return false;

    if ( (msr_content & IA32_FEATURE_CONTROL_LOCK) &&
         (msr_content & IA32_FEATURE_CONTROL_LMCE_ON) )
    {
        wrmsrl(MSR_IA32_MCG_EXT_CTL, MCG_EXT_CTL_LMCE_EN);
        return true;
    }

    return false;
}

/* Check and init MCA */
static void intel_init_mca(struct cpuinfo_x86 *c)
{
    bool broadcast, cmci = false, ser = false, lmce = false;
    int ext_num = 0, first;
    uint64_t msr_content;

    broadcast = mce_is_broadcast(c);

    rdmsrl(MSR_IA32_MCG_CAP, msr_content);

    if ( (msr_content & MCG_CMCI_P) && cpu_has_apic )
        cmci = true;

    /* Support Software Error Recovery */
    if ( msr_content & MCG_SER_P )
        ser = true;

    if ( msr_content & MCG_EXT_P )
        ext_num = (msr_content >> MCG_EXT_CNT) & 0xff;

    first = mce_firstbank(c);

    if ( !mce_force_broadcast && (msr_content & MCG_LMCE_P) )
        lmce = intel_enable_lmce();

#define CAP(enabled, name) ((enabled) ? ", " name : "")
    if ( smp_processor_id() == 0 )
    {
        dprintk(XENLOG_INFO,
                "MCA Capability: firstbank %d, extended MCE MSR %d%s%s%s%s\n",
                first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));

        mce_broadcast = broadcast;
        cmci_support = cmci;
        ser_support = ser;
        lmce_support = lmce;
        nr_intel_ext_msrs = ext_num;
        firstbank = first;
    }
    else if ( cmci != cmci_support || ser != ser_support ||
              broadcast != mce_broadcast ||
              first != firstbank || ext_num != nr_intel_ext_msrs ||
              lmce != lmce_support )
        dprintk(XENLOG_WARNING,
                "CPU%u has different MCA capability "
                "(firstbank %d, extended MCE MSR %d%s%s%s%s)"
                " than BSP, may cause undetermined result!!!\n",
                smp_processor_id(), first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));
#undef CAP
}

static void intel_mce_post_reset(void)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);

    /* In the boot-up stage, print out and also log during Dom0 boot */
    if ( bs.errcnt && mctc != NULL )
    {
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        mctelem_commit(mctc);
    }
    return;
}

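/*
 * Log any errors left over from before the reset, then clear and enable all
 * MCA banks that have not already been initialised by another core, and
 * finally register the Intel-specific MCE handlers and scan callbacks.
 */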
static void intel_init_mce(void)
{
    uint64_t msr_content;
    int i;

    intel_mce_post_reset();

    /* clear all banks */
    for ( i = firstbank; i < nr_mce_banks; i++ )
    {
        /*
         * Some banks are shared across cores; use MCi_CTL to judge whether
         * this bank has already been initialized by another core.
         */
        rdmsrl(MSR_IA32_MCx_CTL(i), msr_content);
        if ( !msr_content )
        {
            /* if ctl is 0, this bank has never been initialized */
            mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
            wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL);
            wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL);
        }
    }
    if ( firstbank ) /* if cmci enabled, firstbank = 0 */
        wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL);

    x86_mce_vector_register(mcheck_cmn_handler);
    mce_recoverable_register(intel_recoverable_scan);
    mce_need_clearbank_register(intel_need_clearbank_scan);
    mce_register_addrcheck(intel_checkaddr);

    mce_dhandlers = intel_mce_dhandlers;
    mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);
    mce_uhandlers = intel_mce_uhandlers;
    mce_uhandler_num = ARRAY_SIZE(intel_mce_uhandlers);
}

static void cpu_mcabank_free(unsigned int cpu)
{
    struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);
    struct mca_banks *owned = per_cpu(mce_banks_owned, cpu);

    mcabanks_free(cmci);
    mcabanks_free(owned);
}

static int cpu_mcabank_alloc(unsigned int cpu)
{
    struct mca_banks *cmci = mcabanks_alloc();
    struct mca_banks *owned = mcabanks_alloc();

    if ( !cmci || !owned )
        goto out;

    per_cpu(no_cmci_banks, cpu) = cmci;
    per_cpu(mce_banks_owned, cpu) = owned;
    per_cpu(last_state, cpu) = -1;

    return 0;
 out:
    mcabanks_free(cmci);
    mcabanks_free(owned);
    return -ENOMEM;
}

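/*
 * CPU hotplug notifier: allocate the per-CPU bank bitmaps when a CPU is
 * being prepared, disable machine checks and CMCI on a dying CPU, and
 * redistribute CMCI bank ownership and free the bitmaps once it is dead
 * (or its bring-up was cancelled).
 */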
static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_mcabank_alloc(cpu);
        break;

    case CPU_DYING:
        cpu_mcheck_disable();
        break;

    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_mcheck_distribute_cmci();
        cpu_mcabank_free(cpu);
        break;
    }

    return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

/* The P4 and P6 families have a similar MCA initialization process */
enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool bsp)
{
    if ( bsp )
    {
        /* Early MCE initialisation for BSP. */
        if ( cpu_mcabank_alloc(0) )
            BUG();
        register_cpu_notifier(&cpu_nfb);
        mcheck_intel_therm_init();
    }

    intel_init_mca(c);

    mce_handler_init();

    intel_init_mce();

    intel_init_cmci(c);
#ifdef CONFIG_X86_MCE_THERMAL
    intel_init_thermal(c);
#endif

    return mcheck_intel;
}

/* Intel-specific MCA MSRs */
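/*
 * Guest accesses to MSR_IA32_MCx_CTL2 are handled here.  Only banks below
 * GUEST_MC_BANK_NUM touch per-vCPU state; out-of-range banks are silently
 * ignored, and both paths report the access as handled.
 */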
int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        v->arch.vmce.bank[bank].mci_ctl2 = val;
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %#"PRIx64"\n", bank, val);
    }

    return 1;
}

int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        *val = v->arch.vmce.bank[bank].mci_ctl2;
        mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL2 %#"PRIx64"\n", bank, *val);
    }

    return 1;
}

bool vmce_has_lmce(const struct vcpu *v)
{
    return v->arch.vmce.mcg_cap & MCG_LMCE_P;
}