#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/param.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <xen/cpu.h>
#include <asm/processor.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/mce.h>
#include <asm/apic.h>

#include <acpi/cpufreq/cpufreq.h>

#include "mce.h"
#include "x86_mca.h"
#include "barrier.h"
#include "util.h"
#include "vmce.h"
#include "mcaction.h"

static DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_banks_owned);
static bool __read_mostly ser_support;
static bool __read_mostly mce_force_broadcast;
boolean_param("mce_fb", mce_force_broadcast);

static int __read_mostly nr_intel_ext_msrs;

/* The Intel SDM defines bits 15:0 of IA32_MCi_STATUS as the MC error code. */
#define INTEL_MCCOD_MASK 0xFFFF

/*
 * The Intel SDM currently defines 2 kinds of SRAO errors:
 * 1) Memory scrubbing errors, error code = 0xC0 ~ 0xCF
 * 2) L3 explicit writeback errors, error code = 0x17A
 */
#define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
#define INTEL_SRAO_L3_EWB    0x17A

/*
 * The Intel SDM currently defines 2 kinds of SRAR errors:
 * 1) Data load errors, error code = 0x134
 * 2) Instruction fetch errors, error code = 0x150
 */
#define INTEL_SRAR_DATA_LOAD	0x134
#define INTEL_SRAR_INSTR_FETCH	0x150

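/*
 * MCE_RING masks the thermal status bit (bit 0) of IA32_THERM_STATUS;
 * last_state caches the value last observed on each CPU so that the
 * thermal interrupt handler only reports state transitions.
 */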
#define MCE_RING                0x1
static DEFINE_PER_CPU(int, last_state);

static void cf_check intel_thermal_interrupt(void)
{
    uint64_t msr_content;
    unsigned int cpu = smp_processor_id();
    static DEFINE_PER_CPU(s_time_t, next);
    int *this_last_state;

    ack_APIC_irq();

    if ( hwp_active() )
        wrmsr_safe(MSR_HWP_STATUS, 0);

    /* Rate-limit reporting to at most once every 5s per CPU. */
    if ( NOW() < per_cpu(next, cpu) )
        return;

    per_cpu(next, cpu) = NOW() + MILLISECS(5000);
    rdmsrl(MSR_IA32_THERM_STATUS, msr_content);
    this_last_state = &per_cpu(last_state, cpu);
    if ( *this_last_state == (msr_content & MCE_RING) )
        return;
    *this_last_state = msr_content & MCE_RING;
    if ( msr_content & MCE_RING )
    {
        printk(KERN_EMERG "CPU%u: Temperature above threshold\n", cpu);
        printk(KERN_EMERG "CPU%u: Running in modulated clock mode\n", cpu);
        add_taint(TAINT_MACHINE_CHECK);
    } else
        printk(KERN_INFO "CPU%u: Temperature/speed normal\n", cpu);
}

/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static bool intel_thermal_supported(struct cpuinfo_x86 *c)
{
    if ( !cpu_has_apic )
        return false;
    if ( !cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_TM1) )
        return false;
    return true;
}

static u32 __read_mostly lvtthmr_init;

static void __init mcheck_intel_therm_init(void)
{
    /*
     * This function is only called on the boot CPU. Save the initial
     * thermal LVT value on the BSP and use it later to restore the
     * thermal LVT entry the BIOS programmed on the APs.
     */
    if ( intel_thermal_supported(&boot_cpu_data) )
        lvtthmr_init = apic_read(APIC_LVTTHMR);
}

/* P4/Xeon Thermal regulation detect and init */
static void intel_init_thermal(struct cpuinfo_x86 *c)
{
    uint64_t msr_content;
    uint32_t val;
    int tm2 = 0;
    unsigned int cpu = smp_processor_id();
    static uint8_t thermal_apic_vector;

    if ( !intel_thermal_supported(c) )
        return; /* -ENODEV */

    /*
     * First check if it's enabled already, in which case there might
     * be some SMM goo which handles it, so we can't even put a handler
     * since it might be delivered via SMI already -zwanem.
     */
    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    val = lvtthmr_init;
    /*
     * The initial value of thermal LVT entries on all APs always reads
     * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
     * sequence to them and LVT registers are reset to 0s except for
     * the mask bits which are set to 1s when APs receive INIT IPI.
     * If BIOS takes over the thermal interrupt and sets its interrupt
     * delivery mode to SMI (not fixed), it restores the value that the
     * BIOS has programmed on AP based on BSP's info we saved (since BIOS
     * is required to set the same value for all threads/cores).
     */
    if ( (val & APIC_DM_MASK) != APIC_DM_FIXED || APIC_VECTOR_VALID(val) )
        apic_write(APIC_LVTTHMR, val);

    /* MISC_ENABLE[3] is the automatic thermal control circuit enable bit. */
    if ( (msr_content & (1ULL<<3))
         && (val & APIC_DM_MASK) == APIC_DM_SMI )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal monitoring handled by SMI\n");
        return; /* -EBUSY */
    }

    if ( cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13)) )
        tm2 = 1;

    /* check whether a vector already exists, temporarily masked? */
    if ( val & APIC_VECTOR_MASK )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal LVT vector (%#x) already installed\n",
                   val & APIC_VECTOR_MASK);
        return; /* -EBUSY */
    }

    alloc_direct_apic_vector(&thermal_apic_vector, intel_thermal_interrupt);

    /* The temperature transition interrupt handler setup */
    val = thermal_apic_vector;    /* our delivery vector */
    val |= (APIC_DM_FIXED | APIC_LVT_MASKED);  /* we'll mask till we're ready */
    apic_write(APIC_LVTTHMR, val);

    /* Enable the high- and low-temperature threshold interrupts. */
    rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content);
    wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03);

    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL<<3));

    apic_write(APIC_LVTTHMR, val & ~APIC_LVT_MASKED);
    if ( opt_cpu_info )
        printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n",
               cpu, tm2 ? "TM2" : "TM1");
}

/* Intel MCE handler */
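/*
 * Append one extended MSR's value to the MC_TYPE_EXTENDED record, provided
 * it lies within the range of extended MSRs reported by MCG_CAP.
 */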
static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
{
    if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
         && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs )
    {
        ext->mc_msr[ext->mc_msrs].reg = msr;
        rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
        ++ext->mc_msrs;
    }
}


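/*
 * Collect the extended MCG_* register state into an MC_TYPE_EXTENDED record.
 * Returns NULL if extended MSRs are unavailable or EIPV is not set.
 */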
struct mcinfo_extended *
intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi)
{
    struct mcinfo_extended *mc_ext;
    int i;

    /*
     * According to the spec, processors supporting 64-bit will always
     * have MSRs beyond IA32_MCG_MISC.
     */
    if ( !mi || !mig || nr_intel_ext_msrs == 0 ||
         !(mig->mc_gstatus & MCG_STATUS_EIPV) )
        return NULL;

    mc_ext = x86_mcinfo_reserve(mi, sizeof(*mc_ext), MC_TYPE_EXTENDED);
    if ( !mc_ext )
    {
        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
        return NULL;
    }

    for ( i = MSR_IA32_MCG_EAX; i <= MSR_IA32_MCG_MISC; i++ )
        intel_get_extended_msr(mc_ext, i);

    for ( i = MSR_IA32_MCG_R8; i <= MSR_IA32_MCG_R15; i++ )
        intel_get_extended_msr(mc_ext, i);

    return mc_ext;
}

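/*
 * Severity classes for an individual MCE, in the terminology of the SDM's
 * software error recovery (SER) model: corrected, UCNA, SRAO, SRAR or fatal.
 */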
enum intel_mce_type
{
    intel_mce_invalid,
    intel_mce_fatal,
    intel_mce_corrected,
    intel_mce_ucr_ucna,
    intel_mce_ucr_srao,
    intel_mce_ucr_srar,
};

static enum intel_mce_type intel_check_mce_type(uint64_t status)
{
    if ( !(status & MCi_STATUS_VAL) )
        return intel_mce_invalid;

    if ( status & MCi_STATUS_PCC )
        return intel_mce_fatal;

    /* Corrected error? */
    if ( !(status & MCi_STATUS_UC) )
        return intel_mce_corrected;

    if ( !ser_support )
        return intel_mce_fatal;

    if ( status & MCi_STATUS_S )
    {
        if ( status & MCi_STATUS_AR )
        {
            if ( status & MCi_STATUS_OVER )
                return intel_mce_fatal;
            else
                return intel_mce_ucr_srar;
        } else
            return intel_mce_ucr_srao;
    }
    else
        return intel_mce_ucr_ucna;

    /* Any type not included above? */
    return intel_mce_fatal;
}

static void intel_memerr_dhandler(
             struct mca_binfo *binfo,
             enum mce_result *result,
             const struct cpu_user_regs *regs)
{
    mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
    mc_memerr_dhandler(binfo, result, regs);
}

static bool cf_check intel_srar_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srar);
}

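/*
 * Check whether the address logged for this error is physical or virtual:
 * with SER, MCi_MISC encodes the address mode when ADDRV and MISCV are set.
 */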
static bool cf_check intel_checkaddr(
    uint64_t status, uint64_t misc, int addrtype)
{
    if ( !(status & MCi_STATUS_ADDRV) ||
         !(status & MCi_STATUS_MISCV) ||
         ((misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
        /* addr is virtual */
        return (addrtype == MC_ADDR_VIRTUAL);

    return (addrtype == MC_ADDR_PHYSICAL);
}

static void cf_check intel_srar_dhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown SRAR error codes, reset the system. */
    *result = MCER_RESET;

    switch ( status & INTEL_MCCOD_MASK )
    {
    case INTEL_SRAR_DATA_LOAD:
    case INTEL_SRAR_INSTR_FETCH:
        intel_memerr_dhandler(binfo, result, regs);
        break;
    }
}

static bool cf_check intel_srao_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srao);
}

static void cf_check intel_srao_dhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For unknown SRAO error codes, no action is required. */
    *result = MCER_CONTINUE;

    if ( status & MCi_STATUS_VAL )
    {
        switch ( status & INTEL_MCCOD_MASK )
        {
        case INTEL_SRAO_MEM_SCRUB:
        case INTEL_SRAO_L3_EWB:
            intel_memerr_dhandler(binfo, result, regs);
            break;
        }
    }
}

static bool cf_check intel_default_check(uint64_t status)
{
    return true;
}

static void cf_check intel_default_mce_dhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    if ( type == intel_mce_fatal )
        *result = MCER_RESET;
    else
        *result = MCER_CONTINUE;
}

static const struct mca_error_handler intel_mce_dhandlers[] = {
    {intel_srao_check, intel_srao_dhandler},
    {intel_srar_check, intel_srar_dhandler},
    {intel_default_check, intel_default_mce_dhandler}
};

static void cf_check intel_default_mce_uhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    switch ( type )
    {
    case intel_mce_fatal:
        *result = MCER_RESET;
        break;

    default:
        *result = MCER_CONTINUE;
        break;
    }
}

static const struct mca_error_handler intel_mce_uhandlers[] = {
    {intel_default_check, intel_default_mce_uhandler}
};

/*
 * According to the MCA OS writer's guide, the CMCI handler needs to clear
 * the bank when:
 * 1) CE (UC = 0)
 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
 * The MCA handler needs to clear the bank when:
 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
 */

static bool cf_check intel_need_clearbank_scan(enum mca_source who, u64 status)
{
    if ( who == MCA_CMCI_HANDLER )
    {
        /* CE: clear the bank */
        if ( !(status & MCi_STATUS_UC) )
            return true;
        /* Spurious error: clear the bank */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_EN) )
            return true;
        /* UCNA with OVER = 0: clear the bank */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
                  && !(status & MCi_STATUS_AR) )
            return true;
        /* Only log, don't clear */
        else
            return false;
    }
    else if ( who == MCA_MCE_SCAN )
    {
        if ( !ser_support )
            return false;
        /*
         * Fatal errors shouldn't be cleared, so that the sticky bank
         * has a chance to be handled by polling after reboot.
         */
        if ( (status & MCi_STATUS_UC) && (status & MCi_STATUS_PCC) )
            return false;
        /* Spurious error: clear the bank */
        else if ( !(status & MCi_STATUS_OVER)
                  && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN) )
            return true;
        /* SRAR, OVER = 0: clear the bank; OVER = 1 has caused a reset */
        else if ( (status & MCi_STATUS_UC)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR)
                  && !(status & MCi_STATUS_OVER) )
            return true;
        /* SRAO: clear the bank */
        else if ( !(status & MCi_STATUS_AR)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC) )
            return true;
        else
            return false;
    }

    return true;
}

/*
 * MCE continues/is recoverable when
 * 1) CE, UC = 0
 * 2) Spurious: ser_support = 1, OVER = 0, EN = 0 [UC = 1]
 * 3) SRAR: ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
 * 4) SRAO: ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
 * 5) UCNA: ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0 [UC = 1]
 */
static bool cf_check intel_recoverable_scan(uint64_t status)
{
    if ( !(status & MCi_STATUS_UC) )
        return true;
    else if ( ser_support && !(status & MCi_STATUS_EN)
              && !(status & MCi_STATUS_OVER) )
        return true;
    /* SRAR error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
              && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) )
        return true;
    /* SRAO error */
    else if ( ser_support && !(status & MCi_STATUS_PCC)
              && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
              && (status & MCi_STATUS_EN) )
        return true;
    /* UCNA error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
              && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR) )
        return true;
    return false;
}

/* CMCI */
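/* Serialises CMCI bank-ownership (re)discovery across CPUs. */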
static DEFINE_SPINLOCK(cmci_discover_lock);

/*
 * Discover bank sharing using the algorithm recommended in the SDM.
 */
static int do_cmci_discover(int i)
{
    unsigned msr = MSR_IA32_MCx_CTL2(i);
    u64 val;
    unsigned int threshold, max_threshold;
    unsigned int cpu = smp_processor_id();
    static unsigned int cmci_threshold = 2;
    integer_param("cmci-threshold", cmci_threshold);

    rdmsrl(msr, val);
    /* Some other CPU already owns this bank. */
    if ( val & CMCI_EN )
    {
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
        goto out;
    }

    if ( cmci_threshold )
    {
        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
        rdmsrl(msr, val);
    }

    if ( !(val & CMCI_EN) )
    {
        /* This bank does not support CMCI. Polling timer has to handle it. */
        mcabanks_set(i, per_cpu(no_cmci_banks, cpu));
        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
        return 0;
    }
    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
    threshold = cmci_threshold;
    if ( threshold > max_threshold )
    {
        mce_printk(MCE_QUIET,
                   "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
                   threshold, cpu, i, max_threshold);
        threshold = max_threshold;
    }
    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
    mcabanks_set(i, per_cpu(mce_banks_owned, cpu));
out:
    mcabanks_clear(i, per_cpu(no_cmci_banks, cpu));
    return 1;
}

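/*
 * (Re)discover CMCI ownership for every bank this CPU does not already own,
 * then drain any events that may have been logged while ownership changed.
 */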
static void cmci_discover(void)
{
    unsigned long flags;
    unsigned int i, cpu = smp_processor_id();
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%u\n", cpu);

    spin_lock_irqsave(&cmci_discover_lock, flags);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            do_cmci_discover(i);

    spin_unlock_irqrestore(&cmci_discover_lock, flags);

    /*
     * In case a CMCI happened during the owner change: if a CMCI arrived
     * but was not processed immediately, MCi_STATUS (error count,
     * bits 38~52) is not cleared and the CMCI interrupt will never be
     * triggered again.
     */

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, per_cpu(mce_banks_owned, cpu), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);

    mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
               cpu,
               per_cpu(mce_banks_owned, cpu)->bank_map[0],
               per_cpu(no_cmci_banks, cpu)->bank_map[0]);
}

/*
 * Define an owner for each bank. Banks can be shared between CPUs, so
 * to avoid reporting events multiple times always set up one CPU as
 * the owner.
 *
 * The assignment has to be redone when CPUs go offline and any of the
 * owners goes away. Also pollers run in parallel, so we have to be
 * careful to update the banks in a way that doesn't lose or duplicate
 * events.
 */

static void mce_set_owner(void)
{
    if ( !cmci_support || !opt_mce )
        return;

    cmci_discover();
}

static void cf_check __cpu_mcheck_distribute_cmci(void *unused)
{
    cmci_discover();
}

static void cpu_mcheck_distribute_cmci(void)
{
    if ( cmci_support && opt_mce )
        on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
}

static void clear_cmci(void)
{
    unsigned int i, cpu = smp_processor_id();

    if ( !cmci_support || !opt_mce )
        return;

    mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%u\n", cpu);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
    {
        unsigned msr = MSR_IA32_MCx_CTL2(i);
        u64 val;

        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            continue;
        rdmsrl(msr, val);
        if ( val & (CMCI_EN|CMCI_THRESHOLD_MASK) )
            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
    }
}

static void cpu_mcheck_disable(void)
{
    if ( cmci_support && opt_mce )
        clear_cmci();
}

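/*
 * CMCI handler: log telemetry for the banks this CPU owns and either hand
 * it to Dom0 via VIRQ_MCA or dump and dismiss it.
 */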
static void cf_check cmci_interrupt(void)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    ack_APIC_irq();

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, this_cpu(mce_banks_owned), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);
}

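/* Program the CMCI LVT entry and take ownership of banks on this CPU. */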
static void intel_init_cmci(struct cpuinfo_x86 *c)
{
    u32 l, apic;
    int cpu = smp_processor_id();

    if ( !mce_available(c) || !cmci_support )
    {
        if ( opt_cpu_info )
            mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
        return;
    }

    apic = apic_read(APIC_CMCI);
    if ( apic & APIC_VECTOR_MASK )
    {
        mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
                   cpu, ( apic & APIC_VECTOR_MASK ));
        return;
    }

    alloc_direct_apic_vector(&cmci_apic_vector, cmci_interrupt);

    apic = cmci_apic_vector;
    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
    apic_write(APIC_CMCI, apic);

    l = apic_read(APIC_CMCI);
    apic_write(APIC_CMCI, l & ~APIC_LVT_MASKED);

    mce_set_owner();
}

/* MCA */

static bool mce_is_broadcast(struct cpuinfo_x86 *c)
{
    if ( mce_force_broadcast )
        return true;

    /*
     * According to the Intel SDM (Dec 2009, 15.10.4.1), for processors with
     * a DisplayFamily_DisplayModel encoding of 06H_EH and above, an MCA
     * signal is broadcast to all logical processors in the system.
     */
    if ( c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
         c->x86_model >= 0xe )
        return true;
    return false;
}

static bool intel_enable_lmce(void)
{
    uint64_t msr_content;

    /*
     * The section "Enabling Local Machine Check" in Intel SDM Vol 3
     * requires that software ensure the LOCK bit and LMCE_ON bit of
     * MSR_IA32_FEATURE_CONTROL are set before setting
     * MSR_IA32_MCG_EXT_CTL.LMCE_EN.
     */

    if ( rdmsr_safe(MSR_IA32_FEATURE_CONTROL, msr_content) )
        return false;

    if ( (msr_content & IA32_FEATURE_CONTROL_LOCK) &&
         (msr_content & IA32_FEATURE_CONTROL_LMCE_ON) )
    {
        wrmsrl(MSR_IA32_MCG_EXT_CTL, MCG_EXT_CTL_LMCE_EN);
        return true;
    }

    return false;
}

/* Check and init MCA */
static void intel_init_mca(struct cpuinfo_x86 *c)
{
    bool broadcast, cmci = false, ser = false, lmce = false;
    int ext_num = 0, first;
    uint64_t msr_content;

    broadcast = mce_is_broadcast(c);

    rdmsrl(MSR_IA32_MCG_CAP, msr_content);

    if ( (msr_content & MCG_CMCI_P) && cpu_has_apic )
        cmci = true;

    /* Support Software Error Recovery */
    if ( msr_content & MCG_SER_P )
        ser = true;

    if ( msr_content & MCG_EXT_P )
        ext_num = (msr_content >> MCG_EXT_CNT) & 0xff;

    first = mce_firstbank(c);

    if ( !mce_force_broadcast && (msr_content & MCG_LMCE_P) )
        lmce = intel_enable_lmce();

#define CAP(enabled, name) ((enabled) ? ", " name : "")
    if ( smp_processor_id() == 0 )
    {
        dprintk(XENLOG_INFO,
                "MCA Capability: firstbank %d, extended MCE MSR %d%s%s%s%s\n",
                first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));

        mce_broadcast = broadcast;
        cmci_support = cmci;
        ser_support = ser;
        lmce_support = lmce;
        nr_intel_ext_msrs = ext_num;
        firstbank = first;
    }
    else if ( cmci != cmci_support || ser != ser_support ||
              broadcast != mce_broadcast ||
              first != firstbank || ext_num != nr_intel_ext_msrs ||
              lmce != lmce_support )
        dprintk(XENLOG_WARNING,
                "CPU%u has different MCA capability "
                "(firstbank %d, extended MCE MSR %d%s%s%s%s)"
                " than BSP, may cause undetermined result!!!\n",
                smp_processor_id(), first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));
#undef CAP
}

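/*
 * Log and report any machine-check telemetry left in the banks from before
 * the last reset.
 */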
static void intel_mce_post_reset(void)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);

    /* In the boot-up stage, print the errors and also log them for Dom0. */
    if ( bs.errcnt && mctc != NULL )
    {
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        mctelem_commit(mctc);
    }
    return;
}

static void intel_init_mce(bool bsp)
{
    uint64_t msr_content;
    int i;

    intel_mce_post_reset();

    /* Clear all banks */
    for ( i = firstbank; i < this_cpu(nr_mce_banks); i++ )
    {
        /*
         * Some banks are shared across cores; use MCi_CTL to check whether
         * this bank has already been initialized by another core.
         */
        rdmsrl(MSR_IA32_MCx_CTL(i), msr_content);
        if ( !msr_content )
        {
            /* If CTL is 0, this bank has never been initialized. */
            mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
            wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL);
            wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL);
        }
    }
    if ( firstbank ) /* if CMCI is enabled, firstbank = 0 */
        wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL);

    if ( !bsp )
        return;

    mce_dhandlers = intel_mce_dhandlers;
    mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);
    mce_uhandlers = intel_mce_uhandlers;
    mce_uhandler_num = ARRAY_SIZE(intel_mce_uhandlers);
}

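/*
 * Record whether the Protected Processor Inventory Number (PPIN) can be
 * read on this system, enabling it where it is present but not locked out.
 */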
static void intel_init_ppin(const struct cpuinfo_x86 *c)
{
    /*
     * Even if testing the presence of the MSR would be enough, we don't
     * want to risk the situation where other models reuse this MSR for
     * other purposes.  Despite the late addition of a CPUID bit (rendering
     * the MSR architectural), keep using the same detection logic there.
     */
    switch ( c->x86_model )
    {
        uint64_t val;

    default:
        if ( !cpu_has(c, X86_FEATURE_INTEL_PPIN) )
        {
            ppin_msr = 0;
            return;
        }
        fallthrough;
    case 0x3e: /* IvyBridge X */
    case 0x3f: /* Haswell X */
    case 0x4f: /* Broadwell X */
    case 0x55: /* Skylake X */
    case 0x56: /* Broadwell Xeon D */
    case 0x6a: /* Icelake X */
    case 0x6c: /* Icelake D */
    case 0x8f: /* Sapphire Rapids X */

        if ( (c != &boot_cpu_data && !ppin_msr) ||
             rdmsr_safe(MSR_PPIN_CTL, val) )
            return;

        /* If PPIN is disabled, but not locked, try to enable. */
        if ( !(val & (PPIN_ENABLE | PPIN_LOCKOUT)) )
        {
            wrmsr_safe(MSR_PPIN_CTL, val | PPIN_ENABLE);
            rdmsr_safe(MSR_PPIN_CTL, val);
        }

        if ( !(val & PPIN_ENABLE) )
            ppin_msr = 0;
        else if ( c == &boot_cpu_data )
            ppin_msr = MSR_PPIN;

        break;
    }
}

static void cpu_mcabank_free(unsigned int cpu)
{
    struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);
    struct mca_banks *owned = per_cpu(mce_banks_owned, cpu);

    mcabanks_free(cmci);
    mcabanks_free(owned);
}

static int cpu_mcabank_alloc(unsigned int cpu)
{
    unsigned int nr = per_cpu(nr_mce_banks, cpu);
    struct mca_banks *cmci = mcabanks_alloc(nr);
    struct mca_banks *owned = mcabanks_alloc(nr);

    if ( !cmci || !owned )
        goto out;

    per_cpu(no_cmci_banks, cpu) = cmci;
    per_cpu(mce_banks_owned, cpu) = owned;
    per_cpu(last_state, cpu) = -1;

    return 0;
 out:
    mcabanks_free(cmci);
    mcabanks_free(owned);
    return -ENOMEM;
}

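/*
 * CPU hotplug notifier: allocate/free the per-CPU bank bitmaps and
 * redistribute CMCI bank ownership when a CPU comes or goes.
 */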
static int cf_check cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_mcabank_alloc(cpu);
        break;

    case CPU_DYING:
        cpu_mcheck_disable();
        break;

    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_mcheck_distribute_cmci();
        cpu_mcabank_free(cpu);
        break;
    }

    return notifier_from_errno(rc);
}
955 
956 static const struct mce_callbacks __initconst_cf_clobber intel_callbacks = {
957     .handler = mcheck_cmn_handler,
958     .check_addr = intel_checkaddr,
959     .recoverable_scan = intel_recoverable_scan,
960     .need_clearbank_scan = intel_need_clearbank_scan,
961 };
962 
963 static struct notifier_block cpu_nfb = {
964     .notifier_call = cpu_callback
965 };
966 
967 /* p4/p6 family have similar MCA initialization process */
intel_mcheck_init(struct cpuinfo_x86 * c,bool bsp)968 enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool bsp)
969 {
970     if ( bsp )
971     {
972         /* Early MCE initialisation for BSP. */
973         if ( cpu_mcabank_alloc(0) )
974             BUG();
975         register_cpu_notifier(&cpu_nfb);
976         mcheck_intel_therm_init();
977     }
978     else
979     {
980         unsigned int cpu = smp_processor_id();
981 
982         per_cpu(no_cmci_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
983         per_cpu(mce_banks_owned, cpu)->num = per_cpu(nr_mce_banks, cpu);
984     }
985 
986     intel_init_mca(c);
987 
988     if ( bsp )
989         mce_handler_init(&intel_callbacks);
990 
991     intel_init_mce(bsp);
992 
993     intel_init_cmci(c);
994 
995     intel_init_thermal(c);
996 
997     intel_init_ppin(c);
998 
999     return mcheck_intel;
1000 }
1001 
/* Intel-specific MCA MSRs */
int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        v->arch.vmce.bank[bank].mci_ctl2 = val;
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %#"PRIx64"\n", bank, val);
    }

    return 1;
}

int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    const struct cpu_policy *cp = v->domain->arch.cpu_policy;
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    switch ( msr )
    {
    case MSR_P5_MC_ADDR:
        /*
         * Bank 0 is used for the 'bank 0 quirk' on older processors.
         * See vcpu_fill_mc_msrs() for reference.
         */
        *val = v->arch.vmce.bank[1].mci_addr;
        return 1;

    case MSR_P5_MC_TYPE:
        *val = v->arch.vmce.bank[1].mci_status;
        return 1;
    }

    if ( !(cp->x86_vendor & X86_VENDOR_INTEL) )
        return 0;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        *val = v->arch.vmce.bank[bank].mci_ctl2;
        mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL2 %#"PRIx64"\n", bank, *val);
    }

    return 1;
}