#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/param.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <xen/cpu.h>
#include <asm/processor.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/mce.h>
#include <asm/apic.h>

#include <acpi/cpufreq/cpufreq.h>

#include "mce.h"
#include "x86_mca.h"
#include "barrier.h"
#include "util.h"
#include "vmce.h"
#include "mcaction.h"

static DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_banks_owned);
static bool __read_mostly ser_support;
static bool __read_mostly mce_force_broadcast;
boolean_param("mce_fb", mce_force_broadcast);

static int __read_mostly nr_intel_ext_msrs;

/* The Intel SDM defines bits 15:0 of IA32_MCi_STATUS as the MC error code. */
#define INTEL_MCCOD_MASK 0xFFFF

/*
 * Currently the Intel SDM defines 2 kinds of SRAO errors:
 * 1) Memory scrubbing error, error code = 0xC0 ~ 0xCF
 * 2) L3 explicit writeback error, error code = 0x17A
 */
#define INTEL_SRAO_MEM_SCRUB 0xC0 ... 0xCF
#define INTEL_SRAO_L3_EWB    0x17A
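/*
 * Note: INTEL_SRAO_MEM_SCRUB expands to a GCC/Clang case range, so it is
 * only usable as a switch case label.  A minimal sketch of the intended
 * use, mirroring intel_srao_dhandler() below (handle_srao_memory_error()
 * is hypothetical):
 *
 *     switch ( status & INTEL_MCCOD_MASK )
 *     {
 *     case INTEL_SRAO_MEM_SCRUB:      // i.e. case 0xC0 ... 0xCF:
 *     case INTEL_SRAO_L3_EWB:         // i.e. case 0x17A:
 *         handle_srao_memory_error(); // hypothetical handler
 *         break;
 *     }
 */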

/*
 * Currently the Intel SDM defines 2 kinds of SRAR errors:
 * 1) Data Load error, error code = 0x134
 * 2) Instruction Fetch error, error code = 0x150
 */
#define INTEL_SRAR_DATA_LOAD   0x134
#define INTEL_SRAR_INSTR_FETCH 0x150

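/*
 * MCE_RING masks bit 0 of IA32_THERM_STATUS (the thermal trip status flag,
 * per the SDM).  last_state caches the previously observed value per CPU so
 * the interrupt handler below only logs actual transitions, rate limited to
 * one status check every 5 seconds via the per-CPU 'next' timestamp.
 */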
#define MCE_RING 0x1
static DEFINE_PER_CPU(int, last_state);

static void cf_check intel_thermal_interrupt(void)
{
    uint64_t msr_content;
    unsigned int cpu = smp_processor_id();
    static DEFINE_PER_CPU(s_time_t, next);
    int *this_last_state;

    ack_APIC_irq();

    if ( hwp_active() )
        wrmsr_safe(MSR_HWP_STATUS, 0);

    if ( NOW() < per_cpu(next, cpu) )
        return;

    per_cpu(next, cpu) = NOW() + MILLISECS(5000);
    rdmsrl(MSR_IA32_THERM_STATUS, msr_content);
    this_last_state = &per_cpu(last_state, cpu);
    if ( *this_last_state == (msr_content & MCE_RING) )
        return;
    *this_last_state = msr_content & MCE_RING;
    if ( msr_content & MCE_RING )
    {
        printk(KERN_EMERG "CPU%u: Temperature above threshold\n", cpu);
        printk(KERN_EMERG "CPU%u: Running in modulated clock mode\n", cpu);
        add_taint(TAINT_MACHINE_CHECK);
    }
    else
        printk(KERN_INFO "CPU%u: Temperature/speed normal\n", cpu);
}

/* Thermal monitoring depends on APIC, ACPI and clock modulation */
static bool intel_thermal_supported(struct cpuinfo_x86 *c)
{
    if ( !cpu_has_apic )
        return false;
    if ( !cpu_has(c, X86_FEATURE_ACPI) || !cpu_has(c, X86_FEATURE_TM1) )
        return false;
    return true;
}

static u32 __read_mostly lvtthmr_init;

static void __init mcheck_intel_therm_init(void)
{
    /*
     * This function is only called on the boot CPU.  Save the initial
     * thermal LVT value on the BSP and use it later to restore the APs'
     * thermal LVT entries as programmed by the BIOS.
     */
    if ( intel_thermal_supported(&boot_cpu_data) )
        lvtthmr_init = apic_read(APIC_LVTTHMR);
}

/* P4/Xeon Thermal regulation detect and init */
static void intel_init_thermal(struct cpuinfo_x86 *c)
{
    uint64_t msr_content;
    uint32_t val;
    int tm2 = 0;
    unsigned int cpu = smp_processor_id();
    static uint8_t thermal_apic_vector;

    if ( !intel_thermal_supported(c) )
        return; /* -ENODEV */

    /*
     * First check if it's enabled already, in which case there might
     * be some SMM goo which handles it, so we can't even put a handler
     * since it might be delivered via SMI already. -zwanem.
     */
    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    val = lvtthmr_init;
    /*
     * The initial value of thermal LVT entries on all APs always reads
     * 0x10000 because APs are woken up by BSP issuing INIT-SIPI-SIPI
     * sequence to them and LVT registers are reset to 0s except for
     * the mask bits which are set to 1s when APs receive INIT IPI.
     * If BIOS takes over the thermal interrupt and sets its interrupt
     * delivery mode to SMI (not fixed), it restores the value that the
     * BIOS has programmed on AP based on BSP's info we saved (since BIOS
     * is required to set the same value for all threads/cores).
     */
    if ( (val & APIC_DM_MASK) != APIC_DM_FIXED || APIC_VECTOR_VALID(val) )
        apic_write(APIC_LVTTHMR, val);

    if ( (msr_content & (1ULL << 3))
         && (val & APIC_DM_MASK) == APIC_DM_SMI )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal monitoring handled by SMI\n");
        return; /* -EBUSY */
    }

    if ( cpu_has(c, X86_FEATURE_TM2) && (msr_content & (1ULL << 13)) )
        tm2 = 1;

    /* Check whether a vector already exists, temporarily masked? */
    if ( val & APIC_VECTOR_MASK )
    {
        if ( c == &boot_cpu_data )
            printk(KERN_DEBUG "Thermal LVT vector (%#x) already installed\n",
                   val & APIC_VECTOR_MASK);
        return; /* -EBUSY */
    }

    alloc_direct_apic_vector(&thermal_apic_vector, intel_thermal_interrupt);

    /* The temperature transition interrupt handler setup */
    val = thermal_apic_vector;    /* our delivery vector */
    val |= (APIC_DM_FIXED | APIC_LVT_MASKED);  /* we'll mask till we're ready */
    apic_write(APIC_LVTTHMR, val);

    rdmsrl(MSR_IA32_THERM_INTERRUPT, msr_content);
    wrmsrl(MSR_IA32_THERM_INTERRUPT, msr_content | 0x03);

    rdmsrl(MSR_IA32_MISC_ENABLE, msr_content);
    wrmsrl(MSR_IA32_MISC_ENABLE, msr_content | (1ULL << 3));

    apic_write(APIC_LVTTHMR, val & ~APIC_LVT_MASKED);
    if ( opt_cpu_info )
        printk(KERN_INFO "CPU%u: Thermal monitoring enabled (%s)\n",
               cpu, tm2 ? "TM2" : "TM1");
}

/* Intel MCE handler */
static inline void intel_get_extended_msr(struct mcinfo_extended *ext, u32 msr)
{
    if ( ext->mc_msrs < ARRAY_SIZE(ext->mc_msr)
         && msr < MSR_IA32_MCG_EAX + nr_intel_ext_msrs )
    {
        ext->mc_msr[ext->mc_msrs].reg = msr;
        rdmsrl(msr, ext->mc_msr[ext->mc_msrs].value);
        ++ext->mc_msrs;
    }
}


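/*
 * Collect the extended machine-check state MSRs (IA32_MCG_EAX..IA32_MCG_MISC
 * and IA32_MCG_R8..IA32_MCG_R15) into an MC_TYPE_EXTENDED record.  The number
 * of extended MSRs actually implemented is read from IA32_MCG_CAP by
 * intel_init_mca() into nr_intel_ext_msrs, which bounds what
 * intel_get_extended_msr() above will read.
 */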
struct mcinfo_extended *
intel_get_extended_msrs(struct mcinfo_global *mig, struct mc_info *mi)
{
    struct mcinfo_extended *mc_ext;
    int i;

    /*
     * According to the spec, processors supporting 64-bit will always
     * have MSRs beyond IA32_MCG_MISC.
     */
    if ( !mi || !mig || nr_intel_ext_msrs == 0 ||
         !(mig->mc_gstatus & MCG_STATUS_EIPV) )
        return NULL;

    mc_ext = x86_mcinfo_reserve(mi, sizeof(*mc_ext), MC_TYPE_EXTENDED);
    if ( !mc_ext )
    {
        mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
        return NULL;
    }

    for ( i = MSR_IA32_MCG_EAX; i <= MSR_IA32_MCG_MISC; i++ )
        intel_get_extended_msr(mc_ext, i);

    for ( i = MSR_IA32_MCG_R8; i <= MSR_IA32_MCG_R15; i++ )
        intel_get_extended_msr(mc_ext, i);

    return mc_ext;
}

enum intel_mce_type
{
    intel_mce_invalid,
    intel_mce_fatal,
    intel_mce_corrected,
    intel_mce_ucr_ucna,
    intel_mce_ucr_srao,
    intel_mce_ucr_srar,
};

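/*
 * Classify an error from its IA32_MCi_STATUS value.  A rough, informal
 * summary of the decision tree implemented below:
 *
 *   VAL = 0                          -> invalid
 *   PCC = 1                          -> fatal
 *   UC  = 0                          -> corrected
 *   UC  = 1, no SER support          -> fatal
 *   UC  = 1, S = 1, AR = 1, OVER = 1 -> fatal
 *   UC  = 1, S = 1, AR = 1, OVER = 0 -> SRAR
 *   UC  = 1, S = 1, AR = 0           -> SRAO
 *   UC  = 1, S = 0                   -> UCNA
 */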
static enum intel_mce_type intel_check_mce_type(uint64_t status)
{
    if ( !(status & MCi_STATUS_VAL) )
        return intel_mce_invalid;

    if ( status & MCi_STATUS_PCC )
        return intel_mce_fatal;

    /* Corrected error? */
    if ( !(status & MCi_STATUS_UC) )
        return intel_mce_corrected;

    if ( !ser_support )
        return intel_mce_fatal;

    if ( status & MCi_STATUS_S )
    {
        if ( status & MCi_STATUS_AR )
        {
            if ( status & MCi_STATUS_OVER )
                return intel_mce_fatal;
            else
                return intel_mce_ucr_srar;
        }
        else
            return intel_mce_ucr_srao;
    }
    else
        return intel_mce_ucr_ucna;

    /* Any type not included above? */
    return intel_mce_fatal;
}

static void intel_memerr_dhandler(
    struct mca_binfo *binfo,
    enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    mce_printk(MCE_VERBOSE, "MCE: Enter UCR recovery action\n");
    mc_memerr_dhandler(binfo, result, regs);
}

static bool cf_check intel_srar_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srar);
}

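/*
 * Decide whether MCi_ADDR holds the kind of address the caller asked for.
 * When ADDRV and MISCV are both set and MCi_MISC reports a physical address
 * mode, the address is physical; anything else is treated as virtual here.
 */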
static bool cf_check intel_checkaddr(
    uint64_t status, uint64_t misc, int addrtype)
{
    if ( !(status & MCi_STATUS_ADDRV) ||
         !(status & MCi_STATUS_MISCV) ||
         ((misc & MCi_MISC_ADDRMOD_MASK) != MCi_MISC_PHYSMOD) )
        /* addr is virtual */
        return (addrtype == MC_ADDR_VIRTUAL);

    return (addrtype == MC_ADDR_PHYSICAL);
}

static void cf_check intel_srar_dhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For an unknown SRAR error code, reset the system. */
    *result = MCER_RESET;

    switch ( status & INTEL_MCCOD_MASK )
    {
    case INTEL_SRAR_DATA_LOAD:
    case INTEL_SRAR_INSTR_FETCH:
        intel_memerr_dhandler(binfo, result, regs);
        break;
    }
}

static bool cf_check intel_srao_check(uint64_t status)
{
    return (intel_check_mce_type(status) == intel_mce_ucr_srao);
}

static void cf_check intel_srao_dhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;

    /* For an unknown SRAO error code, no action is required. */
    *result = MCER_CONTINUE;

    if ( status & MCi_STATUS_VAL )
    {
        switch ( status & INTEL_MCCOD_MASK )
        {
        case INTEL_SRAO_MEM_SCRUB:
        case INTEL_SRAO_L3_EWB:
            intel_memerr_dhandler(binfo, result, regs);
            break;
        }
    }
}

static bool cf_check intel_default_check(uint64_t status)
{
    return true;
}

static void cf_check intel_default_mce_dhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    if ( type == intel_mce_fatal )
        *result = MCER_RESET;
    else
        *result = MCER_CONTINUE;
}

static const struct mca_error_handler intel_mce_dhandlers[] = {
    {intel_srao_check, intel_srao_dhandler},
    {intel_srar_check, intel_srar_dhandler},
    {intel_default_check, intel_default_mce_dhandler}
};

static void cf_check intel_default_mce_uhandler(
    struct mca_binfo *binfo, enum mce_result *result,
    const struct cpu_user_regs *regs)
{
    uint64_t status = binfo->mib->mc_status;
    enum intel_mce_type type;

    type = intel_check_mce_type(status);

    switch ( type )
    {
    case intel_mce_fatal:
        *result = MCER_RESET;
        break;

    default:
        *result = MCER_CONTINUE;
        break;
    }
}

static const struct mca_error_handler intel_mce_uhandlers[] = {
    {intel_default_check, intel_default_mce_uhandler}
};

/*
 * According to the MCA OS writer's guide, the CMCI handler needs to clear
 * the bank when:
 * 1) CE (UC = 0)
 * 2) ser_support = 1, spurious error, OVER = 0, EN = 0, [UC = 1]
 * 3) ser_support = 1, UCNA, OVER = 0, S = 0, AR = 0, PCC = 0, [UC = 1, EN = 1]
 * The MCA handler needs to clear the bank when:
 * 1) ser_support = 1, spurious error, OVER = 0, EN = 0, UC = 1
 * 2) ser_support = 1, SRAR, UC = 1, OVER = 0, S = 1, AR = 1, [EN = 1]
 * 3) ser_support = 1, SRAO, UC = 1, S = 1, AR = 0, [EN = 1]
 */

static bool cf_check intel_need_clearbank_scan(enum mca_source who, u64 status)
{
    if ( who == MCA_CMCI_HANDLER )
    {
        /* CMCI needs to clear the bank */
        if ( !(status & MCi_STATUS_UC) )
            return true;
        /* Spurious errors need to clear the bank */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_EN) )
            return true;
        /* UCNA with OVER = 0 needs to clear the bank */
        else if ( ser_support && !(status & MCi_STATUS_OVER)
                  && !(status & MCi_STATUS_PCC) && !(status & MCi_STATUS_S)
                  && !(status & MCi_STATUS_AR) )
            return true;
        /* Only log, no clear */
        else
            return false;
    }
    else if ( who == MCA_MCE_SCAN )
    {
        if ( !ser_support )
            return false;
        /*
         * A fatal error shouldn't be cleared, so that the sticky bank
         * has a chance to be handled after reboot by polling.
         */
        if ( (status & MCi_STATUS_UC) && (status & MCi_STATUS_PCC) )
            return false;
        /* Spurious errors need to clear the bank */
        else if ( !(status & MCi_STATUS_OVER)
                  && (status & MCi_STATUS_UC) && !(status & MCi_STATUS_EN) )
            return true;
        /* SRAR with OVER = 0 clears the bank; OVER = 1 would have caused a reset */
        else if ( (status & MCi_STATUS_UC)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_AR)
                  && !(status & MCi_STATUS_OVER) )
            return true;
        /* SRAO needs to clear the bank */
        else if ( !(status & MCi_STATUS_AR)
                  && (status & MCi_STATUS_S) && (status & MCi_STATUS_UC) )
            return true;
        else
            return false;
    }

    return true;
}

/*
 * An MCE continues / is recoverable when:
 * 1) CE        UC = 0
 * 2) Spurious  ser_support = 1, OVER = 0, EN = 0 [UC = 1]
 * 3) SRAR      ser_support = 1, OVER = 0, PCC = 0, S = 1, AR = 1 [UC = 1, EN = 1]
 * 4) SRAO      ser_support = 1, PCC = 0, S = 1, AR = 0, EN = 1 [UC = 1]
 * 5) UCNA      ser_support = 1, OVER = 0, EN = 1, PCC = 0, S = 0, AR = 0 [UC = 1]
 */
static bool cf_check intel_recoverable_scan(uint64_t status)
{
    if ( !(status & MCi_STATUS_UC) )
        return true;
    else if ( ser_support && !(status & MCi_STATUS_EN)
              && !(status & MCi_STATUS_OVER) )
        return true;
    /* SRAR error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && !(status & MCi_STATUS_PCC) && (status & MCi_STATUS_S)
              && (status & MCi_STATUS_AR) && (status & MCi_STATUS_EN) )
        return true;
    /* SRAO error */
    else if ( ser_support && !(status & MCi_STATUS_PCC)
              && (status & MCi_STATUS_S) && !(status & MCi_STATUS_AR)
              && (status & MCi_STATUS_EN) )
        return true;
    /* UCNA error */
    else if ( ser_support && !(status & MCi_STATUS_OVER)
              && (status & MCi_STATUS_EN) && !(status & MCi_STATUS_PCC)
              && !(status & MCi_STATUS_S) && !(status & MCi_STATUS_AR) )
        return true;
    return false;
}

/* CMCI */
static DEFINE_SPINLOCK(cmci_discover_lock);

/*
 * Discover bank sharing using the algorithm recommended in the SDM.
 */
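/*
 * A rough sketch (hedged, per the SDM) of the IA32_MCi_CTL2 semantics the
 * probe below relies on: the corrected-error-count threshold occupies the
 * bits covered by CMCI_THRESHOLD_MASK, and CMCI_EN enables CMCI delivery for
 * the bank.  If CMCI_EN reads back as already set, another CPU sharing the
 * bank owns it; if CMCI_EN does not stick after being written, the bank has
 * no CMCI support and is left to the polling timer.
 */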
static int do_cmci_discover(int i)
{
    unsigned msr = MSR_IA32_MCx_CTL2(i);
    u64 val;
    unsigned int threshold, max_threshold;
    unsigned int cpu = smp_processor_id();
    static unsigned int cmci_threshold = 2;
    integer_param("cmci-threshold", cmci_threshold);

    rdmsrl(msr, val);
    /* Some other CPU already owns this bank. */
    if ( val & CMCI_EN )
    {
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
        goto out;
    }

    if ( cmci_threshold )
    {
        wrmsrl(msr, val | CMCI_EN | CMCI_THRESHOLD_MASK);
        rdmsrl(msr, val);
    }

    if ( !(val & CMCI_EN) )
    {
        /* This bank does not support CMCI. Polling timer has to handle it. */
        mcabanks_set(i, per_cpu(no_cmci_banks, cpu));
        wrmsrl(msr, val & ~CMCI_THRESHOLD_MASK);
        return 0;
    }
    max_threshold = MASK_EXTR(val, CMCI_THRESHOLD_MASK);
    threshold = cmci_threshold;
    if ( threshold > max_threshold )
    {
        mce_printk(MCE_QUIET,
                   "CMCI: threshold %#x too large for CPU%u bank %u, using %#x\n",
                   threshold, cpu, i, max_threshold);
        threshold = max_threshold;
    }
    wrmsrl(msr, (val & ~CMCI_THRESHOLD_MASK) | CMCI_EN | threshold);
    mcabanks_set(i, per_cpu(mce_banks_owned, cpu));
 out:
    mcabanks_clear(i, per_cpu(no_cmci_banks, cpu));
    return 1;
}

static void cmci_discover(void)
{
    unsigned long flags;
    unsigned int i, cpu = smp_processor_id();
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mce_printk(MCE_VERBOSE, "CMCI: find owner on CPU%u\n", cpu);

    spin_lock_irqsave(&cmci_discover_lock, flags);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            do_cmci_discover(i);

    spin_unlock_irqrestore(&cmci_discover_lock, flags);

    /*
     * In case a CMCI happened during the owner change: if a CMCI happened
     * but was not processed immediately, MCi_STATUS (error_count, bits
     * 38~52) is not cleared and the CMCI interrupt will never be triggered
     * again.
     */

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, per_cpu(mce_banks_owned, cpu), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);

    mce_printk(MCE_VERBOSE, "CMCI: CPU%d owner_map[%lx], no_cmci_map[%lx]\n",
               cpu,
               per_cpu(mce_banks_owned, cpu)->bank_map[0],
               per_cpu(no_cmci_banks, cpu)->bank_map[0]);
}

/*
 * Define an owner for each bank. Banks can be shared between CPUs
 * and to avoid reporting events multiple times always set up one
 * CPU as owner.
 *
 * The assignment has to be redone when CPUs go offline and
 * any of the owners goes away. Also pollers run in parallel so we
 * have to be careful to update the banks in a way that doesn't
 * lose or duplicate events.
 */

static void mce_set_owner(void)
{
    if ( !cmci_support || !opt_mce )
        return;

    cmci_discover();
}

static void cf_check __cpu_mcheck_distribute_cmci(void *unused)
{
    cmci_discover();
}

static void cpu_mcheck_distribute_cmci(void)
{
    if ( cmci_support && opt_mce )
        on_each_cpu(__cpu_mcheck_distribute_cmci, NULL, 0);
}

static void clear_cmci(void)
{
    unsigned int i, cpu = smp_processor_id();

    if ( !cmci_support || !opt_mce )
        return;

    mce_printk(MCE_VERBOSE, "CMCI: clear_cmci support on CPU%u\n", cpu);

    for ( i = 0; i < per_cpu(nr_mce_banks, cpu); i++ )
    {
        unsigned msr = MSR_IA32_MCx_CTL2(i);
        u64 val;

        if ( !mcabanks_test(i, per_cpu(mce_banks_owned, cpu)) )
            continue;
        rdmsrl(msr, val);
        if ( val & (CMCI_EN|CMCI_THRESHOLD_MASK) )
            wrmsrl(msr, val & ~(CMCI_EN|CMCI_THRESHOLD_MASK));
        mcabanks_clear(i, per_cpu(mce_banks_owned, cpu));
    }
}

static void cpu_mcheck_disable(void)
{
    if ( cmci_support && opt_mce )
        clear_cmci();
}

static void cf_check cmci_interrupt(void)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    ack_APIC_irq();

    mctc = mcheck_mca_logout(
        MCA_CMCI_HANDLER, this_cpu(mce_banks_owned), &bs, NULL);

    if ( bs.errcnt && mctc != NULL )
    {
        if ( dom0_vmce_enabled() )
        {
            mctelem_commit(mctc);
            mce_printk(MCE_VERBOSE, "CMCI: send CMCI to DOM0 through virq\n");
            send_global_virq(VIRQ_MCA);
        }
        else
        {
            x86_mcinfo_dump(mctelem_dataptr(mctc));
            mctelem_dismiss(mctc);
        }
    }
    else if ( mctc != NULL )
        mctelem_dismiss(mctc);
}

static void intel_init_cmci(struct cpuinfo_x86 *c)
{
    u32 l, apic;
    int cpu = smp_processor_id();

    if ( !mce_available(c) || !cmci_support )
    {
        if ( opt_cpu_info )
            mce_printk(MCE_QUIET, "CMCI: CPU%d has no CMCI support\n", cpu);
        return;
    }

    apic = apic_read(APIC_CMCI);
    if ( apic & APIC_VECTOR_MASK )
    {
        mce_printk(MCE_QUIET, "CPU%d CMCI LVT vector (%#x) already installed\n",
                   cpu, (apic & APIC_VECTOR_MASK));
        return;
    }

    alloc_direct_apic_vector(&cmci_apic_vector, cmci_interrupt);

    apic = cmci_apic_vector;
    apic |= (APIC_DM_FIXED | APIC_LVT_MASKED);
    apic_write(APIC_CMCI, apic);

    l = apic_read(APIC_CMCI);
    apic_write(APIC_CMCI, l & ~APIC_LVT_MASKED);

    mce_set_owner();
}

/* MCA */

static bool mce_is_broadcast(struct cpuinfo_x86 *c)
{
    if ( mce_force_broadcast )
        return true;

    /*
     * According to the Intel SDM (Dec 2009), section 15.10.4.1: for
     * processors with a DisplayFamily_DisplayModel encoding of 06H_EH
     * and above, an MCA signal is broadcast to all logical processors
     * in the system.
     */
    if ( c->x86_vendor == X86_VENDOR_INTEL && c->x86 == 6 &&
         c->x86_model >= 0xe )
        return true;
    return false;
}

static bool intel_enable_lmce(void)
{
    uint64_t msr_content;

    /*
     * The section "Enabling Local Machine Check" in Intel SDM Vol 3
     * requires that software ensure the LOCK bit and LMCE_ON bit of
     * MSR_IA32_FEATURE_CONTROL are set before setting
     * MSR_IA32_MCG_EXT_CTL.LMCE_EN.
     */

    if ( rdmsr_safe(MSR_IA32_FEATURE_CONTROL, msr_content) )
        return false;

    if ( (msr_content & IA32_FEATURE_CONTROL_LOCK) &&
         (msr_content & IA32_FEATURE_CONTROL_LMCE_ON) )
    {
        wrmsrl(MSR_IA32_MCG_EXT_CTL, MCG_EXT_CTL_LMCE_EN);
        return true;
    }

    return false;
}

/* Check and init MCA */
static void intel_init_mca(struct cpuinfo_x86 *c)
{
    bool broadcast, cmci = false, ser = false, lmce = false;
    int ext_num = 0, first;
    uint64_t msr_content;

    broadcast = mce_is_broadcast(c);

    rdmsrl(MSR_IA32_MCG_CAP, msr_content);

    if ( (msr_content & MCG_CMCI_P) && cpu_has_apic )
        cmci = true;

    /* Support Software Error Recovery */
    if ( msr_content & MCG_SER_P )
        ser = true;

    if ( msr_content & MCG_EXT_P )
        ext_num = (msr_content >> MCG_EXT_CNT) & 0xff;

    first = mce_firstbank(c);

    if ( !mce_force_broadcast && (msr_content & MCG_LMCE_P) )
        lmce = intel_enable_lmce();

#define CAP(enabled, name) ((enabled) ? ", " name : "")
    if ( smp_processor_id() == 0 )
    {
        dprintk(XENLOG_INFO,
                "MCA Capability: firstbank %d, extended MCE MSR %d%s%s%s%s\n",
                first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));

        mce_broadcast = broadcast;
        cmci_support = cmci;
        ser_support = ser;
        lmce_support = lmce;
        nr_intel_ext_msrs = ext_num;
        firstbank = first;
    }
    else if ( cmci != cmci_support || ser != ser_support ||
              broadcast != mce_broadcast ||
              first != firstbank || ext_num != nr_intel_ext_msrs ||
              lmce != lmce_support )
        dprintk(XENLOG_WARNING,
                "CPU%u has different MCA capability "
                "(firstbank %d, extended MCE MSR %d%s%s%s%s)"
                " than BSP, may cause undetermined result!!!\n",
                smp_processor_id(), first, ext_num,
                CAP(broadcast, "BCAST"),
                CAP(ser, "SER"),
                CAP(cmci, "CMCI"),
                CAP(lmce, "LMCE"));
#undef CAP
}

static void intel_mce_post_reset(void)
{
    mctelem_cookie_t mctc;
    struct mca_summary bs;

    mctc = mcheck_mca_logout(MCA_RESET, mca_allbanks, &bs, NULL);

    /* In the boot-up stage, print out and also log for the Dom0 boot process. */
    if ( bs.errcnt && mctc != NULL )
    {
        x86_mcinfo_dump(mctelem_dataptr(mctc));
        mctelem_commit(mctc);
    }
    return;
}

static void intel_init_mce(bool bsp)
{
    uint64_t msr_content;
    int i;

    intel_mce_post_reset();

    /* Clear all banks */
    for ( i = firstbank; i < this_cpu(nr_mce_banks); i++ )
    {
        /*
         * Some banks are shared across cores; use MCi_CTL to judge whether
         * this bank has already been initialized by another core.
         */
        rdmsrl(MSR_IA32_MCx_CTL(i), msr_content);
        if ( !msr_content )
        {
            /* If CTL is 0, this bank has never been initialized. */
            mce_printk(MCE_VERBOSE, "mce_init: init bank%d\n", i);
            wrmsrl(MSR_IA32_MCx_CTL(i), 0xffffffffffffffffULL);
            wrmsrl(MSR_IA32_MCx_STATUS(i), 0x0ULL);
        }
    }
    if ( firstbank ) /* if cmci enabled, firstbank = 0 */
        wrmsrl(MSR_IA32_MC0_STATUS, 0x0ULL);

    if ( !bsp )
        return;

    mce_dhandlers = intel_mce_dhandlers;
    mce_dhandler_num = ARRAY_SIZE(intel_mce_dhandlers);
    mce_uhandlers = intel_mce_uhandlers;
    mce_uhandler_num = ARRAY_SIZE(intel_mce_uhandlers);
}

static void intel_init_ppin(const struct cpuinfo_x86 *c)
{
    /*
     * Even if testing the presence of the MSR would be enough, we don't
     * want to risk the situation where other models reuse this MSR for
     * other purposes. Despite the late addition of a CPUID bit (rendering
     * the MSR architectural), keep using the same detection logic there.
     */
    switch ( c->x86_model )
    {
        uint64_t val;

    default:
        if ( !cpu_has(c, X86_FEATURE_INTEL_PPIN) )
        {
            ppin_msr = 0;
            return;
        }
        fallthrough;
    case 0x3e: /* IvyBridge X */
    case 0x3f: /* Haswell X */
    case 0x4f: /* Broadwell X */
    case 0x55: /* Skylake X */
    case 0x56: /* Broadwell Xeon D */
    case 0x6a: /* Icelake X */
    case 0x6c: /* Icelake D */
    case 0x8f: /* Sapphire Rapids X */

        if ( (c != &boot_cpu_data && !ppin_msr) ||
             rdmsr_safe(MSR_PPIN_CTL, val) )
            return;

        /* If PPIN is disabled, but not locked, try to enable. */
        if ( !(val & (PPIN_ENABLE | PPIN_LOCKOUT)) )
        {
            wrmsr_safe(MSR_PPIN_CTL, val | PPIN_ENABLE);
            rdmsr_safe(MSR_PPIN_CTL, val);
        }

        if ( !(val & PPIN_ENABLE) )
            ppin_msr = 0;
        else if ( c == &boot_cpu_data )
            ppin_msr = MSR_PPIN;

        break;
    }
}

static void cpu_mcabank_free(unsigned int cpu)
{
    struct mca_banks *cmci = per_cpu(no_cmci_banks, cpu);
    struct mca_banks *owned = per_cpu(mce_banks_owned, cpu);

    mcabanks_free(cmci);
    mcabanks_free(owned);
}

static int cpu_mcabank_alloc(unsigned int cpu)
{
    unsigned int nr = per_cpu(nr_mce_banks, cpu);
    struct mca_banks *cmci = mcabanks_alloc(nr);
    struct mca_banks *owned = mcabanks_alloc(nr);

    if ( !cmci || !owned )
        goto out;

    per_cpu(no_cmci_banks, cpu) = cmci;
    per_cpu(mce_banks_owned, cpu) = owned;
    per_cpu(last_state, cpu) = -1;

    return 0;
 out:
    mcabanks_free(cmci);
    mcabanks_free(owned);
    return -ENOMEM;
}

static int cf_check cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        rc = cpu_mcabank_alloc(cpu);
        break;

    case CPU_DYING:
        cpu_mcheck_disable();
        break;

    case CPU_UP_CANCELED:
    case CPU_DEAD:
        cpu_mcheck_distribute_cmci();
        cpu_mcabank_free(cpu);
        break;
    }

    return notifier_from_errno(rc);
}

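/*
 * The .handler below is the common MCE handler; the Intel-specific per-bank
 * handler tables (intel_mce_dhandlers / intel_mce_uhandlers) are installed
 * separately by intel_init_mce() on the BSP.
 */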
static const struct mce_callbacks __initconst_cf_clobber intel_callbacks = {
    .handler = mcheck_cmn_handler,
    .check_addr = intel_checkaddr,
    .recoverable_scan = intel_recoverable_scan,
    .need_clearbank_scan = intel_need_clearbank_scan,
};

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

/* The P4 and P6 families have a similar MCA initialization process. */
enum mcheck_type intel_mcheck_init(struct cpuinfo_x86 *c, bool bsp)
{
    if ( bsp )
    {
        /* Early MCE initialisation for BSP. */
        if ( cpu_mcabank_alloc(0) )
            BUG();
        register_cpu_notifier(&cpu_nfb);
        mcheck_intel_therm_init();
    }
    else
    {
        unsigned int cpu = smp_processor_id();

        per_cpu(no_cmci_banks, cpu)->num = per_cpu(nr_mce_banks, cpu);
        per_cpu(mce_banks_owned, cpu)->num = per_cpu(nr_mce_banks, cpu);
    }

    intel_init_mca(c);

    if ( bsp )
        mce_handler_init(&intel_callbacks);

    intel_init_mce(bsp);

    intel_init_cmci(c);

    intel_init_thermal(c);

    intel_init_ppin(c);

    return mcheck_intel;
}

/* Intel-specific MCA MSRs */
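/*
 * The guest bank index is derived by assuming the virtual MCi_CTL2 MSRs are
 * numbered consecutively from MSR_IA32_MC0_CTL2, mirroring the hardware
 * layout.  Out-of-range banks are ignored, but the access still reports
 * success to the caller.
 */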
int vmce_intel_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        v->arch.vmce.bank[bank].mci_ctl2 = val;
        mce_printk(MCE_VERBOSE, "MCE: wr MC%u_CTL2 %#"PRIx64"\n", bank, val);
    }

    return 1;
}

int vmce_intel_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    const struct cpu_policy *cp = v->domain->arch.cpu_policy;
    unsigned int bank = msr - MSR_IA32_MC0_CTL2;

    switch ( msr )
    {
    case MSR_P5_MC_ADDR:
        /*
         * Bank 0 is used for the 'bank 0 quirk' on older processors.
         * See vcpu_fill_mc_msrs() for reference.
         */
        *val = v->arch.vmce.bank[1].mci_addr;
        return 1;

    case MSR_P5_MC_TYPE:
        *val = v->arch.vmce.bank[1].mci_status;
        return 1;
    }

    if ( !(cp->x86_vendor & X86_VENDOR_INTEL) )
        return 0;

    if ( bank < GUEST_MC_BANK_NUM )
    {
        *val = v->arch.vmce.bank[bank].mci_ctl2;
        mce_printk(MCE_VERBOSE, "MCE: rd MC%u_CTL2 %#"PRIx64"\n", bank, *val);
    }

    return 1;
}