1 /*
2 * mce.c - x86 Machine Check Exception Reporting
3 * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4 */
5
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/smp.h>
10 #include <xen/errno.h>
11 #include <xen/console.h>
12 #include <xen/sched.h>
13 #include <xen/sched-if.h>
14 #include <xen/cpumask.h>
15 #include <xen/event.h>
16 #include <xen/guest_access.h>
17 #include <xen/hypercall.h> /* for do_mca */
18 #include <xen/cpu.h>
19
20 #include <asm/processor.h>
21 #include <asm/setup.h>
22 #include <asm/system.h>
23 #include <asm/apic.h>
24 #include <asm/msr.h>
25 #include <asm/p2m.h>
26
27 #include "mce.h"
28 #include "barrier.h"
29 #include "mcaction.h"
30 #include "util.h"
31 #include "vmce.h"
32
33 bool __read_mostly opt_mce = true;
34 boolean_param("mce", opt_mce);
35 bool __read_mostly mce_broadcast;
36 bool is_mc_panic;
37 unsigned int __read_mostly nr_mce_banks;
38 unsigned int __read_mostly firstbank;
39 uint8_t __read_mostly cmci_apic_vector;
40
41 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
42 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
43 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
44
45 static void intpose_init(void);
46 static void mcinfo_clear(struct mc_info *);
47 struct mca_banks *mca_allbanks;
48
49 #define SEG_PL(segsel) ((segsel) & 0x3)
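/* Internal injection flag: request that HWCR McStatusWrEn be set around the MSR writes (see x86_mc_msrinject_verify() and x86_mc_msrinject()). */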
50 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
51
52 #if 0
53 #define x86_mcerr(fmt, err, args...) \
54 ({ \
55 int _err = (err); \
56 gdprintk(XENLOG_WARNING, "x86_mcerr: " fmt ", returning %d\n", \
57 ## args, _err); \
58 _err; \
59 })
60 #else
61 #define x86_mcerr(fmt, err, args...) (err)
62 #endif
63
64 int mce_verbosity;
65 static int __init mce_set_verbosity(const char *str)
66 {
67 if ( strcmp("verbose", str) == 0 )
68 mce_verbosity = MCE_VERBOSE;
69 else
70 return -EINVAL;
71
72 return 0;
73 }
74 custom_param("mce_verbosity", mce_set_verbosity);
75
76 /* Handle unconfigured int18 (should never happen) */
77 static void unexpected_machine_check(const struct cpu_user_regs *regs)
78 {
79 console_force_unlock();
80 printk("Unexpected Machine Check Exception\n");
81 fatal_trap(regs, 1);
82 }
83
84 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
85
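/* Install the vendor-specific #MC handler; the wmb() ensures the new handler is fully visible before it can be invoked. */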
86 void x86_mce_vector_register(x86_mce_vector_t hdlr)
87 {
88 _machine_check_vector = hdlr;
89 wmb();
90 }
91
92 /* Call the installed machine check handler for this CPU setup. */
93
94 void do_machine_check(const struct cpu_user_regs *regs)
95 {
96 _machine_check_vector(regs);
97 }
98
99 /*
100 * Init machine check callback handler
101 * It is used to collect additional information provided by newer
102 * CPU families/models without the need to duplicate the whole handler.
103 * This avoids having many handlers doing almost the same thing, each
104 * with its own tweaks and bugs.
105 */
106 static x86_mce_callback_t mc_callback_bank_extended = NULL;
107
108 void x86_mce_callback_register(x86_mce_callback_t cbfunc)
109 {
110 mc_callback_bank_extended = cbfunc;
111 }
112
113 /*
114 * Machine check recoverable judgement callback handler
115 * It is used to judge whether a UC error is recoverable by software
116 */
117 static mce_recoverable_t mc_recoverable_scan = NULL;
118
119 void mce_recoverable_register(mce_recoverable_t cbfunc)
120 {
121 mc_recoverable_scan = cbfunc;
122 }
123
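/* Allocate a struct mca_banks whose bank_map bitmap is sized (and zeroed) for nr_mce_banks banks. */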
124 struct mca_banks *mcabanks_alloc(void)
125 {
126 struct mca_banks *mb;
127
128 mb = xmalloc(struct mca_banks);
129 if ( !mb )
130 return NULL;
131
132 mb->bank_map = xzalloc_array(unsigned long,
133 BITS_TO_LONGS(nr_mce_banks));
134 if ( !mb->bank_map )
135 {
136 xfree(mb);
137 return NULL;
138 }
139
140 mb->num = nr_mce_banks;
141
142 return mb;
143 }
144
145 void mcabanks_free(struct mca_banks *banks)
146 {
147 if ( banks == NULL )
148 return;
149 if ( banks->bank_map )
150 xfree(banks->bank_map);
151 xfree(banks);
152 }
153
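/* Clear a bank: zero MCi_ADDR/MCi_MISC if the status flags them as valid, then zero MCi_STATUS itself. */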
154 static void mcabank_clear(int banknum)
155 {
156 uint64_t status;
157
158 status = mca_rdmsr(MSR_IA32_MCx_STATUS(banknum));
159
160 if ( status & MCi_STATUS_ADDRV )
161 mca_wrmsr(MSR_IA32_MCx_ADDR(banknum), 0x0ULL);
162 if ( status & MCi_STATUS_MISCV )
163 mca_wrmsr(MSR_IA32_MCx_MISC(banknum), 0x0ULL);
164
165 mca_wrmsr(MSR_IA32_MCx_STATUS(banknum), 0x0ULL);
166 }
167
168 /*
169 * Callback handler for judging whether a Machine Check error bank should be cleared.
170 * According to Intel's latest MCA OS Recovery Writer's Guide, whether an error
171 * MCA bank needs to be cleared is decided by the mca_source and the
172 * MCi_STATUS bit values.
173 */
174 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
175
176 void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
177 {
178 mc_need_clearbank_scan = cbfunc;
179 }
180
181 /*
182 * mce_logout_lock should only be used in the trap handler,
183 * while MCIP has not been cleared yet in the global status
184 * register. Other use is not safe, since an MCE trap can
185 * happen at any moment, which would cause lock recursion.
186 */
187 static DEFINE_SPINLOCK(mce_logout_lock);
188
189 const struct mca_error_handler *__read_mostly mce_dhandlers;
190 const struct mca_error_handler *__read_mostly mce_uhandlers;
191 unsigned int __read_mostly mce_dhandler_num;
192 unsigned int __read_mostly mce_uhandler_num;
193
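/* Read one bank's telemetry (STATUS, plus MISC/ADDR/CTL2/TSC where applicable) into a new MC_TYPE_BANK record appended to @mi. */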
194 static void mca_init_bank(enum mca_source who, struct mc_info *mi, int bank)
195 {
196 struct mcinfo_bank *mib;
197
198 if ( !mi )
199 return;
200
201 mib = x86_mcinfo_reserve(mi, sizeof(*mib), MC_TYPE_BANK);
202 if ( !mib )
203 {
204 mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
205 return;
206 }
207
208 mib->mc_status = mca_rdmsr(MSR_IA32_MCx_STATUS(bank));
209
210 mib->mc_bank = bank;
211 mib->mc_domid = DOMID_INVALID;
212
213 if ( mib->mc_status & MCi_STATUS_MISCV )
214 mib->mc_misc = mca_rdmsr(MSR_IA32_MCx_MISC(bank));
215
216 if ( mib->mc_status & MCi_STATUS_ADDRV )
217 mib->mc_addr = mca_rdmsr(MSR_IA32_MCx_ADDR(bank));
218
219 if ( (mib->mc_status & MCi_STATUS_MISCV) &&
220 (mib->mc_status & MCi_STATUS_ADDRV) &&
221 (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) &&
222 (who == MCA_POLLER || who == MCA_CMCI_HANDLER) &&
223 (mfn_valid(_mfn(paddr_to_pfn(mib->mc_addr)))) )
224 {
225 struct domain *d;
226
227 d = maddr_get_owner(mib->mc_addr);
228 if ( d )
229 mib->mc_domid = d->domain_id;
230 }
231
232 if ( who == MCA_CMCI_HANDLER )
233 {
234 mib->mc_ctrl2 = mca_rdmsr(MSR_IA32_MC0_CTL2 + bank);
235 mib->mc_tsc = rdtsc();
236 }
237 }
238
239 static int mca_init_global(uint32_t flags, struct mcinfo_global *mig)
240 {
241 uint64_t status;
242 int cpu_nr;
243 const struct vcpu *curr = current;
244
245 /* Set global information */
246 status = mca_rdmsr(MSR_IA32_MCG_STATUS);
247 mig->mc_gstatus = status;
248 mig->mc_domid = DOMID_INVALID;
249 mig->mc_vcpuid = XEN_MC_VCPUID_INVALID;
250 mig->mc_flags = flags;
251 cpu_nr = smp_processor_id();
252 /* Retrieve detector information */
253 x86_mc_get_cpu_info(cpu_nr, &mig->mc_socketid,
254 &mig->mc_coreid, &mig->mc_core_threadid,
255 &mig->mc_apicid, NULL, NULL, NULL);
256
257 if ( curr != INVALID_VCPU )
258 {
259 mig->mc_domid = curr->domain->domain_id;
260 mig->mc_vcpuid = curr->vcpu_id;
261 }
262
263 return 0;
264 }
265
266 /*
267 * Utility function to perform MCA bank telemetry readout and to push that
268 * telemetry towards an interested dom0 for logging and diagnosis.
269 * The caller - #MC handler or MCA poll function - must arrange that we
270 * do not migrate cpus.
271 */
272
273 /* XXFM Could add overflow counting? */
274
275 /*
276 * The out-parameter clear_bank is for the Machine Check handler caller:
277 * on recent Intel CPUs, whether to clear an error bank's status needs to
278 * be judged by the callback function registered above.
279 */
280 mctelem_cookie_t
281 mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
282 struct mca_summary *sp, struct mca_banks *clear_bank)
283 {
284 uint64_t gstatus, status;
285 struct mcinfo_global *mig = NULL; /* on stack */
286 mctelem_cookie_t mctc = NULL;
287 bool uc = false, pcc = false, recover = true, need_clear = true;
288 uint32_t mc_flags = 0;
289 struct mc_info *mci = NULL;
290 mctelem_class_t which = MC_URGENT; /* XXXgcc */
291 int errcnt = 0;
292 int i;
293
294 gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
295 switch ( who )
296 {
297 case MCA_MCE_SCAN:
298 mc_flags = MC_FLAG_MCE;
299 which = MC_URGENT;
300 break;
301
302 case MCA_POLLER:
303 case MCA_RESET:
304 mc_flags = MC_FLAG_POLLED;
305 which = MC_NONURGENT;
306 break;
307
308 case MCA_CMCI_HANDLER:
309 mc_flags = MC_FLAG_CMCI;
310 which = MC_NONURGENT;
311 break;
312
313 default:
314 BUG();
315 }
316
317 /*
318 * If no mc_recoverable_scan callback handler is registered,
319 * this error is not recoverable.
320 */
321 recover = mc_recoverable_scan ? 1 : 0;
322
323 for ( i = 0; i < nr_mce_banks; i++ )
324 {
325 /* Skip bank if corresponding bit in bankmask is clear */
326 if ( !mcabanks_test(i, bankmask) )
327 continue;
328
329 status = mca_rdmsr(MSR_IA32_MCx_STATUS(i));
330 if ( !(status & MCi_STATUS_VAL) )
331 continue; /* this bank has no valid telemetry */
332
333 /*
334 * For the CMCI/MCE handler on recent Intel CPUs, we need to decide
335 * whether to clear the bank based on MCi_STATUS bits such as
336 * OVER/UC/EN/PCC/S/AR.
337 */
338 if ( mc_need_clearbank_scan )
339 need_clear = mc_need_clearbank_scan(who, status);
340
341 /*
342 * If this is the first bank with valid MCA DATA, then
343 * try to reserve an entry from the urgent/nonurgent queue
344 * depending on whether we are called from an exception or
345 * a poller; this can fail (for example dom0 may not
346 * yet have consumed past telemetry).
347 */
348 if ( errcnt++ == 0 )
349 {
350 mctc = mctelem_reserve(which);
351 if ( mctc )
352 {
353 mci = mctelem_dataptr(mctc);
354 mcinfo_clear(mci);
355 mig = x86_mcinfo_reserve(mci, sizeof(*mig), MC_TYPE_GLOBAL);
356 /* mc_info should at least be able to hold the global information */
357 ASSERT(mig);
358 mca_init_global(mc_flags, mig);
359 /* A hook here to get global extended msrs */
360 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
361 intel_get_extended_msrs(mig, mci);
362 }
363 }
364
365 /* flag for uncorrected errors */
366 if ( !uc && ((status & MCi_STATUS_UC) != 0) )
367 uc = true;
368
369 /* flag processor context corrupt */
370 if ( !pcc && ((status & MCi_STATUS_PCC) != 0) )
371 pcc = true;
372
373 if ( recover && uc )
374 /* uc = true, recover = true, we need not panic. */
375 recover = mc_recoverable_scan(status);
376
377 mca_init_bank(who, mci, i);
378
379 if ( mc_callback_bank_extended )
380 mc_callback_bank_extended(mci, i, status);
381
382 /* By default, need_clear = true */
383 if ( who != MCA_MCE_SCAN && need_clear )
384 /* Clear bank */
385 mcabank_clear(i);
386 else if ( who == MCA_MCE_SCAN && need_clear )
387 mcabanks_set(i, clear_bank);
388
389 wmb();
390 }
391
392 if ( mig && errcnt > 0 )
393 {
394 if ( pcc )
395 mig->mc_flags |= MC_FLAG_UNCORRECTABLE;
396 else if ( uc )
397 mig->mc_flags |= MC_FLAG_RECOVERABLE;
398 else
399 mig->mc_flags |= MC_FLAG_CORRECTABLE;
400 }
401
402 if ( sp )
403 {
404 sp->errcnt = errcnt;
405 sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
406 sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
407 sp->lmce = (gstatus & MCG_STATUS_LMCE) != 0;
408 sp->uc = uc;
409 sp->pcc = pcc;
410 sp->recoverable = recover;
411 }
412
413 return mci != NULL ? mctc : NULL; /* may be NULL */
414 }
415
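/* Spin for the lock, calling mce_panic_check() on each iteration so that a panic in progress on another CPU is noticed while we wait. */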
416 static void mce_spin_lock(spinlock_t *lk)
417 {
418 while ( !spin_trylock(lk) )
419 {
420 cpu_relax();
421 mce_panic_check();
422 }
423 }
424
425 static void mce_spin_unlock(spinlock_t *lk)
426 {
427 spin_unlock(lk);
428 }
429
430 static enum mce_result mce_action(const struct cpu_user_regs *regs,
431 mctelem_cookie_t mctc);
432
433 /*
434 * Return:
435 * -1: if system can't be recovered
436 * 0: Continue to next step
437 */
438 static int mce_urgent_action(const struct cpu_user_regs *regs,
439 mctelem_cookie_t mctc)
440 {
441 uint64_t gstatus;
442
443 if ( mctc == NULL )
444 return 0;
445
446 gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
447
448 /*
449 * FIXME: When RIPV = EIPV = 0, it's a little bit tricky. It may be an
450 * asynchronous error, and currently we have no way to precisely locate
451 * whether the error occurred in the guest or in the hypervisor.
452 * To avoid handling the error in the wrong way, we treat it as unrecoverable.
453 *
454 * Another unrecoverable case is RIPV = 0 while in the hypervisor,
455 * since Xen is not preemptible.
456 */
457 if ( !(gstatus & MCG_STATUS_RIPV) &&
458 (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)) )
459 return -1;
460
461 return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
462 }
463
464 /* Shared #MC handler. */
465 void mcheck_cmn_handler(const struct cpu_user_regs *regs)
466 {
467 static DEFINE_MCE_BARRIER(mce_trap_bar);
468 static atomic_t severity_cpu = ATOMIC_INIT(-1);
469 static atomic_t found_error = ATOMIC_INIT(0);
470 static cpumask_t mce_fatal_cpus;
471 struct mca_banks *bankmask = mca_allbanks;
472 struct mca_banks *clear_bank = __get_cpu_var(mce_clear_banks);
473 uint64_t gstatus;
474 mctelem_cookie_t mctc = NULL;
475 struct mca_summary bs;
476 bool bcast, lmce;
477
478 mce_spin_lock(&mce_logout_lock);
479
480 if ( clear_bank != NULL )
481 memset(clear_bank->bank_map, 0x0,
482 sizeof(long) * BITS_TO_LONGS(clear_bank->num));
483 mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);
484 lmce = bs.lmce;
485 bcast = mce_broadcast && !lmce;
486
487 if ( bs.errcnt )
488 {
489 /*
490 * Uncorrected errors must be dealt with in softirq context.
491 */
492 if ( bs.uc || bs.pcc )
493 {
494 add_taint(TAINT_MACHINE_CHECK);
495 if ( mctc )
496 mctelem_defer(mctc, lmce);
497 /*
498 * If PCC = 1 or the error can't be recovered, context is lost, so
499 * reboot now without clearing the banks, and deal with
500 * the telemetry after reboot (the MSRs are sticky)
501 */
502 if ( bs.pcc || !bs.recoverable )
503 cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
504 }
505 else if ( mctc != NULL )
506 mctelem_commit(mctc);
507 atomic_set(&found_error, 1);
508
509 /* The last CPU to arrive here will take care of the check/clean-up etc. */
510 atomic_set(&severity_cpu, smp_processor_id());
511
512 mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
513 *((unsigned long *)clear_bank), smp_processor_id());
514 if ( clear_bank != NULL )
515 mcheck_mca_clearbanks(clear_bank);
516 }
517 else if ( mctc != NULL )
518 mctelem_dismiss(mctc);
519 mce_spin_unlock(&mce_logout_lock);
520
521 mce_barrier_enter(&mce_trap_bar, bcast);
522 if ( mctc != NULL && mce_urgent_action(regs, mctc) )
523 cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
524 mce_barrier_exit(&mce_trap_bar, bcast);
525
526 /*
527 * Wait until everybody has processed the trap.
528 */
529 mce_barrier_enter(&mce_trap_bar, bcast);
530 if ( lmce || atomic_read(&severity_cpu) == smp_processor_id() )
531 {
532 /*
533 * According to the SDM, if no error bank is found on any CPU,
534 * something unexpected is happening; we can't do any
535 * recovery work other than resetting the system.
536 */
537 if ( atomic_read(&found_error) == 0 )
538 mc_panic("MCE: No CPU found valid MCE, need reset");
539 if ( !cpumask_empty(&mce_fatal_cpus) )
540 {
541 char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
542 ebufp = ebuf + strlen(ebuf);
543 cpumask_scnprintf(ebufp, 95 - strlen(ebuf), &mce_fatal_cpus);
544 mc_panic(ebuf);
545 }
546 atomic_set(&found_error, 0);
547 atomic_set(&severity_cpu, -1);
548 }
549 mce_barrier_exit(&mce_trap_bar, bcast);
550
551 /* Clear flags after above fatal check */
552 mce_barrier_enter(&mce_trap_bar, bcast);
553 gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
554 if ( (gstatus & MCG_STATUS_MCIP) != 0 )
555 {
556 mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
557 mca_wrmsr(MSR_IA32_MCG_STATUS, 0);
558 }
559 mce_barrier_exit(&mce_trap_bar, bcast);
560
561 raise_softirq(MACHINE_CHECK_SOFTIRQ);
562 }
563
564 void mcheck_mca_clearbanks(struct mca_banks *bankmask)
565 {
566 int i;
567
568 for ( i = 0; i < nr_mce_banks; i++ )
569 {
570 if ( !mcabanks_test(i, bankmask) )
571 continue;
572 mcabank_clear(i);
573 }
574 }
575
576 /* Check the existence of Machine Check */
577 bool mce_available(const struct cpuinfo_x86 *c)
578 {
579 return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
580 }
581
582 /*
583 * Check if bank 0 is usable for MCE. It isn't for Intel P6 family
584 * before model 0x1a.
585 */
586 unsigned int mce_firstbank(struct cpuinfo_x86 *c)
587 {
588 return c->x86 == 6 &&
589 c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a;
590 }
591
592 int show_mca_info(int inited, struct cpuinfo_x86 *c)
593 {
594 static enum mcheck_type g_type = mcheck_unset;
595
596 if ( inited != g_type )
597 {
598 char prefix[20];
599 static const char *const type_str[] = {
600 [mcheck_amd_famXX] = "AMD",
601 [mcheck_amd_k8] = "AMD K8",
602 [mcheck_intel] = "Intel"
603 };
604
605 snprintf(prefix, ARRAY_SIZE(prefix), "%sCPU%u: ",
606 g_type != mcheck_unset ? XENLOG_WARNING : XENLOG_INFO,
607 smp_processor_id());
608 BUG_ON(inited >= ARRAY_SIZE(type_str));
609 switch ( inited )
610 {
611 default:
612 printk("%s%s machine check reporting enabled\n",
613 prefix, type_str[inited]);
614 break;
615
616 case mcheck_amd_famXX:
617 printk("%s%s Fam%xh machine check reporting enabled\n",
618 prefix, type_str[inited], c->x86);
619 break;
620
621 case mcheck_none:
622 printk("%sNo machine check initialization\n", prefix);
623 break;
624 }
625 g_type = inited;
626 }
627
628 return 0;
629 }
630
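/* Choose which banks the MCE poller scans on this CPU: the banks without CMCI when CMCI is in use, otherwise all banks (minus bank 0 where it is unusable). */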
631 static void set_poll_bankmask(struct cpuinfo_x86 *c)
632 {
633 int cpu = smp_processor_id();
634 struct mca_banks *mb;
635
636 mb = per_cpu(poll_bankmask, cpu);
637 BUG_ON(!mb);
638
639 if ( cmci_support && opt_mce )
640 {
641 mb->num = per_cpu(no_cmci_banks, cpu)->num;
642 bitmap_copy(mb->bank_map, per_cpu(no_cmci_banks, cpu)->bank_map,
643 nr_mce_banks);
644 }
645 else
646 {
647 bitmap_copy(mb->bank_map, mca_allbanks->bank_map, nr_mce_banks);
648 if ( mce_firstbank(c) )
649 mcabanks_clear(0, mb);
650 }
651 }
652
653 /* The per-bank ctl/status init is platform specific because of AMD's quirk */
654 int mca_cap_init(void)
655 {
656 uint64_t msr_content;
657
658 rdmsrl(MSR_IA32_MCG_CAP, msr_content);
659
660 if ( msr_content & MCG_CTL_P ) /* Control register present ? */
661 wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
662
663 if ( nr_mce_banks && (msr_content & MCG_CAP_COUNT) != nr_mce_banks )
664 {
665 dprintk(XENLOG_WARNING, "Different bank number on cpu %x\n",
666 smp_processor_id());
667 return -ENODEV;
668 }
669 nr_mce_banks = msr_content & MCG_CAP_COUNT;
670
671 if ( !nr_mce_banks )
672 {
673 printk(XENLOG_INFO "CPU%u: No MCE banks present. "
674 "Machine check support disabled\n", smp_processor_id());
675 return -ENODEV;
676 }
677
678 /* mcabanks_alloc depends on nr_mce_banks */
679 if ( !mca_allbanks )
680 {
681 int i;
682
683 mca_allbanks = mcabanks_alloc();
684 for ( i = 0; i < nr_mce_banks; i++ )
685 mcabanks_set(i, mca_allbanks);
686 }
687
688 return mca_allbanks ? 0 : -ENOMEM;
689 }
690
691 static void cpu_bank_free(unsigned int cpu)
692 {
693 struct mca_banks *poll = per_cpu(poll_bankmask, cpu);
694 struct mca_banks *clr = per_cpu(mce_clear_banks, cpu);
695
696 mcabanks_free(poll);
697 mcabanks_free(clr);
698 }
699
700 static int cpu_bank_alloc(unsigned int cpu)
701 {
702 struct mca_banks *poll = mcabanks_alloc();
703 struct mca_banks *clr = mcabanks_alloc();
704
705 if ( !poll || !clr )
706 {
707 mcabanks_free(poll);
708 mcabanks_free(clr);
709 return -ENOMEM;
710 }
711
712 per_cpu(poll_bankmask, cpu) = poll;
713 per_cpu(mce_clear_banks, cpu) = clr;
714 return 0;
715 }
716
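/* CPU hotplug notifier: allocate the per-CPU bank masks when a CPU is brought up, and free them if bring-up is cancelled or the CPU dies. */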
717 static int cpu_callback(
718 struct notifier_block *nfb, unsigned long action, void *hcpu)
719 {
720 unsigned int cpu = (unsigned long)hcpu;
721 int rc = 0;
722
723 switch ( action )
724 {
725 case CPU_UP_PREPARE:
726 rc = cpu_bank_alloc(cpu);
727 break;
728
729 case CPU_UP_CANCELED:
730 case CPU_DEAD:
731 cpu_bank_free(cpu);
732 break;
733 }
734
735 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
736 }
737
738 static struct notifier_block cpu_nfb = {
739 .notifier_call = cpu_callback
740 };
741
742 /* This has to be run for each processor */
743 void mcheck_init(struct cpuinfo_x86 *c, bool bsp)
744 {
745 enum mcheck_type inited = mcheck_none;
746
747 if ( !opt_mce )
748 {
749 if ( bsp )
750 printk(XENLOG_INFO "MCE support disabled by bootparam\n");
751 return;
752 }
753
754 if ( !mce_available(c) )
755 {
756 printk(XENLOG_INFO "CPU%i: No machine check support available\n",
757 smp_processor_id());
758 return;
759 }
760
761 /* Hardware Enable */
762 if ( mca_cap_init() )
763 return;
764
765 /* Early MCE initialisation for BSP. */
766 if ( bsp && cpu_bank_alloc(smp_processor_id()) )
767 BUG();
768
769 switch ( c->x86_vendor )
770 {
771 case X86_VENDOR_AMD:
772 inited = amd_mcheck_init(c);
773 break;
774
775 case X86_VENDOR_INTEL:
776 switch ( c->x86 )
777 {
778 case 6:
779 case 15:
780 inited = intel_mcheck_init(c, bsp);
781 break;
782 }
783 break;
784
785 default:
786 break;
787 }
788
789 show_mca_info(inited, c);
790 if ( inited == mcheck_none || inited == mcheck_unset )
791 goto out;
792
793 intpose_init();
794
795 if ( bsp )
796 {
797 mctelem_init(sizeof(struct mc_info));
798 register_cpu_notifier(&cpu_nfb);
799 }
800
801 /* Turn on MCE now */
802 set_in_cr4(X86_CR4_MCE);
803
804 set_poll_bankmask(c);
805
806 return;
807 out:
808 if ( bsp )
809 {
810 cpu_bank_free(smp_processor_id());
811 mcabanks_free(mca_allbanks);
812 mca_allbanks = NULL;
813 }
814 }
815
816 static void mcinfo_clear(struct mc_info *mi)
817 {
818 memset(mi, 0, sizeof(struct mc_info));
819 x86_mcinfo_nentries(mi) = 0;
820 }
821
822 void *x86_mcinfo_reserve(struct mc_info *mi,
823 unsigned int size, unsigned int type)
824 {
825 int i;
826 unsigned long end1, end2;
827 struct mcinfo_common *mic_base, *mic_index;
828
829 mic_index = mic_base = x86_mcinfo_first(mi);
830
831 /* go to first free entry */
832 for ( i = 0; i < x86_mcinfo_nentries(mi); i++ )
833 mic_index = x86_mcinfo_next(mic_index);
834
835 /* check if there is enough space */
836 end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
837 end2 = (unsigned long)((uint8_t *)mic_index + size);
838
839 if ( end1 < end2 )
840 {
841 mce_printk(MCE_CRITICAL,
842 "mcinfo_add: No space left in mc_info\n");
843 return NULL;
844 }
845
846 /* there's enough space. add entry. */
847 x86_mcinfo_nentries(mi)++;
848
849 memset(mic_index, 0, size);
850 mic_index->size = size;
851 mic_index->type = type;
852
853 return mic_index;
854 }
855
856 static void x86_mcinfo_apei_save(
857 struct mcinfo_global *mc_global, struct mcinfo_bank *mc_bank)
858 {
859 struct mce m;
860
861 memset(&m, 0, sizeof(struct mce));
862
863 m.cpu = mc_global->mc_coreid;
864 m.cpuvendor = boot_cpu_data.x86_vendor;
865 m.cpuid = cpuid_eax(1);
866 m.socketid = mc_global->mc_socketid;
867 m.apicid = mc_global->mc_apicid;
868
869 m.mcgstatus = mc_global->mc_gstatus;
870 m.status = mc_bank->mc_status;
871 m.misc = mc_bank->mc_misc;
872 m.addr = mc_bank->mc_addr;
873 m.bank = mc_bank->mc_bank;
874
875 apei_write_mce(&m);
876 }
877
878 /*
879 * Dump machine check information in a format that
880 * mcelog can parse. This is used only when
881 * Dom0 does not take the notification.
882 */
883 void x86_mcinfo_dump(struct mc_info *mi)
884 {
885 struct mcinfo_common *mic = NULL;
886 struct mcinfo_global *mc_global;
887 struct mcinfo_bank *mc_bank;
888
889 /* first print the global info */
890 x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
891 if ( mic == NULL )
892 return;
893 mc_global = (struct mcinfo_global *)mic;
894 if ( mc_global->mc_flags & MC_FLAG_MCE )
895 printk(XENLOG_WARNING
896 "CPU%d: Machine Check Exception: %16"PRIx64"\n",
897 mc_global->mc_coreid, mc_global->mc_gstatus);
898 else if ( mc_global->mc_flags & MC_FLAG_CMCI )
899 printk(XENLOG_WARNING "CMCI occurred on CPU %d.\n",
900 mc_global->mc_coreid);
901 else if ( mc_global->mc_flags & MC_FLAG_POLLED )
902 printk(XENLOG_WARNING "POLLED occurred on CPU %d.\n",
903 mc_global->mc_coreid);
904
905 /* then the bank information */
906 x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
907 do {
908 if ( mic == NULL )
909 return;
910 if ( mic->type != MC_TYPE_BANK )
911 goto next;
912
913 mc_bank = (struct mcinfo_bank *)mic;
914
915 printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
916 mc_bank->mc_bank,
917 mc_bank->mc_status);
918 if ( mc_bank->mc_status & MCi_STATUS_MISCV )
919 printk("[%16"PRIx64"]", mc_bank->mc_misc);
920 if ( mc_bank->mc_status & MCi_STATUS_ADDRV )
921 printk(" at %16"PRIx64, mc_bank->mc_addr);
922 printk("\n");
923
924 if ( is_mc_panic )
925 x86_mcinfo_apei_save(mc_global, mc_bank);
926
927 next:
928 mic = x86_mcinfo_next(mic); /* next entry */
929 if ( (mic == NULL) || (mic->size == 0) )
930 break;
931 } while ( 1 );
932 }
933
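/* IPI callback: fill in this CPU's entry of the xen_mc_logical_cpu_t array passed in @v; the index is the number of lower-numbered online CPUs. */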
934 static void do_mc_get_cpu_info(void *v)
935 {
936 int cpu = smp_processor_id();
937 int cindex, cpn;
938 struct cpuinfo_x86 *c;
939 xen_mc_logical_cpu_t *log_cpus, *xcp;
940 uint32_t junk, ebx;
941
942 log_cpus = v;
943 c = &cpu_data[cpu];
944 cindex = 0;
945 cpn = cpu - 1;
946
947 /*
948 * Deal with sparse masks, condensed into a contiguous array.
949 */
950 while ( cpn >= 0 )
951 {
952 if ( cpu_online(cpn) )
953 cindex++;
954 cpn--;
955 }
956
957 xcp = &log_cpus[cindex];
958 c = &cpu_data[cpu];
959 xcp->mc_cpunr = cpu;
960 x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
961 &xcp->mc_coreid, &xcp->mc_threadid,
962 &xcp->mc_apicid, &xcp->mc_ncores,
963 &xcp->mc_ncores_active, &xcp->mc_nthreads);
964 xcp->mc_cpuid_level = c->cpuid_level;
965 xcp->mc_family = c->x86;
966 xcp->mc_vendor = c->x86_vendor;
967 xcp->mc_model = c->x86_model;
968 xcp->mc_step = c->x86_mask;
969 xcp->mc_cache_size = c->x86_cache_size;
970 xcp->mc_cache_alignment = c->x86_cache_alignment;
971 memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
972 memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
973 memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
974
975 /*
976 * This part needs to run on the CPU itself.
977 */
978 xcp->mc_nmsrvals = __MC_NMSRS;
979 xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
980 rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
981
982 if ( c->cpuid_level >= 1 )
983 {
984 cpuid(1, &junk, &ebx, &junk, &junk);
985 xcp->mc_clusterid = (ebx >> 24) & 0xff;
986 }
987 else
988 xcp->mc_clusterid = get_apic_id();
989 }
990
991 void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
992 uint16_t *threadid, uint32_t *apicid,
993 unsigned *ncores, unsigned *ncores_active,
994 unsigned *nthreads)
995 {
996 struct cpuinfo_x86 *c;
997
998 *apicid = cpu_physical_id(cpu);
999 c = &cpu_data[cpu];
1000 if ( c->apicid == BAD_APICID )
1001 {
1002 *chipid = cpu;
1003 *coreid = 0;
1004 *threadid = 0;
1005 if ( ncores != NULL )
1006 *ncores = 1;
1007 if ( ncores_active != NULL )
1008 *ncores_active = 1;
1009 if ( nthreads != NULL )
1010 *nthreads = 1;
1011 }
1012 else
1013 {
1014 *chipid = c->phys_proc_id;
1015 if ( c->x86_max_cores > 1 )
1016 *coreid = c->cpu_core_id;
1017 else
1018 *coreid = 0;
1019 *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1020 if ( ncores != NULL )
1021 *ncores = c->x86_max_cores;
1022 if ( ncores_active != NULL )
1023 *ncores_active = c->booted_cores;
1024 if ( nthreads != NULL )
1025 *nthreads = c->x86_num_siblings;
1026 }
1027 }
1028
1029 #define INTPOSE_NENT 50
1030
1031 static struct intpose_ent {
1032 unsigned int cpu_nr;
1033 uint64_t msr;
1034 uint64_t val;
1035 } intpose_arr[INTPOSE_NENT];
1036
1037 static void intpose_init(void)
1038 {
1039 static int done;
1040 int i;
1041
1042 if ( done++ > 0 )
1043 return;
1044
1045 for ( i = 0; i < INTPOSE_NENT; i++ )
1046 intpose_arr[i].cpu_nr = -1;
1047
1048 }
1049
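/* Look up an interposed MSR value for (cpu_nr, msr); if found and @valp is non-NULL, also return the value through it. */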
1050 struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
1051 uint64_t *valp)
1052 {
1053 int i;
1054
1055 for ( i = 0; i < INTPOSE_NENT; i++ )
1056 {
1057 if ( intpose_arr[i].cpu_nr == cpu_nr && intpose_arr[i].msr == msr )
1058 {
1059 if ( valp != NULL )
1060 *valp = intpose_arr[i].val;
1061 return &intpose_arr[i];
1062 }
1063 }
1064
1065 return NULL;
1066 }
1067
1068 static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1069 {
1070 struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
1071 int i;
1072
1073 if ( ent )
1074 {
1075 ent->val = val;
1076 return;
1077 }
1078
1079 for ( i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++ )
1080 {
1081 if ( ent->cpu_nr == -1 )
1082 {
1083 ent->cpu_nr = cpu_nr;
1084 ent->msr = msr;
1085 ent->val = val;
1086 return;
1087 }
1088 }
1089
1090 printk("intpose_add: interpose array full - request dropped\n");
1091 }
1092
1093 bool intpose_inval(unsigned int cpu_nr, uint64_t msr)
1094 {
1095 struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
1096
1097 if ( !ent )
1098 return false;
1099
1100 ent->cpu_nr = -1;
1101 return true;
1102 }
1103
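/* True for per-bank MCi_STATUS/MCi_ADDR/MCi_MISC MSRs; MCi_CTL itself is excluded by the modulo test. */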
1104 #define IS_MCA_BANKREG(r) \
1105 ((r) >= MSR_IA32_MC0_CTL && \
1106 (r) <= MSR_IA32_MCx_MISC(nr_mce_banks - 1) && \
1107 ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
1108
1109 static bool x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1110 {
1111 struct cpuinfo_x86 *c;
1112 int i, errs = 0;
1113
1114 c = &cpu_data[smp_processor_id()];
1115
1116 for ( i = 0; i < mci->mcinj_count; i++ )
1117 {
1118 uint64_t reg = mci->mcinj_msr[i].reg;
1119 const char *reason = NULL;
1120
1121 if ( IS_MCA_BANKREG(reg) )
1122 {
1123 if ( c->x86_vendor == X86_VENDOR_AMD )
1124 {
1125 /*
1126 * On AMD we can set MCi_STATUS_WREN in the
1127 * HWCR MSR to allow non-zero writes to banks
1128 * MSRs not to #GP. The injector in dom0
1129 * should set that bit, but we detect when it
1130 * is necessary and set it as a courtesy to
1131 * avoid #GP in the hypervisor.
1132 */
1133 mci->mcinj_flags |=
1134 _MC_MSRINJ_F_REQ_HWCR_WREN;
1135 continue;
1136 }
1137 else
1138 {
1139 /*
1140 * No alternative but to interpose, so require
1141 * that the injector specified interposition.
1142 */
1143 if ( !(mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) )
1144 reason = "must specify interposition";
1145 }
1146 }
1147 else
1148 {
1149 switch ( reg )
1150 {
1151 /* MSRs acceptable on all x86 cpus */
1152 case MSR_IA32_MCG_STATUS:
1153 break;
1154
1155 case MSR_F10_MC4_MISC1:
1156 case MSR_F10_MC4_MISC2:
1157 case MSR_F10_MC4_MISC3:
1158 if ( c->x86_vendor != X86_VENDOR_AMD )
1159 reason = "only supported on AMD";
1160 else if ( c->x86 < 0x10 )
1161 reason = "only supported on AMD Fam10h+";
1162 break;
1163
1164 /* MSRs that the HV will take care of */
1165 case MSR_K8_HWCR:
1166 if ( c->x86_vendor == X86_VENDOR_AMD )
1167 reason = "HV will operate HWCR";
1168 else
1169 reason = "only supported on AMD";
1170 break;
1171
1172 default:
1173 reason = "not a recognized MCA MSR";
1174 break;
1175 }
1176 }
1177
1178 if ( reason != NULL )
1179 {
1180 printk("HV MSR INJECT ERROR: MSR %#Lx %s\n",
1181 (unsigned long long)mci->mcinj_msr[i].reg, reason);
1182 errs++;
1183 }
1184 }
1185
1186 return !errs;
1187 }
1188
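/* Set K8_HWCR_MCi_STATUS_WREN in MSR_K8_HWCR if not already set, returning the previous HWCR value so the caller can restore it. */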
1189 static uint64_t x86_mc_hwcr_wren(void)
1190 {
1191 uint64_t old;
1192
1193 rdmsrl(MSR_K8_HWCR, old);
1194
1195 if ( !(old & K8_HWCR_MCi_STATUS_WREN) )
1196 {
1197 uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1198 wrmsrl(MSR_K8_HWCR, new);
1199 }
1200
1201 return old;
1202 }
1203
1204 static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1205 {
1206 if ( !(hwcr & K8_HWCR_MCi_STATUS_WREN) )
1207 wrmsrl(MSR_K8_HWCR, hwcr);
1208 }
1209
1210 static void x86_mc_msrinject(void *data)
1211 {
1212 struct xen_mc_msrinject *mci = data;
1213 struct mcinfo_msr *msr;
1214 uint64_t hwcr = 0;
1215 int intpose;
1216 int i;
1217
1218 if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
1219 hwcr = x86_mc_hwcr_wren();
1220
1221 intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1222
1223 for ( i = 0, msr = &mci->mcinj_msr[0]; i < mci->mcinj_count; i++, msr++ )
1224 {
1225 printk("HV MSR INJECT (%s) target %u actual %u MSR %#Lx <-- %#Lx\n",
1226 intpose ? "interpose" : "hardware",
1227 mci->mcinj_cpunr, smp_processor_id(),
1228 (unsigned long long)msr->reg,
1229 (unsigned long long)msr->value);
1230
1231 if ( intpose )
1232 intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1233 else
1234 wrmsrl(msr->reg, msr->value);
1235 }
1236
1237 if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
1238 x86_mc_hwcr_wren_restore(hwcr);
1239 }
1240
1241 /*ARGSUSED*/
1242 static void x86_mc_mceinject(void *data)
1243 {
1244 printk("Simulating #MC on cpu %d\n", smp_processor_id());
1245 __asm__ __volatile__("int $0x12");
1246 }
1247
1248 #if BITS_PER_LONG == 64
1249
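/* Telemetry cookies are exchanged with the caller of do_mca as 64-bit fetch_id values; on 64-bit builds the conversion is a plain cast. */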
1250 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1251 #define COOKIE2ID(c) ((uint64_t)(c))
1252
1253 #elif defined(BITS_PER_LONG)
1254 #error BITS_PER_LONG has unexpected value
1255 #else
1256 #error BITS_PER_LONG definition absent
1257 #endif
1258
1259 # include <compat/arch-x86/xen-mca.h>
1260
1261 # define xen_mcinfo_msr mcinfo_msr
1262 CHECK_mcinfo_msr;
1263 # undef xen_mcinfo_msr
1264 # undef CHECK_mcinfo_msr
1265 # define CHECK_mcinfo_msr struct mcinfo_msr
1266
1267 # define xen_mcinfo_common mcinfo_common
1268 CHECK_mcinfo_common;
1269 # undef xen_mcinfo_common
1270 # undef CHECK_mcinfo_common
1271 # define CHECK_mcinfo_common struct mcinfo_common
1272
1273 CHECK_FIELD_(struct, mc_fetch, flags);
1274 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1275 # define CHECK_compat_mc_fetch struct mc_fetch
1276
1277 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1278 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1279
1280 #define CHECK_compat_mc_inject_v2 struct mc_inject_v2
1281 CHECK_mc;
1282 # undef CHECK_compat_mc_fetch
1283 # undef CHECK_compat_mc_physcpuinfo
1284
1285 # define xen_mc_info mc_info
1286 CHECK_mc_info;
1287 # undef xen_mc_info
1288
1289 # define xen_mcinfo_global mcinfo_global
1290 CHECK_mcinfo_global;
1291 # undef xen_mcinfo_global
1292
1293 # define xen_mcinfo_bank mcinfo_bank
1294 CHECK_mcinfo_bank;
1295 # undef xen_mcinfo_bank
1296
1297 # define xen_mcinfo_extended mcinfo_extended
1298 CHECK_mcinfo_extended;
1299 # undef xen_mcinfo_extended
1300
1301 # define xen_mcinfo_recovery mcinfo_recovery
1302 # define xen_cpu_offline_action cpu_offline_action
1303 # define xen_page_offline_action page_offline_action
1304 CHECK_mcinfo_recovery;
1305 # undef xen_cpu_offline_action
1306 # undef xen_page_offline_action
1307 # undef xen_mcinfo_recovery
1308
1309 /* Machine Check Architecture Hypercall */
1310 long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
1311 {
1312 long ret = 0;
1313 struct xen_mc curop, *op = &curop;
1314 struct vcpu *v = current;
1315 union {
1316 struct xen_mc_fetch *nat;
1317 struct compat_mc_fetch *cmp;
1318 } mc_fetch;
1319 union {
1320 struct xen_mc_physcpuinfo *nat;
1321 struct compat_mc_physcpuinfo *cmp;
1322 } mc_physcpuinfo;
1323 uint32_t flags, cmdflags;
1324 int nlcpu;
1325 xen_mc_logical_cpu_t *log_cpus = NULL;
1326 mctelem_cookie_t mctc;
1327 mctelem_class_t which;
1328 unsigned int target;
1329 struct xen_mc_msrinject *mc_msrinject;
1330 struct xen_mc_mceinject *mc_mceinject;
1331
1332 ret = xsm_do_mca(XSM_PRIV);
1333 if ( ret )
1334 return x86_mcerr("", ret);
1335
1336 if ( copy_from_guest(op, u_xen_mc, 1) )
1337 return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1338
1339 if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1340 return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1341
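/* Dispatch on the requested sub-operation: telemetry fetch/ack, physical CPU info, and the various error-injection operations. */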
1342 switch ( op->cmd )
1343 {
1344 case XEN_MC_fetch:
1345 mc_fetch.nat = &op->u.mc_fetch;
1346 cmdflags = mc_fetch.nat->flags;
1347
1348 switch ( cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT) )
1349 {
1350 case XEN_MC_NONURGENT:
1351 which = MC_NONURGENT;
1352 break;
1353
1354 case XEN_MC_URGENT:
1355 which = MC_URGENT;
1356 break;
1357
1358 default:
1359 return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1360 }
1361
1362 flags = XEN_MC_OK;
1363
1364 if ( cmdflags & XEN_MC_ACK )
1365 {
1366 mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1367 mctelem_ack(which, cookie);
1368 }
1369 else
1370 {
1371 if ( !is_pv_32bit_vcpu(v)
1372 ? guest_handle_is_null(mc_fetch.nat->data)
1373 : compat_handle_is_null(mc_fetch.cmp->data) )
1374 return x86_mcerr("do_mca fetch: guest buffer "
1375 "invalid", -EINVAL);
1376
1377 mctc = mctelem_consume_oldest_begin(which);
1378 if ( mctc )
1379 {
1380 struct mc_info *mcip = mctelem_dataptr(mctc);
1381 if ( !is_pv_32bit_vcpu(v)
1382 ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1383 : copy_to_compat(mc_fetch.cmp->data, mcip, 1) )
1384 {
1385 ret = -EFAULT;
1386 flags |= XEN_MC_FETCHFAILED;
1387 mc_fetch.nat->fetch_id = 0;
1388 }
1389 else
1390 mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1391 mctelem_consume_oldest_end(mctc);
1392 }
1393 else
1394 {
1395 /* There is no data */
1396 flags |= XEN_MC_NODATA;
1397 mc_fetch.nat->fetch_id = 0;
1398 }
1399
1400 mc_fetch.nat->flags = flags;
1401 if ( copy_to_guest(u_xen_mc, op, 1) != 0 )
1402 ret = -EFAULT;
1403 }
1404
1405 break;
1406
1407 case XEN_MC_notifydomain:
1408 return x86_mcerr("do_mca notify unsupported", -EINVAL);
1409
1410 case XEN_MC_physcpuinfo:
1411 mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1412 nlcpu = num_online_cpus();
1413
1414 if ( !is_pv_32bit_vcpu(v)
1415 ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1416 : !compat_handle_is_null(mc_physcpuinfo.cmp->info) )
1417 {
1418 if ( mc_physcpuinfo.nat->ncpus <= 0 )
1419 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1420 -EINVAL);
1421 nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1422 log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
1423 if ( log_cpus == NULL )
1424 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1425 on_each_cpu(do_mc_get_cpu_info, log_cpus, 1);
1426 if ( !is_pv_32bit_vcpu(v)
1427 ? copy_to_guest(mc_physcpuinfo.nat->info, log_cpus, nlcpu)
1428 : copy_to_compat(mc_physcpuinfo.cmp->info, log_cpus, nlcpu) )
1429 ret = -EFAULT;
1430 xfree(log_cpus);
1431 }
1432
1433 mc_physcpuinfo.nat->ncpus = nlcpu;
1434
1435 if ( copy_to_guest(u_xen_mc, op, 1) )
1436 return x86_mcerr("do_mca cpuinfo", -EFAULT);
1437
1438 break;
1439
1440 case XEN_MC_msrinject:
1441 if ( nr_mce_banks == 0 )
1442 return x86_mcerr("do_mca inject", -ENODEV);
1443
1444 mc_msrinject = &op->u.mc_msrinject;
1445 target = mc_msrinject->mcinj_cpunr;
1446
1447 if ( target >= nr_cpu_ids )
1448 return x86_mcerr("do_mca inject: bad target", -EINVAL);
1449
1450 if ( !cpu_online(target) )
1451 return x86_mcerr("do_mca inject: target offline",
1452 -EINVAL);
1453
1454 if ( mc_msrinject->mcinj_count == 0 )
1455 return 0;
1456
1457 if ( mc_msrinject->mcinj_flags & MC_MSRINJ_F_GPADDR )
1458 {
1459 domid_t domid;
1460 struct domain *d;
1461 struct mcinfo_msr *msr;
1462 unsigned int i;
1463 paddr_t gaddr;
1464 unsigned long gfn, mfn;
1465 p2m_type_t t;
1466
1467 domid = (mc_msrinject->mcinj_domid == DOMID_SELF) ?
1468 current->domain->domain_id : mc_msrinject->mcinj_domid;
1469 if ( domid >= DOMID_FIRST_RESERVED )
1470 return x86_mcerr("do_mca inject: incompatible flag "
1471 "MC_MSRINJ_F_GPADDR with domain %d",
1472 -EINVAL, domid);
1473
1474 d = get_domain_by_id(domid);
1475 if ( d == NULL )
1476 return x86_mcerr("do_mca inject: bad domain id %d",
1477 -EINVAL, domid);
1478
1479 for ( i = 0, msr = &mc_msrinject->mcinj_msr[0];
1480 i < mc_msrinject->mcinj_count;
1481 i++, msr++ )
1482 {
1483 gaddr = msr->value;
1484 gfn = PFN_DOWN(gaddr);
1485 mfn = mfn_x(get_gfn(d, gfn, &t));
1486
1487 if ( mfn == mfn_x(INVALID_MFN) )
1488 {
1489 put_gfn(d, gfn);
1490 put_domain(d);
1491 return x86_mcerr("do_mca inject: bad gfn %#lx of domain %d",
1492 -EINVAL, gfn, domid);
1493 }
1494
1495 msr->value = pfn_to_paddr(mfn) | (gaddr & (PAGE_SIZE - 1));
1496
1497 put_gfn(d, gfn);
1498 }
1499
1500 put_domain(d);
1501 }
1502
1503 if ( !x86_mc_msrinject_verify(mc_msrinject) )
1504 return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1505
1506 add_taint(TAINT_ERROR_INJECT);
1507
1508 on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1509 mc_msrinject, 1);
1510
1511 break;
1512
1513 case XEN_MC_mceinject:
1514 if ( nr_mce_banks == 0 )
1515 return x86_mcerr("do_mca #MC", -ENODEV);
1516
1517 mc_mceinject = &op->u.mc_mceinject;
1518 target = mc_mceinject->mceinj_cpunr;
1519
1520 if ( target >= nr_cpu_ids )
1521 return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1522
1523 if ( !cpu_online(target) )
1524 return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1525
1526 add_taint(TAINT_ERROR_INJECT);
1527
1528 if ( mce_broadcast )
1529 on_each_cpu(x86_mc_mceinject, mc_mceinject, 1);
1530 else
1531 on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1532 mc_mceinject, 1);
1533 break;
1534
1535 case XEN_MC_inject_v2:
1536 {
1537 const cpumask_t *cpumap;
1538 cpumask_var_t cmv;
1539 bool broadcast = op->u.mc_inject_v2.flags & XEN_MC_INJECT_CPU_BROADCAST;
1540
1541 if ( nr_mce_banks == 0 )
1542 return x86_mcerr("do_mca #MC", -ENODEV);
1543
1544 if ( broadcast )
1545 cpumap = &cpu_online_map;
1546 else
1547 {
1548 ret = xenctl_bitmap_to_cpumask(&cmv, &op->u.mc_inject_v2.cpumap);
1549 if ( ret )
1550 break;
1551 cpumap = cmv;
1552 if ( !cpumask_intersects(cpumap, &cpu_online_map) )
1553 {
1554 free_cpumask_var(cmv);
1555 ret = x86_mcerr("No online CPU passed\n", -EINVAL);
1556 break;
1557 }
1558 if ( !cpumask_subset(cpumap, &cpu_online_map) )
1559 dprintk(XENLOG_INFO,
1560 "Not all required CPUs are online\n");
1561 }
1562
1563 switch ( op->u.mc_inject_v2.flags & XEN_MC_INJECT_TYPE_MASK )
1564 {
1565 case XEN_MC_INJECT_TYPE_MCE:
1566 if ( mce_broadcast &&
1567 !cpumask_equal(cpumap, &cpu_online_map) )
1568 printk("Not trigger MCE on all CPUs, may HANG!\n");
1569 on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
1570 break;
1571
1572 case XEN_MC_INJECT_TYPE_CMCI:
1573 if ( !cmci_apic_vector )
1574 ret = x86_mcerr("No CMCI supported in platform\n", -EINVAL);
1575 else
1576 {
1577 if ( cpumask_test_cpu(smp_processor_id(), cpumap) )
1578 send_IPI_self(cmci_apic_vector);
1579 send_IPI_mask(cpumap, cmci_apic_vector);
1580 }
1581 break;
1582
1583 case XEN_MC_INJECT_TYPE_LMCE:
1584 if ( !lmce_support )
1585 {
1586 ret = x86_mcerr("No LMCE support", -EINVAL);
1587 break;
1588 }
1589 if ( broadcast )
1590 {
1591 ret = x86_mcerr("Broadcast cannot be used with LMCE", -EINVAL);
1592 break;
1593 }
1594 /* Ensure at most one CPU is specified. */
1595 if ( nr_cpu_ids > cpumask_next(cpumask_first(cpumap), cpumap) )
1596 {
1597 ret = x86_mcerr("More than one CPU specified for LMCE",
1598 -EINVAL);
1599 break;
1600 }
1601 on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
1602 break;
1603
1604 default:
1605 ret = x86_mcerr("Wrong mca type\n", -EINVAL);
1606 break;
1607 }
1608
1609 if ( cpumap != &cpu_online_map )
1610 free_cpumask_var(cmv);
1611
1612 break;
1613 }
1614
1615 default:
1616 return x86_mcerr("do_mca: bad command", -EINVAL);
1617 }
1618
1619 return ret;
1620 }
1621
1622 int mcinfo_dumpped;
1623 static int x86_mcinfo_dump_panic(mctelem_cookie_t mctc)
1624 {
1625 struct mc_info *mcip = mctelem_dataptr(mctc);
1626
1627 x86_mcinfo_dump(mcip);
1628 mcinfo_dumpped++;
1629
1630 return 0;
1631 }
1632
1633 /* XXX shall we dump committed mc_info?? */
1634 static void mc_panic_dump(void)
1635 {
1636 int cpu;
1637
1638 dprintk(XENLOG_ERR, "Begin dump mc_info\n");
1639 for_each_online_cpu(cpu)
1640 mctelem_process_deferred(cpu, x86_mcinfo_dump_panic,
1641 mctelem_has_deferred_lmce(cpu));
1642 dprintk(XENLOG_ERR, "End dump mc_info, %x mcinfo dumped\n", mcinfo_dumpped);
1643 }
1644
1645 void mc_panic(char *s)
1646 {
1647 is_mc_panic = true;
1648 console_force_unlock();
1649
1650 printk("Fatal machine check: %s\n", s);
1651 printk("\n"
1652 "****************************************\n"
1653 "\n"
1654 " The processor has reported a hardware error which cannot\n"
1655 " be recovered from. Xen will now reboot the machine.\n");
1656 mc_panic_dump();
1657 panic("HARDWARE ERROR");
1658 }
1659
1660 /*
1661 * Machine Check owner judge algorithm:
1662 * When an error happens, all CPUs serially read their MSR banks.
1663 * The first CPU who fetches the error bank's info will clear
1664 * this bank. Later readers can't get any information again.
1665 * The first CPU is the actual mce_owner
1666 *
1667 * A Fatal (pcc=1) error might cause a machine crash
1668 * before we're able to log it. To avoid missing the log, we adopt
1669 * two-round scanning:
1670 * Round1: simply scan. If found pcc = 1 or ripv = 0, simply reset.
1671 * All MCE banks are sticky, when boot up, MCE polling mechanism
1672 * will help to collect and log those MCE errors.
1673 * Round2: Do all MCE processing logic as normal.
1674 */
1675
1676 /* Maybe called in MCE context, no lock, no printk */
1677 static enum mce_result mce_action(const struct cpu_user_regs *regs,
1678 mctelem_cookie_t mctc)
1679 {
1680 struct mc_info *local_mi;
1681 enum mce_result bank_result = MCER_NOERROR;
1682 enum mce_result worst_result = MCER_NOERROR;
1683 struct mcinfo_common *mic = NULL;
1684 struct mca_binfo binfo;
1685 const struct mca_error_handler *handlers = mce_dhandlers;
1686 unsigned int i, handler_num = mce_dhandler_num;
1687
1688 /* When in mce context, regs is valid */
1689 if ( regs )
1690 {
1691 handler_num = mce_uhandler_num;
1692 handlers = mce_uhandlers;
1693 }
1694
1695 local_mi = (struct mc_info *)mctelem_dataptr(mctc);
1696 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
1697 if ( mic == NULL )
1698 {
1699 printk(KERN_ERR "MCE: get local buffer entry failed\n ");
1700 return MCER_CONTINUE;
1701 }
1702
1703 memset(&binfo, 0, sizeof(binfo));
1704 binfo.mig = (struct mcinfo_global *)mic;
1705 binfo.mi = local_mi;
1706
1707 /* Processing bank information */
1708 x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
1709
1710 for ( ; bank_result != MCER_RESET && mic && mic->size;
1711 mic = x86_mcinfo_next(mic) )
1712 {
1713 if ( mic->type != MC_TYPE_BANK )
1714 {
1715 continue;
1716 }
1717 binfo.mib = (struct mcinfo_bank *)mic;
1718 binfo.bank = binfo.mib->mc_bank;
1719 bank_result = MCER_NOERROR;
1720 for ( i = 0; i < handler_num; i++ )
1721 {
1722 if ( handlers[i].owned_error(binfo.mib->mc_status) )
1723 {
1724 handlers[i].recovery_handler(&binfo, &bank_result, regs);
1725 if ( worst_result < bank_result )
1726 worst_result = bank_result;
1727 break;
1728 }
1729 }
1730 }
1731
1732 return worst_result;
1733 }
1734
1735 /*
1736 * Called from mctelem_process_deferred. Return 1 if the telemetry
1737 * should be committed for dom0 consumption, 0 if it should be
1738 * dismissed.
1739 */
1740 static int mce_delayed_action(mctelem_cookie_t mctc)
1741 {
1742 enum mce_result result;
1743 int ret = 0;
1744
1745 result = mce_action(NULL, mctc);
1746
1747 switch ( result )
1748 {
1749 case MCER_RESET:
1750 dprintk(XENLOG_ERR, "MCE delayed action failed\n");
1751 is_mc_panic = true;
1752 x86_mcinfo_dump(mctelem_dataptr(mctc));
1753 panic("MCE: Software recovery failed for the UCR");
1754 break;
1755
1756 case MCER_RECOVERED:
1757 dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
1758 ret = 1;
1759 break;
1760
1761 case MCER_CONTINUE:
1762 dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
1763 "system is tainted\n");
1764 x86_mcinfo_dump(mctelem_dataptr(mctc));
1765 ret = 1;
1766 break;
1767
1768 default:
1769 ret = 0;
1770 break;
1771 }
1772 return ret;
1773 }
1774
1775 /* Softirq Handler for this MCE# processing */
1776 static void mce_softirq(void)
1777 {
1778 static DEFINE_MCE_BARRIER(mce_inside_bar);
1779 static DEFINE_MCE_BARRIER(mce_severity_bar);
1780 static atomic_t severity_cpu;
1781 int cpu = smp_processor_id();
1782 unsigned int workcpu;
1783 bool lmce = mctelem_has_deferred_lmce(cpu);
1784 bool bcast = mce_broadcast && !lmce;
1785
1786 mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
1787
1788 mce_barrier_enter(&mce_inside_bar, bcast);
1789
1790 if ( !lmce )
1791 {
1792 /*
1793 * Everybody is here. Now let's see who gets to do the
1794 * recovery work. Right now we just see if there's a CPU
1795 * that did not have any problems, and pick that one.
1796 *
1797 * First, just set a default value: the last CPU who reaches this
1798 * will overwrite the value and become the default.
1799 */
1800
1801 atomic_set(&severity_cpu, cpu);
1802
1803 mce_barrier_enter(&mce_severity_bar, bcast);
1804 if ( !mctelem_has_deferred(cpu) )
1805 atomic_set(&severity_cpu, cpu);
1806 mce_barrier_exit(&mce_severity_bar, bcast);
1807 }
1808
1809 /* We choose severity_cpu for further processing */
1810 if ( lmce || atomic_read(&severity_cpu) == cpu )
1811 {
1812
1813 mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
1814
1815 /*
1816 * Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
1817 * vMCE MSRs virtualization buffer
1818 */
1819
1820 if ( lmce )
1821 mctelem_process_deferred(cpu, mce_delayed_action, true);
1822 else
1823 for_each_online_cpu(workcpu)
1824 mctelem_process_deferred(workcpu, mce_delayed_action, false);
1825
1826 /* Step2: Send Log to DOM0 through vIRQ */
1827 if ( dom0_vmce_enabled() )
1828 {
1829 mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
1830 send_global_virq(VIRQ_MCA);
1831 }
1832 }
1833
1834 mce_barrier_exit(&mce_inside_bar, bcast);
1835 }
1836
1837 /*
1838 * Machine Check owner judge algorithm:
1839 * When an error happens, all CPUs serially read their MSR banks.
1840 * The first CPU who fetches the error bank's info will clear
1841 * this bank. Later readers can't get any information again.
1842 * The first CPU is the actual mce_owner
1843 *
1844 * A Fatal (pcc=1) error might cause a machine crash
1845 * before we're able to log it. To avoid missing the log, we adopt
1846 * two-round scanning:
1847 * Round1: simply scan. If found pcc = 1 or ripv = 0, simply reset.
1848 * All MCE banks are sticky, when boot up, MCE polling mechanism
1849 * will help to collect and log those MCE errors.
1850 * Round2: Do all MCE processing logic as normal.
1851 */
1852 void mce_handler_init(void)
1853 {
1854 if ( smp_processor_id() != 0 )
1855 return;
1856
1857 /* Callback registration; do we really need so many callbacks? */
1858 /* mce handler data initialization */
1859 spin_lock_init(&mce_logout_lock);
1860 open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
1861 }
1862