1 /*
2  * mce.c - x86 Machine Check Exception Reporting
3  * (c) 2002 Alan Cox <alan@redhat.com>, Dave Jones <davej@codemonkey.org.uk>
4  */
5 
6 #include <xen/init.h>
7 #include <xen/types.h>
8 #include <xen/kernel.h>
9 #include <xen/smp.h>
10 #include <xen/errno.h>
11 #include <xen/console.h>
12 #include <xen/sched.h>
13 #include <xen/sched-if.h>
14 #include <xen/cpumask.h>
15 #include <xen/event.h>
16 #include <xen/guest_access.h>
17 #include <xen/hypercall.h> /* for do_mca */
18 #include <xen/cpu.h>
19 
20 #include <asm/processor.h>
21 #include <asm/setup.h>
22 #include <asm/system.h>
23 #include <asm/apic.h>
24 #include <asm/msr.h>
25 #include <asm/p2m.h>
26 
27 #include "mce.h"
28 #include "barrier.h"
29 #include "mcaction.h"
30 #include "util.h"
31 #include "vmce.h"
32 
33 bool __read_mostly opt_mce = true;
34 boolean_param("mce", opt_mce);
35 bool __read_mostly mce_broadcast;
36 bool is_mc_panic;
37 unsigned int __read_mostly nr_mce_banks;
38 unsigned int __read_mostly firstbank;
39 uint8_t __read_mostly cmci_apic_vector;
40 
41 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, poll_bankmask);
42 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, no_cmci_banks);
43 DEFINE_PER_CPU_READ_MOSTLY(struct mca_banks *, mce_clear_banks);
44 
45 static void intpose_init(void);
46 static void mcinfo_clear(struct mc_info *);
47 struct mca_banks *mca_allbanks;
48 
49 #define SEG_PL(segsel)   ((segsel) & 0x3)
50 #define _MC_MSRINJ_F_REQ_HWCR_WREN (1 << 16)
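/*
 * Note (added commentary): _MC_MSRINJ_F_REQ_HWCR_WREN is an internal flag
 * (bit 16 of mcinj_flags).  x86_mc_msrinject_verify() sets it when a bank
 * MSR write on AMD needs HWCR.MciStatusWrEn, and x86_mc_msrinject() then
 * brackets the writes with x86_mc_hwcr_wren()/x86_mc_hwcr_wren_restore().
 */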
51 
52 #if 0
53 #define x86_mcerr(fmt, err, args...)                                    \
54     ({                                                                  \
55         int _err = (err);                                               \
56         gdprintk(XENLOG_WARNING, "x86_mcerr: " fmt ", returning %d\n",  \
57                  ## args, _err);                                        \
58         _err;                                                           \
59     })
60 #else
61 #define x86_mcerr(fmt, err, args...) (err)
62 #endif
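/*
 * x86_mcerr() evaluates to its error argument; flipping the "#if 0" above
 * additionally logs the message and the returned value via gdprintk().
 */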
63 
64 int mce_verbosity;
65 static int __init mce_set_verbosity(const char *str)
66 {
67     if ( strcmp("verbose", str) == 0 )
68         mce_verbosity = MCE_VERBOSE;
69     else
70         return -EINVAL;
71 
72     return 0;
73 }
74 custom_param("mce_verbosity", mce_set_verbosity);
75 
76 /* Handle unconfigured int18 (should never happen) */
77 static void unexpected_machine_check(const struct cpu_user_regs *regs)
78 {
79     console_force_unlock();
80     printk("Unexpected Machine Check Exception\n");
81     fatal_trap(regs, 1);
82 }
83 
84 static x86_mce_vector_t _machine_check_vector = unexpected_machine_check;
85 
86 void x86_mce_vector_register(x86_mce_vector_t hdlr)
87 {
88     _machine_check_vector = hdlr;
89     wmb();
90 }
91 
92 /* Call the installed machine check handler for this CPU setup. */
93 
94 void do_machine_check(const struct cpu_user_regs *regs)
95 {
96     _machine_check_vector(regs);
97 }
98 
99 /*
100  * Init machine check callback handler
101  * It is used to collect additional information provided by newer
102  * CPU families/models without the need to duplicate the whole handler.
103  * This avoids having many handlers doing almost the same thing, each
104  * with its own tweaks and bugs.
105  */
106 static x86_mce_callback_t mc_callback_bank_extended = NULL;
107 
108 void x86_mce_callback_register(x86_mce_callback_t cbfunc)
109 {
110     mc_callback_bank_extended = cbfunc;
111 }
112 
113 /*
114  * Machine check recoverable judgement callback handler
115  * It is used to judge whether a UC error is recoverable by software
116  */
117 static mce_recoverable_t mc_recoverable_scan = NULL;
118 
119 void mce_recoverable_register(mce_recoverable_t cbfunc)
120 {
121     mc_recoverable_scan = cbfunc;
122 }
123 
124 struct mca_banks *mcabanks_alloc(void)
125 {
126     struct mca_banks *mb;
127 
128     mb = xmalloc(struct mca_banks);
129     if ( !mb )
130         return NULL;
131 
132     mb->bank_map = xzalloc_array(unsigned long,
133                                  BITS_TO_LONGS(nr_mce_banks));
134     if ( !mb->bank_map )
135     {
136         xfree(mb);
137         return NULL;
138     }
139 
140     mb->num = nr_mce_banks;
141 
142     return mb;
143 }
144 
145 void mcabanks_free(struct mca_banks *banks)
146 {
147     if ( banks == NULL )
148         return;
149     if ( banks->bank_map )
150         xfree(banks->bank_map);
151     xfree(banks);
152 }
153 
154 static void mcabank_clear(int banknum)
155 {
156     uint64_t status;
157 
158     status = mca_rdmsr(MSR_IA32_MCx_STATUS(banknum));
159 
160     if ( status & MCi_STATUS_ADDRV )
161         mca_wrmsr(MSR_IA32_MCx_ADDR(banknum), 0x0ULL);
162     if ( status & MCi_STATUS_MISCV )
163         mca_wrmsr(MSR_IA32_MCx_MISC(banknum), 0x0ULL);
164 
165     mca_wrmsr(MSR_IA32_MCx_STATUS(banknum), 0x0ULL);
166 }
167 
168 /*
169  * Callback handler for judging whether a Machine Check error bank needs
170  * to be cleared.  According to the latest Intel MCA OS Recovery Writer's
171  * Guide, whether the error MCA bank needs to be cleared is decided by the
172  * mca_source and the MCi_STATUS bit values.
173  */
174 static mce_need_clearbank_t mc_need_clearbank_scan = NULL;
175 
176 void mce_need_clearbank_register(mce_need_clearbank_t cbfunc)
177 {
178     mc_need_clearbank_scan = cbfunc;
179 }
180 
181 /*
182  * mce_logout_lock should only be used in the trap handler,
183  * while MCIP has not been cleared yet in the global status
184  * register. Other use is not safe, since an MCE trap can
185  * happen at any moment, which would cause lock recursion.
186  */
187 static DEFINE_SPINLOCK(mce_logout_lock);
188 
189 const struct mca_error_handler *__read_mostly mce_dhandlers;
190 const struct mca_error_handler *__read_mostly mce_uhandlers;
191 unsigned int __read_mostly mce_dhandler_num;
192 unsigned int __read_mostly mce_uhandler_num;
193 
194 static void mca_init_bank(enum mca_source who, struct mc_info *mi, int bank)
195 {
196     struct mcinfo_bank *mib;
197 
198     if ( !mi )
199         return;
200 
201     mib = x86_mcinfo_reserve(mi, sizeof(*mib), MC_TYPE_BANK);
202     if ( !mib )
203     {
204         mi->flags |= MCINFO_FLAGS_UNCOMPLETE;
205         return;
206     }
207 
208     mib->mc_status = mca_rdmsr(MSR_IA32_MCx_STATUS(bank));
209 
210     mib->mc_bank = bank;
211     mib->mc_domid = DOMID_INVALID;
212 
213     if ( mib->mc_status & MCi_STATUS_MISCV )
214         mib->mc_misc = mca_rdmsr(MSR_IA32_MCx_MISC(bank));
215 
216     if ( mib->mc_status & MCi_STATUS_ADDRV )
217         mib->mc_addr = mca_rdmsr(MSR_IA32_MCx_ADDR(bank));
218 
219     if ( (mib->mc_status & MCi_STATUS_MISCV) &&
220          (mib->mc_status & MCi_STATUS_ADDRV) &&
221          (mc_check_addr(mib->mc_status, mib->mc_misc, MC_ADDR_PHYSICAL)) &&
222          (who == MCA_POLLER || who == MCA_CMCI_HANDLER) &&
223          (mfn_valid(_mfn(paddr_to_pfn(mib->mc_addr)))) )
224     {
225         struct domain *d;
226 
227         d = maddr_get_owner(mib->mc_addr);
228         if ( d )
229             mib->mc_domid = d->domain_id;
230     }
231 
232     if ( who == MCA_CMCI_HANDLER )
233     {
234         mib->mc_ctrl2 = mca_rdmsr(MSR_IA32_MC0_CTL2 + bank);
235         mib->mc_tsc = rdtsc();
236     }
237 }
238 
239 static int mca_init_global(uint32_t flags, struct mcinfo_global *mig)
240 {
241     uint64_t status;
242     int cpu_nr;
243     const struct vcpu *curr = current;
244 
245     /* Set global information */
246     status = mca_rdmsr(MSR_IA32_MCG_STATUS);
247     mig->mc_gstatus = status;
248     mig->mc_domid = DOMID_INVALID;
249     mig->mc_vcpuid = XEN_MC_VCPUID_INVALID;
250     mig->mc_flags = flags;
251     cpu_nr = smp_processor_id();
252     /* Retrieve detector information */
253     x86_mc_get_cpu_info(cpu_nr, &mig->mc_socketid,
254                         &mig->mc_coreid, &mig->mc_core_threadid,
255                         &mig->mc_apicid, NULL, NULL, NULL);
256 
257     if ( curr != INVALID_VCPU )
258     {
259         mig->mc_domid = curr->domain->domain_id;
260         mig->mc_vcpuid = curr->vcpu_id;
261     }
262 
263     return 0;
264 }
265 
266 /*
267  * Utility function to perform MCA bank telemetry readout and to push that
268  * telemetry towards an interested dom0 for logging and diagnosis.
269  * The caller - #MC handler or MCA poll function - must arrange that we
270  * do not migrate cpus.
271  */
272 
273 /* XXFM Could add overflow counting? */
274 
275 /*
276  * The out-parameter clear_bank is filled in for the Machine Check handler caller.
277  * On the latest Intel CPUs, whether to clear the error bank status needs to
278  * be judged by the callback function defined above.
279  */
280 mctelem_cookie_t
281 mcheck_mca_logout(enum mca_source who, struct mca_banks *bankmask,
282                   struct mca_summary *sp, struct mca_banks *clear_bank)
283 {
284     uint64_t gstatus, status;
285     struct mcinfo_global *mig = NULL; /* on stack */
286     mctelem_cookie_t mctc = NULL;
287     bool uc = false, pcc = false, recover = true, need_clear = true;
288     uint32_t mc_flags = 0;
289     struct mc_info *mci = NULL;
290     mctelem_class_t which = MC_URGENT; /* XXXgcc */
291     int errcnt = 0;
292     int i;
293 
294     gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
295     switch ( who )
296     {
297     case MCA_MCE_SCAN:
298         mc_flags = MC_FLAG_MCE;
299         which = MC_URGENT;
300         break;
301 
302     case MCA_POLLER:
303     case MCA_RESET:
304         mc_flags = MC_FLAG_POLLED;
305         which = MC_NONURGENT;
306         break;
307 
308     case MCA_CMCI_HANDLER:
309         mc_flags = MC_FLAG_CMCI;
310         which = MC_NONURGENT;
311         break;
312 
313     default:
314         BUG();
315     }
316 
317     /*
318      * If no mc_recoverable_scan callback handler is registered,
319      * this error is not recoverable.
320      */
321     recover = mc_recoverable_scan ? 1 : 0;
322 
323     for ( i = 0; i < nr_mce_banks; i++ )
324     {
325         /* Skip bank if corresponding bit in bankmask is clear */
326         if ( !mcabanks_test(i, bankmask) )
327             continue;
328 
329         status = mca_rdmsr(MSR_IA32_MCx_STATUS(i));
330         if ( !(status & MCi_STATUS_VAL) )
331             continue; /* this bank has no valid telemetry */
332 
333         /*
334          * For the latest Intel CPUs, the CMCI/MCE handler caller needs to
335          * decide whether to clear the bank from MCi_STATUS bit values such
336          * as OVER/UC/EN/PCC/S/AR.
337          */
338         if ( mc_need_clearbank_scan )
339             need_clear = mc_need_clearbank_scan(who, status);
340 
341         /*
342          * If this is the first bank with valid MCA DATA, then
343          * try to reserve an entry from the urgent/nonurgent queue
344          * depending on whether we are called from an exception or
345          * a poller;  this can fail (for example dom0 may not
346          * yet have consumed past telemetry).
347          */
348         if ( errcnt++ == 0 )
349         {
350             mctc = mctelem_reserve(which);
351             if ( mctc )
352             {
353                 mci = mctelem_dataptr(mctc);
354                 mcinfo_clear(mci);
355                 mig = x86_mcinfo_reserve(mci, sizeof(*mig), MC_TYPE_GLOBAL);
356                 /* mc_info should at least hold the global information */
357                 ASSERT(mig);
358                 mca_init_global(mc_flags, mig);
359                 /* A hook here to get global extended msrs */
360                 if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
361                     intel_get_extended_msrs(mig, mci);
362             }
363         }
364 
365         /* flag for uncorrected errors */
366         if ( !uc && ((status & MCi_STATUS_UC) != 0) )
367             uc = true;
368 
369         /* flag processor context corrupt */
370         if ( !pcc && ((status & MCi_STATUS_PCC) != 0) )
371             pcc = true;
372 
373         if ( recover && uc )
374             /* uc = true, recover = true, we need not panic. */
375             recover = mc_recoverable_scan(status);
376 
377         mca_init_bank(who, mci, i);
378 
379         if ( mc_callback_bank_extended )
380             mc_callback_bank_extended(mci, i, status);
381 
382         /* By default, need_clear = true */
383         if ( who != MCA_MCE_SCAN && need_clear )
384             /* Clear bank */
385             mcabank_clear(i);
386         else if ( who == MCA_MCE_SCAN && need_clear )
387             mcabanks_set(i, clear_bank);
388 
389         wmb();
390     }
391 
392     if ( mig && errcnt > 0 )
393     {
394         if ( pcc )
395             mig->mc_flags |= MC_FLAG_UNCORRECTABLE;
396         else if ( uc )
397             mig->mc_flags |= MC_FLAG_RECOVERABLE;
398         else
399             mig->mc_flags |= MC_FLAG_CORRECTABLE;
400     }
401 
402     if ( sp )
403     {
404         sp->errcnt = errcnt;
405         sp->ripv = (gstatus & MCG_STATUS_RIPV) != 0;
406         sp->eipv = (gstatus & MCG_STATUS_EIPV) != 0;
407         sp->lmce = (gstatus & MCG_STATUS_LMCE) != 0;
408         sp->uc = uc;
409         sp->pcc = pcc;
410         sp->recoverable = recover;
411     }
412 
413     return mci != NULL ? mctc : NULL; /* may be NULL */
414 }
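/*
 * Typical use of mcheck_mca_logout() (see mcheck_cmn_handler() below):
 * the returned cookie, if any, is either committed/deferred for dom0
 * consumption via mctelem_commit()/mctelem_defer(), or dropped again
 * with mctelem_dismiss() when the telemetry is not wanted.
 */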
415 
416 static void mce_spin_lock(spinlock_t *lk)
417 {
418     while ( !spin_trylock(lk) )
419     {
420         cpu_relax();
421         mce_panic_check();
422     }
423 }
424 
425 static void mce_spin_unlock(spinlock_t *lk)
426 {
427     spin_unlock(lk);
428 }
429 
430 static enum mce_result mce_action(const struct cpu_user_regs *regs,
431                                   mctelem_cookie_t mctc);
432 
433 /*
434  * Return:
435  * -1: if system can't be recovered
436  * 0: Continue to next step
437  */
438 static int mce_urgent_action(const struct cpu_user_regs *regs,
439                              mctelem_cookie_t mctc)
440 {
441     uint64_t gstatus;
442 
443     if ( mctc == NULL )
444         return 0;
445 
446     gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
447 
448     /*
449      * FIXME: When RIPV = EIPV = 0, it's a little bit tricky. It may be an
450      * asynchronous error, and currently we have no way to precisely locate
451      * whether the error occurred in the guest or in the hypervisor.
452      * To avoid handling the error in the wrong way, treat it as unrecoverable.
453      *
454      * Another unrecoverable case is RIPV = 0 while in the hypervisor,
455      * since Xen is not preemptible.
456      */
457     if ( !(gstatus & MCG_STATUS_RIPV) &&
458          (!(gstatus & MCG_STATUS_EIPV) || !guest_mode(regs)) )
459         return -1;
460 
461     return mce_action(regs, mctc) == MCER_RESET ? -1 : 0;
462 }
463 
464 /* Shared #MC handler. */
465 void mcheck_cmn_handler(const struct cpu_user_regs *regs)
466 {
467     static DEFINE_MCE_BARRIER(mce_trap_bar);
468     static atomic_t severity_cpu = ATOMIC_INIT(-1);
469     static atomic_t found_error = ATOMIC_INIT(0);
470     static cpumask_t mce_fatal_cpus;
471     struct mca_banks *bankmask = mca_allbanks;
472     struct mca_banks *clear_bank = __get_cpu_var(mce_clear_banks);
473     uint64_t gstatus;
474     mctelem_cookie_t mctc = NULL;
475     struct mca_summary bs;
476     bool bcast, lmce;
477 
478     mce_spin_lock(&mce_logout_lock);
479 
480     if ( clear_bank != NULL )
481         memset(clear_bank->bank_map, 0x0,
482                sizeof(long) * BITS_TO_LONGS(clear_bank->num));
483     mctc = mcheck_mca_logout(MCA_MCE_SCAN, bankmask, &bs, clear_bank);
484     lmce = bs.lmce;
485     bcast = mce_broadcast && !lmce;
486 
487     if ( bs.errcnt )
488     {
489         /*
490          * Uncorrected errors must be dealt with in softirq context.
491          */
492         if ( bs.uc || bs.pcc )
493         {
494             add_taint(TAINT_MACHINE_CHECK);
495             if ( mctc )
496                 mctelem_defer(mctc, lmce);
497             /*
498              * When PCC = 1 or the error can't be recovered, context is lost, so
499              * reboot now without clearing the banks, and deal with
500              * the telemetry after reboot (the MSRs are sticky)
501              */
502             if ( bs.pcc || !bs.recoverable )
503                 cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
504         }
505         else if ( mctc != NULL )
506             mctelem_commit(mctc);
507         atomic_set(&found_error, 1);
508 
509         /* The last CPU to get here will take care of the check/clean-up etc. */
510         atomic_set(&severity_cpu, smp_processor_id());
511 
512         mce_printk(MCE_CRITICAL, "MCE: clear_bank map %lx on CPU%d\n",
513                    *((unsigned long *)clear_bank), smp_processor_id());
514         if ( clear_bank != NULL )
515             mcheck_mca_clearbanks(clear_bank);
516     }
517     else if ( mctc != NULL )
518         mctelem_dismiss(mctc);
519     mce_spin_unlock(&mce_logout_lock);
520 
521     mce_barrier_enter(&mce_trap_bar, bcast);
522     if ( mctc != NULL && mce_urgent_action(regs, mctc) )
523         cpumask_set_cpu(smp_processor_id(), &mce_fatal_cpus);
524     mce_barrier_exit(&mce_trap_bar, bcast);
525 
526     /*
527      * Wait until everybody has processed the trap.
528      */
529     mce_barrier_enter(&mce_trap_bar, bcast);
530     if ( lmce || atomic_read(&severity_cpu) == smp_processor_id() )
531     {
532         /*
533          * According to the SDM, if no error bank is found on any CPU,
534          * something unexpected has happened and we can't do any
535          * recovery work other than resetting the system.
536          */
537         if ( atomic_read(&found_error) == 0 )
538             mc_panic("MCE: No CPU found valid MCE, need reset");
539         if ( !cpumask_empty(&mce_fatal_cpus) )
540         {
541             char *ebufp, ebuf[96] = "MCE: Fatal error happened on CPUs ";
542             ebufp = ebuf + strlen(ebuf);
543             cpumask_scnprintf(ebufp, 95 - strlen(ebuf), &mce_fatal_cpus);
544             mc_panic(ebuf);
545         }
546         atomic_set(&found_error, 0);
547         atomic_set(&severity_cpu, -1);
548     }
549     mce_barrier_exit(&mce_trap_bar, bcast);
550 
551     /* Clear flags after above fatal check */
552     mce_barrier_enter(&mce_trap_bar, bcast);
553     gstatus = mca_rdmsr(MSR_IA32_MCG_STATUS);
554     if ( (gstatus & MCG_STATUS_MCIP) != 0 )
555     {
556         mce_printk(MCE_CRITICAL, "MCE: Clear MCIP@ last step");
557         mca_wrmsr(MSR_IA32_MCG_STATUS, 0);
558     }
559     mce_barrier_exit(&mce_trap_bar, bcast);
560 
561     raise_softirq(MACHINE_CHECK_SOFTIRQ);
562 }
563 
564 void mcheck_mca_clearbanks(struct mca_banks *bankmask)
565 {
566     int i;
567 
568     for ( i = 0; i < nr_mce_banks; i++ )
569     {
570         if ( !mcabanks_test(i, bankmask) )
571             continue;
572         mcabank_clear(i);
573     }
574 }
575 
576 /* Check for the existence of Machine Check support */
577 bool mce_available(const struct cpuinfo_x86 *c)
578 {
579     return cpu_has(c, X86_FEATURE_MCE) && cpu_has(c, X86_FEATURE_MCA);
580 }
581 
582 /*
583  * Check if bank 0 is usable for MCE. It isn't for Intel P6 family
584  * before model 0x1a.
585  */
586 unsigned int mce_firstbank(struct cpuinfo_x86 *c)
587 {
588     return c->x86 == 6 &&
589            c->x86_vendor == X86_VENDOR_INTEL && c->x86_model < 0x1a;
590 }
591 
592 int show_mca_info(int inited, struct cpuinfo_x86 *c)
593 {
594     static enum mcheck_type g_type = mcheck_unset;
595 
596     if ( inited != g_type )
597     {
598         char prefix[20];
599         static const char *const type_str[] = {
600             [mcheck_amd_famXX] = "AMD",
601             [mcheck_amd_k8] = "AMD K8",
602             [mcheck_intel] = "Intel"
603         };
604 
605         snprintf(prefix, ARRAY_SIZE(prefix), "%sCPU%u: ",
606                  g_type != mcheck_unset ? XENLOG_WARNING : XENLOG_INFO,
607                  smp_processor_id());
608         BUG_ON(inited >= ARRAY_SIZE(type_str));
609         switch ( inited )
610         {
611         default:
612             printk("%s%s machine check reporting enabled\n",
613                    prefix, type_str[inited]);
614             break;
615 
616         case mcheck_amd_famXX:
617             printk("%s%s Fam%xh machine check reporting enabled\n",
618                    prefix, type_str[inited], c->x86);
619             break;
620 
621         case mcheck_none:
622             printk("%sNo machine check initialization\n", prefix);
623             break;
624         }
625         g_type = inited;
626     }
627 
628     return 0;
629 }
630 
631 static void set_poll_bankmask(struct cpuinfo_x86 *c)
632 {
633     int cpu = smp_processor_id();
634     struct mca_banks *mb;
635 
636     mb = per_cpu(poll_bankmask, cpu);
637     BUG_ON(!mb);
638 
639     if ( cmci_support && opt_mce )
640     {
641         mb->num = per_cpu(no_cmci_banks, cpu)->num;
642         bitmap_copy(mb->bank_map, per_cpu(no_cmci_banks, cpu)->bank_map,
643                     nr_mce_banks);
644     }
645     else
646     {
647         bitmap_copy(mb->bank_map, mca_allbanks->bank_map, nr_mce_banks);
648         if ( mce_firstbank(c) )
649             mcabanks_clear(0, mb);
650     }
651 }
652 
653 /* The per-bank ctl/status init is platform-specific because of AMD's quirk */
654 int mca_cap_init(void)
655 {
656     uint64_t msr_content;
657 
658     rdmsrl(MSR_IA32_MCG_CAP, msr_content);
659 
660     if ( msr_content & MCG_CTL_P ) /* Control register present ? */
661         wrmsrl(MSR_IA32_MCG_CTL, 0xffffffffffffffffULL);
662 
663     if ( nr_mce_banks && (msr_content & MCG_CAP_COUNT) != nr_mce_banks )
664     {
665         dprintk(XENLOG_WARNING, "Different bank number on cpu %x\n",
666                 smp_processor_id());
667         return -ENODEV;
668     }
669     nr_mce_banks = msr_content & MCG_CAP_COUNT;
670 
671     if ( !nr_mce_banks )
672     {
673         printk(XENLOG_INFO "CPU%u: No MCE banks present. "
674                "Machine check support disabled\n", smp_processor_id());
675         return -ENODEV;
676     }
677 
678     /* mcabanks_alloc depends on nr_mce_banks */
679     if ( !mca_allbanks )
680     {
681         int i;
682 
683         mca_allbanks = mcabanks_alloc();
684         for ( i = 0; i < nr_mce_banks; i++ )
685             mcabanks_set(i, mca_allbanks);
686     }
687 
688     return mca_allbanks ? 0 : -ENOMEM;
689 }
690 
691 static void cpu_bank_free(unsigned int cpu)
692 {
693     struct mca_banks *poll = per_cpu(poll_bankmask, cpu);
694     struct mca_banks *clr = per_cpu(mce_clear_banks, cpu);
695 
696     mcabanks_free(poll);
697     mcabanks_free(clr);
698 }
699 
700 static int cpu_bank_alloc(unsigned int cpu)
701 {
702     struct mca_banks *poll = mcabanks_alloc();
703     struct mca_banks *clr = mcabanks_alloc();
704 
705     if ( !poll || !clr )
706     {
707         mcabanks_free(poll);
708         mcabanks_free(clr);
709         return -ENOMEM;
710     }
711 
712     per_cpu(poll_bankmask, cpu) = poll;
713     per_cpu(mce_clear_banks, cpu) = clr;
714     return 0;
715 }
716 
717 static int cpu_callback(
718     struct notifier_block *nfb, unsigned long action, void *hcpu)
719 {
720     unsigned int cpu = (unsigned long)hcpu;
721     int rc = 0;
722 
723     switch ( action )
724     {
725     case CPU_UP_PREPARE:
726         rc = cpu_bank_alloc(cpu);
727         break;
728 
729     case CPU_UP_CANCELED:
730     case CPU_DEAD:
731         cpu_bank_free(cpu);
732         break;
733     }
734 
735     return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
736 }
737 
738 static struct notifier_block cpu_nfb = {
739     .notifier_call = cpu_callback
740 };
741 
742 /* This has to be run for each processor */
743 void mcheck_init(struct cpuinfo_x86 *c, bool bsp)
744 {
745     enum mcheck_type inited = mcheck_none;
746 
747     if ( !opt_mce )
748     {
749         if ( bsp )
750             printk(XENLOG_INFO "MCE support disabled by bootparam\n");
751         return;
752     }
753 
754     if ( !mce_available(c) )
755     {
756         printk(XENLOG_INFO "CPU%i: No machine check support available\n",
757                smp_processor_id());
758         return;
759     }
760 
761     /* Hardware Enable */
762     if ( mca_cap_init() )
763         return;
764 
765     /* Early MCE initialisation for BSP. */
766     if ( bsp && cpu_bank_alloc(smp_processor_id()) )
767         BUG();
768 
769     switch ( c->x86_vendor )
770     {
771     case X86_VENDOR_AMD:
772         inited = amd_mcheck_init(c);
773         break;
774 
775     case X86_VENDOR_INTEL:
776         switch ( c->x86 )
777         {
778         case 6:
779         case 15:
780             inited = intel_mcheck_init(c, bsp);
781             break;
782         }
783         break;
784 
785     default:
786         break;
787     }
788 
789     show_mca_info(inited, c);
790     if ( inited == mcheck_none || inited == mcheck_unset )
791         goto out;
792 
793     intpose_init();
794 
795     if ( bsp )
796     {
797         mctelem_init(sizeof(struct mc_info));
798         register_cpu_notifier(&cpu_nfb);
799     }
800 
801     /* Turn on MCE now */
802     set_in_cr4(X86_CR4_MCE);
803 
804     set_poll_bankmask(c);
805 
806     return;
807  out:
808     if ( bsp )
809     {
810         cpu_bank_free(smp_processor_id());
811         mcabanks_free(mca_allbanks);
812         mca_allbanks = NULL;
813     }
814 }
815 
816 static void mcinfo_clear(struct mc_info *mi)
817 {
818     memset(mi, 0, sizeof(struct mc_info));
819     x86_mcinfo_nentries(mi) = 0;
820 }
821 
822 void *x86_mcinfo_reserve(struct mc_info *mi,
823                          unsigned int size, unsigned int type)
824 {
825     int i;
826     unsigned long end1, end2;
827     struct mcinfo_common *mic_base, *mic_index;
828 
829     mic_index = mic_base = x86_mcinfo_first(mi);
830 
831     /* go to first free entry */
832     for ( i = 0; i < x86_mcinfo_nentries(mi); i++ )
833         mic_index = x86_mcinfo_next(mic_index);
834 
835     /* check if there is enough space */
836     end1 = (unsigned long)((uint8_t *)mic_base + sizeof(struct mc_info));
837     end2 = (unsigned long)((uint8_t *)mic_index + size);
838 
839     if ( end1 < end2 )
840     {
841         mce_printk(MCE_CRITICAL,
842                    "mcinfo_add: No space left in mc_info\n");
843         return NULL;
844     }
845 
846     /* there's enough space. add entry. */
847     x86_mcinfo_nentries(mi)++;
848 
849     memset(mic_index, 0, size);
850     mic_index->size = size;
851     mic_index->type = type;
852 
853     return mic_index;
854 }
855 
856 static void x86_mcinfo_apei_save(
857     struct mcinfo_global *mc_global, struct mcinfo_bank *mc_bank)
858 {
859     struct mce m;
860 
861     memset(&m, 0, sizeof(struct mce));
862 
863     m.cpu = mc_global->mc_coreid;
864     m.cpuvendor = boot_cpu_data.x86_vendor;
865     m.cpuid = cpuid_eax(1);
866     m.socketid = mc_global->mc_socketid;
867     m.apicid = mc_global->mc_apicid;
868 
869     m.mcgstatus = mc_global->mc_gstatus;
870     m.status = mc_bank->mc_status;
871     m.misc = mc_bank->mc_misc;
872     m.addr = mc_bank->mc_addr;
873     m.bank = mc_bank->mc_bank;
874 
875     apei_write_mce(&m);
876 }
877 
878 /*
879  * Dump machine check information in a format
880  * that mcelog can parse. This is used only when
881  * Dom0 does not take the notification.
882  */
883 void x86_mcinfo_dump(struct mc_info *mi)
884 {
885     struct mcinfo_common *mic = NULL;
886     struct mcinfo_global *mc_global;
887     struct mcinfo_bank *mc_bank;
888 
889     /* first print the global info */
890     x86_mcinfo_lookup(mic, mi, MC_TYPE_GLOBAL);
891     if ( mic == NULL )
892         return;
893     mc_global = (struct mcinfo_global *)mic;
894     if ( mc_global->mc_flags & MC_FLAG_MCE )
895         printk(XENLOG_WARNING
896                "CPU%d: Machine Check Exception: %16"PRIx64"\n",
897                mc_global->mc_coreid, mc_global->mc_gstatus);
898     else if ( mc_global->mc_flags & MC_FLAG_CMCI )
899         printk(XENLOG_WARNING "CMCI occurred on CPU %d.\n",
900                mc_global->mc_coreid);
901     else if ( mc_global->mc_flags & MC_FLAG_POLLED )
902         printk(XENLOG_WARNING "POLLED occurred on CPU %d.\n",
903                mc_global->mc_coreid);
904 
905     /* then the bank information */
906     x86_mcinfo_lookup(mic, mi, MC_TYPE_BANK); /* finds the first entry */
907     do {
908         if ( mic == NULL )
909             return;
910         if ( mic->type != MC_TYPE_BANK )
911             goto next;
912 
913         mc_bank = (struct mcinfo_bank *)mic;
914 
915         printk(XENLOG_WARNING "Bank %d: %16"PRIx64,
916                mc_bank->mc_bank,
917                mc_bank->mc_status);
918         if ( mc_bank->mc_status & MCi_STATUS_MISCV )
919             printk("[%16"PRIx64"]", mc_bank->mc_misc);
920         if ( mc_bank->mc_status & MCi_STATUS_ADDRV )
921             printk(" at %16"PRIx64, mc_bank->mc_addr);
922         printk("\n");
923 
924         if ( is_mc_panic )
925             x86_mcinfo_apei_save(mc_global, mc_bank);
926 
927  next:
928         mic = x86_mcinfo_next(mic); /* next entry */
929         if ( (mic == NULL) || (mic->size == 0) )
930             break;
931     } while ( 1 );
932 }
933 
934 static void do_mc_get_cpu_info(void *v)
935 {
936     int cpu = smp_processor_id();
937     int cindex, cpn;
938     struct cpuinfo_x86 *c;
939     xen_mc_logical_cpu_t *log_cpus, *xcp;
940     uint32_t junk, ebx;
941 
942     log_cpus = v;
943     c = &cpu_data[cpu];
944     cindex = 0;
945     cpn = cpu - 1;
946 
947     /*
948      * Deal with sparse masks, condensed into a contiguous array.
949      */
950     while ( cpn >= 0 )
951     {
952         if ( cpu_online(cpn) )
953             cindex++;
954         cpn--;
955     }
956 
957     xcp = &log_cpus[cindex];
958     c = &cpu_data[cpu];
959     xcp->mc_cpunr = cpu;
960     x86_mc_get_cpu_info(cpu, &xcp->mc_chipid,
961                         &xcp->mc_coreid, &xcp->mc_threadid,
962                         &xcp->mc_apicid, &xcp->mc_ncores,
963                         &xcp->mc_ncores_active, &xcp->mc_nthreads);
964     xcp->mc_cpuid_level = c->cpuid_level;
965     xcp->mc_family = c->x86;
966     xcp->mc_vendor = c->x86_vendor;
967     xcp->mc_model = c->x86_model;
968     xcp->mc_step = c->x86_mask;
969     xcp->mc_cache_size = c->x86_cache_size;
970     xcp->mc_cache_alignment = c->x86_cache_alignment;
971     memcpy(xcp->mc_vendorid, c->x86_vendor_id, sizeof xcp->mc_vendorid);
972     memcpy(xcp->mc_brandid, c->x86_model_id, sizeof xcp->mc_brandid);
973     memcpy(xcp->mc_cpu_caps, c->x86_capability, sizeof xcp->mc_cpu_caps);
974 
975     /*
976      * This part needs to run on the CPU itself.
977      */
978     xcp->mc_nmsrvals = __MC_NMSRS;
979     xcp->mc_msrvalues[0].reg = MSR_IA32_MCG_CAP;
980     rdmsrl(MSR_IA32_MCG_CAP, xcp->mc_msrvalues[0].value);
981 
982     if ( c->cpuid_level >= 1 )
983     {
984         cpuid(1, &junk, &ebx, &junk, &junk);
985         xcp->mc_clusterid = (ebx >> 24) & 0xff;
986     }
987     else
988         xcp->mc_clusterid = get_apic_id();
989 }
990 
991 void x86_mc_get_cpu_info(unsigned cpu, uint32_t *chipid, uint16_t *coreid,
992                          uint16_t *threadid, uint32_t *apicid,
993                          unsigned *ncores, unsigned *ncores_active,
994                          unsigned *nthreads)
995 {
996     struct cpuinfo_x86 *c;
997 
998     *apicid = cpu_physical_id(cpu);
999     c = &cpu_data[cpu];
1000     if ( c->apicid == BAD_APICID )
1001     {
1002         *chipid = cpu;
1003         *coreid = 0;
1004         *threadid = 0;
1005         if ( ncores != NULL )
1006             *ncores = 1;
1007         if ( ncores_active != NULL )
1008             *ncores_active = 1;
1009         if ( nthreads != NULL )
1010             *nthreads = 1;
1011     }
1012     else
1013     {
1014         *chipid = c->phys_proc_id;
1015         if ( c->x86_max_cores > 1 )
1016             *coreid = c->cpu_core_id;
1017         else
1018             *coreid = 0;
1019         *threadid = c->apicid & ((1 << (c->x86_num_siblings - 1)) - 1);
1020         if ( ncores != NULL )
1021             *ncores = c->x86_max_cores;
1022         if ( ncores_active != NULL )
1023             *ncores_active = c->booted_cores;
1024         if ( nthreads != NULL )
1025             *nthreads = c->x86_num_siblings;
1026     }
1027 }
1028 
1029 #define INTPOSE_NENT 50
1030 
1031 static struct intpose_ent {
1032     unsigned int cpu_nr;
1033     uint64_t msr;
1034     uint64_t val;
1035 } intpose_arr[INTPOSE_NENT];
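/*
 * MSR interposition table: intpose_add() records a (cpu, msr, value)
 * triple, intpose_lookup() returns the posed value for a matching
 * (cpu, msr) pair, and intpose_inval() drops an entry again.  A cpu_nr
 * of -1 marks a free slot.
 */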
1036 
1037 static void intpose_init(void)
1038 {
1039     static int done;
1040     int i;
1041 
1042     if ( done++ > 0 )
1043         return;
1044 
1045     for ( i = 0; i < INTPOSE_NENT; i++ )
1046         intpose_arr[i].cpu_nr = -1;
1047 
1048 }
1049 
1050 struct intpose_ent *intpose_lookup(unsigned int cpu_nr, uint64_t msr,
1051                                    uint64_t *valp)
1052 {
1053     int i;
1054 
1055     for ( i = 0; i < INTPOSE_NENT; i++ )
1056     {
1057         if ( intpose_arr[i].cpu_nr == cpu_nr && intpose_arr[i].msr == msr )
1058         {
1059             if ( valp != NULL )
1060                 *valp = intpose_arr[i].val;
1061             return &intpose_arr[i];
1062         }
1063     }
1064 
1065     return NULL;
1066 }
1067 
1068 static void intpose_add(unsigned int cpu_nr, uint64_t msr, uint64_t val)
1069 {
1070     struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
1071     int i;
1072 
1073     if ( ent )
1074     {
1075         ent->val = val;
1076         return;
1077     }
1078 
1079     for ( i = 0, ent = &intpose_arr[0]; i < INTPOSE_NENT; i++, ent++ )
1080     {
1081         if ( ent->cpu_nr == -1 )
1082         {
1083             ent->cpu_nr = cpu_nr;
1084             ent->msr = msr;
1085             ent->val = val;
1086             return;
1087         }
1088     }
1089 
1090     printk("intpose_add: interpose array full - request dropped\n");
1091 }
1092 
1093 bool intpose_inval(unsigned int cpu_nr, uint64_t msr)
1094 {
1095     struct intpose_ent *ent = intpose_lookup(cpu_nr, msr, NULL);
1096 
1097     if ( !ent )
1098         return false;
1099 
1100     ent->cpu_nr = -1;
1101     return true;
1102 }
1103 
1104 #define IS_MCA_BANKREG(r) \
1105     ((r) >= MSR_IA32_MC0_CTL && \
1106     (r) <= MSR_IA32_MCx_MISC(nr_mce_banks - 1) && \
1107     ((r) - MSR_IA32_MC0_CTL) % 4 != 0) /* excludes MCi_CTL */
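/*
 * Each MCA bank owns four consecutive MSRs starting at MSR_IA32_MC0_CTL:
 * MCi_CTL, MCi_STATUS, MCi_ADDR and MCi_MISC, hence the "% 4 != 0" test
 * above, which rejects writes to the MCi_CTL registers.
 */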
1108 
1109 static bool x86_mc_msrinject_verify(struct xen_mc_msrinject *mci)
1110 {
1111     struct cpuinfo_x86 *c;
1112     int i, errs = 0;
1113 
1114     c = &cpu_data[smp_processor_id()];
1115 
1116     for ( i = 0; i < mci->mcinj_count; i++ )
1117     {
1118         uint64_t reg = mci->mcinj_msr[i].reg;
1119         const char *reason = NULL;
1120 
1121         if ( IS_MCA_BANKREG(reg) )
1122         {
1123             if ( c->x86_vendor == X86_VENDOR_AMD )
1124             {
1125                 /*
1126                  * On AMD we can set MCi_STATUS_WREN in the
1127                  * HWCR MSR so that non-zero writes to bank
1128                  * MSRs do not #GP.  The injector in dom0
1129                  * should set that bit, but we detect when it
1130                  * is necessary and set it as a courtesy to
1131                  * avoid #GP in the hypervisor.
1132                  */
1133                 mci->mcinj_flags |=
1134                     _MC_MSRINJ_F_REQ_HWCR_WREN;
1135                 continue;
1136             }
1137             else
1138             {
1139                 /*
1140                  * No alternative but to interpose, so require
1141                  * that the injector request interposition explicitly.
1142                  */
1143                 if ( !(mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) )
1144                     reason = "must specify interposition";
1145             }
1146         }
1147         else
1148         {
1149             switch ( reg )
1150             {
1151             /* MSRs acceptable on all x86 cpus */
1152             case MSR_IA32_MCG_STATUS:
1153                 break;
1154 
1155             case MSR_F10_MC4_MISC1:
1156             case MSR_F10_MC4_MISC2:
1157             case MSR_F10_MC4_MISC3:
1158                 if ( c->x86_vendor != X86_VENDOR_AMD )
1159                     reason = "only supported on AMD";
1160                 else if ( c->x86 < 0x10 )
1161                     reason = "only supported on AMD Fam10h+";
1162                 break;
1163 
1164             /* MSRs that the HV will take care of */
1165             case MSR_K8_HWCR:
1166                 if ( c->x86_vendor == X86_VENDOR_AMD )
1167                     reason = "HV will operate HWCR";
1168                 else
1169                     reason = "only supported on AMD";
1170                 break;
1171 
1172             default:
1173                 reason = "not a recognized MCA MSR";
1174                 break;
1175             }
1176         }
1177 
1178         if ( reason != NULL )
1179         {
1180             printk("HV MSR INJECT ERROR: MSR %#Lx %s\n",
1181                    (unsigned long long)mci->mcinj_msr[i].reg, reason);
1182             errs++;
1183         }
1184     }
1185 
1186     return !errs;
1187 }
1188 
1189 static uint64_t x86_mc_hwcr_wren(void)
1190 {
1191     uint64_t old;
1192 
1193     rdmsrl(MSR_K8_HWCR, old);
1194 
1195     if ( !(old & K8_HWCR_MCi_STATUS_WREN) )
1196     {
1197         uint64_t new = old | K8_HWCR_MCi_STATUS_WREN;
1198         wrmsrl(MSR_K8_HWCR, new);
1199     }
1200 
1201     return old;
1202 }
1203 
1204 static void x86_mc_hwcr_wren_restore(uint64_t hwcr)
1205 {
1206     if ( !(hwcr & K8_HWCR_MCi_STATUS_WREN) )
1207         wrmsrl(MSR_K8_HWCR, hwcr);
1208 }
1209 
1210 static void x86_mc_msrinject(void *data)
1211 {
1212     struct xen_mc_msrinject *mci = data;
1213     struct mcinfo_msr *msr;
1214     uint64_t hwcr = 0;
1215     int intpose;
1216     int i;
1217 
1218     if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
1219         hwcr = x86_mc_hwcr_wren();
1220 
1221     intpose = (mci->mcinj_flags & MC_MSRINJ_F_INTERPOSE) != 0;
1222 
1223     for ( i = 0, msr = &mci->mcinj_msr[0]; i < mci->mcinj_count; i++, msr++ )
1224     {
1225         printk("HV MSR INJECT (%s) target %u actual %u MSR %#Lx <-- %#Lx\n",
1226                intpose ? "interpose" : "hardware",
1227                mci->mcinj_cpunr, smp_processor_id(),
1228                (unsigned long long)msr->reg,
1229                (unsigned long long)msr->value);
1230 
1231         if ( intpose )
1232             intpose_add(mci->mcinj_cpunr, msr->reg, msr->value);
1233         else
1234             wrmsrl(msr->reg, msr->value);
1235     }
1236 
1237     if ( mci->mcinj_flags & _MC_MSRINJ_F_REQ_HWCR_WREN )
1238         x86_mc_hwcr_wren_restore(hwcr);
1239 }
1240 
1241 /*ARGSUSED*/
1242 static void x86_mc_mceinject(void *data)
1243 {
1244     printk("Simulating #MC on cpu %d\n", smp_processor_id());
1245     __asm__ __volatile__("int $0x12");
1246 }
1247 
1248 #if BITS_PER_LONG == 64
1249 
1250 #define ID2COOKIE(id) ((mctelem_cookie_t)(id))
1251 #define COOKIE2ID(c) ((uint64_t)(c))
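/*
 * Telemetry cookies are opaque handles from the mctelem pool; on 64-bit
 * builds they round-trip unchanged through the 64-bit fetch_id field of
 * the XEN_MC_fetch interface, so the conversion macros are plain casts.
 */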
1252 
1253 #elif defined(BITS_PER_LONG)
1254 #error BITS_PER_LONG has unexpected value
1255 #else
1256 #error BITS_PER_LONG definition absent
1257 #endif
1258 
1259 # include <compat/arch-x86/xen-mca.h>
1260 
1261 # define xen_mcinfo_msr              mcinfo_msr
1262 CHECK_mcinfo_msr;
1263 # undef xen_mcinfo_msr
1264 # undef CHECK_mcinfo_msr
1265 # define CHECK_mcinfo_msr            struct mcinfo_msr
1266 
1267 # define xen_mcinfo_common           mcinfo_common
1268 CHECK_mcinfo_common;
1269 # undef xen_mcinfo_common
1270 # undef CHECK_mcinfo_common
1271 # define CHECK_mcinfo_common         struct mcinfo_common
1272 
1273 CHECK_FIELD_(struct, mc_fetch, flags);
1274 CHECK_FIELD_(struct, mc_fetch, fetch_id);
1275 # define CHECK_compat_mc_fetch       struct mc_fetch
1276 
1277 CHECK_FIELD_(struct, mc_physcpuinfo, ncpus);
1278 # define CHECK_compat_mc_physcpuinfo struct mc_physcpuinfo
1279 
1280 #define CHECK_compat_mc_inject_v2   struct mc_inject_v2
1281 CHECK_mc;
1282 # undef CHECK_compat_mc_fetch
1283 # undef CHECK_compat_mc_physcpuinfo
1284 
1285 # define xen_mc_info                 mc_info
1286 CHECK_mc_info;
1287 # undef xen_mc_info
1288 
1289 # define xen_mcinfo_global           mcinfo_global
1290 CHECK_mcinfo_global;
1291 # undef xen_mcinfo_global
1292 
1293 # define xen_mcinfo_bank             mcinfo_bank
1294 CHECK_mcinfo_bank;
1295 # undef xen_mcinfo_bank
1296 
1297 # define xen_mcinfo_extended         mcinfo_extended
1298 CHECK_mcinfo_extended;
1299 # undef xen_mcinfo_extended
1300 
1301 # define xen_mcinfo_recovery         mcinfo_recovery
1302 # define xen_cpu_offline_action      cpu_offline_action
1303 # define xen_page_offline_action     page_offline_action
1304 CHECK_mcinfo_recovery;
1305 # undef xen_cpu_offline_action
1306 # undef xen_page_offline_action
1307 # undef xen_mcinfo_recovery
1308 
1309 /* Machine Check Architecture Hypercall */
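/*
 * Sub-commands handled below:
 *   XEN_MC_fetch        - retrieve or ack queued telemetry,
 *   XEN_MC_physcpuinfo  - report physical CPU topology/MSR info,
 *   XEN_MC_msrinject    - write (or interpose) MCA MSRs on a target CPU,
 *   XEN_MC_mceinject    - raise #MC (int $0x12) on a target CPU,
 *   XEN_MC_inject_v2    - inject MCE/CMCI/LMCE on a cpumap of CPUs.
 */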
1310 long do_mca(XEN_GUEST_HANDLE_PARAM(xen_mc_t) u_xen_mc)
1311 {
1312     long ret = 0;
1313     struct xen_mc curop, *op = &curop;
1314     struct vcpu *v = current;
1315     union {
1316         struct xen_mc_fetch *nat;
1317         struct compat_mc_fetch *cmp;
1318     } mc_fetch;
1319     union {
1320         struct xen_mc_physcpuinfo *nat;
1321         struct compat_mc_physcpuinfo *cmp;
1322     } mc_physcpuinfo;
1323     uint32_t flags, cmdflags;
1324     int nlcpu;
1325     xen_mc_logical_cpu_t *log_cpus = NULL;
1326     mctelem_cookie_t mctc;
1327     mctelem_class_t which;
1328     unsigned int target;
1329     struct xen_mc_msrinject *mc_msrinject;
1330     struct xen_mc_mceinject *mc_mceinject;
1331 
1332     ret = xsm_do_mca(XSM_PRIV);
1333     if ( ret )
1334         return x86_mcerr("", ret);
1335 
1336     if ( copy_from_guest(op, u_xen_mc, 1) )
1337         return x86_mcerr("do_mca: failed copyin of xen_mc_t", -EFAULT);
1338 
1339     if ( op->interface_version != XEN_MCA_INTERFACE_VERSION )
1340         return x86_mcerr("do_mca: interface version mismatch", -EACCES);
1341 
1342     switch ( op->cmd )
1343     {
1344     case XEN_MC_fetch:
1345         mc_fetch.nat = &op->u.mc_fetch;
1346         cmdflags = mc_fetch.nat->flags;
1347 
1348         switch ( cmdflags & (XEN_MC_NONURGENT | XEN_MC_URGENT) )
1349         {
1350         case XEN_MC_NONURGENT:
1351             which = MC_NONURGENT;
1352             break;
1353 
1354         case XEN_MC_URGENT:
1355             which = MC_URGENT;
1356             break;
1357 
1358         default:
1359             return x86_mcerr("do_mca fetch: bad cmdflags", -EINVAL);
1360         }
1361 
1362         flags = XEN_MC_OK;
1363 
1364         if ( cmdflags & XEN_MC_ACK )
1365         {
1366             mctelem_cookie_t cookie = ID2COOKIE(mc_fetch.nat->fetch_id);
1367             mctelem_ack(which, cookie);
1368         }
1369         else
1370         {
1371             if ( !is_pv_32bit_vcpu(v)
1372                  ? guest_handle_is_null(mc_fetch.nat->data)
1373                  : compat_handle_is_null(mc_fetch.cmp->data) )
1374                 return x86_mcerr("do_mca fetch: guest buffer "
1375                                  "invalid", -EINVAL);
1376 
1377             mctc = mctelem_consume_oldest_begin(which);
1378             if ( mctc )
1379             {
1380                 struct mc_info *mcip = mctelem_dataptr(mctc);
1381                 if ( !is_pv_32bit_vcpu(v)
1382                      ? copy_to_guest(mc_fetch.nat->data, mcip, 1)
1383                      : copy_to_compat(mc_fetch.cmp->data, mcip, 1) )
1384                 {
1385                     ret = -EFAULT;
1386                     flags |= XEN_MC_FETCHFAILED;
1387                     mc_fetch.nat->fetch_id = 0;
1388                 }
1389                 else
1390                     mc_fetch.nat->fetch_id = COOKIE2ID(mctc);
1391                 mctelem_consume_oldest_end(mctc);
1392             }
1393             else
1394             {
1395                 /* There is no data */
1396                 flags |= XEN_MC_NODATA;
1397                 mc_fetch.nat->fetch_id = 0;
1398             }
1399 
1400             mc_fetch.nat->flags = flags;
1401             if ( copy_to_guest(u_xen_mc, op, 1) != 0 )
1402                 ret = -EFAULT;
1403         }
1404 
1405         break;
1406 
1407     case XEN_MC_notifydomain:
1408         return x86_mcerr("do_mca notify unsupported", -EINVAL);
1409 
1410     case XEN_MC_physcpuinfo:
1411         mc_physcpuinfo.nat = &op->u.mc_physcpuinfo;
1412         nlcpu = num_online_cpus();
1413 
1414         if ( !is_pv_32bit_vcpu(v)
1415              ? !guest_handle_is_null(mc_physcpuinfo.nat->info)
1416              : !compat_handle_is_null(mc_physcpuinfo.cmp->info) )
1417         {
1418             if ( mc_physcpuinfo.nat->ncpus <= 0 )
1419                 return x86_mcerr("do_mca cpuinfo: ncpus <= 0",
1420                                  -EINVAL);
1421             nlcpu = min(nlcpu, (int)mc_physcpuinfo.nat->ncpus);
1422             log_cpus = xmalloc_array(xen_mc_logical_cpu_t, nlcpu);
1423             if ( log_cpus == NULL )
1424                 return x86_mcerr("do_mca cpuinfo", -ENOMEM);
1425             on_each_cpu(do_mc_get_cpu_info, log_cpus, 1);
1426             if ( !is_pv_32bit_vcpu(v)
1427                  ? copy_to_guest(mc_physcpuinfo.nat->info, log_cpus, nlcpu)
1428                  : copy_to_compat(mc_physcpuinfo.cmp->info, log_cpus, nlcpu) )
1429                 ret = -EFAULT;
1430             xfree(log_cpus);
1431         }
1432 
1433         mc_physcpuinfo.nat->ncpus = nlcpu;
1434 
1435         if ( copy_to_guest(u_xen_mc, op, 1) )
1436             return x86_mcerr("do_mca cpuinfo", -EFAULT);
1437 
1438         break;
1439 
1440     case XEN_MC_msrinject:
1441         if ( nr_mce_banks == 0 )
1442             return x86_mcerr("do_mca inject", -ENODEV);
1443 
1444         mc_msrinject = &op->u.mc_msrinject;
1445         target = mc_msrinject->mcinj_cpunr;
1446 
1447         if ( target >= nr_cpu_ids )
1448             return x86_mcerr("do_mca inject: bad target", -EINVAL);
1449 
1450         if ( !cpu_online(target) )
1451             return x86_mcerr("do_mca inject: target offline",
1452                              -EINVAL);
1453 
1454         if ( mc_msrinject->mcinj_count == 0 )
1455             return 0;
1456 
1457         if ( mc_msrinject->mcinj_flags & MC_MSRINJ_F_GPADDR )
1458         {
1459             domid_t domid;
1460             struct domain *d;
1461             struct mcinfo_msr *msr;
1462             unsigned int i;
1463             paddr_t gaddr;
1464             unsigned long gfn, mfn;
1465             p2m_type_t t;
1466 
1467             domid = (mc_msrinject->mcinj_domid == DOMID_SELF) ?
1468                     current->domain->domain_id : mc_msrinject->mcinj_domid;
1469             if ( domid >= DOMID_FIRST_RESERVED )
1470                 return x86_mcerr("do_mca inject: incompatible flag "
1471                                  "MC_MSRINJ_F_GPADDR with domain %d",
1472                                  -EINVAL, domid);
1473 
1474             d = get_domain_by_id(domid);
1475             if ( d == NULL )
1476                 return x86_mcerr("do_mca inject: bad domain id %d",
1477                                  -EINVAL, domid);
1478 
1479             for ( i = 0, msr = &mc_msrinject->mcinj_msr[0];
1480                   i < mc_msrinject->mcinj_count;
1481                   i++, msr++ )
1482             {
1483                 gaddr = msr->value;
1484                 gfn = PFN_DOWN(gaddr);
1485                 mfn = mfn_x(get_gfn(d, gfn, &t));
1486 
1487                 if ( mfn == mfn_x(INVALID_MFN) )
1488                 {
1489                     put_gfn(d, gfn);
1490                     put_domain(d);
1491                     return x86_mcerr("do_mca inject: bad gfn %#lx of domain %d",
1492                                      -EINVAL, gfn, domid);
1493                 }
1494 
1495                 msr->value = pfn_to_paddr(mfn) | (gaddr & (PAGE_SIZE - 1));
1496 
1497                 put_gfn(d, gfn);
1498             }
1499 
1500             put_domain(d);
1501         }
1502 
1503         if ( !x86_mc_msrinject_verify(mc_msrinject) )
1504             return x86_mcerr("do_mca inject: illegal MSR", -EINVAL);
1505 
1506         add_taint(TAINT_ERROR_INJECT);
1507 
1508         on_selected_cpus(cpumask_of(target), x86_mc_msrinject,
1509                          mc_msrinject, 1);
1510 
1511         break;
1512 
1513     case XEN_MC_mceinject:
1514         if ( nr_mce_banks == 0 )
1515             return x86_mcerr("do_mca #MC", -ENODEV);
1516 
1517         mc_mceinject = &op->u.mc_mceinject;
1518         target = mc_mceinject->mceinj_cpunr;
1519 
1520         if ( target >= nr_cpu_ids )
1521             return x86_mcerr("do_mca #MC: bad target", -EINVAL);
1522 
1523         if ( !cpu_online(target) )
1524             return x86_mcerr("do_mca #MC: target offline", -EINVAL);
1525 
1526         add_taint(TAINT_ERROR_INJECT);
1527 
1528         if ( mce_broadcast )
1529             on_each_cpu(x86_mc_mceinject, mc_mceinject, 1);
1530         else
1531             on_selected_cpus(cpumask_of(target), x86_mc_mceinject,
1532                              mc_mceinject, 1);
1533         break;
1534 
1535     case XEN_MC_inject_v2:
1536     {
1537         const cpumask_t *cpumap;
1538         cpumask_var_t cmv;
1539         bool broadcast = op->u.mc_inject_v2.flags & XEN_MC_INJECT_CPU_BROADCAST;
1540 
1541         if ( nr_mce_banks == 0 )
1542             return x86_mcerr("do_mca #MC", -ENODEV);
1543 
1544         if ( broadcast )
1545             cpumap = &cpu_online_map;
1546         else
1547         {
1548             ret = xenctl_bitmap_to_cpumask(&cmv, &op->u.mc_inject_v2.cpumap);
1549             if ( ret )
1550                 break;
1551             cpumap = cmv;
1552             if ( !cpumask_intersects(cpumap, &cpu_online_map) )
1553             {
1554                 free_cpumask_var(cmv);
1555                 ret = x86_mcerr("No online CPU passed\n", -EINVAL);
1556                 break;
1557             }
1558             if ( !cpumask_subset(cpumap, &cpu_online_map) )
1559                 dprintk(XENLOG_INFO,
1560                         "Not all required CPUs are online\n");
1561         }
1562 
1563         switch ( op->u.mc_inject_v2.flags & XEN_MC_INJECT_TYPE_MASK )
1564         {
1565         case XEN_MC_INJECT_TYPE_MCE:
1566             if ( mce_broadcast &&
1567                  !cpumask_equal(cpumap, &cpu_online_map) )
1568                 printk("Not trigger MCE on all CPUs, may HANG!\n");
1569             on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
1570             break;
1571 
1572         case XEN_MC_INJECT_TYPE_CMCI:
1573             if ( !cmci_apic_vector )
1574                 ret = x86_mcerr("No CMCI supported in platform\n", -EINVAL);
1575             else
1576             {
1577                 if ( cpumask_test_cpu(smp_processor_id(), cpumap) )
1578                     send_IPI_self(cmci_apic_vector);
1579                 send_IPI_mask(cpumap, cmci_apic_vector);
1580             }
1581             break;
1582 
1583         case XEN_MC_INJECT_TYPE_LMCE:
1584             if ( !lmce_support )
1585             {
1586                 ret = x86_mcerr("No LMCE support", -EINVAL);
1587                 break;
1588             }
1589             if ( broadcast )
1590             {
1591                 ret = x86_mcerr("Broadcast cannot be used with LMCE", -EINVAL);
1592                 break;
1593             }
1594             /* Ensure at most one CPU is specified. */
1595             if ( nr_cpu_ids > cpumask_next(cpumask_first(cpumap), cpumap) )
1596             {
1597                 ret = x86_mcerr("More than one CPU specified for LMCE",
1598                                 -EINVAL);
1599                 break;
1600             }
1601             on_selected_cpus(cpumap, x86_mc_mceinject, NULL, 1);
1602             break;
1603 
1604         default:
1605             ret = x86_mcerr("Wrong mca type\n", -EINVAL);
1606             break;
1607         }
1608 
1609         if ( cpumap != &cpu_online_map )
1610             free_cpumask_var(cmv);
1611 
1612         break;
1613     }
1614 
1615     default:
1616         return x86_mcerr("do_mca: bad command", -EINVAL);
1617     }
1618 
1619     return ret;
1620 }
1621 
1622 int mcinfo_dumpped;
1623 static int x86_mcinfo_dump_panic(mctelem_cookie_t mctc)
1624 {
1625     struct mc_info *mcip = mctelem_dataptr(mctc);
1626 
1627     x86_mcinfo_dump(mcip);
1628     mcinfo_dumpped++;
1629 
1630     return 0;
1631 }
1632 
1633 /* XXX shall we dump committed mc_info?? */
1634 static void mc_panic_dump(void)
1635 {
1636     int cpu;
1637 
1638     dprintk(XENLOG_ERR, "Begin dump mc_info\n");
1639     for_each_online_cpu(cpu)
1640         mctelem_process_deferred(cpu, x86_mcinfo_dump_panic,
1641                                  mctelem_has_deferred_lmce(cpu));
1642     dprintk(XENLOG_ERR, "End dump mc_info, %x mcinfo dumped\n", mcinfo_dumpped);
1643 }
1644 
1645 void mc_panic(char *s)
1646 {
1647     is_mc_panic = true;
1648     console_force_unlock();
1649 
1650     printk("Fatal machine check: %s\n", s);
1651     printk("\n"
1652            "****************************************\n"
1653            "\n"
1654            "   The processor has reported a hardware error which cannot\n"
1655            "   be recovered from.  Xen will now reboot the machine.\n");
1656     mc_panic_dump();
1657     panic("HARDWARE ERROR");
1658 }
1659 
1660 /*
1661  * Machine Check owner judge algorithm:
1662  * When an error happens, all CPUs serially read their MSR banks.
1663  * The first CPU that fetches the error bank's info will clear
1664  * this bank; later readers can't get any information from it.
1665  * That first CPU is the actual mce_owner.
1666  *
1667  * A fatal (pcc = 1) error might crash the machine before we're
1668  * able to log it. To avoid losing the log, we adopt a two-round
1669  * scan:
1670  * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
1671  * All MCE banks are sticky, so after boot the MCE polling mechanism
1672  * will help to collect and log those MCE errors.
1673  * Round 2: do all MCE processing logic as normal.
1674  */
1675 
1676 /* Maybe called in MCE context, no lock, no printk */
1677 static enum mce_result mce_action(const struct cpu_user_regs *regs,
1678                                   mctelem_cookie_t mctc)
1679 {
1680     struct mc_info *local_mi;
1681     enum mce_result bank_result = MCER_NOERROR;
1682     enum mce_result worst_result = MCER_NOERROR;
1683     struct mcinfo_common *mic = NULL;
1684     struct mca_binfo binfo;
1685     const struct mca_error_handler *handlers = mce_dhandlers;
1686     unsigned int i, handler_num = mce_dhandler_num;
1687 
1688     /* When in mce context, regs is valid */
1689     if ( regs )
1690     {
1691         handler_num = mce_uhandler_num;
1692         handlers = mce_uhandlers;
1693     }
1694 
1695     local_mi = (struct mc_info *)mctelem_dataptr(mctc);
1696     x86_mcinfo_lookup(mic, local_mi, MC_TYPE_GLOBAL);
1697     if ( mic == NULL )
1698     {
1699         printk(KERN_ERR "MCE: get local buffer entry failed\n ");
1700         return MCER_CONTINUE;
1701     }
1702 
1703     memset(&binfo, 0, sizeof(binfo));
1704     binfo.mig = (struct mcinfo_global *)mic;
1705     binfo.mi = local_mi;
1706 
1707     /* Processing bank information */
1708     x86_mcinfo_lookup(mic, local_mi, MC_TYPE_BANK);
1709 
1710     for ( ; bank_result != MCER_RESET && mic && mic->size;
1711           mic = x86_mcinfo_next(mic) )
1712     {
1713         if ( mic->type != MC_TYPE_BANK )
1714         {
1715             continue;
1716         }
1717         binfo.mib = (struct mcinfo_bank *)mic;
1718         binfo.bank = binfo.mib->mc_bank;
1719         bank_result = MCER_NOERROR;
1720         for ( i = 0; i < handler_num; i++ )
1721         {
1722             if ( handlers[i].owned_error(binfo.mib->mc_status) )
1723             {
1724                 handlers[i].recovery_handler(&binfo, &bank_result, regs);
1725                 if ( worst_result < bank_result )
1726                     worst_result = bank_result;
1727                 break;
1728             }
1729         }
1730     }
1731 
1732     return worst_result;
1733 }
1734 
1735 /*
1736  * Called from mctelem_process_deferred. Return 1 if the telemetry
1737  * should be committed for dom0 consumption, 0 if it should be
1738  * dismissed.
1739  */
1740 static int mce_delayed_action(mctelem_cookie_t mctc)
1741 {
1742     enum mce_result result;
1743     int ret = 0;
1744 
1745     result = mce_action(NULL, mctc);
1746 
1747     switch ( result )
1748     {
1749     case MCER_RESET:
1750         dprintk(XENLOG_ERR, "MCE delayed action failed\n");
1751         is_mc_panic = true;
1752         x86_mcinfo_dump(mctelem_dataptr(mctc));
1753         panic("MCE: Software recovery failed for the UCR");
1754         break;
1755 
1756     case MCER_RECOVERED:
1757         dprintk(XENLOG_INFO, "MCE: Error is successfully recovered\n");
1758         ret = 1;
1759         break;
1760 
1761     case MCER_CONTINUE:
1762         dprintk(XENLOG_INFO, "MCE: Error can't be recovered, "
1763                 "system is tainted\n");
1764         x86_mcinfo_dump(mctelem_dataptr(mctc));
1765         ret = 1;
1766         break;
1767 
1768     default:
1769         ret = 0;
1770         break;
1771     }
1772     return ret;
1773 }
1774 
1775 /* Softirq Handler for this MCE# processing */
1776 static void mce_softirq(void)
1777 {
1778     static DEFINE_MCE_BARRIER(mce_inside_bar);
1779     static DEFINE_MCE_BARRIER(mce_severity_bar);
1780     static atomic_t severity_cpu;
1781     int cpu = smp_processor_id();
1782     unsigned int workcpu;
1783     bool lmce = mctelem_has_deferred_lmce(cpu);
1784     bool bcast = mce_broadcast && !lmce;
1785 
1786     mce_printk(MCE_VERBOSE, "CPU%d enter softirq\n", cpu);
1787 
1788     mce_barrier_enter(&mce_inside_bar, bcast);
1789 
1790     if ( !lmce )
1791     {
1792         /*
1793          * Everybody is here. Now let's see who gets to do the
1794          * recovery work. Right now we just see if there's a CPU
1795          * that did not have any problems, and pick that one.
1796          *
1797          * First, just set a default value: the last CPU who reaches this
1798          * will overwrite the value and become the default.
1799          */
1800 
1801         atomic_set(&severity_cpu, cpu);
1802 
1803         mce_barrier_enter(&mce_severity_bar, bcast);
1804         if ( !mctelem_has_deferred(cpu) )
1805             atomic_set(&severity_cpu, cpu);
1806         mce_barrier_exit(&mce_severity_bar, bcast);
1807     }
1808 
1809     /* We choose severity_cpu for further processing */
1810     if ( lmce || atomic_read(&severity_cpu) == cpu )
1811     {
1812 
1813         mce_printk(MCE_VERBOSE, "CPU%d handling errors\n", cpu);
1814 
1815         /*
1816          * Step1: Fill DOM0 LOG buffer, vMCE injection buffer and
1817          * vMCE MSRs virtualization buffer
1818          */
1819 
1820         if ( lmce )
1821             mctelem_process_deferred(cpu, mce_delayed_action, true);
1822         else
1823             for_each_online_cpu(workcpu)
1824                 mctelem_process_deferred(workcpu, mce_delayed_action, false);
1825 
1826         /* Step2: Send Log to DOM0 through vIRQ */
1827         if ( dom0_vmce_enabled() )
1828         {
1829             mce_printk(MCE_VERBOSE, "MCE: send MCE# to DOM0 through virq\n");
1830             send_global_virq(VIRQ_MCA);
1831         }
1832     }
1833 
1834     mce_barrier_exit(&mce_inside_bar, bcast);
1835 }
1836 
1837 /*
1838  * Machine Check owner judge algorithm:
1839  * When an error happens, all CPUs serially read their MSR banks.
1840  * The first CPU that fetches the error bank's info will clear
1841  * this bank; later readers can't get any information from it.
1842  * That first CPU is the actual mce_owner.
1843  *
1844  * A fatal (pcc = 1) error might crash the machine before we're
1845  * able to log it. To avoid losing the log, we adopt a two-round
1846  * scan:
1847  * Round 1: simply scan. If pcc = 1 or ripv = 0 is found, simply reset.
1848  * All MCE banks are sticky, so after boot the MCE polling mechanism
1849  * will help to collect and log those MCE errors.
1850  * Round 2: do all MCE processing logic as normal.
1851  */
1852 void mce_handler_init(void)
1853 {
1854     if ( smp_processor_id() != 0 )
1855         return;
1856 
1857     /* callback register; do we really need so many callbacks? */
1858     /* mce handler data initialization */
1859     spin_lock_init(&mce_logout_lock);
1860     open_softirq(MACHINE_CHECK_SOFTIRQ, mce_softirq);
1861 }
1862