/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vmce.c - provide software emulated vMCE support to guest
 *
 * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com>
 * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com>
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <asm/hvm/save.h>
#include <asm/processor.h>
#include <public/hvm/params.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/pv/traps.h>

#include "mce.h"
#include "x86_mca.h"
#include "vmce.h"

/*
 * MCG_SER_P:  software error recovery supported
 * MCG_TES_P:  avoid treating MCi_STATUS bits 56:53 as model specific
 * MCG_CMCI_P: expose the CMCI capability but never actually inject CMCI
 *             into the guest, for the sake of performance, since the
 *             guest does not poll periodically
 */
#define INTEL_GUEST_MCG_CAP (MCG_SER_P |  \
                             MCG_TES_P |  \
                             MCG_CMCI_P | \
                             GUEST_MC_BANK_NUM)

#define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM
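/*
 * Note: the low bits of MCG_CAP hold the bank count, which is why
 * GUEST_MC_BANK_NUM is OR'ed into both capability values above.
 */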

void vmce_init_vcpu(struct vcpu *v)
{
    int i;

    /* global MCA MSRs init */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP;
    else
        v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP;

    v->arch.vmce.mcg_status = 0;

    /* per-bank MCA MSRs init */
    for ( i = 0; i < GUEST_MC_BANK_NUM; i++ )
        memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank));

    spin_lock_init(&v->arch.vmce.lock);
}

int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt)
{
    unsigned long guest_mcg_cap;

    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        guest_mcg_cap = INTEL_GUEST_MCG_CAP | MCG_LMCE_P;
    else
        guest_mcg_cap = AMD_GUEST_MCG_CAP;

    if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
    {
        printk(XENLOG_G_ERR
               "%s restore: unsupported MCA capabilities %#"PRIx64" for %pv (supported: %#Lx)\n",
               is_hvm_vcpu(v) ? "HVM" : "PV", ctxt->caps,
               v, guest_mcg_cap & ~MCG_CAP_COUNT);
        return -EINVAL;
    }

    v->arch.vmce.mcg_cap = ctxt->caps;
    v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0;
    v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1;
    v->arch.vmce.mcg_ext_ctl = ctxt->mcg_ext_ctl;

    return 0;
}

/*
 * For historical reasons the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;
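    /*
     * Each bank exposes four consecutive MSRs (CTL, STATUS, ADDR, MISC)
     * starting at MSR_IA32_MC0_CTL, hence the division by four above.
     * Masking with (-MSR_IA32_MC0_CTL | 3) below maps any bank's MSR onto
     * the corresponding MC0_* case label.
     */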

    *val = 0;

    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /* stick all 1's to MCi_CTL */
        *val = ~0UL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_CTL %#"PRIx64"\n",
                   v, bank, *val);
        break;

    case MSR_IA32_MC0_STATUS:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_status;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_STATUS %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_ADDR:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_addr;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_ADDR %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_MISC:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_misc;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_MISC %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
#ifdef CONFIG_INTEL
        case X86_VENDOR_CENTAUR:
        case X86_VENDOR_SHANGHAI:
        case X86_VENDOR_INTEL:
            ret = vmce_intel_rdmsr(v, msr, val);
            break;
#endif

#ifdef CONFIG_AMD
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_rdmsr(v, msr, val);
            break;
#endif

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_rdmsr(uint32_t msr, uint64_t *val)
{
    struct vcpu *cur = current;
    int ret = 1;

    *val = 0;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_STATUS:
        *val = cur->arch.vmce.mcg_status;
        if ( *val )
            mce_printk(MCE_VERBOSE,
                       "MCE: %pv: rd MCG_STATUS %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CAP:
        *val = cur->arch.vmce.mcg_cap;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CAP %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CTL:
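        /*
         * When MCG_CTL_P is advertised, MCG_CTL reads as all ones; writes
         * are ignored (see vmce_wrmsr()), so the guest can never disable
         * any banks through it.
         */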
        if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
            *val = ~0ULL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CTL %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        /*
         * If MCG_LMCE_P is present in the guest MSR_IA32_MCG_CAP, the LMCE
         * and LOCK bits are always set in the guest MSR_IA32_FEATURE_CONTROL
         * by Xen, so they need not be checked here.
         */
        if ( vmce_has_lmce(cur) )
        {
            *val = cur->arch.vmce.mcg_ext_ctl;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL %#"PRIx64"\n",
                       cur, *val);
        }
        else
        {
            ret = -1;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL, not supported\n",
                       cur);
        }
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);

    return ret;
}

/*
 * For historical reasons the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /*
         * If the guest clears any bit of MCi_CTL, treat the bank as not
         * implemented and simply ignore the write.
         */
        break;

    case MSR_IA32_MC0_STATUS:
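        /*
         * As for ADDR and MISC below, only a write of zero (clearing a
         * previously logged error) is accepted; any non-zero value is
         * refused and results in a #GP for the guest.
         */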
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_STATUS %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_status = val;
        break;

    case MSR_IA32_MC0_ADDR:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_ADDR %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_addr = val;
        break;

    case MSR_IA32_MC0_MISC:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_MISC %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_misc = val;
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
#ifdef CONFIG_INTEL
        case X86_VENDOR_INTEL:
            ret = vmce_intel_wrmsr(v, msr, val);
            break;
#endif

#ifdef CONFIG_AMD
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_wrmsr(v, msr, val);
            break;
#endif

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_wrmsr(uint32_t msr, uint64_t val)
{
    struct vcpu *cur = current;
    int ret = 1;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_CTL:
        /* If MCG_CTL exists then stick to all 1's, else ignore. */
        break;

    case MSR_IA32_MCG_STATUS:
        cur->arch.vmce.mcg_status = val;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_STATUS %"PRIx64"\n",
                   cur, val);
        break;

    case MSR_IA32_MCG_CAP:
        /*
         * According to the Intel SDM, IA32_MCG_CAP is a read-only register
         * and the effect of writing to it is undefined.  Treat writes as
         * no-ops here, which should not surprise the guest.
         */
        mce_printk(MCE_VERBOSE, "MCE: %pv: MCG_CAP is r/o\n", cur);
        break;

    case MSR_IA32_MCG_EXT_CTL:
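        /*
         * Only the LMCE_EN bit is writable, and only when LMCE is exposed
         * to the guest; anything else results in a #GP.
         */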
        if ( vmce_has_lmce(cur) && !(val & ~MCG_EXT_CTL_LMCE_EN) )
            cur->arch.vmce.mcg_ext_ctl = val;
        else
            ret = -1;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_EXT_CTL %"PRIx64"%s\n",
                   cur, val, (ret == -1) ? ", not supported" : "");
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);
    return ret;
}

#ifdef CONFIG_HVM
static int cf_check vmce_save_vcpu_ctxt(struct vcpu *v, hvm_domain_context_t *h)
{
    struct hvm_vmce_vcpu ctxt = {
        .caps = v->arch.vmce.mcg_cap,
        .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2,
        .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2,
        .mcg_ext_ctl = v->arch.vmce.mcg_ext_ctl,
    };

    return hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
}

static int cf_check vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    unsigned int vcpuid = hvm_load_instance(h);
    struct vcpu *v;
    struct hvm_vmce_vcpu ctxt;
    int err;
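
    /*
     * The zero-extending load below tolerates shorter records saved by
     * older Xen versions (without the later-added fields), which are then
     * read back as zero.
     */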

    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
    {
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
                d->domain_id, vcpuid);
        err = -EINVAL;
    }
    else
        err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt);

    return err ?: vmce_restore_vcpu(v, &ctxt);
}

HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt, NULL,
                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
#endif

/*
 * For Intel MCE, broadcast the vMCE to all vCPUs.
 * For AMD MCE, inject the vMCE only to vCPU0.
 *
 * @ d, domain to which the vMCE is injected
 * @ vcpu,
 *   -1 (VMCE_INJECT_BROADCAST), broadcast the vMCE to all vCPUs
 *   >= 0, the vCPU the vMCE is injected to
 */
int inject_vmce(struct domain *d, int vcpu)
{
    struct vcpu *v;
    int ret = -ESRCH;

    for_each_vcpu ( d, v )
    {
        if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
            continue;

        /* Don't inject to uninitialized VCPU. */
        if ( !v->is_initialised )
            continue;

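        /*
         * Only inject if the guest can actually handle #MC: HVM guests
         * always can, PV guests only once they have registered a machine
         * check trap callback.  A still-pending earlier vMCE also blocks
         * the injection.
         */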
        if ( (is_hvm_domain(d) ||
              pv_trap_callback_registered(v, X86_EXC_MC)) &&
             !test_and_set_bool(v->arch.mce_pending) )
        {
            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to %pv\n", v);
            vcpu_kick(v);
            ret = 0;
        }
        else
        {
            mce_printk(MCE_QUIET, "Failed to inject vMCE to %pv\n", v);
            ret = -EBUSY;
            break;
        }

        if ( vcpu != VMCE_INJECT_BROADCAST )
            break;
    }

    return ret;
}

static int vcpu_fill_mc_msrs(struct vcpu *v, uint64_t mcg_status,
                             uint64_t mci_status, uint64_t mci_addr,
                             uint64_t mci_misc)
{
    if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
    {
        mce_printk(MCE_QUIET, "MCE: %pv: guest has not handled previous"
                   " vMCE yet!\n", v);
        return -EBUSY;
    }

    spin_lock(&v->arch.vmce.lock);

    v->arch.vmce.mcg_status = mcg_status;
    /*
     * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors
     * 2. Filter the MCi_STATUS MSCOD model-specific error code exposed
     *    to the guest
     */
    v->arch.vmce.bank[1].mci_status = mci_status & MCi_STATUS_MSCOD_MASK;
    v->arch.vmce.bank[1].mci_addr = mci_addr;
    v->arch.vmce.bank[1].mci_misc = mci_misc;

    spin_unlock(&v->arch.vmce.lock);

    return 0;
}

int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
                   uint64_t gstatus, int vmce_vcpuid)
{
    struct vcpu *v = d->vcpu[0];
    bool broadcast = (vmce_vcpuid == VMCE_INJECT_BROADCAST);
    int ret, err;

    if ( mc_bank->mc_domid == DOMID_INVALID )
        return -EINVAL;

    if ( broadcast )
        gstatus &= ~MCG_STATUS_LMCE;
    else if ( gstatus & MCG_STATUS_LMCE )
    {
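        /*
         * A local MCE (LMCE) is delivered only to the affected vCPU
         * instead of the default vCPU0.
         */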
        ASSERT(vmce_vcpuid >= 0 && vmce_vcpuid < d->max_vcpus);
        v = d->vcpu[vmce_vcpuid];
    }

    /*
     * The vMCE with the actual error information is injected into vCPU0,
     * and, if a broadcast is required, we choose to inject less severe
     * vMCEs into the other vCPUs.  Thus the guest always sees the severest
     * error (i.e. the actual one) on vCPU0.  If the guest can recover from
     * the severest error on vCPU0, the less severe errors on the other
     * vCPUs will not prevent it from recovering on those vCPUs.
     */
    ret = vcpu_fill_mc_msrs(v, gstatus, mc_bank->mc_status,
                            mc_bank->mc_addr, mc_bank->mc_misc);
    if ( broadcast )
        for_each_vcpu ( d, v )
        {
            if ( !v->vcpu_id )
                continue;
            err = vcpu_fill_mc_msrs(v, MCG_STATUS_MCIP | MCG_STATUS_RIPV,
                                    0, 0, 0);
            if ( err )
                ret = err;
        }

    return ret;
}

/* Reportedly some RAM is set up as mmio_direct to get the UC cache attribute */
#define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \
                                | p2m_to_mask(p2m_ram_logdirty) \
                                | p2m_to_mask(p2m_ram_ro)       \
                                | p2m_to_mask(p2m_mmio_direct))

/*
 * Currently all CPUs rendezvous in the MCE softirq handler, so there is no
 * need to consider the paging p2m type.
 * Currently only HVM guests using EPT paging mode are supported.
 * XXX the following situations are not yet handled:
 * PoD, foreign mapped, granted, shared
 */
int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
{
    mfn_t r_mfn;
    p2m_type_t pt;
    int rc;

    /* Always trust that dom0's MCE handler will prevent future access */
    if ( is_hardware_domain(d) )
        return 0;

    if ( !mfn_valid(mfn) )
        return -EINVAL;

    if ( !is_hvm_domain(d) || !paging_mode_hap(d) )
        return -EOPNOTSUPP;

    rc = -1;
    r_mfn = get_gfn_query(d, gfn, &pt);
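    /*
     * If the current p2m type is one that can safely be unmapped, switch
     * it to p2m_ram_broken so the guest can no longer map or access the
     * poisoned frame.
     */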
    if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
    {
        ASSERT(mfn_eq(r_mfn, mfn));
        rc = p2m_change_type_one(d, gfn, pt, p2m_ram_broken);
    }
    put_gfn(d, gfn);

    return rc;
}

int vmce_enable_mca_cap(struct domain *d, uint64_t cap)
{
    struct vcpu *v;

    if ( cap & ~XEN_HVM_MCA_CAP_MASK )
        return -EINVAL;

    if ( cap & XEN_HVM_MCA_CAP_LMCE )
    {
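        /*
         * LMCE can only be exposed to the guest if the host supports it;
         * when granted, advertise MCG_LMCE_P on every vCPU.
         */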
        if ( !lmce_support )
            return -EINVAL;
        for_each_vcpu ( d, v )
            v->arch.vmce.mcg_cap |= MCG_LMCE_P;
    }

    return 0;
}