/*
 * vmce.c - provide software emulated vMCE support to guest
 *
 * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com>
 * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <asm/hvm/save.h>
#include <asm/processor.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/pv/traps.h>

#include "mce.h"
#include "x86_mca.h"
#include "vmce.h"

/*
 * MCG_SER_P:  software error recovery supported
 * MCG_TES_P:  so that MCi_STATUS bits 56:53 are not model specific
 * MCG_CMCI_P: expose the CMCI capability, but never actually deliver CMCI
 *             to the guest; this stops the guest from polling the banks
 *             periodically, for the sake of performance
 */
#define INTEL_GUEST_MCG_CAP (MCG_SER_P |    \
                             MCG_TES_P |    \
                             MCG_CMCI_P |   \
                             GUEST_MC_BANK_NUM)

#define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM
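
/*
 * The low 8 bits of MCG_CAP hold the bank count, which is why
 * GUEST_MC_BANK_NUM can be OR'ed directly into the capability values above.
 */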

void vmce_init_vcpu(struct vcpu *v)
{
    int i;

    /* global MCA MSRs init */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP;
    else
        v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP;

    v->arch.vmce.mcg_status = 0;

    /* per-bank MCA MSRs init */
    for ( i = 0; i < GUEST_MC_BANK_NUM; i++ )
        memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank));

    spin_lock_init(&v->arch.vmce.lock);
}

int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt)
{
    unsigned long guest_mcg_cap;

    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        guest_mcg_cap = INTEL_GUEST_MCG_CAP | MCG_LMCE_P;
    else
        guest_mcg_cap = AMD_GUEST_MCG_CAP;

    if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
    {
        dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
                " %#" PRIx64 " for %pv (supported: %#Lx)\n",
                is_hvm_vcpu(v) ? "HVM" : "PV", ctxt->caps,
                v, guest_mcg_cap & ~MCG_CAP_COUNT);
        return -EPERM;
    }

    v->arch.vmce.mcg_cap = ctxt->caps;
    v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0;
    v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1;
    v->arch.vmce.mcg_ext_ctl = ctxt->mcg_ext_ctl;

    return 0;
}

/*
 * For historical reasons the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the new one.
 */
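/*
 * Each bank exposes four consecutive MSRs starting at MSR_IA32_MC0_CTL
 * (MCi_CTL, MCi_STATUS, MCi_ADDR, MCi_MISC), so the bank index is
 * (msr - MSR_IA32_MC0_CTL) / 4, and masking the MSR with
 * (-MSR_IA32_MC0_CTL | 3) clears the bank offset while keeping the
 * register selector, folding e.g. MC2_STATUS onto the MSR_IA32_MC0_STATUS
 * case label below.
 */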
static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    *val = 0;

    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /* stick all 1's to MCi_CTL */
        *val = ~0UL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_CTL %#"PRIx64"\n",
                   v, bank, *val);
        break;

    case MSR_IA32_MC0_STATUS:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_status;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_STATUS %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_ADDR:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_addr;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_ADDR %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_MISC:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_misc;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_MISC %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_rdmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
            ret = vmce_amd_rdmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported; a #GP fault will be injected into the guest.
 * = 0: Not handled here; should be handled by other components.
 * > 0: Success.
 */
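/*
 * Note: the per-vCPU vmce lock taken below also serializes guest MSR
 * accesses against vMCE injection via vcpu_fill_mc_msrs().
 */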
int vmce_rdmsr(uint32_t msr, uint64_t *val)
{
    struct vcpu *cur = current;
    int ret = 1;

    *val = 0;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_STATUS:
        *val = cur->arch.vmce.mcg_status;
        if ( *val )
            mce_printk(MCE_VERBOSE,
                       "MCE: %pv: rd MCG_STATUS %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CAP:
        *val = cur->arch.vmce.mcg_cap;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CAP %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CTL:
        if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
            *val = ~0ULL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CTL %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        /*
         * If MCG_LMCE_P is present in the guest MSR_IA32_MCG_CAP, the LMCE
         * and LOCK bits are always set in the guest MSR_IA32_FEATURE_CONTROL
         * by Xen, so they need not be checked here.
         */
        if ( cur->arch.vmce.mcg_cap & MCG_LMCE_P )
        {
            *val = cur->arch.vmce.mcg_ext_ctl;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL %#"PRIx64"\n",
                       cur, *val);
        }
        else
        {
            ret = -1;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL, not supported\n",
                       cur);
        }
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);

    return ret;
}

/*
 * For historical reasons the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the new one.
 */
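/*
 * For such out-of-range banks, a write of zero to MCi_STATUS, MCi_ADDR or
 * MCi_MISC is accepted but silently dropped, while any non-zero write
 * results in a #GP fault being injected (ret == -1).
 */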
static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /*
         * If the guest clears any bit of MCi_CTL, treat it as not
         * implemented and ignore the write (no state change).
         */
        break;

    case MSR_IA32_MC0_STATUS:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_STATUS %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_status = val;
        break;

    case MSR_IA32_MC0_ADDR:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_ADDR %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_addr = val;
        break;

    case MSR_IA32_MC0_MISC:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_MISC %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_misc = val;
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_wrmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
            ret = vmce_amd_wrmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported; a #GP fault will be injected into the guest.
 * = 0: Not handled here; should be handled by other components.
 * > 0: Success.
 */
int vmce_wrmsr(uint32_t msr, uint64_t val)
{
    struct vcpu *cur = current;
    int ret = 1;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_CTL:
        /* If MCG_CTL exists then stick to all 1's, else ignore. */
        break;

    case MSR_IA32_MCG_STATUS:
        cur->arch.vmce.mcg_status = val;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_STATUS %"PRIx64"\n",
                   cur, val);
        break;

    case MSR_IA32_MCG_CAP:
        /*
         * According to the Intel SDM, IA32_MCG_CAP is a read-only register
         * and the effect of writing to it is undefined.  Here we treat a
         * write as a no-op, which should not surprise the guest.
         */
        mce_printk(MCE_VERBOSE, "MCE: %pv: MCG_CAP is r/o\n", cur);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        if ( (cur->arch.vmce.mcg_cap & MCG_LMCE_P) &&
             !(val & ~MCG_EXT_CTL_LMCE_EN) )
            cur->arch.vmce.mcg_ext_ctl = val;
        else
            ret = -1;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_EXT_CTL %"PRIx64"%s\n",
                   cur, val, (ret == -1) ? ", not supported" : "");
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);
    return ret;
}

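/*
 * Save handler for the per-vCPU VMCE HVM save record: only MCG_CAP,
 * MCi_CTL2 of the two guest banks, and MCG_EXT_CTL are migrated.
 */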
static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    struct vcpu *v;
    int err = 0;

    for_each_vcpu ( d, v )
    {
        struct hvm_vmce_vcpu ctxt = {
            .caps = v->arch.vmce.mcg_cap,
            .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2,
            .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2,
            .mcg_ext_ctl = v->arch.vmce.mcg_ext_ctl,
        };

        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
        if ( err )
            break;
    }

    return err;
}

static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    unsigned int vcpuid = hvm_load_instance(h);
    struct vcpu *v;
    struct hvm_vmce_vcpu ctxt;
    int err;

    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
    {
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
                d->domain_id, vcpuid);
        err = -EINVAL;
    }
    else
        err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt);

    return err ?: vmce_restore_vcpu(v, &ctxt);
}

HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);

/*
 * For Intel MCE, broadcast the vMCE to all vCPUs.
 * For AMD MCE, only inject the vMCE to vCPU0.
 *
 * @d:    domain into which the vMCE is injected
 * @vcpu: -1 (VMCE_INJECT_BROADCAST) to broadcast the vMCE to all vCPUs,
 *        or >= 0 to select the vCPU the vMCE is injected to
 */
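/*
 * Returns 0 if a vMCE was queued, -ESRCH if no suitable (initialised) vCPU
 * was found, or -EBUSY if a target vCPU cannot accept the vMCE (no #MC
 * handler registered for PV, or a vMCE already pending); in the broadcast
 * case such a vCPU stops further delivery.
 */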
int inject_vmce(struct domain *d, int vcpu)
{
    struct vcpu *v;
    int ret = -ESRCH;

    for_each_vcpu ( d, v )
    {
        if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
            continue;

        /* Don't inject to uninitialized VCPU. */
        if ( !v->is_initialised )
            continue;

        if ( (is_hvm_domain(d) ||
              pv_trap_callback_registered(v, TRAP_machine_check)) &&
             !test_and_set_bool(v->mce_pending) )
        {
            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to %pv\n", v);
            vcpu_kick(v);
            ret = 0;
        }
        else
        {
            mce_printk(MCE_QUIET, "Failed to inject vMCE to %pv\n", v);
            ret = -EBUSY;
            break;
        }

        if ( vcpu != VMCE_INJECT_BROADCAST )
            break;
    }

    return ret;
}

static int vcpu_fill_mc_msrs(struct vcpu *v, uint64_t mcg_status,
                             uint64_t mci_status, uint64_t mci_addr,
                             uint64_t mci_misc)
{
    if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
    {
        mce_printk(MCE_QUIET, "MCE: %pv: guest has not handled previous"
                   " vMCE yet!\n", v);
        return -EBUSY;
    }

    spin_lock(&v->arch.vmce.lock);

    v->arch.vmce.mcg_status = mcg_status;
    /*
     * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors.
     * 2. Filter the MCi_STATUS MSCOD model-specific error code presented
     *    to the guest.
     */
    v->arch.vmce.bank[1].mci_status = mci_status & MCi_STATUS_MSCOD_MASK;
    v->arch.vmce.bank[1].mci_addr = mci_addr;
    v->arch.vmce.bank[1].mci_misc = mci_misc;

    spin_unlock(&v->arch.vmce.lock);

    return 0;
}

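/*
 * Fill the vMCE MSRs of @d from the translated bank information.  The
 * actual error data goes to vCPU0, unless this is an LMCE destined for a
 * specific vCPU (@vmce_vcpuid); with VMCE_INJECT_BROADCAST the remaining
 * vCPUs additionally get a less severe (RIPV-only) vMCE.
 */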
int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
                   uint64_t gstatus, int vmce_vcpuid)
{
    struct vcpu *v = d->vcpu[0];
    bool broadcast = (vmce_vcpuid == VMCE_INJECT_BROADCAST);
    int ret, err;

    if ( mc_bank->mc_domid == DOMID_INVALID )
        return -EINVAL;

    if ( broadcast )
        gstatus &= ~MCG_STATUS_LMCE;
    else if ( gstatus & MCG_STATUS_LMCE )
    {
        ASSERT(vmce_vcpuid >= 0 && vmce_vcpuid < d->max_vcpus);
        v = d->vcpu[vmce_vcpuid];
    }

    /*
     * The vMCE with the actual error information is injected into vCPU0
     * and, if broadcast is required, we choose to inject less severe vMCEs
     * into the other vCPUs.  Thus the guest always gets the most severe
     * error (i.e. the actual one) on vCPU0.  If the guest can recover from
     * the most severe error on vCPU0, the less severe errors on the other
     * vCPUs will not prevent it from recovering on those vCPUs.
     */
    ret = vcpu_fill_mc_msrs(v, gstatus, mc_bank->mc_status,
                            mc_bank->mc_addr, mc_bank->mc_misc);
    if ( broadcast )
        for_each_vcpu ( d, v )
        {
            if ( !v->vcpu_id )
                continue;
            err = vcpu_fill_mc_msrs(v, MCG_STATUS_MCIP | MCG_STATUS_RIPV,
                                    0, 0, 0);
            if ( err )
                ret = err;
        }

    return ret;
}

/* It's said that some RAM is set up as mmio_direct to get the UC cache attribute. */
#define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \
                                | p2m_to_mask(p2m_ram_logdirty) \
                                | p2m_to_mask(p2m_ram_ro)       \
                                | p2m_to_mask(p2m_mmio_direct))

/*
 * Currently all CPUs rendezvous in the MCE softirq handler, so there is no
 * need to consider the paging p2m type.
 * Currently only HVM guests using EPT paging mode are supported.
 * XXX the following cases are not handled yet:
 *     PoD, foreign mapped, granted, shared pages
 */
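/*
 * Note: get_gfn_query() below takes a reference on the GFN which must be
 * released with put_gfn() on all paths, including when the p2m type is
 * left unchanged.
 */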
int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
{
    mfn_t r_mfn;
    p2m_type_t pt;
    int rc;

    /* Always trust that dom0's MCE handler will prevent future accesses. */
    if ( is_hardware_domain(d) )
        return 0;

    if ( !mfn_valid(mfn) )
        return -EINVAL;

    if ( !is_hvm_domain(d) || !paging_mode_hap(d) )
        return -EOPNOTSUPP;

    rc = -1;
    r_mfn = get_gfn_query(d, gfn, &pt);
    if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
    {
        ASSERT(mfn_x(r_mfn) == mfn_x(mfn));
        rc = p2m_change_type_one(d, gfn, pt, p2m_ram_broken);
    }
    put_gfn(d, gfn);

    return rc;
}

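/*
 * Opt the guest into additional MCA capabilities; currently only LMCE,
 * and only if the host itself supports it (lmce_support).
 */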
int vmce_enable_mca_cap(struct domain *d, uint64_t cap)
{
    struct vcpu *v;

    if ( cap & ~XEN_HVM_MCA_CAP_MASK )
        return -EINVAL;

    if ( cap & XEN_HVM_MCA_CAP_LMCE )
    {
        if ( !lmce_support )
            return -EINVAL;
        for_each_vcpu(d, v)
            v->arch.vmce.mcg_cap |= MCG_LMCE_P;
    }

    return 0;
}