/* SPDX-License-Identifier: GPL-2.0-or-later */
/*
 * vmce.c - provide software emulated vMCE support to guest
 *
 * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com>
 * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com>
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <asm/hvm/save.h>
#include <asm/processor.h>
#include <public/hvm/params.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/pv/traps.h>

#include "mce.h"
#include "x86_mca.h"
#include "vmce.h"

/*
 * MCG_SER_P: software error recovery supported
 * MCG_TES_P: avoid MCi_STATUS bits 56:53 being model specific
 * MCG_CMCI_P: expose the CMCI capability, but never actually inject CMCI into
 *             the guest, for the sake of performance since the guest does not
 *             poll periodically anyway
 */
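/*
 * GUEST_MC_BANK_NUM fills the MCG_CAP "Count" field (bits 7:0), i.e. the
 * number of MC reporting banks exposed to the guest.
 */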
#define INTEL_GUEST_MCG_CAP (MCG_SER_P |  \
                             MCG_TES_P |  \
                             MCG_CMCI_P | \
                             GUEST_MC_BANK_NUM)

#define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM

void vmce_init_vcpu(struct vcpu *v)
{
    int i;

    /* global MCA MSRs init */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP;
    else
        v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP;

    v->arch.vmce.mcg_status = 0;

    /* per-bank MCA MSRs init */
    for ( i = 0; i < GUEST_MC_BANK_NUM; i++ )
        memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank));

    spin_lock_init(&v->arch.vmce.lock);
}

int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt)
{
    unsigned long guest_mcg_cap;

    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        guest_mcg_cap = INTEL_GUEST_MCG_CAP | MCG_LMCE_P;
    else
        guest_mcg_cap = AMD_GUEST_MCG_CAP;

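    /*
     * Reject any capability bit the vMCE model cannot provide.  The bank
     * count (MCG_CAP_COUNT) and MCG_CTL_P are deliberately not checked,
     * presumably so that images saved by older vMCE versions (different
     * bank count, MCG_CTL exposed) can still be restored.
     */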
    if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
    {
        printk(XENLOG_G_ERR
               "%s restore: unsupported MCA capabilities %#"PRIx64" for %pv (supported: %#Lx)\n",
               is_hvm_vcpu(v) ? "HVM" : "PV", ctxt->caps,
               v, guest_mcg_cap & ~MCG_CAP_COUNT);
        return -EINVAL;
    }

    v->arch.vmce.mcg_cap = ctxt->caps;
    v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0;
    v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1;
    v->arch.vmce.mcg_ext_ctl = ctxt->mcg_ext_ctl;

    return 0;
}

/*
 * For historical reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the new one.
 */
static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    *val = 0;

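    /*
     * Each MC bank owns four consecutive MSRs starting at MSR_IA32_MC0_CTL:
     * CTL, STATUS, ADDR and MISC, which is why the bank index above is the
     * MSR offset divided by 4.  The mask below clears the bank-selecting
     * bits while keeping the low two bits, folding MCi_* of every bank onto
     * the corresponding MC0_* case label.
     */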
    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /* stick all 1's to MCi_CTL */
        *val = ~0UL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_CTL %#"PRIx64"\n",
                   v, bank, *val);
        break;

    case MSR_IA32_MC0_STATUS:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_status;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_STATUS %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_ADDR:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_addr;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_ADDR %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_MISC:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_misc;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_MISC %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
#ifdef CONFIG_INTEL
        case X86_VENDOR_CENTAUR:
        case X86_VENDOR_SHANGHAI:
        case X86_VENDOR_INTEL:
            ret = vmce_intel_rdmsr(v, msr, val);
            break;
#endif

#ifdef CONFIG_AMD
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_rdmsr(v, msr, val);
            break;
#endif

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_rdmsr(uint32_t msr, uint64_t *val)
{
    struct vcpu *cur = current;
    int ret = 1;

    *val = 0;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_STATUS:
        *val = cur->arch.vmce.mcg_status;
        if ( *val )
            mce_printk(MCE_VERBOSE,
                       "MCE: %pv: rd MCG_STATUS %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CAP:
        *val = cur->arch.vmce.mcg_cap;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CAP %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CTL:
        if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
            *val = ~0ULL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CTL %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        /*
         * If MCG_LMCE_P is present in the guest MSR_IA32_MCG_CAP, the LMCE
         * and LOCK bits are always set in the guest MSR_IA32_FEATURE_CONTROL
         * by Xen, so there is no need to check them here.
         */
        if ( vmce_has_lmce(cur) )
        {
            *val = cur->arch.vmce.mcg_ext_ctl;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL %#"PRIx64"\n",
                       cur, *val);
        }
        else
        {
            ret = -1;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL, not supported\n",
                       cur);
        }
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);

    return ret;
}

/*
 * For historical reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to the new one.
 */
static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

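    /* Same bank/register decoding as in bank_mce_rdmsr() above. */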
    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /*
         * If the guest bogusly clears any bit of MCi_CTL, treat the bank as
         * not implemented and simply ignore the write.
         */
        break;

    case MSR_IA32_MC0_STATUS:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_STATUS %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_status = val;
        break;

    case MSR_IA32_MC0_ADDR:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_ADDR %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_addr = val;
        break;

    case MSR_IA32_MC0_MISC:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_MISC %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_misc = val;
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
#ifdef CONFIG_INTEL
        case X86_VENDOR_INTEL:
            ret = vmce_intel_wrmsr(v, msr, val);
            break;
#endif

#ifdef CONFIG_AMD
        case X86_VENDOR_AMD:
        case X86_VENDOR_HYGON:
            ret = vmce_amd_wrmsr(v, msr, val);
            break;
#endif

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_wrmsr(uint32_t msr, uint64_t val)
{
    struct vcpu *cur = current;
    int ret = 1;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_CTL:
        /* If MCG_CTL exists then stick to all 1's, else ignore. */
        break;

    case MSR_IA32_MCG_STATUS:
        cur->arch.vmce.mcg_status = val;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_STATUS %"PRIx64"\n",
                   cur, val);
        break;

    case MSR_IA32_MCG_CAP:
        /*
         * According to the Intel SDM, IA32_MCG_CAP is a read-only register
         * and the effect of writing to it is undefined.  Treat writes as
         * no-ops; this should not surprise the guest.
         */
        mce_printk(MCE_VERBOSE, "MCE: %pv: MCG_CAP is r/o\n", cur);
        break;

    case MSR_IA32_MCG_EXT_CTL:
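        /*
         * Only the LMCE_EN bit is writable, and only when LMCE has been
         * exposed to the guest; any other set bit injects #GP.
         */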
        if ( vmce_has_lmce(cur) && !(val & ~MCG_EXT_CTL_LMCE_EN) )
            cur->arch.vmce.mcg_ext_ctl = val;
        else
            ret = -1;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_EXT_CTL %"PRIx64"%s\n",
                   cur, val, (ret == -1) ? ", not supported" : "");
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);
    return ret;
}

#ifdef CONFIG_HVM
static int cf_check vmce_save_vcpu_ctxt(struct vcpu *v, hvm_domain_context_t *h)
{
    struct hvm_vmce_vcpu ctxt = {
        .caps = v->arch.vmce.mcg_cap,
        .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2,
        .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2,
        .mcg_ext_ctl = v->arch.vmce.mcg_ext_ctl,
    };

    return hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
}

static int cf_check vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    unsigned int vcpuid = hvm_load_instance(h);
    struct vcpu *v;
    struct hvm_vmce_vcpu ctxt;
    int err;

    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
    {
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
                d->domain_id, vcpuid);
        err = -EINVAL;
    }
    else
        err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt);

    return err ?: vmce_restore_vcpu(v, &ctxt);
}

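/*
 * Register the per-vCPU vMCE state with the HVM save/restore machinery so
 * that it is carried across save/restore and live migration.
 */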
HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt, NULL,
                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);
#endif

/*
 * For Intel MCE, broadcast the vMCE to all vcpus.
 * For AMD MCE, only inject the vMCE to vcpu0.
 *
 * @d:    domain into which the vMCE is injected
 * @vcpu: -1 (VMCE_INJECT_BROADCAST) to broadcast the vMCE to all vcpus,
 *        or >= 0 to specify the vcpu the vMCE is injected to
 */
int inject_vmce(struct domain *d, int vcpu)
{
    struct vcpu *v;
    int ret = -ESRCH;

    for_each_vcpu ( d, v )
    {
        if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
            continue;

        /* Don't inject to uninitialized VCPU. */
        if ( !v->is_initialised )
            continue;

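        /*
         * HVM guests can always take the vMCE via the virtual #MC vector;
         * a PV guest is only eligible if it has registered a machine-check
         * trap callback.  mce_pending acts as the pending flag, so finding
         * it already set means the previous vMCE has not been delivered yet.
         */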
        if ( (is_hvm_domain(d) ||
              pv_trap_callback_registered(v, X86_EXC_MC)) &&
             !test_and_set_bool(v->arch.mce_pending) )
        {
            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to %pv\n", v);
            vcpu_kick(v);
            ret = 0;
        }
        else
        {
            mce_printk(MCE_QUIET, "Failed to inject vMCE to %pv\n", v);
            ret = -EBUSY;
            break;
        }

        if ( vcpu != VMCE_INJECT_BROADCAST )
            break;
    }

    return ret;
}

static int vcpu_fill_mc_msrs(struct vcpu *v, uint64_t mcg_status,
                             uint64_t mci_status, uint64_t mci_addr,
                             uint64_t mci_misc)
{
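    /*
     * MCG_STATUS_MCIP still being set means the guest has not finished
     * handling the previously injected vMCE (on real hardware a second
     * machine check while MCIP is set shuts the processor down), so refuse
     * to overwrite the pending error information.
     */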
    if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
    {
        mce_printk(MCE_QUIET, "MCE: %pv: guest has not handled previous"
                   " vMCE yet!\n", v);
        return -EBUSY;
    }

    spin_lock(&v->arch.vmce.lock);

    v->arch.vmce.mcg_status = mcg_status;
    /*
     * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors.
     * 2. Filter the MCi_STATUS MSCOD model-specific error code before
     *    exposing it to the guest.
     */
    v->arch.vmce.bank[1].mci_status = mci_status & MCi_STATUS_MSCOD_MASK;
    v->arch.vmce.bank[1].mci_addr = mci_addr;
    v->arch.vmce.bank[1].mci_misc = mci_misc;

    spin_unlock(&v->arch.vmce.lock);

    return 0;
}

int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
                   uint64_t gstatus, int vmce_vcpuid)
{
    struct vcpu *v = d->vcpu[0];
    bool broadcast = (vmce_vcpuid == VMCE_INJECT_BROADCAST);
    int ret, err;

    if ( mc_bank->mc_domid == DOMID_INVALID )
        return -EINVAL;

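    /*
     * An LMCE (local machine check) is delivered only to the affected vCPU,
     * so in that case retarget the injection from vCPU0 to the vCPU named
     * by the caller; a broadcast request must not carry the LMCE flag.
     */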
    if ( broadcast )
        gstatus &= ~MCG_STATUS_LMCE;
    else if ( gstatus & MCG_STATUS_LMCE )
    {
        ASSERT(vmce_vcpuid >= 0 && vmce_vcpuid < d->max_vcpus);
        v = d->vcpu[vmce_vcpuid];
    }

    /*
     * The vMCE with the actual error information is injected into vCPU0,
     * and, if broadcast is required, we choose to inject less severe vMCEs
     * into the other vCPUs.  Thus the guest always gets the severest error
     * (i.e. the actual one) on vCPU0.  If the guest can recover from the
     * severest error on vCPU0, the less severe errors on the other vCPUs
     * will not prevent it from recovering on those vCPUs.
     */
    ret = vcpu_fill_mc_msrs(v, gstatus, mc_bank->mc_status,
                            mc_bank->mc_addr, mc_bank->mc_misc);
    if ( broadcast )
        for_each_vcpu ( d, v )
        {
            if ( !v->vcpu_id )
                continue;
            err = vcpu_fill_mc_msrs(v, MCG_STATUS_MCIP | MCG_STATUS_RIPV,
                                    0, 0, 0);
            if ( err )
                ret = err;
        }

    return ret;
}

/* Some RAM is reportedly set up as mmio_direct for the UC cache attribute. */
#define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \
                         | p2m_to_mask(p2m_ram_logdirty) \
                         | p2m_to_mask(p2m_ram_ro) \
                         | p2m_to_mask(p2m_mmio_direct))

/*
 * Currently all CPUs rendezvous in the MCE softirq handler, so there is no
 * need to consider the paging p2m type.
 * Currently only HVM guests using EPT paging mode are supported.
 * XXX the following situations are not handled yet:
 *     PoD, foreign mapped, granted, shared pages
 */
int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
{
    mfn_t r_mfn;
    p2m_type_t pt;
    int rc;

    /* Always trust dom0's MCE handler will prevent future access */
    if ( is_hardware_domain(d) )
        return 0;

    if ( !mfn_valid(mfn) )
        return -EINVAL;

    if ( !is_hvm_domain(d) || !paging_mode_hap(d) )
        return -EOPNOTSUPP;

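    /*
     * Look up the current p2m type of the GFN; only entries that actually
     * map RAM (or direct MMIO, see above) are switched to p2m_ram_broken,
     * so that further guest accesses to the broken page trap to Xen rather
     * than consuming the poisoned memory.
     */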
    rc = -1;
    r_mfn = get_gfn_query(d, gfn, &pt);
    if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
    {
        ASSERT(mfn_eq(r_mfn, mfn));
        rc = p2m_change_type_one(d, gfn, pt, p2m_ram_broken);
    }
    put_gfn(d, gfn);

    return rc;
}

int vmce_enable_mca_cap(struct domain *d, uint64_t cap)
{
    struct vcpu *v;

    if ( cap & ~XEN_HVM_MCA_CAP_MASK )
        return -EINVAL;

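    /*
     * XEN_HVM_MCA_CAP_LMCE asks for LMCE support in the guest; grant it by
     * setting MCG_LMCE_P in each vCPU's virtual MCG_CAP, but only if the
     * host itself supports LMCE.
     */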
    if ( cap & XEN_HVM_MCA_CAP_LMCE )
    {
        if ( !lmce_support )
            return -EINVAL;
        for_each_vcpu(d, v)
            v->arch.vmce.mcg_cap |= MCG_LMCE_P;
    }

    return 0;
}