/*
 * vmce.c - provide software emulated vMCE support to guest
 *
 * Copyright (C) 2010, 2011 Jiang, Yunhong <yunhong.jiang@intel.com>
 * Copyright (C) 2012, 2013 Liu, Jinsong <jinsong.liu@intel.com>
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/irq.h>
#include <xen/event.h>
#include <xen/kernel.h>
#include <xen/delay.h>
#include <xen/smp.h>
#include <xen/mm.h>
#include <asm/hvm/save.h>
#include <asm/processor.h>
#include <public/sysctl.h>
#include <asm/system.h>
#include <asm/msr.h>
#include <asm/p2m.h>
#include <asm/pv/traps.h>

#include "mce.h"
#include "x86_mca.h"
#include "vmce.h"

/*
 * MCG_SER_P: software error recovery supported
 * MCG_TES_P: avoid model-specific interpretation of MCi_STATUS bits 56:53
 * MCG_CMCI_P: expose the CMCI capability, but never actually inject CMCI into
 *             the guest; this is purely for performance, since the guest does
 *             not poll periodically anyway
 */
#define INTEL_GUEST_MCG_CAP (MCG_SER_P |   \
                             MCG_TES_P |   \
                             MCG_CMCI_P |  \
                             GUEST_MC_BANK_NUM)

#define AMD_GUEST_MCG_CAP GUEST_MC_BANK_NUM

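/*
 * Initialise the vMCE state of a newly created vCPU: advertise the
 * vendor-specific guest MCG_CAP, clear MCG_STATUS and all per-bank MSRs,
 * and set up the lock protecting the vMCE MSR state.
 */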
void vmce_init_vcpu(struct vcpu *v)
{
    int i;

    /* global MCA MSRs init */
    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        v->arch.vmce.mcg_cap = INTEL_GUEST_MCG_CAP;
    else
        v->arch.vmce.mcg_cap = AMD_GUEST_MCG_CAP;

    v->arch.vmce.mcg_status = 0;

    /* per-bank MCA MSRs init */
    for ( i = 0; i < GUEST_MC_BANK_NUM; i++ )
        memset(&v->arch.vmce.bank[i], 0, sizeof(struct vmce_bank));

    spin_lock_init(&v->arch.vmce.lock);
}

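/*
 * Restore the vMCE MSR state of a vCPU from a save record, rejecting any
 * capability bits (other than the bank count and MCG_CTL_P) that Xen does
 * not offer to guests on this host.
 */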
int vmce_restore_vcpu(struct vcpu *v, const struct hvm_vmce_vcpu *ctxt)
{
    unsigned long guest_mcg_cap;

    if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        guest_mcg_cap = INTEL_GUEST_MCG_CAP | MCG_LMCE_P;
    else
        guest_mcg_cap = AMD_GUEST_MCG_CAP;

    if ( ctxt->caps & ~guest_mcg_cap & ~MCG_CAP_COUNT & ~MCG_CTL_P )
    {
        dprintk(XENLOG_G_ERR, "%s restore: unsupported MCA capabilities"
                " %#" PRIx64 " for %pv (supported: %#Lx)\n",
                is_hvm_vcpu(v) ? "HVM" : "PV", ctxt->caps,
                v, guest_mcg_cap & ~MCG_CAP_COUNT);
        return -EPERM;
    }

    v->arch.vmce.mcg_cap = ctxt->caps;
    v->arch.vmce.bank[0].mci_ctl2 = ctxt->mci_ctl2_bank0;
    v->arch.vmce.bank[1].mci_ctl2 = ctxt->mci_ctl2_bank1;
    v->arch.vmce.mcg_ext_ctl = ctxt->mcg_ext_ctl;

    return 0;
}

/*
 * For historical (vMCE version) reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_rdmsr(const struct vcpu *v, uint32_t msr, uint64_t *val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

    *val = 0;

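    /*
     * Each MCA bank owns four consecutive MSRs (CTL, STATUS, ADDR, MISC)
     * starting at MSR_IA32_MC0_CTL.  Masking with (-MSR_IA32_MC0_CTL | 3)
     * discards the bank offset while keeping the register index, so every
     * bank's MSR falls onto the corresponding MC0_* case label below.
     */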
    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /* stick all 1's to MCi_CTL */
        *val = ~0UL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_CTL %#"PRIx64"\n",
                   v, bank, *val);
        break;

    case MSR_IA32_MC0_STATUS:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_status;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_STATUS %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_ADDR:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_addr;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_ADDR %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    case MSR_IA32_MC0_MISC:
        if ( bank < GUEST_MC_BANK_NUM )
        {
            *val = v->arch.vmce.bank[bank].mci_misc;
            if ( *val )
                mce_printk(MCE_VERBOSE, "MCE: %pv: rd MC%u_MISC %#"PRIx64"\n",
                           v, bank, *val);
        }
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_rdmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
            ret = vmce_amd_rdmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_rdmsr(uint32_t msr, uint64_t *val)
{
    struct vcpu *cur = current;
    int ret = 1;

    *val = 0;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_STATUS:
        *val = cur->arch.vmce.mcg_status;
        if ( *val )
            mce_printk(MCE_VERBOSE,
                       "MCE: %pv: rd MCG_STATUS %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CAP:
        *val = cur->arch.vmce.mcg_cap;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CAP %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_CTL:
        if ( cur->arch.vmce.mcg_cap & MCG_CTL_P )
            *val = ~0ULL;
        mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_CTL %#"PRIx64"\n", cur, *val);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        /*
         * If MCG_LMCE_P is present in the guest MSR_IA32_MCG_CAP, the LMCE
         * and LOCK bits are always set in the guest MSR_IA32_FEATURE_CONTROL
         * by Xen, so they need not be checked here.
         */
        if ( cur->arch.vmce.mcg_cap & MCG_LMCE_P )
        {
            *val = cur->arch.vmce.mcg_ext_ctl;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL %#"PRIx64"\n",
                       cur, *val);
        }
        else
        {
            ret = -1;
            mce_printk(MCE_VERBOSE, "MCE: %pv: rd MCG_EXT_CTL, not supported\n",
                       cur);
        }
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_rdmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);

    return ret;
}

/*
 * For historical (vMCE version) reasons, the bank number may be greater than
 * GUEST_MC_BANK_NUM when migrating from an old vMCE version to a new one.
 */
static int bank_mce_wrmsr(struct vcpu *v, uint32_t msr, uint64_t val)
{
    int ret = 1;
    unsigned int bank = (msr - MSR_IA32_MC0_CTL) / 4;

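    /* The MSR is decoded into (bank, register) as in bank_mce_rdmsr(). */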
    switch ( msr & (-MSR_IA32_MC0_CTL | 3) )
    {
    case MSR_IA32_MC0_CTL:
        /*
         * If the guest, for whatever reason, clears bits in MCi_CTL, treat
         * the write as not implemented and ignore it (nothing is changed).
         */
        break;

    case MSR_IA32_MC0_STATUS:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_STATUS %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_status = val;
        break;

    case MSR_IA32_MC0_ADDR:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_ADDR %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_addr = val;
        break;

    case MSR_IA32_MC0_MISC:
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MC%u_MISC %#"PRIx64"\n",
                   v, bank, val);
        if ( val )
            ret = -1;
        else if ( bank < GUEST_MC_BANK_NUM )
            v->arch.vmce.bank[bank].mci_misc = val;
        break;

    default:
        switch ( boot_cpu_data.x86_vendor )
        {
        case X86_VENDOR_INTEL:
            ret = vmce_intel_wrmsr(v, msr, val);
            break;

        case X86_VENDOR_AMD:
            ret = vmce_amd_wrmsr(v, msr, val);
            break;

        default:
            ret = 0;
            break;
        }
        break;
    }

    return ret;
}

/*
 * < 0: Unsupported and will #GP fault to guest
 * = 0: Not handled, should be handled by other components
 * > 0: Success
 */
int vmce_wrmsr(uint32_t msr, uint64_t val)
{
    struct vcpu *cur = current;
    int ret = 1;

    spin_lock(&cur->arch.vmce.lock);

    switch ( msr )
    {
    case MSR_IA32_MCG_CTL:
        /* If MCG_CTL exists then stick to all 1's, else ignore. */
        break;

    case MSR_IA32_MCG_STATUS:
        cur->arch.vmce.mcg_status = val;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_STATUS %"PRIx64"\n",
                   cur, val);
        break;

    case MSR_IA32_MCG_CAP:
        /*
         * According to the Intel SDM, IA32_MCG_CAP is a read-only register
         * and the effect of writing to it is undefined.  Treat writes as
         * 'write without effect', which will not surprise the guest.
         */
        mce_printk(MCE_VERBOSE, "MCE: %pv: MCG_CAP is r/o\n", cur);
        break;

    case MSR_IA32_MCG_EXT_CTL:
        if ( (cur->arch.vmce.mcg_cap & MCG_LMCE_P) &&
             !(val & ~MCG_EXT_CTL_LMCE_EN) )
            cur->arch.vmce.mcg_ext_ctl = val;
        else
            ret = -1;
        mce_printk(MCE_VERBOSE, "MCE: %pv: wr MCG_EXT_CTL %"PRIx64"%s\n",
                   cur, val, (ret == -1) ? ", not supported" : "");
        break;

    default:
        ret = mce_bank_msr(cur, msr) ? bank_mce_wrmsr(cur, msr, val) : 0;
        break;
    }

    spin_unlock(&cur->arch.vmce.lock);
    return ret;
}

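/*
 * Save one VMCE_VCPU record per vCPU: the guest MCG_CAP, the CTL2 MSRs of
 * banks 0 and 1, and MCG_EXT_CTL.
 */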
static int vmce_save_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    struct vcpu *v;
    int err = 0;

    for_each_vcpu ( d, v )
    {
        struct hvm_vmce_vcpu ctxt = {
            .caps = v->arch.vmce.mcg_cap,
            .mci_ctl2_bank0 = v->arch.vmce.bank[0].mci_ctl2,
            .mci_ctl2_bank1 = v->arch.vmce.bank[1].mci_ctl2,
            .mcg_ext_ctl = v->arch.vmce.mcg_ext_ctl,
        };

        err = hvm_save_entry(VMCE_VCPU, v->vcpu_id, h, &ctxt);
        if ( err )
            break;
    }

    return err;
}

static int vmce_load_vcpu_ctxt(struct domain *d, hvm_domain_context_t *h)
{
    unsigned int vcpuid = hvm_load_instance(h);
    struct vcpu *v;
    struct hvm_vmce_vcpu ctxt;
    int err;

    if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL )
    {
        dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n",
                d->domain_id, vcpuid);
        err = -EINVAL;
    }
    else
        err = hvm_load_entry_zeroextend(VMCE_VCPU, h, &ctxt);

    return err ?: vmce_restore_vcpu(v, &ctxt);
}

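/*
 * Register the VMCE_VCPU record type with the HVM save/restore framework;
 * the record is saved and loaded once per vCPU.
 */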
HVM_REGISTER_SAVE_RESTORE(VMCE_VCPU, vmce_save_vcpu_ctxt,
                          vmce_load_vcpu_ctxt, 1, HVMSR_PER_VCPU);

/*
 * For Intel MCE, broadcast the vMCE to all vCPUs.
 * For AMD MCE, only inject the vMCE to vCPU0.
 *
 * @d:    domain into which the vMCE is injected
 * @vcpu: -1 (VMCE_INJECT_BROADCAST): broadcast the vMCE to all vCPUs
 *        >= 0: the vCPU the vMCE is injected to
 */
int inject_vmce(struct domain *d, int vcpu)
{
    struct vcpu *v;
    int ret = -ESRCH;

    for_each_vcpu ( d, v )
    {
        if ( vcpu != VMCE_INJECT_BROADCAST && vcpu != v->vcpu_id )
            continue;

        /* Don't inject to uninitialized VCPU. */
        if ( !v->is_initialised )
            continue;

        if ( (is_hvm_domain(d) ||
              pv_trap_callback_registered(v, TRAP_machine_check)) &&
             !test_and_set_bool(v->mce_pending) )
        {
            mce_printk(MCE_VERBOSE, "MCE: inject vMCE to %pv\n", v);
            vcpu_kick(v);
            ret = 0;
        }
        else
        {
            mce_printk(MCE_QUIET, "Failed to inject vMCE to %pv\n", v);
            ret = -EBUSY;
            break;
        }

        if ( vcpu != VMCE_INJECT_BROADCAST )
            break;
    }

    return ret;
}

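/*
 * Fill the vMCE MSRs of a single vCPU with the given values.  Fails with
 * -EBUSY if the vCPU has not finished handling a previously injected vMCE
 * (i.e. MCG_STATUS.MCIP is still set).
 */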
static int vcpu_fill_mc_msrs(struct vcpu *v, uint64_t mcg_status,
                             uint64_t mci_status, uint64_t mci_addr,
                             uint64_t mci_misc)
{
    if ( v->arch.vmce.mcg_status & MCG_STATUS_MCIP )
    {
        mce_printk(MCE_QUIET, "MCE: %pv: guest has not handled previous"
                   " vMCE yet!\n", v);
        return -EBUSY;
    }

    spin_lock(&v->arch.vmce.lock);

    v->arch.vmce.mcg_status = mcg_status;
    /*
     * 1. Skip bank 0 to avoid the 'bank 0 quirk' of old processors.
     * 2. Filter the model-specific MSCOD error code of MCi_STATUS exposed
     *    to the guest.
     */
    v->arch.vmce.bank[1].mci_status = mci_status & MCi_STATUS_MSCOD_MASK;
    v->arch.vmce.bank[1].mci_addr = mci_addr;
    v->arch.vmce.bank[1].mci_misc = mci_misc;

    spin_unlock(&v->arch.vmce.lock);

    return 0;
}

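/*
 * Fill the vMCE MSRs of domain @d for a machine check reported in @mc_bank.
 * The actual error information goes to vCPU0 unless this is an LMCE aimed
 * at a specific vCPU; on broadcast, all other vCPUs receive a less severe
 * (RIPV, no error details) vMCE.
 */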
int fill_vmsr_data(struct mcinfo_bank *mc_bank, struct domain *d,
                   uint64_t gstatus, int vmce_vcpuid)
{
    struct vcpu *v = d->vcpu[0];
    bool broadcast = (vmce_vcpuid == VMCE_INJECT_BROADCAST);
    int ret, err;

    if ( mc_bank->mc_domid == DOMID_INVALID )
        return -EINVAL;

    if ( broadcast )
        gstatus &= ~MCG_STATUS_LMCE;
    else if ( gstatus & MCG_STATUS_LMCE )
    {
        ASSERT(vmce_vcpuid >= 0 && vmce_vcpuid < d->max_vcpus);
        v = d->vcpu[vmce_vcpuid];
    }

    /*
     * vMCE with the actual error information is injected to vCPU0,
     * and, if broadcast is required, we choose to inject less severe
     * vMCEs to other vCPUs. Thus guest can always get the severest
     * error (i.e. the actual one) on vCPU0. If guest can recover from
     * the severest error on vCPU0, the less severe errors on other
     * vCPUs will not prevent guest from recovering on those vCPUs.
     */
    ret = vcpu_fill_mc_msrs(v, gstatus, mc_bank->mc_status,
                            mc_bank->mc_addr, mc_bank->mc_misc);
    if ( broadcast )
        for_each_vcpu ( d, v )
        {
            if ( !v->vcpu_id )
                continue;
            err = vcpu_fill_mc_msrs(v, MCG_STATUS_MCIP | MCG_STATUS_RIPV,
                                    0, 0, 0);
            if ( err )
                ret = err;
        }

    return ret;
}

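/*
 * The set of p2m types that may still map a broken page and hence need to
 * be converted to p2m_ram_broken by unmmap_broken_page().
 */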
/* Reportedly some RAM is set up as mmio_direct for the UC cache attribute. */
#define P2M_UNMAP_TYPES (p2m_to_mask(p2m_ram_rw) \
                         | p2m_to_mask(p2m_ram_logdirty) \
                         | p2m_to_mask(p2m_ram_ro) \
                         | p2m_to_mask(p2m_mmio_direct))

/*
 * Currently all CPUs rendezvous in the MCE softirq handler, so there is no
 * need to consider the paging p2m types here.
 * Currently only HVM guests using EPT (HAP) paging mode are supported.
 * XXX: the following situations are not handled yet:
 *      PoD, foreign mapped, granted, shared pages
 */
int unmmap_broken_page(struct domain *d, mfn_t mfn, unsigned long gfn)
{
    mfn_t r_mfn;
    p2m_type_t pt;
    int rc;

    /* Always trust dom0's MCE handler will prevent future access */
    if ( is_hardware_domain(d) )
        return 0;

    if ( !mfn_valid(mfn) )
        return -EINVAL;

    if ( !is_hvm_domain(d) || !paging_mode_hap(d) )
        return -EOPNOTSUPP;

    rc = -1;
    r_mfn = get_gfn_query(d, gfn, &pt);
    if ( p2m_to_mask(pt) & P2M_UNMAP_TYPES )
    {
        ASSERT(mfn_x(r_mfn) == mfn_x(mfn));
        rc = p2m_change_type_one(d, gfn, pt, p2m_ram_broken);
    }
    put_gfn(d, gfn);

    return rc;
}

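/*
 * Enable additional MCA capabilities for all vCPUs of @d.  Currently only
 * XEN_HVM_MCA_CAP_LMCE is supported, and only if the host itself supports
 * LMCE (lmce_support).
 */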
int vmce_enable_mca_cap(struct domain *d, uint64_t cap)
{
    struct vcpu *v;

    if ( cap & ~XEN_HVM_MCA_CAP_MASK )
        return -EINVAL;

    if ( cap & XEN_HVM_MCA_CAP_LMCE )
    {
        if ( !lmce_support )
            return -EINVAL;
        for_each_vcpu(d, v)
            v->arch.vmce.mcg_cap |= MCG_LMCE_P;
    }

    return 0;
}