/*
 * svm.c: handling SVM architecture-related VM exits
 * Copyright (c) 2004, Intel Corporation.
 * Copyright (c) 2005-2007, Advanced Micro Devices, Inc.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms and conditions of the GNU General Public License,
 * version 2, as published by the Free Software Foundation.
 *
 * This program is distributed in the hope it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/trace.h>
#include <xen/sched.h>
#include <xen/irq.h>
#include <xen/softirq.h>
#include <xen/hypercall.h>
#include <xen/domain_page.h>
#include <xen/xenoprof.h>
#include <asm/current.h>
#include <asm/io.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/mem_sharing.h>
#include <asm/regs.h>
#include <asm/cpufeature.h>
#include <asm/processor.h>
#include <asm/amd.h>
#include <asm/guest_access.h>
#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/i387.h>
#include <asm/iocap.h>
#include <asm/hvm/emulate.h>
#include <asm/hvm/hvm.h>
#include <asm/hvm/support.h>
#include <asm/hvm/io.h>
#include <asm/hvm/emulate.h>
#include <asm/hvm/svm/asid.h>
#include <asm/hvm/svm/svm.h>
#include <asm/hvm/svm/vmcb.h>
#include <asm/hvm/svm/emulate.h>
#include <asm/hvm/svm/intr.h>
#include <asm/hvm/svm/svmdebug.h>
#include <asm/hvm/svm/nestedsvm.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/x86_emulate.h>
#include <public/sched.h>
#include <asm/hvm/vpt.h>
#include <asm/hvm/trace.h>
#include <asm/hap.h>
#include <asm/apic.h>
#include <asm/debugger.h>
#include <asm/xstate.h>

void svm_asm_do_resume(void);

u32 svm_feature_flags;

/* Indicates whether guests may use EFER.LMSLE. */
bool_t cpu_has_lmsl;

static void svm_update_guest_efer(struct vcpu *);

static struct hvm_function_table svm_function_table;

/*
 * Physical addresses of the Host State Area (for hardware) and vmcb (for Xen)
 * which contains Xen's fs/gs/tr/ldtr and GSBASE/STAR/SYSENTER state when in
 * guest vcpu context.
 */
static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, hsa);
static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, host_vmcb);

static bool_t amd_erratum383_found __read_mostly;

/* OSVW bits */
static uint64_t osvw_length, osvw_status;
static DEFINE_SPINLOCK(osvw_lock);

/* Only crash the guest if the problem originates in kernel mode. */
static void svm_crash_or_fault(struct vcpu *v)
{
    if ( vmcb_get_cpl(v->arch.hvm_svm.vmcb) )
        hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
    else
        domain_crash(v->domain);
}

void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
{
    struct vcpu *curr = current;

    if ( unlikely(inst_len == 0) )
        return;

    if ( unlikely(inst_len > MAX_INST_LEN) )
    {
        gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len);
        svm_crash_or_fault(curr);
        return;
    }

    ASSERT(regs == guest_cpu_user_regs());

    regs->rip += inst_len;
    regs->eflags &= ~X86_EFLAGS_RF;

    curr->arch.hvm_svm.vmcb->interrupt_shadow = 0;

    if ( regs->eflags & X86_EFLAGS_TF )
        hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
}

static void svm_cpu_down(void)
{
    write_efer(read_efer() & ~EFER_SVME);
}

unsigned long *
svm_msrbit(unsigned long *msr_bitmap, uint32_t msr)
{
    unsigned long *msr_bit = NULL;

    /*
     * See AMD64 Programmer's Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
     */
    if ( msr <= 0x1fff )
        msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
    else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
        msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
    else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
        msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;

    return msr_bit;
}

void svm_intercept_msr(struct vcpu *v, uint32_t msr, int flags)
{
    unsigned long *msr_bit;

    msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr);
    BUG_ON(msr_bit == NULL);
    msr &= 0x1fff;

    if ( flags & MSR_INTERCEPT_READ )
         __set_bit(msr * 2, msr_bit);
    else
         __clear_bit(msr * 2, msr_bit);

    if ( flags & MSR_INTERCEPT_WRITE )
        __set_bit(msr * 2 + 1, msr_bit);
    else
        __clear_bit(msr * 2 + 1, msr_bit);
}
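
/*
 * Illustrative sketch (not part of the original code): the MSR permissions
 * map is split into 2K byte vectors, one per architectural MSR range, with
 * two bits per MSR -- an even "read" bit and an odd "write" bit.  Making
 * MSR_STAR (0xc0000081) write-intercepted only would boil down to:
 *
 *     unsigned long *bit = svm_msrbit(msrpm, 0xc0000081); // 0x0800 vector
 *     __clear_bit((0xc0000081 & 0x1fff) * 2,     bit);    // reads pass
 *     __set_bit  ((0xc0000081 & 0x1fff) * 2 + 1, bit);    // writes exit
 *
 * which is what svm_intercept_msr(v, MSR_STAR, MSR_INTERCEPT_WRITE) does.
 */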

static void svm_save_dr(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    unsigned int flag_dr_dirty = v->arch.hvm_vcpu.flag_dr_dirty;

    if ( !flag_dr_dirty )
        return;

    /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
    v->arch.hvm_vcpu.flag_dr_dirty = 0;
    vmcb_set_dr_intercepts(vmcb, ~0u);

    if ( v->domain->arch.cpuid->extd.dbext )
    {
        svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW);
        svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_RW);
        svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_RW);
        svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_RW);

        rdmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
        rdmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
        rdmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
        rdmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
    }

    v->arch.debugreg[0] = read_debugreg(0);
    v->arch.debugreg[1] = read_debugreg(1);
    v->arch.debugreg[2] = read_debugreg(2);
    v->arch.debugreg[3] = read_debugreg(3);
    v->arch.debugreg[6] = vmcb_get_dr6(vmcb);
    v->arch.debugreg[7] = vmcb_get_dr7(vmcb);
}

static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v)
{
    if ( v->arch.hvm_vcpu.flag_dr_dirty )
        return;

    v->arch.hvm_vcpu.flag_dr_dirty = 1;
    vmcb_set_dr_intercepts(vmcb, 0);

    ASSERT(v == current);

    if ( v->domain->arch.cpuid->extd.dbext )
    {
        svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
        svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
        svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
        svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);

        wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
        wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
        wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
        wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
    }

    write_debugreg(0, v->arch.debugreg[0]);
    write_debugreg(1, v->arch.debugreg[1]);
    write_debugreg(2, v->arch.debugreg[2]);
    write_debugreg(3, v->arch.debugreg[3]);
    vmcb_set_dr6(vmcb, v->arch.debugreg[6]);
    vmcb_set_dr7(vmcb, v->arch.debugreg[7]);
}

/*
 * DR7 is saved and restored on every vmexit.  Other debug registers only
 * need to be restored if their value is going to affect execution -- i.e.,
 * if one of the breakpoints is enabled.  So mask out all bits that don't
 * enable some breakpoint functionality.
 */
static void svm_restore_dr(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
        __restore_debug_registers(vmcb, v);
}

static int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;

    c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
    c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
    c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
    c->cr4 = v->arch.hvm_vcpu.guest_cr[4];

    c->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs;
    c->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp;
    c->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip;

    c->pending_event = 0;
    c->error_code = 0;
    if ( vmcb->eventinj.fields.v &&
         hvm_event_needs_reinjection(vmcb->eventinj.fields.type,
                                     vmcb->eventinj.fields.vector) )
    {
        c->pending_event = (uint32_t)vmcb->eventinj.bytes;
        c->error_code = vmcb->eventinj.fields.errorcode;
    }

    return 1;
}

static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
{
    struct page_info *page = NULL;
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);

    if ( c->pending_valid )
    {
        if ( (c->pending_type == 1) || (c->pending_type > 4) ||
             (c->pending_reserved != 0) )
        {
            dprintk(XENLOG_ERR, "%pv: Invalid pending event %#"PRIx32"\n",
                    v, c->pending_event);
            return -EINVAL;
        }

        if ( c->pending_error_valid &&
             c->error_code != (uint16_t)c->error_code )
        {
            dprintk(XENLOG_ERR, "%pv: Invalid error code %#"PRIx32"\n",
                    v, c->error_code);
            return -EINVAL;
        }
    }

    if ( !paging_mode_hap(v->domain) )
    {
        if ( c->cr0 & X86_CR0_PG )
        {
            page = get_page_from_gfn(v->domain, c->cr3 >> PAGE_SHIFT,
                                     NULL, P2M_ALLOC);
            if ( !page )
            {
                gdprintk(XENLOG_ERR, "Invalid CR3 value=%#"PRIx64"\n",
                         c->cr3);
                return -EINVAL;
            }
        }

        if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
            put_page(pagetable_get_page(v->arch.guest_table));

        v->arch.guest_table =
            page ? pagetable_from_page(page) : pagetable_null();
    }

    v->arch.hvm_vcpu.guest_cr[0] = c->cr0 | X86_CR0_ET;
    v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
    v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
    v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
    svm_update_guest_cr(v, 0);
    svm_update_guest_cr(v, 2);
    svm_update_guest_cr(v, 4);

    /* Load sysenter MSRs into both VMCB save area and VCPU fields. */
    vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs;
    vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = c->sysenter_esp;
    vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = c->sysenter_eip;

    if ( paging_mode_hap(v->domain) )
    {
        vmcb_set_np_enable(vmcb, 1);
        vmcb_set_g_pat(vmcb, MSR_IA32_CR_PAT_RESET /* guest PAT */);
        vmcb_set_h_cr3(vmcb, pagetable_get_paddr(p2m_get_pagetable(p2m)));
    }

    if ( c->pending_valid &&
         hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
    {
        gdprintk(XENLOG_INFO, "Re-injecting %#"PRIx32", %#"PRIx32"\n",
                 c->pending_event, c->error_code);
        vmcb->eventinj.bytes = c->pending_event;
        vmcb->eventinj.fields.errorcode = c->error_code;
    }
    else
        vmcb->eventinj.bytes = 0;

    vmcb->cleanbits.bytes = 0;
    paging_update_paging_modes(v);

    return 0;
}


static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;

    data->shadow_gs        = vmcb->kerngsbase;
    data->msr_lstar        = vmcb->lstar;
    data->msr_star         = vmcb->star;
    data->msr_cstar        = vmcb->cstar;
    data->msr_syscall_mask = vmcb->sfmask;
    data->msr_efer         = v->arch.hvm_vcpu.guest_efer;
    data->msr_flags        = 0;
}


static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;

    vmcb->kerngsbase = data->shadow_gs;
    vmcb->lstar      = data->msr_lstar;
    vmcb->star       = data->msr_star;
    vmcb->cstar      = data->msr_cstar;
    vmcb->sfmask     = data->msr_syscall_mask;
    v->arch.hvm_vcpu.guest_efer = data->msr_efer;
    svm_update_guest_efer(v);
}

static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
{
    svm_save_cpu_state(v, ctxt);
    svm_vmcb_save(v, ctxt);
}

static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
{
    svm_load_cpu_state(v, ctxt);
    if (svm_vmcb_restore(v, ctxt)) {
        gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n");
        domain_crash(v->domain);
        return -EINVAL;
    }

    return 0;
}

static unsigned int __init svm_init_msr(void)
{
    return boot_cpu_has(X86_FEATURE_DBEXT) ? 4 : 0;
}

static void svm_save_msr(struct vcpu *v, struct hvm_msr *ctxt)
{
    if ( boot_cpu_has(X86_FEATURE_DBEXT) )
    {
        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[0];
        if ( ctxt->msr[ctxt->count].val )
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR0_ADDRESS_MASK;

        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[1];
        if ( ctxt->msr[ctxt->count].val )
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR1_ADDRESS_MASK;

        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[2];
        if ( ctxt->msr[ctxt->count].val )
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR2_ADDRESS_MASK;

        ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[3];
        if ( ctxt->msr[ctxt->count].val )
            ctxt->msr[ctxt->count++].index = MSR_AMD64_DR3_ADDRESS_MASK;
    }
}

static int svm_load_msr(struct vcpu *v, struct hvm_msr *ctxt)
{
    unsigned int i, idx;
    int err = 0;

    for ( i = 0; i < ctxt->count; ++i )
    {
        switch ( idx = ctxt->msr[i].index )
        {
        case MSR_AMD64_DR0_ADDRESS_MASK:
            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
                err = -ENXIO;
            else if ( ctxt->msr[i].val >> 32 )
                err = -EDOM;
            else
                v->arch.hvm_svm.dr_mask[0] = ctxt->msr[i].val;
            break;

        case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
            if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
                err = -ENXIO;
            else if ( ctxt->msr[i].val >> 32 )
                err = -EDOM;
            else
                v->arch.hvm_svm.dr_mask[idx - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
                    ctxt->msr[i].val;
            break;

        default:
            continue;
        }
        if ( err )
            break;
        ctxt->msr[i]._rsvd = 1;
    }

    return err;
}

static void svm_fpu_enter(struct vcpu *v)
{
    struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;

    vcpu_restore_fpu_lazy(v);
    vmcb_set_exception_intercepts(
        n1vmcb,
        vmcb_get_exception_intercepts(n1vmcb) & ~(1U << TRAP_no_device));
}

static void svm_fpu_leave(struct vcpu *v)
{
    struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;

    ASSERT(!v->fpu_dirtied);
    ASSERT(read_cr0() & X86_CR0_TS);

    /*
     * If the guest does not have TS enabled then we must cause and handle an
     * exception on first use of the FPU. If the guest *does* have TS enabled
     * then this is not necessary: no FPU activity can occur until the guest
     * clears CR0.TS, and we will initialise the FPU when that happens.
     */
    if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
    {
        vmcb_set_exception_intercepts(
            n1vmcb,
            vmcb_get_exception_intercepts(n1vmcb) | (1U << TRAP_no_device));
        vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) | X86_CR0_TS);
    }
}

static unsigned int svm_get_interrupt_shadow(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    unsigned int intr_shadow = 0;

    if ( vmcb->interrupt_shadow )
        intr_shadow |= HVM_INTR_SHADOW_MOV_SS | HVM_INTR_SHADOW_STI;

    if ( vmcb_get_general1_intercepts(vmcb) & GENERAL1_INTERCEPT_IRET )
        intr_shadow |= HVM_INTR_SHADOW_NMI;

    return intr_shadow;
}

static void svm_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);

    vmcb->interrupt_shadow =
        !!(intr_shadow & (HVM_INTR_SHADOW_MOV_SS|HVM_INTR_SHADOW_STI));

    general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
    if ( intr_shadow & HVM_INTR_SHADOW_NMI )
        general1_intercepts |= GENERAL1_INTERCEPT_IRET;
    vmcb_set_general1_intercepts(vmcb, general1_intercepts);
}

static int svm_guest_x86_mode(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;

    if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
        return 0;
    if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
        return 1;
    if ( hvm_long_mode_active(v) && likely(vmcb->cs.l) )
        return 8;
    return likely(vmcb->cs.db) ? 4 : 2;
}
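
/*
 * Reader's note (informal, mirroring the logic above): the return value
 * encodes the guest execution mode for callers such as the emulator --
 *   0: real mode (CR0.PE clear),
 *   1: virtual-8086 mode (EFLAGS.VM set),
 *   2/4: 16/32-bit protected mode depending on CS.D,
 *   8: 64-bit mode (long mode active and CS.L set).
 */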

void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    uint64_t value;

    switch ( cr )
    {
    case 0: {
        unsigned long hw_cr0_mask = 0;

        if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
        {
            if ( v != current )
                hw_cr0_mask |= X86_CR0_TS;
            else if ( vmcb_get_cr0(vmcb) & X86_CR0_TS )
                svm_fpu_enter(v);
        }

        value = v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
        if ( !paging_mode_hap(v->domain) )
            value |= X86_CR0_PG | X86_CR0_WP;
        vmcb_set_cr0(vmcb, value);
        break;
    }
    case 2:
        vmcb_set_cr2(vmcb, v->arch.hvm_vcpu.guest_cr[2]);
        break;
    case 3:
        vmcb_set_cr3(vmcb, v->arch.hvm_vcpu.hw_cr[3]);
        if ( !nestedhvm_enabled(v->domain) )
            hvm_asid_flush_vcpu(v);
        else if ( nestedhvm_vmswitch_in_progress(v) )
            ; /* CR3 switches during VMRUN/VMEXIT do not flush the TLB. */
        else
            hvm_asid_flush_vcpu_asid(
                nestedhvm_vcpu_in_guestmode(v)
                ? &vcpu_nestedhvm(v).nv_n2asid : &v->arch.hvm_vcpu.n1asid);
        break;
    case 4:
        value = HVM_CR4_HOST_MASK;
        if ( paging_mode_hap(v->domain) )
            value &= ~X86_CR4_PAE;
        value |= v->arch.hvm_vcpu.guest_cr[4];

        if ( !hvm_paging_enabled(v) )
        {
            /*
             * When the guest thinks paging is disabled, Xen may need to hide
             * the effects of shadow paging, as hardware runs with the host
             * paging settings, rather than the guest's settings.
             *
             * Without CR0.PG, all memory accesses are user mode, so
             * _PAGE_USER must be set in the shadow pagetables for guest
             * userspace to function.  This in turn trips up guest supervisor
             * mode if SMEP/SMAP are left active in context.  They wouldn't
             * have any effect if paging was actually disabled, so hide them
             * behind the back of the guest.
             */
            value &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
        }

        vmcb_set_cr4(vmcb, value);
        break;
    default:
        BUG();
    }
}

static void svm_update_guest_efer(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    bool_t lma = !!(v->arch.hvm_vcpu.guest_efer & EFER_LMA);
    uint64_t new_efer;

    new_efer = (v->arch.hvm_vcpu.guest_efer | EFER_SVME) & ~EFER_LME;
    if ( lma )
        new_efer |= EFER_LME;
    vmcb_set_efer(vmcb, new_efer);
}

static void svm_update_guest_vendor(struct vcpu *v)
{
    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
    struct vmcb_struct *vmcb = arch_svm->vmcb;
    u32 bitmap = vmcb_get_exception_intercepts(vmcb);

    if ( opt_hvm_fep ||
         (v->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor) )
        bitmap |= (1U << TRAP_invalid_op);
    else
        bitmap &= ~(1U << TRAP_invalid_op);

    vmcb_set_exception_intercepts(vmcb, bitmap);
}

static void svm_sync_vmcb(struct vcpu *v)
{
    struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;

    if ( arch_svm->vmcb_in_sync )
        return;

    arch_svm->vmcb_in_sync = 1;

    svm_vmsave(arch_svm->vmcb);
}
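
/*
 * Background note (informal): VMSAVE stores the state which VMRUN/#VMEXIT
 * does not shuttle through the VMCB automatically (FS/GS/TR/LDTR,
 * KernelGSBase, STAR/LSTAR/CSTAR/SFMASK and SYSENTER state) back into the
 * given VMCB.  svm_sync_vmcb() is therefore the lazy "pull the remaining
 * guest state out of hardware" step, and the vmcb_in_sync flag ensures that
 * e.g. several consecutive segment register reads pay for only one VMSAVE.
 */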

static unsigned int svm_get_cpl(struct vcpu *v)
{
    return vmcb_get_cpl(v->arch.hvm_svm.vmcb);
}

static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
                                     struct segment_register *reg)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;

    ASSERT((v == current) || !vcpu_runnable(v));

    switch ( seg )
    {
    case x86_seg_fs ... x86_seg_gs:
        svm_sync_vmcb(v);

        /* Fallthrough. */
    case x86_seg_es ... x86_seg_ds:
        *reg = vmcb->sreg[seg];

        if ( seg == x86_seg_ss )
            reg->dpl = vmcb_get_cpl(vmcb);
        break;

    case x86_seg_tr:
        svm_sync_vmcb(v);
        *reg = vmcb->tr;
        break;

    case x86_seg_gdtr:
        *reg = vmcb->gdtr;
        break;

    case x86_seg_idtr:
        *reg = vmcb->idtr;
        break;

    case x86_seg_ldtr:
        svm_sync_vmcb(v);
        *reg = vmcb->ldtr;
        break;

    default:
        ASSERT_UNREACHABLE();
        domain_crash(v->domain);
        *reg = (struct segment_register){};
    }
}

static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg,
                                     struct segment_register *reg)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    bool sync = false;

    ASSERT((v == current) || !vcpu_runnable(v));

    switch ( seg )
    {
    case x86_seg_cs:
    case x86_seg_ds:
    case x86_seg_es:
    case x86_seg_ss: /* cpl */
        vmcb->cleanbits.fields.seg = 0;
        break;

    case x86_seg_gdtr:
    case x86_seg_idtr:
        vmcb->cleanbits.fields.dt = 0;
        break;

    case x86_seg_fs:
    case x86_seg_gs:
    case x86_seg_tr:
    case x86_seg_ldtr:
        sync = (v == current);
        break;

    default:
        ASSERT_UNREACHABLE();
        domain_crash(v->domain);
        return;
    }

    if ( sync )
        svm_sync_vmcb(v);

    switch ( seg )
    {
    case x86_seg_ss:
        vmcb_set_cpl(vmcb, reg->dpl);

        /* Fallthrough */
    case x86_seg_es ... x86_seg_cs:
    case x86_seg_ds ... x86_seg_gs:
        vmcb->sreg[seg] = *reg;
        break;

    case x86_seg_tr:
        vmcb->tr = *reg;
        break;

    case x86_seg_gdtr:
        vmcb->gdtr.base = reg->base;
        vmcb->gdtr.limit = reg->limit;
        break;

    case x86_seg_idtr:
        vmcb->idtr.base = reg->base;
        vmcb->idtr.limit = reg->limit;
        break;

    case x86_seg_ldtr:
        vmcb->ldtr = *reg;
        break;

    case x86_seg_none:
        ASSERT_UNREACHABLE();
        break;
    }

    if ( sync )
        svm_vmload(vmcb);
}

static unsigned long svm_get_shadow_gs_base(struct vcpu *v)
{
    return v->arch.hvm_svm.vmcb->kerngsbase;
}

static int svm_set_guest_pat(struct vcpu *v, u64 gpat)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;

    if ( !paging_mode_hap(v->domain) )
        return 0;

    vmcb_set_g_pat(vmcb, gpat);
    return 1;
}

static int svm_get_guest_pat(struct vcpu *v, u64 *gpat)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;

    if ( !paging_mode_hap(v->domain) )
        return 0;

    *gpat = vmcb_get_g_pat(vmcb);
    return 1;
}

static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio)
{
    uint64_t mult, frac, scaled_host_tsc;

    if ( ratio == DEFAULT_TSC_RATIO )
        return host_tsc;

    /*
     * Suppose the most significant 32 bits of host_tsc and ratio are
     * tsc_h and mult, and the least significant 32 bits are tsc_l and frac,
     * then
     *     host_tsc * ratio * 2^-32
     *     = host_tsc * (mult * 2^32 + frac) * 2^-32
     *     = host_tsc * mult + (tsc_h * 2^32 + tsc_l) * frac * 2^-32
     *     = host_tsc * mult + tsc_h * frac + ((tsc_l * frac) >> 32)
     *
     * Multiplications in the last two terms are between 32-bit integers,
     * so both of them can fit in 64-bit integers.
     *
     * Because mult is usually less than 10 in practice, it's very rare
     * that host_tsc * mult can overflow a 64-bit integer.
     */
    mult = ratio >> 32;
    frac = ratio & ((1ULL << 32) - 1);
    scaled_host_tsc  = host_tsc * mult;
    scaled_host_tsc += (host_tsc >> 32) * frac;
    scaled_host_tsc += ((host_tsc & ((1ULL << 32) - 1)) * frac) >> 32;

    return scaled_host_tsc;
}
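
/*
 * Worked example (informal): the ratio is a fixed-point value with a 32-bit
 * fractional part (DEFAULT_TSC_RATIO == 1ULL << 32, i.e. 1.0).  Scaling a
 * host TSC of 0x1'0000'0000 by a ratio of 1.5 (0x0000'0001'8000'0000) gives:
 *     mult = 1, frac = 0x8000'0000
 *     host_tsc * mult                          = 0x1'0000'0000
 *     (host_tsc >> 32) * frac                  = 0x0'8000'0000
 *     ((host_tsc & 0xffffffff) * frac) >> 32   = 0
 *     => 0x1'8000'0000, i.e. 1.5 * host_tsc, as expected.
 */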

static uint64_t svm_get_tsc_offset(uint64_t host_tsc, uint64_t guest_tsc,
                                   uint64_t ratio)
{
    return guest_tsc - scale_tsc(host_tsc, ratio);
}

static void svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    struct vmcb_struct *n1vmcb, *n2vmcb;
    uint64_t n2_tsc_offset = 0;
    struct domain *d = v->domain;

    if ( !nestedhvm_enabled(d) ) {
        vmcb_set_tsc_offset(vmcb, offset);
        return;
    }

    n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
    n2vmcb = vcpu_nestedhvm(v).nv_n2vmcx;

    if ( nestedhvm_vcpu_in_guestmode(v) ) {
        struct nestedsvm *svm = &vcpu_nestedsvm(v);

        n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) -
                        vmcb_get_tsc_offset(n1vmcb);
        if ( svm->ns_tscratio != DEFAULT_TSC_RATIO ) {
            uint64_t guest_tsc = hvm_get_guest_tsc_fixed(v, at_tsc);

            n2_tsc_offset = svm_get_tsc_offset(guest_tsc,
                                               guest_tsc + n2_tsc_offset,
                                               svm->ns_tscratio);
        }
        vmcb_set_tsc_offset(n1vmcb, offset);
    }

    vmcb_set_tsc_offset(vmcb, offset + n2_tsc_offset);
}

static void svm_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
    u32 general2_intercepts = vmcb_get_general2_intercepts(vmcb);

    general1_intercepts &= ~GENERAL1_INTERCEPT_RDTSC;
    general2_intercepts &= ~GENERAL2_INTERCEPT_RDTSCP;

    if ( enable )
    {
        general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
        general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
    }

    vmcb_set_general1_intercepts(vmcb, general1_intercepts);
    vmcb_set_general2_intercepts(vmcb, general2_intercepts);
}

static void svm_set_descriptor_access_exiting(struct vcpu *v, bool enable)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
    u32 mask = GENERAL1_INTERCEPT_IDTR_READ | GENERAL1_INTERCEPT_GDTR_READ
            | GENERAL1_INTERCEPT_LDTR_READ | GENERAL1_INTERCEPT_TR_READ
            | GENERAL1_INTERCEPT_IDTR_WRITE | GENERAL1_INTERCEPT_GDTR_WRITE
            | GENERAL1_INTERCEPT_LDTR_WRITE | GENERAL1_INTERCEPT_TR_WRITE;

    if ( enable )
        general1_intercepts |= mask;
    else
        general1_intercepts &= ~mask;

    vmcb_set_general1_intercepts(vmcb, general1_intercepts);
}

static unsigned int svm_get_insn_bytes(struct vcpu *v, uint8_t *buf)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    unsigned int len = v->arch.hvm_svm.cached_insn_len;

    if ( len != 0 )
    {
        /* Latch and clear the cached instruction. */
        memcpy(buf, vmcb->guest_ins, MAX_INST_LEN);
        v->arch.hvm_svm.cached_insn_len = 0;
    }

    return len;
}

static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
{
    char *p;
    int i;

    for ( i = 0; i < (PAGE_SIZE / 32); i++ )
    {
        if ( i == __HYPERVISOR_iret )
            continue;

        p = (char *)(hypercall_page + (i * 32));
        *(u8  *)(p + 0) = 0xb8; /* mov imm32, %eax */
        *(u32 *)(p + 1) = i;
        *(u8  *)(p + 5) = 0x0f; /* vmmcall */
        *(u8  *)(p + 6) = 0x01;
        *(u8  *)(p + 7) = 0xd9;
        *(u8  *)(p + 8) = 0xc3; /* ret */
    }

    /* Don't support HYPERVISOR_iret at the moment */
    *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
}
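
/*
 * For clarity (informal): each 32-byte hypercall stub written above
 * disassembles to
 *
 *     b8 xx xx xx xx      mov  $<hypercall-number>, %eax
 *     0f 01 d9            vmmcall
 *     c3                  ret
 *
 * so a guest invokes hypercall N by calling hypercall_page + N * 32, with
 * the remaining arguments already in registers per the hypercall ABI.
 */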

static void svm_lwp_interrupt(struct cpu_user_regs *regs)
{
    struct vcpu *curr = current;

    ack_APIC_irq();
    vlapic_set_irq(
        vcpu_vlapic(curr),
        (curr->arch.hvm_svm.guest_lwp_cfg >> 40) & 0xff,
        0);
}

static inline void svm_lwp_save(struct vcpu *v)
{
    /* Don't mess with other guests: disable LWP for the next VCPU. */
    if ( v->arch.hvm_svm.guest_lwp_cfg )
    {
        wrmsrl(MSR_AMD64_LWP_CFG, 0x0);
        wrmsrl(MSR_AMD64_LWP_CBADDR, 0x0);
    }
}

static inline void svm_lwp_load(struct vcpu *v)
{
    /* Only LWP_CFG is reloaded. LWP_CBADDR will be reloaded via xrstor. */
   if ( v->arch.hvm_svm.guest_lwp_cfg )
       wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg);
}

/* Update LWP_CFG MSR (0xc0000105). Return -1 if error; otherwise returns 0. */
static int svm_update_lwp_cfg(struct vcpu *v, uint64_t msr_content)
{
    uint32_t msr_low;
    static uint8_t lwp_intr_vector;

    if ( xsave_enabled(v) && cpu_has_lwp )
    {
        msr_low = (uint32_t)msr_content;

        /* generate #GP if guest tries to turn on unsupported features. */
        if ( msr_low & ~v->domain->arch.cpuid->extd.raw[0x1c].d )
            return -1;

        v->arch.hvm_svm.guest_lwp_cfg = msr_content;

        /* setup interrupt handler if needed */
        if ( (msr_content & 0x80000000) && ((msr_content >> 40) & 0xff) )
        {
            alloc_direct_apic_vector(&lwp_intr_vector, svm_lwp_interrupt);
            v->arch.hvm_svm.cpu_lwp_cfg = (msr_content & 0xffff00ffffffffffULL)
                | ((uint64_t)lwp_intr_vector << 40);
        }
        else
        {
            /* otherwise disable it */
            v->arch.hvm_svm.cpu_lwp_cfg = msr_content & 0xffff00ff7fffffffULL;
        }

        wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg);

        /* Track non-lazy state if LWP_CFG is non-zero. */
        v->arch.nonlazy_xstate_used = !!(msr_content);
    }

    return 0;
}
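
/*
 * Informal reading of the code above (not a full MSR specification): the
 * low 32 bits of LWP_CFG select the LWP features the guest wants (validated
 * against CPUID leaf 0x8000001c EDX), bit 31 requests threshold interrupts,
 * and bits 47:40 hold the interrupt vector -- which Xen substitutes with its
 * own lwp_intr_vector so the event can be reflected to the guest via the
 * vLAPIC in svm_lwp_interrupt().
 */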

static inline void svm_tsc_ratio_save(struct vcpu *v)
{
    /* Other vcpus might not have vtsc enabled. So disable TSC_RATIO here. */
    if ( cpu_has_tsc_ratio && !v->domain->arch.vtsc )
        wrmsrl(MSR_AMD64_TSC_RATIO, DEFAULT_TSC_RATIO);
}

static inline void svm_tsc_ratio_load(struct vcpu *v)
{
    if ( cpu_has_tsc_ratio && !v->domain->arch.vtsc )
        wrmsrl(MSR_AMD64_TSC_RATIO, hvm_tsc_scaling_ratio(v->domain));
}

static void svm_ctxt_switch_from(struct vcpu *v)
{
    int cpu = smp_processor_id();

    /*
     * Return early if trying to do a context switch without SVM enabled,
     * this can happen when the hypervisor shuts down with HVM guests
     * still running.
     */
    if ( unlikely((read_efer() & EFER_SVME) == 0) )
        return;

    svm_fpu_leave(v);

    svm_save_dr(v);
    svm_lwp_save(v);
    svm_tsc_ratio_save(v);

    svm_sync_vmcb(v);
    svm_vmload_pa(per_cpu(host_vmcb, cpu));

    /* Resume use of ISTs now that the host TR is reinstated. */
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NMI);
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
}

static void svm_ctxt_switch_to(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    int cpu = smp_processor_id();

    /*
     * This is required because VMRUN performs consistency checks, and some
     * of the DOM0 selectors point at invalid GDT locations, which would
     * cause AMD processors to shut down.
     */
    asm volatile ("mov %0, %%ds; mov %0, %%es; mov %0, %%ss;" :: "r" (0));

    /*
     * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR.
     * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET.
     */
    set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);

    svm_restore_dr(v);

    svm_vmsave_pa(per_cpu(host_vmcb, cpu));
    svm_vmload(vmcb);
    vmcb->cleanbits.bytes = 0;
    svm_lwp_load(v);
    svm_tsc_ratio_load(v);

    if ( cpu_has_rdtscp )
        wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v));
}

static void noreturn svm_do_resume(struct vcpu *v)
{
    struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
    bool_t debug_state = v->domain->debugger_attached;
    bool_t vcpu_guestmode = 0;
    struct vlapic *vlapic = vcpu_vlapic(v);

    if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
        vcpu_guestmode = 1;

    if ( !vcpu_guestmode &&
        unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
    {
        uint32_t intercepts = vmcb_get_exception_intercepts(vmcb);

        v->arch.hvm_vcpu.debug_state_latch = debug_state;
        vmcb_set_exception_intercepts(
            vmcb, debug_state ? (intercepts | (1U << TRAP_int3))
                              : (intercepts & ~(1U << TRAP_int3)));
    }

    if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
    {
        v->arch.hvm_svm.launch_core = smp_processor_id();
        hvm_migrate_timers(v);
        hvm_migrate_pirqs(v);
        /* Migrating to another ASID domain.  Request a new ASID. */
        hvm_asid_flush_vcpu(v);
    }

    if ( !vcpu_guestmode && !vlapic_hw_disabled(vlapic) )
    {
        vintr_t intr;

        /* Reflect the vlapic's TPR in the hardware vtpr */
        intr = vmcb_get_vintr(vmcb);
        intr.fields.tpr =
            (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0xFF) >> 4;
        vmcb_set_vintr(vmcb, intr);
    }

    hvm_do_resume(v);

    reset_stack_and_jump(svm_asm_do_resume);
}

static void svm_guest_osvw_init(struct vcpu *vcpu)
{
    if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
        return;

    /*
     * Guests should see errata 400 and 415 as fixed (assuming that
     * HLT and IO instructions are intercepted).
     */
    vcpu->arch.hvm_svm.osvw.length = (osvw_length >= 3) ? osvw_length : 3;
    vcpu->arch.hvm_svm.osvw.status = osvw_status & ~(6ULL);

    /*
     * By increasing VCPU's osvw.length to 3 we are telling the guest that
     * all osvw.status bits inside that length, including bit 0 (which is
     * reserved for erratum 298), are valid. However, if host processor's
     * osvw_len is 0 then osvw_status[0] carries no information. We need to
     * be conservative here and therefore we tell the guest that erratum 298
     * is present (because we really don't know).
     */
    if ( osvw_length == 0 && boot_cpu_data.x86 == 0x10 )
        vcpu->arch.hvm_svm.osvw.status |= 1;
}

void svm_host_osvw_reset()
{
    spin_lock(&osvw_lock);

    osvw_length = 64; /* One register (MSRC001_0141) worth of errata */
    osvw_status = 0;

    spin_unlock(&osvw_lock);
}

void svm_host_osvw_init()
{
    spin_lock(&osvw_lock);

    /*
     * Get OSVW bits. If bits are not the same on different processors then
     * choose the worst case (i.e. if erratum is present on one processor and
     * not on another assume that the erratum is present everywhere).
     */
    if ( test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability) )
    {
        uint64_t len, status;

        if ( rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) ||
             rdmsr_safe(MSR_AMD_OSVW_STATUS, status) )
            len = status = 0;

        if (len < osvw_length)
            osvw_length = len;

        osvw_status |= status;
        osvw_status &= (1ULL << osvw_length) - 1;
    }
    else
        osvw_length = osvw_status = 0;

    spin_unlock(&osvw_lock);
}

static int svm_domain_initialise(struct domain *d)
{
    static const struct arch_csw csw = {
        .from = svm_ctxt_switch_from,
        .to   = svm_ctxt_switch_to,
        .tail = svm_do_resume,
    };

    d->arch.ctxt_switch = &csw;

    return 0;
}

static void svm_domain_destroy(struct domain *d)
{
}

static int svm_vcpu_initialise(struct vcpu *v)
{
    int rc;

    v->arch.hvm_svm.launch_core = -1;

    if ( (rc = svm_create_vmcb(v)) != 0 )
    {
        dprintk(XENLOG_WARNING,
                "Failed to create VMCB for vcpu %d: err=%d.\n",
                v->vcpu_id, rc);
        return rc;
    }

    svm_guest_osvw_init(v);

    return 0;
}

static void svm_vcpu_destroy(struct vcpu *v)
{
    svm_destroy_vmcb(v);
    passive_domain_destroy(v);
}

/*
 * Emulate enough of interrupt injection to cover the DPL check (omitted by
 * hardware), and to work out whether it is safe to move %rip forwards for
 * architectural trap vs fault semantics in the exception frame (which
 * hardware won't cope with).
 *
 * The event parameter will be modified to a fault if necessary.
 */
static void svm_emul_swint_injection(struct x86_event *event)
{
    struct vcpu *curr = current;
    const struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
    const struct cpu_user_regs *regs = guest_cpu_user_regs();
    unsigned int trap = event->vector, type = event->type;
    unsigned int fault = TRAP_gp_fault, ec = 0;
    pagefault_info_t pfinfo;
    struct segment_register cs, idtr;
    unsigned int idte_size, idte_offset;
    unsigned long idte_linear_addr;
    struct { uint32_t a, b, c, d; } idte = {};
    bool lm = vmcb_get_efer(vmcb) & EFER_LMA;
    int rc;

    if ( !(vmcb_get_cr0(vmcb) & X86_CR0_PE) )
        goto raise_exception; /* TODO: support real-mode injection? */

    idte_size   = lm ? 16 : 8;
    idte_offset = trap * idte_size;

    /* ICEBP sets the External Event bit despite being an instruction. */
    ec = (trap << 3) | X86_XEC_IDT |
        (type == X86_EVENTTYPE_PRI_SW_EXCEPTION ? X86_XEC_EXT : 0);

    /*
     * TODO: This does not cover the v8086 mode with CR4.VME case
     * correctly, but falls on the safe side from the point of view of a
     * 32bit OS.  Someone with many TUITs can see about reading the TSS
     * Software Interrupt Redirection bitmap.
     */
    if ( (regs->eflags & X86_EFLAGS_VM) &&
         MASK_EXTR(regs->eflags, X86_EFLAGS_IOPL) != 3 )
        goto raise_exception;

    /*
     * Read all 8/16 bytes so the idtr limit check is applied properly to
     * this entry, even though we don't look at all the words read.
     */
    hvm_get_segment_register(curr, x86_seg_cs, &cs);
    hvm_get_segment_register(curr, x86_seg_idtr, &idtr);
    if ( !hvm_virtual_to_linear_addr(x86_seg_idtr, &idtr, idte_offset,
                                     idte_size, hvm_access_read,
                                     &cs, &idte_linear_addr) )
        goto raise_exception;

    rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
                                    PFEC_implicit, &pfinfo);
    if ( rc )
    {
        if ( rc == HVMTRANS_bad_linear_to_gfn )
        {
            fault = TRAP_page_fault;
            ec = pfinfo.ec;
            event->cr2 = pfinfo.linear;
        }

        goto raise_exception;
    }

    /* This must be an interrupt, trap, or task gate. */
    switch ( (idte.b >> 8) & 0x1f )
    {
    case SYS_DESC_irq_gate:
    case SYS_DESC_trap_gate:
        break;
    case SYS_DESC_irq_gate16:
    case SYS_DESC_trap_gate16:
    case SYS_DESC_task_gate:
        if ( !lm )
            break;
        /* fall through */
    default:
        goto raise_exception;
    }

    /* The 64-bit high half's type must be zero. */
    if ( idte.d & 0x1f00 )
        goto raise_exception;

    /* ICEBP counts as a hardware event, and bypasses the dpl check. */
    if ( type != X86_EVENTTYPE_PRI_SW_EXCEPTION &&
         vmcb_get_cpl(vmcb) > ((idte.b >> 13) & 3) )
        goto raise_exception;

    /* Is this entry present? */
    if ( !(idte.b & (1u << 15)) )
    {
        fault = TRAP_no_segment;
        goto raise_exception;
    }

    /*
     * Any further fault during injection will cause a double fault.  It
     * is fine to leave this up to hardware, and software won't be in a
     * position to care about the architectural correctness of %rip in the
     * exception frame.
     */
    return;

 raise_exception:
    event->vector = fault;
    event->type = X86_EVENTTYPE_HW_EXCEPTION;
    event->insn_len = 0;
    event->error_code = ec;
}
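
/*
 * Descriptor layout reminder (informal): idte.a is the low dword
 * (selector:offset 15..0), idte.b is the attribute dword with the gate type
 * in bits 12:8, DPL in bits 14:13 and Present in bit 15, and -- for the
 * 16-byte long mode format -- idte.c carries offset 63:32 while idte.d is
 * reserved and must have a zero type field, which is what the checks above
 * test.
 */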

static void svm_inject_event(const struct x86_event *event)
{
    struct vcpu *curr = current;
    struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
    eventinj_t eventinj = vmcb->eventinj;
    struct x86_event _event = *event;
    struct cpu_user_regs *regs = guest_cpu_user_regs();

    /*
     * For hardware lacking NRips support, and always for ICEBP instructions,
     * the processor requires extra help to deliver software events.
     *
     * Xen must emulate enough of the event injection to be sure that a
     * further fault shouldn't occur during delivery.  This covers the fact
     * that hardware doesn't perform DPL checking on injection.
     *
     * Also, it accounts for proper positioning of %rip for an event with trap
     * semantics (where %rip should point after the instruction) which suffers
     * a fault during injection (at which point %rip should point at the
     * instruction).
     */
    if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
         (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
                                 event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
        svm_emul_swint_injection(&_event);

    switch ( _event.vector )
    {
    case TRAP_debug:
        if ( regs->eflags & X86_EFLAGS_TF )
        {
            __restore_debug_registers(vmcb, curr);
            vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
        }
        /* fall through */
    case TRAP_int3:
        if ( curr->domain->debugger_attached )
        {
            /* Debug/Int3: Trap to debugger. */
            domain_pause_for_debugger();
            return;
        }
    }

    if ( unlikely(eventinj.fields.v) &&
         (eventinj.fields.type == X86_EVENTTYPE_HW_EXCEPTION) )
    {
        _event.vector = hvm_combine_hw_exceptions(
            eventinj.fields.vector, _event.vector);
        if ( _event.vector == TRAP_double_fault )
            _event.error_code = 0;
    }

    eventinj.bytes = 0;
    eventinj.fields.v = 1;
    eventinj.fields.vector = _event.vector;

    /*
     * Refer to AMD Vol 2: System Programming, 15.20 Event Injection.
     *
     * On hardware lacking NextRIP support, and all hardware in the case of
     * icebp, software events with trap semantics need emulating, so %rip in
     * the trap frame points after the instruction.
     *
     * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
     * have performed checks such as presence/dpl/etc and believes that the
     * event injection will succeed without faulting.
     *
     * The x86 emulator will always provide fault semantics for software
     * events, with _event.insn_len set appropriately.  If the injection
1414 * requires emulation, move %rip forwards at this point.
1415 */
1416 switch ( _event.type )
1417 {
1418 case X86_EVENTTYPE_SW_INTERRUPT: /* int $n */
1419 if ( cpu_has_svm_nrips )
1420 vmcb->nextrip = regs->rip + _event.insn_len;
1421 else
1422 regs->rip += _event.insn_len;
1423 eventinj.fields.type = X86_EVENTTYPE_SW_INTERRUPT;
1424 break;
1425
1426 case X86_EVENTTYPE_PRI_SW_EXCEPTION: /* icebp */
1427 /*
1428 * icebp's injection must always be emulated, as hardware does not
1429 * special case HW_EXCEPTION with vector 1 (#DB) as having trap
1430 * semantics.
1431 */
1432 regs->rip += _event.insn_len;
1433 if ( cpu_has_svm_nrips )
1434 vmcb->nextrip = regs->rip;
1435 eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1436 break;
1437
1438 case X86_EVENTTYPE_SW_EXCEPTION: /* int3, into */
1439 /*
1440 * Hardware special cases HW_EXCEPTION with vectors 3 and 4 as having
1441 * trap semantics, and will perform DPL checks.
1442 */
1443 if ( cpu_has_svm_nrips )
1444 vmcb->nextrip = regs->rip + _event.insn_len;
1445 else
1446 regs->rip += _event.insn_len;
1447 eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1448 break;
1449
1450 default:
1451 eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1452 eventinj.fields.ev = (_event.error_code != X86_EVENT_NO_EC);
1453 eventinj.fields.errorcode = _event.error_code;
1454 break;
1455 }
1456
1457 /*
1458 * If injecting an event outside of 64bit mode, zero the upper bits of the
1459 * %eip and nextrip after the adjustments above.
1460 */
1461 if ( !((vmcb_get_efer(vmcb) & EFER_LMA) && vmcb->cs.l) )
1462 {
1463 regs->rip = regs->eip;
1464 vmcb->nextrip = (uint32_t)vmcb->nextrip;
1465 }
1466
1467 ASSERT(!eventinj.fields.ev ||
1468 eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
1469 vmcb->eventinj = eventinj;
1470
1471 if ( _event.vector == TRAP_page_fault )
1472 {
1473 curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
1474 vmcb_set_cr2(vmcb, _event.cr2);
1475 HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
1476 }
1477 else
1478 {
1479 HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
1480 }
1481 }
1482
svm_event_pending(struct vcpu * v)1483 static int svm_event_pending(struct vcpu *v)
1484 {
1485 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1486 return vmcb->eventinj.fields.v;
1487 }
1488
svm_cpu_dead(unsigned int cpu)1489 static void svm_cpu_dead(unsigned int cpu)
1490 {
1491 paddr_t *this_hsa = &per_cpu(hsa, cpu);
1492 paddr_t *this_vmcb = &per_cpu(host_vmcb, cpu);
1493
1494 if ( *this_hsa )
1495 {
1496 free_domheap_page(maddr_to_page(*this_hsa));
1497 *this_hsa = 0;
1498 }
1499
1500 if ( *this_vmcb )
1501 {
1502 free_domheap_page(maddr_to_page(*this_vmcb));
1503 *this_vmcb = 0;
1504 }
1505 }
1506
svm_cpu_up_prepare(unsigned int cpu)1507 static int svm_cpu_up_prepare(unsigned int cpu)
1508 {
1509 paddr_t *this_hsa = &per_cpu(hsa, cpu);
1510 paddr_t *this_vmcb = &per_cpu(host_vmcb, cpu);
1511 nodeid_t node = cpu_to_node(cpu);
1512 unsigned int memflags = 0;
1513 struct page_info *pg;
1514
1515 if ( node != NUMA_NO_NODE )
1516 memflags = MEMF_node(node);
1517
1518 if ( !*this_hsa )
1519 {
1520 pg = alloc_domheap_page(NULL, memflags);
1521 if ( !pg )
1522 goto err;
1523
1524 clear_domain_page(_mfn(page_to_mfn(pg)));
1525 *this_hsa = page_to_maddr(pg);
1526 }
1527
1528 if ( !*this_vmcb )
1529 {
1530 pg = alloc_domheap_page(NULL, memflags);
1531 if ( !pg )
1532 goto err;
1533
1534 clear_domain_page(_mfn(page_to_mfn(pg)));
1535 *this_vmcb = page_to_maddr(pg);
1536 }
1537
1538 return 0;
1539
1540 err:
1541 svm_cpu_dead(cpu);
1542 return -ENOMEM;
1543 }
1544
svm_init_erratum_383(const struct cpuinfo_x86 * c)1545 static void svm_init_erratum_383(const struct cpuinfo_x86 *c)
1546 {
1547 uint64_t msr_content;
1548
1549 /* check whether CPU is affected */
1550 if ( !cpu_has_amd_erratum(c, AMD_ERRATUM_383) )
1551 return;
1552
1553 /* use safe methods to be compatible with nested virtualization */
1554 if (rdmsr_safe(MSR_AMD64_DC_CFG, msr_content) == 0 &&
1555 wrmsr_safe(MSR_AMD64_DC_CFG, msr_content | (1ULL << 47)) == 0)
1556 {
1557 amd_erratum383_found = 1;
1558 } else {
1559 printk("Failed to enable erratum 383\n");
1560 }
1561 }
1562
svm_handle_osvw(struct vcpu * v,uint32_t msr,uint64_t * val,bool_t read)1563 static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, bool_t read)
1564 {
1565 if ( !v->domain->arch.cpuid->extd.osvw )
1566 return -1;
1567
1568 if ( read )
1569 {
1570 if (msr == MSR_AMD_OSVW_ID_LENGTH)
1571 *val = v->arch.hvm_svm.osvw.length;
1572 else
1573 *val = v->arch.hvm_svm.osvw.status;
1574 }
1575 /* Writes are ignored */
1576
1577 return 0;
1578 }
1579
_svm_cpu_up(bool bsp)1580 static int _svm_cpu_up(bool bsp)
1581 {
1582 uint64_t msr_content;
1583 int rc;
1584 unsigned int cpu = smp_processor_id();
1585 const struct cpuinfo_x86 *c = &cpu_data[cpu];
1586
1587 /* Check whether SVM feature is disabled in BIOS */
1588 rdmsrl(MSR_K8_VM_CR, msr_content);
1589 if ( msr_content & K8_VMCR_SVME_DISABLE )
1590 {
1591 printk("CPU%d: AMD SVM Extension is disabled in BIOS.\n", cpu);
1592 return -EINVAL;
1593 }
1594
1595 if ( bsp && (rc = svm_cpu_up_prepare(cpu)) != 0 )
1596 return rc;
1597
1598 write_efer(read_efer() | EFER_SVME);
1599
1600 /* Initialize the HSA for this core. */
1601 wrmsrl(MSR_K8_VM_HSAVE_PA, per_cpu(hsa, cpu));
1602
1603 /* check for erratum 383 */
1604 svm_init_erratum_383(c);
1605
1606 /* Initialize core's ASID handling. */
1607 svm_asid_init(c);
1608
1609 /*
1610 * Check whether EFER.LMSLE can be written.
1611 * Unfortunately there's no feature bit defined for this.
1612 */
1613 msr_content = read_efer();
1614 if ( wrmsr_safe(MSR_EFER, msr_content | EFER_LMSLE) == 0 )
1615 rdmsrl(MSR_EFER, msr_content);
1616 if ( msr_content & EFER_LMSLE )
1617 {
1618 if ( 0 && /* FIXME: Migration! */ bsp )
1619 cpu_has_lmsl = 1;
1620 wrmsrl(MSR_EFER, msr_content ^ EFER_LMSLE);
1621 }
1622 else
1623 {
1624 if ( cpu_has_lmsl )
1625 printk(XENLOG_WARNING "Inconsistent LMSLE support across CPUs!\n");
1626 cpu_has_lmsl = 0;
1627 }
1628
1629 /* Initialize OSVW bits to be used by guests */
1630 svm_host_osvw_init();
1631
1632 return 0;
1633 }
1634
svm_cpu_up(void)1635 static int svm_cpu_up(void)
1636 {
1637 return _svm_cpu_up(false);
1638 }
1639
start_svm(void)1640 const struct hvm_function_table * __init start_svm(void)
1641 {
1642 bool_t printed = 0;
1643
1644 svm_host_osvw_reset();
1645
1646 if ( _svm_cpu_up(true) )
1647 {
1648 printk("SVM: failed to initialise.\n");
1649 return NULL;
1650 }
1651
1652 setup_vmcb_dump();
1653
1654 svm_feature_flags = (current_cpu_data.extended_cpuid_level >= 0x8000000A ?
1655 cpuid_edx(0x8000000A) : 0);
1656
1657 printk("SVM: Supported advanced features:\n");
1658
1659 /* DecodeAssists fast paths assume nextrip is valid for fast rIP update. */
1660 if ( !cpu_has_svm_nrips )
1661 clear_bit(SVM_FEATURE_DECODEASSISTS, &svm_feature_flags);
1662
1663 if ( cpu_has_tsc_ratio )
1664 svm_function_table.tsc_scaling.ratio_frac_bits = 32;
1665
1666 #define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; }
1667 P(cpu_has_svm_npt, "Nested Page Tables (NPT)");
1668 P(cpu_has_svm_lbrv, "Last Branch Record (LBR) Virtualisation");
1669 P(cpu_has_svm_nrips, "Next-RIP Saved on #VMEXIT");
1670 P(cpu_has_svm_cleanbits, "VMCB Clean Bits");
1671 P(cpu_has_svm_decode, "DecodeAssists");
1672 P(cpu_has_pause_filter, "Pause-Intercept Filter");
1673 P(cpu_has_tsc_ratio, "TSC Rate MSR");
1674 #undef P
1675
1676 if ( !printed )
1677 printk(" - none\n");
1678
1679 svm_function_table.hap_supported = !!cpu_has_svm_npt;
1680 svm_function_table.hap_capabilities = HVM_HAP_SUPERPAGE_2MB |
1681 (cpu_has_page1gb ? HVM_HAP_SUPERPAGE_1GB : 0);
1682
1683 return &svm_function_table;
1684 }
1685
svm_do_nested_pgfault(struct vcpu * v,struct cpu_user_regs * regs,uint64_t pfec,paddr_t gpa)1686 static void svm_do_nested_pgfault(struct vcpu *v,
1687 struct cpu_user_regs *regs, uint64_t pfec, paddr_t gpa)
1688 {
1689 int ret;
1690 unsigned long gfn = gpa >> PAGE_SHIFT;
1691 mfn_t mfn;
1692 p2m_type_t p2mt;
1693 p2m_access_t p2ma;
1694 struct p2m_domain *p2m = NULL;
1695
1696 /*
1697 * Since HW doesn't explicitly provide a read access bit and we need to
1698 * somehow describe read-modify-write instructions we will conservatively
1699 * set read_access for all memory accesses that are not instruction fetches.
1700 */
1701 struct npfec npfec = {
1702 .read_access = !(pfec & PFEC_insn_fetch),
1703 .write_access = !!(pfec & PFEC_write_access),
1704 .insn_fetch = !!(pfec & PFEC_insn_fetch),
1705 .present = !!(pfec & PFEC_page_present),
1706 };
1707
1708 /* These bits are mutually exclusive */
1709 if ( pfec & NPT_PFEC_with_gla )
1710 npfec.kind = npfec_kind_with_gla;
1711 else if ( pfec & NPT_PFEC_in_gpt )
1712 npfec.kind = npfec_kind_in_gpt;
1713
1714 ret = hvm_hap_nested_page_fault(gpa, ~0ul, npfec);
1715
1716 if ( tb_init_done )
1717 {
1718 struct {
1719 uint64_t gpa;
1720 uint64_t mfn;
1721 uint32_t qualification;
1722 uint32_t p2mt;
1723 } _d;
1724
1725 p2m = p2m_get_p2m(v);
1726 _d.gpa = gpa;
1727 _d.qualification = 0;
1728 mfn = __get_gfn_type_access(p2m, gfn, &_d.p2mt, &p2ma, 0, NULL, 0);
1729 _d.mfn = mfn_x(mfn);
1730
1731 __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
1732 }
1733
1734 switch (ret) {
1735 case 0:
1736 break;
1737 case 1:
1738 return;
1739 case -1:
1740 ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
1741 /* inject #VMEXIT(NPF) into guest. */
1742 nestedsvm_vmexit_defer(v, VMEXIT_NPF, pfec, gpa);
1743 return;
1744 }
1745
1746 if ( p2m == NULL )
1747 p2m = p2m_get_p2m(v);
1748 /* Everything else is an error. */
1749 mfn = __get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL, 0);
1750 gdprintk(XENLOG_ERR,
1751 "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
1752 gpa, mfn_x(mfn), p2mt);
1753 domain_crash(v->domain);
1754 }
1755
svm_fpu_dirty_intercept(void)1756 static void svm_fpu_dirty_intercept(void)
1757 {
1758 struct vcpu *v = current;
1759 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1760 struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
1761
1762 svm_fpu_enter(v);
1763
1764 if ( vmcb != n1vmcb )
1765 {
1766 /* Check if l1 guest must make FPU ready for the l2 guest */
1767 if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS )
1768 hvm_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1769 else
1770 vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) & ~X86_CR0_TS);
1771 return;
1772 }
1773
1774 if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1775 vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS);
1776 }
1777
svm_vmexit_do_cpuid(struct cpu_user_regs * regs)1778 static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs)
1779 {
1780 struct vcpu *curr = current;
1781 unsigned int inst_len;
1782 struct cpuid_leaf res;
1783
1784 if ( (inst_len = __get_instruction_length(curr, INSTR_CPUID)) == 0 )
1785 return;
1786
1787 if ( hvm_check_cpuid_faulting(curr) )
1788 {
1789 hvm_inject_hw_exception(TRAP_gp_fault, 0);
1790 return;
1791 }
1792
1793 guest_cpuid(curr, regs->eax, regs->ecx, &res);
1794 HVMTRACE_5D(CPUID, regs->eax, res.a, res.b, res.c, res.d);
1795
1796 regs->rax = res.a;
1797 regs->rbx = res.b;
1798 regs->rcx = res.c;
1799 regs->rdx = res.d;
1800
1801 __update_guest_eip(regs, inst_len);
1802 }
1803
svm_vmexit_do_cr_access(struct vmcb_struct * vmcb,struct cpu_user_regs * regs)1804 static void svm_vmexit_do_cr_access(
1805 struct vmcb_struct *vmcb, struct cpu_user_regs *regs)
1806 {
1807 int gp, cr, dir, rc;
1808
1809 cr = vmcb->exitcode - VMEXIT_CR0_READ;
1810 dir = (cr > 15);
1811 cr &= 0xf;
1812 gp = vmcb->exitinfo1 & 0xf;
1813
1814 rc = dir ? hvm_mov_to_cr(cr, gp) : hvm_mov_from_cr(cr, gp);
1815
1816 if ( rc == X86EMUL_OKAY )
1817 __update_guest_eip(regs, vmcb->nextrip - vmcb->rip);
1818 }
1819
svm_dr_access(struct vcpu * v,struct cpu_user_regs * regs)1820 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1821 {
1822 struct vmcb_struct *vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
1823
1824 HVMTRACE_0D(DR_WRITE);
1825 __restore_debug_registers(vmcb, v);
1826 }
1827
svm_msr_read_intercept(unsigned int msr,uint64_t * msr_content)1828 static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
1829 {
1830 int ret;
1831 struct vcpu *v = current;
1832 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1833
1834 switch ( msr )
1835 {
1836 case MSR_IA32_SYSENTER_CS:
1837 *msr_content = v->arch.hvm_svm.guest_sysenter_cs;
1838 break;
1839 case MSR_IA32_SYSENTER_ESP:
1840 *msr_content = v->arch.hvm_svm.guest_sysenter_esp;
1841 break;
1842 case MSR_IA32_SYSENTER_EIP:
1843 *msr_content = v->arch.hvm_svm.guest_sysenter_eip;
1844 break;
1845
1846 case MSR_IA32_MCx_MISC(4): /* Threshold register */
1847 case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
1848 /*
1849 * MCA/MCE: We report that the threshold register is unavailable
1850 * for OS use (locked by the BIOS).
1851 */
1852 *msr_content = 1ULL << 61; /* MC4_MISC.Locked */
1853 break;
1854
1855 case MSR_IA32_EBC_FREQUENCY_ID:
1856 /*
1857 * This Intel-only register may be accessed if this HVM guest
1858 * has been migrated from an Intel host. The value zero is not
1859 * particularly meaningful, but at least avoids the guest crashing!
1860 */
1861 *msr_content = 0;
1862 break;
1863
1864 case MSR_IA32_DEBUGCTLMSR:
1865 *msr_content = vmcb_get_debugctlmsr(vmcb);
1866 break;
1867
1868 case MSR_IA32_LASTBRANCHFROMIP:
1869 *msr_content = vmcb_get_lastbranchfromip(vmcb);
1870 break;
1871
1872 case MSR_IA32_LASTBRANCHTOIP:
1873 *msr_content = vmcb_get_lastbranchtoip(vmcb);
1874 break;
1875
1876 case MSR_IA32_LASTINTFROMIP:
1877 *msr_content = vmcb_get_lastintfromip(vmcb);
1878 break;
1879
1880 case MSR_IA32_LASTINTTOIP:
1881 *msr_content = vmcb_get_lastinttoip(vmcb);
1882 break;
1883
1884 case MSR_AMD64_LWP_CFG:
1885 *msr_content = v->arch.hvm_svm.guest_lwp_cfg;
1886 break;
1887
1888 case MSR_K7_PERFCTR0:
1889 case MSR_K7_PERFCTR1:
1890 case MSR_K7_PERFCTR2:
1891 case MSR_K7_PERFCTR3:
1892 case MSR_K7_EVNTSEL0:
1893 case MSR_K7_EVNTSEL1:
1894 case MSR_K7_EVNTSEL2:
1895 case MSR_K7_EVNTSEL3:
1896 case MSR_AMD_FAM15H_PERFCTR0:
1897 case MSR_AMD_FAM15H_PERFCTR1:
1898 case MSR_AMD_FAM15H_PERFCTR2:
1899 case MSR_AMD_FAM15H_PERFCTR3:
1900 case MSR_AMD_FAM15H_PERFCTR4:
1901 case MSR_AMD_FAM15H_PERFCTR5:
1902 case MSR_AMD_FAM15H_EVNTSEL0:
1903 case MSR_AMD_FAM15H_EVNTSEL1:
1904 case MSR_AMD_FAM15H_EVNTSEL2:
1905 case MSR_AMD_FAM15H_EVNTSEL3:
1906 case MSR_AMD_FAM15H_EVNTSEL4:
1907 case MSR_AMD_FAM15H_EVNTSEL5:
1908 if ( vpmu_do_rdmsr(msr, msr_content) )
1909 goto gpf;
1910 break;
1911
1912 case MSR_AMD64_DR0_ADDRESS_MASK:
1913 if ( !v->domain->arch.cpuid->extd.dbext )
1914 goto gpf;
1915 *msr_content = v->arch.hvm_svm.dr_mask[0];
1916 break;
1917
1918 case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
1919 if ( !v->domain->arch.cpuid->extd.dbext )
1920 goto gpf;
1921 *msr_content =
1922 v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1];
1923 break;
1924
1925 case MSR_AMD_OSVW_ID_LENGTH:
1926 case MSR_AMD_OSVW_STATUS:
1927 ret = svm_handle_osvw(v, msr, msr_content, 1);
1928 if ( ret < 0 )
1929 goto gpf;
1930 break;
1931
1932 default:
1933 ret = nsvm_rdmsr(v, msr, msr_content);
1934 if ( ret < 0 )
1935 goto gpf;
1936 else if ( ret )
1937 break;
1938
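/*
 * Not a nested-SVM MSR: try the Viridian and Xen hypervisor ranges,
 * then fall back to reading the real MSR, before giving up with #GP.
 */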
1939 if ( rdmsr_viridian_regs(msr, msr_content) ||
1940 rdmsr_hypervisor_regs(msr, msr_content) )
1941 break;
1942
1943 if ( rdmsr_safe(msr, *msr_content) == 0 )
1944 break;
1945
1946 if ( boot_cpu_data.x86 == 0xf && msr == MSR_F10_BU_CFG )
1947 {
1948 /* Win2k8 x64 reads this MSR on revF chips, where it
1949 * wasn't publicly available; it uses a magic constant
1950 * in %rdi as a password, which we don't have in
1951 * rdmsr_safe(). Since we'll ignore the later writes,
1952 * just use a plausible value here (the reset value from
1953 * rev10h chips) if the real CPU didn't provide one. */
1954 *msr_content = 0x0000000010200020ull;
1955 break;
1956 }
1957
1958 goto gpf;
1959 }
1960
1961 HVM_DBG_LOG(DBG_LEVEL_MSR, "returns: ecx=%x, msr_value=%"PRIx64,
1962 msr, *msr_content);
1963 return X86EMUL_OKAY;
1964
1965 gpf:
1966 return X86EMUL_EXCEPTION;
1967 }
1968
1969 static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
1970 {
1971 int ret, result = X86EMUL_OKAY;
1972 struct vcpu *v = current;
1973 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1974 int sync = 0;
1975
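/*
 * The SYSENTER MSRs live in the state managed by VMSAVE/VMLOAD, so the
 * VMCB is synced with the hardware copy before such a write and loaded
 * back afterwards (see the svm_vmload() at the end).
 */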
1976 switch ( msr )
1977 {
1978 case MSR_IA32_SYSENTER_CS:
1979 case MSR_IA32_SYSENTER_ESP:
1980 case MSR_IA32_SYSENTER_EIP:
1981 sync = 1;
1982 break;
1983 default:
1984 break;
1985 }
1986
1987 if ( sync )
1988 svm_sync_vmcb(v);
1989
1990 switch ( msr )
1991 {
1992 case MSR_IA32_SYSENTER_CS:
1993 vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
1994 break;
1995 case MSR_IA32_SYSENTER_ESP:
1996 vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
1997 break;
1998 case MSR_IA32_SYSENTER_EIP:
1999 vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
2000 break;
2001
2002 case MSR_IA32_DEBUGCTLMSR:
2003 vmcb_set_debugctlmsr(vmcb, msr_content);
2004 if ( !msr_content || !cpu_has_svm_lbrv )
2005 break;
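/*
 * The guest has turned DEBUGCTL on and the CPU supports LBR
 * virtualization: enable it and let the guest access the LBR MSRs
 * without further intercepts.
 */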
2006 vmcb->lbr_control.fields.enable = 1;
2007 svm_disable_intercept_for_msr(v, MSR_IA32_DEBUGCTLMSR);
2008 svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHFROMIP);
2009 svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHTOIP);
2010 svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTFROMIP);
2011 svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTTOIP);
2012 break;
2013
2014 case MSR_IA32_LASTBRANCHFROMIP:
2015 vmcb_set_lastbranchfromip(vmcb, msr_content);
2016 break;
2017
2018 case MSR_IA32_LASTBRANCHTOIP:
2019 vmcb_set_lastbranchtoip(vmcb, msr_content);
2020 break;
2021
2022 case MSR_IA32_LASTINTFROMIP:
2023 vmcb_set_lastintfromip(vmcb, msr_content);
2024 break;
2025
2026 case MSR_IA32_LASTINTTOIP:
2027 vmcb_set_lastinttoip(vmcb, msr_content);
2028 break;
2029
2030 case MSR_AMD64_LWP_CFG:
2031 if ( svm_update_lwp_cfg(v, msr_content) < 0 )
2032 goto gpf;
2033 break;
2034
2035 case MSR_K7_PERFCTR0:
2036 case MSR_K7_PERFCTR1:
2037 case MSR_K7_PERFCTR2:
2038 case MSR_K7_PERFCTR3:
2039 case MSR_K7_EVNTSEL0:
2040 case MSR_K7_EVNTSEL1:
2041 case MSR_K7_EVNTSEL2:
2042 case MSR_K7_EVNTSEL3:
2043 case MSR_AMD_FAM15H_PERFCTR0:
2044 case MSR_AMD_FAM15H_PERFCTR1:
2045 case MSR_AMD_FAM15H_PERFCTR2:
2046 case MSR_AMD_FAM15H_PERFCTR3:
2047 case MSR_AMD_FAM15H_PERFCTR4:
2048 case MSR_AMD_FAM15H_PERFCTR5:
2049 case MSR_AMD_FAM15H_EVNTSEL0:
2050 case MSR_AMD_FAM15H_EVNTSEL1:
2051 case MSR_AMD_FAM15H_EVNTSEL2:
2052 case MSR_AMD_FAM15H_EVNTSEL3:
2053 case MSR_AMD_FAM15H_EVNTSEL4:
2054 case MSR_AMD_FAM15H_EVNTSEL5:
2055 if ( vpmu_do_wrmsr(msr, msr_content, 0) )
2056 goto gpf;
2057 break;
2058
2059 case MSR_IA32_MCx_MISC(4): /* Threshold register */
2060 case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
2061 /*
2062 * MCA/MCE: Threshold register is reported to be locked, so we ignore
2063 * all write accesses. This behaviour matches real HW, so guests should
2064 * have no problem with this.
2065 */
2066 break;
2067
2068 case MSR_AMD64_DR0_ADDRESS_MASK:
2069 if ( !v->domain->arch.cpuid->extd.dbext || (msr_content >> 32) )
2070 goto gpf;
2071 v->arch.hvm_svm.dr_mask[0] = msr_content;
2072 break;
2073
2074 case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
2075 if ( !v->domain->arch.cpuid->extd.dbext || (msr_content >> 32) )
2076 goto gpf;
2077 v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
2078 msr_content;
2079 break;
2080
2081 case MSR_AMD_OSVW_ID_LENGTH:
2082 case MSR_AMD_OSVW_STATUS:
2083 ret = svm_handle_osvw(v, msr, &msr_content, 0);
2084 if ( ret < 0 )
2085 goto gpf;
2086 break;
2087
2088 default:
2089 ret = nsvm_wrmsr(v, msr, msr_content);
2090 if ( ret < 0 )
2091 goto gpf;
2092 else if ( ret )
2093 break;
2094
2095 if ( wrmsr_viridian_regs(msr, msr_content) )
2096 break;
2097
2098 switch ( wrmsr_hypervisor_regs(msr, msr_content) )
2099 {
2100 case -ERESTART:
2101 result = X86EMUL_RETRY;
2102 break;
2103 case 0:
2104 case 1:
2105 break;
2106 default:
2107 goto gpf;
2108 }
2109 break;
2110 }
2111
2112 if ( sync )
2113 svm_vmload(vmcb);
2114
2115 return result;
2116
2117 gpf:
2118 return X86EMUL_EXCEPTION;
2119 }
2120
2121 static void svm_do_msr_access(struct cpu_user_regs *regs)
2122 {
2123 struct vcpu *curr = current;
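/* EXITINFO1 distinguishes the access type: 0 for RDMSR, 1 for WRMSR. */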
2124 bool rdmsr = curr->arch.hvm_svm.vmcb->exitinfo1 == 0;
2125 int rc, inst_len = __get_instruction_length(
2126 curr, rdmsr ? INSTR_RDMSR : INSTR_WRMSR);
2127
2128 if ( inst_len == 0 )
2129 return;
2130
2131 if ( rdmsr )
2132 {
2133 uint64_t msr_content = 0;
2134
2135 rc = hvm_msr_read_intercept(regs->ecx, &msr_content);
2136 if ( rc == X86EMUL_OKAY )
2137 msr_split(regs, msr_content);
2138 }
2139 else
2140 rc = hvm_msr_write_intercept(regs->ecx, msr_fold(regs), 1);
2141
2142 if ( rc == X86EMUL_OKAY )
2143 __update_guest_eip(regs, inst_len);
2144 else if ( rc == X86EMUL_EXCEPTION )
2145 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2146 }
2147
2148 static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb,
2149 struct cpu_user_regs *regs)
2150 {
2151 unsigned int inst_len;
2152
2153 if ( (inst_len = __get_instruction_length(current, INSTR_HLT)) == 0 )
2154 return;
2155 __update_guest_eip(regs, inst_len);
2156
2157 hvm_hlt(regs->eflags);
2158 }
2159
2160 static void svm_vmexit_do_rdtsc(struct cpu_user_regs *regs)
2161 {
2162 unsigned int inst_len;
2163
2164 if ( (inst_len = __get_instruction_length(current, INSTR_RDTSC)) == 0 )
2165 return;
2166 __update_guest_eip(regs, inst_len);
2167
2168 hvm_rdtsc_intercept(regs);
2169 }
2170
2171 static void svm_vmexit_do_pause(struct cpu_user_regs *regs)
2172 {
2173 unsigned int inst_len;
2174
2175 if ( (inst_len = __get_instruction_length(current, INSTR_PAUSE)) == 0 )
2176 return;
2177 __update_guest_eip(regs, inst_len);
2178
2179 /*
2180 * The guest is running a contended spinlock and we've detected it.
2181 * Do something useful, like reschedule the guest.
2182 */
2183 perfc_incr(pauseloop_exits);
2184 do_sched_op(SCHEDOP_yield, guest_handle_from_ptr(NULL, void));
2185 }
2186
2187 static void
2188 svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
2189 struct vcpu *v, uint64_t vmcbaddr)
2190 {
2191 if ( !nsvm_efer_svm_enabled(v) )
2192 {
2193 gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n");
2194 hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2195 return;
2196 }
2197
2198 if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
2199 {
2200 gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #GP\n");
2201 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2202 return;
2203 }
2204
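/*
 * The actual entry into the L2 guest is deferred: flag it here and let
 * the exit-to-guest path perform the nested VMRUN.
 */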
2205 vcpu_nestedhvm(v).nv_vmentry_pending = 1;
2206 return;
2207 }
2208
2209 static struct page_info *
2210 nsvm_get_nvmcb_page(struct vcpu *v, uint64_t vmcbaddr)
2211 {
2212 p2m_type_t p2mt;
2213 struct page_info *page;
2214 struct nestedvcpu *nv = &vcpu_nestedhvm(v);
2215
2216 if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
2217 return NULL;
2218
2219 /* Need to translate L1-GPA to MPA */
2220 page = get_page_from_gfn(v->domain,
2221 nv->nv_vvmcxaddr >> PAGE_SHIFT,
2222 &p2mt, P2M_ALLOC | P2M_UNSHARE);
2223 if ( !page )
2224 return NULL;
2225
2226 if ( !p2m_is_ram(p2mt) || p2m_is_readonly(p2mt) )
2227 {
2228 put_page(page);
2229 return NULL;
2230 }
2231
2232 return page;
2233 }
2234
2235 static void
2236 svm_vmexit_do_vmload(struct vmcb_struct *vmcb,
2237 struct cpu_user_regs *regs,
2238 struct vcpu *v, uint64_t vmcbaddr)
2239 {
2240 unsigned int inst_len;
2241 struct page_info *page;
2242
2243 if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
2244 return;
2245
2246 if ( !nsvm_efer_svm_enabled(v) )
2247 {
2248 gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n");
2249 hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2250 return;
2251 }
2252
2253 page = nsvm_get_nvmcb_page(v, vmcbaddr);
2254 if ( !page )
2255 {
2256 gdprintk(XENLOG_ERR,
2257 "VMLOAD: mapping failed, injecting #GP\n");
2258 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2259 return;
2260 }
2261
2262 svm_vmload_pa(page_to_maddr(page));
2263 put_page(page);
2264
2265 /* State in L1 VMCB is stale now */
2266 v->arch.hvm_svm.vmcb_in_sync = 0;
2267
2268 __update_guest_eip(regs, inst_len);
2269 }
2270
2271 static void
2272 svm_vmexit_do_vmsave(struct vmcb_struct *vmcb,
2273 struct cpu_user_regs *regs,
2274 struct vcpu *v, uint64_t vmcbaddr)
2275 {
2276 unsigned int inst_len;
2277 struct page_info *page;
2278
2279 if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
2280 return;
2281
2282 if ( !nsvm_efer_svm_enabled(v) )
2283 {
2284 gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n");
2285 hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2286 return;
2287 }
2288
2289 page = nsvm_get_nvmcb_page(v, vmcbaddr);
2290 if ( !page )
2291 {
2292 gdprintk(XENLOG_ERR,
2293 "VMSAVE: mapping vmcb failed, injecting #GP\n");
2294 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2295 return;
2296 }
2297
2298 svm_vmsave_pa(page_to_maddr(page));
2299 put_page(page);
2300 __update_guest_eip(regs, inst_len);
2301 }
2302
2303 static int svm_is_erratum_383(struct cpu_user_regs *regs)
2304 {
2305 uint64_t msr_content;
2306 uint32_t i;
2307 struct vcpu *v = current;
2308
2309 if ( !amd_erratum383_found )
2310 return 0;
2311
2312 rdmsrl(MSR_IA32_MC0_STATUS, msr_content);
2313 /* Bit 62 may or may not be set for this mce */
2314 msr_content &= ~(1ULL << 62);
2315
2316 if ( msr_content != 0xb600000000010015ULL )
2317 return 0;
2318
2319 /* Clear MCi_STATUS registers */
2320 for (i = 0; i < nr_mce_banks; i++)
2321 wrmsrl(MSR_IA32_MCx_STATUS(i), 0ULL);
2322
2323 rdmsrl(MSR_IA32_MCG_STATUS, msr_content);
2324 wrmsrl(MSR_IA32_MCG_STATUS, msr_content & ~(1ULL << 2));
2325
2326 /* flush TLB */
2327 flush_tlb_mask(v->domain->domain_dirty_cpumask);
2328
2329 return 1;
2330 }
2331
2332 static void svm_vmexit_mce_intercept(
2333 struct vcpu *v, struct cpu_user_regs *regs)
2334 {
2335 if ( svm_is_erratum_383(regs) )
2336 {
2337 gdprintk(XENLOG_ERR, "SVM hits AMD erratum 383\n");
2338 domain_crash(v->domain);
2339 }
2340 }
2341
2342 static void svm_wbinvd_intercept(void)
2343 {
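/*
 * Only domains granted direct I/O or device access can affect the
 * physical caches; for everyone else WBINVD is safely a no-op.
 */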
2344 if ( cache_flush_permitted(current->domain) )
2345 flush_all(FLUSH_CACHE);
2346 }
2347
2348 static void svm_vmexit_do_invalidate_cache(struct cpu_user_regs *regs)
2349 {
2350 static const enum instruction_index list[] = { INSTR_INVD, INSTR_WBINVD };
2351 int inst_len;
2352
2353 inst_len = __get_instruction_length_from_list(
2354 current, list, ARRAY_SIZE(list));
2355 if ( inst_len == 0 )
2356 return;
2357
2358 svm_wbinvd_intercept();
2359
2360 __update_guest_eip(regs, inst_len);
2361 }
2362
2363 static void svm_invlpga_intercept(
2364 struct vcpu *v, unsigned long vaddr, uint32_t asid)
2365 {
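/*
 * An ASID operand of zero names the guest's own address space: map it
 * to the host ASID backing L1, and use the nested (L2) ASID otherwise.
 */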
2366 svm_invlpga(vaddr,
2367 (asid == 0)
2368 ? v->arch.hvm_vcpu.n1asid.asid
2369 : vcpu_nestedhvm(v).nv_n2asid.asid);
2370 }
2371
2372 static void svm_invlpg_intercept(unsigned long vaddr)
2373 {
2374 HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr));
2375 paging_invlpg(current, vaddr);
2376 }
2377
2378 static bool is_invlpg(const struct x86_emulate_state *state,
2379 const struct x86_emulate_ctxt *ctxt)
2380 {
2381 unsigned int ext;
2382
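/* INVLPG is encoded as 0F 01 /7 with a memory operand. */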
2383 return ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) &&
2384 x86_insn_modrm(state, NULL, &ext) != 3 &&
2385 (ext & 7) == 7;
2386 }
2387
2388 static void svm_invlpg(struct vcpu *v, unsigned long vaddr)
2389 {
2390 svm_asid_g_invlpg(v, vaddr);
2391 }
2392
2393 static bool svm_get_pending_event(struct vcpu *v, struct x86_event *info)
2394 {
2395 const struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2396
2397 if ( vmcb->eventinj.fields.v )
2398 return false;
2399
2400 info->vector = vmcb->eventinj.fields.vector;
2401 info->type = vmcb->eventinj.fields.type;
2402 info->error_code = vmcb->eventinj.fields.errorcode;
2403
2404 return true;
2405 }
2406
2407 static struct hvm_function_table __initdata svm_function_table = {
2408 .name = "SVM",
2409 .cpu_up_prepare = svm_cpu_up_prepare,
2410 .cpu_dead = svm_cpu_dead,
2411 .cpu_up = svm_cpu_up,
2412 .cpu_down = svm_cpu_down,
2413 .domain_initialise = svm_domain_initialise,
2414 .domain_destroy = svm_domain_destroy,
2415 .vcpu_initialise = svm_vcpu_initialise,
2416 .vcpu_destroy = svm_vcpu_destroy,
2417 .save_cpu_ctxt = svm_save_vmcb_ctxt,
2418 .load_cpu_ctxt = svm_load_vmcb_ctxt,
2419 .init_msr = svm_init_msr,
2420 .save_msr = svm_save_msr,
2421 .load_msr = svm_load_msr,
2422 .get_interrupt_shadow = svm_get_interrupt_shadow,
2423 .set_interrupt_shadow = svm_set_interrupt_shadow,
2424 .guest_x86_mode = svm_guest_x86_mode,
2425 .get_cpl = svm_get_cpl,
2426 .get_segment_register = svm_get_segment_register,
2427 .set_segment_register = svm_set_segment_register,
2428 .get_shadow_gs_base = svm_get_shadow_gs_base,
2429 .update_guest_cr = svm_update_guest_cr,
2430 .update_guest_efer = svm_update_guest_efer,
2431 .update_guest_vendor = svm_update_guest_vendor,
2432 .fpu_leave = svm_fpu_leave,
2433 .set_guest_pat = svm_set_guest_pat,
2434 .get_guest_pat = svm_get_guest_pat,
2435 .set_tsc_offset = svm_set_tsc_offset,
2436 .inject_event = svm_inject_event,
2437 .init_hypercall_page = svm_init_hypercall_page,
2438 .event_pending = svm_event_pending,
2439 .get_pending_event = svm_get_pending_event,
2440 .invlpg = svm_invlpg,
2441 .wbinvd_intercept = svm_wbinvd_intercept,
2442 .fpu_dirty_intercept = svm_fpu_dirty_intercept,
2443 .msr_read_intercept = svm_msr_read_intercept,
2444 .msr_write_intercept = svm_msr_write_intercept,
2445 .set_rdtsc_exiting = svm_set_rdtsc_exiting,
2446 .set_descriptor_access_exiting = svm_set_descriptor_access_exiting,
2447 .get_insn_bytes = svm_get_insn_bytes,
2448
2449 .nhvm_vcpu_initialise = nsvm_vcpu_initialise,
2450 .nhvm_vcpu_destroy = nsvm_vcpu_destroy,
2451 .nhvm_vcpu_reset = nsvm_vcpu_reset,
2452 .nhvm_vcpu_vmexit_event = nsvm_vcpu_vmexit_event,
2453 .nhvm_vcpu_p2m_base = nsvm_vcpu_hostcr3,
2454 .nhvm_vmcx_guest_intercepts_event = nsvm_vmcb_guest_intercepts_event,
2455 .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
2456 .nhvm_intr_blocked = nsvm_intr_blocked,
2457 .nhvm_hap_walk_L1_p2m = nsvm_hap_walk_L1_p2m,
2458
2459 .tsc_scaling = {
2460 .max_ratio = ~TSC_RATIO_RSVD_BITS,
2461 },
2462 };
2463
2464 void svm_vmexit_handler(struct cpu_user_regs *regs)
2465 {
2466 uint64_t exit_reason;
2467 struct vcpu *v = current;
2468 struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2469 eventinj_t eventinj;
2470 int inst_len, rc;
2471 vintr_t intr;
2472 bool_t vcpu_guestmode = 0;
2473 struct vlapic *vlapic = vcpu_vlapic(v);
2474
2475 hvm_invalidate_regs_fields(regs);
2476
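/*
 * Under HAP the guest writes CR3 without interception, so refresh our
 * cached value from the VMCB on every exit.
 */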
2477 if ( paging_mode_hap(v->domain) )
2478 v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
2479 vmcb_get_cr3(vmcb);
2480
2481 if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
2482 vcpu_guestmode = 1;
2483
2484 /*
2485 * Before doing anything else, we need to sync up the VLAPIC's TPR with
2486 * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows)
2487 * because we update the vTPR on MMIO writes to the TPR.
2488 * NB. We need to preserve the low bits of the TPR to make checked builds
2489 * of Windows work, even though they don't actually do anything.
2490 */
2491 if ( !vcpu_guestmode && !vlapic_hw_disabled(vlapic) )
2492 {
2493 intr = vmcb_get_vintr(vmcb);
2494 vlapic_set_reg(vlapic, APIC_TASKPRI,
2495 ((intr.fields.tpr & 0x0F) << 4) |
2496 (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0x0F));
2497 }
2498
2499 exit_reason = vmcb->exitcode;
2500
2501 if ( hvm_long_mode_active(v) )
2502 HVMTRACE_ND(VMEXIT64, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
2503 1/*cycles*/, 3, exit_reason,
2504 regs->eip, regs->rip >> 32, 0, 0, 0);
2505 else
2506 HVMTRACE_ND(VMEXIT, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
2507 1/*cycles*/, 2, exit_reason,
2508 regs->eip, 0, 0, 0, 0);
2509
2510 if ( vcpu_guestmode ) {
2511 enum nestedhvm_vmexits nsret;
2512 struct nestedvcpu *nv = &vcpu_nestedhvm(v);
2513 struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
2514 uint64_t exitinfo1, exitinfo2;
2515
2516 paging_update_nestedmode(v);
2517
2518 /* Write real exitinfo1 back into virtual vmcb.
2519 * nestedsvm_check_intercepts() expects to have the correct
2520 * exitinfo1 value there.
2521 */
2522 exitinfo1 = ns_vmcb->exitinfo1;
2523 ns_vmcb->exitinfo1 = vmcb->exitinfo1;
2524 nsret = nestedsvm_check_intercepts(v, regs, exit_reason);
2525 switch (nsret) {
2526 case NESTEDHVM_VMEXIT_CONTINUE:
2527 BUG();
2528 break;
2529 case NESTEDHVM_VMEXIT_HOST:
2530 break;
2531 case NESTEDHVM_VMEXIT_INJECT:
2532 /* Switch vcpu from l2 to l1 guest. We must perform
2533 * the switch here to have svm_do_resume() working
2534 * as intended.
2535 */
2536 exitinfo1 = vmcb->exitinfo1;
2537 exitinfo2 = vmcb->exitinfo2;
2538 nv->nv_vmswitch_in_progress = 1;
2539 nsret = nestedsvm_vmexit_n2n1(v, regs);
2540 nv->nv_vmswitch_in_progress = 0;
2541 switch (nsret) {
2542 case NESTEDHVM_VMEXIT_DONE:
2543 /* defer VMEXIT injection */
2544 nestedsvm_vmexit_defer(v, exit_reason, exitinfo1, exitinfo2);
2545 goto out;
2546 case NESTEDHVM_VMEXIT_FATALERROR:
2547 gdprintk(XENLOG_ERR, "unexpected nestedsvm_vmexit() error\n");
2548 domain_crash(v->domain);
2549 goto out;
2550 default:
2551 BUG();
2552 case NESTEDHVM_VMEXIT_ERROR:
2553 break;
2554 }
2555 /* fallthrough */
2556 case NESTEDHVM_VMEXIT_ERROR:
2557 gdprintk(XENLOG_ERR,
2558 "nestedsvm_check_intercepts() returned NESTEDHVM_VMEXIT_ERROR\n");
2559 goto out;
2560 case NESTEDHVM_VMEXIT_FATALERROR:
2561 gdprintk(XENLOG_ERR,
2562 "unexpected nestedsvm_check_intercepts() error\n");
2563 domain_crash(v->domain);
2564 goto out;
2565 default:
2566 gdprintk(XENLOG_INFO, "nestedsvm_check_intercepts() returned %i\n",
2567 nsret);
2568 domain_crash(v->domain);
2569 goto out;
2570 }
2571 }
2572
2573 if ( unlikely(exit_reason == VMEXIT_INVALID) )
2574 {
2575 gdprintk(XENLOG_ERR, "invalid VMCB state:\n");
2576 svm_vmcb_dump(__func__, vmcb);
2577 domain_crash(v->domain);
2578 goto out;
2579 }
2580
2581 perfc_incra(svmexits, exit_reason);
2582
2583 hvm_maybe_deassert_evtchn_irq();
2584
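/*
 * Mark all VMCB state as clean for the next VMRUN; the vmcb_set_*()
 * accessors clear the relevant clean bit again whenever a field is
 * modified below.
 */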
2585 vmcb->cleanbits.bytes = cpu_has_svm_cleanbits ? ~0u : 0u;
2586
2587 /* Event delivery caused this intercept? Queue for redelivery. */
2588 eventinj = vmcb->exitintinfo;
2589 if ( unlikely(eventinj.fields.v) &&
2590 hvm_event_needs_reinjection(eventinj.fields.type,
2591 eventinj.fields.vector) )
2592 vmcb->eventinj = eventinj;
2593
2594 switch ( exit_reason )
2595 {
2596 case VMEXIT_INTR:
2597 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2598 HVMTRACE_0D(INTR);
2599 break;
2600
2601 case VMEXIT_NMI:
2602 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2603 HVMTRACE_0D(NMI);
2604 break;
2605
2606 case VMEXIT_SMI:
2607 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2608 HVMTRACE_0D(SMI);
2609 break;
2610
2611 case VMEXIT_EXCEPTION_DB:
2612 if ( !v->domain->debugger_attached )
2613 hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
2614 else
2615 domain_pause_for_debugger();
2616 break;
2617
2618 case VMEXIT_EXCEPTION_BP:
2619 if ( !v->domain->debugger_attached )
2620 goto unexpected_exit_type;
2621 /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
2622 if ( (inst_len = __get_instruction_length(v, INSTR_INT3)) == 0 )
2623 break;
2624 __update_guest_eip(regs, inst_len);
2625 current->arch.gdbsx_vcpu_event = TRAP_int3;
2626 domain_pause_for_debugger();
2627 break;
2628
2629 case VMEXIT_EXCEPTION_NM:
2630 svm_fpu_dirty_intercept();
2631 break;
2632
2633 case VMEXIT_EXCEPTION_PF: {
2634 unsigned long va;
2635 va = vmcb->exitinfo2;
2636 regs->error_code = vmcb->exitinfo1;
2637 HVM_DBG_LOG(DBG_LEVEL_VMMU,
2638 "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2639 regs->rax, regs->rbx, regs->rcx,
2640 regs->rdx, regs->rsi, regs->rdi);
2641
2642 if ( cpu_has_svm_decode )
2643 v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
2644 rc = paging_fault(va, regs);
2645 v->arch.hvm_svm.cached_insn_len = 0;
2646
2647 if ( rc )
2648 {
2649 if ( trace_will_trace_event(TRC_SHADOW) )
2650 break;
2651 if ( hvm_long_mode_active(v) )
2652 HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va));
2653 else
2654 HVMTRACE_2D(PF_XEN, regs->error_code, va);
2655 break;
2656 }
2657
2658 hvm_inject_page_fault(regs->error_code, va);
2659 break;
2660 }
2661
2662 case VMEXIT_EXCEPTION_AC:
2663 HVMTRACE_1D(TRAP, TRAP_alignment_check);
2664 hvm_inject_hw_exception(TRAP_alignment_check, vmcb->exitinfo1);
2665 break;
2666
2667 case VMEXIT_EXCEPTION_UD:
2668 hvm_ud_intercept(regs);
2669 break;
2670
2671 /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2672 case VMEXIT_EXCEPTION_MC:
2673 HVMTRACE_0D(MCE);
2674 svm_vmexit_mce_intercept(v, regs);
2675 break;
2676
2677 case VMEXIT_VINTR: {
2678 u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
2679 intr = vmcb_get_vintr(vmcb);
2680
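/*
 * The interrupt window has opened: drop the pending virtual interrupt
 * request and stop intercepting VINTR until it is re-armed.
 */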
2681 intr.fields.irq = 0;
2682 general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
2683
2684 vmcb_set_vintr(vmcb, intr);
2685 vmcb_set_general1_intercepts(vmcb, general1_intercepts);
2686 break;
2687 }
2688
2689 case VMEXIT_INVD:
2690 case VMEXIT_WBINVD:
2691 svm_vmexit_do_invalidate_cache(regs);
2692 break;
2693
2694 case VMEXIT_TASK_SWITCH: {
2695 enum hvm_task_switch_reason reason;
2696 int32_t errcode = -1;
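/*
 * EXITINFO2 describes the cause: bit 36 is set for IRET, bit 38 for a
 * far JMP, and bit 44 when an error code (in the low 32 bits) is valid.
 * EXITINFO1 carries the target TSS selector.
 */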
2697 if ( (vmcb->exitinfo2 >> 36) & 1 )
2698 reason = TSW_iret;
2699 else if ( (vmcb->exitinfo2 >> 38) & 1 )
2700 reason = TSW_jmp;
2701 else
2702 reason = TSW_call_or_int;
2703 if ( (vmcb->exitinfo2 >> 44) & 1 )
2704 errcode = (uint32_t)vmcb->exitinfo2;
2705
2706 /*
2707 * Some processors set the EXITINTINFO field when the task switch
2708 * is caused by a task gate in the IDT. In this case we will be
2709 * emulating the event injection, so we do not want the processor
2710 * to re-inject the original event!
2711 */
2712 vmcb->eventinj.bytes = 0;
2713
2714 hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode);
2715 break;
2716 }
2717
2718 case VMEXIT_CPUID:
2719 svm_vmexit_do_cpuid(regs);
2720 break;
2721
2722 case VMEXIT_HLT:
2723 svm_vmexit_do_hlt(vmcb, regs);
2724 break;
2725
2726 case VMEXIT_IOIO:
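/*
 * EXITINFO1 bit 2 flags the string forms (INS/OUTS), which need full
 * emulation; simple IN/OUT is decoded here directly, with EXITINFO2
 * holding the address of the following instruction.
 */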
2727 if ( (vmcb->exitinfo1 & (1u<<2)) == 0 )
2728 {
2729 uint16_t port = (vmcb->exitinfo1 >> 16) & 0xFFFF;
2730 int bytes = ((vmcb->exitinfo1 >> 4) & 0x07);
2731 int dir = (vmcb->exitinfo1 & 1) ? IOREQ_READ : IOREQ_WRITE;
2732 if ( handle_pio(port, bytes, dir) )
2733 __update_guest_eip(regs, vmcb->exitinfo2 - vmcb->rip);
2734 }
2735 else if ( !hvm_emulate_one_insn(x86_insn_is_portio, "port I/O") )
2736 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2737 break;
2738
2739 case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
2740 case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
2741 if ( cpu_has_svm_decode && (vmcb->exitinfo1 & (1ULL << 63)) )
2742 svm_vmexit_do_cr_access(vmcb, regs);
2743 else if ( !hvm_emulate_one_insn(x86_insn_is_cr_access, "CR access") )
2744 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2745 break;
2746
2747 case VMEXIT_INVLPG:
2748 if ( cpu_has_svm_decode )
2749 {
2750 svm_invlpg_intercept(vmcb->exitinfo1);
2751 __update_guest_eip(regs, vmcb->nextrip - vmcb->rip);
2752 }
2753 else if ( !hvm_emulate_one_insn(is_invlpg, "invlpg") )
2754 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2755 break;
2756
2757 case VMEXIT_INVLPGA:
2758 if ( (inst_len = __get_instruction_length(v, INSTR_INVLPGA)) == 0 )
2759 break;
2760 svm_invlpga_intercept(v, regs->rax, regs->ecx);
2761 __update_guest_eip(regs, inst_len);
2762 break;
2763
2764 case VMEXIT_VMMCALL:
2765 if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
2766 break;
2767 BUG_ON(vcpu_guestmode);
2768 HVMTRACE_1D(VMMCALL, regs->eax);
2769
2770 if ( hvm_hypercall(regs) == HVM_HCALL_completed )
2771 __update_guest_eip(regs, inst_len);
2772 break;
2773
2774 case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
2775 case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
2776 svm_dr_access(v, regs);
2777 break;
2778
2779 case VMEXIT_MSR:
2780 svm_do_msr_access(regs);
2781 break;
2782
2783 case VMEXIT_SHUTDOWN:
2784 hvm_triple_fault();
2785 break;
2786
2787 case VMEXIT_RDTSCP:
2788 regs->rcx = hvm_msr_tsc_aux(v);
2789 /* fall through */
2790 case VMEXIT_RDTSC:
2791 svm_vmexit_do_rdtsc(regs);
2792 break;
2793
2794 case VMEXIT_MONITOR:
2795 case VMEXIT_MWAIT:
2796 hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2797 break;
2798
2799 case VMEXIT_VMRUN:
2800 svm_vmexit_do_vmrun(regs, v, regs->rax);
2801 break;
2802 case VMEXIT_VMLOAD:
2803 svm_vmexit_do_vmload(vmcb, regs, v, regs->rax);
2804 break;
2805 case VMEXIT_VMSAVE:
2806 svm_vmexit_do_vmsave(vmcb, regs, v, regs->rax);
2807 break;
2808 case VMEXIT_STGI:
2809 svm_vmexit_do_stgi(regs, v);
2810 break;
2811 case VMEXIT_CLGI:
2812 svm_vmexit_do_clgi(regs, v);
2813 break;
2814 case VMEXIT_SKINIT:
2815 hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2816 break;
2817
2818 case VMEXIT_XSETBV:
2819 if ( vmcb_get_cpl(vmcb) )
2820 hvm_inject_hw_exception(TRAP_gp_fault, 0);
2821 else if ( (inst_len = __get_instruction_length(v, INSTR_XSETBV)) &&
2822 hvm_handle_xsetbv(regs->ecx, msr_fold(regs)) == 0 )
2823 __update_guest_eip(regs, inst_len);
2824 break;
2825
2826 case VMEXIT_NPF:
2827 perfc_incra(svmexits, VMEXIT_NPF_PERFC);
2828 if ( cpu_has_svm_decode )
2829 v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
2830 rc = vmcb->exitinfo1 & PFEC_page_present
2831 ? p2m_pt_handle_deferred_changes(vmcb->exitinfo2) : 0;
2832 if ( rc >= 0 )
2833 svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
2834 else
2835 {
2836 printk(XENLOG_G_ERR
2837 "%pv: Error %d handling NPF (gpa=%08lx ec=%04lx)\n",
2838 v, rc, vmcb->exitinfo2, vmcb->exitinfo1);
2839 domain_crash(v->domain);
2840 }
2841 v->arch.hvm_svm.cached_insn_len = 0;
2842 break;
2843
2844 case VMEXIT_IRET: {
2845 u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
2846
2847 /*
2848 * IRET clears the NMI mask. However, because we clear the mask
2849 * /before/ executing IRET, we set the interrupt shadow to prevent
2850 * a pending NMI from being injected immediately. This will work
2851 * perfectly unless the IRET instruction faults: in that case we
2852 * may inject an NMI before the NMI handler's IRET instruction is
2853 * retired.
2854 */
2855 general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
2856 vmcb->interrupt_shadow = 1;
2857
2858 vmcb_set_general1_intercepts(vmcb, general1_intercepts);
2859 break;
2860 }
2861
2862 case VMEXIT_PAUSE:
2863 svm_vmexit_do_pause(regs);
2864 break;
2865
2866 case VMEXIT_IDTR_READ:
2867 case VMEXIT_IDTR_WRITE:
2868 hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2869 VM_EVENT_DESC_IDTR, exit_reason == VMEXIT_IDTR_WRITE);
2870 break;
2871
2872 case VMEXIT_GDTR_READ:
2873 case VMEXIT_GDTR_WRITE:
2874 hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2875 VM_EVENT_DESC_GDTR, exit_reason == VMEXIT_GDTR_WRITE);
2876 break;
2877
2878 case VMEXIT_LDTR_READ:
2879 case VMEXIT_LDTR_WRITE:
2880 hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2881 VM_EVENT_DESC_LDTR, exit_reason == VMEXIT_LDTR_WRITE);
2882 break;
2883
2884 case VMEXIT_TR_READ:
2885 case VMEXIT_TR_WRITE:
2886 hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2887 VM_EVENT_DESC_TR, exit_reason == VMEXIT_TR_WRITE);
2888 break;
2889
2890 default:
2891 unexpected_exit_type:
2892 gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
2893 "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
2894 exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
2895 svm_crash_or_fault(v);
2896 break;
2897 }
2898
2899 out:
2900 if ( vcpu_guestmode || vlapic_hw_disabled(vlapic) )
2901 return;
2902
2903 /* The exit may have updated the TPR: reflect this in the hardware vtpr */
2904 intr = vmcb_get_vintr(vmcb);
2905 intr.fields.tpr =
2906 (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0xFF) >> 4;
2907 vmcb_set_vintr(vmcb, intr);
2908 }
2909
2910 void svm_trace_vmentry(void)
2911 {
2912 struct vcpu *curr = current;
2913 HVMTRACE_ND(VMENTRY,
2914 nestedhvm_vcpu_in_guestmode(curr) ? TRC_HVM_NESTEDFLAG : 0,
2915 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
2916 }
2917
2918 /*
2919 * Local variables:
2920 * mode: C
2921 * c-file-style: "BSD"
2922 * c-basic-offset: 4
2923 * tab-width: 4
2924 * indent-tabs-mode: nil
2925 * End:
2926 */
2927