1 /*
2  * svm.c: handling SVM architecture-related VM exits
3  * Copyright (c) 2004, Intel Corporation.
4  * Copyright (c) 2005-2007, Advanced Micro Devices, Inc.
5  *
6  * This program is free software; you can redistribute it and/or modify it
7  * under the terms and conditions of the GNU General Public License,
8  * version 2, as published by the Free Software Foundation.
9  *
10  * This program is distributed in the hope it will be useful, but WITHOUT
11  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
12  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
13  * more details.
14  *
15  * You should have received a copy of the GNU General Public License along with
16  * this program; If not, see <http://www.gnu.org/licenses/>.
17  */
18 
19 #include <xen/init.h>
20 #include <xen/lib.h>
21 #include <xen/trace.h>
22 #include <xen/sched.h>
23 #include <xen/irq.h>
24 #include <xen/softirq.h>
25 #include <xen/hypercall.h>
26 #include <xen/domain_page.h>
27 #include <xen/xenoprof.h>
28 #include <asm/current.h>
29 #include <asm/io.h>
30 #include <asm/paging.h>
31 #include <asm/p2m.h>
32 #include <asm/mem_sharing.h>
33 #include <asm/regs.h>
34 #include <asm/cpufeature.h>
35 #include <asm/processor.h>
36 #include <asm/amd.h>
37 #include <asm/guest_access.h>
38 #include <asm/debugreg.h>
39 #include <asm/msr.h>
40 #include <asm/i387.h>
41 #include <asm/iocap.h>
42 #include <asm/hvm/emulate.h>
43 #include <asm/hvm/hvm.h>
44 #include <asm/hvm/support.h>
45 #include <asm/hvm/io.h>
46 #include <asm/hvm/emulate.h>
47 #include <asm/hvm/svm/asid.h>
48 #include <asm/hvm/svm/svm.h>
49 #include <asm/hvm/svm/vmcb.h>
50 #include <asm/hvm/svm/emulate.h>
51 #include <asm/hvm/svm/intr.h>
52 #include <asm/hvm/svm/svmdebug.h>
53 #include <asm/hvm/svm/nestedsvm.h>
54 #include <asm/hvm/nestedhvm.h>
55 #include <asm/x86_emulate.h>
56 #include <public/sched.h>
57 #include <asm/hvm/vpt.h>
58 #include <asm/hvm/trace.h>
59 #include <asm/hap.h>
60 #include <asm/apic.h>
61 #include <asm/debugger.h>
62 #include <asm/xstate.h>
63 
64 void svm_asm_do_resume(void);
65 
66 u32 svm_feature_flags;
67 
68 /* Indicates whether guests may use EFER.LMSLE. */
69 bool_t cpu_has_lmsl;
70 
71 static void svm_update_guest_efer(struct vcpu *);
72 
73 static struct hvm_function_table svm_function_table;
74 
75 /*
76  * Physical addresses of the Host State Area (for hardware) and vmcb (for Xen)
77  * which contains Xen's fs/gs/tr/ldtr and GSBASE/STAR/SYSENTER state when in
78  * guest vcpu context.
79  */
80 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, hsa);
81 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, host_vmcb);
82 
83 static bool_t amd_erratum383_found __read_mostly;
84 
85 /* OSVW bits */
86 static uint64_t osvw_length, osvw_status;
87 static DEFINE_SPINLOCK(osvw_lock);
88 
89 /* Only crash the guest if the problem originates in kernel mode. */
90 static void svm_crash_or_fault(struct vcpu *v)
91 {
92     if ( vmcb_get_cpl(v->arch.hvm_svm.vmcb) )
93         hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
94     else
95         domain_crash(v->domain);
96 }
97 
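/*
 * Advance %rip over an intercepted instruction of length inst_len, clearing
 * any interrupt shadow, and raise #DB if the guest was single-stepping
 * (EFLAGS.TF set).
 */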
98 void __update_guest_eip(struct cpu_user_regs *regs, unsigned int inst_len)
99 {
100     struct vcpu *curr = current;
101 
102     if ( unlikely(inst_len == 0) )
103         return;
104 
105     if ( unlikely(inst_len > MAX_INST_LEN) )
106     {
107         gdprintk(XENLOG_ERR, "Bad instruction length %u\n", inst_len);
108         svm_crash_or_fault(curr);
109         return;
110     }
111 
112     ASSERT(regs == guest_cpu_user_regs());
113 
114     regs->rip += inst_len;
115     regs->eflags &= ~X86_EFLAGS_RF;
116 
117     curr->arch.hvm_svm.vmcb->interrupt_shadow = 0;
118 
119     if ( regs->eflags & X86_EFLAGS_TF )
120         hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
121 }
122 
123 static void svm_cpu_down(void)
124 {
125     write_efer(read_efer() & ~EFER_SVME);
126 }
127 
128 unsigned long *
129 svm_msrbit(unsigned long *msr_bitmap, uint32_t msr)
130 {
131     unsigned long *msr_bit = NULL;
132 
133     /*
134      * See AMD64 Programmer's Manual, Vol 2, Section 15.10 (MSR-Bitmap Address).
135      */
136     if ( msr <= 0x1fff )
137         msr_bit = msr_bitmap + 0x0000 / BYTES_PER_LONG;
138     else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
139         msr_bit = msr_bitmap + 0x0800 / BYTES_PER_LONG;
140     else if ( (msr >= 0xc0010000) && (msr <= 0xc0011fff) )
141         msr_bit = msr_bitmap + 0x1000 / BYTES_PER_LONG;
142 
143     return msr_bit;
144 }
145 
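/*
 * Each MSR is represented by two consecutive bits in the permission map:
 * within its vector, bit 2n intercepts reads of MSR n and bit 2n+1
 * intercepts writes.  The three vectors returned by svm_msrbit() cover:
 *   MSRs 0x00000000-0x00001fff -> bytes 0x0000-0x07ff
 *   MSRs 0xc0000000-0xc0001fff -> bytes 0x0800-0x0fff
 *   MSRs 0xc0010000-0xc0011fff -> bytes 0x1000-0x17ff
 */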
146 void svm_intercept_msr(struct vcpu *v, uint32_t msr, int flags)
147 {
148     unsigned long *msr_bit;
149 
150     msr_bit = svm_msrbit(v->arch.hvm_svm.msrpm, msr);
151     BUG_ON(msr_bit == NULL);
152     msr &= 0x1fff;
153 
154     if ( flags & MSR_INTERCEPT_READ )
155          __set_bit(msr * 2, msr_bit);
156     else
157          __clear_bit(msr * 2, msr_bit);
158 
159     if ( flags & MSR_INTERCEPT_WRITE )
160         __set_bit(msr * 2 + 1, msr_bit);
161     else
162         __clear_bit(msr * 2 + 1, msr_bit);
163 }
164 
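/*
 * Debug register state is handled lazily: flag_dr_dirty is only set by
 * __restore_debug_registers(), which also disables the DR intercepts.
 * Saving the state here re-enables the intercepts, so the registers are
 * only reloaded again once the guest actually touches them.
 */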
165 static void svm_save_dr(struct vcpu *v)
166 {
167     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
168     unsigned int flag_dr_dirty = v->arch.hvm_vcpu.flag_dr_dirty;
169 
170     if ( !flag_dr_dirty )
171         return;
172 
173     /* Clear the DR dirty flag and re-enable intercepts for DR accesses. */
174     v->arch.hvm_vcpu.flag_dr_dirty = 0;
175     vmcb_set_dr_intercepts(vmcb, ~0u);
176 
177     if ( v->domain->arch.cpuid->extd.dbext )
178     {
179         svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_RW);
180         svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_RW);
181         svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_RW);
182         svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_RW);
183 
184         rdmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
185         rdmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
186         rdmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
187         rdmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
188     }
189 
190     v->arch.debugreg[0] = read_debugreg(0);
191     v->arch.debugreg[1] = read_debugreg(1);
192     v->arch.debugreg[2] = read_debugreg(2);
193     v->arch.debugreg[3] = read_debugreg(3);
194     v->arch.debugreg[6] = vmcb_get_dr6(vmcb);
195     v->arch.debugreg[7] = vmcb_get_dr7(vmcb);
196 }
197 
198 static void __restore_debug_registers(struct vmcb_struct *vmcb, struct vcpu *v)
199 {
200     if ( v->arch.hvm_vcpu.flag_dr_dirty )
201         return;
202 
203     v->arch.hvm_vcpu.flag_dr_dirty = 1;
204     vmcb_set_dr_intercepts(vmcb, 0);
205 
206     ASSERT(v == current);
207 
208     if ( v->domain->arch.cpuid->extd.dbext )
209     {
210         svm_intercept_msr(v, MSR_AMD64_DR0_ADDRESS_MASK, MSR_INTERCEPT_NONE);
211         svm_intercept_msr(v, MSR_AMD64_DR1_ADDRESS_MASK, MSR_INTERCEPT_NONE);
212         svm_intercept_msr(v, MSR_AMD64_DR2_ADDRESS_MASK, MSR_INTERCEPT_NONE);
213         svm_intercept_msr(v, MSR_AMD64_DR3_ADDRESS_MASK, MSR_INTERCEPT_NONE);
214 
215         wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[0]);
216         wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[1]);
217         wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[2]);
218         wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, v->arch.hvm_svm.dr_mask[3]);
219     }
220 
221     write_debugreg(0, v->arch.debugreg[0]);
222     write_debugreg(1, v->arch.debugreg[1]);
223     write_debugreg(2, v->arch.debugreg[2]);
224     write_debugreg(3, v->arch.debugreg[3]);
225     vmcb_set_dr6(vmcb, v->arch.debugreg[6]);
226     vmcb_set_dr7(vmcb, v->arch.debugreg[7]);
227 }
228 
229 /*
230  * DR7 is saved and restored on every vmexit.  Other debug registers only
231  * need to be restored if their value is going to affect execution -- i.e.,
232  * if one of the breakpoints is enabled.  So mask out all bits that don't
233  * enable some breakpoint functionality.
234  */
235 static void svm_restore_dr(struct vcpu *v)
236 {
237     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
238     if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
239         __restore_debug_registers(vmcb, v);
240 }
241 
242 static int svm_vmcb_save(struct vcpu *v, struct hvm_hw_cpu *c)
243 {
244     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
245 
246     c->cr0 = v->arch.hvm_vcpu.guest_cr[0];
247     c->cr2 = v->arch.hvm_vcpu.guest_cr[2];
248     c->cr3 = v->arch.hvm_vcpu.guest_cr[3];
249     c->cr4 = v->arch.hvm_vcpu.guest_cr[4];
250 
251     c->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs;
252     c->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp;
253     c->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip;
254 
255     c->pending_event = 0;
256     c->error_code = 0;
257     if ( vmcb->eventinj.fields.v &&
258          hvm_event_needs_reinjection(vmcb->eventinj.fields.type,
259                                      vmcb->eventinj.fields.vector) )
260     {
261         c->pending_event = (uint32_t)vmcb->eventinj.bytes;
262         c->error_code = vmcb->eventinj.fields.errorcode;
263     }
264 
265     return 1;
266 }
267 
268 static int svm_vmcb_restore(struct vcpu *v, struct hvm_hw_cpu *c)
269 {
270     struct page_info *page = NULL;
271     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
272     struct p2m_domain *p2m = p2m_get_hostp2m(v->domain);
273 
274     if ( c->pending_valid )
275     {
276         if ( (c->pending_type == 1) || (c->pending_type > 4) ||
277              (c->pending_reserved != 0) )
278         {
279             dprintk(XENLOG_ERR, "%pv: Invalid pending event %#"PRIx32"\n",
280                     v, c->pending_event);
281             return -EINVAL;
282         }
283 
284         if ( c->pending_error_valid &&
285              c->error_code != (uint16_t)c->error_code )
286         {
287             dprintk(XENLOG_ERR, "%pv: Invalid error code %#"PRIx32"\n",
288                     v, c->error_code);
289             return -EINVAL;
290         }
291     }
292 
293     if ( !paging_mode_hap(v->domain) )
294     {
295         if ( c->cr0 & X86_CR0_PG )
296         {
297             page = get_page_from_gfn(v->domain, c->cr3 >> PAGE_SHIFT,
298                                      NULL, P2M_ALLOC);
299             if ( !page )
300             {
301                 gdprintk(XENLOG_ERR, "Invalid CR3 value=%#"PRIx64"\n",
302                          c->cr3);
303                 return -EINVAL;
304             }
305         }
306 
307         if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
308             put_page(pagetable_get_page(v->arch.guest_table));
309 
310         v->arch.guest_table =
311             page ? pagetable_from_page(page) : pagetable_null();
312     }
313 
314     v->arch.hvm_vcpu.guest_cr[0] = c->cr0 | X86_CR0_ET;
315     v->arch.hvm_vcpu.guest_cr[2] = c->cr2;
316     v->arch.hvm_vcpu.guest_cr[3] = c->cr3;
317     v->arch.hvm_vcpu.guest_cr[4] = c->cr4;
318     svm_update_guest_cr(v, 0);
319     svm_update_guest_cr(v, 2);
320     svm_update_guest_cr(v, 4);
321 
322     /* Load sysenter MSRs into both VMCB save area and VCPU fields. */
323     vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = c->sysenter_cs;
324     vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = c->sysenter_esp;
325     vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = c->sysenter_eip;
326 
327     if ( paging_mode_hap(v->domain) )
328     {
329         vmcb_set_np_enable(vmcb, 1);
330         vmcb_set_g_pat(vmcb, MSR_IA32_CR_PAT_RESET /* guest PAT */);
331         vmcb_set_h_cr3(vmcb, pagetable_get_paddr(p2m_get_pagetable(p2m)));
332     }
333 
334     if ( c->pending_valid &&
335          hvm_event_needs_reinjection(c->pending_type, c->pending_vector) )
336     {
337         gdprintk(XENLOG_INFO, "Re-injecting %#"PRIx32", %#"PRIx32"\n",
338                  c->pending_event, c->error_code);
339         vmcb->eventinj.bytes = c->pending_event;
340         vmcb->eventinj.fields.errorcode = c->error_code;
341     }
342     else
343         vmcb->eventinj.bytes = 0;
344 
345     vmcb->cleanbits.bytes = 0;
346     paging_update_paging_modes(v);
347 
348     return 0;
349 }
350 
351 
352 static void svm_save_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
353 {
354     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
355 
356     data->shadow_gs        = vmcb->kerngsbase;
357     data->msr_lstar        = vmcb->lstar;
358     data->msr_star         = vmcb->star;
359     data->msr_cstar        = vmcb->cstar;
360     data->msr_syscall_mask = vmcb->sfmask;
361     data->msr_efer         = v->arch.hvm_vcpu.guest_efer;
362     data->msr_flags        = 0;
363 }
364 
365 
366 static void svm_load_cpu_state(struct vcpu *v, struct hvm_hw_cpu *data)
367 {
368     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
369 
370     vmcb->kerngsbase = data->shadow_gs;
371     vmcb->lstar      = data->msr_lstar;
372     vmcb->star       = data->msr_star;
373     vmcb->cstar      = data->msr_cstar;
374     vmcb->sfmask     = data->msr_syscall_mask;
375     v->arch.hvm_vcpu.guest_efer = data->msr_efer;
376     svm_update_guest_efer(v);
377 }
378 
379 static void svm_save_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
380 {
381     svm_save_cpu_state(v, ctxt);
382     svm_vmcb_save(v, ctxt);
383 }
384 
385 static int svm_load_vmcb_ctxt(struct vcpu *v, struct hvm_hw_cpu *ctxt)
386 {
387     svm_load_cpu_state(v, ctxt);
388     if (svm_vmcb_restore(v, ctxt)) {
389         gdprintk(XENLOG_ERR, "svm_vmcb restore failed!\n");
390         domain_crash(v->domain);
391         return -EINVAL;
392     }
393 
394     return 0;
395 }
396 
397 static unsigned int __init svm_init_msr(void)
398 {
399     return boot_cpu_has(X86_FEATURE_DBEXT) ? 4 : 0;
400 }
401 
402 static void svm_save_msr(struct vcpu *v, struct hvm_msr *ctxt)
403 {
404     if ( boot_cpu_has(X86_FEATURE_DBEXT) )
405     {
406         ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[0];
407         if ( ctxt->msr[ctxt->count].val )
408             ctxt->msr[ctxt->count++].index = MSR_AMD64_DR0_ADDRESS_MASK;
409 
410         ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[1];
411         if ( ctxt->msr[ctxt->count].val )
412             ctxt->msr[ctxt->count++].index = MSR_AMD64_DR1_ADDRESS_MASK;
413 
414         ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[2];
415         if ( ctxt->msr[ctxt->count].val )
416             ctxt->msr[ctxt->count++].index = MSR_AMD64_DR2_ADDRESS_MASK;
417 
418         ctxt->msr[ctxt->count].val = v->arch.hvm_svm.dr_mask[3];
419         if ( ctxt->msr[ctxt->count].val )
420             ctxt->msr[ctxt->count++].index = MSR_AMD64_DR3_ADDRESS_MASK;
421     }
422 }
423 
424 static int svm_load_msr(struct vcpu *v, struct hvm_msr *ctxt)
425 {
426     unsigned int i, idx;
427     int err = 0;
428 
429     for ( i = 0; i < ctxt->count; ++i )
430     {
431         switch ( idx = ctxt->msr[i].index )
432         {
433         case MSR_AMD64_DR0_ADDRESS_MASK:
434             if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
435                 err = -ENXIO;
436             else if ( ctxt->msr[i].val >> 32 )
437                 err = -EDOM;
438             else
439                 v->arch.hvm_svm.dr_mask[0] = ctxt->msr[i].val;
440             break;
441 
442         case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
443             if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
444                 err = -ENXIO;
445             else if ( ctxt->msr[i].val >> 32 )
446                 err = -EDOM;
447             else
448                 v->arch.hvm_svm.dr_mask[idx - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
449                     ctxt->msr[i].val;
450             break;
451 
452         default:
453             continue;
454         }
455         if ( err )
456             break;
457         ctxt->msr[i]._rsvd = 1;
458     }
459 
460     return err;
461 }
462 
463 static void svm_fpu_enter(struct vcpu *v)
464 {
465     struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
466 
467     vcpu_restore_fpu_lazy(v);
468     vmcb_set_exception_intercepts(
469         n1vmcb,
470         vmcb_get_exception_intercepts(n1vmcb) & ~(1U << TRAP_no_device));
471 }
472 
473 static void svm_fpu_leave(struct vcpu *v)
474 {
475     struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
476 
477     ASSERT(!v->fpu_dirtied);
478     ASSERT(read_cr0() & X86_CR0_TS);
479 
480     /*
481      * If the guest does not have TS enabled then we must cause and handle an
482      * exception on first use of the FPU. If the guest *does* have TS enabled
483      * then this is not necessary: no FPU activity can occur until the guest
484      * clears CR0.TS, and we will initialise the FPU when that happens.
485      */
486     if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
487     {
488         vmcb_set_exception_intercepts(
489             n1vmcb,
490             vmcb_get_exception_intercepts(n1vmcb) | (1U << TRAP_no_device));
491         vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) | X86_CR0_TS);
492     }
493 }
494 
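/*
 * SVM exposes no architectural "NMI blocked" flag for Xen to read, so the
 * state of the IRET intercept is used as a stand-in for the NMI shadow:
 * HVM_INTR_SHADOW_NMI is reported as set whenever GENERAL1_INTERCEPT_IRET
 * is enabled, and setting the shadow (re-)enables that intercept.
 */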
495 static unsigned int svm_get_interrupt_shadow(struct vcpu *v)
496 {
497     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
498     unsigned int intr_shadow = 0;
499 
500     if ( vmcb->interrupt_shadow )
501         intr_shadow |= HVM_INTR_SHADOW_MOV_SS | HVM_INTR_SHADOW_STI;
502 
503     if ( vmcb_get_general1_intercepts(vmcb) & GENERAL1_INTERCEPT_IRET )
504         intr_shadow |= HVM_INTR_SHADOW_NMI;
505 
506     return intr_shadow;
507 }
508 
509 static void svm_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
510 {
511     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
512     u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
513 
514     vmcb->interrupt_shadow =
515         !!(intr_shadow & (HVM_INTR_SHADOW_MOV_SS|HVM_INTR_SHADOW_STI));
516 
517     general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
518     if ( intr_shadow & HVM_INTR_SHADOW_NMI )
519         general1_intercepts |= GENERAL1_INTERCEPT_IRET;
520     vmcb_set_general1_intercepts(vmcb, general1_intercepts);
521 }
522 
523 static int svm_guest_x86_mode(struct vcpu *v)
524 {
525     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
526 
527     if ( unlikely(!(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE)) )
528         return 0;
529     if ( unlikely(guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) )
530         return 1;
531     if ( hvm_long_mode_active(v) && likely(vmcb->cs.l) )
532         return 8;
533     return likely(vmcb->cs.db) ? 4 : 2;
534 }
535 
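/*
 * Returns 0 for real mode, 1 for virtual-8086 mode, and otherwise the
 * default operand/address width in bytes: 2 (16-bit), 4 (32-bit) or 8
 * (64-bit).
 */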
536 void svm_update_guest_cr(struct vcpu *v, unsigned int cr)
537 {
538     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
539     uint64_t value;
540 
541     switch ( cr )
542     {
543     case 0: {
544         unsigned long hw_cr0_mask = 0;
545 
546         if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
547         {
548             if ( v != current )
549                 hw_cr0_mask |= X86_CR0_TS;
550             else if ( vmcb_get_cr0(vmcb) & X86_CR0_TS )
551                 svm_fpu_enter(v);
552         }
553 
554         value = v->arch.hvm_vcpu.guest_cr[0] | hw_cr0_mask;
555         if ( !paging_mode_hap(v->domain) )
556             value |= X86_CR0_PG | X86_CR0_WP;
557         vmcb_set_cr0(vmcb, value);
558         break;
559     }
560     case 2:
561         vmcb_set_cr2(vmcb, v->arch.hvm_vcpu.guest_cr[2]);
562         break;
563     case 3:
564         vmcb_set_cr3(vmcb, v->arch.hvm_vcpu.hw_cr[3]);
565         if ( !nestedhvm_enabled(v->domain) )
566             hvm_asid_flush_vcpu(v);
567         else if ( nestedhvm_vmswitch_in_progress(v) )
568             ; /* CR3 switches during VMRUN/VMEXIT do not flush the TLB. */
569         else
570             hvm_asid_flush_vcpu_asid(
571                 nestedhvm_vcpu_in_guestmode(v)
572                 ? &vcpu_nestedhvm(v).nv_n2asid : &v->arch.hvm_vcpu.n1asid);
573         break;
574     case 4:
575         value = HVM_CR4_HOST_MASK;
576         if ( paging_mode_hap(v->domain) )
577             value &= ~X86_CR4_PAE;
578         value |= v->arch.hvm_vcpu.guest_cr[4];
579 
580         if ( !hvm_paging_enabled(v) )
581         {
582             /*
583              * When the guest thinks paging is disabled, Xen may need to hide
584              * the effects of shadow paging, as hardware runs with the host
585      * paging settings, rather than the guest's settings.
586              *
587              * Without CR0.PG, all memory accesses are user mode, so
588              * _PAGE_USER must be set in the shadow pagetables for guest
589              * userspace to function.  This in turn trips up guest supervisor
590              * mode if SMEP/SMAP are left active in context.  They wouldn't
591              * have any effect if paging was actually disabled, so hide them
592              * behind the back of the guest.
593              */
594             value &= ~(X86_CR4_SMEP | X86_CR4_SMAP);
595         }
596 
597         vmcb_set_cr4(vmcb, value);
598         break;
599     default:
600         BUG();
601     }
602 }
603 
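/*
 * The EFER value handed to hardware always has SVME set (VMRUN treats a
 * clear SVME bit in the guest EFER as illegal state), and reflects LME only
 * when long mode is actually active (LMA set).
 */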
604 static void svm_update_guest_efer(struct vcpu *v)
605 {
606     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
607     bool_t lma = !!(v->arch.hvm_vcpu.guest_efer & EFER_LMA);
608     uint64_t new_efer;
609 
610     new_efer = (v->arch.hvm_vcpu.guest_efer | EFER_SVME) & ~EFER_LME;
611     if ( lma )
612         new_efer |= EFER_LME;
613     vmcb_set_efer(vmcb, new_efer);
614 }
615 
616 static void svm_update_guest_vendor(struct vcpu *v)
617 {
618     struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
619     struct vmcb_struct *vmcb = arch_svm->vmcb;
620     u32 bitmap = vmcb_get_exception_intercepts(vmcb);
621 
622     if ( opt_hvm_fep ||
623          (v->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor) )
624         bitmap |= (1U << TRAP_invalid_op);
625     else
626         bitmap &= ~(1U << TRAP_invalid_op);
627 
628     vmcb_set_exception_intercepts(vmcb, bitmap);
629 }
630 
631 static void svm_sync_vmcb(struct vcpu *v)
632 {
633     struct arch_svm_struct *arch_svm = &v->arch.hvm_svm;
634 
635     if ( arch_svm->vmcb_in_sync )
636         return;
637 
638     arch_svm->vmcb_in_sync = 1;
639 
640     svm_vmsave(arch_svm->vmcb);
641 }
642 
643 static unsigned int svm_get_cpl(struct vcpu *v)
644 {
645     return vmcb_get_cpl(v->arch.hvm_svm.vmcb);
646 }
647 
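/*
 * FS, GS, TR and LDTR (and the MSR state that accompanies them) are swapped
 * by VMLOAD/VMSAVE rather than by VMRUN/#VMEXIT, so their in-memory VMCB
 * copies may be stale; svm_sync_vmcb() issues a VMSAVE to refresh them
 * before those fields are read below.
 */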
648 static void svm_get_segment_register(struct vcpu *v, enum x86_segment seg,
649                                      struct segment_register *reg)
650 {
651     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
652 
653     ASSERT((v == current) || !vcpu_runnable(v));
654 
655     switch ( seg )
656     {
657     case x86_seg_fs ... x86_seg_gs:
658         svm_sync_vmcb(v);
659 
660         /* Fallthrough. */
661     case x86_seg_es ... x86_seg_ds:
662         *reg = vmcb->sreg[seg];
663 
664         if ( seg == x86_seg_ss )
665             reg->dpl = vmcb_get_cpl(vmcb);
666         break;
667 
668     case x86_seg_tr:
669         svm_sync_vmcb(v);
670         *reg = vmcb->tr;
671         break;
672 
673     case x86_seg_gdtr:
674         *reg = vmcb->gdtr;
675         break;
676 
677     case x86_seg_idtr:
678         *reg = vmcb->idtr;
679         break;
680 
681     case x86_seg_ldtr:
682         svm_sync_vmcb(v);
683         *reg = vmcb->ldtr;
684         break;
685 
686     default:
687         ASSERT_UNREACHABLE();
688         domain_crash(v->domain);
689         *reg = (struct segment_register){};
690     }
691 }
692 
693 static void svm_set_segment_register(struct vcpu *v, enum x86_segment seg,
694                                      struct segment_register *reg)
695 {
696     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
697     bool sync = false;
698 
699     ASSERT((v == current) || !vcpu_runnable(v));
700 
701     switch ( seg )
702     {
703     case x86_seg_cs:
704     case x86_seg_ds:
705     case x86_seg_es:
706     case x86_seg_ss: /* cpl */
707         vmcb->cleanbits.fields.seg = 0;
708         break;
709 
710     case x86_seg_gdtr:
711     case x86_seg_idtr:
712         vmcb->cleanbits.fields.dt = 0;
713         break;
714 
715     case x86_seg_fs:
716     case x86_seg_gs:
717     case x86_seg_tr:
718     case x86_seg_ldtr:
719         sync = (v == current);
720         break;
721 
722     default:
723         ASSERT_UNREACHABLE();
724         domain_crash(v->domain);
725         return;
726     }
727 
728     if ( sync )
729         svm_sync_vmcb(v);
730 
731     switch ( seg )
732     {
733     case x86_seg_ss:
734         vmcb_set_cpl(vmcb, reg->dpl);
735 
736         /* Fallthrough */
737     case x86_seg_es ... x86_seg_cs:
738     case x86_seg_ds ... x86_seg_gs:
739         vmcb->sreg[seg] = *reg;
740         break;
741 
742     case x86_seg_tr:
743         vmcb->tr = *reg;
744         break;
745 
746     case x86_seg_gdtr:
747         vmcb->gdtr.base = reg->base;
748         vmcb->gdtr.limit = reg->limit;
749         break;
750 
751     case x86_seg_idtr:
752         vmcb->idtr.base = reg->base;
753         vmcb->idtr.limit = reg->limit;
754         break;
755 
756     case x86_seg_ldtr:
757         vmcb->ldtr = *reg;
758         break;
759 
760     case x86_seg_none:
761         ASSERT_UNREACHABLE();
762         break;
763     }
764 
765     if ( sync )
766         svm_vmload(vmcb);
767 }
768 
769 static unsigned long svm_get_shadow_gs_base(struct vcpu *v)
770 {
771     return v->arch.hvm_svm.vmcb->kerngsbase;
772 }
773 
774 static int svm_set_guest_pat(struct vcpu *v, u64 gpat)
775 {
776     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
777 
778     if ( !paging_mode_hap(v->domain) )
779         return 0;
780 
781     vmcb_set_g_pat(vmcb, gpat);
782     return 1;
783 }
784 
785 static int svm_get_guest_pat(struct vcpu *v, u64 *gpat)
786 {
787     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
788 
789     if ( !paging_mode_hap(v->domain) )
790         return 0;
791 
792     *gpat = vmcb_get_g_pat(vmcb);
793     return 1;
794 }
795 
796 static uint64_t scale_tsc(uint64_t host_tsc, uint64_t ratio)
797 {
798     uint64_t mult, frac, scaled_host_tsc;
799 
800     if ( ratio == DEFAULT_TSC_RATIO )
801         return host_tsc;
802 
803     /*
804      * Suppose the most significant 32 bits of host_tsc and ratio are
805      * tsc_h and mult, and the least 32 bits of them are tsc_l and frac,
806      * then
807      *     host_tsc * ratio * 2^-32
808      *     = host_tsc * (mult * 2^32 + frac) * 2^-32
809      *     = host_tsc * mult + (tsc_h * 2^32 + tsc_l) * frac * 2^-32
810      *     = host_tsc * mult + tsc_h * frac + ((tsc_l * frac) >> 32)
811      *
812      * Multiplications in the last two terms are between 32-bit integers,
813      * so both of them can fit in 64-bit integers.
814      *
815      * Because mult is usually less than 10 in practice, it's very rare
816      * that host_tsc * mult can overflow a 64-bit integer.
817      */
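    /*
     * Worked example (hypothetical numbers): with ratio = 1.5 * 2^32
     * (mult = 1, frac = 0x80000000) and host_tsc = 0x200000000:
     *     host_tsc * mult                        = 0x200000000
     *     (host_tsc >> 32) * frac                = 0x100000000
     *     ((host_tsc & 0xffffffff) * frac) >> 32 = 0
     * giving 0x300000000, i.e. exactly 1.5 * host_tsc.
     */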
818     mult = ratio >> 32;
819     frac = ratio & ((1ULL << 32) - 1);
820     scaled_host_tsc  = host_tsc * mult;
821     scaled_host_tsc += (host_tsc >> 32) * frac;
822     scaled_host_tsc += ((host_tsc & ((1ULL << 32) - 1)) * frac) >> 32;
823 
824     return scaled_host_tsc;
825 }
826 
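/*
 * Compute the VMCB TSC offset such that
 *     scale_tsc(host_tsc, ratio) + offset == guest_tsc.
 */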
827 static uint64_t svm_get_tsc_offset(uint64_t host_tsc, uint64_t guest_tsc,
828     uint64_t ratio)
829 {
830     return guest_tsc - scale_tsc(host_tsc, ratio);
831 }
832 
833 static void svm_set_tsc_offset(struct vcpu *v, u64 offset, u64 at_tsc)
834 {
835     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
836     struct vmcb_struct *n1vmcb, *n2vmcb;
837     uint64_t n2_tsc_offset = 0;
838     struct domain *d = v->domain;
839 
840     if ( !nestedhvm_enabled(d) ) {
841         vmcb_set_tsc_offset(vmcb, offset);
842         return;
843     }
844 
845     n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
846     n2vmcb = vcpu_nestedhvm(v).nv_n2vmcx;
847 
848     if ( nestedhvm_vcpu_in_guestmode(v) ) {
849         struct nestedsvm *svm = &vcpu_nestedsvm(v);
850 
851         n2_tsc_offset = vmcb_get_tsc_offset(n2vmcb) -
852                         vmcb_get_tsc_offset(n1vmcb);
853         if ( svm->ns_tscratio != DEFAULT_TSC_RATIO ) {
854             uint64_t guest_tsc = hvm_get_guest_tsc_fixed(v, at_tsc);
855 
856             n2_tsc_offset = svm_get_tsc_offset(guest_tsc,
857                                                guest_tsc + n2_tsc_offset,
858                                                svm->ns_tscratio);
859         }
860         vmcb_set_tsc_offset(n1vmcb, offset);
861     }
862 
863     vmcb_set_tsc_offset(vmcb, offset + n2_tsc_offset);
864 }
865 
866 static void svm_set_rdtsc_exiting(struct vcpu *v, bool_t enable)
867 {
868     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
869     u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
870     u32 general2_intercepts = vmcb_get_general2_intercepts(vmcb);
871 
872     general1_intercepts &= ~GENERAL1_INTERCEPT_RDTSC;
873     general2_intercepts &= ~GENERAL2_INTERCEPT_RDTSCP;
874 
875     if ( enable )
876     {
877         general1_intercepts |= GENERAL1_INTERCEPT_RDTSC;
878         general2_intercepts |= GENERAL2_INTERCEPT_RDTSCP;
879     }
880 
881     vmcb_set_general1_intercepts(vmcb, general1_intercepts);
882     vmcb_set_general2_intercepts(vmcb, general2_intercepts);
883 }
884 
885 static void svm_set_descriptor_access_exiting(struct vcpu *v, bool enable)
886 {
887     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
888     u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
889     u32 mask = GENERAL1_INTERCEPT_IDTR_READ | GENERAL1_INTERCEPT_GDTR_READ
890             | GENERAL1_INTERCEPT_LDTR_READ | GENERAL1_INTERCEPT_TR_READ
891             | GENERAL1_INTERCEPT_IDTR_WRITE | GENERAL1_INTERCEPT_GDTR_WRITE
892             | GENERAL1_INTERCEPT_LDTR_WRITE | GENERAL1_INTERCEPT_TR_WRITE;
893 
894     if ( enable )
895         general1_intercepts |= mask;
896     else
897         general1_intercepts &= ~mask;
898 
899     vmcb_set_general1_intercepts(vmcb, general1_intercepts);
900 }
901 
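/*
 * Return (and clear) the instruction bytes that hardware (DecodeAssists)
 * latched into the VMCB for the current intercept, if any; a zero return
 * means no bytes were cached and the caller must fetch the instruction
 * itself.
 */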
902 static unsigned int svm_get_insn_bytes(struct vcpu *v, uint8_t *buf)
903 {
904     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
905     unsigned int len = v->arch.hvm_svm.cached_insn_len;
906 
907     if ( len != 0 )
908     {
909         /* Latch and clear the cached instruction. */
910         memcpy(buf, vmcb->guest_ins, MAX_INST_LEN);
911         v->arch.hvm_svm.cached_insn_len = 0;
912     }
913 
914     return len;
915 }
916 
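/*
 * Each hypercall gets a 32-byte slot containing the 9-byte stub
 *     b8 <nr>     mov  $nr, %eax
 *     0f 01 d9    vmmcall
 *     c3          ret
 * matching the bytes written below.
 */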
917 static void svm_init_hypercall_page(struct domain *d, void *hypercall_page)
918 {
919     char *p;
920     int i;
921 
922     for ( i = 0; i < (PAGE_SIZE / 32); i++ )
923     {
924         if ( i == __HYPERVISOR_iret )
925             continue;
926 
927         p = (char *)(hypercall_page + (i * 32));
928         *(u8  *)(p + 0) = 0xb8; /* mov imm32, %eax */
929         *(u32 *)(p + 1) = i;
930         *(u8  *)(p + 5) = 0x0f; /* vmmcall */
931         *(u8  *)(p + 6) = 0x01;
932         *(u8  *)(p + 7) = 0xd9;
933         *(u8  *)(p + 8) = 0xc3; /* ret */
934     }
935 
936     /* Don't support HYPERVISOR_iret at the moment */
937     *(u16 *)(hypercall_page + (__HYPERVISOR_iret * 32)) = 0x0b0f; /* ud2 */
938 }
939 
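/*
 * LWP threshold interrupts arrive on a Xen-allocated vector (see
 * svm_update_lwp_cfg() below); forward them to the guest's local APIC using
 * the vector the guest originally programmed into bits 47:40 of LWP_CFG.
 */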
940 static void svm_lwp_interrupt(struct cpu_user_regs *regs)
941 {
942     struct vcpu *curr = current;
943 
944     ack_APIC_irq();
945     vlapic_set_irq(
946         vcpu_vlapic(curr),
947         (curr->arch.hvm_svm.guest_lwp_cfg >> 40) & 0xff,
948         0);
949 }
950 
951 static inline void svm_lwp_save(struct vcpu *v)
952 {
953     /* Don't interfere with other guests: disable LWP before the next VCPU runs. */
954     if ( v->arch.hvm_svm.guest_lwp_cfg )
955     {
956         wrmsrl(MSR_AMD64_LWP_CFG, 0x0);
957         wrmsrl(MSR_AMD64_LWP_CBADDR, 0x0);
958     }
959 }
960 
961 static inline void svm_lwp_load(struct vcpu *v)
962 {
963     /* Only LWP_CFG is reloaded. LWP_CBADDR will be reloaded via xrstor. */
964    if ( v->arch.hvm_svm.guest_lwp_cfg )
965        wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg);
966 }
967 
968 /* Update LWP_CFG MSR (0xc0000105). Return -1 if error; otherwise returns 0. */
969 static int svm_update_lwp_cfg(struct vcpu *v, uint64_t msr_content)
970 {
971     uint32_t msr_low;
972     static uint8_t lwp_intr_vector;
973 
974     if ( xsave_enabled(v) && cpu_has_lwp )
975     {
976         msr_low = (uint32_t)msr_content;
977 
978         /* generate #GP if guest tries to turn on unsupported features. */
979         if ( msr_low & ~v->domain->arch.cpuid->extd.raw[0x1c].d )
980             return -1;
981 
982         v->arch.hvm_svm.guest_lwp_cfg = msr_content;
983 
984         /* setup interrupt handler if needed */
985         if ( (msr_content & 0x80000000) && ((msr_content >> 40) & 0xff) )
986         {
987             alloc_direct_apic_vector(&lwp_intr_vector, svm_lwp_interrupt);
988             v->arch.hvm_svm.cpu_lwp_cfg = (msr_content & 0xffff00ffffffffffULL)
989                 | ((uint64_t)lwp_intr_vector << 40);
990         }
991         else
992         {
993             /* otherwise disable it */
994             v->arch.hvm_svm.cpu_lwp_cfg = msr_content & 0xffff00ff7fffffffULL;
995         }
996 
997         wrmsrl(MSR_AMD64_LWP_CFG, v->arch.hvm_svm.cpu_lwp_cfg);
998 
999         /* Track nonlazy xstate use if LWP_CFG is non-zero. */
1000         v->arch.nonlazy_xstate_used = !!(msr_content);
1001     }
1002 
1003     return 0;
1004 }
1005 
1006 static inline void svm_tsc_ratio_save(struct vcpu *v)
1007 {
1008     /* Other vcpus might not have vtsc enabled. So disable TSC_RATIO here. */
1009     if ( cpu_has_tsc_ratio && !v->domain->arch.vtsc )
1010         wrmsrl(MSR_AMD64_TSC_RATIO, DEFAULT_TSC_RATIO);
1011 }
1012 
1013 static inline void svm_tsc_ratio_load(struct vcpu *v)
1014 {
1015     if ( cpu_has_tsc_ratio && !v->domain->arch.vtsc )
1016         wrmsrl(MSR_AMD64_TSC_RATIO, hvm_tsc_scaling_ratio(v->domain));
1017 }
1018 
1019 static void svm_ctxt_switch_from(struct vcpu *v)
1020 {
1021     int cpu = smp_processor_id();
1022 
1023     /*
1024      * Return early if trying to do a context switch without SVM enabled,
1025      * this can happen when the hypervisor shuts down with HVM guests
1026      * still running.
1027      */
1028     if ( unlikely((read_efer() & EFER_SVME) == 0) )
1029         return;
1030 
1031     svm_fpu_leave(v);
1032 
1033     svm_save_dr(v);
1034     svm_lwp_save(v);
1035     svm_tsc_ratio_save(v);
1036 
1037     svm_sync_vmcb(v);
1038     svm_vmload_pa(per_cpu(host_vmcb, cpu));
1039 
1040     /* Resume use of ISTs now that the host TR is reinstated. */
1041     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_DF);
1042     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NMI);
1043     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_MCE);
1044 }
1045 
1046 static void svm_ctxt_switch_to(struct vcpu *v)
1047 {
1048     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1049     int cpu = smp_processor_id();
1050 
1051     /*
1052      * This is required because VMRUN performs consistency checks, and some of
1053      * the DOM0 selectors may point to invalid GDT locations, causing AMD
1054      * processors to shut down.
1055      */
1056     asm volatile ("mov %0, %%ds; mov %0, %%es; mov %0, %%ss;" :: "r" (0));
1057 
1058     /*
1059      * Cannot use ISTs for NMI/#MC/#DF while we are running with the guest TR.
1060      * But this doesn't matter: the IST is only req'd to handle SYSCALL/SYSRET.
1061      */
1062     set_ist(&idt_tables[cpu][TRAP_double_fault],  IST_NONE);
1063     set_ist(&idt_tables[cpu][TRAP_nmi],           IST_NONE);
1064     set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);
1065 
1066     svm_restore_dr(v);
1067 
1068     svm_vmsave_pa(per_cpu(host_vmcb, cpu));
1069     svm_vmload(vmcb);
1070     vmcb->cleanbits.bytes = 0;
1071     svm_lwp_load(v);
1072     svm_tsc_ratio_load(v);
1073 
1074     if ( cpu_has_rdtscp )
1075         wrmsrl(MSR_TSC_AUX, hvm_msr_tsc_aux(v));
1076 }
1077 
1078 static void noreturn svm_do_resume(struct vcpu *v)
1079 {
1080     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1081     bool_t debug_state = v->domain->debugger_attached;
1082     bool_t vcpu_guestmode = 0;
1083     struct vlapic *vlapic = vcpu_vlapic(v);
1084 
1085     if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
1086         vcpu_guestmode = 1;
1087 
1088     if ( !vcpu_guestmode &&
1089         unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
1090     {
1091         uint32_t intercepts = vmcb_get_exception_intercepts(vmcb);
1092 
1093         v->arch.hvm_vcpu.debug_state_latch = debug_state;
1094         vmcb_set_exception_intercepts(
1095             vmcb, debug_state ? (intercepts | (1U << TRAP_int3))
1096                               : (intercepts & ~(1U << TRAP_int3)));
1097     }
1098 
1099     if ( v->arch.hvm_svm.launch_core != smp_processor_id() )
1100     {
1101         v->arch.hvm_svm.launch_core = smp_processor_id();
1102         hvm_migrate_timers(v);
1103         hvm_migrate_pirqs(v);
1104         /* Migrating to another ASID domain.  Request a new ASID. */
1105         hvm_asid_flush_vcpu(v);
1106     }
1107 
1108     if ( !vcpu_guestmode && !vlapic_hw_disabled(vlapic) )
1109     {
1110         vintr_t intr;
1111 
1112         /* Reflect the vlapic's TPR in the hardware vtpr */
1113         intr = vmcb_get_vintr(vmcb);
1114         intr.fields.tpr =
1115             (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0xFF) >> 4;
1116         vmcb_set_vintr(vmcb, intr);
1117     }
1118 
1119     hvm_do_resume(v);
1120 
1121     reset_stack_and_jump(svm_asm_do_resume);
1122 }
1123 
1124 static void svm_guest_osvw_init(struct vcpu *vcpu)
1125 {
1126     if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1127         return;
1128 
1129     /*
1130      * Guests should see errata 400 and 415 as fixed (assuming that
1131      * HLT and IO instructions are intercepted).
1132      */
1133     vcpu->arch.hvm_svm.osvw.length = (osvw_length >= 3) ? osvw_length : 3;
1134     vcpu->arch.hvm_svm.osvw.status = osvw_status & ~(6ULL);
1135 
1136     /*
1137      * By increasing VCPU's osvw.length to 3 we are telling the guest that
1138      * all osvw.status bits inside that length, including bit 0 (which is
1139      * reserved for erratum 298), are valid. However, if host processor's
1140      * osvw_len is 0 then osvw_status[0] carries no information. We need to
1141      * be conservative here and therefore we tell the guest that erratum 298
1142      * is present (because we really don't know).
1143      */
1144     if ( osvw_length == 0 && boot_cpu_data.x86 == 0x10 )
1145         vcpu->arch.hvm_svm.osvw.status |= 1;
1146 }
1147 
1148 void svm_host_osvw_reset()
1149 {
1150     spin_lock(&osvw_lock);
1151 
1152     osvw_length = 64; /* One register (MSRC001_0141) worth of errata */
1153     osvw_status = 0;
1154 
1155     spin_unlock(&osvw_lock);
1156 }
1157 
1158 void svm_host_osvw_init()
1159 {
1160     spin_lock(&osvw_lock);
1161 
1162     /*
1163      * Get OSVW bits. If bits are not the same on different processors then
1164      * choose the worst case (i.e. if erratum is present on one processor and
1165      * not on another assume that the erratum is present everywhere).
1166      */
1167     if ( test_bit(X86_FEATURE_OSVW, &boot_cpu_data.x86_capability) )
1168     {
1169         uint64_t len, status;
1170 
1171         if ( rdmsr_safe(MSR_AMD_OSVW_ID_LENGTH, len) ||
1172              rdmsr_safe(MSR_AMD_OSVW_STATUS, status) )
1173             len = status = 0;
1174 
1175         if (len < osvw_length)
1176             osvw_length = len;
1177 
1178         osvw_status |= status;
1179         osvw_status &= (1ULL << osvw_length) - 1;
1180     }
1181     else
1182         osvw_length = osvw_status = 0;
1183 
1184     spin_unlock(&osvw_lock);
1185 }
1186 
1187 static int svm_domain_initialise(struct domain *d)
1188 {
1189     static const struct arch_csw csw = {
1190         .from = svm_ctxt_switch_from,
1191         .to   = svm_ctxt_switch_to,
1192         .tail = svm_do_resume,
1193     };
1194 
1195     d->arch.ctxt_switch = &csw;
1196 
1197     return 0;
1198 }
1199 
1200 static void svm_domain_destroy(struct domain *d)
1201 {
1202 }
1203 
1204 static int svm_vcpu_initialise(struct vcpu *v)
1205 {
1206     int rc;
1207 
1208     v->arch.hvm_svm.launch_core = -1;
1209 
1210     if ( (rc = svm_create_vmcb(v)) != 0 )
1211     {
1212         dprintk(XENLOG_WARNING,
1213                 "Failed to create VMCB for vcpu %d: err=%d.\n",
1214                 v->vcpu_id, rc);
1215         return rc;
1216     }
1217 
1218     svm_guest_osvw_init(v);
1219 
1220     return 0;
1221 }
1222 
1223 static void svm_vcpu_destroy(struct vcpu *v)
1224 {
1225     svm_destroy_vmcb(v);
1226     passive_domain_destroy(v);
1227 }
1228 
1229 /*
1230  * Emulate enough of interrupt injection to cover the DPL check (omitted by
1231  * hardware), and to work out whether it is safe to move %rip forwards for
1232  * architectural trap vs fault semantics in the exception frame (which
1233  * hardware won't cope with).
1234  *
1235  * The event parameter will be modified to a fault if necessary.
1236  */
1237 static void svm_emul_swint_injection(struct x86_event *event)
1238 {
1239     struct vcpu *curr = current;
1240     const struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
1241     const struct cpu_user_regs *regs = guest_cpu_user_regs();
1242     unsigned int trap = event->vector, type = event->type;
1243     unsigned int fault = TRAP_gp_fault, ec = 0;
1244     pagefault_info_t pfinfo;
1245     struct segment_register cs, idtr;
1246     unsigned int idte_size, idte_offset;
1247     unsigned long idte_linear_addr;
1248     struct { uint32_t a, b, c, d; } idte = {};
1249     bool lm = vmcb_get_efer(vmcb) & EFER_LMA;
1250     int rc;
1251 
1252     if ( !(vmcb_get_cr0(vmcb) & X86_CR0_PE) )
1253         goto raise_exception; /* TODO: support real-mode injection? */
1254 
1255     idte_size   = lm ? 16 : 8;
1256     idte_offset = trap * idte_size;
1257 
1258     /* ICEBP sets the External Event bit despite being an instruction. */
1259     ec = (trap << 3) | X86_XEC_IDT |
1260         (type == X86_EVENTTYPE_PRI_SW_EXCEPTION ? X86_XEC_EXT : 0);
1261 
1262     /*
1263      * TODO: This does not cover the v8086 mode with CR4.VME case
1264      * correctly, but falls on the safe side from the point of view of a
1265      * 32bit OS.  Someone with many TUITs can see about reading the TSS
1266      * Software Interrupt Redirection bitmap.
1267      */
1268     if ( (regs->eflags & X86_EFLAGS_VM) &&
1269          MASK_EXTR(regs->eflags, X86_EFLAGS_IOPL) != 3 )
1270         goto raise_exception;
1271 
1272     /*
1273      * Read all 8/16 bytes so the idtr limit check is applied properly to
1274      * this entry, even though we don't look at all the words read.
1275      */
1276     hvm_get_segment_register(curr, x86_seg_cs, &cs);
1277     hvm_get_segment_register(curr, x86_seg_idtr, &idtr);
1278     if ( !hvm_virtual_to_linear_addr(x86_seg_idtr, &idtr, idte_offset,
1279                                      idte_size, hvm_access_read,
1280                                      &cs, &idte_linear_addr) )
1281         goto raise_exception;
1282 
1283     rc = hvm_copy_from_guest_linear(&idte, idte_linear_addr, idte_size,
1284                                     PFEC_implicit, &pfinfo);
1285     if ( rc )
1286     {
1287         if ( rc == HVMTRANS_bad_linear_to_gfn )
1288         {
1289             fault = TRAP_page_fault;
1290             ec = pfinfo.ec;
1291             event->cr2 = pfinfo.linear;
1292         }
1293 
1294         goto raise_exception;
1295     }
1296 
1297     /* This must be an interrupt, trap, or task gate. */
1298     switch ( (idte.b >> 8) & 0x1f )
1299     {
1300     case SYS_DESC_irq_gate:
1301     case SYS_DESC_trap_gate:
1302         break;
1303     case SYS_DESC_irq_gate16:
1304     case SYS_DESC_trap_gate16:
1305     case SYS_DESC_task_gate:
1306         if ( !lm )
1307             break;
1308         /* fall through */
1309     default:
1310         goto raise_exception;
1311     }
1312 
1313     /* The 64-bit high half's type must be zero. */
1314     if ( idte.d & 0x1f00 )
1315         goto raise_exception;
1316 
1317     /* ICEBP counts as a hardware event, and bypasses the dpl check. */
1318     if ( type != X86_EVENTTYPE_PRI_SW_EXCEPTION &&
1319          vmcb_get_cpl(vmcb) > ((idte.b >> 13) & 3) )
1320         goto raise_exception;
1321 
1322     /* Is this entry present? */
1323     if ( !(idte.b & (1u << 15)) )
1324     {
1325         fault = TRAP_no_segment;
1326         goto raise_exception;
1327     }
1328 
1329     /*
1330      * Any further fault during injection will cause a double fault.  It
1331      * is fine to leave this up to hardware, and software won't be in a
1332      * position to care about the architectural correctness of %rip in the
1333      * exception frame.
1334      */
1335     return;
1336 
1337  raise_exception:
1338     event->vector = fault;
1339     event->type = X86_EVENTTYPE_HW_EXCEPTION;
1340     event->insn_len = 0;
1341     event->error_code = ec;
1342 }
1343 
1344 static void svm_inject_event(const struct x86_event *event)
1345 {
1346     struct vcpu *curr = current;
1347     struct vmcb_struct *vmcb = curr->arch.hvm_svm.vmcb;
1348     eventinj_t eventinj = vmcb->eventinj;
1349     struct x86_event _event = *event;
1350     struct cpu_user_regs *regs = guest_cpu_user_regs();
1351 
1352     /*
1353      * For hardware lacking NRips support, and always for ICEBP instructions,
1354      * the processor requires extra help to deliver software events.
1355      *
1356      * Xen must emulate enough of the event injection to be sure that a
1357      * further fault shouldn't occur during delivery.  This covers the fact
1358      * that hardware doesn't perform DPL checking on injection.
1359      *
1360      * Also, it accounts for proper positioning of %rip for an event with trap
1361      * semantics (where %rip should point after the instruction) which suffers
1362      * a fault during injection (at which point %rip should point at the
1363      * instruction).
1364      */
1365     if ( event->type == X86_EVENTTYPE_PRI_SW_EXCEPTION ||
1366          (!cpu_has_svm_nrips && (event->type == X86_EVENTTYPE_SW_INTERRUPT ||
1367                                  event->type == X86_EVENTTYPE_SW_EXCEPTION)) )
1368         svm_emul_swint_injection(&_event);
1369 
1370     switch ( _event.vector )
1371     {
1372     case TRAP_debug:
1373         if ( regs->eflags & X86_EFLAGS_TF )
1374         {
1375             __restore_debug_registers(vmcb, curr);
1376             vmcb_set_dr6(vmcb, vmcb_get_dr6(vmcb) | 0x4000);
1377         }
1378         /* fall through */
1379     case TRAP_int3:
1380         if ( curr->domain->debugger_attached )
1381         {
1382             /* Debug/Int3: Trap to debugger. */
1383             domain_pause_for_debugger();
1384             return;
1385         }
1386     }
1387 
1388     if ( unlikely(eventinj.fields.v) &&
1389          (eventinj.fields.type == X86_EVENTTYPE_HW_EXCEPTION) )
1390     {
1391         _event.vector = hvm_combine_hw_exceptions(
1392             eventinj.fields.vector, _event.vector);
1393         if ( _event.vector == TRAP_double_fault )
1394             _event.error_code = 0;
1395     }
1396 
1397     eventinj.bytes = 0;
1398     eventinj.fields.v = 1;
1399     eventinj.fields.vector = _event.vector;
1400 
1401     /*
1402      * Refer to AMD Vol 2: System Programming, 15.20 Event Injection.
1403      *
1404      * On hardware lacking NextRIP support, and all hardware in the case of
1405      * icebp, software events with trap semantics need emulating, so %rip in
1406      * the trap frame points after the instruction.
1407      *
1408      * The x86 emulator (if requested by the x86_swint_emulate_* choice) will
1409      * have performed checks such as presence/dpl/etc and believes that the
1410      * event injection will succeed without faulting.
1411      *
1412      * The x86 emulator will always provide fault semantics for software
1413      * events, with _event.insn_len set appropriately.  If the injection
1414      * requires emulation, move %rip forwards at this point.
1415      */
1416     switch ( _event.type )
1417     {
1418     case X86_EVENTTYPE_SW_INTERRUPT: /* int $n */
1419         if ( cpu_has_svm_nrips )
1420             vmcb->nextrip = regs->rip + _event.insn_len;
1421         else
1422             regs->rip += _event.insn_len;
1423         eventinj.fields.type = X86_EVENTTYPE_SW_INTERRUPT;
1424         break;
1425 
1426     case X86_EVENTTYPE_PRI_SW_EXCEPTION: /* icebp */
1427         /*
1428          * icebp's injection must always be emulated, as hardware does not
1429          * special case HW_EXCEPTION with vector 1 (#DB) as having trap
1430          * semantics.
1431          */
1432         regs->rip += _event.insn_len;
1433         if ( cpu_has_svm_nrips )
1434             vmcb->nextrip = regs->rip;
1435         eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1436         break;
1437 
1438     case X86_EVENTTYPE_SW_EXCEPTION: /* int3, into */
1439         /*
1440          * Hardware special cases HW_EXCEPTION with vectors 3 and 4 as having
1441          * trap semantics, and will perform DPL checks.
1442          */
1443         if ( cpu_has_svm_nrips )
1444             vmcb->nextrip = regs->rip + _event.insn_len;
1445         else
1446             regs->rip += _event.insn_len;
1447         eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1448         break;
1449 
1450     default:
1451         eventinj.fields.type = X86_EVENTTYPE_HW_EXCEPTION;
1452         eventinj.fields.ev = (_event.error_code != X86_EVENT_NO_EC);
1453         eventinj.fields.errorcode = _event.error_code;
1454         break;
1455     }
1456 
1457     /*
1458      * If injecting an event outside of 64bit mode, zero the upper bits of the
1459      * %eip and nextrip after the adjustments above.
1460      */
1461     if ( !((vmcb_get_efer(vmcb) & EFER_LMA) && vmcb->cs.l) )
1462     {
1463         regs->rip = regs->eip;
1464         vmcb->nextrip = (uint32_t)vmcb->nextrip;
1465     }
1466 
1467     ASSERT(!eventinj.fields.ev ||
1468            eventinj.fields.errorcode == (uint16_t)eventinj.fields.errorcode);
1469     vmcb->eventinj = eventinj;
1470 
1471     if ( _event.vector == TRAP_page_fault )
1472     {
1473         curr->arch.hvm_vcpu.guest_cr[2] = _event.cr2;
1474         vmcb_set_cr2(vmcb, _event.cr2);
1475         HVMTRACE_LONG_2D(PF_INJECT, _event.error_code, TRC_PAR_LONG(_event.cr2));
1476     }
1477     else
1478     {
1479         HVMTRACE_2D(INJ_EXC, _event.vector, _event.error_code);
1480     }
1481 }
1482 
1483 static int svm_event_pending(struct vcpu *v)
1484 {
1485     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1486     return vmcb->eventinj.fields.v;
1487 }
1488 
1489 static void svm_cpu_dead(unsigned int cpu)
1490 {
1491     paddr_t *this_hsa = &per_cpu(hsa, cpu);
1492     paddr_t *this_vmcb = &per_cpu(host_vmcb, cpu);
1493 
1494     if ( *this_hsa )
1495     {
1496         free_domheap_page(maddr_to_page(*this_hsa));
1497         *this_hsa = 0;
1498     }
1499 
1500     if ( *this_vmcb )
1501     {
1502         free_domheap_page(maddr_to_page(*this_vmcb));
1503         *this_vmcb = 0;
1504     }
1505 }
1506 
1507 static int svm_cpu_up_prepare(unsigned int cpu)
1508 {
1509     paddr_t *this_hsa = &per_cpu(hsa, cpu);
1510     paddr_t *this_vmcb = &per_cpu(host_vmcb, cpu);
1511     nodeid_t node = cpu_to_node(cpu);
1512     unsigned int memflags = 0;
1513     struct page_info *pg;
1514 
1515     if ( node != NUMA_NO_NODE )
1516         memflags = MEMF_node(node);
1517 
1518     if ( !*this_hsa )
1519     {
1520         pg = alloc_domheap_page(NULL, memflags);
1521         if ( !pg )
1522             goto err;
1523 
1524         clear_domain_page(_mfn(page_to_mfn(pg)));
1525         *this_hsa = page_to_maddr(pg);
1526     }
1527 
1528     if ( !*this_vmcb )
1529     {
1530         pg = alloc_domheap_page(NULL, memflags);
1531         if ( !pg )
1532             goto err;
1533 
1534         clear_domain_page(_mfn(page_to_mfn(pg)));
1535         *this_vmcb = page_to_maddr(pg);
1536     }
1537 
1538     return 0;
1539 
1540  err:
1541     svm_cpu_dead(cpu);
1542     return -ENOMEM;
1543 }
1544 
1545 static void svm_init_erratum_383(const struct cpuinfo_x86 *c)
1546 {
1547     uint64_t msr_content;
1548 
1549     /* check whether CPU is affected */
1550     if ( !cpu_has_amd_erratum(c, AMD_ERRATUM_383) )
1551         return;
1552 
1553     /* use safe methods to be compatible with nested virtualization */
1554     if (rdmsr_safe(MSR_AMD64_DC_CFG, msr_content) == 0 &&
1555         wrmsr_safe(MSR_AMD64_DC_CFG, msr_content | (1ULL << 47)) == 0)
1556     {
1557         amd_erratum383_found = 1;
1558     } else {
1559         printk("Failed to enable erratum 383\n");
1560     }
1561 }
1562 
1563 static int svm_handle_osvw(struct vcpu *v, uint32_t msr, uint64_t *val, bool_t read)
1564 {
1565     if ( !v->domain->arch.cpuid->extd.osvw )
1566         return -1;
1567 
1568     if ( read )
1569     {
1570         if (msr == MSR_AMD_OSVW_ID_LENGTH)
1571             *val = v->arch.hvm_svm.osvw.length;
1572         else
1573             *val = v->arch.hvm_svm.osvw.status;
1574     }
1575     /* Writes are ignored */
1576 
1577     return 0;
1578 }
1579 
1580 static int _svm_cpu_up(bool bsp)
1581 {
1582     uint64_t msr_content;
1583     int rc;
1584     unsigned int cpu = smp_processor_id();
1585     const struct cpuinfo_x86 *c = &cpu_data[cpu];
1586 
1587     /* Check whether SVM feature is disabled in BIOS */
1588     rdmsrl(MSR_K8_VM_CR, msr_content);
1589     if ( msr_content & K8_VMCR_SVME_DISABLE )
1590     {
1591         printk("CPU%d: AMD SVM Extension is disabled in BIOS.\n", cpu);
1592         return -EINVAL;
1593     }
1594 
1595     if ( bsp && (rc = svm_cpu_up_prepare(cpu)) != 0 )
1596         return rc;
1597 
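    /* Enable SVM in EFER before touching any other SVM state on this CPU. */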
1598     write_efer(read_efer() | EFER_SVME);
1599 
1600     /* Initialize the HSA for this core. */
1601     wrmsrl(MSR_K8_VM_HSAVE_PA, per_cpu(hsa, cpu));
1602 
1603     /* check for erratum 383 */
1604     svm_init_erratum_383(c);
1605 
1606     /* Initialize core's ASID handling. */
1607     svm_asid_init(c);
1608 
1609     /*
1610      * Check whether EFER.LMSLE can be written.
1611      * Unfortunately there's no feature bit defined for this.
1612      */
1613     msr_content = read_efer();
1614     if ( wrmsr_safe(MSR_EFER, msr_content | EFER_LMSLE) == 0 )
1615         rdmsrl(MSR_EFER, msr_content);
1616     if ( msr_content & EFER_LMSLE )
1617     {
1618         if ( 0 && /* FIXME: Migration! */ bsp )
1619             cpu_has_lmsl = 1;
1620         wrmsrl(MSR_EFER, msr_content ^ EFER_LMSLE);
1621     }
1622     else
1623     {
1624         if ( cpu_has_lmsl )
1625             printk(XENLOG_WARNING "Inconsistent LMSLE support across CPUs!\n");
1626         cpu_has_lmsl = 0;
1627     }
1628 
1629     /* Initialize OSVW bits to be used by guests */
1630     svm_host_osvw_init();
1631 
1632     return 0;
1633 }
1634 
1635 static int svm_cpu_up(void)
1636 {
1637     return _svm_cpu_up(false);
1638 }
1639 
1640 const struct hvm_function_table * __init start_svm(void)
1641 {
1642     bool_t printed = 0;
1643 
1644     svm_host_osvw_reset();
1645 
1646     if ( _svm_cpu_up(true) )
1647     {
1648         printk("SVM: failed to initialise.\n");
1649         return NULL;
1650     }
1651 
1652     setup_vmcb_dump();
1653 
1654     svm_feature_flags = (current_cpu_data.extended_cpuid_level >= 0x8000000A ?
1655                          cpuid_edx(0x8000000A) : 0);
1656 
1657     printk("SVM: Supported advanced features:\n");
1658 
1659     /* DecodeAssists fast paths assume nextrip is valid for fast rIP update. */
1660     if ( !cpu_has_svm_nrips )
1661         clear_bit(SVM_FEATURE_DECODEASSISTS, &svm_feature_flags);
1662 
1663     if ( cpu_has_tsc_ratio )
1664         svm_function_table.tsc_scaling.ratio_frac_bits = 32;
1665 
1666 #define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; }
1667     P(cpu_has_svm_npt, "Nested Page Tables (NPT)");
1668     P(cpu_has_svm_lbrv, "Last Branch Record (LBR) Virtualisation");
1669     P(cpu_has_svm_nrips, "Next-RIP Saved on #VMEXIT");
1670     P(cpu_has_svm_cleanbits, "VMCB Clean Bits");
1671     P(cpu_has_svm_decode, "DecodeAssists");
1672     P(cpu_has_pause_filter, "Pause-Intercept Filter");
1673     P(cpu_has_tsc_ratio, "TSC Rate MSR");
1674 #undef P
1675 
1676     if ( !printed )
1677         printk(" - none\n");
1678 
1679     svm_function_table.hap_supported = !!cpu_has_svm_npt;
1680     svm_function_table.hap_capabilities = HVM_HAP_SUPERPAGE_2MB |
1681         (cpu_has_page1gb ? HVM_HAP_SUPERPAGE_1GB : 0);
1682 
1683     return &svm_function_table;
1684 }
1685 
1686 static void svm_do_nested_pgfault(struct vcpu *v,
1687     struct cpu_user_regs *regs, uint64_t pfec, paddr_t gpa)
1688 {
1689     int ret;
1690     unsigned long gfn = gpa >> PAGE_SHIFT;
1691     mfn_t mfn;
1692     p2m_type_t p2mt;
1693     p2m_access_t p2ma;
1694     struct p2m_domain *p2m = NULL;
1695 
1696     /*
1697      * Since HW doesn't explicitly provide a read access bit and we need to
1698      * somehow describe read-modify-write instructions we will conservatively
1699      * set read_access for all memory accesses that are not instruction fetches.
1700      */
1701     struct npfec npfec = {
1702         .read_access = !(pfec & PFEC_insn_fetch),
1703         .write_access = !!(pfec & PFEC_write_access),
1704         .insn_fetch = !!(pfec & PFEC_insn_fetch),
1705         .present = !!(pfec & PFEC_page_present),
1706     };
1707 
1708     /* These bits are mutually exclusive */
1709     if ( pfec & NPT_PFEC_with_gla )
1710         npfec.kind = npfec_kind_with_gla;
1711     else if ( pfec & NPT_PFEC_in_gpt )
1712         npfec.kind = npfec_kind_in_gpt;
1713 
1714     ret = hvm_hap_nested_page_fault(gpa, ~0ul, npfec);
1715 
1716     if ( tb_init_done )
1717     {
1718         struct {
1719             uint64_t gpa;
1720             uint64_t mfn;
1721             uint32_t qualification;
1722             uint32_t p2mt;
1723         } _d;
1724 
1725         p2m = p2m_get_p2m(v);
1726         _d.gpa = gpa;
1727         _d.qualification = 0;
1728         mfn = __get_gfn_type_access(p2m, gfn, &_d.p2mt, &p2ma, 0, NULL, 0);
1729         _d.mfn = mfn_x(mfn);
1730 
1731         __trace_var(TRC_HVM_NPF, 0, sizeof(_d), &_d);
1732     }
1733 
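    /*
     * hvm_hap_nested_page_fault() returns 1 when the fault has been handled,
     * -1 when it must be reflected to the L1 hypervisor as a nested
     * #VMEXIT(NPF), and 0 when nothing could deal with it, in which case we
     * fall through to the error path below.
     */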
1734     switch (ret) {
1735     case 0:
1736         break;
1737     case 1:
1738         return;
1739     case -1:
1740         ASSERT(nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v));
1741         /* inject #VMEXIT(NPF) into guest. */
1742         nestedsvm_vmexit_defer(v, VMEXIT_NPF, pfec, gpa);
1743         return;
1744     }
1745 
1746     if ( p2m == NULL )
1747         p2m = p2m_get_p2m(v);
1748     /* Everything else is an error. */
1749     mfn = __get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL, 0);
1750     gdprintk(XENLOG_ERR,
1751          "SVM violation gpa %#"PRIpaddr", mfn %#lx, type %i\n",
1752          gpa, mfn_x(mfn), p2mt);
1753     domain_crash(v->domain);
1754 }
1755 
1756 static void svm_fpu_dirty_intercept(void)
1757 {
1758     struct vcpu *v = current;
1759     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1760     struct vmcb_struct *n1vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
1761 
1762     svm_fpu_enter(v);
1763 
1764     if ( vmcb != n1vmcb )
1765     {
1766        /* Check if l1 guest must make FPU ready for the l2 guest */
1767        if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS )
1768            hvm_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1769        else
1770            vmcb_set_cr0(n1vmcb, vmcb_get_cr0(n1vmcb) & ~X86_CR0_TS);
1771        return;
1772     }
1773 
1774     if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
1775         vmcb_set_cr0(vmcb, vmcb_get_cr0(vmcb) & ~X86_CR0_TS);
1776 }
1777 
1778 static void svm_vmexit_do_cpuid(struct cpu_user_regs *regs)
1779 {
1780     struct vcpu *curr = current;
1781     unsigned int inst_len;
1782     struct cpuid_leaf res;
1783 
1784     if ( (inst_len = __get_instruction_length(curr, INSTR_CPUID)) == 0 )
1785         return;
1786 
1787     if ( hvm_check_cpuid_faulting(curr) )
1788     {
1789         hvm_inject_hw_exception(TRAP_gp_fault, 0);
1790         return;
1791     }
1792 
1793     guest_cpuid(curr, regs->eax, regs->ecx, &res);
1794     HVMTRACE_5D(CPUID, regs->eax, res.a, res.b, res.c, res.d);
1795 
1796     regs->rax = res.a;
1797     regs->rbx = res.b;
1798     regs->rcx = res.c;
1799     regs->rdx = res.d;
1800 
1801     __update_guest_eip(regs, inst_len);
1802 }
1803 
1804 static void svm_vmexit_do_cr_access(
1805     struct vmcb_struct *vmcb, struct cpu_user_regs *regs)
1806 {
1807     int gp, cr, dir, rc;
1808 
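    /*
     * With DecodeAssists the exit code itself encodes the CR number and the
     * access direction (writes start at VMEXIT_CR0_WRITE, 16 exit codes
     * after the reads), while exitinfo1[3:0] names the GPR operand of the
     * MOV instruction.
     */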
1809     cr = vmcb->exitcode - VMEXIT_CR0_READ;
1810     dir = (cr > 15);
1811     cr &= 0xf;
1812     gp = vmcb->exitinfo1 & 0xf;
1813 
1814     rc = dir ? hvm_mov_to_cr(cr, gp) : hvm_mov_from_cr(cr, gp);
1815 
1816     if ( rc == X86EMUL_OKAY )
1817         __update_guest_eip(regs, vmcb->nextrip - vmcb->rip);
1818 }
1819 
1820 static void svm_dr_access(struct vcpu *v, struct cpu_user_regs *regs)
1821 {
1822     struct vmcb_struct *vmcb = vcpu_nestedhvm(v).nv_n1vmcx;
1823 
1824     HVMTRACE_0D(DR_WRITE);
1825     __restore_debug_registers(vmcb, v);
1826 }
1827 
1828 static int svm_msr_read_intercept(unsigned int msr, uint64_t *msr_content)
1829 {
1830     int ret;
1831     struct vcpu *v = current;
1832     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1833 
1834     switch ( msr )
1835     {
1836     case MSR_IA32_SYSENTER_CS:
1837         *msr_content = v->arch.hvm_svm.guest_sysenter_cs;
1838         break;
1839     case MSR_IA32_SYSENTER_ESP:
1840         *msr_content = v->arch.hvm_svm.guest_sysenter_esp;
1841         break;
1842     case MSR_IA32_SYSENTER_EIP:
1843         *msr_content = v->arch.hvm_svm.guest_sysenter_eip;
1844         break;
1845 
1846     case MSR_IA32_MCx_MISC(4): /* Threshold register */
1847     case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
1848         /*
1849          * MCA/MCE: We report that the threshold register is unavailable
1850          * for OS use (locked by the BIOS).
1851          */
1852         *msr_content = 1ULL << 61; /* MC4_MISC.Locked */
1853         break;
1854 
1855     case MSR_IA32_EBC_FREQUENCY_ID:
1856         /*
1857          * This Intel-only register may be accessed if this HVM guest
1858          * has been migrated from an Intel host. The value zero is not
1859          * particularly meaningful, but at least avoids the guest crashing!
1860          */
1861         *msr_content = 0;
1862         break;
1863 
1864     case MSR_IA32_DEBUGCTLMSR:
1865         *msr_content = vmcb_get_debugctlmsr(vmcb);
1866         break;
1867 
1868     case MSR_IA32_LASTBRANCHFROMIP:
1869         *msr_content = vmcb_get_lastbranchfromip(vmcb);
1870         break;
1871 
1872     case MSR_IA32_LASTBRANCHTOIP:
1873         *msr_content = vmcb_get_lastbranchtoip(vmcb);
1874         break;
1875 
1876     case MSR_IA32_LASTINTFROMIP:
1877         *msr_content = vmcb_get_lastintfromip(vmcb);
1878         break;
1879 
1880     case MSR_IA32_LASTINTTOIP:
1881         *msr_content = vmcb_get_lastinttoip(vmcb);
1882         break;
1883 
1884     case MSR_AMD64_LWP_CFG:
1885         *msr_content = v->arch.hvm_svm.guest_lwp_cfg;
1886         break;
1887 
1888     case MSR_K7_PERFCTR0:
1889     case MSR_K7_PERFCTR1:
1890     case MSR_K7_PERFCTR2:
1891     case MSR_K7_PERFCTR3:
1892     case MSR_K7_EVNTSEL0:
1893     case MSR_K7_EVNTSEL1:
1894     case MSR_K7_EVNTSEL2:
1895     case MSR_K7_EVNTSEL3:
1896     case MSR_AMD_FAM15H_PERFCTR0:
1897     case MSR_AMD_FAM15H_PERFCTR1:
1898     case MSR_AMD_FAM15H_PERFCTR2:
1899     case MSR_AMD_FAM15H_PERFCTR3:
1900     case MSR_AMD_FAM15H_PERFCTR4:
1901     case MSR_AMD_FAM15H_PERFCTR5:
1902     case MSR_AMD_FAM15H_EVNTSEL0:
1903     case MSR_AMD_FAM15H_EVNTSEL1:
1904     case MSR_AMD_FAM15H_EVNTSEL2:
1905     case MSR_AMD_FAM15H_EVNTSEL3:
1906     case MSR_AMD_FAM15H_EVNTSEL4:
1907     case MSR_AMD_FAM15H_EVNTSEL5:
1908         if ( vpmu_do_rdmsr(msr, msr_content) )
1909             goto gpf;
1910         break;
1911 
1912     case MSR_AMD64_DR0_ADDRESS_MASK:
1913         if ( !v->domain->arch.cpuid->extd.dbext )
1914             goto gpf;
1915         *msr_content = v->arch.hvm_svm.dr_mask[0];
1916         break;
1917 
1918     case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
1919         if ( !v->domain->arch.cpuid->extd.dbext )
1920             goto gpf;
1921         *msr_content =
1922             v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1];
1923         break;
1924 
1925     case MSR_AMD_OSVW_ID_LENGTH:
1926     case MSR_AMD_OSVW_STATUS:
1927         ret = svm_handle_osvw(v, msr, msr_content, 1);
1928         if ( ret < 0 )
1929             goto gpf;
1930         break;
1931 
1932     default:
1933         ret = nsvm_rdmsr(v, msr, msr_content);
1934         if ( ret < 0 )
1935             goto gpf;
1936         else if ( ret )
1937             break;
1938 
1939         if ( rdmsr_viridian_regs(msr, msr_content) ||
1940              rdmsr_hypervisor_regs(msr, msr_content) )
1941             break;
1942 
1943         if ( rdmsr_safe(msr, *msr_content) == 0 )
1944             break;
1945 
1946         if ( boot_cpu_data.x86 == 0xf && msr == MSR_F10_BU_CFG )
1947         {
1948             /* Win2k8 x64 reads this MSR on revF chips, where it
1949              * wasn't publicly available; it uses a magic constant
1950              * in %rdi as a password, which we don't have in
1951              * rdmsr_safe().  Since we'll ignore the later writes,
1952              * just use a plausible value here (the reset value from
1953              * rev10h chips) if the real CPU didn't provide one. */
1954             *msr_content = 0x0000000010200020ull;
1955             break;
1956         }
1957 
1958         goto gpf;
1959     }
1960 
1961     HVM_DBG_LOG(DBG_LEVEL_MSR, "returns: ecx=%x, msr_value=%"PRIx64,
1962                 msr, *msr_content);
1963     return X86EMUL_OKAY;
1964 
1965  gpf:
1966     return X86EMUL_EXCEPTION;
1967 }
1968 
1969 static int svm_msr_write_intercept(unsigned int msr, uint64_t msr_content)
1970 {
1971     int ret, result = X86EMUL_OKAY;
1972     struct vcpu *v = current;
1973     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
1974     int sync = 0;
1975 
1976     switch ( msr )
1977     {
1978     case MSR_IA32_SYSENTER_CS:
1979     case MSR_IA32_SYSENTER_ESP:
1980     case MSR_IA32_SYSENTER_EIP:
1981         sync = 1;
1982         break;
1983     default:
1984         break;
1985     }
1986 
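    /*
     * The SYSENTER MSRs are part of the VMLOAD/VMSAVE state rather than
     * being saved automatically on #VMEXIT, so refresh the VMCB copy
     * (VMSAVE) before updating it and push the new values back into the
     * hardware registers (VMLOAD) once the write below has been performed.
     */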
1987     if ( sync )
1988         svm_sync_vmcb(v);
1989 
1990     switch ( msr )
1991     {
1992     case MSR_IA32_SYSENTER_CS:
1993         vmcb->sysenter_cs = v->arch.hvm_svm.guest_sysenter_cs = msr_content;
1994         break;
1995     case MSR_IA32_SYSENTER_ESP:
1996         vmcb->sysenter_esp = v->arch.hvm_svm.guest_sysenter_esp = msr_content;
1997         break;
1998     case MSR_IA32_SYSENTER_EIP:
1999         vmcb->sysenter_eip = v->arch.hvm_svm.guest_sysenter_eip = msr_content;
2000         break;
2001 
2002     case MSR_IA32_DEBUGCTLMSR:
2003         vmcb_set_debugctlmsr(vmcb, msr_content);
2004         if ( !msr_content || !cpu_has_svm_lbrv )
2005             break;
2006         vmcb->lbr_control.fields.enable = 1;
2007         svm_disable_intercept_for_msr(v, MSR_IA32_DEBUGCTLMSR);
2008         svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHFROMIP);
2009         svm_disable_intercept_for_msr(v, MSR_IA32_LASTBRANCHTOIP);
2010         svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTFROMIP);
2011         svm_disable_intercept_for_msr(v, MSR_IA32_LASTINTTOIP);
2012         break;
2013 
2014     case MSR_IA32_LASTBRANCHFROMIP:
2015         vmcb_set_lastbranchfromip(vmcb, msr_content);
2016         break;
2017 
2018     case MSR_IA32_LASTBRANCHTOIP:
2019         vmcb_set_lastbranchtoip(vmcb, msr_content);
2020         break;
2021 
2022     case MSR_IA32_LASTINTFROMIP:
2023         vmcb_set_lastintfromip(vmcb, msr_content);
2024         break;
2025 
2026     case MSR_IA32_LASTINTTOIP:
2027         vmcb_set_lastinttoip(vmcb, msr_content);
2028         break;
2029 
2030     case MSR_AMD64_LWP_CFG:
2031         if ( svm_update_lwp_cfg(v, msr_content) < 0 )
2032             goto gpf;
2033         break;
2034 
2035     case MSR_K7_PERFCTR0:
2036     case MSR_K7_PERFCTR1:
2037     case MSR_K7_PERFCTR2:
2038     case MSR_K7_PERFCTR3:
2039     case MSR_K7_EVNTSEL0:
2040     case MSR_K7_EVNTSEL1:
2041     case MSR_K7_EVNTSEL2:
2042     case MSR_K7_EVNTSEL3:
2043     case MSR_AMD_FAM15H_PERFCTR0:
2044     case MSR_AMD_FAM15H_PERFCTR1:
2045     case MSR_AMD_FAM15H_PERFCTR2:
2046     case MSR_AMD_FAM15H_PERFCTR3:
2047     case MSR_AMD_FAM15H_PERFCTR4:
2048     case MSR_AMD_FAM15H_PERFCTR5:
2049     case MSR_AMD_FAM15H_EVNTSEL0:
2050     case MSR_AMD_FAM15H_EVNTSEL1:
2051     case MSR_AMD_FAM15H_EVNTSEL2:
2052     case MSR_AMD_FAM15H_EVNTSEL3:
2053     case MSR_AMD_FAM15H_EVNTSEL4:
2054     case MSR_AMD_FAM15H_EVNTSEL5:
2055         if ( vpmu_do_wrmsr(msr, msr_content, 0) )
2056             goto gpf;
2057         break;
2058 
2059     case MSR_IA32_MCx_MISC(4): /* Threshold register */
2060     case MSR_F10_MC4_MISC1 ... MSR_F10_MC4_MISC3:
2061         /*
2062          * MCA/MCE: Threshold register is reported to be locked, so we ignore
2063          * all write accesses. This behaviour matches real HW, so guests should
2064          * have no problem with this.
2065          */
2066         break;
2067 
2068     case MSR_AMD64_DR0_ADDRESS_MASK:
2069         if ( !v->domain->arch.cpuid->extd.dbext || (msr_content >> 32) )
2070             goto gpf;
2071         v->arch.hvm_svm.dr_mask[0] = msr_content;
2072         break;
2073 
2074     case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
2075         if ( !v->domain->arch.cpuid->extd.dbext || (msr_content >> 32) )
2076             goto gpf;
2077         v->arch.hvm_svm.dr_mask[msr - MSR_AMD64_DR1_ADDRESS_MASK + 1] =
2078             msr_content;
2079         break;
2080 
2081     case MSR_AMD_OSVW_ID_LENGTH:
2082     case MSR_AMD_OSVW_STATUS:
2083         ret = svm_handle_osvw(v, msr, &msr_content, 0);
2084         if ( ret < 0 )
2085             goto gpf;
2086         break;
2087 
2088     default:
2089         ret = nsvm_wrmsr(v, msr, msr_content);
2090         if ( ret < 0 )
2091             goto gpf;
2092         else if ( ret )
2093             break;
2094 
2095         if ( wrmsr_viridian_regs(msr, msr_content) )
2096             break;
2097 
2098         switch ( wrmsr_hypervisor_regs(msr, msr_content) )
2099         {
2100         case -ERESTART:
2101             result = X86EMUL_RETRY;
2102             break;
2103         case 0:
2104         case 1:
2105             break;
2106         default:
2107             goto gpf;
2108         }
2109         break;
2110     }
2111 
2112     if ( sync )
2113         svm_vmload(vmcb);
2114 
2115     return result;
2116 
2117  gpf:
2118     return X86EMUL_EXCEPTION;
2119 }
2120 
2121 static void svm_do_msr_access(struct cpu_user_regs *regs)
2122 {
2123     struct vcpu *curr = current;
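    /* For VMEXIT_MSR, exitinfo1 is 0 for RDMSR and 1 for WRMSR. */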
2124     bool rdmsr = curr->arch.hvm_svm.vmcb->exitinfo1 == 0;
2125     int rc, inst_len = __get_instruction_length(
2126         curr, rdmsr ? INSTR_RDMSR : INSTR_WRMSR);
2127 
2128     if ( inst_len == 0 )
2129         return;
2130 
2131     if ( rdmsr )
2132     {
2133         uint64_t msr_content = 0;
2134 
2135         rc = hvm_msr_read_intercept(regs->ecx, &msr_content);
2136         if ( rc == X86EMUL_OKAY )
2137             msr_split(regs, msr_content);
2138     }
2139     else
2140         rc = hvm_msr_write_intercept(regs->ecx, msr_fold(regs), 1);
2141 
2142     if ( rc == X86EMUL_OKAY )
2143         __update_guest_eip(regs, inst_len);
2144     else if ( rc == X86EMUL_EXCEPTION )
2145         hvm_inject_hw_exception(TRAP_gp_fault, 0);
2146 }
2147 
2148 static void svm_vmexit_do_hlt(struct vmcb_struct *vmcb,
2149                               struct cpu_user_regs *regs)
2150 {
2151     unsigned int inst_len;
2152 
2153     if ( (inst_len = __get_instruction_length(current, INSTR_HLT)) == 0 )
2154         return;
2155     __update_guest_eip(regs, inst_len);
2156 
2157     hvm_hlt(regs->eflags);
2158 }
2159 
2160 static void svm_vmexit_do_rdtsc(struct cpu_user_regs *regs)
2161 {
2162     unsigned int inst_len;
2163 
2164     if ( (inst_len = __get_instruction_length(current, INSTR_RDTSC)) == 0 )
2165         return;
2166     __update_guest_eip(regs, inst_len);
2167 
2168     hvm_rdtsc_intercept(regs);
2169 }
2170 
2171 static void svm_vmexit_do_pause(struct cpu_user_regs *regs)
2172 {
2173     unsigned int inst_len;
2174 
2175     if ( (inst_len = __get_instruction_length(current, INSTR_PAUSE)) == 0 )
2176         return;
2177     __update_guest_eip(regs, inst_len);
2178 
2179     /*
2180      * The guest is running a contended spinlock and we've detected it.
2181      * Do something useful, like reschedule the guest.
2182      */
2183     perfc_incr(pauseloop_exits);
2184     do_sched_op(SCHEDOP_yield, guest_handle_from_ptr(NULL, void));
2185 }
2186 
2187 static void
2188 svm_vmexit_do_vmrun(struct cpu_user_regs *regs,
2189                     struct vcpu *v, uint64_t vmcbaddr)
2190 {
2191     if ( !nsvm_efer_svm_enabled(v) )
2192     {
2193         gdprintk(XENLOG_ERR, "VMRUN: nestedhvm disabled, injecting #UD\n");
2194         hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2195         return;
2196     }
2197 
2198     if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
2199     {
2200         gdprintk(XENLOG_ERR, "VMRUN: mapping vmcb failed, injecting #GP\n");
2201         hvm_inject_hw_exception(TRAP_gp_fault, 0);
2202         return;
2203     }
2204 
2205     vcpu_nestedhvm(v).nv_vmentry_pending = 1;
2206     return;
2207 }
2208 
2209 static struct page_info *
2210 nsvm_get_nvmcb_page(struct vcpu *v, uint64_t vmcbaddr)
2211 {
2212     p2m_type_t p2mt;
2213     struct page_info *page;
2214     struct nestedvcpu *nv = &vcpu_nestedhvm(v);
2215 
2216     if ( !nestedsvm_vmcb_map(v, vmcbaddr) )
2217         return NULL;
2218 
2219     /* Need to translate L1-GPA to MPA */
2220     page = get_page_from_gfn(v->domain,
2221                             nv->nv_vvmcxaddr >> PAGE_SHIFT,
2222                             &p2mt, P2M_ALLOC | P2M_UNSHARE);
2223     if ( !page )
2224         return NULL;
2225 
2226     if ( !p2m_is_ram(p2mt) || p2m_is_readonly(p2mt) )
2227     {
2228         put_page(page);
2229         return NULL;
2230     }
2231 
2232     return  page;
2233 }
2234 
2235 static void
2236 svm_vmexit_do_vmload(struct vmcb_struct *vmcb,
2237                      struct cpu_user_regs *regs,
2238                      struct vcpu *v, uint64_t vmcbaddr)
2239 {
2240     unsigned int inst_len;
2241     struct page_info *page;
2242 
2243     if ( (inst_len = __get_instruction_length(v, INSTR_VMLOAD)) == 0 )
2244         return;
2245 
2246     if ( !nsvm_efer_svm_enabled(v) )
2247     {
2248         gdprintk(XENLOG_ERR, "VMLOAD: nestedhvm disabled, injecting #UD\n");
2249         hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2250         return;
2251     }
2252 
2253     page = nsvm_get_nvmcb_page(v, vmcbaddr);
2254     if ( !page )
2255     {
2256         gdprintk(XENLOG_ERR,
2257             "VMLOAD: mapping failed, injecting #GP\n");
2258         hvm_inject_hw_exception(TRAP_gp_fault, 0);
2259         return;
2260     }
2261 
2262     svm_vmload_pa(page_to_maddr(page));
2263     put_page(page);
2264 
2265     /* State in L1 VMCB is stale now */
2266     v->arch.hvm_svm.vmcb_in_sync = 0;
2267 
2268     __update_guest_eip(regs, inst_len);
2269 }
2270 
2271 static void
2272 svm_vmexit_do_vmsave(struct vmcb_struct *vmcb,
2273                      struct cpu_user_regs *regs,
2274                      struct vcpu *v, uint64_t vmcbaddr)
2275 {
2276     unsigned int inst_len;
2277     struct page_info *page;
2278 
2279     if ( (inst_len = __get_instruction_length(v, INSTR_VMSAVE)) == 0 )
2280         return;
2281 
2282     if ( !nsvm_efer_svm_enabled(v) )
2283     {
2284         gdprintk(XENLOG_ERR, "VMSAVE: nestedhvm disabled, injecting #UD\n");
2285         hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2286         return;
2287     }
2288 
2289     page = nsvm_get_nvmcb_page(v, vmcbaddr);
2290     if ( !page )
2291     {
2292         gdprintk(XENLOG_ERR,
2293             "VMSAVE: mapping vmcb failed, injecting #GP\n");
2294         hvm_inject_hw_exception(TRAP_gp_fault, 0);
2295         return;
2296     }
2297 
2298     svm_vmsave_pa(page_to_maddr(page));
2299     put_page(page);
2300     __update_guest_eip(regs, inst_len);
2301 }
2302 
2303 static int svm_is_erratum_383(struct cpu_user_regs *regs)
2304 {
2305     uint64_t msr_content;
2306     uint32_t i;
2307     struct vcpu *v = current;
2308 
2309     if ( !amd_erratum383_found )
2310         return 0;
2311 
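    /*
     * Erratum 383 shows up as a machine check with this particular
     * MC0_STATUS signature; anything else is treated as a genuine #MC.
     */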
2312     rdmsrl(MSR_IA32_MC0_STATUS, msr_content);
2313     /* Bit 62 may or may not be set for this mce */
2314     msr_content &= ~(1ULL << 62);
2315 
2316     if ( msr_content != 0xb600000000010015ULL )
2317         return 0;
2318 
2319     /* Clear MCi_STATUS registers */
2320     for (i = 0; i < nr_mce_banks; i++)
2321         wrmsrl(MSR_IA32_MCx_STATUS(i), 0ULL);
2322 
2323     rdmsrl(MSR_IA32_MCG_STATUS, msr_content);
2324     wrmsrl(MSR_IA32_MCG_STATUS, msr_content & ~(1ULL << 2));
2325 
2326     /* flush TLB */
2327     flush_tlb_mask(v->domain->domain_dirty_cpumask);
2328 
2329     return 1;
2330 }
2331 
2332 static void svm_vmexit_mce_intercept(
2333     struct vcpu *v, struct cpu_user_regs *regs)
2334 {
2335     if ( svm_is_erratum_383(regs) )
2336     {
2337         gdprintk(XENLOG_ERR, "SVM hits AMD erratum 383\n");
2338         domain_crash(v->domain);
2339     }
2340 }
2341 
2342 static void svm_wbinvd_intercept(void)
2343 {
2344     if ( cache_flush_permitted(current->domain) )
2345         flush_all(FLUSH_CACHE);
2346 }
2347 
2348 static void svm_vmexit_do_invalidate_cache(struct cpu_user_regs *regs)
2349 {
2350     static const enum instruction_index list[] = { INSTR_INVD, INSTR_WBINVD };
2351     int inst_len;
2352 
2353     inst_len = __get_instruction_length_from_list(
2354         current, list, ARRAY_SIZE(list));
2355     if ( inst_len == 0 )
2356         return;
2357 
2358     svm_wbinvd_intercept();
2359 
2360     __update_guest_eip(regs, inst_len);
2361 }
2362 
2363 static void svm_invlpga_intercept(
2364     struct vcpu *v, unsigned long vaddr, uint32_t asid)
2365 {
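    /*
     * Translate the guest-visible ASID: 0 names the vCPU's own (L1) address
     * space, anything else the nested (L2) guest's ASID.
     */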
2366     svm_invlpga(vaddr,
2367                 (asid == 0)
2368                 ? v->arch.hvm_vcpu.n1asid.asid
2369                 : vcpu_nestedhvm(v).nv_n2asid.asid);
2370 }
2371 
2372 static void svm_invlpg_intercept(unsigned long vaddr)
2373 {
2374     HVMTRACE_LONG_2D(INVLPG, 0, TRC_PAR_LONG(vaddr));
2375     paging_invlpg(current, vaddr);
2376 }
2377 
2378 static bool is_invlpg(const struct x86_emulate_state *state,
2379                       const struct x86_emulate_ctxt *ctxt)
2380 {
2381     unsigned int ext;
2382 
2383     return ctxt->opcode == X86EMUL_OPC(0x0f, 0x01) &&
2384            x86_insn_modrm(state, NULL, &ext) != 3 &&
2385            (ext & 7) == 7;
2386 }
2387 
2388 static void svm_invlpg(struct vcpu *v, unsigned long vaddr)
2389 {
2390     svm_asid_g_invlpg(v, vaddr);
2391 }
2392 
2393 static bool svm_get_pending_event(struct vcpu *v, struct x86_event *info)
2394 {
2395     const struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2396 
2397     if ( !vmcb->eventinj.fields.v )
2398         return false;
2399 
2400     info->vector = vmcb->eventinj.fields.vector;
2401     info->type = vmcb->eventinj.fields.type;
2402     info->error_code = vmcb->eventinj.fields.errorcode;
2403 
2404     return true;
2405 }
2406 
2407 static struct hvm_function_table __initdata svm_function_table = {
2408     .name                 = "SVM",
2409     .cpu_up_prepare       = svm_cpu_up_prepare,
2410     .cpu_dead             = svm_cpu_dead,
2411     .cpu_up               = svm_cpu_up,
2412     .cpu_down             = svm_cpu_down,
2413     .domain_initialise    = svm_domain_initialise,
2414     .domain_destroy       = svm_domain_destroy,
2415     .vcpu_initialise      = svm_vcpu_initialise,
2416     .vcpu_destroy         = svm_vcpu_destroy,
2417     .save_cpu_ctxt        = svm_save_vmcb_ctxt,
2418     .load_cpu_ctxt        = svm_load_vmcb_ctxt,
2419     .init_msr             = svm_init_msr,
2420     .save_msr             = svm_save_msr,
2421     .load_msr             = svm_load_msr,
2422     .get_interrupt_shadow = svm_get_interrupt_shadow,
2423     .set_interrupt_shadow = svm_set_interrupt_shadow,
2424     .guest_x86_mode       = svm_guest_x86_mode,
2425     .get_cpl              = svm_get_cpl,
2426     .get_segment_register = svm_get_segment_register,
2427     .set_segment_register = svm_set_segment_register,
2428     .get_shadow_gs_base   = svm_get_shadow_gs_base,
2429     .update_guest_cr      = svm_update_guest_cr,
2430     .update_guest_efer    = svm_update_guest_efer,
2431     .update_guest_vendor  = svm_update_guest_vendor,
2432     .fpu_leave            = svm_fpu_leave,
2433     .set_guest_pat        = svm_set_guest_pat,
2434     .get_guest_pat        = svm_get_guest_pat,
2435     .set_tsc_offset       = svm_set_tsc_offset,
2436     .inject_event         = svm_inject_event,
2437     .init_hypercall_page  = svm_init_hypercall_page,
2438     .event_pending        = svm_event_pending,
2439     .get_pending_event    = svm_get_pending_event,
2440     .invlpg               = svm_invlpg,
2441     .wbinvd_intercept     = svm_wbinvd_intercept,
2442     .fpu_dirty_intercept  = svm_fpu_dirty_intercept,
2443     .msr_read_intercept   = svm_msr_read_intercept,
2444     .msr_write_intercept  = svm_msr_write_intercept,
2445     .set_rdtsc_exiting    = svm_set_rdtsc_exiting,
2446     .set_descriptor_access_exiting = svm_set_descriptor_access_exiting,
2447     .get_insn_bytes       = svm_get_insn_bytes,
2448 
2449     .nhvm_vcpu_initialise = nsvm_vcpu_initialise,
2450     .nhvm_vcpu_destroy = nsvm_vcpu_destroy,
2451     .nhvm_vcpu_reset = nsvm_vcpu_reset,
2452     .nhvm_vcpu_vmexit_event = nsvm_vcpu_vmexit_event,
2453     .nhvm_vcpu_p2m_base = nsvm_vcpu_hostcr3,
2454     .nhvm_vmcx_guest_intercepts_event = nsvm_vmcb_guest_intercepts_event,
2455     .nhvm_vmcx_hap_enabled = nsvm_vmcb_hap_enabled,
2456     .nhvm_intr_blocked = nsvm_intr_blocked,
2457     .nhvm_hap_walk_L1_p2m = nsvm_hap_walk_L1_p2m,
2458 
2459     .tsc_scaling = {
2460         .max_ratio = ~TSC_RATIO_RSVD_BITS,
2461     },
2462 };
2463 
2464 void svm_vmexit_handler(struct cpu_user_regs *regs)
2465 {
2466     uint64_t exit_reason;
2467     struct vcpu *v = current;
2468     struct vmcb_struct *vmcb = v->arch.hvm_svm.vmcb;
2469     eventinj_t eventinj;
2470     int inst_len, rc;
2471     vintr_t intr;
2472     bool_t vcpu_guestmode = 0;
2473     struct vlapic *vlapic = vcpu_vlapic(v);
2474 
2475     hvm_invalidate_regs_fields(regs);
2476 
2477     if ( paging_mode_hap(v->domain) )
2478         v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
2479             vmcb_get_cr3(vmcb);
2480 
2481     if ( nestedhvm_enabled(v->domain) && nestedhvm_vcpu_in_guestmode(v) )
2482         vcpu_guestmode = 1;
2483 
2484     /*
2485      * Before doing anything else, we need to sync up the VLAPIC's TPR with
2486      * SVM's vTPR. It's OK if the guest doesn't touch CR8 (e.g. 32-bit Windows)
2487      * because we update the vTPR on MMIO writes to the TPR.
2488      * NB. We need to preserve the low bits of the TPR to make checked builds
2489      * of Windows work, even though they don't actually do anything.
2490      */
2491     if ( !vcpu_guestmode && !vlapic_hw_disabled(vlapic) )
2492     {
2493         intr = vmcb_get_vintr(vmcb);
2494         vlapic_set_reg(vlapic, APIC_TASKPRI,
2495                    ((intr.fields.tpr & 0x0F) << 4) |
2496                    (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0x0F));
2497     }
2498 
2499     exit_reason = vmcb->exitcode;
2500 
2501     if ( hvm_long_mode_active(v) )
2502         HVMTRACE_ND(VMEXIT64, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
2503                     1/*cycles*/, 3, exit_reason,
2504                     regs->eip, regs->rip >> 32, 0, 0, 0);
2505     else
2506         HVMTRACE_ND(VMEXIT, vcpu_guestmode ? TRC_HVM_NESTEDFLAG : 0,
2507                     1/*cycles*/, 2, exit_reason,
2508                     regs->eip, 0, 0, 0, 0);
2509 
2510     if ( vcpu_guestmode ) {
2511         enum nestedhvm_vmexits nsret;
2512         struct nestedvcpu *nv = &vcpu_nestedhvm(v);
2513         struct vmcb_struct *ns_vmcb = nv->nv_vvmcx;
2514         uint64_t exitinfo1, exitinfo2;
2515 
2516         paging_update_nestedmode(v);
2517 
2518         /* Write real exitinfo1 back into virtual vmcb.
2519          * nestedsvm_check_intercepts() expects to have the correct
2520          * exitinfo1 value there.
2521          */
2522         exitinfo1 = ns_vmcb->exitinfo1;
2523         ns_vmcb->exitinfo1 = vmcb->exitinfo1;
2524         nsret = nestedsvm_check_intercepts(v, regs, exit_reason);
2525         switch (nsret) {
2526         case NESTEDHVM_VMEXIT_CONTINUE:
2527             BUG();
2528             break;
2529         case NESTEDHVM_VMEXIT_HOST:
2530             break;
2531         case NESTEDHVM_VMEXIT_INJECT:
2532             /* Switch vcpu from l2 to l1 guest. We must perform
2533              * the switch here to have svm_do_resume() working
2534              * as intended.
2535              */
2536             exitinfo1 = vmcb->exitinfo1;
2537             exitinfo2 = vmcb->exitinfo2;
2538             nv->nv_vmswitch_in_progress = 1;
2539             nsret = nestedsvm_vmexit_n2n1(v, regs);
2540             nv->nv_vmswitch_in_progress = 0;
2541             switch (nsret) {
2542             case NESTEDHVM_VMEXIT_DONE:
2543                 /* defer VMEXIT injection */
2544                 nestedsvm_vmexit_defer(v, exit_reason, exitinfo1, exitinfo2);
2545                 goto out;
2546             case NESTEDHVM_VMEXIT_FATALERROR:
2547                 gdprintk(XENLOG_ERR, "unexpected nestedsvm_vmexit() error\n");
2548                 domain_crash(v->domain);
2549                 goto out;
2550             default:
2551                 BUG();
2552             case NESTEDHVM_VMEXIT_ERROR:
2553                 break;
2554             }
2555             /* fallthrough */
2556         case NESTEDHVM_VMEXIT_ERROR:
2557             gdprintk(XENLOG_ERR,
2558                 "nestedsvm_check_intercepts() returned NESTEDHVM_VMEXIT_ERROR\n");
2559             goto out;
2560         case NESTEDHVM_VMEXIT_FATALERROR:
2561             gdprintk(XENLOG_ERR,
2562                 "unexpected nestedsvm_check_intercepts() error\n");
2563             domain_crash(v->domain);
2564             goto out;
2565         default:
2566             gdprintk(XENLOG_INFO, "nestedsvm_check_intercepts() returned %i\n",
2567                 nsret);
2568             domain_crash(v->domain);
2569             goto out;
2570         }
2571     }
2572 
2573     if ( unlikely(exit_reason == VMEXIT_INVALID) )
2574     {
2575         gdprintk(XENLOG_ERR, "invalid VMCB state:\n");
2576         svm_vmcb_dump(__func__, vmcb);
2577         domain_crash(v->domain);
2578         goto out;
2579     }
2580 
2581     perfc_incra(svmexits, exit_reason);
2582 
2583     hvm_maybe_deassert_evtchn_irq();
2584 
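    /*
     * Start out with all VMCB state marked clean; the vmcb_set_*()
     * accessors used below clear the relevant clean bit whenever they
     * modify a field, so only genuinely dirty state is reloaded by the
     * processor on the next VMRUN.
     */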
2585     vmcb->cleanbits.bytes = cpu_has_svm_cleanbits ? ~0u : 0u;
2586 
2587     /* Event delivery caused this intercept? Queue for redelivery. */
2588     eventinj = vmcb->exitintinfo;
2589     if ( unlikely(eventinj.fields.v) &&
2590          hvm_event_needs_reinjection(eventinj.fields.type,
2591                                      eventinj.fields.vector) )
2592         vmcb->eventinj = eventinj;
2593 
2594     switch ( exit_reason )
2595     {
2596     case VMEXIT_INTR:
2597         /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2598         HVMTRACE_0D(INTR);
2599         break;
2600 
2601     case VMEXIT_NMI:
2602         /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2603         HVMTRACE_0D(NMI);
2604         break;
2605 
2606     case VMEXIT_SMI:
2607         /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2608         HVMTRACE_0D(SMI);
2609         break;
2610 
2611     case VMEXIT_EXCEPTION_DB:
2612         if ( !v->domain->debugger_attached )
2613             hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
2614         else
2615             domain_pause_for_debugger();
2616         break;
2617 
2618     case VMEXIT_EXCEPTION_BP:
2619         if ( !v->domain->debugger_attached )
2620             goto unexpected_exit_type;
2621         /* AMD Vol2, 15.11: INT3, INTO, BOUND intercepts do not update RIP. */
2622         if ( (inst_len = __get_instruction_length(v, INSTR_INT3)) == 0 )
2623             break;
2624         __update_guest_eip(regs, inst_len);
2625         current->arch.gdbsx_vcpu_event = TRAP_int3;
2626         domain_pause_for_debugger();
2627         break;
2628 
2629     case VMEXIT_EXCEPTION_NM:
2630         svm_fpu_dirty_intercept();
2631         break;
2632 
2633     case VMEXIT_EXCEPTION_PF: {
2634         unsigned long va;
2635         va = vmcb->exitinfo2;
2636         regs->error_code = vmcb->exitinfo1;
2637         HVM_DBG_LOG(DBG_LEVEL_VMMU,
2638                     "eax=%lx, ebx=%lx, ecx=%lx, edx=%lx, esi=%lx, edi=%lx",
2639                     regs->rax, regs->rbx, regs->rcx,
2640                     regs->rdx, regs->rsi, regs->rdi);
2641 
2642         if ( cpu_has_svm_decode )
2643             v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
2644         rc = paging_fault(va, regs);
2645         v->arch.hvm_svm.cached_insn_len = 0;
2646 
2647         if ( rc )
2648         {
2649             if ( trace_will_trace_event(TRC_SHADOW) )
2650                 break;
2651             if ( hvm_long_mode_active(v) )
2652                 HVMTRACE_LONG_2D(PF_XEN, regs->error_code, TRC_PAR_LONG(va));
2653             else
2654                 HVMTRACE_2D(PF_XEN, regs->error_code, va);
2655             break;
2656         }
2657 
2658         hvm_inject_page_fault(regs->error_code, va);
2659         break;
2660     }
2661 
2662     case VMEXIT_EXCEPTION_AC:
2663         HVMTRACE_1D(TRAP, TRAP_alignment_check);
2664         hvm_inject_hw_exception(TRAP_alignment_check, vmcb->exitinfo1);
2665         break;
2666 
2667     case VMEXIT_EXCEPTION_UD:
2668         hvm_ud_intercept(regs);
2669         break;
2670 
2671     /* Asynchronous event, handled when we STGI'd after the VMEXIT. */
2672     case VMEXIT_EXCEPTION_MC:
2673         HVMTRACE_0D(MCE);
2674         svm_vmexit_mce_intercept(v, regs);
2675         break;
2676 
2677     case VMEXIT_VINTR: {
2678         u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
2679         intr = vmcb_get_vintr(vmcb);
2680 
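        /*
         * The guest can now take the interrupt we have been waiting to
         * deliver: drop the virtual interrupt request and the one-shot
         * VINTR intercept that served as the interrupt-window notification.
         */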
2681         intr.fields.irq = 0;
2682         general1_intercepts &= ~GENERAL1_INTERCEPT_VINTR;
2683 
2684         vmcb_set_vintr(vmcb, intr);
2685         vmcb_set_general1_intercepts(vmcb, general1_intercepts);
2686         break;
2687     }
2688 
2689     case VMEXIT_INVD:
2690     case VMEXIT_WBINVD:
2691         svm_vmexit_do_invalidate_cache(regs);
2692         break;
2693 
2694     case VMEXIT_TASK_SWITCH: {
2695         enum hvm_task_switch_reason reason;
2696         int32_t errcode = -1;
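        /*
         * exitinfo2 describes the task switch: bit 36 is set when it was
         * caused by an IRET, bit 38 when it came from a far jump, and bit
         * 44 when the low 32 bits hold a valid error code.
         */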
2697         if ( (vmcb->exitinfo2 >> 36) & 1 )
2698             reason = TSW_iret;
2699         else if ( (vmcb->exitinfo2 >> 38) & 1 )
2700             reason = TSW_jmp;
2701         else
2702             reason = TSW_call_or_int;
2703         if ( (vmcb->exitinfo2 >> 44) & 1 )
2704             errcode = (uint32_t)vmcb->exitinfo2;
2705 
2706         /*
2707          * Some processors set the EXITINTINFO field when the task switch
2708          * is caused by a task gate in the IDT. In this case we will be
2709          * emulating the event injection, so we do not want the processor
2710          * to re-inject the original event!
2711          */
2712         vmcb->eventinj.bytes = 0;
2713 
2714         hvm_task_switch((uint16_t)vmcb->exitinfo1, reason, errcode);
2715         break;
2716     }
2717 
2718     case VMEXIT_CPUID:
2719         svm_vmexit_do_cpuid(regs);
2720         break;
2721 
2722     case VMEXIT_HLT:
2723         svm_vmexit_do_hlt(vmcb, regs);
2724         break;
2725 
2726     case VMEXIT_IOIO:
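        /*
         * exitinfo1 describes the port access: bit 0 is set for IN, bit 2
         * for string instructions, bits 6:4 give the operand size and bits
         * 31:16 the port number, while exitinfo2 holds the rIP of the
         * following instruction.  Non-string accesses take the fast
         * handle_pio() path; everything else is emulated.
         */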
2727         if ( (vmcb->exitinfo1 & (1u<<2)) == 0 )
2728         {
2729             uint16_t port = (vmcb->exitinfo1 >> 16) & 0xFFFF;
2730             int bytes = ((vmcb->exitinfo1 >> 4) & 0x07);
2731             int dir = (vmcb->exitinfo1 & 1) ? IOREQ_READ : IOREQ_WRITE;
2732             if ( handle_pio(port, bytes, dir) )
2733                 __update_guest_eip(regs, vmcb->exitinfo2 - vmcb->rip);
2734         }
2735         else if ( !hvm_emulate_one_insn(x86_insn_is_portio, "port I/O") )
2736             hvm_inject_hw_exception(TRAP_gp_fault, 0);
2737         break;
2738 
2739     case VMEXIT_CR0_READ ... VMEXIT_CR15_READ:
2740     case VMEXIT_CR0_WRITE ... VMEXIT_CR15_WRITE:
2741         if ( cpu_has_svm_decode && (vmcb->exitinfo1 & (1ULL << 63)) )
2742             svm_vmexit_do_cr_access(vmcb, regs);
2743         else if ( !hvm_emulate_one_insn(x86_insn_is_cr_access, "CR access") )
2744             hvm_inject_hw_exception(TRAP_gp_fault, 0);
2745         break;
2746 
2747     case VMEXIT_INVLPG:
2748         if ( cpu_has_svm_decode )
2749         {
2750             svm_invlpg_intercept(vmcb->exitinfo1);
2751             __update_guest_eip(regs, vmcb->nextrip - vmcb->rip);
2752         }
2753         else if ( !hvm_emulate_one_insn(is_invlpg, "invlpg") )
2754             hvm_inject_hw_exception(TRAP_gp_fault, 0);
2755         break;
2756 
2757     case VMEXIT_INVLPGA:
2758         if ( (inst_len = __get_instruction_length(v, INSTR_INVLPGA)) == 0 )
2759             break;
2760         svm_invlpga_intercept(v, regs->rax, regs->ecx);
2761         __update_guest_eip(regs, inst_len);
2762         break;
2763 
2764     case VMEXIT_VMMCALL:
2765         if ( (inst_len = __get_instruction_length(v, INSTR_VMCALL)) == 0 )
2766             break;
2767         BUG_ON(vcpu_guestmode);
2768         HVMTRACE_1D(VMMCALL, regs->eax);
2769 
2770         if ( hvm_hypercall(regs) == HVM_HCALL_completed )
2771             __update_guest_eip(regs, inst_len);
2772         break;
2773 
2774     case VMEXIT_DR0_READ ... VMEXIT_DR7_READ:
2775     case VMEXIT_DR0_WRITE ... VMEXIT_DR7_WRITE:
2776         svm_dr_access(v, regs);
2777         break;
2778 
2779     case VMEXIT_MSR:
2780         svm_do_msr_access(regs);
2781         break;
2782 
2783     case VMEXIT_SHUTDOWN:
2784         hvm_triple_fault();
2785         break;
2786 
2787     case VMEXIT_RDTSCP:
2788         regs->rcx = hvm_msr_tsc_aux(v);
2789         /* fall through */
2790     case VMEXIT_RDTSC:
2791         svm_vmexit_do_rdtsc(regs);
2792         break;
2793 
2794     case VMEXIT_MONITOR:
2795     case VMEXIT_MWAIT:
2796         hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2797         break;
2798 
2799     case VMEXIT_VMRUN:
2800         svm_vmexit_do_vmrun(regs, v, regs->rax);
2801         break;
2802     case VMEXIT_VMLOAD:
2803         svm_vmexit_do_vmload(vmcb, regs, v, regs->rax);
2804         break;
2805     case VMEXIT_VMSAVE:
2806         svm_vmexit_do_vmsave(vmcb, regs, v, regs->rax);
2807         break;
2808     case VMEXIT_STGI:
2809         svm_vmexit_do_stgi(regs, v);
2810         break;
2811     case VMEXIT_CLGI:
2812         svm_vmexit_do_clgi(regs, v);
2813         break;
2814     case VMEXIT_SKINIT:
2815         hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
2816         break;
2817 
2818     case VMEXIT_XSETBV:
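        /* XSETBV is only legal at CPL 0; attempts from other CPLs get #GP(0). */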
2819         if ( vmcb_get_cpl(vmcb) )
2820             hvm_inject_hw_exception(TRAP_gp_fault, 0);
2821         else if ( (inst_len = __get_instruction_length(v, INSTR_XSETBV)) &&
2822                   hvm_handle_xsetbv(regs->ecx, msr_fold(regs)) == 0 )
2823             __update_guest_eip(regs, inst_len);
2824         break;
2825 
2826     case VMEXIT_NPF:
2827         perfc_incra(svmexits, VMEXIT_NPF_PERFC);
2828         if ( cpu_has_svm_decode )
2829             v->arch.hvm_svm.cached_insn_len = vmcb->guest_ins_len & 0xf;
2830         rc = vmcb->exitinfo1 & PFEC_page_present
2831              ? p2m_pt_handle_deferred_changes(vmcb->exitinfo2) : 0;
2832         if ( rc >= 0 )
2833             svm_do_nested_pgfault(v, regs, vmcb->exitinfo1, vmcb->exitinfo2);
2834         else
2835         {
2836             printk(XENLOG_G_ERR
2837                    "%pv: Error %d handling NPF (gpa=%08lx ec=%04lx)\n",
2838                    v, rc, vmcb->exitinfo2, vmcb->exitinfo1);
2839             domain_crash(v->domain);
2840         }
2841         v->arch.hvm_svm.cached_insn_len = 0;
2842         break;
2843 
2844     case VMEXIT_IRET: {
2845         u32 general1_intercepts = vmcb_get_general1_intercepts(vmcb);
2846 
2847         /*
2848          * IRET clears the NMI mask. However because we clear the mask
2849          * /before/ executing IRET, we set the interrupt shadow to prevent
2850          * a pending NMI from being injected immediately. This will work
2851          * perfectly unless the IRET instruction faults: in that case we
2852          * may inject an NMI before the NMI handler's IRET instruction is
2853          * retired.
2854          */
2855         general1_intercepts &= ~GENERAL1_INTERCEPT_IRET;
2856         vmcb->interrupt_shadow = 1;
2857 
2858         vmcb_set_general1_intercepts(vmcb, general1_intercepts);
2859         break;
2860     }
2861 
2862     case VMEXIT_PAUSE:
2863         svm_vmexit_do_pause(regs);
2864         break;
2865 
2866     case VMEXIT_IDTR_READ:
2867     case VMEXIT_IDTR_WRITE:
2868         hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2869             VM_EVENT_DESC_IDTR, exit_reason == VMEXIT_IDTR_WRITE);
2870         break;
2871 
2872     case VMEXIT_GDTR_READ:
2873     case VMEXIT_GDTR_WRITE:
2874         hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2875             VM_EVENT_DESC_GDTR, exit_reason == VMEXIT_GDTR_WRITE);
2876         break;
2877 
2878     case VMEXIT_LDTR_READ:
2879     case VMEXIT_LDTR_WRITE:
2880         hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2881             VM_EVENT_DESC_LDTR, exit_reason == VMEXIT_LDTR_WRITE);
2882         break;
2883 
2884     case VMEXIT_TR_READ:
2885     case VMEXIT_TR_WRITE:
2886         hvm_descriptor_access_intercept(vmcb->exitintinfo.bytes, 0,
2887             VM_EVENT_DESC_TR, exit_reason == VMEXIT_TR_WRITE);
2888         break;
2889 
2890     default:
2891     unexpected_exit_type:
2892         gprintk(XENLOG_ERR, "Unexpected vmexit: reason %#"PRIx64", "
2893                 "exitinfo1 %#"PRIx64", exitinfo2 %#"PRIx64"\n",
2894                 exit_reason, vmcb->exitinfo1, vmcb->exitinfo2);
2895         svm_crash_or_fault(v);
2896         break;
2897     }
2898 
2899   out:
2900     if ( vcpu_guestmode || vlapic_hw_disabled(vlapic) )
2901         return;
2902 
2903     /* The exit may have updated the TPR: reflect this in the hardware vtpr */
2904     intr = vmcb_get_vintr(vmcb);
2905     intr.fields.tpr =
2906         (vlapic_get_reg(vlapic, APIC_TASKPRI) & 0xFF) >> 4;
2907     vmcb_set_vintr(vmcb, intr);
2908 }
2909 
2910 void svm_trace_vmentry(void)
2911 {
2912     struct vcpu *curr = current;
2913     HVMTRACE_ND(VMENTRY,
2914                 nestedhvm_vcpu_in_guestmode(curr) ? TRC_HVM_NESTEDFLAG : 0,
2915                 1/*cycles*/, 0, 0, 0, 0, 0, 0, 0);
2916 }
2917 
2918 /*
2919  * Local variables:
2920  * mode: C
2921  * c-file-style: "BSD"
2922  * c-basic-offset: 4
2923  * tab-width: 4
2924  * indent-tabs-mode: nil
2925  * End:
2926  */
2927