1 /*
2  * vmcs.c: VMCS management
3  * Copyright (c) 2004, Intel Corporation.
4  *
5  * This program is free software; you can redistribute it and/or modify it
6  * under the terms and conditions of the GNU General Public License,
7  * version 2, as published by the Free Software Foundation.
8  *
9  * This program is distributed in the hope it will be useful, but WITHOUT
10  * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11  * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
12  * more details.
13  *
14  * You should have received a copy of the GNU General Public License along with
15  * this program; If not, see <http://www.gnu.org/licenses/>.
16  */
17 
18 #include <xen/init.h>
19 #include <xen/mm.h>
20 #include <xen/lib.h>
21 #include <xen/errno.h>
22 #include <xen/domain_page.h>
23 #include <xen/event.h>
24 #include <xen/kernel.h>
25 #include <xen/keyhandler.h>
26 #include <xen/vm_event.h>
27 #include <asm/current.h>
28 #include <asm/cpufeature.h>
29 #include <asm/processor.h>
30 #include <asm/msr.h>
31 #include <asm/xstate.h>
32 #include <asm/hvm/hvm.h>
33 #include <asm/hvm/io.h>
34 #include <asm/hvm/support.h>
35 #include <asm/hvm/vmx/vmx.h>
36 #include <asm/hvm/vmx/vvmx.h>
37 #include <asm/hvm/vmx/vmcs.h>
38 #include <asm/flushtlb.h>
39 #include <asm/monitor.h>
40 #include <asm/shadow.h>
41 #include <asm/tboot.h>
42 #include <asm/apic.h>
43 
44 static bool_t __read_mostly opt_vpid_enabled = 1;
45 boolean_param("vpid", opt_vpid_enabled);
46 
47 static bool_t __read_mostly opt_unrestricted_guest_enabled = 1;
48 boolean_param("unrestricted_guest", opt_unrestricted_guest_enabled);
49 
50 static bool_t __read_mostly opt_apicv_enabled = 1;
51 boolean_param("apicv", opt_apicv_enabled);
52 
53 /*
54  * These two parameters are used to configure the controls for Pause-Loop Exiting:
55  * ple_gap:    upper bound on the amount of time between two successive
56  *             executions of PAUSE in a loop.
57  * ple_window: upper bound on the amount of time a guest is allowed to execute
58  *             in a PAUSE loop.
59  * Time is measured based on a counter that runs at the same rate as the TSC,
60  * refer to SDM volume 3B, sections 21.6.13 and 22.1.3.
61  */
62 static unsigned int __read_mostly ple_gap = 128;
63 integer_param("ple_gap", ple_gap);
64 static unsigned int __read_mostly ple_window = 4096;
65 integer_param("ple_window", ple_window);
66 
67 static bool_t __read_mostly opt_pml_enabled = 1;
68 static s8 __read_mostly opt_ept_ad = -1;
69 
70 /*
71  * The 'ept' parameter controls functionalities that depend on, or impact the
72  * EPT mechanism. Optional comma separated value may contain:
73  *
74  *  pml                 Enable PML
75  *  ad                  Use A/D bits
76  */
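/*
 * For example, "ept=pml,no-ad" enables PML while disabling use of the EPT
 * A/D bits (a "no-" prefix negates the token that follows it).
 */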
77 static int __init parse_ept_param(const char *s)
78 {
79     const char *ss;
80     int rc = 0;
81 
82     do {
83         bool_t val = !!strncmp(s, "no-", 3);
84 
85         if ( !val )
86             s += 3;
87 
88         ss = strchr(s, ',');
89         if ( !ss )
90             ss = strchr(s, '\0');
91 
92         if ( !strncmp(s, "pml", ss - s) )
93             opt_pml_enabled = val;
94         else if ( !strncmp(s, "ad", ss - s) )
95             opt_ept_ad = val;
96         else
97             rc = -EINVAL;
98 
99         s = ss + 1;
100     } while ( *ss );
101 
102     return rc;
103 }
104 custom_param("ept", parse_ept_param);
105 
106 /* Dynamic (run-time adjusted) execution control flags. */
107 u32 vmx_pin_based_exec_control __read_mostly;
108 u32 vmx_cpu_based_exec_control __read_mostly;
109 u32 vmx_secondary_exec_control __read_mostly;
110 u32 vmx_vmexit_control __read_mostly;
111 u32 vmx_vmentry_control __read_mostly;
112 u64 vmx_ept_vpid_cap __read_mostly;
113 u64 vmx_vmfunc __read_mostly;
114 bool_t vmx_virt_exception __read_mostly;
115 
116 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, vmxon_region);
117 static DEFINE_PER_CPU(paddr_t, current_vmcs);
118 static DEFINE_PER_CPU(struct list_head, active_vmcs_list);
119 DEFINE_PER_CPU(bool_t, vmxon);
120 
121 static u32 vmcs_revision_id __read_mostly;
122 u64 __read_mostly vmx_basic_msr;
123 
124 static void __init vmx_display_features(void)
125 {
126     int printed = 0;
127 
128     printk("VMX: Supported advanced features:\n");
129 
130 #define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; }
131     P(cpu_has_vmx_virtualize_apic_accesses, "APIC MMIO access virtualisation");
132     P(cpu_has_vmx_tpr_shadow, "APIC TPR shadow");
133     P(cpu_has_vmx_ept, "Extended Page Tables (EPT)");
134     P(cpu_has_vmx_vpid, "Virtual-Processor Identifiers (VPID)");
135     P(cpu_has_vmx_vnmi, "Virtual NMI");
136     P(cpu_has_vmx_msr_bitmap, "MSR direct-access bitmap");
137     P(cpu_has_vmx_unrestricted_guest, "Unrestricted Guest");
138     P(cpu_has_vmx_apic_reg_virt, "APIC Register Virtualization");
139     P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery");
140     P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing");
141     P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing");
142     P(cpu_has_vmx_vmfunc, "VM Functions");
143     P(cpu_has_vmx_virt_exceptions, "Virtualisation Exceptions");
144     P(cpu_has_vmx_pml, "Page Modification Logging");
145     P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
146 #undef P
147 
148     if ( !printed )
149         printk(" - none\n");
150 }
151 
152 static u32 adjust_vmx_controls(
153     const char *name, u32 ctl_min, u32 ctl_opt, u32 msr, bool_t *mismatch)
154 {
155     u32 vmx_msr_low, vmx_msr_high, ctl = ctl_min | ctl_opt;
156 
157     rdmsr(msr, vmx_msr_low, vmx_msr_high);
158 
159     ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
160     ctl |= vmx_msr_low;  /* bit == 1 in low word  ==> must be one  */
161 
162     /* Ensure minimum (required) set of control bits are supported. */
163     if ( ctl_min & ~ctl )
164     {
165         *mismatch = 1;
166         printk("VMX: CPU%d has insufficient %s (%08x; requires %08x)\n",
167                smp_processor_id(), name, ctl, ctl_min);
168     }
169 
170     return ctl;
171 }
172 
173 static bool_t cap_check(const char *name, u32 expected, u32 saw)
174 {
175     if ( saw != expected )
176         printk("VMX %s: saw %#x expected %#x\n", name, saw, expected);
177     return saw != expected;
178 }
179 
180 static int vmx_init_vmcs_config(void)
181 {
182     u32 vmx_basic_msr_low, vmx_basic_msr_high, min, opt;
183     u32 _vmx_pin_based_exec_control;
184     u32 _vmx_cpu_based_exec_control;
185     u32 _vmx_secondary_exec_control = 0;
186     u64 _vmx_ept_vpid_cap = 0;
187     u64 _vmx_misc_cap = 0;
188     u32 _vmx_vmexit_control;
189     u32 _vmx_vmentry_control;
190     u64 _vmx_vmfunc = 0;
191     bool_t mismatch = 0;
192 
193     rdmsr(MSR_IA32_VMX_BASIC, vmx_basic_msr_low, vmx_basic_msr_high);
194 
195     min = (PIN_BASED_EXT_INTR_MASK |
196            PIN_BASED_NMI_EXITING);
197     opt = (PIN_BASED_VIRTUAL_NMIS |
198            PIN_BASED_POSTED_INTERRUPT);
199     _vmx_pin_based_exec_control = adjust_vmx_controls(
200         "Pin-Based Exec Control", min, opt,
201         MSR_IA32_VMX_PINBASED_CTLS, &mismatch);
202 
203     min = (CPU_BASED_HLT_EXITING |
204            CPU_BASED_VIRTUAL_INTR_PENDING |
205            CPU_BASED_CR8_LOAD_EXITING |
206            CPU_BASED_CR8_STORE_EXITING |
207            CPU_BASED_INVLPG_EXITING |
208            CPU_BASED_CR3_LOAD_EXITING |
209            CPU_BASED_CR3_STORE_EXITING |
210            CPU_BASED_MONITOR_EXITING |
211            CPU_BASED_MWAIT_EXITING |
212            CPU_BASED_MOV_DR_EXITING |
213            CPU_BASED_ACTIVATE_IO_BITMAP |
214            CPU_BASED_USE_TSC_OFFSETING |
215            CPU_BASED_RDTSC_EXITING);
216     opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
217            CPU_BASED_TPR_SHADOW |
218            CPU_BASED_MONITOR_TRAP_FLAG |
219            CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
220     _vmx_cpu_based_exec_control = adjust_vmx_controls(
221         "CPU-Based Exec Control", min, opt,
222         MSR_IA32_VMX_PROCBASED_CTLS, &mismatch);
223     _vmx_cpu_based_exec_control &= ~CPU_BASED_RDTSC_EXITING;
224     if ( _vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW )
225         _vmx_cpu_based_exec_control &=
226             ~(CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING);
227 
228     if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
229     {
230         min = 0;
231         opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
232                SECONDARY_EXEC_WBINVD_EXITING |
233                SECONDARY_EXEC_ENABLE_EPT |
234                SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING |
235                SECONDARY_EXEC_ENABLE_RDTSCP |
236                SECONDARY_EXEC_PAUSE_LOOP_EXITING |
237                SECONDARY_EXEC_ENABLE_INVPCID |
238                SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
239                SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS |
240                SECONDARY_EXEC_XSAVES |
241                SECONDARY_EXEC_TSC_SCALING);
242         rdmsrl(MSR_IA32_VMX_MISC, _vmx_misc_cap);
243         if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL )
244             opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
245         if ( opt_vpid_enabled )
246             opt |= SECONDARY_EXEC_ENABLE_VPID;
247         if ( opt_unrestricted_guest_enabled )
248             opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
249         if ( opt_pml_enabled )
250             opt |= SECONDARY_EXEC_ENABLE_PML;
251 
252         /*
253          * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
254          * can be set only when "use TPR shadow" is set
255          */
256         if ( (_vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) &&
257              opt_apicv_enabled )
258             opt |= SECONDARY_EXEC_APIC_REGISTER_VIRT |
259                    SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
260                    SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
261 
262         _vmx_secondary_exec_control = adjust_vmx_controls(
263             "Secondary Exec Control", min, opt,
264             MSR_IA32_VMX_PROCBASED_CTLS2, &mismatch);
265     }
266 
267     /* The IA32_VMX_EPT_VPID_CAP MSR exists only when EPT or VPID is available */
268     if ( _vmx_secondary_exec_control & (SECONDARY_EXEC_ENABLE_EPT |
269                                         SECONDARY_EXEC_ENABLE_VPID) )
270     {
271         rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, _vmx_ept_vpid_cap);
272 
273         if ( !opt_ept_ad )
274             _vmx_ept_vpid_cap &= ~VMX_EPT_AD_BIT;
275         else if ( /* Work around Erratum AVR41 on Avoton processors. */
276                   boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x4d &&
277                   opt_ept_ad < 0 )
278             _vmx_ept_vpid_cap &= ~VMX_EPT_AD_BIT;
279 
280         /*
281          * Additional sanity checking before using EPT:
282          * 1) the CPU we are running on must support EPT WB, as we will set the
283          *    EPT paging-structure memory type to WB;
284          * 2) the CPU must support an EPT page-walk length of 4, according to
285          *    Intel SDM 25.2.2;
286          * 3) the CPU must support INVEPT all-context invalidation, because we
287          *    will use it as a final resort if other types are not supported.
288          *
289          * Or we just don't use EPT.
290          */
291         if ( !(_vmx_ept_vpid_cap & VMX_EPT_MEMORY_TYPE_WB) ||
292              !(_vmx_ept_vpid_cap & VMX_EPT_WALK_LENGTH_4_SUPPORTED) ||
293              !(_vmx_ept_vpid_cap & VMX_EPT_INVEPT_ALL_CONTEXT) )
294             _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
295 
296         /*
297          * The CPU must support INVVPID all-context invalidation, because we
298          * will use it as a final resort if other types are not supported.
299          *
300          * Or we just don't use VPID.
301          */
302         if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) )
303             _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
304 
305         /* EPT A/D bits are required for PML */
306         if ( !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT) )
307             _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
308     }
309 
310     if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
311     {
312         /*
313          * To use EPT we expect to be able to clear certain intercepts.
314          * We check VMX_BASIC_MSR[55] to correctly handle default controls.
315          */
316         uint32_t must_be_one, must_be_zero, msr = MSR_IA32_VMX_PROCBASED_CTLS;
317         if ( vmx_basic_msr_high & (VMX_BASIC_DEFAULT1_ZERO >> 32) )
318             msr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS;
319         rdmsr(msr, must_be_one, must_be_zero);
320         if ( must_be_one & (CPU_BASED_INVLPG_EXITING |
321                             CPU_BASED_CR3_LOAD_EXITING |
322                             CPU_BASED_CR3_STORE_EXITING) )
323             _vmx_secondary_exec_control &=
324                 ~(SECONDARY_EXEC_ENABLE_EPT |
325                   SECONDARY_EXEC_UNRESTRICTED_GUEST);
326     }
327 
328     /* PML cannot be supported if EPT is not used */
329     if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) )
330         _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
331 
332     /* Turn off opt_pml_enabled if PML feature is not present */
333     if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML) )
334         opt_pml_enabled = 0;
335 
336     if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
337           ple_gap == 0 )
338     {
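        /* vmx_pin_based_exec_control is still zero on the boot CPU's first pass. */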
339         if ( !vmx_pin_based_exec_control )
340             printk(XENLOG_INFO "Disable Pause-Loop Exiting.\n");
341         _vmx_secondary_exec_control &= ~ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
342     }
343 
344     min = VM_EXIT_ACK_INTR_ON_EXIT;
345     opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
346           VM_EXIT_CLEAR_BNDCFGS;
347     min |= VM_EXIT_IA32E_MODE;
348     _vmx_vmexit_control = adjust_vmx_controls(
349         "VMExit Control", min, opt, MSR_IA32_VMX_EXIT_CTLS, &mismatch);
350 
351     /*
352      * "Process posted interrupt" can be set only when "virtual-interrupt
353      * delivery" and "acknowledge interrupt on exit" are set. Since the latter
354      * is a minimum requirement, only check the former, which is optional.
355      */
356     if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) )
357         _vmx_pin_based_exec_control &= ~PIN_BASED_POSTED_INTERRUPT;
358 
359     if ( iommu_intpost &&
360          !(_vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT) )
361     {
362         printk("Intel VT-d Posted Interrupt is disabled because CPU-side "
363                "Posted Interrupt is not enabled\n");
364         iommu_intpost = 0;
365     }
366 
367     /* The IA32_VMX_VMFUNC MSR exists only when VMFUNC is available */
368     if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VM_FUNCTIONS )
369     {
370         rdmsrl(MSR_IA32_VMX_VMFUNC, _vmx_vmfunc);
371 
372         /*
373          * VMFUNC leaf 0 (EPTP switching) must be supported.
374          *
375          * Or we just don't use VMFUNC.
376          */
377         if ( !(_vmx_vmfunc & VMX_VMFUNC_EPTP_SWITCHING) )
378             _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VM_FUNCTIONS;
379     }
380 
381     /* Virtualization exceptions are only enabled if VMFUNC is enabled */
382     if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
383         _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
384 
385     min = 0;
386     opt = VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_BNDCFGS;
387     _vmx_vmentry_control = adjust_vmx_controls(
388         "VMEntry Control", min, opt, MSR_IA32_VMX_ENTRY_CTLS, &mismatch);
389 
390     if ( mismatch )
391         return -EINVAL;
392 
393     if ( !vmx_pin_based_exec_control )
394     {
395         /* First time through. */
396         vmcs_revision_id           = vmx_basic_msr_low & VMX_BASIC_REVISION_MASK;
397         vmx_pin_based_exec_control = _vmx_pin_based_exec_control;
398         vmx_cpu_based_exec_control = _vmx_cpu_based_exec_control;
399         vmx_secondary_exec_control = _vmx_secondary_exec_control;
400         vmx_ept_vpid_cap           = _vmx_ept_vpid_cap;
401         vmx_vmexit_control         = _vmx_vmexit_control;
402         vmx_vmentry_control        = _vmx_vmentry_control;
403         vmx_basic_msr              = ((u64)vmx_basic_msr_high << 32) |
404                                      vmx_basic_msr_low;
405         vmx_vmfunc                 = _vmx_vmfunc;
406         vmx_virt_exception         = !!(_vmx_secondary_exec_control &
407                                        SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);
408         vmx_display_features();
409 
410         /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
411         if ( (vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)) >
412              PAGE_SIZE )
413         {
414             printk("VMX: CPU%d VMCS size is too big (%Lu bytes)\n",
415                    smp_processor_id(),
416                    vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32));
417             return -EINVAL;
418         }
419     }
420     else
421     {
422         /* Globals are already initialised: re-check them. */
423         mismatch |= cap_check(
424             "VMCS revision ID",
425             vmcs_revision_id, vmx_basic_msr_low & VMX_BASIC_REVISION_MASK);
426         mismatch |= cap_check(
427             "Pin-Based Exec Control",
428             vmx_pin_based_exec_control, _vmx_pin_based_exec_control);
429         mismatch |= cap_check(
430             "CPU-Based Exec Control",
431             vmx_cpu_based_exec_control, _vmx_cpu_based_exec_control);
432         mismatch |= cap_check(
433             "Secondary Exec Control",
434             vmx_secondary_exec_control, _vmx_secondary_exec_control);
435         mismatch |= cap_check(
436             "VMExit Control",
437             vmx_vmexit_control, _vmx_vmexit_control);
438         mismatch |= cap_check(
439             "VMEntry Control",
440             vmx_vmentry_control, _vmx_vmentry_control);
441         mismatch |= cap_check(
442             "EPT and VPID Capability",
443             vmx_ept_vpid_cap, _vmx_ept_vpid_cap);
444         mismatch |= cap_check(
445             "VMFUNC Capability",
446             vmx_vmfunc, _vmx_vmfunc);
447         if ( cpu_has_vmx_ins_outs_instr_info !=
448              !!(vmx_basic_msr_high & (VMX_BASIC_INS_OUT_INFO >> 32)) )
449         {
450             printk("VMX INS/OUTS Instruction Info: saw %d expected %d\n",
451                    !!(vmx_basic_msr_high & (VMX_BASIC_INS_OUT_INFO >> 32)),
452                    cpu_has_vmx_ins_outs_instr_info);
453             mismatch = 1;
454         }
455         if ( (vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)) !=
456              ((vmx_basic_msr & VMX_BASIC_VMCS_SIZE_MASK) >> 32) )
457         {
458             printk("VMX: CPU%d unexpected VMCS size %Lu\n",
459                    smp_processor_id(),
460                    vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32));
461             mismatch = 1;
462         }
463         if ( mismatch )
464         {
465             printk("VMX: Capabilities fatally differ between CPU%d and CPU0\n",
466                    smp_processor_id());
467             return -EINVAL;
468         }
469     }
470 
471     /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
472     if ( vmx_basic_msr_high & (VMX_BASIC_32BIT_ADDRESSES >> 32) )
473     {
474         printk("VMX: CPU%d limits VMX structure pointers to 32 bits\n",
475                smp_processor_id());
476         return -EINVAL;
477     }
478 
479     /* Require Write-Back (WB) memory type for VMCS accesses. */
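    /* Dividing by the mask's lowest set bit shifts the field down to bit 0. */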
480     opt = (vmx_basic_msr_high & (VMX_BASIC_MEMORY_TYPE_MASK >> 32)) /
481           ((VMX_BASIC_MEMORY_TYPE_MASK & -VMX_BASIC_MEMORY_TYPE_MASK) >> 32);
482     if ( opt != MTRR_TYPE_WRBACK )
483     {
484         printk("VMX: CPU%d has unexpected VMCS access type %u\n",
485                smp_processor_id(), opt);
486         return -EINVAL;
487     }
488 
489     return 0;
490 }
491 
492 static paddr_t vmx_alloc_vmcs(void)
493 {
494     struct page_info *pg;
495     struct vmcs_struct *vmcs;
496 
497     if ( (pg = alloc_domheap_page(NULL, 0)) == NULL )
498     {
499         gdprintk(XENLOG_WARNING, "Failed to allocate VMCS.\n");
500         return 0;
501     }
502 
503     vmcs = __map_domain_page(pg);
504     clear_page(vmcs);
505     vmcs->vmcs_revision_id = vmcs_revision_id;
506     unmap_domain_page(vmcs);
507 
508     return page_to_maddr(pg);
509 }
510 
511 static void vmx_free_vmcs(paddr_t pa)
512 {
513     free_domheap_page(maddr_to_page(pa));
514 }
515 
516 static void __vmx_clear_vmcs(void *info)
517 {
518     struct vcpu *v = info;
519     struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
520 
521     /* Otherwise we can nest (vmx_cpu_down() vs. vmx_clear_vmcs()). */
522     ASSERT(!local_irq_is_enabled());
523 
524     if ( arch_vmx->active_cpu == smp_processor_id() )
525     {
526         __vmpclear(arch_vmx->vmcs_pa);
527         if ( arch_vmx->vmcs_shadow_maddr )
528             __vmpclear(arch_vmx->vmcs_shadow_maddr);
529 
530         arch_vmx->active_cpu = -1;
531         arch_vmx->launched   = 0;
532 
533         list_del(&arch_vmx->active_list);
534 
535         if ( arch_vmx->vmcs_pa == this_cpu(current_vmcs) )
536             this_cpu(current_vmcs) = 0;
537     }
538 }
539 
540 static void vmx_clear_vmcs(struct vcpu *v)
541 {
542     int cpu = v->arch.hvm_vmx.active_cpu;
543 
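    /* A VMCS is active on at most one CPU; clear it there, synchronously. */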
544     if ( cpu != -1 )
545         on_selected_cpus(cpumask_of(cpu), __vmx_clear_vmcs, v, 1);
546 }
547 
548 static void vmx_load_vmcs(struct vcpu *v)
549 {
550     unsigned long flags;
551 
552     local_irq_save(flags);
553 
554     if ( v->arch.hvm_vmx.active_cpu == -1 )
555     {
556         list_add(&v->arch.hvm_vmx.active_list, &this_cpu(active_vmcs_list));
557         v->arch.hvm_vmx.active_cpu = smp_processor_id();
558     }
559 
560     ASSERT(v->arch.hvm_vmx.active_cpu == smp_processor_id());
561 
562     __vmptrld(v->arch.hvm_vmx.vmcs_pa);
563     this_cpu(current_vmcs) = v->arch.hvm_vmx.vmcs_pa;
564 
565     local_irq_restore(flags);
566 }
567 
568 void vmx_vmcs_reload(struct vcpu *v)
569 {
570     /*
571      * As we may be running with interrupts disabled, we can't acquire
572      * v->arch.hvm_vmx.vmcs_lock here. However, with interrupts disabled
573      * the VMCS can't be taken away from us anymore if we still own it.
574      */
575     ASSERT(v->is_running || !local_irq_is_enabled());
576     if ( v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs) )
577         return;
578 
579     vmx_load_vmcs(v);
580 }
581 
582 int vmx_cpu_up_prepare(unsigned int cpu)
583 {
584     /*
585      * If nvmx_cpu_up_prepare() failed, do not return failure; just fall back
586      * to legacy mode for vvmcs synchronization.
587      */
588     if ( nvmx_cpu_up_prepare(cpu) != 0 )
589         printk("CPU%d: Could not allocate virtual VMCS buffer.\n", cpu);
590 
591     if ( per_cpu(vmxon_region, cpu) )
592         return 0;
593 
594     per_cpu(vmxon_region, cpu) = vmx_alloc_vmcs();
595     if ( per_cpu(vmxon_region, cpu) )
596         return 0;
597 
598     printk("CPU%d: Could not allocate host VMCS\n", cpu);
599     nvmx_cpu_dead(cpu);
600     return -ENOMEM;
601 }
602 
603 void vmx_cpu_dead(unsigned int cpu)
604 {
605     vmx_free_vmcs(per_cpu(vmxon_region, cpu));
606     per_cpu(vmxon_region, cpu) = 0;
607     nvmx_cpu_dead(cpu);
608     vmx_pi_desc_fixup(cpu);
609 }
610 
611 int _vmx_cpu_up(bool bsp)
612 {
613     u32 eax, edx;
614     int rc, bios_locked, cpu = smp_processor_id();
615     u64 cr0, vmx_cr0_fixed0, vmx_cr0_fixed1;
616 
617     BUG_ON(!(read_cr4() & X86_CR4_VMXE));
618 
619     /*
620      * Ensure that the current processor's operating mode meets
621      * the required CR0 fixed bits for VMX operation.
622      */
623     cr0 = read_cr0();
624     rdmsrl(MSR_IA32_VMX_CR0_FIXED0, vmx_cr0_fixed0);
625     rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx_cr0_fixed1);
626     if ( (~cr0 & vmx_cr0_fixed0) || (cr0 & ~vmx_cr0_fixed1) )
627     {
628         printk("CPU%d: some settings of host CR0 are "
629                "not allowed in VMX operation.\n", cpu);
630         return -EINVAL;
631     }
632 
633     rdmsr(MSR_IA32_FEATURE_CONTROL, eax, edx);
634 
635     bios_locked = !!(eax & IA32_FEATURE_CONTROL_LOCK);
636     if ( bios_locked )
637     {
638         if ( !(eax & (tboot_in_measured_env()
639                       ? IA32_FEATURE_CONTROL_ENABLE_VMXON_INSIDE_SMX
640                       : IA32_FEATURE_CONTROL_ENABLE_VMXON_OUTSIDE_SMX)) )
641         {
642             printk("CPU%d: VMX disabled by BIOS.\n", cpu);
643             return -EINVAL;
644         }
645     }
646     else
647     {
648         eax  = IA32_FEATURE_CONTROL_LOCK;
649         eax |= IA32_FEATURE_CONTROL_ENABLE_VMXON_OUTSIDE_SMX;
650         if ( test_bit(X86_FEATURE_SMX, &boot_cpu_data.x86_capability) )
651             eax |= IA32_FEATURE_CONTROL_ENABLE_VMXON_INSIDE_SMX;
652         wrmsr(MSR_IA32_FEATURE_CONTROL, eax, 0);
653     }
654 
655     if ( (rc = vmx_init_vmcs_config()) != 0 )
656         return rc;
657 
658     INIT_LIST_HEAD(&this_cpu(active_vmcs_list));
659 
660     if ( bsp && (rc = vmx_cpu_up_prepare(cpu)) != 0 )
661         return rc;
662 
663     switch ( __vmxon(this_cpu(vmxon_region)) )
664     {
665     case -2: /* #UD or #GP */
666         if ( bios_locked &&
667              test_bit(X86_FEATURE_SMX, &boot_cpu_data.x86_capability) &&
668              (!(eax & IA32_FEATURE_CONTROL_ENABLE_VMXON_OUTSIDE_SMX) ||
669               !(eax & IA32_FEATURE_CONTROL_ENABLE_VMXON_INSIDE_SMX)) )
670         {
671             printk("CPU%d: VMXON failed: perhaps because of TXT settings "
672                    "in your BIOS configuration?\n", cpu);
673             printk(" --> Disable TXT in your BIOS unless using a secure "
674                    "bootloader.\n");
675             return -EINVAL;
676         }
677         /* fall through */
678     case -1: /* CF==1 or ZF==1 */
679         printk("CPU%d: unexpected VMXON failure\n", cpu);
680         return -EINVAL;
681     case 0: /* success */
682         this_cpu(vmxon) = 1;
683         break;
684     default:
685         BUG();
686     }
687 
688     hvm_asid_init(cpu_has_vmx_vpid ? (1u << VMCS_VPID_WIDTH) : 0);
689 
690     if ( cpu_has_vmx_ept )
691         ept_sync_all();
692 
693     if ( cpu_has_vmx_vpid )
694         vpid_sync_all();
695 
696     vmx_pi_per_cpu_init(cpu);
697 
698     return 0;
699 }
700 
701 int vmx_cpu_up()
702 {
703     return _vmx_cpu_up(false);
704 }
705 
706 void vmx_cpu_down(void)
707 {
708     struct list_head *active_vmcs_list = &this_cpu(active_vmcs_list);
709     unsigned long flags;
710 
711     if ( !this_cpu(vmxon) )
712         return;
713 
714     local_irq_save(flags);
715 
716     while ( !list_empty(active_vmcs_list) )
717         __vmx_clear_vmcs(list_entry(active_vmcs_list->next,
718                                     struct vcpu, arch.hvm_vmx.active_list));
719 
720     BUG_ON(!(read_cr4() & X86_CR4_VMXE));
721     this_cpu(vmxon) = 0;
722     __vmxoff();
723 
724     local_irq_restore(flags);
725 }
726 
727 struct foreign_vmcs {
728     struct vcpu *v;
729     unsigned int count;
730 };
731 static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs);
732 
733 bool_t vmx_vmcs_try_enter(struct vcpu *v)
734 {
735     struct foreign_vmcs *fv;
736 
737     /*
738      * NB. We must *always* run an HVM VCPU on its own VMCS, except for
739      * vmx_vmcs_enter/exit and scheduling tail critical regions.
740      */
741     if ( likely(v == current) )
742         return v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs);
743 
744     fv = &this_cpu(foreign_vmcs);
745 
746     if ( fv->v == v )
747     {
748         BUG_ON(fv->count == 0);
749     }
750     else
751     {
752         BUG_ON(fv->v != NULL);
753         BUG_ON(fv->count != 0);
754 
755         vcpu_pause(v);
756         spin_lock(&v->arch.hvm_vmx.vmcs_lock);
757 
758         vmx_clear_vmcs(v);
759         vmx_load_vmcs(v);
760 
761         fv->v = v;
762     }
763 
764     fv->count++;
765 
766     return 1;
767 }
768 
769 void vmx_vmcs_enter(struct vcpu *v)
770 {
771     bool_t okay = vmx_vmcs_try_enter(v);
772 
773     ASSERT(okay);
774 }
775 
776 void vmx_vmcs_exit(struct vcpu *v)
777 {
778     struct foreign_vmcs *fv;
779 
780     if ( likely(v == current) )
781         return;
782 
783     fv = &this_cpu(foreign_vmcs);
784     BUG_ON(fv->v != v);
785     BUG_ON(fv->count == 0);
786 
787     if ( --fv->count == 0 )
788     {
789         /* Don't confuse vmx_do_resume (for @v or @current!) */
790         vmx_clear_vmcs(v);
791         if ( is_hvm_vcpu(current) )
792             vmx_load_vmcs(current);
793 
794         spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
795         vcpu_unpause(v);
796 
797         fv->v = NULL;
798     }
799 }
800 
801 static void vmx_set_host_env(struct vcpu *v)
802 {
803     unsigned int cpu = smp_processor_id();
804 
805     __vmwrite(HOST_GDTR_BASE,
806               (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY));
807     __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
808 
809     __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
810     __vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(init_tss, cpu));
811 
812     __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
813 
814     /*
815      * Skip the end of cpu_user_regs when entering the hypervisor, because the
816      * CPU does not save that context onto the stack: SS, RSP, CS, RIP, RFLAGS,
817      * etc. all get saved into the VMCS instead.
818      */
819     __vmwrite(HOST_RSP,
820               (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
821 }
822 
823 void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
824                              enum vmx_msr_intercept_type type)
825 {
826     struct vmx_msr_bitmap *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
827     struct domain *d = v->domain;
828 
829     /* VMX MSR bitmap supported? */
830     if ( msr_bitmap == NULL )
831         return;
832 
833     if ( unlikely(monitored_msr(d, msr)) )
834         return;
835 
836     if ( msr <= 0x1fff )
837     {
838         if ( type & VMX_MSR_R )
839             clear_bit(msr, msr_bitmap->read_low);
840         if ( type & VMX_MSR_W )
841             clear_bit(msr, msr_bitmap->write_low);
842     }
843     else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
844     {
845         msr &= 0x1fff;
846         if ( type & VMX_MSR_R )
847             clear_bit(msr, msr_bitmap->read_high);
848         if ( type & VMX_MSR_W )
849             clear_bit(msr, msr_bitmap->write_high);
850     }
851     else
852         ASSERT(!"MSR out of range for interception\n");
853 }
854 
855 void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr,
856                            enum vmx_msr_intercept_type type)
857 {
858     struct vmx_msr_bitmap *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
859 
860     /* VMX MSR bitmap supported? */
861     if ( msr_bitmap == NULL )
862         return;
863 
864     if ( msr <= 0x1fff )
865     {
866         if ( type & VMX_MSR_R )
867             set_bit(msr, msr_bitmap->read_low);
868         if ( type & VMX_MSR_W )
869             set_bit(msr, msr_bitmap->write_low);
870     }
871     else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
872     {
873         msr &= 0x1fff;
874         if ( type & VMX_MSR_R )
875             set_bit(msr, msr_bitmap->read_high);
876         if ( type & VMX_MSR_W )
877             set_bit(msr, msr_bitmap->write_high);
878     }
879     else
880         ASSERT(!"MSR out of range for interception\n");
881 }
882 
883 bool vmx_msr_is_intercepted(struct vmx_msr_bitmap *msr_bitmap,
884                             unsigned int msr, bool is_write)
885 {
886     if ( msr <= 0x1fff )
887         return test_bit(msr, is_write ? msr_bitmap->write_low
888                                       : msr_bitmap->read_low);
889     else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
890         return test_bit(msr & 0x1fff, is_write ? msr_bitmap->write_high
891                                                : msr_bitmap->read_high);
892     else
893         /* MSRs outside the bitmap ranges are always intercepted. */
894         return true;
895 }
896 
897 
898 /*
899  * Switch the VMCS between the layer 1 and layer 2 guests.
900  */
901 void vmx_vmcs_switch(paddr_t from, paddr_t to)
902 {
903     struct arch_vmx_struct *vmx = &current->arch.hvm_vmx;
904     spin_lock(&vmx->vmcs_lock);
905 
906     __vmpclear(from);
907     if ( vmx->vmcs_shadow_maddr )
908         __vmpclear(vmx->vmcs_shadow_maddr);
909     __vmptrld(to);
910 
911     vmx->vmcs_pa = to;
912     vmx->launched = 0;
913     this_cpu(current_vmcs) = to;
914 
915     if ( vmx->hostenv_migrated )
916     {
917         vmx->hostenv_migrated = 0;
918         vmx_set_host_env(current);
919     }
920 
921     spin_unlock(&vmx->vmcs_lock);
922 }
923 
924 void virtual_vmcs_enter(const struct vcpu *v)
925 {
926     __vmptrld(v->arch.hvm_vmx.vmcs_shadow_maddr);
927 }
928 
929 void virtual_vmcs_exit(const struct vcpu *v)
930 {
931     paddr_t cur = this_cpu(current_vmcs);
932 
933     __vmpclear(v->arch.hvm_vmx.vmcs_shadow_maddr);
934     if ( cur )
935         __vmptrld(cur);
936 }
937 
938 u64 virtual_vmcs_vmread(const struct vcpu *v, u32 vmcs_encoding)
939 {
940     u64 res;
941 
942     virtual_vmcs_enter(v);
943     __vmread(vmcs_encoding, &res);
944     virtual_vmcs_exit(v);
945 
946     return res;
947 }
948 
949 enum vmx_insn_errno virtual_vmcs_vmread_safe(const struct vcpu *v,
950                                              u32 vmcs_encoding, u64 *val)
951 {
952     enum vmx_insn_errno ret;
953 
954     virtual_vmcs_enter(v);
955     ret = vmread_safe(vmcs_encoding, val);
956     virtual_vmcs_exit(v);
957 
958     return ret;
959 }
960 
961 void virtual_vmcs_vmwrite(const struct vcpu *v, u32 vmcs_encoding, u64 val)
962 {
963     virtual_vmcs_enter(v);
964     __vmwrite(vmcs_encoding, val);
965     virtual_vmcs_exit(v);
966 }
967 
968 enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v,
969                                               u32 vmcs_encoding, u64 val)
970 {
971     enum vmx_insn_errno ret;
972 
973     virtual_vmcs_enter(v);
974     ret = vmwrite_safe(vmcs_encoding, val);
975     virtual_vmcs_exit(v);
976 
977     return ret;
978 }
979 
980 /*
981  * This function is only called in a vCPU's initialization phase,
982  * so we can update the posted-interrupt descriptor in a non-atomic way.
983  */
984 static void pi_desc_init(struct vcpu *v)
985 {
986     v->arch.hvm_vmx.pi_desc.nv = posted_intr_vector;
987 
988     /*
989      * Mark NDST as invalid, then we can use this invalid value as a
990      * marker for whether to update NDST in vmx_pi_hooks_assign().
991      */
992     v->arch.hvm_vmx.pi_desc.ndst = APIC_INVALID_DEST;
993 }
994 
995 static int construct_vmcs(struct vcpu *v)
996 {
997     struct domain *d = v->domain;
998     uint16_t sysenter_cs;
999     unsigned long sysenter_eip;
1000     u32 vmexit_ctl = vmx_vmexit_control;
1001     u32 vmentry_ctl = vmx_vmentry_control;
1002 
1003     vmx_vmcs_enter(v);
1004 
1005     /* VMCS controls. */
1006     __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
1007 
1008     v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
1009     if ( d->arch.vtsc && !cpu_has_vmx_tsc_scaling )
1010         v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
1011 
1012     v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
1013 
1014     /*
1015      * Disable descriptor-table exiting: it is only enabled when the VM event
1016      * monitor requests it.
1017      */
1018     v->arch.hvm_vmx.secondary_exec_control &=
1019         ~SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING;
1020 
1021     /* Disable VPID for now: we decide when to enable it on VMENTER. */
1022     v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
1023 
1024     if ( paging_mode_hap(d) )
1025     {
1026         v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
1027                                           CPU_BASED_CR3_LOAD_EXITING |
1028                                           CPU_BASED_CR3_STORE_EXITING);
1029     }
1030     else
1031     {
1032         v->arch.hvm_vmx.secondary_exec_control &=
1033             ~(SECONDARY_EXEC_ENABLE_EPT |
1034               SECONDARY_EXEC_UNRESTRICTED_GUEST |
1035               SECONDARY_EXEC_ENABLE_INVPCID);
1036         vmexit_ctl &= ~(VM_EXIT_SAVE_GUEST_PAT |
1037                         VM_EXIT_LOAD_HOST_PAT);
1038         vmentry_ctl &= ~VM_ENTRY_LOAD_GUEST_PAT;
1039     }
1040 
1041     /* Disable Virtualize x2APIC mode by default. */
1042     v->arch.hvm_vmx.secondary_exec_control &=
1043         ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1044 
1045     /* Do not enable Monitor Trap Flag unless single-step debugging is started */
1046     v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
1047 
1048     /* Disable VMFUNC and #VE for now: they may be enabled later by altp2m. */
1049     v->arch.hvm_vmx.secondary_exec_control &=
1050         ~(SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
1051           SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);
1052 
1053     if ( !has_vlapic(d) )
1054     {
1055         /* Disable virtual APIC features and the TPR shadow */
1056         v->arch.hvm_vmx.secondary_exec_control &=
1057             ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES
1058               | SECONDARY_EXEC_APIC_REGISTER_VIRT
1059               | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1060         v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW;
1061 
1062         /* In turn, disable posted interrupts. */
1063         __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
1064                   vmx_pin_based_exec_control & ~PIN_BASED_POSTED_INTERRUPT);
1065     }
1066 
1067     vmx_update_cpu_exec_control(v);
1068 
1069     __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
1070     __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
1071 
1072     if ( cpu_has_vmx_ple )
1073     {
1074         __vmwrite(PLE_GAP, ple_gap);
1075         __vmwrite(PLE_WINDOW, ple_window);
1076     }
1077 
1078     if ( cpu_has_vmx_secondary_exec_control )
1079         __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1080                   v->arch.hvm_vmx.secondary_exec_control);
1081 
1082     /* MSR access bitmap. */
1083     if ( cpu_has_vmx_msr_bitmap )
1084     {
1085         struct vmx_msr_bitmap *msr_bitmap = alloc_xenheap_page();
1086 
1087         if ( msr_bitmap == NULL )
1088         {
1089             vmx_vmcs_exit(v);
1090             return -ENOMEM;
1091         }
1092 
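        /* Default to intercepting all MSRs; selected ones are opened up below. */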
1093         memset(msr_bitmap, ~0, PAGE_SIZE);
1094         v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
1095         __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));
1096 
1097         vmx_clear_msr_intercept(v, MSR_FS_BASE, VMX_MSR_RW);
1098         vmx_clear_msr_intercept(v, MSR_GS_BASE, VMX_MSR_RW);
1099         vmx_clear_msr_intercept(v, MSR_SHADOW_GS_BASE, VMX_MSR_RW);
1100         vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_CS, VMX_MSR_RW);
1101         vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_ESP, VMX_MSR_RW);
1102         vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_EIP, VMX_MSR_RW);
1103         if ( paging_mode_hap(d) && (!iommu_enabled || iommu_snoop) )
1104             vmx_clear_msr_intercept(v, MSR_IA32_CR_PAT, VMX_MSR_RW);
1105         if ( (vmexit_ctl & VM_EXIT_CLEAR_BNDCFGS) &&
1106              (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) )
1107             vmx_clear_msr_intercept(v, MSR_IA32_BNDCFGS, VMX_MSR_RW);
1108     }
1109 
1110     /* I/O access bitmap. */
1111     __vmwrite(IO_BITMAP_A, __pa(d->arch.hvm_domain.io_bitmap));
1112     __vmwrite(IO_BITMAP_B, __pa(d->arch.hvm_domain.io_bitmap) + PAGE_SIZE);
1113 
1114     if ( cpu_has_vmx_virtual_intr_delivery )
1115     {
1116         unsigned int i;
1117 
1118         /* EOI-exit bitmap */
1119         bitmap_zero(v->arch.hvm_vmx.eoi_exit_bitmap, NR_VECTORS);
1120         for ( i = 0; i < ARRAY_SIZE(v->arch.hvm_vmx.eoi_exit_bitmap); ++i )
1121             __vmwrite(EOI_EXIT_BITMAP(i), 0);
1122 
1123         /* Initialise Guest Interrupt Status (RVI and SVI) to 0 */
1124         __vmwrite(GUEST_INTR_STATUS, 0);
1125     }
1126 
1127     if ( cpu_has_vmx_posted_intr_processing )
1128     {
1129         if ( iommu_intpost )
1130             pi_desc_init(v);
1131 
1132         __vmwrite(PI_DESC_ADDR, virt_to_maddr(&v->arch.hvm_vmx.pi_desc));
1133         __vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector);
1134     }
1135 
1136     /* Disable PML here anyway; it will only be enabled in log-dirty mode */
1137     v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
1138 
1139     /* Host data selectors. */
1140     __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
1141     __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
1142     __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
1143     __vmwrite(HOST_FS_SELECTOR, 0);
1144     __vmwrite(HOST_GS_SELECTOR, 0);
1145     __vmwrite(HOST_FS_BASE, 0);
1146     __vmwrite(HOST_GS_BASE, 0);
1147 
1148     /* Host control registers. */
1149     v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
1150     __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
1151     __vmwrite(HOST_CR4, mmu_cr4_features);
1152 
1153     /* Host CS:RIP. */
1154     __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
1155     __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);
1156 
1157     /* Host SYSENTER CS:RIP. */
1158     rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
1159     __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
1160     rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
1161     __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);
1162 
1163     /* MSR intercepts. */
1164     __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
1165     __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
1166     __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
1167 
1168     __vmwrite(VM_ENTRY_INTR_INFO, 0);
1169 
1170     __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
1171     __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
1172 
1173     __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
1174     __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1175 
1176     __vmwrite(CR3_TARGET_COUNT, 0);
1177 
1178     __vmwrite(GUEST_ACTIVITY_STATE, 0);
1179 
1180     /* Guest segment bases. */
1181     __vmwrite(GUEST_ES_BASE, 0);
1182     __vmwrite(GUEST_SS_BASE, 0);
1183     __vmwrite(GUEST_DS_BASE, 0);
1184     __vmwrite(GUEST_FS_BASE, 0);
1185     __vmwrite(GUEST_GS_BASE, 0);
1186     __vmwrite(GUEST_CS_BASE, 0);
1187 
1188     /* Guest segment limits. */
1189     __vmwrite(GUEST_ES_LIMIT, ~0u);
1190     __vmwrite(GUEST_SS_LIMIT, ~0u);
1191     __vmwrite(GUEST_DS_LIMIT, ~0u);
1192     __vmwrite(GUEST_FS_LIMIT, ~0u);
1193     __vmwrite(GUEST_GS_LIMIT, ~0u);
1194     __vmwrite(GUEST_CS_LIMIT, ~0u);
1195 
1196     /* Guest segment AR bytes. */
1197     __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
1198     __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
1199     __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
1200     __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
1201     __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
1202     __vmwrite(GUEST_CS_AR_BYTES, 0xc09b); /* exec/read, accessed */
1203 
1204     /* Guest IDT. */
1205     __vmwrite(GUEST_IDTR_BASE, 0);
1206     __vmwrite(GUEST_IDTR_LIMIT, 0);
1207 
1208     /* Guest GDT. */
1209     __vmwrite(GUEST_GDTR_BASE, 0);
1210     __vmwrite(GUEST_GDTR_LIMIT, 0);
1211 
1212     /* Guest LDT. */
1213     __vmwrite(GUEST_LDTR_AR_BYTES, 0x0082); /* LDT */
1214     __vmwrite(GUEST_LDTR_SELECTOR, 0);
1215     __vmwrite(GUEST_LDTR_BASE, 0);
1216     __vmwrite(GUEST_LDTR_LIMIT, 0);
1217 
1218     /* Guest TSS. */
1219     __vmwrite(GUEST_TR_AR_BYTES, 0x008b); /* 32-bit TSS (busy) */
1220     __vmwrite(GUEST_TR_BASE, 0);
1221     __vmwrite(GUEST_TR_LIMIT, 0xff);
1222 
1223     __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1224     __vmwrite(GUEST_DR7, 0);
1225     __vmwrite(VMCS_LINK_POINTER, ~0UL);
1226 
1227     v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK
1228               | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
1229               | (1U << TRAP_no_device);
1230     vmx_update_exception_bitmap(v);
1231 
1232     v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
1233     hvm_update_guest_cr(v, 0);
1234 
1235     v->arch.hvm_vcpu.guest_cr[4] = 0;
1236     hvm_update_guest_cr(v, 4);
1237 
1238     if ( cpu_has_vmx_tpr_shadow )
1239     {
1240         __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
1241                   page_to_maddr(vcpu_vlapic(v)->regs_page));
1242         __vmwrite(TPR_THRESHOLD, 0);
1243     }
1244 
1245     if ( paging_mode_hap(d) )
1246     {
1247         struct p2m_domain *p2m = p2m_get_hostp2m(d);
1248         struct ept_data *ept = &p2m->ept;
1249 
1250         ept->mfn = pagetable_get_pfn(p2m_get_pagetable(p2m));
1251         __vmwrite(EPT_POINTER, ept->eptp);
1252     }
1253 
1254     if ( paging_mode_hap(d) )
1255     {
1256         u64 host_pat, guest_pat;
1257 
1258         rdmsrl(MSR_IA32_CR_PAT, host_pat);
1259         guest_pat = MSR_IA32_CR_PAT_RESET;
1260 
1261         __vmwrite(HOST_PAT, host_pat);
1262         __vmwrite(GUEST_PAT, guest_pat);
1263     }
1264     if ( cpu_has_vmx_mpx )
1265         __vmwrite(GUEST_BNDCFGS, 0);
1266     if ( cpu_has_vmx_xsaves )
1267         __vmwrite(XSS_EXIT_BITMAP, 0);
1268 
1269     if ( cpu_has_vmx_tsc_scaling )
1270         __vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio);
1271 
1272     vmx_vmcs_exit(v);
1273 
1274     /* Will update HOST & GUEST_CR3 as required */
1275     paging_update_paging_modes(v);
1276 
1277     vmx_vlapic_msr_changed(v);
1278 
1279     return 0;
1280 }
1281 
1282 static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
1283 {
1284     const u32 *msr = key;
1285     const struct vmx_msr_entry *entry = elt;
1286 
1287     if ( *msr > entry->index )
1288         return 1;
1289     if ( *msr < entry->index )
1290         return -1;
1291 
1292     return 0;
1293 }
1294 
1295 struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
1296 {
1297     struct vcpu *curr = current;
1298     unsigned int msr_count;
1299     struct vmx_msr_entry *msr_area;
1300 
1301     if ( type == VMX_GUEST_MSR )
1302     {
1303         msr_count = curr->arch.hvm_vmx.msr_count;
1304         msr_area = curr->arch.hvm_vmx.msr_area;
1305     }
1306     else
1307     {
1308         ASSERT(type == VMX_HOST_MSR);
1309         msr_count = curr->arch.hvm_vmx.host_msr_count;
1310         msr_area = curr->arch.hvm_vmx.host_msr_area;
1311     }
1312 
1313     if ( msr_area == NULL )
1314         return NULL;
1315 
1316     return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry),
1317                    vmx_msr_entry_key_cmp);
1318 }
1319 
1320 int vmx_read_guest_msr(u32 msr, u64 *val)
1321 {
1322     struct vmx_msr_entry *ent;
1323 
1324     if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
1325     {
1326         *val = ent->data;
1327         return 0;
1328     }
1329 
1330     return -ESRCH;
1331 }
1332 
1333 int vmx_write_guest_msr(u32 msr, u64 val)
1334 {
1335     struct vmx_msr_entry *ent;
1336 
1337     if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
1338     {
1339         ent->data = val;
1340         return 0;
1341     }
1342 
1343     return -ESRCH;
1344 }
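
/*
 * Illustrative usage sketch (not taken from this file): once an MSR has been
 * registered via vmx_add_msr(msr, VMX_GUEST_MSR), callers can read and update
 * its guest value, e.g.
 *
 *     uint64_t val;
 *
 *     if ( vmx_read_guest_msr(msr, &val) == 0 )
 *         vmx_write_guest_msr(msr, val | SOME_FLAG);
 *
 * where SOME_FLAG is a placeholder for whichever bit the caller wants set.
 */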
1345 
1346 int vmx_add_msr(u32 msr, int type)
1347 {
1348     struct vcpu *curr = current;
1349     unsigned int idx, *msr_count;
1350     struct vmx_msr_entry **msr_area, *msr_area_elem;
1351 
1352     if ( type == VMX_GUEST_MSR )
1353     {
1354         msr_count = &curr->arch.hvm_vmx.msr_count;
1355         msr_area = &curr->arch.hvm_vmx.msr_area;
1356     }
1357     else
1358     {
1359         ASSERT(type == VMX_HOST_MSR);
1360         msr_count = &curr->arch.hvm_vmx.host_msr_count;
1361         msr_area = &curr->arch.hvm_vmx.host_msr_area;
1362     }
1363 
1364     if ( *msr_area == NULL )
1365     {
1366         if ( (*msr_area = alloc_xenheap_page()) == NULL )
1367             return -ENOMEM;
1368 
1369         if ( type == VMX_GUEST_MSR )
1370         {
1371             __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
1372             __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
1373         }
1374         else
1375             __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
1376     }
1377 
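    /* Keep the array sorted by MSR index so vmx_find_msr() can use bsearch(). */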
1378     for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
1379         if ( (*msr_area)[idx].index == msr )
1380             return 0;
1381 
1382     if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
1383         return -ENOSPC;
1384 
1385     memmove(*msr_area + idx + 1, *msr_area + idx,
1386             sizeof(*msr_area_elem) * (*msr_count - idx));
1387 
1388     msr_area_elem = *msr_area + idx;
1389     msr_area_elem->index = msr;
1390     msr_area_elem->mbz = 0;
1391 
1392     ++*msr_count;
1393 
1394     if ( type == VMX_GUEST_MSR )
1395     {
1396         msr_area_elem->data = 0;
1397         __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
1398         __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
1399     }
1400     else
1401     {
1402         rdmsrl(msr, msr_area_elem->data);
1403         __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
1404     }
1405 
1406     return 0;
1407 }
1408 
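/*
 * The two helpers below record which 64-bit word of the EOI-exit bitmap has
 * changed, so that only the affected EOI_EXIT_BITMAP(n) VMCS fields need to
 * be rewritten when the bitmap is next synced into the VMCS.
 */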
1409 void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector)
1410 {
1411     if ( !test_and_set_bit(vector, v->arch.hvm_vmx.eoi_exit_bitmap) )
1412         set_bit(vector / BITS_PER_LONG,
1413                 &v->arch.hvm_vmx.eoi_exitmap_changed);
1414 }
1415 
1416 void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector)
1417 {
1418     if ( test_and_clear_bit(vector, v->arch.hvm_vmx.eoi_exit_bitmap) )
1419         set_bit(vector / BITS_PER_LONG,
1420                 &v->arch.hvm_vmx.eoi_exitmap_changed);
1421 }
1422 
1423 bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)
1424 {
1425     return !!(v->arch.hvm_vmx.secondary_exec_control &
1426               SECONDARY_EXEC_ENABLE_PML);
1427 }
1428 
1429 int vmx_vcpu_enable_pml(struct vcpu *v)
1430 {
1431     if ( vmx_vcpu_pml_enabled(v) )
1432         return 0;
1433 
1434     v->arch.hvm_vmx.pml_pg = v->domain->arch.paging.alloc_page(v->domain);
1435     if ( !v->arch.hvm_vmx.pml_pg )
1436         return -ENOMEM;
1437 
1438     vmx_vmcs_enter(v);
1439 
1440     __vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) << PAGE_SHIFT);
1441     __vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
1442 
1443     v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_PML;
1444 
1445     __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1446               v->arch.hvm_vmx.secondary_exec_control);
1447 
1448     vmx_vmcs_exit(v);
1449 
1450     return 0;
1451 }
1452 
1453 void vmx_vcpu_disable_pml(struct vcpu *v)
1454 {
1455     if ( !vmx_vcpu_pml_enabled(v) )
1456         return;
1457 
1458     /* Make sure we don't lose any logged GPAs. */
1459     vmx_vcpu_flush_pml_buffer(v);
1460 
1461     vmx_vmcs_enter(v);
1462 
1463     v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
1464     __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1465               v->arch.hvm_vmx.secondary_exec_control);
1466 
1467     vmx_vmcs_exit(v);
1468 
1469     v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
1470     v->arch.hvm_vmx.pml_pg = NULL;
1471 }
1472 
1473 void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
1474 {
1475     uint64_t *pml_buf;
1476     unsigned long pml_idx;
1477 
1478     ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));
1479     ASSERT(vmx_vcpu_pml_enabled(v));
1480 
1481     vmx_vmcs_enter(v);
1482 
1483     __vmread(GUEST_PML_INDEX, &pml_idx);
1484 
1485     /* Do nothing if PML buffer is empty. */
1486     if ( pml_idx == (NR_PML_ENTRIES - 1) )
1487         goto out;
1488 
1489     pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);
1490 
1491     /*
1492      * The PML index can be either 2^16-1 (the buffer is full) or in the range
1493      * 0 ~ NR_PML_ENTRIES-1 (the buffer is not full); in the latter case the
1494      * index always points to the next available entry.
1495      */
1496     if ( pml_idx >= NR_PML_ENTRIES )
1497         pml_idx = 0;
1498     else
1499         pml_idx++;
1500 
1501     for ( ; pml_idx < NR_PML_ENTRIES; pml_idx++ )
1502     {
1503         unsigned long gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
1504 
1505         /*
1506          * We need to change the type of each logged GFN from log-dirty back
1507          * to normal memory; hap_track_dirty_vram depends on this to work. We
1508          * mark all logged GFNs as dirty, because we cannot be sure whether it
1509          * is safe to ignore GFNs on which p2m_change_type_one returns failure.
1510          * The failure cases are very rare, and the additional cost is
1511          * negligible, but a missing mark is extremely difficult to debug.
1512          */
1513         p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
1514 
1515         /* HVM guest: pfn == gfn */
1516         paging_mark_pfn_dirty(v->domain, _pfn(gfn));
1517     }
1518 
1519     unmap_domain_page(pml_buf);
1520 
1521     /* Reset PML index */
1522     __vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
1523 
1524  out:
1525     vmx_vmcs_exit(v);
1526 }
1527 
1528 bool_t vmx_domain_pml_enabled(const struct domain *d)
1529 {
1530     return !!(d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED);
1531 }
1532 
1533 /*
1534  * This function enables PML for a particular domain. It should be called
1535  * when the domain is paused.
1536  *
1537  * PML needs to be enabled globally for all vcpus of the domain, as the PML
1538  * buffer and PML index are per-vcpu, but the EPT table is shared by all
1539  * vcpus; therefore enabling PML on only some of the vcpus won't work.
1540  */
1541 int vmx_domain_enable_pml(struct domain *d)
1542 {
1543     struct vcpu *v;
1544     int rc;
1545 
1546     ASSERT(atomic_read(&d->pause_count));
1547 
1548     if ( vmx_domain_pml_enabled(d) )
1549         return 0;
1550 
1551     for_each_vcpu ( d, v )
1552         if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
1553             goto error;
1554 
1555     d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;
1556 
1557     return 0;
1558 
1559  error:
1560     for_each_vcpu ( d, v )
1561         if ( vmx_vcpu_pml_enabled(v) )
1562             vmx_vcpu_disable_pml(v);
1563     return rc;
1564 }
1565 
1566 /*
1567  * Disable PML for a particular domain. Called when the domain is paused.
1568  *
1569  * As with enabling PML for a domain, disabling PML should be done for all
1570  * vcpus at once.
1571  */
1572 void vmx_domain_disable_pml(struct domain *d)
1573 {
1574     struct vcpu *v;
1575 
1576     ASSERT(atomic_read(&d->pause_count));
1577 
1578     if ( !vmx_domain_pml_enabled(d) )
1579         return;
1580 
1581     for_each_vcpu ( d, v )
1582         vmx_vcpu_disable_pml(v);
1583 
1584     d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;
1585 }
1586 
1587 /*
1588  * Flush the PML buffers of all vcpus and propagate the logged dirty pages
1589  * into the log-dirty radix tree.  Called while the domain is paused.
1590  */
1591 void vmx_domain_flush_pml_buffers(struct domain *d)
1592 {
1593     struct vcpu *v;
1594 
1595     ASSERT(atomic_read(&d->pause_count));
1596 
1597     if ( !vmx_domain_pml_enabled(d) )
1598         return;
1599 
1600     for_each_vcpu ( d, v )
1601         vmx_vcpu_flush_pml_buffer(v);
1602 }
1603 
1604 static void vmx_vcpu_update_eptp(struct vcpu *v, u64 eptp)
1605 {
1606     vmx_vmcs_enter(v);
1607     __vmwrite(EPT_POINTER, eptp);
1608     vmx_vmcs_exit(v);
1609 }
1610 
1611 /*
1612  * Write the updated EPTP into the VMCS of every vcpu of the domain.  Must be
1613  * called while the domain is paused.
1614  */
1615 void vmx_domain_update_eptp(struct domain *d)
1616 {
1617     struct p2m_domain *p2m = p2m_get_hostp2m(d);
1618     struct vcpu *v;
1619 
1620     ASSERT(atomic_read(&d->pause_count));
1621 
1622     for_each_vcpu ( d, v )
1623         vmx_vcpu_update_eptp(v, p2m->ept.eptp);
1624 
1625     ept_sync_domain(p2m);
1626 }
1627 
1628 int vmx_create_vmcs(struct vcpu *v)
1629 {
1630     struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
1631     int rc;
1632 
1633     if ( (arch_vmx->vmcs_pa = vmx_alloc_vmcs()) == 0 )
1634         return -ENOMEM;
1635 
1636     INIT_LIST_HEAD(&arch_vmx->active_list);
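    /*
     * VMCLEAR the freshly allocated VMCS so that it starts out inactive and
     * in the "clear" launch state before it is ever loaded with VMPTRLD.
     */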
1637     __vmpclear(arch_vmx->vmcs_pa);
1638     arch_vmx->active_cpu = -1;
1639     arch_vmx->launched   = 0;
1640 
1641     if ( (rc = construct_vmcs(v)) != 0 )
1642     {
1643         vmx_free_vmcs(arch_vmx->vmcs_pa);
1644         return rc;
1645     }
1646 
1647     return 0;
1648 }
1649 
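/*
 * Tear down a vcpu's VMCS: detach it from whichever pCPU it is active on,
 * then free the VMCS page along with the MSR load/save areas and the MSR
 * bitmap page.
 */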
1650 void vmx_destroy_vmcs(struct vcpu *v)
1651 {
1652     struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
1653 
1654     vmx_clear_vmcs(v);
1655 
1656     vmx_free_vmcs(arch_vmx->vmcs_pa);
1657 
1658     free_xenheap_page(v->arch.hvm_vmx.host_msr_area);
1659     free_xenheap_page(v->arch.hvm_vmx.msr_area);
1660     free_xenheap_page(v->arch.hvm_vmx.msr_bitmap);
1661 }
1662 
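/*
 * Handle a failed VMLAUNCH/VMRESUME: log the VM-instruction error, dump the
 * VMCS for the invalid control/host state error codes, and crash the domain.
 */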
1663 void vmx_vmentry_failure(void)
1664 {
1665     struct vcpu *curr = current;
1666     unsigned long error;
1667 
1668     __vmread(VM_INSTRUCTION_ERROR, &error);
1669     gprintk(XENLOG_ERR, "VM%s error: %#lx\n",
1670             curr->arch.hvm_vmx.launched ? "RESUME" : "LAUNCH", error);
1671 
1672     if ( error == VMX_INSN_INVALID_CONTROL_STATE ||
1673          error == VMX_INSN_INVALID_HOST_STATE )
1674         vmcs_dump_vcpu(curr);
1675 
1676     domain_crash_synchronous();
1677 }
1678 
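/*
 * Prepare a vcpu for VM entry after it has been scheduled in: reload its
 * VMCS if it is still resident on this pCPU, otherwise migrate the VMCS
 * (together with timers, pirqs and host state) over, then refresh the debug
 * state latch and jump to the VMX entry point.
 */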
1679 void vmx_do_resume(struct vcpu *v)
1680 {
1681     bool_t debug_state;
1682 
1683     if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
1684         vmx_vmcs_reload(v);
1685     else
1686     {
1687         /*
1688          * For a pass-through domain, a guest PCIe device driver may use
1689          * non-snooped I/O and rely on an explicit WBINVD or CLFLUSH to flush
1690          * a RAM region.  Since the vcpu may migrate before that WBINVD or
1691          * CLFLUSH executes, data consistency must be maintained either by:
1692          *  1: flushing the cache (wbinvd) when the guest is scheduled out,
1693          *     if there is no wbinvd exiting, or
1694          *  2: executing wbinvd on all dirty pCPUs when the guest's wbinvd exits.
1695          * If the VT-d engine can force snooping, neither is needed.
1696          */
1697         if ( has_arch_pdevs(v->domain) && !iommu_snoop
1698                 && !cpu_has_wbinvd_exiting )
1699         {
1700             int cpu = v->arch.hvm_vmx.active_cpu;
1701             if ( cpu != -1 )
1702                 flush_mask(cpumask_of(cpu), FLUSH_CACHE);
1703         }
1704 
1705         vmx_clear_vmcs(v);
1706         vmx_load_vmcs(v);
1707         hvm_migrate_timers(v);
1708         hvm_migrate_pirqs(v);
1709         vmx_set_host_env(v);
1710         /*
1711          * Both the n1 and n2 VMCS need their host environment updated after
1712          * vCPU migration.  The currently loaded VMCS is updated in place;
1713          * the update of the other VMCS is deferred until it is switched in.
1714          */
1715         v->arch.hvm_vmx.hostenv_migrated = 1;
1716 
1717         hvm_asid_flush_vcpu(v);
1718     }
1719 
1720     debug_state = v->domain->debugger_attached
1721                   || v->domain->arch.monitor.software_breakpoint_enabled
1722                   || v->domain->arch.monitor.singlestep_enabled;
1723 
1724     if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
1725     {
1726         v->arch.hvm_vcpu.debug_state_latch = debug_state;
1727         vmx_update_debug_state(v);
1728     }
1729 
1730     hvm_do_resume(v);
1731     reset_stack_and_jump(vmx_asm_do_vmentry);
1732 }
1733 
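/*
 * Read a VMCS field for dumping purposes, yielding 0 rather than faulting if
 * the field cannot be read (e.g. not supported by the CPU).
 */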
1734 static inline unsigned long vmr(unsigned long field)
1735 {
1736     unsigned long val;
1737 
1738     return vmread_safe(field, &val) ? 0 : val;
1739 }
1740 
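/*
 * Width-checked wrappers around vmr(): in the VMCS field encoding scheme,
 * bits 14:13 give the field width (0 = 16-bit, 2 = 32-bit) and bit 0 selects
 * the high half of a 64-bit field, so the masks below reject encodings of
 * the wrong size at build time.
 */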
1741 #define vmr16(fld) ({             \
1742     BUILD_BUG_ON((fld) & 0x6001); \
1743     (uint16_t)vmr(fld);           \
1744 })
1745 
1746 #define vmr32(fld) ({                         \
1747     BUILD_BUG_ON(((fld) & 0x6001) != 0x4000); \
1748     (uint32_t)vmr(fld);                       \
1749 })
1750 
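/*
 * Dump one guest segment register.  Relies on the attribute, limit and base
 * fields of every segment having the same encoding offsets from its selector
 * field as the ES fields have from GUEST_ES_SELECTOR.
 */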
1751 static void vmx_dump_sel(char *name, uint32_t selector)
1752 {
1753     uint32_t sel, attr, limit;
1754     uint64_t base;
1755     sel = vmr(selector);
1756     attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
1757     limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
1758     base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
1759     printk("%s: %04x %05x %08x %016"PRIx64"\n", name, sel, attr, limit, base);
1760 }
1761 
1762 static void vmx_dump_sel2(char *name, uint32_t lim)
1763 {
1764     uint32_t limit;
1765     uint64_t base;
1766     limit = vmr(lim);
1767     base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
1768     printk("%s:            %08x %016"PRIx64"\n", name, limit, base);
1769 }
1770 
1771 void vmcs_dump_vcpu(struct vcpu *v)
1772 {
1773     struct cpu_user_regs *regs = &v->arch.user_regs;
1774     uint32_t vmentry_ctl, vmexit_ctl;
1775     unsigned long cr4;
1776     uint64_t efer;
1777     unsigned int i, n;
1778 
1779     if ( v == current )
1780         regs = guest_cpu_user_regs();
1781 
1782     vmx_vmcs_enter(v);
1783 
1784     vmentry_ctl = vmr32(VM_ENTRY_CONTROLS);
1785     vmexit_ctl = vmr32(VM_EXIT_CONTROLS);
1786     cr4 = vmr(GUEST_CR4);
1787     efer = vmr(GUEST_EFER);
1788 
1789     printk("*** Guest State ***\n");
1790     printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
1791            vmr(GUEST_CR0), vmr(CR0_READ_SHADOW), vmr(CR0_GUEST_HOST_MASK));
1792     printk("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
1793            cr4, vmr(CR4_READ_SHADOW), vmr(CR4_GUEST_HOST_MASK));
1794     printk("CR3 = 0x%016lx\n", vmr(GUEST_CR3));
1795     if ( (v->arch.hvm_vmx.secondary_exec_control &
1796           SECONDARY_EXEC_ENABLE_EPT) &&
1797          (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA) )
1798     {
1799         printk("PDPTE0 = 0x%016lx  PDPTE1 = 0x%016lx\n",
1800                vmr(GUEST_PDPTE(0)), vmr(GUEST_PDPTE(1)));
1801         printk("PDPTE2 = 0x%016lx  PDPTE3 = 0x%016lx\n",
1802                vmr(GUEST_PDPTE(2)), vmr(GUEST_PDPTE(3)));
1803     }
1804     printk("RSP = 0x%016lx (0x%016lx)  RIP = 0x%016lx (0x%016lx)\n",
1805            vmr(GUEST_RSP), regs->rsp,
1806            vmr(GUEST_RIP), regs->rip);
1807     printk("RFLAGS=0x%08lx (0x%08lx)  DR7 = 0x%016lx\n",
1808            vmr(GUEST_RFLAGS), regs->rflags,
1809            vmr(GUEST_DR7));
1810     printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
1811            vmr(GUEST_SYSENTER_ESP),
1812            vmr32(GUEST_SYSENTER_CS), vmr(GUEST_SYSENTER_EIP));
1813     printk("       sel  attr  limit   base\n");
1814     vmx_dump_sel("  CS", GUEST_CS_SELECTOR);
1815     vmx_dump_sel("  DS", GUEST_DS_SELECTOR);
1816     vmx_dump_sel("  SS", GUEST_SS_SELECTOR);
1817     vmx_dump_sel("  ES", GUEST_ES_SELECTOR);
1818     vmx_dump_sel("  FS", GUEST_FS_SELECTOR);
1819     vmx_dump_sel("  GS", GUEST_GS_SELECTOR);
1820     vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
1821     vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
1822     vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
1823     vmx_dump_sel("  TR", GUEST_TR_SELECTOR);
1824     if ( (vmexit_ctl & (VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_SAVE_GUEST_EFER)) ||
1825          (vmentry_ctl & (VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER)) )
1826         printk("EFER = 0x%016lx  PAT = 0x%016lx\n", efer, vmr(GUEST_PAT));
1827     printk("PreemptionTimer = 0x%08x  SM Base = 0x%08x\n",
1828            vmr32(GUEST_PREEMPTION_TIMER), vmr32(GUEST_SMBASE));
1829     printk("DebugCtl = 0x%016lx  DebugExceptions = 0x%016lx\n",
1830            vmr(GUEST_IA32_DEBUGCTL), vmr(GUEST_PENDING_DBG_EXCEPTIONS));
1831     if ( vmentry_ctl & (VM_ENTRY_LOAD_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_BNDCFGS) )
1832         printk("PerfGlobCtl = 0x%016lx  BndCfgS = 0x%016lx\n",
1833                vmr(GUEST_PERF_GLOBAL_CTRL), vmr(GUEST_BNDCFGS));
1834     printk("Interruptibility = %08x  ActivityState = %08x\n",
1835            vmr32(GUEST_INTERRUPTIBILITY_INFO), vmr32(GUEST_ACTIVITY_STATE));
1836     if ( v->arch.hvm_vmx.secondary_exec_control &
1837          SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY )
1838         printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS));
1839 
1840     printk("*** Host State ***\n");
1841     printk("RIP = 0x%016lx (%ps)  RSP = 0x%016lx\n",
1842            vmr(HOST_RIP), (void *)vmr(HOST_RIP), vmr(HOST_RSP));
1843     printk("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
1844            vmr16(HOST_CS_SELECTOR), vmr16(HOST_SS_SELECTOR),
1845            vmr16(HOST_DS_SELECTOR), vmr16(HOST_ES_SELECTOR),
1846            vmr16(HOST_FS_SELECTOR), vmr16(HOST_GS_SELECTOR),
1847            vmr16(HOST_TR_SELECTOR));
1848     printk("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
1849            vmr(HOST_FS_BASE), vmr(HOST_GS_BASE), vmr(HOST_TR_BASE));
1850     printk("GDTBase=%016lx IDTBase=%016lx\n",
1851            vmr(HOST_GDTR_BASE), vmr(HOST_IDTR_BASE));
1852     printk("CR0=%016lx CR3=%016lx CR4=%016lx\n",
1853            vmr(HOST_CR0), vmr(HOST_CR3), vmr(HOST_CR4));
1854     printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
1855            vmr(HOST_SYSENTER_ESP),
1856            vmr32(HOST_SYSENTER_CS), vmr(HOST_SYSENTER_EIP));
1857     if ( vmexit_ctl & (VM_EXIT_LOAD_HOST_PAT | VM_EXIT_LOAD_HOST_EFER) )
1858         printk("EFER = 0x%016lx  PAT = 0x%016lx\n", vmr(HOST_EFER), vmr(HOST_PAT));
1859     if ( vmexit_ctl & VM_EXIT_LOAD_PERF_GLOBAL_CTRL )
1860         printk("PerfGlobCtl = 0x%016lx\n",
1861                vmr(HOST_PERF_GLOBAL_CTRL));
1862 
1863     printk("*** Control State ***\n");
1864     printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
1865            vmr32(PIN_BASED_VM_EXEC_CONTROL),
1866            vmr32(CPU_BASED_VM_EXEC_CONTROL),
1867            vmr32(SECONDARY_VM_EXEC_CONTROL));
1868     printk("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
1869     printk("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
1870            vmr32(EXCEPTION_BITMAP),
1871            vmr32(PAGE_FAULT_ERROR_CODE_MASK),
1872            vmr32(PAGE_FAULT_ERROR_CODE_MATCH));
1873     printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
1874            vmr32(VM_ENTRY_INTR_INFO),
1875            vmr32(VM_ENTRY_EXCEPTION_ERROR_CODE),
1876            vmr32(VM_ENTRY_INSTRUCTION_LEN));
1877     printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
1878            vmr32(VM_EXIT_INTR_INFO),
1879            vmr32(VM_EXIT_INTR_ERROR_CODE),
1880            vmr32(VM_EXIT_INSTRUCTION_LEN));
1881     printk("        reason=%08x qualification=%016lx\n",
1882            vmr32(VM_EXIT_REASON), vmr(EXIT_QUALIFICATION));
1883     printk("IDTVectoring: info=%08x errcode=%08x\n",
1884            vmr32(IDT_VECTORING_INFO), vmr32(IDT_VECTORING_ERROR_CODE));
1885     printk("TSC Offset = 0x%016lx  TSC Multiplier = 0x%016lx\n",
1886            vmr(TSC_OFFSET), vmr(TSC_MULTIPLIER));
1887     if ( (v->arch.hvm_vmx.exec_control & CPU_BASED_TPR_SHADOW) ||
1888          (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT) )
1889         printk("TPR Threshold = 0x%02x  PostedIntrVec = 0x%02x\n",
1890                vmr32(TPR_THRESHOLD), vmr16(POSTED_INTR_NOTIFICATION_VECTOR));
1891     if ( (v->arch.hvm_vmx.secondary_exec_control &
1892           SECONDARY_EXEC_ENABLE_EPT) )
1893         printk("EPT pointer = 0x%016lx  EPTP index = 0x%04x\n",
1894                vmr(EPT_POINTER), vmr16(EPTP_INDEX));
1895     n = vmr32(CR3_TARGET_COUNT);
1896     for ( i = 0; i + 1 < n; i += 2 )
1897         printk("CR3 target%u=%016lx target%u=%016lx\n",
1898                i, vmr(CR3_TARGET_VALUE(i)),
1899                i + 1, vmr(CR3_TARGET_VALUE(i + 1)));
1900     if ( i < n )
1901         printk("CR3 target%u=%016lx\n", i, vmr(CR3_TARGET_VALUE(i)));
1902     if ( v->arch.hvm_vmx.secondary_exec_control &
1903          SECONDARY_EXEC_PAUSE_LOOP_EXITING )
1904         printk("PLE Gap=%08x Window=%08x\n",
1905                vmr32(PLE_GAP), vmr32(PLE_WINDOW));
1906     if ( v->arch.hvm_vmx.secondary_exec_control &
1907          (SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
1908         printk("Virtual processor ID = 0x%04x VMfunc controls = %016lx\n",
1909                vmr16(VIRTUAL_PROCESSOR_ID), vmr(VM_FUNCTION_CONTROL));
1910 
1911     vmx_vmcs_exit(v);
1912 }
1913 
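/* 'v' debug-key handler: dump the VMCS of every vcpu of every HVM domain. */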
1914 static void vmcs_dump(unsigned char ch)
1915 {
1916     struct domain *d;
1917     struct vcpu *v;
1918 
1919     printk("*********** VMCS Areas **************\n");
1920 
1921     rcu_read_lock(&domlist_read_lock);
1922 
1923     for_each_domain ( d )
1924     {
1925         if ( !is_hvm_domain(d) )
1926             continue;
1927         printk("\n>>> Domain %d <<<\n", d->domain_id);
1928         for_each_vcpu ( d, v )
1929         {
1930             printk("\tVCPU %d\n", v->vcpu_id);
1931             vmcs_dump_vcpu(v);
1932         }
1933     }
1934 
1935     rcu_read_unlock(&domlist_read_lock);
1936 
1937     printk("**************************************\n");
1938 }
1939 
1940 void __init setup_vmcs_dump(void)
1941 {
1942     register_keyhandler('v', vmcs_dump, "dump VT-x VMCSs", 1);
1943 }
1944 
1945 static void __init __maybe_unused build_assertions(void)
1946 {
1947     struct vmx_msr_bitmap bitmap;
1948 
1949     /* Check vmx_msr_bitmap layout against hardware expectations. */
1950     BUILD_BUG_ON(sizeof(bitmap)            != PAGE_SIZE);
1951     BUILD_BUG_ON(sizeof(bitmap.read_low)   != 1024);
1952     BUILD_BUG_ON(sizeof(bitmap.read_high)  != 1024);
1953     BUILD_BUG_ON(sizeof(bitmap.write_low)  != 1024);
1954     BUILD_BUG_ON(sizeof(bitmap.write_high) != 1024);
1955     BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, read_low)   != 0);
1956     BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, read_high)  != 1024);
1957     BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, write_low)  != 2048);
1958     BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, write_high) != 3072);
1959 }
1960 
1961 /*
1962  * Local variables:
1963  * mode: C
1964  * c-file-style: "BSD"
1965  * c-basic-offset: 4
1966  * tab-width: 4
1967  * indent-tabs-mode: nil
1968  * End:
1969  */
1970