1 /*
2 * vmcs.c: VMCS management
3 * Copyright (c) 2004, Intel Corporation.
4 *
5 * This program is free software; you can redistribute it and/or modify it
6 * under the terms and conditions of the GNU General Public License,
7 * version 2, as published by the Free Software Foundation.
8 *
9 * This program is distributed in the hope it will be useful, but WITHOUT
10 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
11 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
12 * more details.
13 *
14 * You should have received a copy of the GNU General Public License along with
15 * this program; If not, see <http://www.gnu.org/licenses/>.
16 */
17
18 #include <xen/init.h>
19 #include <xen/mm.h>
20 #include <xen/lib.h>
21 #include <xen/errno.h>
22 #include <xen/domain_page.h>
23 #include <xen/event.h>
24 #include <xen/kernel.h>
25 #include <xen/keyhandler.h>
26 #include <xen/vm_event.h>
27 #include <asm/current.h>
28 #include <asm/cpufeature.h>
29 #include <asm/processor.h>
30 #include <asm/msr.h>
31 #include <asm/xstate.h>
32 #include <asm/hvm/hvm.h>
33 #include <asm/hvm/io.h>
34 #include <asm/hvm/support.h>
35 #include <asm/hvm/vmx/vmx.h>
36 #include <asm/hvm/vmx/vvmx.h>
37 #include <asm/hvm/vmx/vmcs.h>
38 #include <asm/flushtlb.h>
39 #include <asm/monitor.h>
40 #include <asm/shadow.h>
41 #include <asm/tboot.h>
42 #include <asm/apic.h>
43
44 static bool_t __read_mostly opt_vpid_enabled = 1;
45 boolean_param("vpid", opt_vpid_enabled);
46
47 static bool_t __read_mostly opt_unrestricted_guest_enabled = 1;
48 boolean_param("unrestricted_guest", opt_unrestricted_guest_enabled);
49
50 static bool_t __read_mostly opt_apicv_enabled = 1;
51 boolean_param("apicv", opt_apicv_enabled);
52
53 /*
54 * These two parameters configure the controls for Pause-Loop Exiting:
55 * ple_gap: upper bound on the amount of time between two successive
56 * executions of PAUSE in a loop.
57 * ple_window: upper bound on the amount of time a guest is allowed to execute
58 * in a PAUSE loop.
59 * Time is measured based on a counter that runs at the same rate as the TSC;
60 * refer to SDM volume 3B, sections 21.6.13 & 22.1.3.
61 */
62 static unsigned int __read_mostly ple_gap = 128;
63 integer_param("ple_gap", ple_gap);
64 static unsigned int __read_mostly ple_window = 4096;
65 integer_param("ple_window", ple_window);
66
67 static bool_t __read_mostly opt_pml_enabled = 1;
68 static s8 __read_mostly opt_ept_ad = -1;
69
70 /*
71 * The 'ept' parameter controls functionality that depends on, or impacts, the
72 * EPT mechanism. The optional comma-separated value may contain:
73 *
74 * pml Enable PML
75 * ad Use A/D bits
76 */
77 static int __init parse_ept_param(const char *s)
78 {
79 const char *ss;
80 int rc = 0;
81
82 do {
83 bool_t val = !!strncmp(s, "no-", 3);
84
85 if ( !val )
86 s += 3;
87
88 ss = strchr(s, ',');
89 if ( !ss )
90 ss = strchr(s, '\0');
91
92 if ( !strncmp(s, "pml", ss - s) )
93 opt_pml_enabled = val;
94 else if ( !strncmp(s, "ad", ss - s) )
95 opt_ept_ad = val;
96 else
97 rc = -EINVAL;
98
99 s = ss + 1;
100 } while ( *ss );
101
102 return rc;
103 }
104 custom_param("ept", parse_ept_param);
105
106 /* Dynamic (run-time adjusted) execution control flags. */
107 u32 vmx_pin_based_exec_control __read_mostly;
108 u32 vmx_cpu_based_exec_control __read_mostly;
109 u32 vmx_secondary_exec_control __read_mostly;
110 u32 vmx_vmexit_control __read_mostly;
111 u32 vmx_vmentry_control __read_mostly;
112 u64 vmx_ept_vpid_cap __read_mostly;
113 u64 vmx_vmfunc __read_mostly;
114 bool_t vmx_virt_exception __read_mostly;
115
116 static DEFINE_PER_CPU_READ_MOSTLY(paddr_t, vmxon_region);
117 static DEFINE_PER_CPU(paddr_t, current_vmcs);
118 static DEFINE_PER_CPU(struct list_head, active_vmcs_list);
119 DEFINE_PER_CPU(bool_t, vmxon);
120
121 static u32 vmcs_revision_id __read_mostly;
122 u64 __read_mostly vmx_basic_msr;
123
124 static void __init vmx_display_features(void)
125 {
126 int printed = 0;
127
128 printk("VMX: Supported advanced features:\n");
129
130 #define P(p,s) if ( p ) { printk(" - %s\n", s); printed = 1; }
131 P(cpu_has_vmx_virtualize_apic_accesses, "APIC MMIO access virtualisation");
132 P(cpu_has_vmx_tpr_shadow, "APIC TPR shadow");
133 P(cpu_has_vmx_ept, "Extended Page Tables (EPT)");
134 P(cpu_has_vmx_vpid, "Virtual-Processor Identifiers (VPID)");
135 P(cpu_has_vmx_vnmi, "Virtual NMI");
136 P(cpu_has_vmx_msr_bitmap, "MSR direct-access bitmap");
137 P(cpu_has_vmx_unrestricted_guest, "Unrestricted Guest");
138 P(cpu_has_vmx_apic_reg_virt, "APIC Register Virtualization");
139 P(cpu_has_vmx_virtual_intr_delivery, "Virtual Interrupt Delivery");
140 P(cpu_has_vmx_posted_intr_processing, "Posted Interrupt Processing");
141 P(cpu_has_vmx_vmcs_shadowing, "VMCS shadowing");
142 P(cpu_has_vmx_vmfunc, "VM Functions");
143 P(cpu_has_vmx_virt_exceptions, "Virtualisation Exceptions");
144 P(cpu_has_vmx_pml, "Page Modification Logging");
145 P(cpu_has_vmx_tsc_scaling, "TSC Scaling");
146 #undef P
147
148 if ( !printed )
149 printk(" - none\n");
150 }
151
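/*
 * Combine the required (ctl_min) and optional (ctl_opt) control bits with the
 * capability MSR: a 0 in the MSR's high word means the bit must be clear, and
 * a 1 in the low word means the bit must be set. Flag a mismatch if any
 * required bit turns out to be unsupported.
 */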
152 static u32 adjust_vmx_controls(
153 const char *name, u32 ctl_min, u32 ctl_opt, u32 msr, bool_t *mismatch)
154 {
155 u32 vmx_msr_low, vmx_msr_high, ctl = ctl_min | ctl_opt;
156
157 rdmsr(msr, vmx_msr_low, vmx_msr_high);
158
159 ctl &= vmx_msr_high; /* bit == 0 in high word ==> must be zero */
160 ctl |= vmx_msr_low; /* bit == 1 in low word ==> must be one */
161
162 /* Ensure minimum (required) set of control bits are supported. */
163 if ( ctl_min & ~ctl )
164 {
165 *mismatch = 1;
166 printk("VMX: CPU%d has insufficient %s (%08x; requires %08x)\n",
167 smp_processor_id(), name, ctl, ctl_min);
168 }
169
170 return ctl;
171 }
172
173 static bool_t cap_check(const char *name, u32 expected, u32 saw)
174 {
175 if ( saw != expected )
176 printk("VMX %s: saw %#x expected %#x\n", name, saw, expected);
177 return saw != expected;
178 }
179
180 static int vmx_init_vmcs_config(void)
181 {
182 u32 vmx_basic_msr_low, vmx_basic_msr_high, min, opt;
183 u32 _vmx_pin_based_exec_control;
184 u32 _vmx_cpu_based_exec_control;
185 u32 _vmx_secondary_exec_control = 0;
186 u64 _vmx_ept_vpid_cap = 0;
187 u64 _vmx_misc_cap = 0;
188 u32 _vmx_vmexit_control;
189 u32 _vmx_vmentry_control;
190 u64 _vmx_vmfunc = 0;
191 bool_t mismatch = 0;
192
193 rdmsr(MSR_IA32_VMX_BASIC, vmx_basic_msr_low, vmx_basic_msr_high);
194
195 min = (PIN_BASED_EXT_INTR_MASK |
196 PIN_BASED_NMI_EXITING);
197 opt = (PIN_BASED_VIRTUAL_NMIS |
198 PIN_BASED_POSTED_INTERRUPT);
199 _vmx_pin_based_exec_control = adjust_vmx_controls(
200 "Pin-Based Exec Control", min, opt,
201 MSR_IA32_VMX_PINBASED_CTLS, &mismatch);
202
203 min = (CPU_BASED_HLT_EXITING |
204 CPU_BASED_VIRTUAL_INTR_PENDING |
205 CPU_BASED_CR8_LOAD_EXITING |
206 CPU_BASED_CR8_STORE_EXITING |
207 CPU_BASED_INVLPG_EXITING |
208 CPU_BASED_CR3_LOAD_EXITING |
209 CPU_BASED_CR3_STORE_EXITING |
210 CPU_BASED_MONITOR_EXITING |
211 CPU_BASED_MWAIT_EXITING |
212 CPU_BASED_MOV_DR_EXITING |
213 CPU_BASED_ACTIVATE_IO_BITMAP |
214 CPU_BASED_USE_TSC_OFFSETING |
215 CPU_BASED_RDTSC_EXITING);
216 opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
217 CPU_BASED_TPR_SHADOW |
218 CPU_BASED_MONITOR_TRAP_FLAG |
219 CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
220 _vmx_cpu_based_exec_control = adjust_vmx_controls(
221 "CPU-Based Exec Control", min, opt,
222 MSR_IA32_VMX_PROCBASED_CTLS, &mismatch);
223 _vmx_cpu_based_exec_control &= ~CPU_BASED_RDTSC_EXITING;
224 if ( _vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW )
225 _vmx_cpu_based_exec_control &=
226 ~(CPU_BASED_CR8_LOAD_EXITING | CPU_BASED_CR8_STORE_EXITING);
227
228 if ( _vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
229 {
230 min = 0;
231 opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
232 SECONDARY_EXEC_WBINVD_EXITING |
233 SECONDARY_EXEC_ENABLE_EPT |
234 SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING |
235 SECONDARY_EXEC_ENABLE_RDTSCP |
236 SECONDARY_EXEC_PAUSE_LOOP_EXITING |
237 SECONDARY_EXEC_ENABLE_INVPCID |
238 SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
239 SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS |
240 SECONDARY_EXEC_XSAVES |
241 SECONDARY_EXEC_TSC_SCALING);
242 rdmsrl(MSR_IA32_VMX_MISC, _vmx_misc_cap);
243 if ( _vmx_misc_cap & VMX_MISC_VMWRITE_ALL )
244 opt |= SECONDARY_EXEC_ENABLE_VMCS_SHADOWING;
245 if ( opt_vpid_enabled )
246 opt |= SECONDARY_EXEC_ENABLE_VPID;
247 if ( opt_unrestricted_guest_enabled )
248 opt |= SECONDARY_EXEC_UNRESTRICTED_GUEST;
249 if ( opt_pml_enabled )
250 opt |= SECONDARY_EXEC_ENABLE_PML;
251
252 /*
253 * "APIC Register Virtualization" and "Virtual Interrupt Delivery"
254 * can be set only when "use TPR shadow" is set
255 */
256 if ( (_vmx_cpu_based_exec_control & CPU_BASED_TPR_SHADOW) &&
257 opt_apicv_enabled )
258 opt |= SECONDARY_EXEC_APIC_REGISTER_VIRT |
259 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY |
260 SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
261
262 _vmx_secondary_exec_control = adjust_vmx_controls(
263 "Secondary Exec Control", min, opt,
264 MSR_IA32_VMX_PROCBASED_CTLS2, &mismatch);
265 }
266
267 /* The IA32_VMX_EPT_VPID_CAP MSR exists only when EPT or VPID is available. */
268 if ( _vmx_secondary_exec_control & (SECONDARY_EXEC_ENABLE_EPT |
269 SECONDARY_EXEC_ENABLE_VPID) )
270 {
271 rdmsrl(MSR_IA32_VMX_EPT_VPID_CAP, _vmx_ept_vpid_cap);
272
273 if ( !opt_ept_ad )
274 _vmx_ept_vpid_cap &= ~VMX_EPT_AD_BIT;
275 else if ( /* Work around Erratum AVR41 on Avoton processors. */
276 boot_cpu_data.x86 == 6 && boot_cpu_data.x86_model == 0x4d &&
277 opt_ept_ad < 0 )
278 _vmx_ept_vpid_cap &= ~VMX_EPT_AD_BIT;
279
280 /*
281 * Additional sanity checking before using EPT:
282 * 1) the CPU we are running on must support the EPT WB memory type, as we
283 * will set the EPT paging-structure memory type to WB;
284 * 2) the CPU must support an EPT page-walk length of 4, according to
285 * Intel SDM 25.2.2;
286 * 3) the CPU must support INVEPT all-context invalidation, because we
287 * will use it as a last resort if other types are not supported.
288 *
289 * Or we just don't use EPT.
290 */
291 if ( !(_vmx_ept_vpid_cap & VMX_EPT_MEMORY_TYPE_WB) ||
292 !(_vmx_ept_vpid_cap & VMX_EPT_WALK_LENGTH_4_SUPPORTED) ||
293 !(_vmx_ept_vpid_cap & VMX_EPT_INVEPT_ALL_CONTEXT) )
294 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
295
296 /*
297 * The CPU must support INVVPID all-context invalidation, because we
298 * will use it as a last resort if other types are not supported.
299 *
300 * Or we just don't use VPID.
301 */
302 if ( !(_vmx_ept_vpid_cap & VMX_VPID_INVVPID_ALL_CONTEXT) )
303 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
304
305 /* EPT A/D bits are required for PML. */
306 if ( !(_vmx_ept_vpid_cap & VMX_EPT_AD_BIT) )
307 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
308 }
309
310 if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
311 {
312 /*
313 * To use EPT we expect to be able to clear certain intercepts.
314 * We check VMX_BASIC_MSR[55] to correctly handle default controls.
315 */
316 uint32_t must_be_one, must_be_zero, msr = MSR_IA32_VMX_PROCBASED_CTLS;
317 if ( vmx_basic_msr_high & (VMX_BASIC_DEFAULT1_ZERO >> 32) )
318 msr = MSR_IA32_VMX_TRUE_PROCBASED_CTLS;
319 rdmsr(msr, must_be_one, must_be_zero);
320 if ( must_be_one & (CPU_BASED_INVLPG_EXITING |
321 CPU_BASED_CR3_LOAD_EXITING |
322 CPU_BASED_CR3_STORE_EXITING) )
323 _vmx_secondary_exec_control &=
324 ~(SECONDARY_EXEC_ENABLE_EPT |
325 SECONDARY_EXEC_UNRESTRICTED_GUEST);
326 }
327
328 /* PML cannot be supported if EPT is not used */
329 if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT) )
330 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
331
332 /* Turn off opt_pml_enabled if PML feature is not present */
333 if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_PML) )
334 opt_pml_enabled = 0;
335
336 if ( (_vmx_secondary_exec_control & SECONDARY_EXEC_PAUSE_LOOP_EXITING) &&
337 ple_gap == 0 )
338 {
339 if ( !vmx_pin_based_exec_control )
340 printk(XENLOG_INFO "Disable Pause-Loop Exiting.\n");
341 _vmx_secondary_exec_control &= ~ SECONDARY_EXEC_PAUSE_LOOP_EXITING;
342 }
343
344 min = VM_EXIT_ACK_INTR_ON_EXIT;
345 opt = VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_LOAD_HOST_PAT |
346 VM_EXIT_CLEAR_BNDCFGS;
347 min |= VM_EXIT_IA32E_MODE;
348 _vmx_vmexit_control = adjust_vmx_controls(
349 "VMExit Control", min, opt, MSR_IA32_VMX_EXIT_CTLS, &mismatch);
350
351 /*
352 * "Process posted interrupt" can be set only when "virtual-interrupt
353 * delivery" and "acknowledge interrupt on exit" is set. For the latter
354 * is a minimal requirement, only check the former, which is optional.
355 */
356 if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY) )
357 _vmx_pin_based_exec_control &= ~PIN_BASED_POSTED_INTERRUPT;
358
359 if ( iommu_intpost &&
360 !(_vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT) )
361 {
362 printk("Intel VT-d Posted Interrupt is disabled for CPU-side Posted "
363 "Interrupt is not enabled\n");
364 iommu_intpost = 0;
365 }
366
367 /* The IA32_VMX_VMFUNC MSR exists only when VMFUNC is available */
368 if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VM_FUNCTIONS )
369 {
370 rdmsrl(MSR_IA32_VMX_VMFUNC, _vmx_vmfunc);
371
372 /*
373 * VMFUNC leaf 0 (EPTP switching) must be supported.
374 *
375 * Or we just don't use VMFUNC.
376 */
377 if ( !(_vmx_vmfunc & VMX_VMFUNC_EPTP_SWITCHING) )
378 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VM_FUNCTIONS;
379 }
380
381 /* Virtualization exceptions are only enabled if VMFUNC is enabled */
382 if ( !(_vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
383 _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS;
384
385 min = 0;
386 opt = VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_BNDCFGS;
387 _vmx_vmentry_control = adjust_vmx_controls(
388 "VMEntry Control", min, opt, MSR_IA32_VMX_ENTRY_CTLS, &mismatch);
389
390 if ( mismatch )
391 return -EINVAL;
392
393 if ( !vmx_pin_based_exec_control )
394 {
395 /* First time through. */
396 vmcs_revision_id = vmx_basic_msr_low & VMX_BASIC_REVISION_MASK;
397 vmx_pin_based_exec_control = _vmx_pin_based_exec_control;
398 vmx_cpu_based_exec_control = _vmx_cpu_based_exec_control;
399 vmx_secondary_exec_control = _vmx_secondary_exec_control;
400 vmx_ept_vpid_cap = _vmx_ept_vpid_cap;
401 vmx_vmexit_control = _vmx_vmexit_control;
402 vmx_vmentry_control = _vmx_vmentry_control;
403 vmx_basic_msr = ((u64)vmx_basic_msr_high << 32) |
404 vmx_basic_msr_low;
405 vmx_vmfunc = _vmx_vmfunc;
406 vmx_virt_exception = !!(_vmx_secondary_exec_control &
407 SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);
408 vmx_display_features();
409
410 /* IA-32 SDM Vol 3B: VMCS size is never greater than 4kB. */
411 if ( (vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)) >
412 PAGE_SIZE )
413 {
414 printk("VMX: CPU%d VMCS size is too big (%Lu bytes)\n",
415 smp_processor_id(),
416 vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32));
417 return -EINVAL;
418 }
419 }
420 else
421 {
422 /* Globals are already initialised: re-check them. */
423 mismatch |= cap_check(
424 "VMCS revision ID",
425 vmcs_revision_id, vmx_basic_msr_low & VMX_BASIC_REVISION_MASK);
426 mismatch |= cap_check(
427 "Pin-Based Exec Control",
428 vmx_pin_based_exec_control, _vmx_pin_based_exec_control);
429 mismatch |= cap_check(
430 "CPU-Based Exec Control",
431 vmx_cpu_based_exec_control, _vmx_cpu_based_exec_control);
432 mismatch |= cap_check(
433 "Secondary Exec Control",
434 vmx_secondary_exec_control, _vmx_secondary_exec_control);
435 mismatch |= cap_check(
436 "VMExit Control",
437 vmx_vmexit_control, _vmx_vmexit_control);
438 mismatch |= cap_check(
439 "VMEntry Control",
440 vmx_vmentry_control, _vmx_vmentry_control);
441 mismatch |= cap_check(
442 "EPT and VPID Capability",
443 vmx_ept_vpid_cap, _vmx_ept_vpid_cap);
444 mismatch |= cap_check(
445 "VMFUNC Capability",
446 vmx_vmfunc, _vmx_vmfunc);
447 if ( cpu_has_vmx_ins_outs_instr_info !=
448 !!(vmx_basic_msr_high & (VMX_BASIC_INS_OUT_INFO >> 32)) )
449 {
450 printk("VMX INS/OUTS Instruction Info: saw %d expected %d\n",
451 !!(vmx_basic_msr_high & (VMX_BASIC_INS_OUT_INFO >> 32)),
452 cpu_has_vmx_ins_outs_instr_info);
453 mismatch = 1;
454 }
455 if ( (vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32)) !=
456 ((vmx_basic_msr & VMX_BASIC_VMCS_SIZE_MASK) >> 32) )
457 {
458 printk("VMX: CPU%d unexpected VMCS size %Lu\n",
459 smp_processor_id(),
460 vmx_basic_msr_high & (VMX_BASIC_VMCS_SIZE_MASK >> 32));
461 mismatch = 1;
462 }
463 if ( mismatch )
464 {
465 printk("VMX: Capabilities fatally differ between CPU%d and CPU0\n",
466 smp_processor_id());
467 return -EINVAL;
468 }
469 }
470
471 /* IA-32 SDM Vol 3B: 64-bit CPUs always have VMX_BASIC_MSR[48]==0. */
472 if ( vmx_basic_msr_high & (VMX_BASIC_32BIT_ADDRESSES >> 32) )
473 {
474 printk("VMX: CPU%d limits VMX structure pointers to 32 bits\n",
475 smp_processor_id());
476 return -EINVAL;
477 }
478
479 /* Require Write-Back (WB) memory type for VMCS accesses. */
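/*
 * The division below extracts the memory-type field: dividing by the mask's
 * lowest set bit shifts the masked value down to bit 0 (a MASK_EXTR()-style
 * operation).
 */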
480 opt = (vmx_basic_msr_high & (VMX_BASIC_MEMORY_TYPE_MASK >> 32)) /
481 ((VMX_BASIC_MEMORY_TYPE_MASK & -VMX_BASIC_MEMORY_TYPE_MASK) >> 32);
482 if ( opt != MTRR_TYPE_WRBACK )
483 {
484 printk("VMX: CPU%d has unexpected VMCS access type %u\n",
485 smp_processor_id(), opt);
486 return -EINVAL;
487 }
488
489 return 0;
490 }
491
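/*
 * Allocate a page to serve as a VMCS (or VMXON) region and stamp it with the
 * VMCS revision identifier required by the hardware. Returns 0 on allocation
 * failure.
 */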
492 static paddr_t vmx_alloc_vmcs(void)
493 {
494 struct page_info *pg;
495 struct vmcs_struct *vmcs;
496
497 if ( (pg = alloc_domheap_page(NULL, 0)) == NULL )
498 {
499 gdprintk(XENLOG_WARNING, "Failed to allocate VMCS.\n");
500 return 0;
501 }
502
503 vmcs = __map_domain_page(pg);
504 clear_page(vmcs);
505 vmcs->vmcs_revision_id = vmcs_revision_id;
506 unmap_domain_page(vmcs);
507
508 return page_to_maddr(pg);
509 }
510
511 static void vmx_free_vmcs(paddr_t pa)
512 {
513 free_domheap_page(maddr_to_page(pa));
514 }
515
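/*
 * Flush a vCPU's VMCS back to memory. This must run on the CPU where the
 * VMCS is active: it is invoked via IPI from vmx_clear_vmcs() and directly
 * from vmx_cpu_down().
 */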
516 static void __vmx_clear_vmcs(void *info)
517 {
518 struct vcpu *v = info;
519 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
520
521 /* Otherwise we can nest (vmx_cpu_down() vs. vmx_clear_vmcs()). */
522 ASSERT(!local_irq_is_enabled());
523
524 if ( arch_vmx->active_cpu == smp_processor_id() )
525 {
526 __vmpclear(arch_vmx->vmcs_pa);
527 if ( arch_vmx->vmcs_shadow_maddr )
528 __vmpclear(arch_vmx->vmcs_shadow_maddr);
529
530 arch_vmx->active_cpu = -1;
531 arch_vmx->launched = 0;
532
533 list_del(&arch_vmx->active_list);
534
535 if ( arch_vmx->vmcs_pa == this_cpu(current_vmcs) )
536 this_cpu(current_vmcs) = 0;
537 }
538 }
539
540 static void vmx_clear_vmcs(struct vcpu *v)
541 {
542 int cpu = v->arch.hvm_vmx.active_cpu;
543
544 if ( cpu != -1 )
545 on_selected_cpus(cpumask_of(cpu), __vmx_clear_vmcs, v, 1);
546 }
547
548 static void vmx_load_vmcs(struct vcpu *v)
549 {
550 unsigned long flags;
551
552 local_irq_save(flags);
553
554 if ( v->arch.hvm_vmx.active_cpu == -1 )
555 {
556 list_add(&v->arch.hvm_vmx.active_list, &this_cpu(active_vmcs_list));
557 v->arch.hvm_vmx.active_cpu = smp_processor_id();
558 }
559
560 ASSERT(v->arch.hvm_vmx.active_cpu == smp_processor_id());
561
562 __vmptrld(v->arch.hvm_vmx.vmcs_pa);
563 this_cpu(current_vmcs) = v->arch.hvm_vmx.vmcs_pa;
564
565 local_irq_restore(flags);
566 }
567
568 void vmx_vmcs_reload(struct vcpu *v)
569 {
570 /*
571 * As we may be running with interrupts disabled, we can't acquire
572 * v->arch.hvm_vmx.vmcs_lock here. However, with interrupts disabled
573 * the VMCS can't be taken away from us anymore if we still own it.
574 */
575 ASSERT(v->is_running || !local_irq_is_enabled());
576 if ( v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs) )
577 return;
578
579 vmx_load_vmcs(v);
580 }
581
582 int vmx_cpu_up_prepare(unsigned int cpu)
583 {
584 /*
585 * If nvmx_cpu_up_prepare() fails, do not return failure; just fall back
586 * to legacy mode for vvmcs synchronization.
587 */
588 if ( nvmx_cpu_up_prepare(cpu) != 0 )
589 printk("CPU%d: Could not allocate virtual VMCS buffer.\n", cpu);
590
591 if ( per_cpu(vmxon_region, cpu) )
592 return 0;
593
594 per_cpu(vmxon_region, cpu) = vmx_alloc_vmcs();
595 if ( per_cpu(vmxon_region, cpu) )
596 return 0;
597
598 printk("CPU%d: Could not allocate host VMCS\n", cpu);
599 nvmx_cpu_dead(cpu);
600 return -ENOMEM;
601 }
602
603 void vmx_cpu_dead(unsigned int cpu)
604 {
605 vmx_free_vmcs(per_cpu(vmxon_region, cpu));
606 per_cpu(vmxon_region, cpu) = 0;
607 nvmx_cpu_dead(cpu);
608 vmx_pi_desc_fixup(cpu);
609 }
610
611 int _vmx_cpu_up(bool bsp)
612 {
613 u32 eax, edx;
614 int rc, bios_locked, cpu = smp_processor_id();
615 u64 cr0, vmx_cr0_fixed0, vmx_cr0_fixed1;
616
617 BUG_ON(!(read_cr4() & X86_CR4_VMXE));
618
619 /*
620 * Ensure the current processor operating mode meets
621 * the required CR0 fixed bits for VMX operation.
622 */
623 cr0 = read_cr0();
624 rdmsrl(MSR_IA32_VMX_CR0_FIXED0, vmx_cr0_fixed0);
625 rdmsrl(MSR_IA32_VMX_CR0_FIXED1, vmx_cr0_fixed1);
626 if ( (~cr0 & vmx_cr0_fixed0) || (cr0 & ~vmx_cr0_fixed1) )
627 {
628 printk("CPU%d: some settings of host CR0 are "
629 "not allowed in VMX operation.\n", cpu);
630 return -EINVAL;
631 }
632
633 rdmsr(MSR_IA32_FEATURE_CONTROL, eax, edx);
634
635 bios_locked = !!(eax & IA32_FEATURE_CONTROL_LOCK);
636 if ( bios_locked )
637 {
638 if ( !(eax & (tboot_in_measured_env()
639 ? IA32_FEATURE_CONTROL_ENABLE_VMXON_INSIDE_SMX
640 : IA32_FEATURE_CONTROL_ENABLE_VMXON_OUTSIDE_SMX)) )
641 {
642 printk("CPU%d: VMX disabled by BIOS.\n", cpu);
643 return -EINVAL;
644 }
645 }
646 else
647 {
648 eax = IA32_FEATURE_CONTROL_LOCK;
649 eax |= IA32_FEATURE_CONTROL_ENABLE_VMXON_OUTSIDE_SMX;
650 if ( test_bit(X86_FEATURE_SMX, &boot_cpu_data.x86_capability) )
651 eax |= IA32_FEATURE_CONTROL_ENABLE_VMXON_INSIDE_SMX;
652 wrmsr(MSR_IA32_FEATURE_CONTROL, eax, 0);
653 }
654
655 if ( (rc = vmx_init_vmcs_config()) != 0 )
656 return rc;
657
658 INIT_LIST_HEAD(&this_cpu(active_vmcs_list));
659
660 if ( bsp && (rc = vmx_cpu_up_prepare(cpu)) != 0 )
661 return rc;
662
663 switch ( __vmxon(this_cpu(vmxon_region)) )
664 {
665 case -2: /* #UD or #GP */
666 if ( bios_locked &&
667 test_bit(X86_FEATURE_SMX, &boot_cpu_data.x86_capability) &&
668 (!(eax & IA32_FEATURE_CONTROL_ENABLE_VMXON_OUTSIDE_SMX) ||
669 !(eax & IA32_FEATURE_CONTROL_ENABLE_VMXON_INSIDE_SMX)) )
670 {
671 printk("CPU%d: VMXON failed: perhaps because of TXT settings "
672 "in your BIOS configuration?\n", cpu);
673 printk(" --> Disable TXT in your BIOS unless using a secure "
674 "bootloader.\n");
675 return -EINVAL;
676 }
677 /* fall through */
678 case -1: /* CF==1 or ZF==1 */
679 printk("CPU%d: unexpected VMXON failure\n", cpu);
680 return -EINVAL;
681 case 0: /* success */
682 this_cpu(vmxon) = 1;
683 break;
684 default:
685 BUG();
686 }
687
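/* Seed the ASID allocator with the number of VPIDs available (0 if none). */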
688 hvm_asid_init(cpu_has_vmx_vpid ? (1u << VMCS_VPID_WIDTH) : 0);
689
690 if ( cpu_has_vmx_ept )
691 ept_sync_all();
692
693 if ( cpu_has_vmx_vpid )
694 vpid_sync_all();
695
696 vmx_pi_per_cpu_init(cpu);
697
698 return 0;
699 }
700
701 int vmx_cpu_up()
702 {
703 return _vmx_cpu_up(false);
704 }
705
706 void vmx_cpu_down(void)
707 {
708 struct list_head *active_vmcs_list = &this_cpu(active_vmcs_list);
709 unsigned long flags;
710
711 if ( !this_cpu(vmxon) )
712 return;
713
714 local_irq_save(flags);
715
716 while ( !list_empty(active_vmcs_list) )
717 __vmx_clear_vmcs(list_entry(active_vmcs_list->next,
718 struct vcpu, arch.hvm_vmx.active_list));
719
720 BUG_ON(!(read_cr4() & X86_CR4_VMXE));
721 this_cpu(vmxon) = 0;
722 __vmxoff();
723
724 local_irq_restore(flags);
725 }
726
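/*
 * Book-keeping for temporarily loading another vCPU's VMCS on this pCPU via
 * vmx_vmcs_enter()/vmx_vmcs_exit(). Typical usage (sketch; SOME_FIELD is a
 * placeholder for any VMCS field):
 *
 *     vmx_vmcs_enter(v);
 *     __vmwrite(SOME_FIELD, val);
 *     vmx_vmcs_exit(v);
 */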
727 struct foreign_vmcs {
728 struct vcpu *v;
729 unsigned int count;
730 };
731 static DEFINE_PER_CPU(struct foreign_vmcs, foreign_vmcs);
732
733 bool_t vmx_vmcs_try_enter(struct vcpu *v)
734 {
735 struct foreign_vmcs *fv;
736
737 /*
738 * NB. We must *always* run an HVM VCPU on its own VMCS, except for
739 * vmx_vmcs_enter/exit and scheduling tail critical regions.
740 */
741 if ( likely(v == current) )
742 return v->arch.hvm_vmx.vmcs_pa == this_cpu(current_vmcs);
743
744 fv = &this_cpu(foreign_vmcs);
745
746 if ( fv->v == v )
747 {
748 BUG_ON(fv->count == 0);
749 }
750 else
751 {
752 BUG_ON(fv->v != NULL);
753 BUG_ON(fv->count != 0);
754
755 vcpu_pause(v);
756 spin_lock(&v->arch.hvm_vmx.vmcs_lock);
757
758 vmx_clear_vmcs(v);
759 vmx_load_vmcs(v);
760
761 fv->v = v;
762 }
763
764 fv->count++;
765
766 return 1;
767 }
768
769 void vmx_vmcs_enter(struct vcpu *v)
770 {
771 bool_t okay = vmx_vmcs_try_enter(v);
772
773 ASSERT(okay);
774 }
775
776 void vmx_vmcs_exit(struct vcpu *v)
777 {
778 struct foreign_vmcs *fv;
779
780 if ( likely(v == current) )
781 return;
782
783 fv = &this_cpu(foreign_vmcs);
784 BUG_ON(fv->v != v);
785 BUG_ON(fv->count == 0);
786
787 if ( --fv->count == 0 )
788 {
789 /* Don't confuse vmx_do_resume (for @v or @current!) */
790 vmx_clear_vmcs(v);
791 if ( is_hvm_vcpu(current) )
792 vmx_load_vmcs(current);
793
794 spin_unlock(&v->arch.hvm_vmx.vmcs_lock);
795 vcpu_unpause(v);
796
797 fv->v = NULL;
798 }
799 }
800
801 static void vmx_set_host_env(struct vcpu *v)
802 {
803 unsigned int cpu = smp_processor_id();
804
805 __vmwrite(HOST_GDTR_BASE,
806 (unsigned long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY));
807 __vmwrite(HOST_IDTR_BASE, (unsigned long)idt_tables[cpu]);
808
809 __vmwrite(HOST_TR_SELECTOR, TSS_ENTRY << 3);
810 __vmwrite(HOST_TR_BASE, (unsigned long)&per_cpu(init_tss, cpu));
811
812 __vmwrite(HOST_SYSENTER_ESP, get_stack_bottom());
813
814 /*
815 * Skip end of cpu_user_regs when entering the hypervisor because the
816 * CPU does not save context onto the stack. SS, RSP, CS, RIP, RFLAGS, etc.
817 * all get saved into the VMCS instead.
818 */
819 __vmwrite(HOST_RSP,
820 (unsigned long)&get_cpu_info()->guest_cpu_user_regs.error_code);
821 }
822
823 void vmx_clear_msr_intercept(struct vcpu *v, unsigned int msr,
824 enum vmx_msr_intercept_type type)
825 {
826 struct vmx_msr_bitmap *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
827 struct domain *d = v->domain;
828
829 /* VMX MSR bitmap supported? */
830 if ( msr_bitmap == NULL )
831 return;
832
833 if ( unlikely(monitored_msr(d, msr)) )
834 return;
835
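/*
 * The bitmap only covers MSRs 0x00000000-0x00001fff and
 * 0xc0000000-0xc0001fff; accesses to any other MSR are always intercepted.
 */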
836 if ( msr <= 0x1fff )
837 {
838 if ( type & VMX_MSR_R )
839 clear_bit(msr, msr_bitmap->read_low);
840 if ( type & VMX_MSR_W )
841 clear_bit(msr, msr_bitmap->write_low);
842 }
843 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
844 {
845 msr &= 0x1fff;
846 if ( type & VMX_MSR_R )
847 clear_bit(msr, msr_bitmap->read_high);
848 if ( type & VMX_MSR_W )
849 clear_bit(msr, msr_bitmap->write_high);
850 }
851 else
852 ASSERT(!"MSR out of range for interception\n");
853 }
854
855 void vmx_set_msr_intercept(struct vcpu *v, unsigned int msr,
856 enum vmx_msr_intercept_type type)
857 {
858 struct vmx_msr_bitmap *msr_bitmap = v->arch.hvm_vmx.msr_bitmap;
859
860 /* VMX MSR bitmap supported? */
861 if ( msr_bitmap == NULL )
862 return;
863
864 if ( msr <= 0x1fff )
865 {
866 if ( type & VMX_MSR_R )
867 set_bit(msr, msr_bitmap->read_low);
868 if ( type & VMX_MSR_W )
869 set_bit(msr, msr_bitmap->write_low);
870 }
871 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
872 {
873 msr &= 0x1fff;
874 if ( type & VMX_MSR_R )
875 set_bit(msr, msr_bitmap->read_high);
876 if ( type & VMX_MSR_W )
877 set_bit(msr, msr_bitmap->write_high);
878 }
879 else
880 ASSERT(!"MSR out of range for interception\n");
881 }
882
883 bool vmx_msr_is_intercepted(struct vmx_msr_bitmap *msr_bitmap,
884 unsigned int msr, bool is_write)
885 {
886 if ( msr <= 0x1fff )
887 return test_bit(msr, is_write ? msr_bitmap->write_low
888 : msr_bitmap->read_low);
889 else if ( (msr >= 0xc0000000) && (msr <= 0xc0001fff) )
890 return test_bit(msr & 0x1fff, is_write ? msr_bitmap->write_high
891 : msr_bitmap->read_high);
892 else
893 /* MSRs outside the bitmap ranges are always intercepted. */
894 return true;
895 }
896
897
898 /*
899 * Switch the VMCS between the layer-1 and layer-2 guest.
900 */
901 void vmx_vmcs_switch(paddr_t from, paddr_t to)
902 {
903 struct arch_vmx_struct *vmx = &current->arch.hvm_vmx;
904 spin_lock(&vmx->vmcs_lock);
905
906 __vmpclear(from);
907 if ( vmx->vmcs_shadow_maddr )
908 __vmpclear(vmx->vmcs_shadow_maddr);
909 __vmptrld(to);
910
911 vmx->vmcs_pa = to;
912 vmx->launched = 0;
913 this_cpu(current_vmcs) = to;
914
915 if ( vmx->hostenv_migrated )
916 {
917 vmx->hostenv_migrated = 0;
918 vmx_set_host_env(current);
919 }
920
921 spin_unlock(&vmx->vmcs_lock);
922 }
923
924 void virtual_vmcs_enter(const struct vcpu *v)
925 {
926 __vmptrld(v->arch.hvm_vmx.vmcs_shadow_maddr);
927 }
928
929 void virtual_vmcs_exit(const struct vcpu *v)
930 {
931 paddr_t cur = this_cpu(current_vmcs);
932
933 __vmpclear(v->arch.hvm_vmx.vmcs_shadow_maddr);
934 if ( cur )
935 __vmptrld(cur);
936 }
937
938 u64 virtual_vmcs_vmread(const struct vcpu *v, u32 vmcs_encoding)
939 {
940 u64 res;
941
942 virtual_vmcs_enter(v);
943 __vmread(vmcs_encoding, &res);
944 virtual_vmcs_exit(v);
945
946 return res;
947 }
948
949 enum vmx_insn_errno virtual_vmcs_vmread_safe(const struct vcpu *v,
950 u32 vmcs_encoding, u64 *val)
951 {
952 enum vmx_insn_errno ret;
953
954 virtual_vmcs_enter(v);
955 ret = vmread_safe(vmcs_encoding, val);
956 virtual_vmcs_exit(v);
957
958 return ret;
959 }
960
961 void virtual_vmcs_vmwrite(const struct vcpu *v, u32 vmcs_encoding, u64 val)
962 {
963 virtual_vmcs_enter(v);
964 __vmwrite(vmcs_encoding, val);
965 virtual_vmcs_exit(v);
966 }
967
968 enum vmx_insn_errno virtual_vmcs_vmwrite_safe(const struct vcpu *v,
969 u32 vmcs_encoding, u64 val)
970 {
971 enum vmx_insn_errno ret;
972
973 virtual_vmcs_enter(v);
974 ret = vmwrite_safe(vmcs_encoding, val);
975 virtual_vmcs_exit(v);
976
977 return ret;
978 }
979
980 /*
981 * This function is only called in a vCPU's initialization phase,
982 * so we can update the posted-interrupt descriptor in a non-atomic way.
983 */
984 static void pi_desc_init(struct vcpu *v)
985 {
986 v->arch.hvm_vmx.pi_desc.nv = posted_intr_vector;
987
988 /*
989 * Mark NDST as invalid; this invalid value then serves as a marker in
990 * vmx_pi_hooks_assign() for whether NDST needs to be updated.
991 */
992 v->arch.hvm_vmx.pi_desc.ndst = APIC_INVALID_DEST;
993 }
994
995 static int construct_vmcs(struct vcpu *v)
996 {
997 struct domain *d = v->domain;
998 uint16_t sysenter_cs;
999 unsigned long sysenter_eip;
1000 u32 vmexit_ctl = vmx_vmexit_control;
1001 u32 vmentry_ctl = vmx_vmentry_control;
1002
1003 vmx_vmcs_enter(v);
1004
1005 /* VMCS controls. */
1006 __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
1007
1008 v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
1009 if ( d->arch.vtsc && !cpu_has_vmx_tsc_scaling )
1010 v->arch.hvm_vmx.exec_control |= CPU_BASED_RDTSC_EXITING;
1011
1012 v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
1013
1014 /*
1015 * Disable descriptor-table exiting: it is only enabled when the VM event
1016 * monitor requests it.
1017 */
1018 v->arch.hvm_vmx.secondary_exec_control &=
1019 ~SECONDARY_EXEC_DESCRIPTOR_TABLE_EXITING;
1020
1021 /* Disable VPID for now: we decide when to enable it on VMENTER. */
1022 v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_VPID;
1023
1024 if ( paging_mode_hap(d) )
1025 {
1026 v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
1027 CPU_BASED_CR3_LOAD_EXITING |
1028 CPU_BASED_CR3_STORE_EXITING);
1029 }
1030 else
1031 {
1032 v->arch.hvm_vmx.secondary_exec_control &=
1033 ~(SECONDARY_EXEC_ENABLE_EPT |
1034 SECONDARY_EXEC_UNRESTRICTED_GUEST |
1035 SECONDARY_EXEC_ENABLE_INVPCID);
1036 vmexit_ctl &= ~(VM_EXIT_SAVE_GUEST_PAT |
1037 VM_EXIT_LOAD_HOST_PAT);
1038 vmentry_ctl &= ~VM_ENTRY_LOAD_GUEST_PAT;
1039 }
1040
1041 /* Disable Virtualize x2APIC mode by default. */
1042 v->arch.hvm_vmx.secondary_exec_control &=
1043 ~SECONDARY_EXEC_VIRTUALIZE_X2APIC_MODE;
1044
1045 /* Do not enable the Monitor Trap Flag until single-step debugging starts. */
1046 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_MONITOR_TRAP_FLAG;
1047
1048 /* Disable VMFUNC and #VE for now: they may be enabled later by altp2m. */
1049 v->arch.hvm_vmx.secondary_exec_control &=
1050 ~(SECONDARY_EXEC_ENABLE_VM_FUNCTIONS |
1051 SECONDARY_EXEC_ENABLE_VIRT_EXCEPTIONS);
1052
1053 if ( !has_vlapic(d) )
1054 {
1055 /* Disable virtual apics, TPR */
1056 v->arch.hvm_vmx.secondary_exec_control &=
1057 ~(SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES
1058 | SECONDARY_EXEC_APIC_REGISTER_VIRT
1059 | SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY);
1060 v->arch.hvm_vmx.exec_control &= ~CPU_BASED_TPR_SHADOW;
1061
1062 /* In turn, disable posted interrupts. */
1063 __vmwrite(PIN_BASED_VM_EXEC_CONTROL,
1064 vmx_pin_based_exec_control & ~PIN_BASED_POSTED_INTERRUPT);
1065 }
1066
1067 vmx_update_cpu_exec_control(v);
1068
1069 __vmwrite(VM_EXIT_CONTROLS, vmexit_ctl);
1070 __vmwrite(VM_ENTRY_CONTROLS, vmentry_ctl);
1071
1072 if ( cpu_has_vmx_ple )
1073 {
1074 __vmwrite(PLE_GAP, ple_gap);
1075 __vmwrite(PLE_WINDOW, ple_window);
1076 }
1077
1078 if ( cpu_has_vmx_secondary_exec_control )
1079 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1080 v->arch.hvm_vmx.secondary_exec_control);
1081
1082 /* MSR access bitmap. */
1083 if ( cpu_has_vmx_msr_bitmap )
1084 {
1085 struct vmx_msr_bitmap *msr_bitmap = alloc_xenheap_page();
1086
1087 if ( msr_bitmap == NULL )
1088 {
1089 vmx_vmcs_exit(v);
1090 return -ENOMEM;
1091 }
1092
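/* Start with every MSR access intercepted; selected MSRs are opened up below. */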
1093 memset(msr_bitmap, ~0, PAGE_SIZE);
1094 v->arch.hvm_vmx.msr_bitmap = msr_bitmap;
1095 __vmwrite(MSR_BITMAP, virt_to_maddr(msr_bitmap));
1096
1097 vmx_clear_msr_intercept(v, MSR_FS_BASE, VMX_MSR_RW);
1098 vmx_clear_msr_intercept(v, MSR_GS_BASE, VMX_MSR_RW);
1099 vmx_clear_msr_intercept(v, MSR_SHADOW_GS_BASE, VMX_MSR_RW);
1100 vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_CS, VMX_MSR_RW);
1101 vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_ESP, VMX_MSR_RW);
1102 vmx_clear_msr_intercept(v, MSR_IA32_SYSENTER_EIP, VMX_MSR_RW);
1103 if ( paging_mode_hap(d) && (!iommu_enabled || iommu_snoop) )
1104 vmx_clear_msr_intercept(v, MSR_IA32_CR_PAT, VMX_MSR_RW);
1105 if ( (vmexit_ctl & VM_EXIT_CLEAR_BNDCFGS) &&
1106 (vmentry_ctl & VM_ENTRY_LOAD_BNDCFGS) )
1107 vmx_clear_msr_intercept(v, MSR_IA32_BNDCFGS, VMX_MSR_RW);
1108 }
1109
1110 /* I/O access bitmap. */
1111 __vmwrite(IO_BITMAP_A, __pa(d->arch.hvm_domain.io_bitmap));
1112 __vmwrite(IO_BITMAP_B, __pa(d->arch.hvm_domain.io_bitmap) + PAGE_SIZE);
1113
1114 if ( cpu_has_vmx_virtual_intr_delivery )
1115 {
1116 unsigned int i;
1117
1118 /* EOI-exit bitmap */
1119 bitmap_zero(v->arch.hvm_vmx.eoi_exit_bitmap, NR_VECTORS);
1120 for ( i = 0; i < ARRAY_SIZE(v->arch.hvm_vmx.eoi_exit_bitmap); ++i )
1121 __vmwrite(EOI_EXIT_BITMAP(i), 0);
1122
1123 /* Initialise Guest Interrupt Status (RVI and SVI) to 0 */
1124 __vmwrite(GUEST_INTR_STATUS, 0);
1125 }
1126
1127 if ( cpu_has_vmx_posted_intr_processing )
1128 {
1129 if ( iommu_intpost )
1130 pi_desc_init(v);
1131
1132 __vmwrite(PI_DESC_ADDR, virt_to_maddr(&v->arch.hvm_vmx.pi_desc));
1133 __vmwrite(POSTED_INTR_NOTIFICATION_VECTOR, posted_intr_vector);
1134 }
1135
1136 /* Disable PML here anyway, as it will only be enabled in log-dirty mode. */
1137 v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
1138
1139 /* Host data selectors. */
1140 __vmwrite(HOST_SS_SELECTOR, __HYPERVISOR_DS);
1141 __vmwrite(HOST_DS_SELECTOR, __HYPERVISOR_DS);
1142 __vmwrite(HOST_ES_SELECTOR, __HYPERVISOR_DS);
1143 __vmwrite(HOST_FS_SELECTOR, 0);
1144 __vmwrite(HOST_GS_SELECTOR, 0);
1145 __vmwrite(HOST_FS_BASE, 0);
1146 __vmwrite(HOST_GS_BASE, 0);
1147
1148 /* Host control registers. */
1149 v->arch.hvm_vmx.host_cr0 = read_cr0() | X86_CR0_TS;
1150 __vmwrite(HOST_CR0, v->arch.hvm_vmx.host_cr0);
1151 __vmwrite(HOST_CR4, mmu_cr4_features);
1152
1153 /* Host CS:RIP. */
1154 __vmwrite(HOST_CS_SELECTOR, __HYPERVISOR_CS);
1155 __vmwrite(HOST_RIP, (unsigned long)vmx_asm_vmexit_handler);
1156
1157 /* Host SYSENTER CS:RIP. */
1158 rdmsrl(MSR_IA32_SYSENTER_CS, sysenter_cs);
1159 __vmwrite(HOST_SYSENTER_CS, sysenter_cs);
1160 rdmsrl(MSR_IA32_SYSENTER_EIP, sysenter_eip);
1161 __vmwrite(HOST_SYSENTER_EIP, sysenter_eip);
1162
1163 /* MSR intercepts. */
1164 __vmwrite(VM_EXIT_MSR_LOAD_COUNT, 0);
1165 __vmwrite(VM_EXIT_MSR_STORE_COUNT, 0);
1166 __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, 0);
1167
1168 __vmwrite(VM_ENTRY_INTR_INFO, 0);
1169
1170 __vmwrite(CR0_GUEST_HOST_MASK, ~0UL);
1171 __vmwrite(CR4_GUEST_HOST_MASK, ~0UL);
1172
1173 __vmwrite(PAGE_FAULT_ERROR_CODE_MASK, 0);
1174 __vmwrite(PAGE_FAULT_ERROR_CODE_MATCH, 0);
1175
1176 __vmwrite(CR3_TARGET_COUNT, 0);
1177
1178 __vmwrite(GUEST_ACTIVITY_STATE, 0);
1179
1180 /* Guest segment bases. */
1181 __vmwrite(GUEST_ES_BASE, 0);
1182 __vmwrite(GUEST_SS_BASE, 0);
1183 __vmwrite(GUEST_DS_BASE, 0);
1184 __vmwrite(GUEST_FS_BASE, 0);
1185 __vmwrite(GUEST_GS_BASE, 0);
1186 __vmwrite(GUEST_CS_BASE, 0);
1187
1188 /* Guest segment limits. */
1189 __vmwrite(GUEST_ES_LIMIT, ~0u);
1190 __vmwrite(GUEST_SS_LIMIT, ~0u);
1191 __vmwrite(GUEST_DS_LIMIT, ~0u);
1192 __vmwrite(GUEST_FS_LIMIT, ~0u);
1193 __vmwrite(GUEST_GS_LIMIT, ~0u);
1194 __vmwrite(GUEST_CS_LIMIT, ~0u);
1195
1196 /* Guest segment AR bytes. */
1197 __vmwrite(GUEST_ES_AR_BYTES, 0xc093); /* read/write, accessed */
1198 __vmwrite(GUEST_SS_AR_BYTES, 0xc093);
1199 __vmwrite(GUEST_DS_AR_BYTES, 0xc093);
1200 __vmwrite(GUEST_FS_AR_BYTES, 0xc093);
1201 __vmwrite(GUEST_GS_AR_BYTES, 0xc093);
1202 __vmwrite(GUEST_CS_AR_BYTES, 0xc09b); /* exec/read, accessed */
1203
1204 /* Guest IDT. */
1205 __vmwrite(GUEST_IDTR_BASE, 0);
1206 __vmwrite(GUEST_IDTR_LIMIT, 0);
1207
1208 /* Guest GDT. */
1209 __vmwrite(GUEST_GDTR_BASE, 0);
1210 __vmwrite(GUEST_GDTR_LIMIT, 0);
1211
1212 /* Guest LDT. */
1213 __vmwrite(GUEST_LDTR_AR_BYTES, 0x0082); /* LDT */
1214 __vmwrite(GUEST_LDTR_SELECTOR, 0);
1215 __vmwrite(GUEST_LDTR_BASE, 0);
1216 __vmwrite(GUEST_LDTR_LIMIT, 0);
1217
1218 /* Guest TSS. */
1219 __vmwrite(GUEST_TR_AR_BYTES, 0x008b); /* 32-bit TSS (busy) */
1220 __vmwrite(GUEST_TR_BASE, 0);
1221 __vmwrite(GUEST_TR_LIMIT, 0xff);
1222
1223 __vmwrite(GUEST_INTERRUPTIBILITY_INFO, 0);
1224 __vmwrite(GUEST_DR7, 0);
1225 __vmwrite(VMCS_LINK_POINTER, ~0UL);
1226
1227 v->arch.hvm_vmx.exception_bitmap = HVM_TRAP_MASK
1228 | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
1229 | (1U << TRAP_no_device);
1230 vmx_update_exception_bitmap(v);
1231
1232 v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
1233 hvm_update_guest_cr(v, 0);
1234
1235 v->arch.hvm_vcpu.guest_cr[4] = 0;
1236 hvm_update_guest_cr(v, 4);
1237
1238 if ( cpu_has_vmx_tpr_shadow )
1239 {
1240 __vmwrite(VIRTUAL_APIC_PAGE_ADDR,
1241 page_to_maddr(vcpu_vlapic(v)->regs_page));
1242 __vmwrite(TPR_THRESHOLD, 0);
1243 }
1244
1245 if ( paging_mode_hap(d) )
1246 {
1247 struct p2m_domain *p2m = p2m_get_hostp2m(d);
1248 struct ept_data *ept = &p2m->ept;
1249
1250 ept->mfn = pagetable_get_pfn(p2m_get_pagetable(p2m));
1251 __vmwrite(EPT_POINTER, ept->eptp);
1252 }
1253
1254 if ( paging_mode_hap(d) )
1255 {
1256 u64 host_pat, guest_pat;
1257
1258 rdmsrl(MSR_IA32_CR_PAT, host_pat);
1259 guest_pat = MSR_IA32_CR_PAT_RESET;
1260
1261 __vmwrite(HOST_PAT, host_pat);
1262 __vmwrite(GUEST_PAT, guest_pat);
1263 }
1264 if ( cpu_has_vmx_mpx )
1265 __vmwrite(GUEST_BNDCFGS, 0);
1266 if ( cpu_has_vmx_xsaves )
1267 __vmwrite(XSS_EXIT_BITMAP, 0);
1268
1269 if ( cpu_has_vmx_tsc_scaling )
1270 __vmwrite(TSC_MULTIPLIER, d->arch.hvm_domain.tsc_scaling_ratio);
1271
1272 vmx_vmcs_exit(v);
1273
1274 /* Will update HOST_CR3 & GUEST_CR3 as required. */
1275 paging_update_paging_modes(v);
1276
1277 vmx_vlapic_msr_changed(v);
1278
1279 return 0;
1280 }
1281
1282 static int vmx_msr_entry_key_cmp(const void *key, const void *elt)
1283 {
1284 const u32 *msr = key;
1285 const struct vmx_msr_entry *entry = elt;
1286
1287 if ( *msr > entry->index )
1288 return 1;
1289 if ( *msr < entry->index )
1290 return -1;
1291
1292 return 0;
1293 }
1294
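/*
 * The MSR load/save lists are kept sorted by MSR index (see vmx_add_msr()),
 * so a binary search is sufficient here.
 */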
1295 struct vmx_msr_entry *vmx_find_msr(u32 msr, int type)
1296 {
1297 struct vcpu *curr = current;
1298 unsigned int msr_count;
1299 struct vmx_msr_entry *msr_area;
1300
1301 if ( type == VMX_GUEST_MSR )
1302 {
1303 msr_count = curr->arch.hvm_vmx.msr_count;
1304 msr_area = curr->arch.hvm_vmx.msr_area;
1305 }
1306 else
1307 {
1308 ASSERT(type == VMX_HOST_MSR);
1309 msr_count = curr->arch.hvm_vmx.host_msr_count;
1310 msr_area = curr->arch.hvm_vmx.host_msr_area;
1311 }
1312
1313 if ( msr_area == NULL )
1314 return NULL;
1315
1316 return bsearch(&msr, msr_area, msr_count, sizeof(struct vmx_msr_entry),
1317 vmx_msr_entry_key_cmp);
1318 }
1319
1320 int vmx_read_guest_msr(u32 msr, u64 *val)
1321 {
1322 struct vmx_msr_entry *ent;
1323
1324 if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
1325 {
1326 *val = ent->data;
1327 return 0;
1328 }
1329
1330 return -ESRCH;
1331 }
1332
1333 int vmx_write_guest_msr(u32 msr, u64 val)
1334 {
1335 struct vmx_msr_entry *ent;
1336
1337 if ( (ent = vmx_find_msr(msr, VMX_GUEST_MSR)) != NULL )
1338 {
1339 ent->data = val;
1340 return 0;
1341 }
1342
1343 return -ESRCH;
1344 }
1345
1346 int vmx_add_msr(u32 msr, int type)
1347 {
1348 struct vcpu *curr = current;
1349 unsigned int idx, *msr_count;
1350 struct vmx_msr_entry **msr_area, *msr_area_elem;
1351
1352 if ( type == VMX_GUEST_MSR )
1353 {
1354 msr_count = &curr->arch.hvm_vmx.msr_count;
1355 msr_area = &curr->arch.hvm_vmx.msr_area;
1356 }
1357 else
1358 {
1359 ASSERT(type == VMX_HOST_MSR);
1360 msr_count = &curr->arch.hvm_vmx.host_msr_count;
1361 msr_area = &curr->arch.hvm_vmx.host_msr_area;
1362 }
1363
1364 if ( *msr_area == NULL )
1365 {
1366 if ( (*msr_area = alloc_xenheap_page()) == NULL )
1367 return -ENOMEM;
1368
1369 if ( type == VMX_GUEST_MSR )
1370 {
1371 __vmwrite(VM_EXIT_MSR_STORE_ADDR, virt_to_maddr(*msr_area));
1372 __vmwrite(VM_ENTRY_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
1373 }
1374 else
1375 __vmwrite(VM_EXIT_MSR_LOAD_ADDR, virt_to_maddr(*msr_area));
1376 }
1377
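/* Keep the list sorted by MSR index so that vmx_find_msr() can bsearch(). */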
1378 for ( idx = 0; idx < *msr_count && (*msr_area)[idx].index <= msr; idx++ )
1379 if ( (*msr_area)[idx].index == msr )
1380 return 0;
1381
1382 if ( *msr_count == (PAGE_SIZE / sizeof(struct vmx_msr_entry)) )
1383 return -ENOSPC;
1384
1385 memmove(*msr_area + idx + 1, *msr_area + idx,
1386 sizeof(*msr_area_elem) * (*msr_count - idx));
1387
1388 msr_area_elem = *msr_area + idx;
1389 msr_area_elem->index = msr;
1390 msr_area_elem->mbz = 0;
1391
1392 ++*msr_count;
1393
1394 if ( type == VMX_GUEST_MSR )
1395 {
1396 msr_area_elem->data = 0;
1397 __vmwrite(VM_EXIT_MSR_STORE_COUNT, *msr_count);
1398 __vmwrite(VM_ENTRY_MSR_LOAD_COUNT, *msr_count);
1399 }
1400 else
1401 {
1402 rdmsrl(msr, msr_area_elem->data);
1403 __vmwrite(VM_EXIT_MSR_LOAD_COUNT, *msr_count);
1404 }
1405
1406 return 0;
1407 }
1408
1409 void vmx_set_eoi_exit_bitmap(struct vcpu *v, u8 vector)
1410 {
1411 if ( !test_and_set_bit(vector, v->arch.hvm_vmx.eoi_exit_bitmap) )
1412 set_bit(vector / BITS_PER_LONG,
1413 &v->arch.hvm_vmx.eoi_exitmap_changed);
1414 }
1415
1416 void vmx_clear_eoi_exit_bitmap(struct vcpu *v, u8 vector)
1417 {
1418 if ( test_and_clear_bit(vector, v->arch.hvm_vmx.eoi_exit_bitmap) )
1419 set_bit(vector / BITS_PER_LONG,
1420 &v->arch.hvm_vmx.eoi_exitmap_changed);
1421 }
1422
1423 bool_t vmx_vcpu_pml_enabled(const struct vcpu *v)
1424 {
1425 return !!(v->arch.hvm_vmx.secondary_exec_control &
1426 SECONDARY_EXEC_ENABLE_PML);
1427 }
1428
1429 int vmx_vcpu_enable_pml(struct vcpu *v)
1430 {
1431 if ( vmx_vcpu_pml_enabled(v) )
1432 return 0;
1433
1434 v->arch.hvm_vmx.pml_pg = v->domain->arch.paging.alloc_page(v->domain);
1435 if ( !v->arch.hvm_vmx.pml_pg )
1436 return -ENOMEM;
1437
1438 vmx_vmcs_enter(v);
1439
1440 __vmwrite(PML_ADDRESS, page_to_mfn(v->arch.hvm_vmx.pml_pg) << PAGE_SHIFT);
1441 __vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
1442
1443 v->arch.hvm_vmx.secondary_exec_control |= SECONDARY_EXEC_ENABLE_PML;
1444
1445 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1446 v->arch.hvm_vmx.secondary_exec_control);
1447
1448 vmx_vmcs_exit(v);
1449
1450 return 0;
1451 }
1452
1453 void vmx_vcpu_disable_pml(struct vcpu *v)
1454 {
1455 if ( !vmx_vcpu_pml_enabled(v) )
1456 return;
1457
1458 /* Make sure we don't lose any logged GPAs. */
1459 vmx_vcpu_flush_pml_buffer(v);
1460
1461 vmx_vmcs_enter(v);
1462
1463 v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_PML;
1464 __vmwrite(SECONDARY_VM_EXEC_CONTROL,
1465 v->arch.hvm_vmx.secondary_exec_control);
1466
1467 vmx_vmcs_exit(v);
1468
1469 v->domain->arch.paging.free_page(v->domain, v->arch.hvm_vmx.pml_pg);
1470 v->arch.hvm_vmx.pml_pg = NULL;
1471 }
1472
1473 void vmx_vcpu_flush_pml_buffer(struct vcpu *v)
1474 {
1475 uint64_t *pml_buf;
1476 unsigned long pml_idx;
1477
1478 ASSERT((v == current) || (!vcpu_runnable(v) && !v->is_running));
1479 ASSERT(vmx_vcpu_pml_enabled(v));
1480
1481 vmx_vmcs_enter(v);
1482
1483 __vmread(GUEST_PML_INDEX, &pml_idx);
1484
1485 /* Do nothing if PML buffer is empty. */
1486 if ( pml_idx == (NR_PML_ENTRIES - 1) )
1487 goto out;
1488
1489 pml_buf = __map_domain_page(v->arch.hvm_vmx.pml_pg);
1490
1491 /*
1492 * The PML index can be either 2^16-1 (buffer full) or 0 ~ NR_PML_ENTRIES-1
1493 * (buffer not full); in the latter case the PML index always points to the
1494 * next available entry.
1495 */
1496 if ( pml_idx >= NR_PML_ENTRIES )
1497 pml_idx = 0;
1498 else
1499 pml_idx++;
1500
1501 for ( ; pml_idx < NR_PML_ENTRIES; pml_idx++ )
1502 {
1503 unsigned long gfn = pml_buf[pml_idx] >> PAGE_SHIFT;
1504
1505 /*
1506 * We need to change the type from log-dirty to normal memory for each
1507 * logged GFN; hap_track_dirty_vram depends on this to work. We also mark
1508 * all logged GFNs as dirty, since we cannot be sure whether it is safe to
1509 * ignore GFNs for which p2m_change_type_one returns failure. Such failures
1510 * are very rare, and the additional cost is negligible, but a missing mark
1511 * is extremely difficult to debug.
1512 */
1513 p2m_change_type_one(v->domain, gfn, p2m_ram_logdirty, p2m_ram_rw);
1514
1515 /* HVM guest: pfn == gfn */
1516 paging_mark_pfn_dirty(v->domain, _pfn(gfn));
1517 }
1518
1519 unmap_domain_page(pml_buf);
1520
1521 /* Reset PML index */
1522 __vmwrite(GUEST_PML_INDEX, NR_PML_ENTRIES - 1);
1523
1524 out:
1525 vmx_vmcs_exit(v);
1526 }
1527
1528 bool_t vmx_domain_pml_enabled(const struct domain *d)
1529 {
1530 return !!(d->arch.hvm_domain.vmx.status & VMX_DOMAIN_PML_ENABLED);
1531 }
1532
1533 /*
1534 * This function enables PML for a particular domain. It should be called
1535 * when the domain is paused.
1536 *
1537 * PML needs to be enabled globally for all vcpus of the domain, as the PML
1538 * buffer and PML index are per-vcpu, while the EPT table is shared by all
1539 * vcpus; therefore enabling PML on only a subset of vcpus won't work.
1540 */
1541 int vmx_domain_enable_pml(struct domain *d)
1542 {
1543 struct vcpu *v;
1544 int rc;
1545
1546 ASSERT(atomic_read(&d->pause_count));
1547
1548 if ( vmx_domain_pml_enabled(d) )
1549 return 0;
1550
1551 for_each_vcpu ( d, v )
1552 if ( (rc = vmx_vcpu_enable_pml(v)) != 0 )
1553 goto error;
1554
1555 d->arch.hvm_domain.vmx.status |= VMX_DOMAIN_PML_ENABLED;
1556
1557 return 0;
1558
1559 error:
1560 for_each_vcpu ( d, v )
1561 if ( vmx_vcpu_pml_enabled(v) )
1562 vmx_vcpu_disable_pml(v);
1563 return rc;
1564 }
1565
1566 /*
1567 * Disable PML for a particular domain. Called when the domain is paused.
1568 *
1569 * As with enabling PML for a domain, disabling PML should be done for all
1570 * vcpus at once.
1571 */
1572 void vmx_domain_disable_pml(struct domain *d)
1573 {
1574 struct vcpu *v;
1575
1576 ASSERT(atomic_read(&d->pause_count));
1577
1578 if ( !vmx_domain_pml_enabled(d) )
1579 return;
1580
1581 for_each_vcpu ( d, v )
1582 vmx_vcpu_disable_pml(v);
1583
1584 d->arch.hvm_domain.vmx.status &= ~VMX_DOMAIN_PML_ENABLED;
1585 }
1586
1587 /*
1588 * Flush the PML buffers of all vcpus, and update the logged dirty pages in
1589 * the log-dirty radix tree. Called when the domain is paused.
1590 */
1591 void vmx_domain_flush_pml_buffers(struct domain *d)
1592 {
1593 struct vcpu *v;
1594
1595 ASSERT(atomic_read(&d->pause_count));
1596
1597 if ( !vmx_domain_pml_enabled(d) )
1598 return;
1599
1600 for_each_vcpu ( d, v )
1601 vmx_vcpu_flush_pml_buffer(v);
1602 }
1603
1604 static void vmx_vcpu_update_eptp(struct vcpu *v, u64 eptp)
1605 {
1606 vmx_vmcs_enter(v);
1607 __vmwrite(EPT_POINTER, eptp);
1608 vmx_vmcs_exit(v);
1609 }
1610
1611 /*
1612 * Update the EPTP in the VMCS of all vcpus of the domain. Must be called
1613 * when the domain is paused.
1614 */
1615 void vmx_domain_update_eptp(struct domain *d)
1616 {
1617 struct p2m_domain *p2m = p2m_get_hostp2m(d);
1618 struct vcpu *v;
1619
1620 ASSERT(atomic_read(&d->pause_count));
1621
1622 for_each_vcpu ( d, v )
1623 vmx_vcpu_update_eptp(v, p2m->ept.eptp);
1624
1625 ept_sync_domain(p2m);
1626 }
1627
1628 int vmx_create_vmcs(struct vcpu *v)
1629 {
1630 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
1631 int rc;
1632
1633 if ( (arch_vmx->vmcs_pa = vmx_alloc_vmcs()) == 0 )
1634 return -ENOMEM;
1635
1636 INIT_LIST_HEAD(&arch_vmx->active_list);
1637 __vmpclear(arch_vmx->vmcs_pa);
1638 arch_vmx->active_cpu = -1;
1639 arch_vmx->launched = 0;
1640
1641 if ( (rc = construct_vmcs(v)) != 0 )
1642 {
1643 vmx_free_vmcs(arch_vmx->vmcs_pa);
1644 return rc;
1645 }
1646
1647 return 0;
1648 }
1649
1650 void vmx_destroy_vmcs(struct vcpu *v)
1651 {
1652 struct arch_vmx_struct *arch_vmx = &v->arch.hvm_vmx;
1653
1654 vmx_clear_vmcs(v);
1655
1656 vmx_free_vmcs(arch_vmx->vmcs_pa);
1657
1658 free_xenheap_page(v->arch.hvm_vmx.host_msr_area);
1659 free_xenheap_page(v->arch.hvm_vmx.msr_area);
1660 free_xenheap_page(v->arch.hvm_vmx.msr_bitmap);
1661 }
1662
1663 void vmx_vmentry_failure(void)
1664 {
1665 struct vcpu *curr = current;
1666 unsigned long error;
1667
1668 __vmread(VM_INSTRUCTION_ERROR, &error);
1669 gprintk(XENLOG_ERR, "VM%s error: %#lx\n",
1670 curr->arch.hvm_vmx.launched ? "RESUME" : "LAUNCH", error);
1671
1672 if ( error == VMX_INSN_INVALID_CONTROL_STATE ||
1673 error == VMX_INSN_INVALID_HOST_STATE )
1674 vmcs_dump_vcpu(curr);
1675
1676 domain_crash_synchronous();
1677 }
1678
1679 void vmx_do_resume(struct vcpu *v)
1680 {
1681 bool_t debug_state;
1682
1683 if ( v->arch.hvm_vmx.active_cpu == smp_processor_id() )
1684 vmx_vmcs_reload(v);
1685 else
1686 {
1687 /*
1688 * For a pass-through domain, the guest's PCIe device driver may use
1689 * "non-snoop" I/O and explicitly WBINVD or CLFLUSH a RAM region.
1690 * Since vCPU migration may occur before the WBINVD or CLFLUSH, we need to
1691 * maintain data consistency either by:
1692 * 1: flushing the cache (wbinvd) when the guest is scheduled out, if
1693 * there is no wbinvd exit, or
1694 * 2: executing wbinvd on all dirty pCPUs when the guest's wbinvd exits.
1695 * If the VT-d engine can force snooping, we don't need to do either.
1696 */
1697 if ( has_arch_pdevs(v->domain) && !iommu_snoop
1698 && !cpu_has_wbinvd_exiting )
1699 {
1700 int cpu = v->arch.hvm_vmx.active_cpu;
1701 if ( cpu != -1 )
1702 flush_mask(cpumask_of(cpu), FLUSH_CACHE);
1703 }
1704
1705 vmx_clear_vmcs(v);
1706 vmx_load_vmcs(v);
1707 hvm_migrate_timers(v);
1708 hvm_migrate_pirqs(v);
1709 vmx_set_host_env(v);
1710 /*
1711 * Both the n1 and n2 VMCS need their host environment updated after
1712 * VCPU migration. The currently loaded VMCS is updated in place, while
1713 * the update of the other VMCS is deferred until it is switched in.
1714 */
1715 v->arch.hvm_vmx.hostenv_migrated = 1;
1716
1717 hvm_asid_flush_vcpu(v);
1718 }
1719
1720 debug_state = v->domain->debugger_attached
1721 || v->domain->arch.monitor.software_breakpoint_enabled
1722 || v->domain->arch.monitor.singlestep_enabled;
1723
1724 if ( unlikely(v->arch.hvm_vcpu.debug_state_latch != debug_state) )
1725 {
1726 v->arch.hvm_vcpu.debug_state_latch = debug_state;
1727 vmx_update_debug_state(v);
1728 }
1729
1730 hvm_do_resume(v);
1731 reset_stack_and_jump(vmx_asm_do_vmentry);
1732 }
1733
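/*
 * Read a VMCS field for dumping purposes, yielding 0 if the VMREAD fails
 * rather than crashing.
 */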
1734 static inline unsigned long vmr(unsigned long field)
1735 {
1736 unsigned long val;
1737
1738 return vmread_safe(field, &val) ? 0 : val;
1739 }
1740
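/*
 * Bits 14:13 of a VMCS field encoding give the field width (0 = 16-bit,
 * 2 = 32-bit) and bit 0 selects the high half of a 64-bit field; the
 * BUILD_BUG_ON()s below sanity-check the encodings passed in.
 */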
1741 #define vmr16(fld) ({ \
1742 BUILD_BUG_ON((fld) & 0x6001); \
1743 (uint16_t)vmr(fld); \
1744 })
1745
1746 #define vmr32(fld) ({ \
1747 BUILD_BUG_ON(((fld) & 0x6001) != 0x4000); \
1748 (uint32_t)vmr(fld); \
1749 })
1750
1751 static void vmx_dump_sel(char *name, uint32_t selector)
1752 {
1753 uint32_t sel, attr, limit;
1754 uint64_t base;
1755 sel = vmr(selector);
1756 attr = vmr(selector + (GUEST_ES_AR_BYTES - GUEST_ES_SELECTOR));
1757 limit = vmr(selector + (GUEST_ES_LIMIT - GUEST_ES_SELECTOR));
1758 base = vmr(selector + (GUEST_ES_BASE - GUEST_ES_SELECTOR));
1759 printk("%s: %04x %05x %08x %016"PRIx64"\n", name, sel, attr, limit, base);
1760 }
1761
1762 static void vmx_dump_sel2(char *name, uint32_t lim)
1763 {
1764 uint32_t limit;
1765 uint64_t base;
1766 limit = vmr(lim);
1767 base = vmr(lim + (GUEST_GDTR_BASE - GUEST_GDTR_LIMIT));
1768 printk("%s: %08x %016"PRIx64"\n", name, limit, base);
1769 }
1770
1771 void vmcs_dump_vcpu(struct vcpu *v)
1772 {
1773 struct cpu_user_regs *regs = &v->arch.user_regs;
1774 uint32_t vmentry_ctl, vmexit_ctl;
1775 unsigned long cr4;
1776 uint64_t efer;
1777 unsigned int i, n;
1778
1779 if ( v == current )
1780 regs = guest_cpu_user_regs();
1781
1782 vmx_vmcs_enter(v);
1783
1784 vmentry_ctl = vmr32(VM_ENTRY_CONTROLS),
1785 vmexit_ctl = vmr32(VM_EXIT_CONTROLS);
1786 cr4 = vmr(GUEST_CR4);
1787 efer = vmr(GUEST_EFER);
1788
1789 printk("*** Guest State ***\n");
1790 printk("CR0: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
1791 vmr(GUEST_CR0), vmr(CR0_READ_SHADOW), vmr(CR0_GUEST_HOST_MASK));
1792 printk("CR4: actual=0x%016lx, shadow=0x%016lx, gh_mask=%016lx\n",
1793 cr4, vmr(CR4_READ_SHADOW), vmr(CR4_GUEST_HOST_MASK));
1794 printk("CR3 = 0x%016lx\n", vmr(GUEST_CR3));
1795 if ( (v->arch.hvm_vmx.secondary_exec_control &
1796 SECONDARY_EXEC_ENABLE_EPT) &&
1797 (cr4 & X86_CR4_PAE) && !(efer & EFER_LMA) )
1798 {
1799 printk("PDPTE0 = 0x%016lx PDPTE1 = 0x%016lx\n",
1800 vmr(GUEST_PDPTE(0)), vmr(GUEST_PDPTE(1)));
1801 printk("PDPTE2 = 0x%016lx PDPTE3 = 0x%016lx\n",
1802 vmr(GUEST_PDPTE(2)), vmr(GUEST_PDPTE(3)));
1803 }
1804 printk("RSP = 0x%016lx (0x%016lx) RIP = 0x%016lx (0x%016lx)\n",
1805 vmr(GUEST_RSP), regs->rsp,
1806 vmr(GUEST_RIP), regs->rip);
1807 printk("RFLAGS=0x%08lx (0x%08lx) DR7 = 0x%016lx\n",
1808 vmr(GUEST_RFLAGS), regs->rflags,
1809 vmr(GUEST_DR7));
1810 printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
1811 vmr(GUEST_SYSENTER_ESP),
1812 vmr32(GUEST_SYSENTER_CS), vmr(GUEST_SYSENTER_EIP));
1813 printk(" sel attr limit base\n");
1814 vmx_dump_sel(" CS", GUEST_CS_SELECTOR);
1815 vmx_dump_sel(" DS", GUEST_DS_SELECTOR);
1816 vmx_dump_sel(" SS", GUEST_SS_SELECTOR);
1817 vmx_dump_sel(" ES", GUEST_ES_SELECTOR);
1818 vmx_dump_sel(" FS", GUEST_FS_SELECTOR);
1819 vmx_dump_sel(" GS", GUEST_GS_SELECTOR);
1820 vmx_dump_sel2("GDTR", GUEST_GDTR_LIMIT);
1821 vmx_dump_sel("LDTR", GUEST_LDTR_SELECTOR);
1822 vmx_dump_sel2("IDTR", GUEST_IDTR_LIMIT);
1823 vmx_dump_sel(" TR", GUEST_TR_SELECTOR);
1824 if ( (vmexit_ctl & (VM_EXIT_SAVE_GUEST_PAT | VM_EXIT_SAVE_GUEST_EFER)) ||
1825 (vmentry_ctl & (VM_ENTRY_LOAD_GUEST_PAT | VM_ENTRY_LOAD_GUEST_EFER)) )
1826 printk("EFER = 0x%016lx PAT = 0x%016lx\n", efer, vmr(GUEST_PAT));
1827 printk("PreemptionTimer = 0x%08x SM Base = 0x%08x\n",
1828 vmr32(GUEST_PREEMPTION_TIMER), vmr32(GUEST_SMBASE));
1829 printk("DebugCtl = 0x%016lx DebugExceptions = 0x%016lx\n",
1830 vmr(GUEST_IA32_DEBUGCTL), vmr(GUEST_PENDING_DBG_EXCEPTIONS));
1831 if ( vmentry_ctl & (VM_ENTRY_LOAD_PERF_GLOBAL_CTRL | VM_ENTRY_LOAD_BNDCFGS) )
1832 printk("PerfGlobCtl = 0x%016lx BndCfgS = 0x%016lx\n",
1833 vmr(GUEST_PERF_GLOBAL_CTRL), vmr(GUEST_BNDCFGS));
1834 printk("Interruptibility = %08x ActivityState = %08x\n",
1835 vmr32(GUEST_INTERRUPTIBILITY_INFO), vmr32(GUEST_ACTIVITY_STATE));
1836 if ( v->arch.hvm_vmx.secondary_exec_control &
1837 SECONDARY_EXEC_VIRTUAL_INTR_DELIVERY )
1838 printk("InterruptStatus = %04x\n", vmr16(GUEST_INTR_STATUS));
1839
1840 printk("*** Host State ***\n");
1841 printk("RIP = 0x%016lx (%ps) RSP = 0x%016lx\n",
1842 vmr(HOST_RIP), (void *)vmr(HOST_RIP), vmr(HOST_RSP));
1843 printk("CS=%04x SS=%04x DS=%04x ES=%04x FS=%04x GS=%04x TR=%04x\n",
1844 vmr16(HOST_CS_SELECTOR), vmr16(HOST_SS_SELECTOR),
1845 vmr16(HOST_DS_SELECTOR), vmr16(HOST_ES_SELECTOR),
1846 vmr16(HOST_FS_SELECTOR), vmr16(HOST_GS_SELECTOR),
1847 vmr16(HOST_TR_SELECTOR));
1848 printk("FSBase=%016lx GSBase=%016lx TRBase=%016lx\n",
1849 vmr(HOST_FS_BASE), vmr(HOST_GS_BASE), vmr(HOST_TR_BASE));
1850 printk("GDTBase=%016lx IDTBase=%016lx\n",
1851 vmr(HOST_GDTR_BASE), vmr(HOST_IDTR_BASE));
1852 printk("CR0=%016lx CR3=%016lx CR4=%016lx\n",
1853 vmr(HOST_CR0), vmr(HOST_CR3), vmr(HOST_CR4));
1854 printk("Sysenter RSP=%016lx CS:RIP=%04x:%016lx\n",
1855 vmr(HOST_SYSENTER_ESP),
1856 vmr32(HOST_SYSENTER_CS), vmr(HOST_SYSENTER_EIP));
1857 if ( vmexit_ctl & (VM_EXIT_LOAD_HOST_PAT | VM_EXIT_LOAD_HOST_EFER) )
1858 printk("EFER = 0x%016lx PAT = 0x%016lx\n", vmr(HOST_EFER), vmr(HOST_PAT));
1859 if ( vmexit_ctl & VM_EXIT_LOAD_PERF_GLOBAL_CTRL )
1860 printk("PerfGlobCtl = 0x%016lx\n",
1861 vmr(HOST_PERF_GLOBAL_CTRL));
1862
1863 printk("*** Control State ***\n");
1864 printk("PinBased=%08x CPUBased=%08x SecondaryExec=%08x\n",
1865 vmr32(PIN_BASED_VM_EXEC_CONTROL),
1866 vmr32(CPU_BASED_VM_EXEC_CONTROL),
1867 vmr32(SECONDARY_VM_EXEC_CONTROL));
1868 printk("EntryControls=%08x ExitControls=%08x\n", vmentry_ctl, vmexit_ctl);
1869 printk("ExceptionBitmap=%08x PFECmask=%08x PFECmatch=%08x\n",
1870 vmr32(EXCEPTION_BITMAP),
1871 vmr32(PAGE_FAULT_ERROR_CODE_MASK),
1872 vmr32(PAGE_FAULT_ERROR_CODE_MATCH));
1873 printk("VMEntry: intr_info=%08x errcode=%08x ilen=%08x\n",
1874 vmr32(VM_ENTRY_INTR_INFO),
1875 vmr32(VM_ENTRY_EXCEPTION_ERROR_CODE),
1876 vmr32(VM_ENTRY_INSTRUCTION_LEN));
1877 printk("VMExit: intr_info=%08x errcode=%08x ilen=%08x\n",
1878 vmr32(VM_EXIT_INTR_INFO),
1879 vmr32(VM_EXIT_INTR_ERROR_CODE),
1880 vmr32(VM_EXIT_INSTRUCTION_LEN));
1881 printk(" reason=%08x qualification=%016lx\n",
1882 vmr32(VM_EXIT_REASON), vmr(EXIT_QUALIFICATION));
1883 printk("IDTVectoring: info=%08x errcode=%08x\n",
1884 vmr32(IDT_VECTORING_INFO), vmr32(IDT_VECTORING_ERROR_CODE));
1885 printk("TSC Offset = 0x%016lx TSC Multiplier = 0x%016lx\n",
1886 vmr(TSC_OFFSET), vmr(TSC_MULTIPLIER));
1887 if ( (v->arch.hvm_vmx.exec_control & CPU_BASED_TPR_SHADOW) ||
1888 (vmx_pin_based_exec_control & PIN_BASED_POSTED_INTERRUPT) )
1889 printk("TPR Threshold = 0x%02x PostedIntrVec = 0x%02x\n",
1890 vmr32(TPR_THRESHOLD), vmr16(POSTED_INTR_NOTIFICATION_VECTOR));
1891 if ( (v->arch.hvm_vmx.secondary_exec_control &
1892 SECONDARY_EXEC_ENABLE_EPT) )
1893 printk("EPT pointer = 0x%016lx EPTP index = 0x%04x\n",
1894 vmr(EPT_POINTER), vmr16(EPTP_INDEX));
1895 n = vmr32(CR3_TARGET_COUNT);
1896 for ( i = 0; i + 1 < n; i += 2 )
1897 printk("CR3 target%u=%016lx target%u=%016lx\n",
1898 i, vmr(CR3_TARGET_VALUE(i)),
1899 i + 1, vmr(CR3_TARGET_VALUE(i + 1)));
1900 if ( i < n )
1901 printk("CR3 target%u=%016lx\n", i, vmr(CR3_TARGET_VALUE(i)));
1902 if ( v->arch.hvm_vmx.secondary_exec_control &
1903 SECONDARY_EXEC_PAUSE_LOOP_EXITING )
1904 printk("PLE Gap=%08x Window=%08x\n",
1905 vmr32(PLE_GAP), vmr32(PLE_WINDOW));
1906 if ( v->arch.hvm_vmx.secondary_exec_control &
1907 (SECONDARY_EXEC_ENABLE_VPID | SECONDARY_EXEC_ENABLE_VM_FUNCTIONS) )
1908 printk("Virtual processor ID = 0x%04x VMfunc controls = %016lx\n",
1909 vmr16(VIRTUAL_PROCESSOR_ID), vmr(VM_FUNCTION_CONTROL));
1910
1911 vmx_vmcs_exit(v);
1912 }
1913
1914 static void vmcs_dump(unsigned char ch)
1915 {
1916 struct domain *d;
1917 struct vcpu *v;
1918
1919 printk("*********** VMCS Areas **************\n");
1920
1921 rcu_read_lock(&domlist_read_lock);
1922
1923 for_each_domain ( d )
1924 {
1925 if ( !is_hvm_domain(d) )
1926 continue;
1927 printk("\n>>> Domain %d <<<\n", d->domain_id);
1928 for_each_vcpu ( d, v )
1929 {
1930 printk("\tVCPU %d\n", v->vcpu_id);
1931 vmcs_dump_vcpu(v);
1932 }
1933 }
1934
1935 rcu_read_unlock(&domlist_read_lock);
1936
1937 printk("**************************************\n");
1938 }
1939
1940 void __init setup_vmcs_dump(void)
1941 {
1942 register_keyhandler('v', vmcs_dump, "dump VT-x VMCSs", 1);
1943 }
1944
1945 static void __init __maybe_unused build_assertions(void)
1946 {
1947 struct vmx_msr_bitmap bitmap;
1948
1949 /* Check vmx_msr_bitmap layout against hardware expectations. */
1950 BUILD_BUG_ON(sizeof(bitmap) != PAGE_SIZE);
1951 BUILD_BUG_ON(sizeof(bitmap.read_low) != 1024);
1952 BUILD_BUG_ON(sizeof(bitmap.read_high) != 1024);
1953 BUILD_BUG_ON(sizeof(bitmap.write_low) != 1024);
1954 BUILD_BUG_ON(sizeof(bitmap.write_high) != 1024);
1955 BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, read_low) != 0);
1956 BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, read_high) != 1024);
1957 BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, write_low) != 2048);
1958 BUILD_BUG_ON(offsetof(struct vmx_msr_bitmap, write_high) != 3072);
1959 }
1960
1961 /*
1962 * Local variables:
1963 * mode: C
1964 * c-file-style: "BSD"
1965 * c-basic-offset: 4
1966 * tab-width: 4
1967 * indent-tabs-mode: nil
1968 * End:
1969 */
1970