// Copyright 2017 The Fuchsia Authors
//
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file or at
// https://opensource.org/licenses/MIT

#include <bits.h>

#include <arch/x86/descriptor.h>
#include <arch/x86/feature.h>
#include <arch/x86/pvclock.h>
#include <fbl/auto_call.h>
#include <hypervisor/cpu.h>
#include <hypervisor/ktrace.h>
#include <kernel/mp.h>
#include <lib/ktrace.h>
#include <vm/fault.h>
#include <vm/pmm.h>
#include <vm/vm_object.h>
#include <zircon/syscalls/hypervisor.h>

#include "pvclock_priv.h"
#include "vcpu_priv.h"
#include "vmexit_priv.h"
#include "vmx_cpu_state_priv.h"

static constexpr uint32_t kInterruptInfoValid = 1u << 31;
static constexpr uint32_t kInterruptInfoDeliverErrorCode = 1u << 11;
static constexpr uint32_t kInterruptTypeNmi = 2u << 8;
static constexpr uint32_t kInterruptTypeHardwareException = 3u << 8;
static constexpr uint32_t kInterruptTypeSoftwareException = 6u << 8;
static constexpr uint16_t kBaseProcessorVpid = 1;

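// Invalidates EPT-derived mappings in the TLBs and paging-structure caches,
// either for the given EPTP (SINGLE_CONTEXT) or for all EPTPs (ALL_CONTEXT).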
static zx_status_t invept(InvEpt invalidation, uint64_t eptp) {
    uint8_t err;
    uint64_t descriptor[] = {eptp, 0};

    __asm__ volatile(
        "invept %[descriptor], %[invalidation];" VMX_ERR_CHECK(err)
        : [err] "=r"(err)
        : [descriptor] "m"(descriptor), [invalidation] "r"(invalidation)
        : "cc");

    return err ? ZX_ERR_INTERNAL : ZX_OK;
}

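// Loads the VMCS at the given physical address, making it active and current
// on this processor.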
static zx_status_t vmptrld(paddr_t pa) {
    uint8_t err;

    __asm__ volatile(
        "vmptrld %[pa];" VMX_ERR_CHECK(err)
        : [err] "=r"(err)
        : [pa] "m"(pa)
        : "cc", "memory");

    return err ? ZX_ERR_INTERNAL : ZX_OK;
}

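// Clears the VMCS at the given physical address, flushing any cached VMCS data
// back to memory and marking the VMCS inactive.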
static zx_status_t vmclear(paddr_t pa) {
    uint8_t err;

    __asm__ volatile(
        "vmclear %[pa];" VMX_ERR_CHECK(err)
        : [err] "=r"(err)
        : [pa] "m"(pa)
        : "cc", "memory");

    return err ? ZX_ERR_INTERNAL : ZX_OK;
}

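// Reads the given field from the current VMCS.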
static uint64_t vmread(uint64_t field) {
    uint8_t err;
    uint64_t val;

    __asm__ volatile(
        "vmread %[field], %[val];" VMX_ERR_CHECK(err)
        : [err] "=r"(err), [val] "=m"(val)
        : [field] "r"(field)
        : "cc");

    DEBUG_ASSERT(err == ZX_OK);
    return val;
}

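// Writes the given value to the given field of the current VMCS.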
static void vmwrite(uint64_t field, uint64_t val) {
    uint8_t err;

    __asm__ volatile(
        "vmwrite %[val], %[field];" VMX_ERR_CHECK(err)
        : [err] "=r"(err)
        : [val] "r"(val), [field] "r"(field)
        : "cc");

    DEBUG_ASSERT(err == ZX_OK);
}

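// Makes the given VMCS active and current for the lifetime of the object, with
// interrupts disabled so the current processor cannot change beneath us.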
AutoVmcs::AutoVmcs(paddr_t vmcs_address)
    : vmcs_address_(vmcs_address) {
    DEBUG_ASSERT(!arch_ints_disabled());
    arch_disable_ints();
    __UNUSED zx_status_t status = vmptrld(vmcs_address_);
    DEBUG_ASSERT(status == ZX_OK);
}

AutoVmcs::~AutoVmcs() {
    DEBUG_ASSERT(arch_ints_disabled());
    arch_enable_ints();
}

void AutoVmcs::Invalidate() {
#if LK_DEBUGLEVEL > 0
    vmcs_address_ = 0;
#endif
}

void AutoVmcs::InterruptWindowExiting(bool enable) {
    DEBUG_ASSERT(vmcs_address_ != 0);
    uint32_t controls = Read(VmcsField32::PROCBASED_CTLS);
    if (enable) {
        controls |= kProcbasedCtlsIntWindowExiting;
    } else {
        controls &= ~kProcbasedCtlsIntWindowExiting;
    }
    Write(VmcsField32::PROCBASED_CTLS, controls);
}

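// Returns true if the given exception vector delivers an error code. See
// Volume 3, Section 6.15.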
static bool has_error_code(uint32_t vector) {
    switch (vector) {
    case X86_INT_DOUBLE_FAULT:
    case X86_INT_INVALID_TSS:
    case X86_INT_SEGMENT_NOT_PRESENT:
    case X86_INT_STACK_FAULT:
    case X86_INT_GP_FAULT:
    case X86_INT_PAGE_FAULT:
    case X86_INT_ALIGNMENT_CHECK:
        return true;
    default:
        return false;
    }
}

void AutoVmcs::IssueInterrupt(uint32_t vector) {
    DEBUG_ASSERT(vmcs_address_ != 0);
    uint32_t interrupt_info = kInterruptInfoValid | (vector & UINT8_MAX);
    if (vector == X86_INT_BREAKPOINT || vector == X86_INT_OVERFLOW) {
        // From Volume 3, Section 24.8.3. A VMM should use type hardware exception for all
        // exceptions other than breakpoints and overflows, which should be software exceptions.
        interrupt_info |= kInterruptTypeSoftwareException;
    } else if (vector == X86_INT_NMI) {
        interrupt_info |= kInterruptTypeNmi;
    } else if (vector <= X86_INT_VIRT) {
        // From Volume 3, Section 6.15. All other vectors from 0 to X86_INT_VIRT are exceptions.
        interrupt_info |= kInterruptTypeHardwareException;
    }
    if (has_error_code(vector)) {
        interrupt_info |= kInterruptInfoDeliverErrorCode;
        Write(VmcsField32::ENTRY_EXCEPTION_ERROR_CODE, 0);
    }

    DEBUG_ASSERT((Read(VmcsField32::ENTRY_INTERRUPTION_INFORMATION) & kInterruptInfoValid) == 0);
    Write(VmcsField32::ENTRY_INTERRUPTION_INFORMATION, interrupt_info);
}

uint16_t AutoVmcs::Read(VmcsField16 field) const {
    DEBUG_ASSERT(vmcs_address_ != 0);
    return static_cast<uint16_t>(vmread(static_cast<uint64_t>(field)));
}

uint32_t AutoVmcs::Read(VmcsField32 field) const {
    DEBUG_ASSERT(vmcs_address_ != 0);
    return static_cast<uint32_t>(vmread(static_cast<uint64_t>(field)));
}

uint64_t AutoVmcs::Read(VmcsField64 field) const {
    DEBUG_ASSERT(vmcs_address_ != 0);
    return vmread(static_cast<uint64_t>(field));
}

uint64_t AutoVmcs::Read(VmcsFieldXX field) const {
    DEBUG_ASSERT(vmcs_address_ != 0);
    return vmread(static_cast<uint64_t>(field));
}

void AutoVmcs::Write(VmcsField16 field, uint16_t val) {
    DEBUG_ASSERT(vmcs_address_ != 0);
    vmwrite(static_cast<uint64_t>(field), val);
}

void AutoVmcs::Write(VmcsField32 field, uint32_t val) {
    DEBUG_ASSERT(vmcs_address_ != 0);
    vmwrite(static_cast<uint64_t>(field), val);
}

void AutoVmcs::Write(VmcsField64 field, uint64_t val) {
    DEBUG_ASSERT(vmcs_address_ != 0);
    vmwrite(static_cast<uint64_t>(field), val);
}

void AutoVmcs::Write(VmcsFieldXX field, uint64_t val) {
    DEBUG_ASSERT(vmcs_address_ != 0);
    vmwrite(static_cast<uint64_t>(field), val);
}

zx_status_t AutoVmcs::SetControl(VmcsField32 controls, uint64_t true_msr, uint64_t old_msr,
                                 uint32_t set, uint32_t clear) {
    DEBUG_ASSERT(vmcs_address_ != 0);
    uint32_t allowed_0 = static_cast<uint32_t>(BITS(true_msr, 31, 0));
    uint32_t allowed_1 = static_cast<uint32_t>(BITS_SHIFT(true_msr, 63, 32));
    if ((allowed_1 & set) != set) {
        dprintf(INFO, "can not set vmcs controls %#x\n", static_cast<uint>(controls));
        return ZX_ERR_NOT_SUPPORTED;
    }
    if ((~allowed_0 & clear) != clear) {
        dprintf(INFO, "can not clear vmcs controls %#x\n", static_cast<uint>(controls));
        return ZX_ERR_NOT_SUPPORTED;
    }
    if ((set & clear) != 0) {
        dprintf(INFO, "can not set and clear the same vmcs controls %#x\n",
                static_cast<uint>(controls));
        return ZX_ERR_INVALID_ARGS;
    }

    // See Volume 3, Section 31.5.1, Algorithm 3, Part C. If the control can be
    // either 0 or 1 (flexible), and the control is unknown, then refer to the
    // old MSR to find the default value.
    uint32_t flexible = allowed_0 ^ allowed_1;
    uint32_t unknown = flexible & ~(set | clear);
    uint32_t defaults = unknown & BITS(old_msr, 31, 0);
    Write(controls, allowed_0 | defaults | set);
    return ZX_OK;
}

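// Pins the current thread to the CPU associated with the given VPID for the
// lifetime of the object, restoring the previous affinity on destruction.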
AutoPin::AutoPin(uint16_t vpid)
    : prev_cpu_mask_(get_current_thread()->cpu_affinity), thread_(hypervisor::pin_thread(vpid)) {}

AutoPin::~AutoPin() {
    thread_set_cpu_affinity(thread_, prev_cpu_mask_);
}

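// Builds the EPT pointer (EPTP) value from the physical address of the PML4.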
static uint64_t ept_pointer(paddr_t pml4_address) {
    return
        // Physical address of the PML4 page, page aligned.
        pml4_address |
        // Use write-back memory type for paging structures.
        VMX_MEMORY_TYPE_WRITE_BACK << 0 |
        // Page walk length of 4 (defined as N minus 1).
        3u << 3;
}

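// Layout of an entry in the MSR load/store lists used on VM entry and VM exit.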
struct MsrListEntry {
    uint32_t msr;
    uint32_t reserved;
    uint64_t value;
} __PACKED;

static void edit_msr_list(VmxPage* msr_list_page, size_t index, uint32_t msr, uint64_t value) {
    // From Volume 3, Section 24.7.2.

    // From Volume 3, Appendix A.6: Specifically, if the value of bits 27:25 of
    // IA32_VMX_MISC is N, then 512 * (N + 1) is the recommended maximum number
    // of MSRs to be included in each list.
    //
    // From Volume 3, Section 24.7.2: This field specifies the number of MSRs to
    // be stored on VM exit. It is recommended that this count not exceed 512
    // bytes.
    //
    // Since these two statements conflict, we are taking the conservative
    // minimum and asserting that: index < (512 bytes / size of MsrListEntry).
    ASSERT(index < (512 / sizeof(MsrListEntry)));

    MsrListEntry* entry = msr_list_page->VirtualAddress<MsrListEntry>() + index;
    entry->msr = msr;
    entry->value = value;
}

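// Initializes the VMCS for the given VPID: VM-execution, VM-exit, and VM-entry
// controls, the MSR load/store lists, and the host and guest register state.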
static zx_status_t vmcs_init(paddr_t vmcs_address, uint16_t vpid, uintptr_t entry,
                             paddr_t msr_bitmaps_address, paddr_t pml4_address, VmxState* vmx_state,
                             VmxPage* host_msr_page, VmxPage* guest_msr_page) {
    zx_status_t status = vmclear(vmcs_address);
    if (status != ZX_OK)
        return status;

    AutoVmcs vmcs(vmcs_address);
    // Setup secondary processor-based VMCS controls.
    status = vmcs.SetControl(VmcsField32::PROCBASED_CTLS2,
                             read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2),
                             0,
                             // Enable use of extended page tables.
                             kProcbasedCtls2Ept |
                                 // Enable use of RDTSCP instruction.
                                 kProcbasedCtls2Rdtscp |
                                 // Enable X2APIC.
                                 kProcbasedCtls2x2Apic |
                                 // Associate cached translations of linear
                                 // addresses with a virtual processor ID.
                                 kProcbasedCtls2Vpid |
                                 // Enable unrestricted guest.
                                 kProcbasedCtls2UnrestrictedGuest,
                             0);
    if (status != ZX_OK)
        return status;

    // Enable use of INVPCID instruction if available.
    vmcs.SetControl(VmcsField32::PROCBASED_CTLS2,
                    read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS2),
                    vmcs.Read(VmcsField32::PROCBASED_CTLS2),
                    kProcbasedCtls2Invpcid,
                    0);

    // Setup pin-based VMCS controls.
    status = vmcs.SetControl(VmcsField32::PINBASED_CTLS,
                             read_msr(X86_MSR_IA32_VMX_TRUE_PINBASED_CTLS),
                             read_msr(X86_MSR_IA32_VMX_PINBASED_CTLS),
                             // External interrupts cause a VM exit.
                             kPinbasedCtlsExtIntExiting |
                                 // Non-maskable interrupts cause a VM exit.
                                 kPinbasedCtlsNmiExiting,
                             0);
    if (status != ZX_OK)
        return status;

    // Setup primary processor-based VMCS controls.
    status = vmcs.SetControl(VmcsField32::PROCBASED_CTLS,
                             read_msr(X86_MSR_IA32_VMX_TRUE_PROCBASED_CTLS),
                             read_msr(X86_MSR_IA32_VMX_PROCBASED_CTLS),
                             // Enable VM exit when interrupts are enabled.
                             kProcbasedCtlsIntWindowExiting |
                                 // Enable VM exit on HLT instruction.
                                 kProcbasedCtlsHltExiting |
                                 // Enable TPR virtualization.
                                 kProcbasedCtlsTprShadow |
                                 // Enable VM exit on IO instructions.
                                 kProcbasedCtlsIoExiting |
                                 // Enable use of MSR bitmaps.
                                 kProcbasedCtlsMsrBitmaps |
                                 // Enable VM exit on pause instruction.
                                 kProcbasedCtlsPauseExiting |
                                 // Enable secondary processor-based controls.
                                 kProcbasedCtlsProcbasedCtls2,
                             // Disable VM exit on CR3 load.
                             kProcbasedCtlsCr3LoadExiting |
                                 // Disable VM exit on CR3 store.
                                 kProcbasedCtlsCr3StoreExiting |
                                 // Disable VM exit on CR8 load.
                                 kProcbasedCtlsCr8LoadExiting |
                                 // Disable VM exit on CR8 store.
                                 kProcbasedCtlsCr8StoreExiting);
    if (status != ZX_OK)
        return status;

    // We only enable interrupt-window exiting above to ensure that the
    // processor supports it for later use. So disable it for now.
    vmcs.InterruptWindowExiting(false);

    // Setup VM-exit VMCS controls.
    status = vmcs.SetControl(VmcsField32::EXIT_CTLS,
                             read_msr(X86_MSR_IA32_VMX_TRUE_EXIT_CTLS),
                             read_msr(X86_MSR_IA32_VMX_EXIT_CTLS),
                             // Logical processor is in 64-bit mode after VM
                             // exit. On VM exit, CS.L, IA32_EFER.LME, and
                             // IA32_EFER.LMA are set to true.
                             kExitCtls64bitMode |
                                 // Save the guest IA32_PAT MSR on exit.
                                 kExitCtlsSaveIa32Pat |
                                 // Load the host IA32_PAT MSR on exit.
                                 kExitCtlsLoadIa32Pat |
                                 // Save the guest IA32_EFER MSR on exit.
                                 kExitCtlsSaveIa32Efer |
                                 // Load the host IA32_EFER MSR on exit.
                                 kExitCtlsLoadIa32Efer |
                                 // Acknowledge external interrupt on exit.
                                 kExitCtlsAckIntOnExit,
                             0);
    if (status != ZX_OK)
        return status;

    // Setup VM-entry VMCS controls.
    // Load the guest IA32_PAT MSR and IA32_EFER MSR on entry.
    uint32_t entry_ctls = kEntryCtlsLoadIa32Pat | kEntryCtlsLoadIa32Efer;
    if (vpid == kBaseProcessorVpid) {
        // On the BSP, go straight to IA32E mode on entry.
        entry_ctls |= kEntryCtlsIa32eMode;
    }
    status = vmcs.SetControl(VmcsField32::ENTRY_CTLS,
                             read_msr(X86_MSR_IA32_VMX_TRUE_ENTRY_CTLS),
                             read_msr(X86_MSR_IA32_VMX_ENTRY_CTLS),
                             entry_ctls, 0);
    if (status != ZX_OK)
        return status;

    // From Volume 3, Section 24.6.3: The exception bitmap is a 32-bit field
    // that contains one bit for each exception. When an exception occurs,
    // its vector is used to select a bit in this field. If the bit is 1,
    // the exception causes a VM exit. If the bit is 0, the exception is
    // delivered normally through the IDT, using the descriptor
    // corresponding to the exception’s vector.
    //
    // From Volume 3, Section 25.2: If software desires VM exits on all page
    // faults, it can set bit 14 in the exception bitmap to 1 and set the
    // page-fault error-code mask and match fields each to 00000000H.
    vmcs.Write(VmcsField32::EXCEPTION_BITMAP, 0);
    vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MASK, 0);
    vmcs.Write(VmcsField32::PAGEFAULT_ERRORCODE_MATCH, 0);

    // From Volume 3, Section 28.1: Virtual-processor identifiers (VPIDs)
    // introduce to VMX operation a facility by which a logical processor may
    // cache information for multiple linear-address spaces. When VPIDs are
    // used, VMX transitions may retain cached information even when the
    // logical processor switches to a different linear-address space.
    //
    // From Volume 3, Section 26.2.1.1: If the “enable VPID” VM-execution
    // control is 1, the value of the VPID VM-execution control field must not
    // be 0000H.
    //
    // From Volume 3, Section 28.3.3.3: If EPT is in use, the logical processor
    // associates all mappings it creates with the value of bits 51:12 of
    // current EPTP. If a VMM uses different EPTP values for different guests,
    // it may use the same VPID for those guests.
    //
    // From Volume 3, Section 28.3.3.1: Operations that architecturally
    // invalidate entries in the TLBs or paging-structure caches independent of
    // VMX operation (e.g., the INVLPG and INVPCID instructions) invalidate
    // linear mappings and combined mappings. They are required to do so only
    // for the current VPID (but, for combined mappings, all EP4TAs). Linear
    // mappings for the current VPID are invalidated even if EPT is in use.
    // Combined mappings for the current VPID are invalidated even if EPT is
    // not in use.
    vmcs.Write(VmcsField16::VPID, vpid);

    // From Volume 3, Section 28.2: The extended page-table mechanism (EPT) is a
    // feature that can be used to support the virtualization of physical
    // memory. When EPT is in use, certain addresses that would normally be
    // treated as physical addresses (and used to access memory) are instead
    // treated as guest-physical addresses. Guest-physical addresses are
    // translated by traversing a set of EPT paging structures to produce
    // physical addresses that are used to access memory.
    const auto eptp = ept_pointer(pml4_address);
    vmcs.Write(VmcsField64::EPT_POINTER, eptp);

    // From Volume 3, Section 28.3.3.4: Software can use an INVEPT of type
    // ALL_CONTEXT to prevent undesired retention of cached EPT information.
    // Here, we only care about invalidating information associated with this
    // EPTP.
    invept(InvEpt::SINGLE_CONTEXT, eptp);

    // Setup MSR handling.
    vmcs.Write(VmcsField64::MSR_BITMAPS_ADDRESS, msr_bitmaps_address);

    edit_msr_list(host_msr_page, 0, X86_MSR_IA32_KERNEL_GS_BASE,
                  read_msr(X86_MSR_IA32_KERNEL_GS_BASE));
    edit_msr_list(host_msr_page, 1, X86_MSR_IA32_STAR, read_msr(X86_MSR_IA32_STAR));
    edit_msr_list(host_msr_page, 2, X86_MSR_IA32_LSTAR, read_msr(X86_MSR_IA32_LSTAR));
    edit_msr_list(host_msr_page, 3, X86_MSR_IA32_FMASK, read_msr(X86_MSR_IA32_FMASK));
    edit_msr_list(host_msr_page, 4, X86_MSR_IA32_TSC_ADJUST, read_msr(X86_MSR_IA32_TSC_ADJUST));
    edit_msr_list(host_msr_page, 5, X86_MSR_IA32_TSC_AUX, read_msr(X86_MSR_IA32_TSC_AUX));

    vmcs.Write(VmcsField64::EXIT_MSR_LOAD_ADDRESS, host_msr_page->PhysicalAddress());
    vmcs.Write(VmcsField32::EXIT_MSR_LOAD_COUNT, 6);

    edit_msr_list(guest_msr_page, 0, X86_MSR_IA32_KERNEL_GS_BASE, 0);
    edit_msr_list(guest_msr_page, 1, X86_MSR_IA32_STAR, 0);
    edit_msr_list(guest_msr_page, 2, X86_MSR_IA32_LSTAR, 0);
    edit_msr_list(guest_msr_page, 3, X86_MSR_IA32_FMASK, 0);
    edit_msr_list(guest_msr_page, 4, X86_MSR_IA32_TSC_ADJUST, 0);
    edit_msr_list(guest_msr_page, 5, X86_MSR_IA32_TSC_AUX, 0);
    vmcs.Write(VmcsField64::EXIT_MSR_STORE_ADDRESS, guest_msr_page->PhysicalAddress());
    vmcs.Write(VmcsField32::EXIT_MSR_STORE_COUNT, 6);
    vmcs.Write(VmcsField64::ENTRY_MSR_LOAD_ADDRESS, guest_msr_page->PhysicalAddress());
    vmcs.Write(VmcsField32::ENTRY_MSR_LOAD_COUNT, 6);

    // Setup VMCS host state.
    //
    // NOTE: The thread executing this function is pinned to a CPU, therefore
    // it is acceptable to use per-CPU state.
    x86_percpu* percpu = x86_get_percpu();
    vmcs.Write(VmcsField64::HOST_IA32_PAT, read_msr(X86_MSR_IA32_PAT));
    vmcs.Write(VmcsField64::HOST_IA32_EFER, read_msr(X86_MSR_IA32_EFER));
    vmcs.Write(VmcsFieldXX::HOST_CR0, x86_get_cr0());
    vmcs.Write(VmcsFieldXX::HOST_CR3, x86_get_cr3());
    vmcs.Write(VmcsFieldXX::HOST_CR4, x86_get_cr4());
    vmcs.Write(VmcsField16::HOST_ES_SELECTOR, 0);
    vmcs.Write(VmcsField16::HOST_CS_SELECTOR, CODE_64_SELECTOR);
    vmcs.Write(VmcsField16::HOST_SS_SELECTOR, DATA_SELECTOR);
    vmcs.Write(VmcsField16::HOST_DS_SELECTOR, 0);
    vmcs.Write(VmcsField16::HOST_FS_SELECTOR, 0);
    vmcs.Write(VmcsField16::HOST_GS_SELECTOR, 0);
    vmcs.Write(VmcsField16::HOST_TR_SELECTOR, TSS_SELECTOR(percpu->cpu_num));
    vmcs.Write(VmcsFieldXX::HOST_FS_BASE, read_msr(X86_MSR_IA32_FS_BASE));
    vmcs.Write(VmcsFieldXX::HOST_GS_BASE, read_msr(X86_MSR_IA32_GS_BASE));
    vmcs.Write(VmcsFieldXX::HOST_TR_BASE, reinterpret_cast<uint64_t>(&percpu->default_tss));
    vmcs.Write(VmcsFieldXX::HOST_GDTR_BASE, reinterpret_cast<uint64_t>(gdt_get()));
    vmcs.Write(VmcsFieldXX::HOST_IDTR_BASE, reinterpret_cast<uint64_t>(idt_get_readonly()));
    vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_ESP, 0);
    vmcs.Write(VmcsFieldXX::HOST_IA32_SYSENTER_EIP, 0);
    vmcs.Write(VmcsField32::HOST_IA32_SYSENTER_CS, 0);
    vmcs.Write(VmcsFieldXX::HOST_RSP, reinterpret_cast<uint64_t>(vmx_state));
    vmcs.Write(VmcsFieldXX::HOST_RIP, reinterpret_cast<uint64_t>(vmx_exit_entry));

    // Setup VMCS guest state.
    uint64_t cr0 = X86_CR0_PE | // Enable protected mode
                   X86_CR0_PG | // Enable paging
                   X86_CR0_NE;  // Enable internal x87 exception handling
    if (vpid != kBaseProcessorVpid) {
        // Disable protected mode and paging on secondary VCPUs.
        cr0 &= ~(X86_CR0_PE | X86_CR0_PG);
    }
    if (cr0_is_invalid(&vmcs, cr0)) {
        return ZX_ERR_BAD_STATE;
    }
    vmcs.Write(VmcsFieldXX::GUEST_CR0, cr0);

    // Ensure that CR0.NE remains set by masking and manually handling writes to CR0 that unset it.
    vmcs.Write(VmcsFieldXX::CR0_GUEST_HOST_MASK, X86_CR0_NE);
    vmcs.Write(VmcsFieldXX::CR0_READ_SHADOW, X86_CR0_NE);

    uint64_t cr4 = X86_CR4_VMXE; // Enable VMX
    if (vpid == kBaseProcessorVpid) {
        // Enable the PAE bit on the BSP for 64-bit paging.
        cr4 |= X86_CR4_PAE;
    }
    if (cr_is_invalid(cr4, X86_MSR_IA32_VMX_CR4_FIXED0, X86_MSR_IA32_VMX_CR4_FIXED1)) {
        return ZX_ERR_BAD_STATE;
    }
    vmcs.Write(VmcsFieldXX::GUEST_CR4, cr4);

    // For now, the guest can own all of the CR4 bits except VMXE, which it shouldn't touch.
    // TODO(andymutton): Implement proper CR4 handling.
    vmcs.Write(VmcsFieldXX::CR4_GUEST_HOST_MASK, X86_CR4_VMXE);
    vmcs.Write(VmcsFieldXX::CR4_READ_SHADOW, 0);

    vmcs.Write(VmcsField64::GUEST_IA32_PAT, read_msr(X86_MSR_IA32_PAT));

    uint64_t guest_efer = read_msr(X86_MSR_IA32_EFER);
    if (vpid != kBaseProcessorVpid) {
        // Disable LME and LMA on all but the BSP.
        guest_efer &= ~(X86_EFER_LME | X86_EFER_LMA);
    }
    vmcs.Write(VmcsField64::GUEST_IA32_EFER, guest_efer);

    uint32_t cs_access_rights = kGuestXxAccessRightsDefault |
                                kGuestXxAccessRightsTypeE |
                                kGuestXxAccessRightsTypeCode;
    if (vpid == kBaseProcessorVpid) {
        // Ensure that the BSP starts with a 64-bit code segment.
        cs_access_rights |= kGuestXxAccessRightsL;
    }
    vmcs.Write(VmcsField32::GUEST_CS_ACCESS_RIGHTS, cs_access_rights);

    vmcs.Write(VmcsField32::GUEST_TR_ACCESS_RIGHTS,
               kGuestTrAccessRightsTssBusy | kGuestXxAccessRightsP);

    vmcs.Write(VmcsField32::GUEST_SS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
    vmcs.Write(VmcsField32::GUEST_DS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
    vmcs.Write(VmcsField32::GUEST_ES_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
    vmcs.Write(VmcsField32::GUEST_FS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);
    vmcs.Write(VmcsField32::GUEST_GS_ACCESS_RIGHTS, kGuestXxAccessRightsDefault);

    vmcs.Write(VmcsField32::GUEST_LDTR_ACCESS_RIGHTS,
               kGuestXxAccessRightsTypeW | kGuestXxAccessRightsP);

    if (vpid == kBaseProcessorVpid) {
        // Use GUEST_RIP to set the entry point on the BSP.
        vmcs.Write(VmcsFieldXX::GUEST_CS_BASE, 0);
        vmcs.Write(VmcsField16::GUEST_CS_SELECTOR, 0);
        vmcs.Write(VmcsFieldXX::GUEST_RIP, entry);
    } else {
        // Use CS to set the entry point on APs.
        vmcs.Write(VmcsFieldXX::GUEST_CS_BASE, entry);
        vmcs.Write(VmcsField16::GUEST_CS_SELECTOR, static_cast<uint16_t>(entry >> 4));
        vmcs.Write(VmcsFieldXX::GUEST_RIP, 0);
    }
    vmcs.Write(VmcsField32::GUEST_CS_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_TR_BASE, 0);
    vmcs.Write(VmcsField16::GUEST_TR_SELECTOR, 0);
    vmcs.Write(VmcsField32::GUEST_TR_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_DS_BASE, 0);
    vmcs.Write(VmcsField32::GUEST_DS_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_SS_BASE, 0);
    vmcs.Write(VmcsField32::GUEST_SS_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_ES_BASE, 0);
    vmcs.Write(VmcsField32::GUEST_ES_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_FS_BASE, 0);
    vmcs.Write(VmcsField32::GUEST_FS_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_GS_BASE, 0);
    vmcs.Write(VmcsField32::GUEST_GS_LIMIT, 0xffff);
    vmcs.Write(VmcsField32::GUEST_LDTR_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_GDTR_BASE, 0);
    vmcs.Write(VmcsField32::GUEST_GDTR_LIMIT, 0xffff);
    vmcs.Write(VmcsFieldXX::GUEST_IDTR_BASE, 0);
    vmcs.Write(VmcsField32::GUEST_IDTR_LIMIT, 0xffff);

    // Set all reserved RFLAGS bits to their correct values
    vmcs.Write(VmcsFieldXX::GUEST_RFLAGS, X86_FLAGS_RESERVED_ONES);

    vmcs.Write(VmcsField32::GUEST_ACTIVITY_STATE, 0);
    vmcs.Write(VmcsField32::GUEST_INTERRUPTIBILITY_STATE, 0);
    vmcs.Write(VmcsFieldXX::GUEST_PENDING_DEBUG_EXCEPTIONS, 0);

    // From Volume 3, Section 26.3.1.1: The IA32_SYSENTER_ESP field and the
    // IA32_SYSENTER_EIP field must each contain a canonical address.
    vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_ESP, 0);
    vmcs.Write(VmcsFieldXX::GUEST_IA32_SYSENTER_EIP, 0);
    vmcs.Write(VmcsField32::GUEST_IA32_SYSENTER_CS, 0);

    vmcs.Write(VmcsFieldXX::GUEST_RSP, 0);
    vmcs.Write(VmcsFieldXX::GUEST_CR3, 0);

    // From Volume 3, Section 24.4.2: If the “VMCS shadowing” VM-execution
    // control is 1, the VMREAD and VMWRITE instructions access the VMCS
    // referenced by this pointer (see Section 24.10). Otherwise, software
    // should set this field to FFFFFFFF_FFFFFFFFH to avoid VM-entry
    // failures (see Section 26.3.1.5).
    vmcs.Write(VmcsField64::LINK_POINTER, kLinkPointerInvalidate);

    if (x86_feature_test(X86_FEATURE_XSAVE)) {
        // Enable x87 state in guest XCR0.
        vmx_state->guest_state.xcr0 = X86_XSAVE_STATE_BIT_X87;
    }

    return ZX_OK;
}

// static
zx_status_t Vcpu::Create(Guest* guest, zx_vaddr_t entry, ktl::unique_ptr<Vcpu>* out) {
    hypervisor::GuestPhysicalAddressSpace* gpas = guest->AddressSpace();
    if (entry >= gpas->size())
        return ZX_ERR_INVALID_ARGS;

    uint16_t vpid;
    zx_status_t status = guest->AllocVpid(&vpid);
    if (status != ZX_OK) {
        return status;
    }

    auto auto_call = fbl::MakeAutoCall([guest, vpid]() {
        guest->FreeVpid(vpid);
    });

    // When we create a VCPU, we bind it to the current thread and a CPU based
    // on the VPID. The VCPU must always be run on the current thread and the
    // given CPU, unless an explicit migration is performed.
    //
    // The reason we do this is that:
    // 1. The state of the current thread is stored within the VMCS, to be
    //    restored upon a guest-to-host transition.
    // 2. The state of the VMCS associated with the VCPU is cached within the
    //    CPU. To move to a different CPU, we must perform an explicit migration
    //    which will cost us performance.
    thread_t* thread = hypervisor::pin_thread(vpid);

    fbl::AllocChecker ac;
    ktl::unique_ptr<Vcpu> vcpu(new (&ac) Vcpu(guest, vpid, thread));
    if (!ac.check())
        return ZX_ERR_NO_MEMORY;

    timer_init(&vcpu->local_apic_state_.timer);
    status = vcpu->local_apic_state_.interrupt_tracker.Init();
    if (status != ZX_OK)
        return status;

    vcpu->pvclock_state_.is_stable =
        pvclock_is_present() ? pvclock_is_stable() : x86_feature_test(X86_FEATURE_INVAR_TSC);

    VmxInfo vmx_info;
    status = vcpu->host_msr_page_.Alloc(vmx_info, 0);
    if (status != ZX_OK)
        return status;

    status = vcpu->guest_msr_page_.Alloc(vmx_info, 0);
    if (status != ZX_OK)
        return status;

    status = vcpu->vmcs_page_.Alloc(vmx_info, 0);
    if (status != ZX_OK)
        return status;
    auto_call.cancel();

    VmxRegion* region = vcpu->vmcs_page_.VirtualAddress<VmxRegion>();
    region->revision_id = vmx_info.revision_id;
    zx_paddr_t table = gpas->arch_aspace()->arch_table_phys();
    status = vmcs_init(vcpu->vmcs_page_.PhysicalAddress(), vpid, entry, guest->MsrBitmapsAddress(),
                       table, &vcpu->vmx_state_, &vcpu->host_msr_page_, &vcpu->guest_msr_page_);
    if (status != ZX_OK)
        return status;

    *out = ktl::move(vcpu);
    return ZX_OK;
}

Vcpu::Vcpu(Guest* guest, uint16_t vpid, const thread_t* thread)
    : guest_(guest), vpid_(vpid), thread_(thread), running_(false), vmx_state_(/* zero-init */) {}

Vcpu::~Vcpu() {
    if (!vmcs_page_.IsAllocated()) {
        return;
    }
    timer_cancel(&local_apic_state_.timer);
    // The destructor may be called from a different thread, therefore we must
    // pin the current thread to the same CPU as the VCPU.
    AutoPin pin(vpid_);
    vmclear(vmcs_page_.PhysicalAddress());
    __UNUSED zx_status_t status = guest_->FreeVpid(vpid_);
    DEBUG_ASSERT(status == ZX_OK);
}

// Injects an interrupt into the guest, if there is one pending.
static zx_status_t local_apic_maybe_interrupt(AutoVmcs* vmcs, LocalApicState* local_apic_state) {
    // Since hardware-generated exceptions are delivered to the guest directly, the only exceptions
    // we see here are those we generate in the VMM, e.g. GP faults in vmexit handlers. Therefore
    // we simplify interrupt priority to 1) NMIs, 2) interrupts, and 3) generated exceptions. See
    // Volume 3, Section 6.9, Table 6-2.
    uint32_t vector;
    hypervisor::InterruptType type = local_apic_state->interrupt_tracker.TryPop(X86_INT_NMI);
    if (type != hypervisor::InterruptType::INACTIVE) {
        vector = X86_INT_NMI;
    } else {
        // Pop scans vectors from highest to lowest, which will correctly pop interrupts before
        // exceptions. All vectors <= X86_INT_VIRT except the NMI vector are exceptions.
        type = local_apic_state->interrupt_tracker.Pop(&vector);
        if (type == hypervisor::InterruptType::INACTIVE) {
            return ZX_OK;
        }
    }

    if (vector > X86_INT_VIRT && vector < X86_INT_PLATFORM_BASE) {
        dprintf(INFO, "Invalid interrupt vector: %u\n", vector);
        return ZX_ERR_NOT_SUPPORTED;
    } else if (vector >= X86_INT_PLATFORM_BASE &&
        !(vmcs->Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_IF)) {
        // Volume 3, Section 6.8.1: The IF flag does not affect non-maskable interrupts (NMIs),
        // [...] nor does it affect processor generated exceptions.
        local_apic_state->interrupt_tracker.Track(vector, type);
        // If interrupts are disabled, we set VM exit on interrupt enable.
        vmcs->InterruptWindowExiting(true);
        return ZX_OK;
    }

    // If the vector is non-maskable or interrupts are enabled, we inject an interrupt.
    vmcs->IssueInterrupt(vector);

    // Volume 3, Section 6.9: Lower priority exceptions are discarded; lower priority interrupts are
    // held pending. Discarded exceptions are re-generated when the interrupt handler returns
    // execution to the point in the program or task where the exceptions and/or interrupts
    // occurred.
    local_apic_state->interrupt_tracker.Clear(0, X86_INT_NMI);
    local_apic_state->interrupt_tracker.Clear(X86_INT_NMI + 1, X86_INT_VIRT + 1);

    return ZX_OK;
}

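// Repeatedly enters the guest, handling VM exits in the kernel where possible.
// Returns when a VM exit must be handled by user space (reported via |packet|)
// or when an error occurs.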
zx_status_t Vcpu::Resume(zx_port_packet_t* packet) {
    if (!hypervisor::check_pinned_cpu_invariant(vpid_, thread_))
        return ZX_ERR_BAD_STATE;
    zx_status_t status;
    do {
        AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
        status = local_apic_maybe_interrupt(&vmcs, &local_apic_state_);
        if (status != ZX_OK) {
            return status;
        }
        if (x86_feature_test(X86_FEATURE_XSAVE)) {
            // Save the host XCR0, and load the guest XCR0.
            vmx_state_.host_state.xcr0 = x86_xgetbv(0);
            x86_xsetbv(0, vmx_state_.guest_state.xcr0);
        }

        // Updates guest system time if the guest subscribed to updates.
        pvclock_update_system_time(&pvclock_state_, guest_->AddressSpace());

        ktrace(TAG_VCPU_ENTER, 0, 0, 0, 0);
        running_.store(true);
        status = vmx_enter(&vmx_state_);
        running_.store(false);
        if (x86_feature_test(X86_FEATURE_XSAVE)) {
            // Save the guest XCR0, and load the host XCR0.
            vmx_state_.guest_state.xcr0 = x86_xgetbv(0);
            x86_xsetbv(0, vmx_state_.host_state.xcr0);
        }

        if (status != ZX_OK) {
            ktrace_vcpu_exit(VCPU_FAILURE, vmcs.Read(VmcsFieldXX::GUEST_RIP));
            uint64_t error = vmcs.Read(VmcsField32::INSTRUCTION_ERROR);
            dprintf(INFO, "VCPU resume failed: %#lx\n", error);
        } else {
            vmx_state_.resume = true;
            status = vmexit_handler(&vmcs, &vmx_state_.guest_state, &local_apic_state_,
                                    &pvclock_state_, guest_->AddressSpace(), guest_->Traps(),
                                    packet);
        }
    } while (status == ZX_OK);
    return status == ZX_ERR_NEXT ? ZX_OK : status;
}

void vmx_exit(VmxState* vmx_state) {
    DEBUG_ASSERT(arch_ints_disabled());

    // Reload the task segment in order to restore its limit. VMX always
    // restores it with a limit of 0x67, which excludes the IO bitmap.
    seg_sel_t selector = TSS_SELECTOR(arch_curr_cpu_num());
    x86_clear_tss_busy(selector);
    x86_ltr(selector);
}

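// Tracks the given interrupt for this VCPU and returns a mask of CPUs that
// should be interrupted so a running VCPU re-evaluates its pending interrupts.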
cpu_mask_t Vcpu::Interrupt(uint32_t vector, hypervisor::InterruptType type) {
    bool signaled = false;
    local_apic_state_.interrupt_tracker.Interrupt(vector, type, &signaled);
    if (signaled || !running_.load()) {
        return 0;
    }
    return cpu_num_to_mask(hypervisor::cpu_of(vpid_));
}

void Vcpu::VirtualInterrupt(uint32_t vector) {
    cpu_mask_t mask = Interrupt(vector, hypervisor::InterruptType::VIRTUAL);
    if (mask != 0) {
        mp_interrupt(MP_IPI_TARGET_MASK, mask);
    }
}

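// Copies the general-purpose registers between the guest VMX state and
// zx_vcpu_state_t.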
template <typename Out, typename In>
static void register_copy(Out* out, const In& in) {
    out->rax = in.rax;
    out->rcx = in.rcx;
    out->rdx = in.rdx;
    out->rbx = in.rbx;
    out->rbp = in.rbp;
    out->rsi = in.rsi;
    out->rdi = in.rdi;
    out->r8 = in.r8;
    out->r9 = in.r9;
    out->r10 = in.r10;
    out->r11 = in.r11;
    out->r12 = in.r12;
    out->r13 = in.r13;
    out->r14 = in.r14;
    out->r15 = in.r15;
}

zx_status_t Vcpu::ReadState(uint32_t kind, void* buf, size_t len) const {
    if (!hypervisor::check_pinned_cpu_invariant(vpid_, thread_))
        return ZX_ERR_BAD_STATE;
    switch (kind) {
    case ZX_VCPU_STATE: {
        if (len != sizeof(zx_vcpu_state_t))
            break;
        auto state = static_cast<zx_vcpu_state_t*>(buf);
        register_copy(state, vmx_state_.guest_state);
        AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
        state->rsp = vmcs.Read(VmcsFieldXX::GUEST_RSP);
        state->rflags = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS) & X86_FLAGS_USER;
        return ZX_OK;
    }
    }
    return ZX_ERR_INVALID_ARGS;
}

zx_status_t Vcpu::WriteState(uint32_t kind, const void* buf, size_t len) {
    if (!hypervisor::check_pinned_cpu_invariant(vpid_, thread_))
        return ZX_ERR_BAD_STATE;
    switch (kind) {
    case ZX_VCPU_STATE: {
        if (len != sizeof(zx_vcpu_state_t))
            break;
        auto state = static_cast<const zx_vcpu_state_t*>(buf);
        register_copy(&vmx_state_.guest_state, *state);
        AutoVmcs vmcs(vmcs_page_.PhysicalAddress());
        vmcs.Write(VmcsFieldXX::GUEST_RSP, state->rsp);
        if (state->rflags & X86_FLAGS_RESERVED_ONES) {
            const uint64_t rflags = vmcs.Read(VmcsFieldXX::GUEST_RFLAGS);
            const uint64_t user_flags = (rflags & ~X86_FLAGS_USER) |
                                        (state->rflags & X86_FLAGS_USER);
            vmcs.Write(VmcsFieldXX::GUEST_RFLAGS, user_flags);
        }
        return ZX_OK;
    }
    case ZX_VCPU_IO: {
        if (len != sizeof(zx_vcpu_io_t))
            break;
        auto io = static_cast<const zx_vcpu_io_t*>(buf);
        memcpy(&vmx_state_.guest_state.rax, io->data, io->access_size);
        return ZX_OK;
    }
    }
    return ZX_ERR_INVALID_ARGS;
}

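// Checks whether the given CR0 value is valid for VM entry against the VMX
// fixed-bit MSRs, taking unrestricted guest mode into account.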
bool cr0_is_invalid(AutoVmcs* vmcs, uint64_t cr0_value) {
    uint64_t check_value = cr0_value;
    // From Volume 3, Section 26.3.1.1: PE and PG bits of CR0 are not checked when unrestricted
    // guest is enabled. Set both here to avoid clashing with X86_MSR_IA32_VMX_CR0_FIXED1.
    if (vmcs->Read(VmcsField32::PROCBASED_CTLS2) & kProcbasedCtls2UnrestrictedGuest) {
        check_value |= X86_CR0_PE | X86_CR0_PG;
    }
    return cr_is_invalid(check_value, X86_MSR_IA32_VMX_CR0_FIXED0, X86_MSR_IA32_VMX_CR0_FIXED1);
}