1 /*
2  * Copyright (C) 2021-2022 Intel Corporation.
3  *
4  * SPDX-License-Identifier: BSD-3-Clause
5  */
6 
7 #include <types.h>
8 #include <logmsg.h>
9 #include <asm/mmu.h>
10 #include <asm/guest/virq.h>
11 #include <asm/guest/ept.h>
12 #include <asm/guest/vcpu.h>
13 #include <asm/guest/vm.h>
14 #include <asm/guest/vmcs.h>
15 #include <asm/guest/nested.h>
16 #include <asm/guest/vept.h>
17 
18 /* Cache the content of MSR_IA32_VMX_BASIC */
19 static uint32_t vmx_basic;
20 
21 static void disable_vmcs_shadowing(void);
22 static void clear_vvmcs(struct acrn_vcpu *vcpu, struct acrn_vvmcs *vvmcs);
23 
24 /* The only purpose of this array is to serve the is_vmx_msr() function */
25 static const uint32_t vmx_msrs[NUM_VMX_MSRS] = {
26 	LIST_OF_VMX_MSRS
27 };
28 
bool is_vmx_msr(uint32_t msr)
30 {
31 	bool found = false;
32 	uint32_t i;
33 
34 	for (i = 0U; i < NUM_VMX_MSRS; i++) {
35 		if (msr == vmx_msrs[i]) {
36 			found = true;
37 			break;
38 		}
39 	}
40 
41 	return found;
42 }
43 
static uint64_t adjust_vmx_ctrls(uint32_t msr, uint64_t request_bits)
45 {
46 	union value_64 val64, msr_val;
47 
48 	/*
49 	 * ISDM Appendix A.3, A.4, A.5:
50 	 * - Bits 31:0 indicate the allowed 0-settings of these controls.
51 	 *   bit X of the corresponding VM-execution controls field is allowed to be 0
52 	 *   if bit X in the MSR is cleared to 0
53 	 * - Bits 63:32 indicate the allowed 1-settings of these controls.
54 	 *   VM entry allows control X to be 1 if bit 32+X in the MSR is set to 1
55 	 */
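	/*
	 * Illustrative example with hypothetical values: if the capability MSR reads
	 * 0x0000FFFF00000016, the allowed 0-settings (low 32 bits) are 0x16, so those
	 * bits must always be 1 in the control. With request_bits == 0x81 the emulated
	 * MSR computed below reports lo_32 = 0x16 and hi_32 = 0x16 | (0xFFFF & 0x81) = 0x97,
	 * i.e. the guest may additionally set only the requested bits that the hardware
	 * actually supports.
	 */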
56 	msr_val.full = msr_read(msr);
57 
58 	/*
	 * The reserved bits in the VMCS control fields could be 0 or 1, as determined by the
	 * corresponding capability MSR, so they need to be read from the physical MSR.
61 	 *
62 	 * We consider the bits that are set in the allowed 0-settings group as the
63 	 * minimal set of bits that need to be set from the physical processor's perspective.
	 * Since we shadow this control field, we pass through the allowed 0-settings bits.
65 	 */
66 	val64.u.lo_32 = msr_val.u.lo_32;
67 
	/* allowed 1-settings include those bits that are NOT allowed to be 0 */
69 	val64.u.hi_32 = msr_val.u.lo_32;
70 
71 	/* make sure the requested features are supported by hardware */
72 	val64.u.hi_32 |= (msr_val.u.hi_32 & request_bits);
73 
74 	return val64.full;
75 }
76 
77 /*
78  * @pre vcpu != NULL
79  */
void init_vmx_msrs(struct acrn_vcpu *vcpu)
81 {
82 	union value_64 val64;
83 	uint64_t request_bits, msr_value;
84 
85 	if (is_nvmx_configured(vcpu->vm)) {
86 		/* MSR_IA32_VMX_BASIC */
87 		val64.full = VMCS12_REVISION_ID	/* Bits 30:0 - VMCS revision ID */
88 			| (4096UL << 32U)	/* Bits 44:32 - size of VMXON region and VMCS region */
89 			| (6UL << 50U)		/* Bits 53:50 - memory type for VMCS etc. (6: Write Back) */
90 			| (1UL << 54U)		/* Bit 54: VM-exit instruction-information for INS and OUTS */
91 			| (1UL << 55U);		/* Bit 55: VMX controls that default to 1 may be cleared to 0 */
92 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_BASIC, val64.full);
93 
94 		/* MSR_IA32_VMX_MISC */
95 
96 		/*
		 * Some bits need to be read from the physical MSR. For example, bits 4:0 report the relationship between
98 		 * the rate of the VMX-preemption timer and that of the timestamp counter (TSC).
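		 * (As an illustration: per the SDM definition of this field, a value of 5 in
		 * bits 4:0 would mean the preemption timer counts down once every 2^5 = 32
		 * TSC increments.)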
99 		 */
100 		val64.full = msr_read(MSR_IA32_VMX_MISC);
101 		val64.u.hi_32 = 0U;
102 
103 		/* Don't support Intel® Processor Trace (Intel PT) in VMX operation */
104 		val64.u.lo_32 &= ~(1U << 14U);
105 
106 		/* Don't support SMM in VMX operation */
107 		val64.u.lo_32 &= ~((1U << 15U) | (1U << 28U));
108 
109 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_MISC, val64.full);
110 
111 		/*
		 * TODO: These emulated VMX Control MSRs work for Tiger Lake and Kaby Lake;
		 * they may have problems on other platforms.
		 *
		 * We have not made a best effort to enable as many features as possible.
117 		 */
118 
119 		/* MSR_IA32_VMX_PINBASED_CTLS */
120 		request_bits = VMX_PINBASED_CTLS_IRQ_EXIT
121 			| VMX_PINBASED_CTLS_NMI_EXIT
122 			| VMX_PINBASED_CTLS_ENABLE_PTMR;
123 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PINBASED_CTLS, request_bits);
124 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PINBASED_CTLS, msr_value);
125 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_PINBASED_CTLS, request_bits);
126 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_PINBASED_CTLS, msr_value);
127 
128 		/* MSR_IA32_VMX_PROCBASED_CTLS */
129 		request_bits = VMX_PROCBASED_CTLS_IRQ_WIN | VMX_PROCBASED_CTLS_TSC_OFF
130 			| VMX_PROCBASED_CTLS_HLT | VMX_PROCBASED_CTLS_INVLPG
131 			| VMX_PROCBASED_CTLS_MWAIT | VMX_PROCBASED_CTLS_RDPMC
132 			| VMX_PROCBASED_CTLS_RDTSC | VMX_PROCBASED_CTLS_CR3_LOAD
133 			| VMX_PROCBASED_CTLS_CR3_STORE | VMX_PROCBASED_CTLS_CR8_LOAD
134 			| VMX_PROCBASED_CTLS_CR8_STORE | VMX_PROCBASED_CTLS_NMI_WINEXIT
135 			| VMX_PROCBASED_CTLS_MOV_DR | VMX_PROCBASED_CTLS_UNCOND_IO
136 			| VMX_PROCBASED_CTLS_MSR_BITMAP | VMX_PROCBASED_CTLS_MONITOR
137 			| VMX_PROCBASED_CTLS_PAUSE | VMX_PROCBASED_CTLS_SECONDARY;
138 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PROCBASED_CTLS, request_bits);
139 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS, msr_value);
140 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_PROCBASED_CTLS, request_bits);
141 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_PROCBASED_CTLS, msr_value);
142 
143 		/* MSR_IA32_VMX_PROCBASED_CTLS2 */
144 		request_bits = VMX_PROCBASED_CTLS2_EPT | VMX_PROCBASED_CTLS2_RDTSCP
145 			| VMX_PROCBASED_CTLS2_VPID | VMX_PROCBASED_CTLS2_WBINVD
146 			| VMX_PROCBASED_CTLS2_UNRESTRICT | VMX_PROCBASED_CTLS2_PAUSE_LOOP
147 			| VMX_PROCBASED_CTLS2_RDRAND | VMX_PROCBASED_CTLS2_INVPCID
148 			| VMX_PROCBASED_CTLS2_RDSEED | VMX_PROCBASED_CTLS2_XSVE_XRSTR
149 			| VMX_PROCBASED_CTLS2_TSC_SCALING;
150 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_PROCBASED_CTLS2, request_bits);
151 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_PROCBASED_CTLS2, msr_value);
152 
153 		/* MSR_IA32_VMX_EXIT_CTLS */
154 		request_bits = VMX_EXIT_CTLS_SAVE_DBG | VMX_EXIT_CTLS_HOST_ADDR64
155 			| VMX_EXIT_CTLS_ACK_IRQ | VMX_EXIT_CTLS_LOAD_PAT
156 			| VMX_EXIT_CTLS_LOAD_EFER;
157 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_EXIT_CTLS, request_bits);
158 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_EXIT_CTLS, msr_value);
159 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_EXIT_CTLS, request_bits);
160 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_EXIT_CTLS, msr_value);
161 
162 		/* MSR_IA32_VMX_ENTRY_CTLS */
163 		request_bits = VMX_ENTRY_CTLS_LOAD_DBG | VMX_ENTRY_CTLS_IA32E_MODE
164 			| VMX_ENTRY_CTLS_LOAD_PERF | VMX_ENTRY_CTLS_LOAD_PAT
165 			| VMX_ENTRY_CTLS_LOAD_EFER;
166 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_ENTRY_CTLS, request_bits);
167 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_ENTRY_CTLS, msr_value);
168 		msr_value = adjust_vmx_ctrls(MSR_IA32_VMX_TRUE_ENTRY_CTLS, request_bits);
169 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_TRUE_ENTRY_CTLS, msr_value);
170 
171 		msr_value = msr_read(MSR_IA32_VMX_EPT_VPID_CAP);
172 		/*
173 		 * Hide 5 level EPT capability
174 		 * Hide accessed and dirty flags for EPT
175 		 */
176 		msr_value &= ~(VMX_EPT_PAGE_WALK_5 | VMX_EPT_AD | VMX_EPT_2MB_PAGE | VMX_EPT_1GB_PAGE);
177 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_EPT_VPID_CAP, msr_value);
178 
179 		/* For now passthru the value from physical MSR to L1 guest */
180 		msr_value = msr_read(MSR_IA32_VMX_CR0_FIXED0);
181 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR0_FIXED0, msr_value);
182 
183 		msr_value = msr_read(MSR_IA32_VMX_CR0_FIXED1);
184 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR0_FIXED1, msr_value);
185 
186 		msr_value = msr_read(MSR_IA32_VMX_CR4_FIXED0);
187 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR4_FIXED0, msr_value);
188 
189 		msr_value = msr_read(MSR_IA32_VMX_CR4_FIXED1);
190 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_CR4_FIXED1, msr_value);
191 
192 		msr_value = msr_read(MSR_IA32_VMX_VMCS_ENUM);
193 		vcpu_set_guest_msr(vcpu, MSR_IA32_VMX_VMCS_ENUM, msr_value);
194 	}
195 }
196 
197 /*
198  * @pre vcpu != NULL
199  */
int32_t read_vmx_msr(struct acrn_vcpu *vcpu, uint32_t msr, uint64_t *val)
201 {
202 	uint64_t v = 0UL;
203 	int32_t err = 0;
204 
205 	if (is_nvmx_configured(vcpu->vm)) {
206 		switch (msr) {
207 		case MSR_IA32_VMX_TRUE_PINBASED_CTLS:
208 		case MSR_IA32_VMX_PINBASED_CTLS:
209 		case MSR_IA32_VMX_PROCBASED_CTLS:
210 		case MSR_IA32_VMX_TRUE_PROCBASED_CTLS:
211 		case MSR_IA32_VMX_PROCBASED_CTLS2:
212 		case MSR_IA32_VMX_EXIT_CTLS:
213 		case MSR_IA32_VMX_TRUE_EXIT_CTLS:
214 		case MSR_IA32_VMX_ENTRY_CTLS:
215 		case MSR_IA32_VMX_TRUE_ENTRY_CTLS:
216 		case MSR_IA32_VMX_BASIC:
217 		case MSR_IA32_VMX_MISC:
218 		case MSR_IA32_VMX_EPT_VPID_CAP:
219 		case MSR_IA32_VMX_CR0_FIXED0:
220 		case MSR_IA32_VMX_CR0_FIXED1:
221 		case MSR_IA32_VMX_CR4_FIXED0:
222 		case MSR_IA32_VMX_CR4_FIXED1:
223 		case MSR_IA32_VMX_VMCS_ENUM:
224 		{
225 			v = vcpu_get_guest_msr(vcpu, msr);
226 			break;
227 		}
228 		/* Don't support these MSRs yet */
229 		case MSR_IA32_SMBASE:
230 		case MSR_IA32_VMX_PROCBASED_CTLS3:
231 		case MSR_IA32_VMX_VMFUNC:
232 		default:
233 			err = -EACCES;
234 			break;
235 		}
236 	} else {
237 		err = -EACCES;
238 	}
239 
240 	*val = v;
241 	return err;
242 }
243 
244 #define MAX_SHADOW_VMCS_FIELDS 113U
245 /*
246  * VMCS fields included in the dual-purpose VMCS: as shadow for L1 and
247  * as hardware VMCS for nested guest (L2).
248  *
249  * TODO: This list is for TGL and CFL machines and the fields
 * for advanced APICv features such as Posted Interrupt and Virtual
251  * Interrupt Delivery are not included, as these are not available
252  * on those platforms.
253  *
 * Certain fields, e.g. VMX_TSC_MULTIPLIER_FULL, are available only if
 * "use TSC scaling" is supported. Thus a static array may not work
256  * for all platforms.
257  */
258 static const uint32_t vmcs_shadowing_fields[MAX_SHADOW_VMCS_FIELDS] = {
259 	/* 16-bits */
260 	VMX_GUEST_ES_SEL,
261 	VMX_GUEST_CS_SEL,
262 	VMX_GUEST_SS_SEL,
263 	VMX_GUEST_DS_SEL,
264 	VMX_GUEST_FS_SEL,
265 	VMX_GUEST_GS_SEL,
266 	VMX_GUEST_LDTR_SEL,
267 	VMX_GUEST_TR_SEL,
268 	VMX_GUEST_PML_INDEX,
269 
270 	/* 64-bits */
271 	VMX_IO_BITMAP_A_FULL,
272 	VMX_IO_BITMAP_B_FULL,
273 	VMX_EXIT_MSR_STORE_ADDR_FULL,
274 	VMX_EXIT_MSR_LOAD_ADDR_FULL,
275 	VMX_ENTRY_MSR_LOAD_ADDR_FULL,
276 	VMX_EXECUTIVE_VMCS_PTR_FULL,
277 	VMX_TSC_OFFSET_FULL,
278 	VMX_VIRTUAL_APIC_PAGE_ADDR_FULL,
279 	VMX_APIC_ACCESS_ADDR_FULL,
280 	VMX_VMREAD_BITMAP_FULL,
281 	VMX_VMWRITE_BITMAP_FULL,
282 	VMX_XSS_EXITING_BITMAP_FULL,
283 	VMX_TSC_MULTIPLIER_FULL,
284 	VMX_GUEST_PHYSICAL_ADDR_FULL,
285 	VMX_VMS_LINK_PTR_FULL,
286 	VMX_GUEST_IA32_DEBUGCTL_FULL,
287 	VMX_GUEST_IA32_PAT_FULL,
288 	VMX_GUEST_IA32_EFER_FULL,
289 	VMX_GUEST_IA32_PERF_CTL_FULL,
290 	VMX_GUEST_PDPTE0_FULL,
291 	VMX_GUEST_PDPTE1_FULL,
292 	VMX_GUEST_PDPTE2_FULL,
293 	VMX_GUEST_PDPTE3_FULL,
294 
295 	/* 32-bits */
296 	VMX_PIN_VM_EXEC_CONTROLS,
297 	VMX_PROC_VM_EXEC_CONTROLS,
298 	VMX_EXCEPTION_BITMAP,
299 	VMX_PF_ERROR_CODE_MASK,
300 	VMX_PF_ERROR_CODE_MATCH,
301 	VMX_CR3_TARGET_COUNT,
302 	VMX_EXIT_MSR_STORE_COUNT,
303 	VMX_EXIT_MSR_LOAD_COUNT,
304 	VMX_ENTRY_MSR_LOAD_COUNT,
305 	VMX_ENTRY_INT_INFO_FIELD,
306 	VMX_ENTRY_EXCEPTION_ERROR_CODE,
307 	VMX_ENTRY_INSTR_LENGTH,
308 	VMX_TPR_THRESHOLD,
309 	VMX_PROC_VM_EXEC_CONTROLS2,
310 	VMX_PLE_GAP,
311 	VMX_PLE_WINDOW,
312 	VMX_INSTR_ERROR,
313 	VMX_EXIT_REASON,
314 	VMX_EXIT_INT_INFO,
315 	VMX_EXIT_INT_ERROR_CODE,
316 	VMX_IDT_VEC_INFO_FIELD,
317 	VMX_IDT_VEC_ERROR_CODE,
318 	VMX_EXIT_INSTR_LEN,
319 	VMX_INSTR_INFO,
320 	VMX_GUEST_ES_LIMIT,
321 	VMX_GUEST_CS_LIMIT,
322 	VMX_GUEST_SS_LIMIT,
323 	VMX_GUEST_DS_LIMIT,
324 	VMX_GUEST_FS_LIMIT,
325 	VMX_GUEST_GS_LIMIT,
326 	VMX_GUEST_LDTR_LIMIT,
327 	VMX_GUEST_TR_LIMIT,
328 	VMX_GUEST_GDTR_LIMIT,
329 	VMX_GUEST_IDTR_LIMIT,
330 	VMX_GUEST_ES_ATTR,
331 	VMX_GUEST_CS_ATTR,
332 	VMX_GUEST_SS_ATTR,
333 	VMX_GUEST_DS_ATTR,
334 	VMX_GUEST_FS_ATTR,
335 	VMX_GUEST_GS_ATTR,
336 	VMX_GUEST_LDTR_ATTR,
337 	VMX_GUEST_TR_ATTR,
338 	VMX_GUEST_INTERRUPTIBILITY_INFO,
339 	VMX_GUEST_ACTIVITY_STATE,
340 	VMX_GUEST_SMBASE,
341 	VMX_GUEST_IA32_SYSENTER_CS,
342 	VMX_GUEST_TIMER,
343 	VMX_CR0_GUEST_HOST_MASK,
344 	VMX_CR4_GUEST_HOST_MASK,
345 	VMX_CR0_READ_SHADOW,
346 	VMX_CR4_READ_SHADOW,
347 	VMX_CR3_TARGET_0,
348 	VMX_CR3_TARGET_1,
349 	VMX_CR3_TARGET_2,
350 	VMX_CR3_TARGET_3,
351 	VMX_EXIT_QUALIFICATION,
352 	VMX_IO_RCX,
353 	VMX_IO_RSI,
354 	VMX_IO_RDI,
355 	VMX_IO_RIP,
356 	VMX_GUEST_LINEAR_ADDR,
357 	VMX_GUEST_CR0,
358 	VMX_GUEST_CR3,
359 	VMX_GUEST_CR4,
360 	VMX_GUEST_ES_BASE,
361 	VMX_GUEST_CS_BASE,
362 	VMX_GUEST_SS_BASE,
363 	VMX_GUEST_DS_BASE,
364 	VMX_GUEST_FS_BASE,
365 	VMX_GUEST_GS_BASE,
366 	VMX_GUEST_LDTR_BASE,
367 	VMX_GUEST_TR_BASE,
368 	VMX_GUEST_GDTR_BASE,
369 	VMX_GUEST_IDTR_BASE,
370 	VMX_GUEST_DR7,
371 	VMX_GUEST_RSP,
372 	VMX_GUEST_RIP,
373 	VMX_GUEST_RFLAGS,
374 	VMX_GUEST_PENDING_DEBUG_EXCEPT,
375 	VMX_GUEST_IA32_SYSENTER_ESP,
376 	VMX_GUEST_IA32_SYSENTER_EIP
377 };
378 
379 /* to be shared by all vCPUs for all nested guests */
380 static uint64_t vmcs_shadowing_bitmap[PAGE_SIZE / sizeof(uint64_t)] __aligned(PAGE_SIZE);
381 
static void setup_vmcs_shadowing_bitmap(void)
383 {
384 	uint16_t field_index;
385 	uint32_t array_index;
386 	uint16_t bit_pos;
387 
388 	/*
	 * Set all the bits to 1 first, then clear the bits for the fields
	 * that ACRN allows its guest to access through the shadow VMCS.
391 	 */
392 	memset((void *)vmcs_shadowing_bitmap, 0xFFU, PAGE_SIZE);
393 
394 	/*
395 	 * Refer to ISDM Section 24.6.15 VMCS Shadowing Bitmap Addresses
396 	 * and Section 30.3 VMX Instructions - VMWRITE/VMREAD
397 	 */
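	/*
	 * For example, field encoding 0802H (VMX_GUEST_CS_SEL) maps to qword
	 * 0x802 / 64 = 32 of the bitmap at bit 0x802 % 64 = 2; that bit is cleared
	 * below so L1's VMREAD/VMWRITE of this field hit the shadow VMCS directly.
	 */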
398 	for (field_index = 0U; field_index < MAX_SHADOW_VMCS_FIELDS; field_index++) {
399 		bit_pos = vmcs_shadowing_fields[field_index] % 64U;
400 		array_index = vmcs_shadowing_fields[field_index] / 64U;
401 		bitmap_clear_nolock(bit_pos, &vmcs_shadowing_bitmap[array_index]);
402 	}
403 }
404 
405 /*
406  * This is an array of offsets into a structure of type "struct acrn_vmcs12"
407  * 16 offsets for a total of 16 GROUPs. 4 "field widths" by 4 "field types".
408  * "Field type" is either Control, Read-Only Data, Guest State or Host State.
409  * Refer to the definition of "struct acrn_vmcs12" on how the fields are
410  * grouped together for these offsets to work in tandem.
411  * Refer to Intel SDM Appendix B Field Encoding in VMCS for info on how
412  * fields are grouped and indexed within a group.
413  */
414 static const uint16_t vmcs12_group_offset_table[16] = {
415 	offsetof(struct acrn_vmcs12, vpid),		/* 16-bit Control Fields */
416 	offsetof(struct acrn_vmcs12, padding),		/* 16-bit Read-Only Fields */
417 	offsetof(struct acrn_vmcs12, guest_es),		/* 16-bit Guest-State Fields */
418 	offsetof(struct acrn_vmcs12, host_es),		/* 16-bit Host-State Fields */
419 	offsetof(struct acrn_vmcs12, io_bitmap_a),	/* 64-bit Control Fields */
420 	offsetof(struct acrn_vmcs12, guest_phys_addr),	/* 64-bit Read-Only Data Fields */
421 	offsetof(struct acrn_vmcs12, vmcs_link_ptr),	/* 64-bit Guest-State Fields */
422 	offsetof(struct acrn_vmcs12, host_ia32_pat),	/* 64-bit Host-State Fields */
423 	offsetof(struct acrn_vmcs12, pin_based_exec_ctrl),	/* 32-bit Control Fields */
424 	offsetof(struct acrn_vmcs12, vm_instr_error),	/* 32-bit Read-Only Data Fields */
425 	offsetof(struct acrn_vmcs12, guest_es_limit),	/* 32-bit Guest-State Fields */
426 	offsetof(struct acrn_vmcs12, host_ia32_sysenter_cs),	/* 32-bit Host-State Fields */
427 	offsetof(struct acrn_vmcs12, cr0_guest_host_mask),	/* Natural-width Control Fields */
428 	offsetof(struct acrn_vmcs12, exit_qual),		/* Natural-width Read-Only Data Fields */
429 	offsetof(struct acrn_vmcs12, guest_cr0),		/* Natural-width Guest-State Fields */
430 	offsetof(struct acrn_vmcs12, host_cr0),			/* Natural-width Host-State Fields */
431 };
432 
433 /*
434  * field_idx is the index of the field within the group.
435  *
 * Access-type is 0 for all widths except 64-bit.
 * For 64-bit fields, an access-type of 1 moves the offset to the
 * high 4 bytes of the field.
439  */
440 #define OFFSET_INTO_VMCS12(group_idx, field_idx, width_in_bytes, access_type) \
441 	(vmcs12_group_offset_table[group_idx] + \
442 	field_idx * width_in_bytes + \
443 	access_type * sizeof(uint32_t))
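
/*
 * Worked example (assuming the SDM field encoding and the acrn_vmcs12 grouping
 * described above): VMX_GUEST_CS_SEL (encoding 0802H) has width 0 (16-bit),
 * type 2 (guest-state), index 1 and access-type 0, so group_idx = (0 << 2) | 2 = 2
 * and the offset is offsetof(struct acrn_vmcs12, guest_es) + 1 * 2 + 0, i.e. the
 * 16-bit guest-state field that immediately follows guest_es (the guest CS selector).
 */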
444 
445 /* Given a vmcs field, this API returns the offset into "struct acrn_vmcs12" */
static uint16_t vmcs_field_to_vmcs12_offset(uint32_t vmcs_field)
447 {
448 	/*
449 	 * Refer to Appendix B Field Encoding in VMCS in SDM
450 	 * A value of group index 0001b is not valid because there are no 16-bit
451 	 * Read-Only fields.
452 	 *
453 	 * TODO: check invalid VMCS field
454 	 */
455 	uint16_t group_idx = (VMX_VMCS_FIELD_WIDTH(vmcs_field) << 2U) | VMX_VMCS_FIELD_TYPE(vmcs_field);
456 	uint8_t field_width = VMX_VMCS_FIELD_WIDTH(vmcs_field);
457 	uint8_t width_in_bytes;
458 
459 	if (field_width == VMX_VMCS_FIELD_WIDTH_16) {
460 		width_in_bytes = 2U;
461 	} else if (field_width == VMX_VMCS_FIELD_WIDTH_32) {
462 		width_in_bytes = 4U;
463 	} else {
464 		/*
465 		 * Natural-width or 64-bit
466 		 */
467 		width_in_bytes = 8U;
468 	}
469 
470 	return OFFSET_INTO_VMCS12(group_idx,
471 		VMX_VMCS_FIELD_INDEX(vmcs_field), width_in_bytes, /* field index within the group */
472 		VMX_VMCS_FIELD_ACCESS_HIGH(vmcs_field));
473 }
474 
475 /*
476  * Given a vmcs field and the pointer to the vmcs12, this API returns the
477  * corresponding value from the VMCS
478  */
static uint64_t vmcs12_read_field(void *vmcs_hva, uint32_t field)
480 {
481 	uint64_t *ptr = (uint64_t *)(vmcs_hva + vmcs_field_to_vmcs12_offset(field));
482 	uint64_t val64 = 0UL;
483 
484 	switch (VMX_VMCS_FIELD_WIDTH(field)) {
485 		case VMX_VMCS_FIELD_WIDTH_16:
486 			val64 = *(uint16_t *)ptr;
487 			break;
488 		case VMX_VMCS_FIELD_WIDTH_32:
489 			val64 = *(uint32_t *)ptr;
490 			break;
491 		case VMX_VMCS_FIELD_WIDTH_64:
492 			if (!!VMX_VMCS_FIELD_ACCESS_HIGH(field)) {
493 				val64 = *(uint32_t *)ptr;
494 			} else {
495 				val64 = *ptr;
496 			}
497 			break;
498 		case VMX_VMCS_FIELD_WIDTH_NATURAL:
499 		default:
500 			val64 = *ptr;
501 			break;
502 	}
503 
504 	return val64;
505 }
506 
507 /*
508  * Write the given VMCS field to the given vmcs12 data structure.
509  */
static void vmcs12_write_field(void *vmcs_hva, uint32_t field, uint64_t val64)
511 {
512 	uint64_t *ptr = (uint64_t *)(vmcs_hva + vmcs_field_to_vmcs12_offset(field));
513 
514 	switch (VMX_VMCS_FIELD_WIDTH(field)) {
515 		case VMX_VMCS_FIELD_WIDTH_16:
516 			*(uint16_t *)ptr = (uint16_t)val64;
517 			break;
518 		case VMX_VMCS_FIELD_WIDTH_32:
519 			*(uint32_t *)ptr = (uint32_t)val64;
520 			break;
521 		case VMX_VMCS_FIELD_WIDTH_64:
522 			if (!!VMX_VMCS_FIELD_ACCESS_HIGH(field)) {
523 				*(uint32_t *)ptr = (uint32_t)val64;
524 			} else {
525 				*ptr = val64;
526 			}
527 			break;
528 		case VMX_VMCS_FIELD_WIDTH_NATURAL:
529 		default:
530 			*ptr = val64;
531 			break;
532 	}
533 }
534 
void nested_vmx_result(enum VMXResult result, int error_number)
536 {
537 	uint64_t rflags = exec_vmread(VMX_GUEST_RFLAGS);
538 
539 	/* ISDM: section 30.2 CONVENTIONS */
540 	rflags &= ~(RFLAGS_C | RFLAGS_P | RFLAGS_A | RFLAGS_Z | RFLAGS_S | RFLAGS_O);
541 
542 	if (result == VMfailValid) {
543 		rflags |= RFLAGS_Z;
544 		exec_vmwrite(VMX_INSTR_ERROR, error_number);
545 	} else if (result == VMfailInvalid) {
546 		rflags |= RFLAGS_C;
547 	} else {
548 		/* VMsucceed, do nothing */
549 	}
550 
551 	if (result != VMsucceed) {
552 		pr_err("VMX failed: %d/%d", result, error_number);
553 	}
554 
555 	exec_vmwrite(VMX_GUEST_RFLAGS, rflags);
556 }
557 
558 /**
559  * @brief get the memory-address operand of a vmx instruction
560  *
561  * @pre vcpu != NULL
562  */
static uint64_t get_vmx_memory_operand(struct acrn_vcpu *vcpu, uint32_t instr_info)
564 {
565 	uint64_t gva, gpa, seg_base = 0UL;
566 	uint32_t seg, err_code = 0U;
567 	uint64_t offset;
568 
569 	/*
570 	 * According to ISDM 3B: Basic VM-Exit Information: For INVEPT, INVPCID, INVVPID, LGDT,
571 	 * LIDT, LLDT, LTR, SGDT, SIDT, SLDT, STR, VMCLEAR, VMPTRLD, VMPTRST, VMREAD, VMWRITE,
572 	 * VMXON, XRSTORS, and XSAVES, the exit qualification receives the value of the instruction’s
573 	 * displacement field, which is sign-extended to 64 bits.
574 	 */
575 	offset = vcpu->arch.exit_qualification;
576 
	/* TODO: should we consider the cases where the address size (bits 9:7 in instr_info) is 16 or 32 bits? */
578 
579 	/*
580 	 * refer to ISDM Vol.1-3-24 Operand addressing on how to calculate an effective address
581 	 * offset = base + [index * scale] + displacement
582 	 * address = segment_base + offset
583 	 */
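	/*
	 * For instance, for a hypothetical "vmptrld 0x10(%rax,%rbx,8)" the displacement
	 * 0x10 arrives via the exit qualification, while RAX (base) and RBX (index,
	 * scaled by 8) are identified by instr_info and read from the guest GPRs below.
	 */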
584 	if (VMX_II_BASE_REG_VALID(instr_info)) {
585 		offset += vcpu_get_gpreg(vcpu, VMX_II_BASE_REG(instr_info));
586 	}
587 
588 	if (VMX_II_IDX_REG_VALID(instr_info)) {
589 		uint64_t val64 = vcpu_get_gpreg(vcpu, VMX_II_IDX_REG(instr_info));
590 		offset += (val64 << VMX_II_SCALING(instr_info));
591 	}
592 
593 	/*
594 	 * In 64-bit mode, the processor treats the segment base of CS, DS, ES, SS as zero,
595 	 * creating a linear address that is equal to the effective address.
596 	 * The exceptions are the FS and GS segments, whose segment registers can be used as
597 	 * additional base registers in some linear address calculations.
598 	 */
599 	seg = VMX_II_SEG_REG(instr_info);
600 	if (seg == 4U) {
601 		seg_base = exec_vmread(VMX_GUEST_FS_BASE);
602 	}
603 
604 	if (seg == 5U) {
605 		seg_base = exec_vmread(VMX_GUEST_GS_BASE);
606 	}
607 
608 	gva = seg_base + offset;
609 	(void)gva2gpa(vcpu, gva, &gpa, &err_code);
610 
611 	return gpa;
612 }
613 
614 /*
615  * @pre vcpu != NULL
616  */
static uint64_t get_vmptr_gpa(struct acrn_vcpu *vcpu)
618 {
619 	uint64_t gpa, vmptr;
620 
621 	/* get VMX pointer, which points to the VMCS or VMXON region GPA */
622 	gpa = get_vmx_memory_operand(vcpu, exec_vmread(VMX_INSTR_INFO));
623 
624 	/* get the address (GPA) of the VMCS for VMPTRLD/VMCLEAR, or VMXON region for VMXON */
625 	(void)copy_from_gpa(vcpu->vm, (void *)&vmptr, gpa, sizeof(uint64_t));
626 
627 	return vmptr;
628 }
629 
static bool validate_vmptr_gpa(uint64_t vmptr_gpa)
631 {
632 	/* We don't emulate CPUID.80000008H for guests, so check with physical address width */
633 	struct cpuinfo_x86 *cpu_info = get_pcpu_info();
634 
635 	return (mem_aligned_check(vmptr_gpa, PAGE_SIZE) && ((vmptr_gpa >> cpu_info->phys_bits) == 0UL));
636 }
637 
638 /**
639  * @pre vm != NULL
640  */
static bool validate_vmcs_revision_id(struct acrn_vcpu *vcpu, uint64_t vmptr_gpa)
642 {
643 	uint32_t revision_id;
644 
645 	(void)copy_from_gpa(vcpu->vm, (void *)&revision_id, vmptr_gpa, sizeof(uint32_t));
646 
647 	/*
	 * The VMCS revision ID must equal what is reported by the emulated IA32_VMX_BASIC MSR.
	 * The most-significant set bit of VMCS12_REVISION_ID is always below bit 31, so the
	 * following comparison implicitly validates revision_id[31] as well.
651 	 */
652 	return (revision_id == VMCS12_REVISION_ID);
653 }
654 
int32_t get_guest_cpl(void)
656 {
657 	/*
658 	 * We get CPL from SS.DPL because:
659 	 *
	 * CS.DPL may not equal the CPL in the case of conforming code segments. ISDM 5.5 PRIVILEGE LEVELS:
661 	 * Conforming code segments can be accessed from any privilege level that is equal to or
662 	 * numerically greater (less privileged) than the DPL of the conforming code segment.
663 	 *
664 	 * ISDM 24.4.1 Guest Register State: The value of the DPL field for SS is always
665 	 * equal to the logical processor’s current privilege level (CPL).
666 	 */
667 	uint32_t ar = exec_vmread32(VMX_GUEST_SS_ATTR);
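	/* DPL is bits 6:5 of the segment access-rights field */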
668 	return ((ar >> 5) & 3);
669 }
670 
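/*
 * Illustrative example with hypothetical values: with CR0_FIXED0 = 0x80000021 and
 * CR0_FIXED1 = 0xFFFFFFFF, CR0.PE (bit 0), CR0.NE (bit 5) and CR0.PG (bit 31) must
 * be 1 in VMX operation, while every other low bit is unconstrained.
 */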
static bool validate_nvmx_cr0_cr4(uint64_t cr0_4, uint64_t fixed0, uint64_t fixed1)
672 {
673 	bool valid = true;
674 
675 	/* If bit X is 1 in IA32_VMX_CR0/4_FIXED0, then that bit of CR0/4 is fixed to 1 in VMX operation */
676 	if ((cr0_4 & fixed0) != fixed0) {
677 		valid = false;
678 	}
679 
680 	/* if bit X is 0 in IA32_VMX_CR0/4_FIXED1, then that bit of CR0/4 is fixed to 0 in VMX operation */
681 	/* Bits 63:32 of CR0 and CR4 are reserved and must be written with zeros */
682 	if ((uint32_t)(~cr0_4 & ~fixed1) != (uint32_t)~fixed1) {
683 		valid = false;
684 	}
685 
686 	return valid;
687 }
688 
689 /*
690  * @pre vcpu != NULL
691  */
static bool validate_nvmx_cr0(struct acrn_vcpu *vcpu)
693 {
694 	return validate_nvmx_cr0_cr4(vcpu_get_cr0(vcpu), msr_read(MSR_IA32_VMX_CR0_FIXED0),
695 		msr_read(MSR_IA32_VMX_CR0_FIXED1));
696 }
697 
698 /*
699  * @pre vcpu != NULL
700  */
static bool validate_nvmx_cr4(struct acrn_vcpu *vcpu)
702 {
703 	return validate_nvmx_cr0_cr4(vcpu_get_cr4(vcpu), msr_read(MSR_IA32_VMX_CR4_FIXED0),
704 		msr_read(MSR_IA32_VMX_CR4_FIXED1));
705 }
706 
707 /*
708  * @pre vcpu != NULL
709  */
static void reset_vvmcs(struct acrn_vcpu *vcpu)
711 {
712 	struct acrn_vvmcs *vvmcs;
713 	uint32_t idx;
714 
715 	vcpu->arch.nested.current_vvmcs = NULL;
716 
717 	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
718 		vvmcs = &vcpu->arch.nested.vvmcs[idx];
719 		vvmcs->host_state_dirty = false;
720 		vvmcs->control_fields_dirty = false;
721 		vvmcs->vmcs12_gpa = INVALID_GPA;
722 		vvmcs->ref_cnt = 0;
723 
724 		(void)memset(vvmcs->vmcs02, 0U, PAGE_SIZE);
725 		(void)memset(&vvmcs->vmcs12, 0U, sizeof(struct acrn_vmcs12));
726 	}
727 }
728 
729 /*
730  * @pre vcpu != NULL
731  */
int32_t vmxon_vmexit_handler(struct acrn_vcpu *vcpu)
733 {
734 	const uint64_t features = MSR_IA32_FEATURE_CONTROL_LOCK | MSR_IA32_FEATURE_CONTROL_VMX_NO_SMX;
735 	uint32_t ar = exec_vmread32(VMX_GUEST_CS_ATTR);
736 
737 	if (is_nvmx_configured(vcpu->vm)) {
738 		if (((vcpu_get_cr0(vcpu) & CR0_PE) == 0UL)
739 			|| ((vcpu_get_cr4(vcpu) & CR4_VMXE) == 0UL)
740 			|| ((vcpu_get_rflags(vcpu) & RFLAGS_VM) != 0U)) {
741 			vcpu_inject_ud(vcpu);
742 		} else if (((vcpu_get_efer(vcpu) & MSR_IA32_EFER_LMA_BIT) == 0U)
743 			|| ((ar & (1U << 13U)) == 0U)) {
			/* Current ACRN doesn't support a 32-bit L1 hypervisor */
745 			vcpu_inject_ud(vcpu);
746 		} else if ((get_guest_cpl() != 0)
747 			|| !validate_nvmx_cr0(vcpu)
748 			|| !validate_nvmx_cr4(vcpu)
749 			|| ((vcpu_get_guest_msr(vcpu, MSR_IA32_FEATURE_CONTROL) & features) != features)) {
750 			vcpu_inject_gp(vcpu, 0U);
751 		} else if (vcpu->arch.nested.vmxon == true) {
752 			nested_vmx_result(VMfailValid, VMXERR_VMXON_IN_VMX_ROOT_OPERATION);
753 		} else {
754 			uint64_t vmptr_gpa = get_vmptr_gpa(vcpu);
755 
756 			if (!validate_vmptr_gpa(vmptr_gpa)) {
757 				nested_vmx_result(VMfailInvalid, 0);
758 			} else if (!validate_vmcs_revision_id(vcpu, vmptr_gpa)) {
759 				nested_vmx_result(VMfailInvalid, 0);
760 			} else {
761 				vcpu->arch.nested.vmxon = true;
762 				vcpu->arch.nested.in_l2_guest = false;
763 				vcpu->arch.nested.vmxon_ptr = vmptr_gpa;
764 
765 				reset_vvmcs(vcpu);
766 				nested_vmx_result(VMsucceed, 0);
767 			}
768 		}
769 	} else {
770 		vcpu_inject_ud(vcpu);
771 	}
772 
773 	return 0;
774 }
775 
776 /*
777  * @pre vcpu != NULL
778  */
bool check_vmx_permission(struct acrn_vcpu *vcpu)
780 {
781 	bool permit = true;
782 
783 	/* If this VM is not nVMX enabled, it implies that 'vmxon == false' */
784 	if ((vcpu->arch.nested.vmxon == false)
785 		|| ((vcpu_get_cr0(vcpu) & CR0_PE) == 0UL)
786 		|| ((vcpu_get_rflags(vcpu) & RFLAGS_VM) != 0U)) {
787 		/* We rely on hardware to check "IA32_EFER.LMA = 1 and CS.L = 0" */
788 		vcpu_inject_ud(vcpu);
789 		permit = false;
790 	} else if (get_guest_cpl() != 0) {
791 		vcpu_inject_gp(vcpu, 0U);
792 		permit = false;
793 	}
794 
795 	return permit;
796 }
797 
798 /*
799  * @pre vcpu != NULL
800  * @pre vcpu->vm != NULL
801  */
int32_t vmxoff_vmexit_handler(struct acrn_vcpu *vcpu)
803 {
804 	if (check_vmx_permission(vcpu)) {
805 		disable_vmcs_shadowing();
806 
807 		vcpu->arch.nested.vmxon = false;
808 		vcpu->arch.nested.in_l2_guest = false;
809 
810 		reset_vvmcs(vcpu);
811 		nested_vmx_result(VMsucceed, 0);
812 	}
813 
814 	return 0;
815 }
816 
817 /*
818  * Only VMCS fields of width 64-bit, 32-bit, and natural-width can be
819  * read-only. A value of 1 in bits [11:10] of these field encodings
820  * indicates a read-only field. ISDM Appendix B.
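 * For example, VMX_EXIT_REASON (encoding 4402H) is a 32-bit field with type 1,
 * so it is treated as read-only here.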
821  */
static inline bool is_ro_vmcs_field(uint32_t field)
823 {
824 	const uint8_t w = VMX_VMCS_FIELD_WIDTH(field);
825 	return (VMX_VMCS_FIELD_WIDTH_16 != w) && (VMX_VMCS_FIELD_TYPE(field) == 1U);
826 }
827 
828 /*
829  * @pre vcpu != NULL
830  */
static struct acrn_vvmcs *lookup_vvmcs(struct acrn_vcpu *vcpu, uint64_t vmcs12_gpa)
832 {
833 	struct acrn_vvmcs *vvmcs = NULL;
834 	uint32_t idx;
835 
836 	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
837 		if (vcpu->arch.nested.vvmcs[idx].vmcs12_gpa == vmcs12_gpa) {
838 			vvmcs = &vcpu->arch.nested.vvmcs[idx];
839 			break;
840 		}
841 	}
842 
843 	return vvmcs;
844 }
845 
846 /*
847  * @pre vcpu != NULL
848  */
static struct acrn_vvmcs *get_or_replace_vvmcs_entry(struct acrn_vcpu *vcpu)
850 {
851 	struct acrn_nested *nested = &vcpu->arch.nested;
852 	struct acrn_vvmcs *vvmcs = NULL;
853 	uint32_t idx, min_cnt = ~0U;
854 
855 	/* look for an inactive entry first */
856 	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
857 		if (nested->vvmcs[idx].vmcs12_gpa == INVALID_GPA) {
858 			/* found an inactive vvmcs[] entry. */
859 			vvmcs = &nested->vvmcs[idx];
860 			break;
861 		}
862 	}
863 
864 	/* In case we have to release an active entry to make room for the new VMCS12 */
865 	if (vvmcs == NULL) {
866 		for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
			/* look for the entry with the lowest reference count */
868 			if (nested->vvmcs[idx].ref_cnt < min_cnt) {
869 				min_cnt = nested->vvmcs[idx].ref_cnt;
870 				vvmcs = &nested->vvmcs[idx];
871 			}
872 		}
873 
874 		clear_vvmcs(vcpu, vvmcs);
875 	}
876 
877 	/* reset ref_cnt for all entries */
878 	for (idx = 0U; idx < MAX_ACTIVE_VVMCS_NUM; idx++) {
879 		nested->vvmcs[idx].ref_cnt = 0U;
880 	}
881 
882 	return vvmcs;
883 }
884 
885 /*
886  * @brief emulate VMREAD instruction from L1
887  * @pre vcpu != NULL
888  */
int32_t vmread_vmexit_handler(struct acrn_vcpu *vcpu)
890 {
891 	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
892 	const uint32_t info = exec_vmread(VMX_INSTR_INFO);
893 	uint64_t vmcs_value, gpa;
894 	uint32_t vmcs_field;
895 
896 	if (check_vmx_permission(vcpu)) {
897 		if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
898 			nested_vmx_result(VMfailInvalid, 0);
899 		} else {
900 			/* TODO: VMfailValid for invalid VMCS fields */
901 			vmcs_field = (uint32_t)vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
902 			vmcs_value = vmcs12_read_field(&cur_vvmcs->vmcs12, vmcs_field);
903 
			/* Currently ACRN doesn't support a 32-bit L1 hypervisor, so operands are assumed to be 64 bits */
905 			if (VMX_II_IS_REG(info)) {
906 				vcpu_set_gpreg(vcpu, VMX_II_REG1(info), vmcs_value);
907 			} else {
908 				gpa = get_vmx_memory_operand(vcpu, info);
909 				(void)copy_to_gpa(vcpu->vm, &vmcs_value, gpa, 8U);
910 			}
911 
912 			pr_dbg("vmcs_field: %x vmcs_value: %llx", vmcs_field, vmcs_value);
913 			nested_vmx_result(VMsucceed, 0);
914 		}
915 	}
916 
917 	return 0;
918 }
919 
920 /*
921  * @brief emulate VMWRITE instruction from L1
922  * @pre vcpu != NULL
923  */
int32_t vmwrite_vmexit_handler(struct acrn_vcpu *vcpu)
925 {
926 	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
927 	const uint32_t info = exec_vmread(VMX_INSTR_INFO);
928 	uint64_t vmcs_value, gpa;
929 	uint32_t vmcs_field;
930 
931 	if (check_vmx_permission(vcpu)) {
932 		if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
933 			nested_vmx_result(VMfailInvalid, 0);
934 		} else {
935 			/* TODO: VMfailValid for invalid VMCS fields */
936 			vmcs_field = (uint32_t)vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
937 
938 			if (is_ro_vmcs_field(vmcs_field) &&
939 				((vcpu_get_guest_msr(vcpu, MSR_IA32_VMX_MISC) & (1UL << 29U)) == 0UL)) {
940 				nested_vmx_result(VMfailValid, VMXERR_VMWRITE_RO_COMPONENT);
941 			} else {
				/* Currently we don't support a 32-bit L1 hypervisor, so operands are assumed to be 64 bits */
943 				if (VMX_II_IS_REG(info)) {
944 					vmcs_value = vcpu_get_gpreg(vcpu, VMX_II_REG1(info));
945 				} else {
946 					gpa = get_vmx_memory_operand(vcpu, info);
947 					(void)copy_from_gpa(vcpu->vm, &vmcs_value, gpa, 8U);
948 				}
949 
950 				if (VMX_VMCS_FIELD_TYPE(vmcs_field) == VMX_VMCS_FIELD_TYPE_HOST) {
951 					cur_vvmcs->host_state_dirty = true;
952 				}
953 
954 				if ((vmcs_field == VMX_MSR_BITMAP_FULL)
955 					|| (vmcs_field == VMX_EPT_POINTER_FULL)
956 					|| (vmcs_field == VMX_VPID)
957 					|| (vmcs_field == VMX_ENTRY_CONTROLS)
958 					|| (vmcs_field == VMX_EXIT_CONTROLS)) {
959 					cur_vvmcs->control_fields_dirty = true;
960 
961 					if (vmcs_field == VMX_EPT_POINTER_FULL) {
962 						if (cur_vvmcs->vmcs12.ept_pointer != vmcs_value) {
963 							put_vept_desc(cur_vvmcs->vmcs12.ept_pointer);
964 							get_vept_desc(vmcs_value);
965 						}
966 					}
967 				}
968 
969 				pr_dbg("vmcs_field: %x vmcs_value: %llx", vmcs_field, vmcs_value);
970 				vmcs12_write_field(&cur_vvmcs->vmcs12, vmcs_field, vmcs_value);
971 				nested_vmx_result(VMsucceed, 0);
972 			}
973 		}
974 	}
975 
976 	return 0;
977 }
978 
979 /**
 * @brief Sync shadow fields from vmcs02 to the cached VMCS12
981  *
982  * @pre vcpu != NULL
983  * @pre vmcs02 is current
984  */
static void sync_vmcs02_to_vmcs12(struct acrn_vmcs12 *vmcs12)
986 {
987 	uint64_t val64;
988 	uint32_t idx;
989 
990 	for (idx = 0; idx < MAX_SHADOW_VMCS_FIELDS; idx++) {
991 		val64 = exec_vmread(vmcs_shadowing_fields[idx]);
992 		vmcs12_write_field(vmcs12, vmcs_shadowing_fields[idx], val64);
993 	}
994 }
995 
996 /*
997  * @pre vcpu != NULL
998  * @pre VMCS02 (as an ordinary VMCS) is current
999  */
static void merge_and_sync_control_fields(struct acrn_vcpu *vcpu, struct acrn_vmcs12 *vmcs12)
1001 {
1002 	uint64_t value64;
1003 
	/* Sync VMCS fields that are not shadowed. These fields don't need to be synced back to VMCS12. */
1005 
1006 	exec_vmwrite(VMX_MSR_BITMAP_FULL, gpa2hpa(vcpu->vm, vmcs12->msr_bitmap));
1007 	exec_vmwrite(VMX_EPT_POINTER_FULL, get_shadow_eptp(vmcs12->ept_pointer));
1008 
1009 	/* For VM-execution, entry and exit controls */
1010 	value64 = vmcs12->vm_entry_controls;
1011 	if ((value64 & VMX_ENTRY_CTLS_LOAD_EFER) != VMX_ENTRY_CTLS_LOAD_EFER) {
1012 		/*
		 * The L1 hypervisor wishes to use its own IA32_EFER for the L2 guest, so we turn
		 * on VMX_ENTRY_CTLS_LOAD_EFER in VMCS02 and load L1's current EFER for the guest.
1015 		 */
1016 		value64 |= VMX_ENTRY_CTLS_LOAD_EFER;
1017 		exec_vmwrite(VMX_GUEST_IA32_EFER_FULL, vcpu_get_efer(vcpu));
1018 	}
1019 
1020 	exec_vmwrite(VMX_ENTRY_CONTROLS, value64);
1021 
	/* The host is always running in 64-bit mode */
1023 	value64 = vmcs12->vm_exit_controls | VMX_EXIT_CTLS_HOST_ADDR64;
1024 	exec_vmwrite(VMX_EXIT_CONTROLS, value64);
1025 
1026 	exec_vmwrite(VMX_VPID, vmcs12->vpid);
1027 }
1028 
1029 /**
1030  * @brief Sync shadow fields from vmcs12 to vmcs02
1031  *
1032  * @pre vcpu != NULL
1033  * @pre vmcs02 is current
1034  */
static void sync_vmcs12_to_vmcs02(struct acrn_vcpu *vcpu, struct acrn_vmcs12 *vmcs12)
1036 {
1037 	uint64_t val64;
1038 	uint32_t idx;
1039 
1040 	for (idx = 0; idx < MAX_SHADOW_VMCS_FIELDS; idx++) {
1041 		val64 = vmcs12_read_field(vmcs12, vmcs_shadowing_fields[idx]);
1042 		exec_vmwrite(vmcs_shadowing_fields[idx], val64);
1043 	}
1044 
1045 	merge_and_sync_control_fields(vcpu, vmcs12);
1046 }
1047 
1048 /*
1049  * @pre vcpu != NULL
1050  */
static void set_vmcs02_shadow_indicator(struct acrn_vvmcs *vvmcs)
1052 {
	/* vmcs02 is a shadow VMCS */
1054 	*((uint32_t*)vvmcs->vmcs02) |= VMCS_SHADOW_BIT_INDICATOR;
1055 }
1056 
1057 /*
1058  * @pre vcpu != NULL
1059  * @pre vmcs01 is current
1060  */
static void clear_vmcs02_shadow_indicator(struct acrn_vvmcs *vvmcs)
1062 {
	/* vmcs02 is an ordinary VMCS */
1064 	*((uint32_t*)vvmcs->vmcs02) &= ~VMCS_SHADOW_BIT_INDICATOR;
1065 }
1066 
1067 /*
1068  * @pre vcpu != NULL
1069  * @pre vmcs01 is current
1070  */
static void enable_vmcs_shadowing(struct acrn_vvmcs *vvmcs)
1072 {
1073 	uint32_t val32;
1074 
1075 	/*
1076 	 * This method of using the same bitmap for VMRead and VMWrite is not typical.
1077 	 * Here we assume L1 hypervisor will not erroneously write to Read-Only fields.
1078 	 * TODO: may use different bitmap to exclude read-only fields from VMWRITE bitmap.
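	 *
	 * A clear bit in the bitmap lets the corresponding VMREAD/VMWRITE from L1 access
	 * the shadow VMCS (vmcs02) directly without a VM exit; a set bit forces a VM exit
	 * to ACRN (see setup_vmcs_shadowing_bitmap() above).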
1079 	 */
1080 	exec_vmwrite(VMX_VMREAD_BITMAP_FULL, hva2hpa(vmcs_shadowing_bitmap));
1081 	exec_vmwrite(VMX_VMWRITE_BITMAP_FULL, hva2hpa(vmcs_shadowing_bitmap));
1082 
1083 	/* Set VMCS shadowing bit in Secondary Proc Exec Controls */
1084 	val32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS2);
1085 	val32 |= VMX_PROCBASED_CTLS2_VMCS_SHADW;
1086 	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, val32);
1087 
1088 	/* Set VMCS Link pointer */
1089 	exec_vmwrite(VMX_VMS_LINK_PTR_FULL, hva2hpa(vvmcs->vmcs02));
1090 }
1091 
1092 /*
1093  * @pre vcpu != NULL
1094  * @pre vmcs01 is current
1095  */
static void disable_vmcs_shadowing(void)
1097 {
1098 	uint32_t val32;
1099 
1100 	/* clear VMCS shadowing bit in Secondary Proc Exec Controls */
1101 	val32 = exec_vmread(VMX_PROC_VM_EXEC_CONTROLS2);
1102 	val32 &= ~VMX_PROCBASED_CTLS2_VMCS_SHADW;
1103 	exec_vmwrite32(VMX_PROC_VM_EXEC_CONTROLS2, val32);
1104 
1105 	exec_vmwrite(VMX_VMS_LINK_PTR_FULL, ~0UL);
1106 }
1107 
1108 /*
1109  * @pre vcpu != NULL
1110  * @pre vmcs01 is current
1111  */
static void clear_vvmcs(struct acrn_vcpu *vcpu, struct acrn_vvmcs *vvmcs)
1113 {
1114 	/*
1115 	 * Now VMCS02 is active and being used as a shadow VMCS.
	 * Disable VMCS shadowing to avoid VMCS02 being loaded by VMPTRLD and
	 * referenced by VMCS01 as a shadow VMCS at the same time.
1118 	 */
1119 	disable_vmcs_shadowing();
1120 
1121 	/* Flush shadow VMCS to memory */
1122 	clear_va_vmcs(vvmcs->vmcs02);
1123 
1124 	/* VMPTRLD the shadow VMCS so that we are able to sync it to VMCS12 */
1125 	load_va_vmcs(vvmcs->vmcs02);
1126 
1127 	sync_vmcs02_to_vmcs12(&vvmcs->vmcs12);
1128 
1129 	/* flush cached VMCS12 back to L1 guest */
1130 	(void)copy_to_gpa(vcpu->vm, (void *)&vvmcs->vmcs12, vvmcs->vmcs12_gpa, sizeof(struct acrn_vmcs12));
1131 
1132 	/*
	 * The current VMCS12 has been flushed out, so the active VMCS02
	 * needs to be VMCLEARed as well.
1135 	 */
1136 	clear_va_vmcs(vvmcs->vmcs02);
1137 
1138 	/* This VMCS can no longer refer to any shadow EPT */
1139 	put_vept_desc(vvmcs->vmcs12.ept_pointer);
1140 
1141 	/* This vvmcs[] entry doesn't cache a VMCS12 any more */
1142 	vvmcs->vmcs12_gpa = INVALID_GPA;
1143 
1144 	/* Cleanup per VVMCS dirty flags */
1145 	vvmcs->host_state_dirty = false;
1146 	vvmcs->control_fields_dirty = false;
1147 }
1148 
1149 /*
1150  * @pre vcpu != NULL
1151  */
int32_t vmptrld_vmexit_handler(struct acrn_vcpu *vcpu)
1153 {
1154 	struct acrn_nested *nested = &vcpu->arch.nested;
1155 	struct acrn_vvmcs *vvmcs;
1156 	uint64_t vmcs12_gpa;
1157 
1158 	if (check_vmx_permission(vcpu)) {
1159 		vmcs12_gpa = get_vmptr_gpa(vcpu);
1160 
1161 		if (!validate_vmptr_gpa(vmcs12_gpa)) {
1162 			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS);
1163 		} else if (vmcs12_gpa == nested->vmxon_ptr) {
1164 			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_VMXON_POINTER);
1165 		} else if (!validate_vmcs_revision_id(vcpu, vmcs12_gpa)) {
1166 			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INCORRECT_VMCS_REVISION_ID);
1167 		} else if ((nested->current_vvmcs != NULL) && (nested->current_vvmcs->vmcs12_gpa == vmcs12_gpa)) {
1168 			/* VMPTRLD current VMCS12, do nothing */
1169 			nested_vmx_result(VMsucceed, 0);
1170 		} else {
1171 			vvmcs = lookup_vvmcs(vcpu, vmcs12_gpa);
1172 			if (vvmcs == NULL) {
1173 				vvmcs = get_or_replace_vvmcs_entry(vcpu);
1174 
1175 				/* Create the VMCS02 based on this new VMCS12 */
1176 
1177 				/*
1178 				 * initialize VMCS02
				 * The VMCS revision ID must equal what is reported by the IA32_VMX_BASIC MSR
1180 				 */
1181 				(void)memcpy_s(vvmcs->vmcs02, 4U, (void *)&vmx_basic, 4U);
1182 
1183 				/* VMPTRLD VMCS02 so that we can VMWRITE to it */
1184 				load_va_vmcs(vvmcs->vmcs02);
1185 				init_host_state();
1186 
1187 				/* Load VMCS12 from L1 guest memory */
1188 				(void)copy_from_gpa(vcpu->vm, (void *)&vvmcs->vmcs12, vmcs12_gpa,
1189 					sizeof(struct acrn_vmcs12));
1190 
1191 				/* if needed, create nept_desc and allocate shadow root for the EPTP */
1192 				get_vept_desc(vvmcs->vmcs12.ept_pointer);
1193 
1194 				/* Need to load shadow fields from this new VMCS12 to VMCS02 */
1195 				sync_vmcs12_to_vmcs02(vcpu, &vvmcs->vmcs12);
1196 			} else {
1197 				vvmcs->ref_cnt += 1U;
1198 			}
1199 
1200 			/* Before VMCS02 is being used as a shadow VMCS, VMCLEAR it */
1201 			clear_va_vmcs(vvmcs->vmcs02);
1202 
1203 			/*
1204 			 * Now VMCS02 is not active, set the shadow-VMCS indicator.
1205 			 * At L1 VM entry, VMCS02 will be referenced as a shadow VMCS.
1206 			 */
1207 			set_vmcs02_shadow_indicator(vvmcs);
1208 
1209 			/* Switch back to vmcs01 */
1210 			load_va_vmcs(vcpu->arch.vmcs);
1211 
1212 			/* VMCS02 is referenced by VMCS01 Link Pointer */
1213 			enable_vmcs_shadowing(vvmcs);
1214 
1215 			vvmcs->vmcs12_gpa = vmcs12_gpa;
1216 			nested->current_vvmcs = vvmcs;
1217 			nested_vmx_result(VMsucceed, 0);
1218 		}
1219 	}
1220 
1221 	return 0;
1222 }
1223 
1224 /*
1225  * @pre vcpu != NULL
1226  */
int32_t vmclear_vmexit_handler(struct acrn_vcpu *vcpu)
1228 {
1229 	struct acrn_nested *nested = &vcpu->arch.nested;
1230 	struct acrn_vvmcs *vvmcs;
1231 	uint64_t vmcs12_gpa;
1232 
1233 	if (check_vmx_permission(vcpu)) {
1234 		vmcs12_gpa = get_vmptr_gpa(vcpu);
1235 
1236 		if (!validate_vmptr_gpa(vmcs12_gpa)) {
1237 			nested_vmx_result(VMfailValid, VMXERR_VMPTRLD_INVALID_ADDRESS);
1238 		} else if (vmcs12_gpa == nested->vmxon_ptr) {
1239 			nested_vmx_result(VMfailValid, VMXERR_VMCLEAR_VMXON_POINTER);
1240 		} else {
1241 			vvmcs = lookup_vvmcs(vcpu, vmcs12_gpa);
1242 			if (vvmcs != NULL) {
1243 				uint64_t current_vmcs12_gpa = INVALID_GPA;
1244 
1245 				/* Save for comparison */
1246 				if (nested->current_vvmcs) {
1247 					current_vmcs12_gpa = nested->current_vvmcs->vmcs12_gpa;
1248 				}
1249 
1250 				/* VMCLEAR an active VMCS12, may or may not be current */
1251 				vvmcs->vmcs12.launch_state = VMCS12_LAUNCH_STATE_CLEAR;
1252 				clear_vvmcs(vcpu, vvmcs);
1253 
1254 				/* Switch back to vmcs01 (no VMCS shadowing) */
1255 				load_va_vmcs(vcpu->arch.vmcs);
1256 
1257 				if (current_vmcs12_gpa != INVALID_GPA) {
1258 					if (current_vmcs12_gpa == vmcs12_gpa) {
1259 						/* VMCLEAR current VMCS12 */
1260 						nested->current_vvmcs = NULL;
1261 					} else {
1262 						/*
1263 						 * VMCLEAR an active but not current VMCS12.
1264 						 * VMCS shadowing was cleared earlier in clear_vvmcs()
1265 						 */
1266 						enable_vmcs_shadowing(nested->current_vvmcs);
1267 					}
1268 				} else {
1269 					/* do nothing if there is no current VMCS12 */
1270 				}
1271 			} else {
1272 				 /*
				  * We need to update the VMCS12 launch state in L1 memory in two cases:
				  * - the L1 hypervisor VMCLEARs a VMCS12 that ACRN has already flushed back to L1 guest memory
				  * - the L1 hypervisor VMCLEARs a VMCS12 that was never VMPTRLDed
1276 				  */
1277 				uint32_t launch_state = VMCS12_LAUNCH_STATE_CLEAR;
1278 				(void)copy_to_gpa(vcpu->vm, &launch_state, vmcs12_gpa +
1279 					offsetof(struct acrn_vmcs12, launch_state), sizeof(launch_state));
1280 			}
1281 
1282 			nested_vmx_result(VMsucceed, 0);
1283 		}
1284 	}
1285 
1286 	return 0;
1287 }
1288 
1289 /*
1290  * @pre vcpu != NULL
1291  */
bool is_vcpu_in_l2_guest(struct acrn_vcpu *vcpu)
1293 {
1294 	return vcpu->arch.nested.in_l2_guest;
1295 }
1296 
1297 /*
1298  * @pre seg != NULL
1299  */
static void set_segment(struct segment_sel *seg, uint16_t sel, uint64_t b, uint32_t l, uint32_t a)
1301 {
1302 	seg->selector = sel;
1303 	seg->base = b;
1304 	seg->limit = l;
1305 	seg->attr = a;
1306 }
1307 
1308 /*
1309  * @pre vcpu != NULL
1310  * @pre vmcs01 is current
1311  */
static void set_vmcs01_guest_state(struct acrn_vcpu *vcpu)
1313 {
1314 	/*
	 * None of the host-state fields are shadowed, and all VMWRITEs to these
	 * fields are saved in vmcs12.
	 *
	 * Load the host state from vmcs12 into the vmcs01 guest state before entering
	 * L1, to emulate a VM exit from L2 to L1.
	 *
	 * We assume the L1 hypervisor only changes these host-state fields at run time.
1322 	 *
1323 	 * Section 27.5 Loading Host State
1324 	 * 1. Load Control Registers, Debug Registers, MSRs
1325 	 * 2. Load RSP/RIP/RFLAGS
1326 	 * 3. Load Segmentation State
1327 	 * 4. Non-Register state
1328 	 */
1329 	struct acrn_vmcs12 *vmcs12 = &vcpu->arch.nested.current_vvmcs->vmcs12;
1330 	struct segment_sel seg;
1331 
1332 	if (vcpu->arch.nested.current_vvmcs->host_state_dirty == true) {
1333 		vcpu->arch.nested.current_vvmcs->host_state_dirty = false;
1334 
1335 		/*
		 * We want vcpu_get_cr0/4() to return the up-to-date values, but we don't
		 * want to call vcpu_set_cr0/4() to handle the CR0/4 write.
1338 		 */
1339 		exec_vmwrite(VMX_GUEST_CR0, vmcs12->host_cr0);
1340 		exec_vmwrite(VMX_GUEST_CR4, vmcs12->host_cr4);
1341 		bitmap_clear_nolock(CPU_REG_CR0, &vcpu->reg_cached);
1342 		bitmap_clear_nolock(CPU_REG_CR4, &vcpu->reg_cached);
1343 
1344 		exec_vmwrite(VMX_GUEST_CR3, vmcs12->host_cr3);
1345 		exec_vmwrite(VMX_GUEST_DR7, DR7_INIT_VALUE);
1346 		exec_vmwrite64(VMX_GUEST_IA32_DEBUGCTL_FULL, 0UL);
1347 		exec_vmwrite32(VMX_GUEST_IA32_SYSENTER_CS, vmcs12->host_ia32_sysenter_cs);
1348 		exec_vmwrite(VMX_GUEST_IA32_SYSENTER_ESP, vmcs12->host_ia32_sysenter_esp);
1349 		exec_vmwrite(VMX_GUEST_IA32_SYSENTER_EIP, vmcs12->host_ia32_sysenter_eip);
1350 
1351 		exec_vmwrite(VMX_GUEST_IA32_EFER_FULL, vmcs12->host_ia32_efer);
1352 
1353 		/*
1354 		 * type: 11 (Execute/Read, accessed)
1355 		 * l: 64-bit mode active
1356 		 */
1357 		set_segment(&seg, vmcs12->host_cs, 0UL, 0xFFFFFFFFU, 0xa09bU);
1358 		load_segment(seg, VMX_GUEST_CS);
1359 
1360 		/*
1361 		 * type: 3 (Read/Write, accessed)
1362 		 * D/B: 1 (32-bit segment)
1363 		 */
1364 		set_segment(&seg, vmcs12->host_ds, 0UL, 0xFFFFFFFFU, 0xc093);
1365 		load_segment(seg, VMX_GUEST_DS);
1366 
1367 		seg.selector = vmcs12->host_ss;
1368 		load_segment(seg, VMX_GUEST_SS);
1369 
1370 		seg.selector = vmcs12->host_es;
1371 		load_segment(seg, VMX_GUEST_ES);
1372 
1373 		seg.selector = vmcs12->host_fs;
1374 		seg.base = vmcs12->host_fs_base;
1375 		load_segment(seg, VMX_GUEST_FS);
1376 
1377 		seg.selector = vmcs12->host_gs;
1378 		seg.base = vmcs12->host_gs_base;
1379 		load_segment(seg, VMX_GUEST_GS);
1380 
1381 		/*
1382 		 * ISDM 27.5.2: segment limit for TR is set to 67H
1383 		 * Type set to 11 and S set to 0 (busy 32-bit task-state segment).
1384 		 */
1385 		set_segment(&seg, vmcs12->host_tr, vmcs12->host_tr_base, 0x67U, TR_AR);
1386 		load_segment(seg, VMX_GUEST_TR);
1387 
1388 		/*
1389 		 * ISDM 27.5.2: LDTR is established as follows on all VM exits:
1390 		 * the selector is cleared to 0000H, the segment is marked unusable
1391 		 * and is otherwise undefined (although the base address is always canonical).
1392 		 */
1393 		exec_vmwrite16(VMX_GUEST_LDTR_SEL, 0U);
1394 		exec_vmwrite32(VMX_GUEST_LDTR_ATTR, 0x10000U);
1395 	}
1396 
1397 	/*
	 * For those registers that are managed by the vcpu->reg_updated flag, we
	 * need to write them with vcpu_set_xxx() so that vcpu_get_xxx() returns the
	 * correct values.
1401 	 */
1402 	vcpu_set_rip(vcpu, vmcs12->host_rip);
1403 	vcpu_set_rsp(vcpu, vmcs12->host_rsp);
1404 	vcpu_set_rflags(vcpu, 0x2U);
1405 }
1406 
1407 /**
1408  * @pre vcpu != NULL
1409  */
static void sanitize_l2_vpid(struct acrn_vmcs12 *vmcs12)
1411 {
	/* Flush the VPID if the L2 VPID could conflict with any L1 VPID */
1413 	if (vmcs12->vpid >= ALLOCATED_MIN_L1_VPID) {
1414 		flush_vpid_single(vmcs12->vpid);
1415 	}
1416 }
1417 
1418 /**
1419  * @brief handler for all VMEXITs from nested guests
1420  *
1421  * @pre vcpu != NULL
1422  * @pre VMCS02 (as an ordinary VMCS) is current
1423  */
int32_t nested_vmexit_handler(struct acrn_vcpu *vcpu)
1425 {
1426 	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
1427 	bool is_l1_vmexit = true;
1428 
1429 	if ((vcpu->arch.exit_reason & 0xFFFFU) == VMX_EXIT_REASON_EPT_VIOLATION) {
1430 		is_l1_vmexit = handle_l2_ept_violation(vcpu);
1431 	}
1432 
1433 	if (is_l1_vmexit) {
1434 		sanitize_l2_vpid(&cur_vvmcs->vmcs12);
1435 
1436 		/*
1437 		 * Clear VMCS02 because: ISDM: Before modifying the shadow-VMCS indicator,
1438 		 * software should execute VMCLEAR for the VMCS to ensure that it is not active.
1439 		 */
1440 		clear_va_vmcs(cur_vvmcs->vmcs02);
1441 		set_vmcs02_shadow_indicator(cur_vvmcs);
1442 
1443 		/* Switch to VMCS01, and VMCS02 is referenced as a shadow VMCS */
1444 		load_va_vmcs(vcpu->arch.vmcs);
1445 
1446 		/* Load host state from VMCS12 host area to Guest state of VMCS01 */
1447 		set_vmcs01_guest_state(vcpu);
1448 
1449 		/* vCPU is NOT in guest mode from this point */
1450 		vcpu->arch.nested.in_l2_guest = false;
1451 	}
1452 
1453 	/*
	 * For VM exits that are reflected to the L1 hypervisor, ACRN must not advance the
	 * guest RIP; that decision is up to the L1 hypervisor.
	 *
	 * The only case that doesn't need to be reflected is an EPT violation that can be
	 * completely handled by ACRN, which requires the L2 VM to re-execute the instruction
	 * after the shadow EPT has been properly set up.
	 *
	 * In either case, vcpu->arch.inst_len needs to be set to zero.
1462 	 */
1463 	vcpu_retain_rip(vcpu);
1464 	return 0;
1465 }
1466 
1467 /*
1468  * @pre vcpu != NULL
1469  * @pre VMCS01 is current and VMCS02 is referenced by VMCS Link Pointer
1470  */
static void nested_vmentry(struct acrn_vcpu *vcpu, bool is_launch)
1472 {
1473 	struct acrn_vvmcs *cur_vvmcs = vcpu->arch.nested.current_vvmcs;
1474 	struct acrn_vmcs12 *vmcs12 = &cur_vvmcs->vmcs12;
1475 
1476 	if ((cur_vvmcs == NULL) || (cur_vvmcs->vmcs12_gpa == INVALID_GPA)) {
1477 		nested_vmx_result(VMfailInvalid, 0);
1478 	} else if (is_launch && (vmcs12->launch_state != VMCS12_LAUNCH_STATE_CLEAR)) {
1479 		nested_vmx_result(VMfailValid, VMXERR_VMLAUNCH_NONCLEAR_VMCS);
1480 	} else if (!is_launch && (vmcs12->launch_state != VMCS12_LAUNCH_STATE_LAUNCHED)) {
1481 		nested_vmx_result(VMfailValid, VMXERR_VMRESUME_NONLAUNCHED_VMCS);
1482 	} else {
1483 		/*
1484 		 * TODO: Need to do VM-Entry checks before L2 VM entry.
1485 		 * Refer to ISDM Vol3 VMX Instructions reference.
1486 		 */
1487 
1488 		/*
1489 		 * Convert the shadow VMCS to an ordinary VMCS.
1490 		 * ISDM: Software should not modify the shadow-VMCS indicator in
1491 		 * the VMCS region of a VMCS that is active
1492 		 */
1493 		clear_va_vmcs(cur_vvmcs->vmcs02);
1494 		clear_vmcs02_shadow_indicator(cur_vvmcs);
1495 
		/* As an ordinary VMCS, VMCS02 is active and current when the L2 guest is running */
1497 		load_va_vmcs(cur_vvmcs->vmcs02);
1498 
1499 		if (cur_vvmcs->control_fields_dirty) {
1500 			cur_vvmcs->control_fields_dirty = false;
1501 			merge_and_sync_control_fields(vcpu, vmcs12);
1502 		}
1503 
1504 		/* vCPU is in guest mode from this point */
1505 		vcpu->arch.nested.in_l2_guest = true;
1506 
1507 		if (is_launch) {
1508 			vmcs12->launch_state = VMCS12_LAUNCH_STATE_LAUNCHED;
1509 		}
1510 
1511 		sanitize_l2_vpid(vmcs12);
1512 
1513 		/*
1514 		 * set vcpu->launched to false because the launch state of VMCS02 is
1515 		 * clear at this moment, even for VMRESUME
1516 		 */
1517 		vcpu->launched = false;
1518 	}
1519 }
1520 
1521 /*
1522  * @pre vcpu != NULL
1523  */
int32_t vmresume_vmexit_handler(struct acrn_vcpu *vcpu)
1525 {
1526 	if (check_vmx_permission(vcpu)) {
1527 		nested_vmentry(vcpu, false);
1528 	}
1529 
1530 	return 0;
1531 }
1532 
1533 /*
1534  * @pre vcpu != NULL
1535  */
int32_t vmlaunch_vmexit_handler(struct acrn_vcpu *vcpu)
1537 {
1538 	if (check_vmx_permission(vcpu)) {
1539 		nested_vmentry(vcpu, true);
1540 	}
1541 
1542 	return 0;
1543 }
1544 
1545 /*
1546  * @pre vcpu != NULL
1547  * @pre desc != NULL
1548  */
int64_t get_invvpid_ept_operands(struct acrn_vcpu *vcpu, void *desc, size_t size)
1550 {
1551 	const uint32_t info = exec_vmread(VMX_INSTR_INFO);
1552 	uint64_t gpa;
1553 
1554 	gpa = get_vmx_memory_operand(vcpu, info);
1555 	(void)copy_from_gpa(vcpu->vm, desc, gpa, size);
1556 
1557 	return vcpu_get_gpreg(vcpu, VMX_II_REG2(info));
1558 }
1559 
1560 /*
1561  * @pre vcpu != NULL
1562  */
static bool validate_canonical_addr(struct acrn_vcpu *vcpu, uint64_t va)
1564 {
1565 	uint32_t addr_width = 48U; /* linear address width */
1566 	uint64_t msb_mask;
1567 
1568 	if (vcpu_get_cr4(vcpu) & CR4_LA57) {
1569 		addr_width = 57U;
1570 	}
1571 
1572 	/*
1573 	 * In 64-bit mode, an address is considered to be in canonical form if address
	 * bits 63 through the most-significant bit implemented by the microarchitecture
1575 	 * are set to either all ones or all zeros.
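	 *
	 * For example, with a 48-bit linear-address width msb_mask is 0xFFFF000000000000,
	 * so 0xFFFF800000000000 and 0x00007FFFFFFFFFFF pass this check while
	 * 0x0001000000000000 does not.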
1576 	 */
1577 
1578 	msb_mask = ~((1UL << addr_width) - 1UL);
1579 	return ((msb_mask & va) == 0UL) || ((msb_mask & va) == msb_mask);
1580 }
1581 
1582 /*
1583  * @pre vcpu != NULL
1584  */
int32_t invvpid_vmexit_handler(struct acrn_vcpu *vcpu)
1586 {
1587 	uint32_t supported_types = (vcpu_get_guest_msr(vcpu, MSR_IA32_VMX_EPT_VPID_CAP) >> 40U) & 0xfU;
1588 	struct invvpid_operand desc;
1589 	uint64_t type;
1590 
1591 	if (check_vmx_permission(vcpu)) {
1592 		type = get_invvpid_ept_operands(vcpu, (void *)&desc, sizeof(desc));
1593 
1594 		if ((type > VMX_VPID_TYPE_SINGLE_NON_GLOBAL) || ((supported_types & (1U << type)) == 0)) {
1595 			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
1596 		} else if ((desc.rsvd1 != 0U) || (desc.rsvd2 != 0U)) {
1597 			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
1598 		} else if ((type != VMX_VPID_TYPE_ALL_CONTEXT) && (desc.vpid == 0U)) {
1599 			/* check VPID for type 0, 1, 3 */
1600 			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
1601 		} else if ((type == VMX_VPID_TYPE_INDIVIDUAL_ADDR) && !validate_canonical_addr(vcpu, desc.gva)) {
1602 			nested_vmx_result(VMfailValid, VMXERR_INVEPT_INVVPID_INVALID_OPERAND);
1603 		} else {
1604 			/*
1605 			 * VPIDs are pass-thru. Values programmed by L1 are used by L0.
			 * The INVVPID type, VPID and GLA operands of the INVVPID instruction are
			 * passed as-is to the pCPU.
1608 			 */
1609 			asm_invvpid(desc, type);
1610 			nested_vmx_result(VMsucceed, 0);
1611 		}
1612 	}
1613 
1614 	return 0;
1615 }
1616 
void init_nested_vmx(__unused struct acrn_vm *vm)
1618 {
1619 	static bool initialized = false;
1620 
1621 	if (!initialized) {
1622 		initialized = true;
1623 
1624 		/* Cache the value of physical MSR_IA32_VMX_BASIC */
1625 		vmx_basic = (uint32_t)msr_read(MSR_IA32_VMX_BASIC);
1626 		setup_vmcs_shadowing_bitmap();
1627 	}
1628 }
1629