/*
 * Copyright (C) 2018-2022 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <types.h>
#include <errno.h>
#include <asm/guest/vm.h>
#include <asm/guest/virq.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/guest/ept.h>
#include <asm/vmx.h>
#include <asm/vtd.h>
#include <logmsg.h>
#include <trace.h>
#include <asm/rtct.h>

#define DBG_LEVEL_EPT	6U

/* EPT address space will not extend beyond the platform physical address space */
#define EPT_PML4_PAGE_NUM	PML4_PAGE_NUM(MAX_PHY_ADDRESS_SPACE)
#define EPT_PDPT_PAGE_NUM	PDPT_PAGE_NUM(MAX_PHY_ADDRESS_SPACE)

/* ept_pd_page_num consists of three parts:
 * 1) DRAM - DRAM and low MMIO are contiguous (we can assume this because the ve820 was built by us),
 *           CONFIG_MAX_VM_NUM at most
 * 2) low MMIO - low MMIO and DRAM are contiguous
 * 3) high MMIO - only PCI BARs are high MMIO (we don't build the high MMIO EPT mapping
 *                except when programming PCI 64-bit BARs)
 *
 * The first two parts may use PD_PAGE_NUM(get_e820_ram_size() + MEM_4G) PD pages at most
 * to build the EPT mapping;
 * The high MMIO may use (CONFIG_MAX_PCI_DEV_NUM * 6U) PD pages at most (possibly plus some
 * PDPT entries if a high MMIO BAR is larger than 1GB) to build the EPT mapping.
 *
 * ept_pt_page_num consists of three parts:
 * 1) DRAM - DRAM and low MMIO are contiguous (we can assume this because the ve820 was built by us),
 *           CONFIG_MAX_VM_NUM at most
 * 2) low MMIO - low MMIO and DRAM are contiguous
 * 3) high MMIO - only PCI BARs are high MMIO (we don't build the high MMIO EPT mapping
 *                except when programming PCI 64-bit BARs)
 *
 * The first two parts may use PT_PAGE_NUM(get_e820_ram_size() + MEM_4G) PT pages at most
 * to build the EPT mapping;
 * The high MMIO may use (CONFIG_MAX_PCI_DEV_NUM * 6U) PT pages at most to build the EPT mapping,
 * because: (a) each 64-bit MMIO BAR spends at most one PT page to build its EPT mapping:
 *              an MMIO BAR size must be a power of 2, starting from 16 bytes, and its base
 *              address must be aligned to its size. So if the BAR size is less than 2MB,
 *              one PT page is enough to cover its EPT mapping; if the BAR size is 2MB or larger,
 *              it must be a multiple of 2MB and we can use large pages to build its EPT mapping.
 *              The single exception is filtering the MSI-X structure part out of the MSI-X
 *              table BAR; in that case it also spends one PT page.
 *          (b) each PCI device may have six 64-bit MMIO BARs (three general BARs plus three VF BARs)
 *          (c) the maximum number of PCI devices for ACRN and the maximum number of virtual
 *              PCI devices for a VM are both CONFIG_MAX_PCI_DEV_NUM
 */
static uint64_t get_ept_page_num(void)
{
	uint64_t ept_pd_page_num = PD_PAGE_NUM(get_e820_ram_size() + MEM_4G) + CONFIG_MAX_PCI_DEV_NUM * 6U;
	uint64_t ept_pt_page_num = PT_PAGE_NUM(get_e820_ram_size() + MEM_4G) + CONFIG_MAX_PCI_DEV_NUM * 6U;

	return roundup((EPT_PML4_PAGE_NUM + EPT_PDPT_PAGE_NUM + ept_pd_page_num + ept_pt_page_num), 64U);
}
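
/*
 * Illustrative arithmetic for the calculation above (assumed values, not from any real board
 * configuration): with 16GB of e820 RAM, PD_PAGE_NUM(16G + 4G) = 20 and PT_PAGE_NUM(16G + 4G) = 10240
 * (one PD page covers 1GB, one PT page covers 2MB), plus CONFIG_MAX_PCI_DEV_NUM * 6U pages each for
 * high MMIO; the total is then rounded up to a multiple of 64 so the per-VM allocation bitmap
 * consists of whole 64-bit words.
 */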

uint64_t get_total_ept_4k_pages_size(void)
{
	return CONFIG_MAX_VM_NUM * (get_ept_page_num()) * PAGE_SIZE;
}

static struct page *ept_pages[CONFIG_MAX_VM_NUM];
static uint64_t *ept_page_bitmap[CONFIG_MAX_VM_NUM];
static struct page ept_dummy_pages[CONFIG_MAX_VM_NUM];

/* ept: extended page table (EPT) page pool */
static struct page_pool ept_page_pool[CONFIG_MAX_VM_NUM];

static void reserve_ept_bitmap(void)
{
	uint32_t i;
	uint64_t bitmap_base;
	uint64_t bitmap_size;
	uint64_t bitmap_offset;

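	/*
	 * One bit in the bitmap tracks one EPT page, so each per-VM bitmap takes
	 * get_ept_page_num() / 8 bytes; the same value is used as the per-VM
	 * stride (bitmap_offset) into the shared bitmap buffer.
	 */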
	bitmap_size = (get_ept_page_num() * CONFIG_MAX_VM_NUM) / 8;
	bitmap_offset = get_ept_page_num() / 8;

	bitmap_base = e820_alloc_memory(bitmap_size, MEM_SIZE_MAX);
	set_paging_supervisor(bitmap_base, bitmap_size);

	for (i = 0U; i < CONFIG_MAX_VM_NUM; i++) {
		ept_page_bitmap[i] = (uint64_t *)(void *)(bitmap_base + bitmap_offset * i);
	}
}

/*
 * @brief Reserve space for EPT 4K pages from platform E820 table
 */
void reserve_buffer_for_ept_pages(void)
{
	uint64_t page_base;
	uint16_t vm_id;
	uint32_t offset = 0U;

	page_base = e820_alloc_memory(get_total_ept_4k_pages_size(), MEM_SIZE_MAX);
	set_paging_supervisor(page_base, get_total_ept_4k_pages_size());
	for (vm_id = 0U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		ept_pages[vm_id] = (struct page *)(void *)(page_base + offset);
		/* assume each VM has the same number of EPT pages */
		offset += get_ept_page_num() * PAGE_SIZE;
	}

	reserve_ept_bitmap();
}

/* @pre: The PPT and EPT have the same page granularity */
static inline bool ept_large_page_support(enum _page_table_level level, __unused uint64_t prot)
{
	bool support;

	if (level == IA32E_PD) {
		support = true;
	} else if (level == IA32E_PDPT) {
		support = pcpu_has_vmx_ept_vpid_cap(VMX_EPT_1GB_PAGE);
	} else {
		support = false;
	}

	return support;
}

/*
 * Pages without execute right, such as MMIO, can always use large pages
 * based on hardware capability, even if the VM is an RTVM. This saves
 * page table pages and improves the TLB hit rate.
 */
static inline bool use_large_page(enum _page_table_level level, uint64_t prot)
{
	bool ret = false;	/* for code page */

	if ((prot & EPT_EXE) == 0UL) {
		ret = ept_large_page_support(level, prot);
	}

	return ret;
}

static inline void ept_clflush_pagewalk(const void *etry)
{
	iommu_flush_cache(etry, sizeof(uint64_t));
}

static inline void ept_nop_tweak_exe_right(uint64_t *entry __attribute__((unused))) {}
static inline void ept_nop_recover_exe_right(uint64_t *entry __attribute__((unused))) {}

/* The function is used to disable the execute right for (2MB/1GB) large pages in EPT */
static inline void ept_tweak_exe_right(uint64_t *entry)
{
	*entry &= ~EPT_EXE;
}

/* The function is used to recover the execute right when large pages are broken into 4KB pages.
 * The hypervisor doesn't control the execute right for guest memory, so it recovers the execute right by default.
 */
static inline void ept_recover_exe_right(uint64_t *entry)
{
	*entry |= EPT_EXE;
}

void init_ept_pgtable(struct pgtable *table, uint16_t vm_id)
{
	struct acrn_vm *vm = get_vm_from_vmid(vm_id);

	ept_page_pool[vm_id].start_page = ept_pages[vm_id];
	ept_page_pool[vm_id].bitmap_size = get_ept_page_num() / 64;
	ept_page_pool[vm_id].bitmap = ept_page_bitmap[vm_id];
	ept_page_pool[vm_id].dummy_page = &ept_dummy_pages[vm_id];

	spinlock_init(&ept_page_pool[vm_id].lock);
	memset((void *)ept_page_pool[vm_id].bitmap, 0, ept_page_pool[vm_id].bitmap_size * sizeof(uint64_t));
	ept_page_pool[vm_id].last_hint_id = 0UL;

	table->pool = &ept_page_pool[vm_id];
	table->default_access_right = EPT_RWX;
	table->pgentry_present_mask = EPT_RWX;
	table->clflush_pagewalk = ept_clflush_pagewalk;
	table->large_page_support = ept_large_page_support;

	/* Mitigation for issue "Machine Check Error on Page Size Change" */
	if (is_ept_force_4k_ipage()) {
		table->tweak_exe_right = ept_tweak_exe_right;
		table->recover_exe_right = ept_recover_exe_right;
		/* For RTVM, build 4KB page mapping in EPT for code pages */
		if (is_rt_vm(vm)) {
			table->large_page_support = use_large_page;
		}
	} else {
		table->tweak_exe_right = ept_nop_tweak_exe_right;
		table->recover_exe_right = ept_nop_recover_exe_right;
	}
}

/*
 * To enable the identity map and support legacy devices/ACPI methods in the Service VM,
 * ACRN presents the entire host 0-4GB memory region to the Service VM, except the memory
 * regions explicitly assigned to pre-launched VMs or the HV (DRAM and MMIO). However,
 * the virtual e820 only contains the known DRAM regions. For this reason,
 * we can't tell whether a GPA range is valid for the guest by checking its
 * ve820 table alone.
 *
 * Instead, we check whether the GPA range is valid for the guest by whether it is
 * mapped in the EPT page tables or not.
 */
bool ept_is_valid_mr(struct acrn_vm *vm, uint64_t mr_base_gpa, uint64_t mr_size)
{
	bool present = true;
	uint32_t sz;
	uint64_t end = mr_base_gpa + mr_size, address = mr_base_gpa;

	while (address < end) {
		if (local_gpa2hpa(vm, address, &sz) == INVALID_HPA) {
			present = false;
			break;
		}
		address += sz;
	}

	return present;
}

void destroy_ept(struct acrn_vm *vm)
{
	/* Destroy secure world */
	if (vm->sworld_control.flag.active != 0UL) {
		destroy_secure_world(vm, true);
	}

	if (vm->arch_vm.nworld_eptp != NULL) {
		(void)memset(vm->arch_vm.nworld_eptp, 0U, PAGE_SIZE);
	}
}

/**
 * @pre: vm != NULL.
 */
uint64_t local_gpa2hpa(struct acrn_vm *vm, uint64_t gpa, uint32_t *size)
{
	/* using return value INVALID_HPA as error code */
	uint64_t hpa = INVALID_HPA;
	const uint64_t *pgentry;
	uint64_t pg_size = 0UL;
	void *eptp;

	eptp = get_eptp(vm);
	pgentry = pgtable_lookup_entry((uint64_t *)eptp, gpa, &pg_size, &vm->arch_vm.ept_pgtable);
	if (pgentry != NULL) {
		hpa = (((*pgentry & (~EPT_PFN_HIGH_MASK)) & (~(pg_size - 1UL)))
				| (gpa & (pg_size - 1UL)));
	}

	/**
	 * If specified parameter size is not NULL and
	 * the HPA of parameter gpa is found, pg_size shall
	 * be returned through parameter size.
	 */
	if ((size != NULL) && (hpa != INVALID_HPA)) {
		*size = (uint32_t)pg_size;
	}

	return hpa;
}
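
/*
 * Illustrative arithmetic for the translation above (hypothetical values): for a 2MB leaf
 * mapping, pg_size is 0x200000UL, so the result is the 2MB-aligned HPA taken from the EPT
 * entry (after masking EPT_PFN_HIGH_MASK) OR'ed with the guest's offset within that 2MB page,
 * i.e. (gpa & 0x1FFFFFUL).
 */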

/* using return value INVALID_HPA as error code */
uint64_t gpa2hpa(struct acrn_vm *vm, uint64_t gpa)
{
	return local_gpa2hpa(vm, gpa, NULL);
}

/**
 * @pre: the gpa and hpa are identity-mapped in the Service VM.
 */
uint64_t service_vm_hpa2gpa(uint64_t hpa)
{
	return hpa;
}

int32_t ept_misconfig_vmexit_handler(__unused struct acrn_vcpu *vcpu)
{
	int32_t status;

	status = -EINVAL;

	/* TODO - EPT Violation handler */
	pr_fatal("%s, Guest linear address: 0x%016lx ",
			__func__, exec_vmread(VMX_GUEST_LINEAR_ADDR));

	pr_fatal("%s, Guest physical address: 0x%016lx ",
			__func__, exec_vmread64(VMX_GUEST_PHYSICAL_ADDR_FULL));

	ASSERT(status == 0, "EPT Misconfiguration is not handled.\n");

	TRACE_2L(TRACE_VMEXIT_EPT_MISCONFIGURATION, 0UL, 0UL);

	return status;
}

static inline void ept_flush_guest(struct acrn_vm *vm)
{
	uint16_t i;
	struct acrn_vcpu *vcpu;

	/* This doesn't do the real flush; it just posts a request that will be handled before the vCPU's next VM entry */
	foreach_vcpu(i, vm, vcpu) {
		vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH);
	}
}

void ept_add_mr(struct acrn_vm *vm, uint64_t *pml4_page,
	uint64_t hpa, uint64_t gpa, uint64_t size, uint64_t prot_orig)
{
	uint64_t prot = prot_orig;

	dev_dbg(DBG_LEVEL_EPT, "%s, vm[%d] hpa: 0x%016lx gpa: 0x%016lx size: 0x%016lx prot: 0x%016x\n",
			__func__, vm->vm_id, hpa, gpa, size, prot);

	spinlock_obtain(&vm->ept_lock);

	pgtable_add_map(pml4_page, hpa, gpa, size, prot, &vm->arch_vm.ept_pgtable);

	spinlock_release(&vm->ept_lock);

	ept_flush_guest(vm);
}
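
/*
 * Illustrative usage sketch (hypothetical values, not taken from this file): mapping a 2MB MMIO
 * range read/write, uncached and without execute right could look like
 *
 *	ept_add_mr(vm, (uint64_t *)vm->arch_vm.nworld_eptp,
 *			hpa, gpa, 0x200000UL, EPT_RD | EPT_WR | EPT_UNCACHED);
 *
 * Since such a range has no execute right, use_large_page() still permits a large-page
 * mapping for it on an RTVM.
 */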

void ept_modify_mr(struct acrn_vm *vm, uint64_t *pml4_page,
		uint64_t gpa, uint64_t size,
		uint64_t prot_set, uint64_t prot_clr)
{
	uint64_t local_prot = prot_set;

	dev_dbg(DBG_LEVEL_EPT, "%s,vm[%d] gpa 0x%lx size 0x%lx\n", __func__, vm->vm_id, gpa, size);

	spinlock_obtain(&vm->ept_lock);

	pgtable_modify_or_del_map(pml4_page, gpa, size, local_prot, prot_clr, &(vm->arch_vm.ept_pgtable), MR_MODIFY);

	spinlock_release(&vm->ept_lock);

	ept_flush_guest(vm);
}

/**
 * @pre [gpa,gpa+size) has been mapped into host physical memory region
 */
void ept_del_mr(struct acrn_vm *vm, uint64_t *pml4_page, uint64_t gpa, uint64_t size)
{
	dev_dbg(DBG_LEVEL_EPT, "%s,vm[%d] gpa 0x%lx size 0x%lx\n", __func__, vm->vm_id, gpa, size);

	spinlock_obtain(&vm->ept_lock);

	pgtable_modify_or_del_map(pml4_page, gpa, size, 0UL, 0UL, &(vm->arch_vm.ept_pgtable), MR_DEL);

	spinlock_release(&vm->ept_lock);

	ept_flush_guest(vm);
}

/**
 * @pre pge != NULL && size > 0.
 */
void ept_flush_leaf_page(uint64_t *pge, uint64_t size)
{
	uint64_t base_hpa, end_hpa;
	uint64_t sw_sram_bottom, sw_sram_top;

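	/*
	 * Note: the flushes below deliberately leave out the Software SRAM range; Software SRAM
	 * is meant to stay resident in cache for real-time use, so flushing it here would defeat
	 * that purpose (this rationale is an assumption based on the RTCT/Software SRAM support).
	 */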
	if ((*pge & EPT_MT_MASK) != EPT_UNCACHED) {
		base_hpa = (*pge & (~(size - 1UL)));
		end_hpa = base_hpa + size;

		sw_sram_bottom = get_software_sram_base();
		sw_sram_top = sw_sram_bottom + get_software_sram_size();
		/* When Software SRAM is not initialized, both sw_sram_bottom and sw_sram_top are 0,
		 * so the first if block below is never taken.
		 */
		if (base_hpa < sw_sram_bottom) {
			/*
			 * For end_hpa < sw_sram_bottom, flush [base_hpa, end_hpa);
			 * For end_hpa >= sw_sram_bottom && end_hpa < sw_sram_top, flush [base_hpa, sw_sram_bottom);
			 * For end_hpa > sw_sram_top, flush [base_hpa, sw_sram_bottom) first,
			 *                            flush [sw_sram_top, end_hpa) in the next if condition
			 */
			stac();
			flush_cache_range(hpa2hva(base_hpa), min(end_hpa, sw_sram_bottom) - base_hpa);
			clac();
		}

		if (end_hpa > sw_sram_top) {
			/*
			 * For base_hpa > sw_sram_top, flush [base_hpa, end_hpa);
			 * For base_hpa >= sw_sram_bottom && base_hpa < sw_sram_top, flush [sw_sram_top, end_hpa);
			 * For base_hpa < sw_sram_bottom, flush [sw_sram_top, end_hpa) here,
			 *                            flush [base_hpa, sw_sram_bottom) in the if condition above
			 */
			stac();
			flush_cache_range(hpa2hva(max(base_hpa, sw_sram_top)), end_hpa - max(base_hpa, sw_sram_top));
			clac();
		}
	}
}

/**
 * @pre: vm != NULL.
 */
void *get_eptp(struct acrn_vm *vm)
{
	void *eptp;
	struct acrn_vcpu *vcpu = vcpu_from_pid(vm, get_pcpu_id());

	if ((vcpu != NULL) && (vcpu->arch.cur_context == SECURE_WORLD)) {
		eptp = vm->arch_vm.sworld_eptp;
	} else {
		eptp = vm->arch_vm.nworld_eptp;
	}

	return eptp;
}

/**
 * @pre vm != NULL && cb != NULL.
 */
void walk_ept_table(struct acrn_vm *vm, pge_handler cb)
{
	const struct pgtable *table = &vm->arch_vm.ept_pgtable;
	uint64_t *pml4e, *pdpte, *pde, *pte;
	uint64_t i, j, k, m;

	for (i = 0UL; i < PTRS_PER_PML4E; i++) {
		pml4e = pml4e_offset((uint64_t *)get_eptp(vm), i << PML4E_SHIFT);
		if (!pgentry_present(table, (*pml4e))) {
			continue;
		}
		for (j = 0UL; j < PTRS_PER_PDPTE; j++) {
			pdpte = pdpte_offset(pml4e, j << PDPTE_SHIFT);
			if (!pgentry_present(table, (*pdpte))) {
				continue;
			}
			if (pdpte_large(*pdpte) != 0UL) {
				cb(pdpte, PDPTE_SIZE);
				continue;
			}
			for (k = 0UL; k < PTRS_PER_PDE; k++) {
				pde = pde_offset(pdpte, k << PDE_SHIFT);
				if (!pgentry_present(table, (*pde))) {
					continue;
				}
				if (pde_large(*pde) != 0UL) {
					cb(pde, PDE_SIZE);
					continue;
				}
				for (m = 0UL; m < PTRS_PER_PTE; m++) {
					pte = pte_offset(pde, m << PTE_SHIFT);
					if (pgentry_present(table, (*pte))) {
						cb(pte, PTE_SIZE);
					}
				}
			}
			/*
			 * Walking through the whole page tables of one VM is a time-consuming
			 * operation. Preemption is not supported by hypervisor scheduling
			 * currently, so walking the page tables might occupy the CPU for a
			 * long time and starve other threads.
			 *
			 * Give them a chance by releasing the CPU when a reschedule is needed.
			 */
			if (need_reschedule(get_pcpu_id())) {
				schedule();
			}
		}
	}
}
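
/*
 * Illustrative sketch (not part of the hypervisor source): a pge_handler callback passed to
 * walk_ept_table() receives each present leaf entry and the size it maps. Assuming pr_info()
 * from <logmsg.h>, a hypothetical dump callback could look like:
 *
 *	static void dump_ept_leaf(uint64_t *pge, uint64_t size)
 *	{
 *		pr_info("EPT leaf entry 0x%lx maps 0x%lx bytes", *pge, size);
 *	}
 *
 *	walk_ept_table(vm, dump_ept_leaf);
 */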