/*
 * Copyright (C) 2018-2022 Intel Corporation.
 *
 * SPDX-License-Identifier: BSD-3-Clause
 */

#include <types.h>
#include <errno.h>
#include <asm/guest/vm.h>
#include <asm/guest/virq.h>
#include <asm/pgtable.h>
#include <asm/mmu.h>
#include <asm/guest/ept.h>
#include <asm/vmx.h>
#include <asm/vtd.h>
#include <logmsg.h>
#include <trace.h>
#include <asm/rtct.h>

#define DBG_LEVEL_EPT	6U

/* The EPT address space will not go beyond the platform physical address space */
#define EPT_PML4_PAGE_NUM	PML4_PAGE_NUM(MAX_PHY_ADDRESS_SPACE)
#define EPT_PDPT_PAGE_NUM	PDPT_PAGE_NUM(MAX_PHY_ADDRESS_SPACE)

/* ept_pd_page_num consists of three parts:
 * 1) DRAM - DRAM and low MMIO are contiguous (we can assume this because the ve820 is built by us),
 *    CONFIG_MAX_VM_NUM at most
 * 2) low MMIO - low MMIO and DRAM are contiguous
 * 3) high MMIO - only PCI BARs reside in high MMIO (we don't build the high MMIO EPT mapping
 *    except when a 64-bit PCI BAR is written)
 *
 * The first two parts may use at most PD_PAGE_NUM(get_e820_ram_size() + MEM_4G) PD pages
 * to build the EPT mapping;
 * The high MMIO may use at most (CONFIG_MAX_PCI_DEV_NUM * 6U) PD pages (plus possibly some
 * PDPT entries if a high MMIO BAR is larger than 1GB) to build the EPT mapping.
 *
 * ept_pt_page_num also consists of three parts:
 * 1) DRAM - DRAM and low MMIO are contiguous (we can assume this because the ve820 is built by us),
 *    CONFIG_MAX_VM_NUM at most
 * 2) low MMIO - low MMIO and DRAM are contiguous
 * 3) high MMIO - only PCI BARs reside in high MMIO (we don't build the high MMIO EPT mapping
 *    except when a 64-bit PCI BAR is written)
 *
 * The first two parts may use at most PT_PAGE_NUM(get_e820_ram_size() + MEM_4G) PT pages
 * to build the EPT mapping;
 * The high MMIO may use at most (CONFIG_MAX_PCI_DEV_NUM * 6U) PT pages to build the EPT mapping.
 * This is because:
 * (a) each 64-bit MMIO BAR needs at most one PT page for its EPT mapping: an MMIO BAR size must
 *     be a power of 2, starting from 16 bytes, and its base address must be aligned to its size.
 *     So if the MMIO BAR size is less than 2M, one PT page is enough to cover its EPT mapping;
 *     if the size is 2M or larger, it must be a multiple of 2M and we can use large pages to
 *     build its EPT mapping. The single exception is filtering the MSI-X structure part out of
 *     the MSI-X table BAR; in that case one PT page is also spent.
 * (b) each PCI device may have up to six 64-bit MMIO BARs (three general BARs plus three VF BARs).
 * (c) both the maximum number of PCI devices for ACRN and the maximum number of virtual PCI
 *     devices for a VM are CONFIG_MAX_PCI_DEV_NUM.
 *
 * See the illustrative calculation following get_ept_page_num() below.
 */
static uint64_t get_ept_page_num(void)
{
	uint64_t ept_pd_page_num = PD_PAGE_NUM(get_e820_ram_size() + MEM_4G) + CONFIG_MAX_PCI_DEV_NUM * 6U;
	uint64_t ept_pt_page_num = PT_PAGE_NUM(get_e820_ram_size() + MEM_4G) + CONFIG_MAX_PCI_DEV_NUM * 6U;

	return roundup((EPT_PML4_PAGE_NUM + EPT_PDPT_PAGE_NUM + ept_pd_page_num + ept_pt_page_num), 64U);
}
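
/*
 * Illustrative calculation only (the numbers below are assumptions, not from this file):
 * suppose get_e820_ram_size() reports 32GB of DRAM, CONFIG_MAX_PCI_DEV_NUM == 96, and the
 * usual page-table macros apply, where PD_PAGE_NUM() counts 1GB-covering PD pages and
 * PT_PAGE_NUM() counts 2MB-covering PT pages. Then:
 *
 *	ept_pd_page_num = PD_PAGE_NUM(36GB) + 96 * 6 = 36    + 576 = 612
 *	ept_pt_page_num = PT_PAGE_NUM(36GB) + 96 * 6 = 18432 + 576 = 19008
 *
 * Taking EPT_PML4_PAGE_NUM + EPT_PDPT_PAGE_NUM as 2 (a 39-bit physical address space),
 * get_ept_page_num() returns roundup(2 + 612 + 19008, 64) = 19648 pages, i.e. roughly
 * 76.75MB of 4K pages reserved per VM by get_total_ept_4k_pages_size().
 */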

uint64_t get_total_ept_4k_pages_size(void)
{
	return CONFIG_MAX_VM_NUM * (get_ept_page_num()) * PAGE_SIZE;
}

static struct page *ept_pages[CONFIG_MAX_VM_NUM];
static uint64_t *ept_page_bitmap[CONFIG_MAX_VM_NUM];
static struct page ept_dummy_pages[CONFIG_MAX_VM_NUM];

/* ept: extended page pool */
static struct page_pool ept_page_pool[CONFIG_MAX_VM_NUM];

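/*
 * Carve the per-VM EPT page-allocation bitmaps out of one contiguous e820 allocation:
 * one bit per EPT page, so each VM's bitmap takes get_ept_page_num() / 8 bytes, and
 * VM i's bitmap starts at bitmap_base + i * bitmap_offset.
 */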
static void reserve_ept_bitmap(void)
{
	uint32_t i;
	uint64_t bitmap_base;
	uint64_t bitmap_size;
	uint64_t bitmap_offset;

	bitmap_size = (get_ept_page_num() * CONFIG_MAX_VM_NUM) / 8;
	bitmap_offset = get_ept_page_num() / 8;

	bitmap_base = e820_alloc_memory(bitmap_size, MEM_SIZE_MAX);
	set_paging_supervisor(bitmap_base, bitmap_size);

	for (i = 0U; i < CONFIG_MAX_VM_NUM; i++) {
		ept_page_bitmap[i] = (uint64_t *)(void *)(bitmap_base + bitmap_offset * i);
	}
}

/*
 * @brief Reserve space for EPT 4K pages from the platform E820 table
 */
void reserve_buffer_for_ept_pages(void)
{
	uint64_t page_base;
	uint16_t vm_id;
	uint32_t offset = 0U;

	page_base = e820_alloc_memory(get_total_ept_4k_pages_size(), MEM_SIZE_MAX);
	set_paging_supervisor(page_base, get_total_ept_4k_pages_size());
	for (vm_id = 0U; vm_id < CONFIG_MAX_VM_NUM; vm_id++) {
		ept_pages[vm_id] = (struct page *)(void *)(page_base + offset);
		/* assume each VM has the same number of EPT pages */
		offset += get_ept_page_num() * PAGE_SIZE;
	}

	reserve_ept_bitmap();
}

/* @pre: The PPT and EPT have the same page granularity */
static inline bool ept_large_page_support(enum _page_table_level level, __unused uint64_t prot)
{
	bool support;

	if (level == IA32E_PD) {
		support = true;
	} else if (level == IA32E_PDPT) {
		support = pcpu_has_vmx_ept_vpid_cap(VMX_EPT_1GB_PAGE);
	} else {
		support = false;
	}

	return support;
}

/*
 * Pages without execute right, such as MMIO, can always use large pages based on
 * hardware capability, even if the VM is an RTVM. This saves page-table pages and
 * improves the TLB hit rate. An illustrative example follows this comment.
 */
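/*
 * For example (illustrative numbers, not from the original source): a 256MB,
 * non-executable MMIO BAR region mapped with EPT_RW only can be covered by 128
 * 2MB PDEs (or by 1GB pages where VMX_EPT_1GB_PAGE is supported) instead of
 * 65536 4KB PTEs, even when is_ept_force_4k_ipage() is true for an RTVM.
 */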
static inline bool use_large_page(enum _page_table_level level, uint64_t prot)
{
	bool ret = false;	/* for code page */

	if ((prot & EPT_EXE) == 0UL) {
		ret = ept_large_page_support(level, prot);
	}

	return ret;
}

static inline void ept_clflush_pagewalk(const void *etry)
{
	iommu_flush_cache(etry, sizeof(uint64_t));
}

static inline void ept_nop_tweak_exe_right(uint64_t *entry __attribute__((unused))) {}
static inline void ept_nop_recover_exe_right(uint64_t *entry __attribute__((unused))) {}

/* This function is used to disable the execute right for (2MB / 1GB) large pages in EPT */
static inline void ept_tweak_exe_right(uint64_t *entry)
{
	*entry &= ~EPT_EXE;
}

/* This function is used to recover the execute right when a large page is broken into 4KB pages.
 * The hypervisor doesn't control the execute right for guest memory, so it recovers the execute
 * right by default.
 */
static inline void ept_recover_exe_right(uint64_t *entry)
{
	*entry |= EPT_EXE;
}

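/*
 * @brief Set up the per-VM EPT page pool and the pgtable operations (access rights,
 * cache-flush hook, large-page policy, and the execute-right tweak/recover hooks used
 * by the "Machine Check Error on Page Size Change" mitigation) for the given vm_id.
 */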
void init_ept_pgtable(struct pgtable *table, uint16_t vm_id)
{
	struct acrn_vm *vm = get_vm_from_vmid(vm_id);

	ept_page_pool[vm_id].start_page = ept_pages[vm_id];
	ept_page_pool[vm_id].bitmap_size = get_ept_page_num() / 64;
	ept_page_pool[vm_id].bitmap = ept_page_bitmap[vm_id];
	ept_page_pool[vm_id].dummy_page = &ept_dummy_pages[vm_id];

	spinlock_init(&ept_page_pool[vm_id].lock);
	memset((void *)ept_page_pool[vm_id].bitmap, 0, ept_page_pool[vm_id].bitmap_size * sizeof(uint64_t));
	ept_page_pool[vm_id].last_hint_id = 0UL;

	table->pool = &ept_page_pool[vm_id];
	table->default_access_right = EPT_RWX;
	table->pgentry_present_mask = EPT_RWX;
	table->clflush_pagewalk = ept_clflush_pagewalk;
	table->large_page_support = ept_large_page_support;

	/* Mitigation for issue "Machine Check Error on Page Size Change" */
	if (is_ept_force_4k_ipage()) {
		table->tweak_exe_right = ept_tweak_exe_right;
		table->recover_exe_right = ept_recover_exe_right;
		/* For RTVM, build 4KB page mapping in EPT for code pages */
		if (is_rt_vm(vm)) {
			table->large_page_support = use_large_page;
		}
	} else {
		table->tweak_exe_right = ept_nop_tweak_exe_right;
		table->recover_exe_right = ept_nop_recover_exe_right;
	}
}

/*
 * To enable the 1:1 (identical) mapping and support legacy devices/ACPI methods in the
 * Service VM, ACRN presents the entire host 0-4GB memory region to the Service VM, except
 * for the memory regions (DRAM and MMIO) explicitly assigned to pre-launched VMs or the HV.
 * However, the virtual e820 only contains the known DRAM regions, so we can't tell whether
 * a GPA range is valid for the guest by checking its ve820 table alone.
 *
 * Instead, we check whether the GPA range is guest-valid by checking whether it is mapped
 * in the EPT page table.
 */
bool ept_is_valid_mr(struct acrn_vm *vm, uint64_t mr_base_gpa, uint64_t mr_size)
{
	bool present = true;
	uint32_t sz;
	uint64_t end = mr_base_gpa + mr_size, address = mr_base_gpa;

	while (address < end) {
		if (local_gpa2hpa(vm, address, &sz) == INVALID_HPA) {
			present = false;
			break;
		}
		address += sz;
	}

	return present;
}

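/*
 * @brief Tear down the VM's EPT: destroy the secure world first if it is active,
 * then clear the normal world EPTP root page.
 */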
void destroy_ept(struct acrn_vm *vm)
{
	/* Destroy secure world */
	if (vm->sworld_control.flag.active != 0UL) {
		destroy_secure_world(vm, true);
	}

	if (vm->arch_vm.nworld_eptp != NULL) {
		(void)memset(vm->arch_vm.nworld_eptp, 0U, PAGE_SIZE);
	}
}

/**
 * @pre: vm != NULL.
 */
uint64_t local_gpa2hpa(struct acrn_vm *vm, uint64_t gpa, uint32_t *size)
{
	/* using return value INVALID_HPA as error code */
	uint64_t hpa = INVALID_HPA;
	const uint64_t *pgentry;
	uint64_t pg_size = 0UL;
	void *eptp;

	eptp = get_eptp(vm);
	pgentry = pgtable_lookup_entry((uint64_t *)eptp, gpa, &pg_size, &vm->arch_vm.ept_pgtable);
	if (pgentry != NULL) {
		hpa = (((*pgentry & (~EPT_PFN_HIGH_MASK)) & (~(pg_size - 1UL)))
				| (gpa & (pg_size - 1UL)));
	}

	/**
	 * If the parameter 'size' is not NULL and the HPA for 'gpa' is found,
	 * pg_size is returned through 'size'.
	 */
	if ((size != NULL) && (hpa != INVALID_HPA)) {
		*size = (uint32_t)pg_size;
	}

	return hpa;
}

/* using return value INVALID_HPA as error code */
uint64_t gpa2hpa(struct acrn_vm *vm, uint64_t gpa)
{
	return local_gpa2hpa(vm, gpa, NULL);
}

/**
 * @pre: the gpa and hpa are identically mapped in the Service VM.
 */
uint64_t service_vm_hpa2gpa(uint64_t hpa)
{
	return hpa;
}

int32_t ept_misconfig_vmexit_handler(__unused struct acrn_vcpu *vcpu)
{
	int32_t status;

	status = -EINVAL;

	/* TODO - EPT Violation handler */
	pr_fatal("%s, Guest linear address: 0x%016lx ",
			__func__, exec_vmread(VMX_GUEST_LINEAR_ADDR));

	pr_fatal("%s, Guest physical address: 0x%016lx ",
			__func__, exec_vmread64(VMX_GUEST_PHYSICAL_ADDR_FULL));

	ASSERT(status == 0, "EPT Misconfiguration is not handled.\n");

	TRACE_2L(TRACE_VMEXIT_EPT_MISCONFIGURATION, 0UL, 0UL);

	return status;
}

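/*
 * @brief Request an EPT flush on every vCPU of the VM; the actual flush is carried out
 * when each vCPU handles ACRN_REQUEST_EPT_FLUSH before its next VM entry.
 */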
static inline void ept_flush_guest(struct acrn_vm *vm)
{
	uint16_t i;
	struct acrn_vcpu *vcpu;

	/* This doesn't do the real flush; it just makes the request, which is handled before the vCPU's VM entry */
	foreach_vcpu(i, vm, vcpu) {
		vcpu_make_request(vcpu, ACRN_REQUEST_EPT_FLUSH);
	}
}

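/*
 * @brief Map the guest physical range [gpa, gpa + size) to the host physical range
 * starting at hpa with access rights prot_orig, under the VM's ept_lock, then request
 * an EPT flush on all vCPUs.
 */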
void ept_add_mr(struct acrn_vm *vm, uint64_t *pml4_page,
	uint64_t hpa, uint64_t gpa, uint64_t size, uint64_t prot_orig)
{
	uint64_t prot = prot_orig;

	dev_dbg(DBG_LEVEL_EPT, "%s, vm[%d] hpa: 0x%016lx gpa: 0x%016lx size: 0x%016lx prot: 0x%016x\n",
			__func__, vm->vm_id, hpa, gpa, size, prot);

	spinlock_obtain(&vm->ept_lock);

	pgtable_add_map(pml4_page, hpa, gpa, size, prot, &vm->arch_vm.ept_pgtable);

	spinlock_release(&vm->ept_lock);

	ept_flush_guest(vm);
}

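/*
 * @brief Modify the access rights of the already-mapped guest physical range
 * [gpa, gpa + size): set the bits in prot_set and clear the bits in prot_clr,
 * under the VM's ept_lock, then request an EPT flush on all vCPUs.
 */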
void ept_modify_mr(struct acrn_vm *vm, uint64_t *pml4_page,
	uint64_t gpa, uint64_t size,
	uint64_t prot_set, uint64_t prot_clr)
{
	uint64_t local_prot = prot_set;

	dev_dbg(DBG_LEVEL_EPT, "%s,vm[%d] gpa 0x%lx size 0x%lx\n", __func__, vm->vm_id, gpa, size);

	spinlock_obtain(&vm->ept_lock);

	pgtable_modify_or_del_map(pml4_page, gpa, size, local_prot, prot_clr, &(vm->arch_vm.ept_pgtable), MR_MODIFY);

	spinlock_release(&vm->ept_lock);

	ept_flush_guest(vm);
}

/**
 * @pre [gpa, gpa + size) has been mapped into the host physical memory region
 */
void ept_del_mr(struct acrn_vm *vm, uint64_t *pml4_page, uint64_t gpa, uint64_t size)
{
	dev_dbg(DBG_LEVEL_EPT, "%s,vm[%d] gpa 0x%lx size 0x%lx\n", __func__, vm->vm_id, gpa, size);

	spinlock_obtain(&vm->ept_lock);

	pgtable_modify_or_del_map(pml4_page, gpa, size, 0UL, 0UL, &(vm->arch_vm.ept_pgtable), MR_DEL);

	spinlock_release(&vm->ept_lock);

	ept_flush_guest(vm);
}
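
/*
 * Illustrative usage sketch (hypothetical caller; the vgpu_remap_region() name and the
 * chosen attributes are not from this file): remapping a guest region is typically done
 * by deleting the old mapping and adding the new one against the normal world EPT root:
 *
 *	void vgpu_remap_region(struct acrn_vm *vm, uint64_t gpa, uint64_t new_hpa, uint64_t size)
 *	{
 *		uint64_t *pml4 = (uint64_t *)vm->arch_vm.nworld_eptp;
 *
 *		ept_del_mr(vm, pml4, gpa, size);
 *		ept_add_mr(vm, pml4, new_hpa, gpa, size, EPT_RWX | EPT_WB);
 *	}
 */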

/**
 * @pre pge != NULL && size > 0.
 */
void ept_flush_leaf_page(uint64_t *pge, uint64_t size)
{
	uint64_t base_hpa, end_hpa;
	uint64_t sw_sram_bottom, sw_sram_top;

	if ((*pge & EPT_MT_MASK) != EPT_UNCACHED) {
		base_hpa = (*pge & (~(size - 1UL)));
		end_hpa = base_hpa + size;

		sw_sram_bottom = get_software_sram_base();
		sw_sram_top = sw_sram_bottom + get_software_sram_size();
		/* When Software SRAM is not initialized, both sw_sram_bottom and sw_sram_top are 0,
		 * so the first if branch below has no effect.
		 */
		if (base_hpa < sw_sram_bottom) {
			/*
			 * For end_hpa < sw_sram_bottom, flush [base_hpa, end_hpa);
			 * For end_hpa >= sw_sram_bottom && end_hpa < sw_sram_top, flush [base_hpa, sw_sram_bottom);
			 * For end_hpa > sw_sram_top, flush [base_hpa, sw_sram_bottom) first,
			 * then flush [sw_sram_top, end_hpa) in the next if branch
			 */
			stac();
			flush_cache_range(hpa2hva(base_hpa), min(end_hpa, sw_sram_bottom) - base_hpa);
			clac();
		}

		if (end_hpa > sw_sram_top) {
			/*
			 * For base_hpa > sw_sram_top, flush [base_hpa, end_hpa);
			 * For base_hpa >= sw_sram_bottom && base_hpa < sw_sram_top, flush [sw_sram_top, end_hpa);
			 * For base_hpa < sw_sram_bottom, flush [sw_sram_top, end_hpa) here,
			 * after [base_hpa, sw_sram_bottom) was flushed in the previous if branch
			 */
			stac();
			flush_cache_range(hpa2hva(max(base_hpa, sw_sram_top)), end_hpa - max(base_hpa, sw_sram_top));
			clac();
		}
	}
}

/**
 * @pre: vm != NULL.
 */
void *get_eptp(struct acrn_vm *vm)
{
	void *eptp;
	struct acrn_vcpu *vcpu = vcpu_from_pid(vm, get_pcpu_id());

	if ((vcpu != NULL) && (vcpu->arch.cur_context == SECURE_WORLD)) {
		eptp = vm->arch_vm.sworld_eptp;
	} else {
		eptp = vm->arch_vm.nworld_eptp;
	}

	return eptp;
}

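/*
 * Illustrative usage sketch for walk_ept_table() below (hypothetical callback, not part
 * of this file): a pge_handler is called with each present leaf entry and the size it
 * maps, e.g. to count how much guest memory is currently mapped:
 *
 *	static uint64_t mapped_bytes;
 *
 *	static void count_mapped_cb(uint64_t *pge, uint64_t size)
 *	{
 *		(void)pge;
 *		mapped_bytes += size;
 *	}
 *
 *	walk_ept_table(vm, count_mapped_cb);
 */
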
/**
 * @pre vm != NULL && cb != NULL.
 */
void walk_ept_table(struct acrn_vm *vm, pge_handler cb)
{
	const struct pgtable *table = &vm->arch_vm.ept_pgtable;
	uint64_t *pml4e, *pdpte, *pde, *pte;
	uint64_t i, j, k, m;

	for (i = 0UL; i < PTRS_PER_PML4E; i++) {
		pml4e = pml4e_offset((uint64_t *)get_eptp(vm), i << PML4E_SHIFT);
		if (!pgentry_present(table, (*pml4e))) {
			continue;
		}
		for (j = 0UL; j < PTRS_PER_PDPTE; j++) {
			pdpte = pdpte_offset(pml4e, j << PDPTE_SHIFT);
			if (!pgentry_present(table, (*pdpte))) {
				continue;
			}
			if (pdpte_large(*pdpte) != 0UL) {
				cb(pdpte, PDPTE_SIZE);
				continue;
			}
			for (k = 0UL; k < PTRS_PER_PDE; k++) {
				pde = pde_offset(pdpte, k << PDE_SHIFT);
				if (!pgentry_present(table, (*pde))) {
					continue;
				}
				if (pde_large(*pde) != 0UL) {
					cb(pde, PDE_SIZE);
					continue;
				}
				for (m = 0UL; m < PTRS_PER_PTE; m++) {
					pte = pte_offset(pde, m << PTE_SHIFT);
					if (pgentry_present(table, (*pte))) {
						cb(pte, PTE_SIZE);
					}
				}
			}
			/*
			 * Walking through the whole page table of a VM is a time-consuming
			 * operation. Preemption is not currently supported by the hypervisor
			 * scheduler, so the page-table walk could occupy the CPU for a long
			 * time and starve other threads.
			 *
			 * Give it a chance to release the CPU so that other threads can run.
			 */
			if (need_reschedule(get_pcpu_id())) {
				schedule();
			}
		}
	}
}