/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * arch/x86/mm/guest_walk.c
 *
 * Pagetable walker for guest memory accesses.
 *
 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
 */

#include <xen/types.h>
#include <xen/mm.h>
#include <xen/paging.h>
#include <xen/domain_page.h>
#include <xen/sched.h>

#include <asm/page.h>
#include <asm/prot-key.h>
#include <asm/guest_pt.h>
#include <asm/hvm/emulate.h>

/*
 * Modify a guest pagetable entry to set the Accessed and Dirty bits.
 * Returns true if it actually writes to guest memory.
 */
static bool set_ad_bits(guest_intpte_t *guest_p, guest_intpte_t *walk_p,
                        bool set_dirty)
{
    guest_intpte_t new, old = *walk_p;

    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
    if ( old != new )
    {
        /*
         * Write the new entry into the walk, and try to write it back
         * into the guest table as well. If the guest table has changed
         * under our feet then leave it alone.
         */
        *walk_p = new;
        if ( cmpxchg(guest_p, old, new) == old )
            return true;
    }
    return false;
}

/*
 * Walk the guest pagetables, after the manner of a hardware walker.
 *
 * This is a condensing of the 'Paging' chapters from Intel and AMD software
 * manuals. Please refer closely to them.
 *
 * A pagetable walk consists of two parts:
 * 1) to find whether a translation exists, and
 * 2) if a translation does exist, to check whether the translation's access
 *    rights permit the access.
 *
 * A translation is found by following the pagetable structure (starting at
 * %cr3) to a leaf entry (an L1 PTE, or a higher level entry with PSE set)
 * which identifies the physical destination of the access.
 *
 * A translation from one level to the next exists if the PTE is both present
 * and has no reserved bits set. If the pagewalk encounters a situation where
 * a translation does not exist, the walk stops at that point.
 *
 * The access rights (NX, User, RW bits) are collected as the walk progresses.
 * If a translation exists, the accumulated access rights are compared to the
 * requested walk, to see whether the access is permitted.
 */
bool
guest_walk_tables(const struct vcpu *v, struct p2m_domain *p2m,
                  unsigned long va, walk_t *gw, uint32_t walk,
                  gfn_t top_gfn, mfn_t top_mfn, void *top_map)
{
    struct domain *d = v->domain;
    guest_l1e_t *l1p = NULL;
    guest_l2e_t *l2p = NULL;
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    guest_l3e_t *l3p = NULL;
    guest_l4e_t *l4p;
    paddr_t l4gpa;
#endif
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
    paddr_t l3gpa;
#endif
    uint32_t gflags, rc;
    paddr_t l1gpa = 0, l2gpa = 0;
    unsigned int leaf_level;
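    /*
     * The walk maps and may write (A/D bits) the guest's pagetable frames,
     * so ask the p2m to populate PoD entries and unshare shared pages.
     */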
    p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;

#define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW)
#define AR_ACCUM_OR (_PAGE_NX_BIT)
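    /*
     * User and RW are only granted if every level grants them, so they
     * accumulate with AND; NX at any single level makes the whole mapping
     * non-executable, so it accumulates with OR.
     */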
    /* Start with all AND bits set, all OR bits clear. */
    uint32_t ar, ar_and = ~0u, ar_or = 0;

    bool walk_ok = false;

    /*
     * TODO - We should ASSERT() that only the following bits are set as
     * inputs to a guest walk, but a whole load of code currently passes in
     * other PFEC_ constants.
     */
    walk &= (PFEC_implicit | PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access);

    /* Only implicit supervisor data accesses exist. */
    ASSERT(!(walk & PFEC_implicit) ||
           !(walk & (PFEC_insn_fetch | PFEC_user_mode)));

    perfc_incr(guest_walk);
    memset(gw, 0, sizeof(*gw));
    gw->va = va;
    gw->pfec = walk & (PFEC_user_mode | PFEC_write_access);

    /*
     * PFEC_insn_fetch is only reported if NX or SMEP are enabled. Hardware
     * still distinguishes instruction fetches during determination of access
     * rights.
     */
    if ( guest_nx_enabled(v) || guest_smep_enabled(v) )
        gw->pfec |= (walk & PFEC_insn_fetch);

#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */

    /* Get the l4e from the top level table and check its flags. */
    gw->l4mfn = top_mfn;
    l4p = (guest_l4e_t *) top_map;
    l4gpa = gfn_to_gaddr(top_gfn) +
            guest_l4_table_offset(va) * sizeof(gw->l4e);
    if ( !hvmemul_read_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e)) )
    {
        gw->l4e = l4p[guest_l4_table_offset(va)];
        hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
    }
    gflags = guest_l4e_get_flags(gw->l4e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l4e_rsvd_bits(v, gw->l4e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l4e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    /* Map the l3 table */
    l3p = map_domain_gfn(p2m,
                         guest_l4e_get_gfn(gw->l4e),
                         &gw->l3mfn,
                         qt,
                         &rc);
    if ( l3p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    /* Get the l3e and check its flags. */
    l3gpa = gfn_to_gaddr(guest_l4e_get_gfn(gw->l4e)) +
            guest_l3_table_offset(va) * sizeof(gw->l3e);
    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
    {
        gw->l3e = l3p[guest_l3_table_offset(va)];
        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
    }
    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits, including possibly _PAGE_PSE. */
    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l3e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Generate a fake l1 table entry so callers don't all
         * have to understand superpages.
         */
        gfn_t start = guest_l3e_get_gfn(gw->l3e);
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the l3e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l3e_get_flags(gw->l3e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) +
                     ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK));
        gw->l1e = guest_l1e_from_gfn(start, flags);
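        /* A 1GB superpage mapping has no l2 or l1 table to record. */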
        gw->l2mfn = gw->l1mfn = INVALID_MFN;
        leaf_level = 3;
        goto leaf;
    }

#else /* PAE only... */

    /* Get the l3e and check its flags. */
    l3gpa = gfn_to_gaddr(top_gfn) + ((unsigned long)top_map & ~PAGE_MASK) +
            guest_l3_table_offset(va) * sizeof(gw->l3e);
    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
    {
        gw->l3e = ((guest_l3e_t *)top_map)[guest_l3_table_offset(va)];
        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
    }

    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

#endif /* PAE or 64... */

    /* Map the l2 table */
    l2p = map_domain_gfn(p2m,
                         guest_l3e_get_gfn(gw->l3e),
                         &gw->l2mfn,
                         qt,
                         &rc);
    if ( l2p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    l2gpa = gfn_to_gaddr(guest_l3e_get_gfn(gw->l3e));

#else /* 32-bit only... */

    gw->l2mfn = top_mfn;
    l2p = (guest_l2e_t *) top_map;
    l2gpa = gfn_to_gaddr(top_gfn);

#endif /* All levels... */

    /* Get the l2e */
    l2gpa += guest_l2_table_offset(va) * sizeof(gw->l2e);
    if ( !hvmemul_read_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e)) )
    {
        gw->l2e = l2p[guest_l2_table_offset(va)];
        hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
    }

    /* Check the l2e flags. */
    gflags = guest_l2e_get_flags(gw->l2e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /*
     * In 2-level paging without CR0.PSE, there are no reserved bits, and the
     * PAT/PSE bit is ignored.
     */
    if ( GUEST_PAGING_LEVELS == 2 && !guest_can_use_l2_superpages(v) )
    {
        gw->l2e.l2 &= ~_PAGE_PSE;
        gflags &= ~_PAGE_PSE;
    }
    /* else check for reserved bits, including possibly _PAGE_PSE. */
    else if ( guest_l2e_rsvd_bits(v, gw->l2e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l2e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Special case: this guest VA is in a PSE superpage, so there's
         * no guest l1e. We make one up so that the propagation code
         * can generate a shadow l1 table. Start with the gfn of the
         * first 4k-page of the superpage.
         */
#if GUEST_PAGING_LEVELS == 2
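        /*
         * unfold_pse36() reassembles the upper physical-address bits that a
         * PSE36 l2e stores in its middle bits, so the resulting frame number
         * may be wider than 32 bits.
         */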
        gfn_t start = _gfn(unfold_pse36(gw->l2e.l2) >> PAGE_SHIFT);
#else
        gfn_t start = guest_l2e_get_gfn(gw->l2e);
#endif
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the shadow l2e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l2e_get_flags(gw->l2e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) +
                     guest_l1_table_offset(va));
#if GUEST_PAGING_LEVELS == 2
        /* Wider than 32 bits if PSE36 superpage. */
        gw->el1e = (gfn_x(start) << PAGE_SHIFT) | flags;
#else
        gw->l1e = guest_l1e_from_gfn(start, flags);
#endif
        gw->l1mfn = INVALID_MFN;
        leaf_level = 2;
        goto leaf;
    }

    /* Map the l1 table */
    l1p = map_domain_gfn(p2m,
                         guest_l2e_get_gfn(gw->l2e),
                         &gw->l1mfn,
                         qt,
                         &rc);
    if ( l1p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    l1gpa = gfn_to_gaddr(guest_l2e_get_gfn(gw->l2e)) +
            guest_l1_table_offset(va) * sizeof(gw->l1e);
    if ( !hvmemul_read_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e)) )
    {
        gw->l1e = l1p[guest_l1_table_offset(va)];
        hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
    }

    gflags = guest_l1e_get_flags(gw->l1e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l1e_rsvd_bits(v, gw->l1e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l1e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    leaf_level = 1;

 leaf:
    gw->pfec |= PFEC_page_present;

    /*
     * The pagetable walk has returned a successful translation (i.e. all PTEs
     * are present and have no reserved bits set). Now check access rights to
     * see whether the access should succeed.
     */
    ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);

    /*
     * Sanity check. If EFER.NX is disabled, _PAGE_NX_BIT is reserved and
     * should have caused a translation failure before we get here.
     */
    if ( ar & _PAGE_NX_BIT )
        ASSERT(guest_nx_enabled(v));

#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    /*
     * If all access checks are thus far ok, check Protection Key for 64bit
     * data accesses.
     *
     * N.B. In the case that the walk ended with a superpage, the fabricated
     * gw->l1e contains the appropriate leaf pkey.
     */
    if ( !(walk & PFEC_insn_fetch) &&
         ((ar & _PAGE_USER) ? guest_pku_enabled(v)
                            : guest_pks_enabled(v)) )
    {
        unsigned int pkey = guest_l1e_get_pkey(gw->l1e);
        unsigned int pkr = (ar & _PAGE_USER) ? rdpkru() : rdpkrs();
        unsigned int pk_ar = (pkr >> (pkey * PKEY_WIDTH)) & (PKEY_AD | PKEY_WD);

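        /*
         * Each protection key has a two-bit field in PKRU/PKRS: AD forbids
         * all data accesses with that key, while WD forbids writes (for
         * supervisor accesses, only when CR0.WP is set).
         */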
        if ( (pk_ar & PKEY_AD) ||
             ((walk & PFEC_write_access) && (pk_ar & PKEY_WD) &&
              ((walk & PFEC_user_mode) || guest_wp_enabled(v))) )
        {
            gw->pfec |= PFEC_prot_key;
            goto out;
        }
    }
#endif

    if ( (walk & PFEC_insn_fetch) && (ar & _PAGE_NX_BIT) )
        /* Requested an instruction fetch and found NX? Fail. */
        goto out;

    if ( walk & PFEC_user_mode ) /* Requested a user access. */
    {
        if ( !(ar & _PAGE_USER) )
            /* Got a supervisor walk? Unconditional fail. */
            goto out;

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) )
            /* Requested a write and only got a read? Fail. */
            goto out;
    }
    else /* Requested a supervisor access. */
    {
        if ( ar & _PAGE_USER ) /* Got a user walk. */
        {
            if ( (walk & PFEC_insn_fetch) && guest_smep_enabled(v) )
                /* User insn fetch and smep? Fail. */
                goto out;

            if ( !(walk & PFEC_insn_fetch) && guest_smap_enabled(v) &&
                 ((walk & PFEC_implicit) ||
                  !(guest_cpu_user_regs()->eflags & X86_EFLAGS_AC)) )
                /* User data access and smap? Fail. */
                goto out;
        }

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) &&
             guest_wp_enabled(v) )
            /* Requested a write, got a read, and CR0.WP is set? Fail. */
            goto out;
    }

    walk_ok = true;

    /*
     * Go back and set accessed and dirty bits only if the walk was a
     * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
     * get set whenever a lower-level PT is used, at least some hardware
     * walkers behave this way.
     */
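    /*
     * Only the leaf entry of the walk gets _PAGE_DIRTY (and only for write
     * accesses); the entries above it are updated with _PAGE_ACCESSED alone.
     */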
    switch ( leaf_level )
    {
    default:
        ASSERT_UNREACHABLE();
        break;

    case 1:
        if ( set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
                         (walk & PFEC_write_access)) )
        {
            paging_mark_dirty(d, gw->l1mfn);
            hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
        }
        /* Fallthrough */
    case 2:
        if ( set_ad_bits(&l2p[guest_l2_table_offset(va)].l2, &gw->l2e.l2,
                         (walk & PFEC_write_access) && leaf_level == 2) )
        {
            paging_mark_dirty(d, gw->l2mfn);
            hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
        }
        /* Fallthrough */
#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
    case 3:
        if ( set_ad_bits(&l3p[guest_l3_table_offset(va)].l3, &gw->l3e.l3,
                         (walk & PFEC_write_access) && leaf_level == 3) )
        {
            paging_mark_dirty(d, gw->l3mfn);
            hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
        }

        if ( set_ad_bits(&l4p[guest_l4_table_offset(va)].l4, &gw->l4e.l4,
                         false) )
        {
            paging_mark_dirty(d, gw->l4mfn);
            hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
        }
#endif
        break;
    }

 out:
#if GUEST_PAGING_LEVELS == 4
    if ( l3p )
    {
        unmap_domain_page(l3p);
        put_page(mfn_to_page(gw->l3mfn));
    }
#endif
#if GUEST_PAGING_LEVELS >= 3
    if ( l2p )
    {
        unmap_domain_page(l2p);
        put_page(mfn_to_page(gw->l2mfn));
    }
#endif
    if ( l1p )
    {
        unmap_domain_page(l1p);
        put_page(mfn_to_page(gw->l1mfn));
    }

    return walk_ok;
}

#if GUEST_PAGING_LEVELS == CONFIG_PAGING_LEVELS
/*
 * If the map is non-NULL, we leave this function having acquired an extra ref
 * on mfn_to_page(*mfn). In all cases, *pfec contains appropriate
 * synthetic/structure PFEC_* bits.
 */
void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
                     p2m_query_t q, uint32_t *pfec)
{
    p2m_type_t p2mt;
    struct page_info *page;

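    /*
     * A gfn above the guest's maximum physical address width is reported as
     * a reserved-bit violation, matching what hardware would do with such an
     * entry.
     */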
    if ( !gfn_valid(p2m->domain, gfn) )
    {
        *pfec = PFEC_reserved_bit | PFEC_page_present;
        return NULL;
    }

    /* Translate the gfn, unsharing if shared. */
    page = paging_mode_translate(p2m->domain)
           ? p2m_get_page_from_gfn(p2m, gfn, &p2mt, NULL, q)
           : get_page_from_gfn(p2m->domain, gfn_x(gfn), &p2mt, q);
    if ( p2m_is_paging(p2mt) )
    {
        ASSERT(p2m_is_hostp2m(p2m));
        if ( page )
            put_page(page);
        p2m_mem_paging_populate(p2m->domain, gfn);
        *pfec = PFEC_page_paged;
        return NULL;
    }
    if ( p2m_is_shared(p2mt) )
    {
        if ( page )
            put_page(page);
        *pfec = PFEC_page_shared;
        return NULL;
    }
    if ( !page )
    {
        *pfec = 0;
        return NULL;
    }

    *pfec = PFEC_page_present;
    *mfn = page_to_mfn(page);
    ASSERT(mfn_valid(*mfn));

    return map_domain_page(*mfn);
}
#endif

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */