/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * arch/x86/mm/guest_walk.c
 *
 * Pagetable walker for guest memory accesses.
 *
 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
 */

#include <xen/types.h>
#include <xen/mm.h>
#include <xen/paging.h>
#include <xen/domain_page.h>
#include <xen/sched.h>

#include <asm/page.h>
#include <asm/prot-key.h>
#include <asm/guest_pt.h>
#include <asm/hvm/emulate.h>

/*
 * Modify a guest pagetable entry to set the Accessed and Dirty bits.
 * Returns true if it actually writes to guest memory.
 */
static bool set_ad_bits(guest_intpte_t *guest_p, guest_intpte_t *walk_p,
                        bool set_dirty)
{
    guest_intpte_t new, old = *walk_p;

    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
    if ( old != new )
    {
        /*
         * Write the new entry into the walk, and try to write it back
         * into the guest table as well.  If the guest table has changed
         * under our feet then leave it alone.
         */
        *walk_p = new;
        if ( cmpxchg(guest_p, old, new) == old )
            return true;
    }
    return false;
}

/*
 * Walk the guest pagetables, after the manner of a hardware walker.
 *
 * This is a condensing of the 'Paging' chapters from Intel and AMD software
 * manuals.  Please refer closely to them.
 *
 * A pagetable walk consists of two parts:
 *   1) to find whether a translation exists, and
 *   2) if a translation does exist, to check whether the translation's access
 *      rights permit the access.
 *
 * A translation is found by following the pagetable structure (starting at
 * %cr3) to a leaf entry (an L1 PTE, or a higher level entry with PSE set)
 * which identifies the physical destination of the access.
 *
 * A translation from one level to the next exists if the PTE is both present
 * and has no reserved bits set.  If the pagewalk encounters a situation where
 * a translation does not exist, the walk stops at that point.
 *
 * The access rights (NX, User, RW bits) are collected as the walk progresses.
 * If a translation exists, the accumulated access rights are compared to the
 * requested walk, to see whether the access is permitted.
 */
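/*
 * As an illustration: with 4-level paging the virtual address is decomposed
 * as bits 47:39 -> L4 index, 38:30 -> L3 index, 29:21 -> L2 index,
 * 20:12 -> L1 index, 11:0 -> page offset.  PAE and 2-level guests split the
 * address analogously, just with fewer levels and different index widths.
 */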
bool
guest_walk_tables(const struct vcpu *v, struct p2m_domain *p2m,
                  unsigned long va, walk_t *gw, uint32_t walk,
                  gfn_t top_gfn, mfn_t top_mfn, void *top_map)
{
    struct domain *d = v->domain;
    guest_l1e_t *l1p = NULL;
    guest_l2e_t *l2p = NULL;
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    guest_l3e_t *l3p = NULL;
    guest_l4e_t *l4p;
    paddr_t l4gpa;
#endif
#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
    paddr_t l3gpa;
#endif
    uint32_t gflags, rc;
    paddr_t l1gpa = 0, l2gpa = 0;
    unsigned int leaf_level;
    p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;

#define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW)
#define AR_ACCUM_OR  (_PAGE_NX_BIT)
    /* Start with all AND bits set, all OR bits clear. */
    uint32_t ar, ar_and = ~0u, ar_or = 0;
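    /*
     * Rationale: User and RW only permit an access if they are set at every
     * level of the walk, so they accumulate with AND; NX forbids execution
     * if it is set at any level, so it accumulates with OR.
     */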

    bool walk_ok = false;

    /*
     * TODO - We should ASSERT() that only the following bits are set as
     * inputs to a guest walk, but a whole load of code currently passes in
     * other PFEC_ constants.
     */
    walk &= (PFEC_implicit | PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access);

    /* Only implicit supervisor data accesses exist. */
    ASSERT(!(walk & PFEC_implicit) ||
           !(walk & (PFEC_insn_fetch | PFEC_user_mode)));
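    /*
     * (Implicit supervisor accesses are those the CPU makes on its own
     * behalf, e.g. descriptor table or TSS reads; for SMAP purposes they are
     * treated as if EFLAGS.AC were clear, which the checks below reflect.)
     */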

    perfc_incr(guest_walk);
    memset(gw, 0, sizeof(*gw));
    gw->va = va;
    gw->pfec = walk & (PFEC_user_mode | PFEC_write_access);

    /*
     * PFEC_insn_fetch is only reported if NX or SMEP are enabled.  Hardware
     * still distinguishes instruction fetches during determination of access
     * rights.
     */
    if ( guest_nx_enabled(v) || guest_smep_enabled(v) )
        gw->pfec |= (walk & PFEC_insn_fetch);

#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */

    /* Get the l4e from the top level table and check its flags. */
    gw->l4mfn = top_mfn;
    l4p = (guest_l4e_t *) top_map;
    l4gpa = gfn_to_gaddr(top_gfn) +
            guest_l4_table_offset(va) * sizeof(gw->l4e);
    if ( !hvmemul_read_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e)) )
    {
        gw->l4e = l4p[guest_l4_table_offset(va)];
        hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
    }
    gflags = guest_l4e_get_flags(gw->l4e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l4e_rsvd_bits(v, gw->l4e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l4e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    /* Map the l3 table */
    l3p = map_domain_gfn(p2m,
                         guest_l4e_get_gfn(gw->l4e),
                         &gw->l3mfn,
                         qt,
                         &rc);
    if ( l3p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    /* Get the l3e and check its flags. */
    l3gpa = gfn_to_gaddr(guest_l4e_get_gfn(gw->l4e)) +
            guest_l3_table_offset(va) * sizeof(gw->l3e);
    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
    {
        gw->l3e = l3p[guest_l3_table_offset(va)];
        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
    }
    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits, including possibly _PAGE_PSE. */
    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l3e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Generate a fake l1 table entry so callers don't all
         * have to understand superpages.
         */
        gfn_t start = guest_l3e_get_gfn(gw->l3e);
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the l3e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l3e_get_flags(gw->l3e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) +
                     ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK));
        gw->l1e = guest_l1e_from_gfn(start, flags);
        gw->l2mfn = gw->l1mfn = INVALID_MFN;
        leaf_level = 3;
        goto leaf;
    }

#else /* PAE only... */

    /* Get the l3e and check its flags. */
    l3gpa = gfn_to_gaddr(top_gfn) + ((unsigned long)top_map & ~PAGE_MASK) +
            guest_l3_table_offset(va) * sizeof(gw->l3e);
    if ( !hvmemul_read_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e)) )
    {
        gw->l3e = ((guest_l3e_t *)top_map)[guest_l3_table_offset(va)];
        hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
    }

    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

#endif /* PAE or 64... */

    /* Map the l2 table */
    l2p = map_domain_gfn(p2m,
                         guest_l3e_get_gfn(gw->l3e),
                         &gw->l2mfn,
                         qt,
                         &rc);
    if ( l2p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    l2gpa = gfn_to_gaddr(guest_l3e_get_gfn(gw->l3e));

#else /* 32-bit only... */

    gw->l2mfn = top_mfn;
    l2p = (guest_l2e_t *) top_map;
    l2gpa = gfn_to_gaddr(top_gfn);

#endif /* All levels... */

    /* Get the l2e */
    l2gpa += guest_l2_table_offset(va) * sizeof(gw->l2e);
    if ( !hvmemul_read_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e)) )
    {
        gw->l2e = l2p[guest_l2_table_offset(va)];
        hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
    }

    /* Check the l2e flags. */
    gflags = guest_l2e_get_flags(gw->l2e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /*
     * In 2-level paging without CR0.PSE, there are no reserved bits, and the
     * PAT/PSE bit is ignored.
     */
    if ( GUEST_PAGING_LEVELS == 2 && !guest_can_use_l2_superpages(v) )
    {
        gw->l2e.l2 &= ~_PAGE_PSE;
        gflags &= ~_PAGE_PSE;
    }
    /* else check for reserved bits, including possibly _PAGE_PSE. */
    else if ( guest_l2e_rsvd_bits(v, gw->l2e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l2e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Special case: this guest VA is in a PSE superpage, so there's
         * no guest l1e.  We make one up so that the propagation code
         * can generate a shadow l1 table.  Start with the gfn of the
         * first 4k-page of the superpage.
         */
#if GUEST_PAGING_LEVELS == 2
        gfn_t start = _gfn(unfold_pse36(gw->l2e.l2) >> PAGE_SHIFT);
#else
        gfn_t start = guest_l2e_get_gfn(gw->l2e);
#endif
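        /*
         * (For 2-level guests, unfold_pse36() above reassembles the full
         * frame address of a PSE36 superpage, whose high physical address
         * bits are stored in the middle of the l2e rather than above bit 31.)
         */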
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the shadow l2e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l2e_get_flags(gw->l2e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) +
                     guest_l1_table_offset(va));
#if GUEST_PAGING_LEVELS == 2
        /* Wider than 32 bits if PSE36 superpage. */
        gw->el1e = (gfn_x(start) << PAGE_SHIFT) | flags;
#else
        gw->l1e = guest_l1e_from_gfn(start, flags);
#endif
        gw->l1mfn = INVALID_MFN;
        leaf_level = 2;
        goto leaf;
    }

    /* Map the l1 table */
    l1p = map_domain_gfn(p2m,
                         guest_l2e_get_gfn(gw->l2e),
                         &gw->l1mfn,
                         qt,
                         &rc);
    if ( l1p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    l1gpa = gfn_to_gaddr(guest_l2e_get_gfn(gw->l2e)) +
            guest_l1_table_offset(va) * sizeof(gw->l1e);
    if ( !hvmemul_read_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e)) )
    {
        gw->l1e = l1p[guest_l1_table_offset(va)];
        hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
    }

    gflags = guest_l1e_get_flags(gw->l1e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l1e_rsvd_bits(v, gw->l1e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l1e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    leaf_level = 1;

 leaf:
    gw->pfec |= PFEC_page_present;

    /*
     * The pagetable walk has returned a successful translation (i.e. all PTEs
     * are present and have no reserved bits set).  Now check access rights to
     * see whether the access should succeed.
     */
    ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);

    /*
     * Sanity check.  If EFER.NX is disabled, _PAGE_NX_BIT is reserved and
     * should have caused a translation failure before we get here.
     */
    if ( ar & _PAGE_NX_BIT )
        ASSERT(guest_nx_enabled(v));

#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    /*
     * If all access checks are thus far ok, check Protection Key for 64-bit
     * data accesses.
     *
     * N.B. In the case that the walk ended with a superpage, the fabricated
     * gw->l1e contains the appropriate leaf pkey.
     */
    if ( !(walk & PFEC_insn_fetch) &&
         ((ar & _PAGE_USER) ? guest_pku_enabled(v)
                            : guest_pks_enabled(v)) )
    {
        unsigned int pkey = guest_l1e_get_pkey(gw->l1e);
        unsigned int pkr = (ar & _PAGE_USER) ? rdpkru() : rdpkrs();
        unsigned int pk_ar = (pkr >> (pkey * PKEY_WIDTH)) & (PKEY_AD | PKEY_WD);
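        /*
         * PKRU/PKRS hold two bits per key: Access-Disable in the even bit
         * and Write-Disable in the odd bit, hence the pkey * PKEY_WIDTH
         * shift above.
         */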

        if ( (pk_ar & PKEY_AD) ||
             ((walk & PFEC_write_access) && (pk_ar & PKEY_WD) &&
              ((walk & PFEC_user_mode) || guest_wp_enabled(v))) )
        {
            gw->pfec |= PFEC_prot_key;
            goto out;
        }
    }
#endif

    if ( (walk & PFEC_insn_fetch) && (ar & _PAGE_NX_BIT) )
        /* Requested an instruction fetch and found NX? Fail. */
        goto out;

    if ( walk & PFEC_user_mode ) /* Requested a user access. */
    {
        if ( !(ar & _PAGE_USER) )
            /* Got a supervisor walk?  Unconditional fail. */
            goto out;

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) )
            /* Requested a write and only got a read? Fail. */
            goto out;
    }
    else /* Requested a supervisor access. */
    {
        if ( ar & _PAGE_USER ) /* Got a user walk. */
        {
            if ( (walk & PFEC_insn_fetch) && guest_smep_enabled(v) )
                /* User insn fetch and smep? Fail. */
                goto out;

            if ( !(walk & PFEC_insn_fetch) && guest_smap_enabled(v) &&
                 ((walk & PFEC_implicit) ||
                  !(guest_cpu_user_regs()->eflags & X86_EFLAGS_AC)) )
                /* User data access and smap? Fail. */
                goto out;
        }

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) &&
             guest_wp_enabled(v) )
            /* Requested a write, got a read, and CR0.WP is set? Fail. */
            goto out;
    }

    walk_ok = true;

    /*
     * Go back and set accessed and dirty bits only if the walk was a
     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
     * get set whenever a lower-level PT is used, at least some hardware
     * walkers behave this way.
     */
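    /* Only the leaf entry gets Dirty; higher levels gain Accessed only. */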
    switch ( leaf_level )
    {
    default:
        ASSERT_UNREACHABLE();
        break;

    case 1:
        if ( set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
                         (walk & PFEC_write_access)) )
        {
            paging_mark_dirty(d, gw->l1mfn);
            hvmemul_write_cache(v, l1gpa, &gw->l1e, sizeof(gw->l1e));
        }
        /* Fallthrough */
    case 2:
        if ( set_ad_bits(&l2p[guest_l2_table_offset(va)].l2, &gw->l2e.l2,
                         (walk & PFEC_write_access) && leaf_level == 2) )
        {
            paging_mark_dirty(d, gw->l2mfn);
            hvmemul_write_cache(v, l2gpa, &gw->l2e, sizeof(gw->l2e));
        }
        /* Fallthrough */
#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
    case 3:
        if ( set_ad_bits(&l3p[guest_l3_table_offset(va)].l3, &gw->l3e.l3,
                         (walk & PFEC_write_access) && leaf_level == 3) )
        {
            paging_mark_dirty(d, gw->l3mfn);
            hvmemul_write_cache(v, l3gpa, &gw->l3e, sizeof(gw->l3e));
        }

        if ( set_ad_bits(&l4p[guest_l4_table_offset(va)].l4, &gw->l4e.l4,
                         false) )
        {
            paging_mark_dirty(d, gw->l4mfn);
            hvmemul_write_cache(v, l4gpa, &gw->l4e, sizeof(gw->l4e));
        }
#endif
        break;
    }

 out:
#if GUEST_PAGING_LEVELS == 4
    if ( l3p )
    {
        unmap_domain_page(l3p);
        put_page(mfn_to_page(gw->l3mfn));
    }
#endif
#if GUEST_PAGING_LEVELS >= 3
    if ( l2p )
    {
        unmap_domain_page(l2p);
        put_page(mfn_to_page(gw->l2mfn));
    }
#endif
    if ( l1p )
    {
        unmap_domain_page(l1p);
        put_page(mfn_to_page(gw->l1mfn));
    }

    return walk_ok;
}

#if GUEST_PAGING_LEVELS == CONFIG_PAGING_LEVELS
/*
 * If the map is non-NULL, we leave this function having acquired an extra ref
 * on mfn_to_page(*mfn).  In all cases, *pfec contains appropriate
 * synthetic/structure PFEC_* bits.
 */
void *map_domain_gfn(struct p2m_domain *p2m, gfn_t gfn, mfn_t *mfn,
                     p2m_query_t q, uint32_t *pfec)
{
    p2m_type_t p2mt;
    struct page_info *page;

    if ( !gfn_valid(p2m->domain, gfn) )
    {
        *pfec = PFEC_reserved_bit | PFEC_page_present;
        return NULL;
    }

    /* Translate the gfn, unsharing if shared. */
    page = paging_mode_translate(p2m->domain)
           ? p2m_get_page_from_gfn(p2m, gfn, &p2mt, NULL, q)
           : get_page_from_gfn(p2m->domain, gfn_x(gfn), &p2mt, q);
    if ( p2m_is_paging(p2mt) )
    {
        ASSERT(p2m_is_hostp2m(p2m));
        if ( page )
            put_page(page);
        p2m_mem_paging_populate(p2m->domain, gfn);
        *pfec = PFEC_page_paged;
        return NULL;
    }
    if ( p2m_is_shared(p2mt) )
    {
        if ( page )
            put_page(page);
        *pfec = PFEC_page_shared;
        return NULL;
    }
    if ( !page )
    {
        *pfec = 0;
        return NULL;
    }

    *pfec = PFEC_page_present;
    *mfn = page_to_mfn(page);
    ASSERT(mfn_valid(*mfn));

    return map_domain_page(*mfn);
}
#endif

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */