/******************************************************************************
 * arch/x86/mm/guest_walk.c
 *
 * Pagetable walker for guest memory accesses.
 *
 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

/* Allow uniquely identifying static symbols in the 3 generated objects. */
asm(".file \"" __OBJECT_FILE__ "\"");

#include <xen/types.h>
#include <xen/mm.h>
#include <xen/paging.h>
#include <xen/domain_page.h>
#include <xen/sched.h>
#include <asm/page.h>
#include <asm/guest_pt.h>

/*
 * Modify a guest pagetable entry to set the Accessed and Dirty bits.
 * Returns true if it actually writes to guest memory.
 */
static bool set_ad_bits(guest_intpte_t *guest_p, guest_intpte_t *walk_p,
                        bool set_dirty)
{
    guest_intpte_t new, old = *walk_p;

    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
    if ( old != new )
    {
        /*
         * Write the new entry into the walk, and try to write it back
         * into the guest table as well.  If the guest table has changed
         * under our feet then leave it alone.
         */
        *walk_p = new;
        if ( cmpxchg(guest_p, old, new) == old )
            return true;
    }
    return false;
}

/*
 * Walk the guest pagetables, after the manner of a hardware walker.
 *
 * This is a condensing of the 'Paging' chapters from Intel and AMD software
 * manuals.  Please refer closely to them.
 *
 * A pagetable walk consists of two parts:
 *   1) to find whether a translation exists, and
 *   2) if a translation does exist, to check whether the translation's access
 *      rights permit the access.
 *
 * A translation is found by following the pagetable structure (starting at
 * %cr3) to a leaf entry (an L1 PTE, or a higher level entry with PSE set)
 * which identifies the physical destination of the access.
 *
 * A translation from one level to the next exists if the PTE is both present
 * and has no reserved bits set.  If the pagewalk encounters a situation where
 * a translation does not exist, the walk stops at that point.
 *
 * The access rights (NX, User, RW bits) are collected as the walk progresses.
 * If a translation exists, the accumulated access rights are compared to the
 * requested walk, to see whether the access is permitted.
 */
bool
guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                  unsigned long va, walk_t *gw,
                  uint32_t walk, mfn_t top_mfn, void *top_map)
{
    struct domain *d = v->domain;
    p2m_type_t p2mt;
    guest_l1e_t *l1p = NULL;
    guest_l2e_t *l2p = NULL;
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    guest_l3e_t *l3p = NULL;
    guest_l4e_t *l4p;
#endif
    uint32_t gflags, rc;
    unsigned int leaf_level;
    p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;
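    /*
     * Note: the p2m lookups below may need to populate or unshare the frames
     * backing the guest pagetables, since a successful walk may write the
     * Accessed/Dirty bits back into them.
     */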

#define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW)
#define AR_ACCUM_OR  (_PAGE_NX_BIT)
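    /*
     * USER and RW are permissive and only apply if granted at every level of
     * the walk, so they accumulate with AND; NX is restrictive and applies
     * if set at any level, so it accumulates with OR.
     */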
    /* Start with all AND bits set, all OR bits clear. */
    uint32_t ar, ar_and = ~0u, ar_or = 0;

    bool walk_ok = false;

    /*
     * TODO - We should ASSERT() that only the following bits are set as
     * inputs to a guest walk, but a whole load of code currently passes in
     * other PFEC_ constants.
     */
    walk &= (PFEC_implicit | PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access);

    /* Only implicit supervisor data accesses exist. */
    ASSERT(!(walk & PFEC_implicit) ||
           !(walk & (PFEC_insn_fetch | PFEC_user_mode)));

    perfc_incr(guest_walk);
    memset(gw, 0, sizeof(*gw));
    gw->va = va;
    gw->pfec = walk & (PFEC_user_mode | PFEC_write_access);

    /*
     * PFEC_insn_fetch is only reported if NX or SMEP are enabled.  Hardware
     * still distinguishes instruction fetches during determination of access
     * rights.
     */
    if ( guest_nx_enabled(v) || guest_smep_enabled(v) )
        gw->pfec |= (walk & PFEC_insn_fetch);

#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */

    /* Get the l4e from the top level table and check its flags. */
    gw->l4mfn = top_mfn;
    l4p = (guest_l4e_t *) top_map;
    gw->l4e = l4p[guest_l4_table_offset(va)];
    gflags = guest_l4e_get_flags(gw->l4e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l4e_rsvd_bits(v, gw->l4e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l4e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    /* Map the l3 table */
    l3p = map_domain_gfn(p2m,
                         guest_l4e_get_gfn(gw->l4e),
                         &gw->l3mfn,
                         &p2mt,
                         qt,
                         &rc);
    if ( l3p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    /* Get the l3e and check its flags. */
    gw->l3e = l3p[guest_l3_table_offset(va)];
    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits, including possibly _PAGE_PSE. */
    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l3e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Generate a fake l1 table entry so callers don't all
         * have to understand superpages.
         */
        gfn_t start = guest_l3e_get_gfn(gw->l3e);
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the l3e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l3e_get_flags(gw->l3e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) +
                     ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK));
        gw->l1e = guest_l1e_from_gfn(start, flags);
        gw->l2mfn = gw->l1mfn = INVALID_MFN;
        leaf_level = 3;
        goto leaf;
    }

#else /* PAE only... */

    /* Get the l3e and check its flags. */
    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

#endif /* PAE or 64... */

    /* Map the l2 table */
    l2p = map_domain_gfn(p2m,
                         guest_l3e_get_gfn(gw->l3e),
                         &gw->l2mfn,
                         &p2mt,
                         qt,
                         &rc);
    if ( l2p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    /* Get the l2e */
    gw->l2e = l2p[guest_l2_table_offset(va)];

#else /* 32-bit only... */

    /* Get l2e from the top level table */
    gw->l2mfn = top_mfn;
    l2p = (guest_l2e_t *) top_map;
    gw->l2e = l2p[guest_l2_table_offset(va)];

#endif /* All levels... */

    /* Check the l2e flags. */
    gflags = guest_l2e_get_flags(gw->l2e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /*
     * In 2-level paging without CR0.PSE, there are no reserved bits, and the
     * PAT/PSE bit is ignored.
     */
    if ( GUEST_PAGING_LEVELS == 2 && !guest_can_use_l2_superpages(v) )
    {
        gw->l2e.l2 &= ~_PAGE_PSE;
        gflags &= ~_PAGE_PSE;
    }
    /* else check for reserved bits, including possibly _PAGE_PSE. */
    else if ( guest_l2e_rsvd_bits(v, gw->l2e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l2e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Special case: this guest VA is in a PSE superpage, so there's
         * no guest l1e.  We make one up so that the propagation code
         * can generate a shadow l1 table.  Start with the gfn of the
         * first 4k-page of the superpage.
         */
#if GUEST_PAGING_LEVELS == 2
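        /*
         * With PSE36, physical address bits above 31 are stored in the high
         * bits of the PDE (bit 13 upwards); unfold_pse36() is expected to
         * fold them back into a full frame address.
         */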
        gfn_t start = _gfn(unfold_pse36(gw->l2e.l2) >> PAGE_SHIFT);
#else
        gfn_t start = guest_l2e_get_gfn(gw->l2e);
#endif
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the shadow l2e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l2e_get_flags(gw->l2e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) +
                     guest_l1_table_offset(va));
#if GUEST_PAGING_LEVELS == 2
        /* Wider than 32 bits if PSE36 superpage. */
        gw->el1e = (gfn_x(start) << PAGE_SHIFT) | flags;
#else
        gw->l1e = guest_l1e_from_gfn(start, flags);
#endif
        gw->l1mfn = INVALID_MFN;
        leaf_level = 2;
        goto leaf;
    }

    /* Map the l1 table */
    l1p = map_domain_gfn(p2m,
                         guest_l2e_get_gfn(gw->l2e),
                         &gw->l1mfn,
                         &p2mt,
                         qt,
                         &rc);
    if ( l1p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }
    gw->l1e = l1p[guest_l1_table_offset(va)];
    gflags = guest_l1e_get_flags(gw->l1e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l1e_rsvd_bits(v, gw->l1e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l1e access rights. */
    ar_and &= gflags;
    ar_or  |= gflags;

    leaf_level = 1;

 leaf:
    gw->pfec |= PFEC_page_present;

    /*
     * The pagetable walk has returned a successful translation (i.e. all PTEs
     * are present and have no reserved bits set).  Now check access rights to
     * see whether the access should succeed.
     */
    ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);

    /*
     * Sanity check.  If EFER.NX is disabled, _PAGE_NX_BIT is reserved and
     * should have caused a translation failure before we get here.
     */
    if ( ar & _PAGE_NX_BIT )
        ASSERT(guest_nx_enabled(v));

#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    /*
     * If all access checks are thus far ok, check Protection Key for 64bit
     * data accesses to user mappings.
     *
     * N.B. In the case that the walk ended with a superpage, the fabricated
     * gw->l1e contains the appropriate leaf pkey.
     */
    if ( (ar & _PAGE_USER) && !(walk & PFEC_insn_fetch) &&
         guest_pku_enabled(v) )
    {
        unsigned int pkey = guest_l1e_get_pkey(gw->l1e);
        unsigned int pkru = read_pkru();

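        /*
         * The key blocks the access if its Access-Disable bit is set, or if
         * its Write-Disable bit is set for a write that is subject to write
         * protection (user accesses always are; supervisor writes only when
         * CR0.WP is set).
         */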
        if ( read_pkru_ad(pkru, pkey) ||
             ((walk & PFEC_write_access) && read_pkru_wd(pkru, pkey) &&
              ((walk & PFEC_user_mode) || guest_wp_enabled(v))) )
        {
            gw->pfec |= PFEC_prot_key;
            goto out;
        }
    }
#endif

    if ( (walk & PFEC_insn_fetch) && (ar & _PAGE_NX_BIT) )
        /* Requested an instruction fetch and found NX? Fail. */
        goto out;

    if ( walk & PFEC_user_mode ) /* Requested a user access. */
    {
        if ( !(ar & _PAGE_USER) )
            /* Got a supervisor walk?  Unconditional fail. */
            goto out;

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) )
            /* Requested a write and only got a read? Fail. */
            goto out;
    }
    else /* Requested a supervisor access. */
    {
        if ( ar & _PAGE_USER ) /* Got a user walk. */
        {
            if ( (walk & PFEC_insn_fetch) && guest_smep_enabled(v) )
                /* User insn fetch and smep? Fail. */
                goto out;

            if ( !(walk & PFEC_insn_fetch) && guest_smap_enabled(v) &&
                 ((walk & PFEC_implicit) ||
                  !(guest_cpu_user_regs()->eflags & X86_EFLAGS_AC)) )
                /* User data access and smap? Fail. */
                goto out;
        }

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) &&
             guest_wp_enabled(v) )
            /* Requested a write, got a read, and CR0.WP is set? Fail. */
            goto out;
    }

    walk_ok = true;

    /*
     * Go back and set accessed and dirty bits only if the walk was a
     * success.  Although the PRMs say higher-level _PAGE_ACCESSED bits
     * get set whenever a lower-level PT is used, at least some hardware
     * walkers behave this way.
     */
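    /*
     * The switch below starts at the leaf level and falls through to each
     * higher level, setting Accessed at every level of the walk but Dirty
     * only at the leaf.
     */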
    switch ( leaf_level )
    {
    default:
        ASSERT_UNREACHABLE();
        break;

    case 1:
        if ( set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
                         (walk & PFEC_write_access)) )
            paging_mark_dirty(d, gw->l1mfn);
        /* Fallthrough */
    case 2:
        if ( set_ad_bits(&l2p[guest_l2_table_offset(va)].l2, &gw->l2e.l2,
                         (walk & PFEC_write_access) && leaf_level == 2) )
            paging_mark_dirty(d, gw->l2mfn);
        /* Fallthrough */
#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
    case 3:
        if ( set_ad_bits(&l3p[guest_l3_table_offset(va)].l3, &gw->l3e.l3,
                         (walk & PFEC_write_access) && leaf_level == 3) )
            paging_mark_dirty(d, gw->l3mfn);

        if ( set_ad_bits(&l4p[guest_l4_table_offset(va)].l4, &gw->l4e.l4,
                         false) )
            paging_mark_dirty(d, gw->l4mfn);
#endif
    }

 out:
#if GUEST_PAGING_LEVELS == 4
    if ( l3p )
    {
        unmap_domain_page(l3p);
        put_page(mfn_to_page(mfn_x(gw->l3mfn)));
    }
#endif
#if GUEST_PAGING_LEVELS >= 3
    if ( l2p )
    {
        unmap_domain_page(l2p);
        put_page(mfn_to_page(mfn_x(gw->l2mfn)));
    }
#endif
    if ( l1p )
    {
        unmap_domain_page(l1p);
        put_page(mfn_to_page(mfn_x(gw->l1mfn)));
    }

    return walk_ok;
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */