/******************************************************************************
 * arch/x86/mm/guest_walk.c
 *
 * Pagetable walker for guest memory accesses.
 *
 * Parts of this code are Copyright (c) 2006 by XenSource Inc.
 * Parts of this code are Copyright (c) 2006 by Michael A Fetterman
 * Parts based on earlier work by Michael A Fetterman, Ian Pratt et al.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

/* Allow uniquely identifying static symbols in the 3 generated objects. */
asm(".file \"" __OBJECT_FILE__ "\"");

#include <xen/types.h>
#include <xen/mm.h>
#include <xen/paging.h>
#include <xen/domain_page.h>
#include <xen/sched.h>
#include <asm/page.h>
#include <asm/guest_pt.h>

/*
 * Modify a guest pagetable entry to set the Accessed and Dirty bits.
 * Returns true if it actually writes to guest memory.
 */
static bool set_ad_bits(guest_intpte_t *guest_p, guest_intpte_t *walk_p,
                        bool set_dirty)
{
    guest_intpte_t new, old = *walk_p;

    new = old | _PAGE_ACCESSED | (set_dirty ? _PAGE_DIRTY : 0);
    if ( old != new )
    {
        /*
         * Write the new entry into the walk, and try to write it back
         * into the guest table as well. If the guest table has changed
         * under our feet then leave it alone.
         */
        *walk_p = new;
        if ( cmpxchg(guest_p, old, new) == old )
            return true;
    }
    return false;
}

/*
 * Walk the guest pagetables, after the manner of a hardware walker.
 *
 * This is a condensing of the 'Paging' chapters from Intel and AMD software
 * manuals. Please refer closely to them.
 *
 * A pagetable walk consists of two parts:
 *   1) to find whether a translation exists, and
 *   2) if a translation does exist, to check whether the translation's access
 *      rights permit the access.
 *
 * A translation is found by following the pagetable structure (starting at
 * %cr3) to a leaf entry (an L1 PTE, or a higher level entry with PSE set)
 * which identifies the physical destination of the access.
 *
 * A translation from one level to the next exists if the PTE is both present
 * and has no reserved bits set. If the pagewalk encounters a situation where
 * a translation does not exist, the walk stops at that point.
 *
 * The access rights (NX, User, RW bits) are collected as the walk progresses.
 * If a translation exists, the accumulated access rights are compared to the
 * requested walk, to see whether the access is permitted.
 */
bool
guest_walk_tables(struct vcpu *v, struct p2m_domain *p2m,
                  unsigned long va, walk_t *gw,
                  uint32_t walk, mfn_t top_mfn, void *top_map)
{
    struct domain *d = v->domain;
    p2m_type_t p2mt;
    guest_l1e_t *l1p = NULL;
    guest_l2e_t *l2p = NULL;
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    guest_l3e_t *l3p = NULL;
    guest_l4e_t *l4p;
#endif
    uint32_t gflags, rc;
    unsigned int leaf_level;
    p2m_query_t qt = P2M_ALLOC | P2M_UNSHARE;

#define AR_ACCUM_AND (_PAGE_USER | _PAGE_RW)
#define AR_ACCUM_OR  (_PAGE_NX_BIT)
    /* Start with all AND bits set, all OR bits clear. */
    uint32_t ar, ar_and = ~0u, ar_or = 0;
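
    /*
     * Illustrative note: the AND accumulator only keeps a bit that is set at
     * every level examined (e.g. with 4-level paging, the walk only has user
     * rights if _PAGE_USER is set in the L4e, L3e, L2e and L1e alike), while
     * the OR accumulator lets a bit set at any single level take effect (NX
     * anywhere along the walk makes the whole mapping non-executable).
     */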

    bool walk_ok = false;

    /*
     * TODO - We should ASSERT() that only the following bits are set as
     * inputs to a guest walk, but a whole load of code currently passes in
     * other PFEC_ constants.
     */
    walk &= (PFEC_implicit | PFEC_insn_fetch | PFEC_user_mode | PFEC_write_access);

    /* Only implicit supervisor data accesses exist. */
    ASSERT(!(walk & PFEC_implicit) ||
           !(walk & (PFEC_insn_fetch | PFEC_user_mode)));

    perfc_incr(guest_walk);
    memset(gw, 0, sizeof(*gw));
    gw->va = va;
    gw->pfec = walk & (PFEC_user_mode | PFEC_write_access);

    /*
     * PFEC_insn_fetch is only reported if NX or SMEP are enabled. Hardware
     * still distinguishes instruction fetches during determination of access
     * rights.
     */
    if ( guest_nx_enabled(v) || guest_smep_enabled(v) )
        gw->pfec |= (walk & PFEC_insn_fetch);

#if GUEST_PAGING_LEVELS >= 3 /* PAE or 64... */
#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */

    /* Get the l4e from the top level table and check its flags. */
    gw->l4mfn = top_mfn;
    l4p = (guest_l4e_t *) top_map;
    gw->l4e = l4p[guest_l4_table_offset(va)];
    gflags = guest_l4e_get_flags(gw->l4e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l4e_rsvd_bits(v, gw->l4e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l4e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    /* Map the l3 table */
    l3p = map_domain_gfn(p2m,
                         guest_l4e_get_gfn(gw->l4e),
                         &gw->l3mfn,
                         &p2mt,
                         qt,
                         &rc);
    if ( l3p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    /* Get the l3e and check its flags. */
    gw->l3e = l3p[guest_l3_table_offset(va)];
    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits, including possibly _PAGE_PSE. */
    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l3e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Generate a fake l1 table entry so callers don't all
         * have to understand superpages.
         */
        gfn_t start = guest_l3e_get_gfn(gw->l3e);
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the l3e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l3e_get_flags(gw->l3e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L3_GFN_MASK) +
                     ((va >> PAGE_SHIFT) & GUEST_L3_GFN_MASK));
        gw->l1e = guest_l1e_from_gfn(start, flags);
        gw->l2mfn = gw->l1mfn = INVALID_MFN;
        leaf_level = 3;
        goto leaf;
    }

#else /* PAE only... */

    /* Get the l3e and check its flags. */
    gw->l3e = ((guest_l3e_t *) top_map)[guest_l3_table_offset(va)];
    gflags = guest_l3e_get_flags(gw->l3e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    if ( guest_l3e_rsvd_bits(v, gw->l3e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

#endif /* PAE or 64... */

    /* Map the l2 table */
    l2p = map_domain_gfn(p2m,
                         guest_l3e_get_gfn(gw->l3e),
                         &gw->l2mfn,
                         &p2mt,
                         qt,
                         &rc);
    if ( l2p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }

    /* Get the l2e */
    gw->l2e = l2p[guest_l2_table_offset(va)];

#else /* 32-bit only... */

    /* Get l2e from the top level table */
    gw->l2mfn = top_mfn;
    l2p = (guest_l2e_t *) top_map;
    gw->l2e = l2p[guest_l2_table_offset(va)];

#endif /* All levels... */

    /* Check the l2e flags. */
    gflags = guest_l2e_get_flags(gw->l2e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /*
     * In 2-level paging without CR0.PSE, there are no reserved bits, and the
     * PAT/PSE bit is ignored.
     */
    if ( GUEST_PAGING_LEVELS == 2 && !guest_can_use_l2_superpages(v) )
    {
        gw->l2e.l2 &= ~_PAGE_PSE;
        gflags &= ~_PAGE_PSE;
    }
    /* else check for reserved bits, including possibly _PAGE_PSE. */
    else if ( guest_l2e_rsvd_bits(v, gw->l2e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l2e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    if ( gflags & _PAGE_PSE )
    {
        /*
         * Special case: this guest VA is in a PSE superpage, so there's
         * no guest l1e. We make one up so that the propagation code
         * can generate a shadow l1 table. Start with the gfn of the
         * first 4k-page of the superpage.
         */
#if GUEST_PAGING_LEVELS == 2
        gfn_t start = _gfn(unfold_pse36(gw->l2e.l2) >> PAGE_SHIFT);
#else
        gfn_t start = guest_l2e_get_gfn(gw->l2e);
#endif
        /*
         * Grant full access in the l1e, since all the guest entry's
         * access controls are enforced in the shadow l2e.
         */
        int flags = (_PAGE_PRESENT|_PAGE_USER|_PAGE_RW|
                     _PAGE_ACCESSED|_PAGE_DIRTY);
        /*
         * Import protection key and cache-control bits. Note that _PAGE_PAT
         * is actually _PAGE_PSE, and it is always set. We will clear it in
         * case _PAGE_PSE_PAT (bit 12, i.e. first bit of gfn) is clear.
         */
        flags |= (guest_l2e_get_flags(gw->l2e)
                  & (_PAGE_PKEY_BITS|_PAGE_PAT|_PAGE_PWT|_PAGE_PCD));
        if ( !(gfn_x(start) & 1) )
            /* _PAGE_PSE_PAT not set: remove _PAGE_PAT from flags. */
            flags &= ~_PAGE_PAT;

        /* Increment the pfn by the right number of 4k pages. */
        start = _gfn((gfn_x(start) & ~GUEST_L2_GFN_MASK) +
                     guest_l1_table_offset(va));
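        /*
         * For example, with 4-level or PAE paging a 2MiB superpage spans
         * 512 4k frames: masking off GUEST_L2_GFN_MASK keeps the gfn of
         * the superpage's first frame, and guest_l1_table_offset(va)
         * (bits 12-20 of va) selects the frame actually being accessed.
         */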
#if GUEST_PAGING_LEVELS == 2
        /* Wider than 32 bits if PSE36 superpage. */
        gw->el1e = (gfn_x(start) << PAGE_SHIFT) | flags;
#else
        gw->l1e = guest_l1e_from_gfn(start, flags);
#endif
        gw->l1mfn = INVALID_MFN;
        leaf_level = 2;
        goto leaf;
    }

    /* Map the l1 table */
    l1p = map_domain_gfn(p2m,
                         guest_l2e_get_gfn(gw->l2e),
                         &gw->l1mfn,
                         &p2mt,
                         qt,
                         &rc);
    if ( l1p == NULL )
    {
        gw->pfec |= rc & PFEC_synth_mask;
        goto out;
    }
    gw->l1e = l1p[guest_l1_table_offset(va)];
    gflags = guest_l1e_get_flags(gw->l1e);
    if ( !(gflags & _PAGE_PRESENT) )
        goto out;

    /* Check for reserved bits. */
    if ( guest_l1e_rsvd_bits(v, gw->l1e) )
    {
        gw->pfec |= PFEC_reserved_bit | PFEC_page_present;
        goto out;
    }

    /* Accumulate l1e access rights. */
    ar_and &= gflags;
    ar_or |= gflags;

    leaf_level = 1;

 leaf:
    gw->pfec |= PFEC_page_present;

    /*
     * The pagetable walk has returned a successful translation (i.e. all PTEs
     * are present and have no reserved bits set). Now check access rights to
     * see whether the access should succeed.
     */
    ar = (ar_and & AR_ACCUM_AND) | (ar_or & AR_ACCUM_OR);

    /*
     * Sanity check. If EFER.NX is disabled, _PAGE_NX_BIT is reserved and
     * should have caused a translation failure before we get here.
     */
    if ( ar & _PAGE_NX_BIT )
        ASSERT(guest_nx_enabled(v));

#if GUEST_PAGING_LEVELS >= 4 /* 64-bit only... */
    /*
     * If all access checks are thus far ok, check Protection Key for 64bit
     * data accesses to user mappings.
     *
     * N.B. In the case that the walk ended with a superpage, the fabricated
     * gw->l1e contains the appropriate leaf pkey.
     */
    if ( (ar & _PAGE_USER) && !(walk & PFEC_insn_fetch) &&
         guest_pku_enabled(v) )
    {
        unsigned int pkey = guest_l1e_get_pkey(gw->l1e);
        unsigned int pkru = read_pkru();
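
        /*
         * A protection-key fault is raised if the key's Access-Disable bit
         * is set, or if this is a write, the key's Write-Disable bit is set,
         * and write protection applies (it always does for user-mode
         * accesses; for supervisor accesses only when CR0.WP is set).
         */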
        if ( read_pkru_ad(pkru, pkey) ||
             ((walk & PFEC_write_access) && read_pkru_wd(pkru, pkey) &&
              ((walk & PFEC_user_mode) || guest_wp_enabled(v))) )
        {
            gw->pfec |= PFEC_prot_key;
            goto out;
        }
    }
#endif

    if ( (walk & PFEC_insn_fetch) && (ar & _PAGE_NX_BIT) )
        /* Requested an instruction fetch and found NX? Fail. */
        goto out;

    if ( walk & PFEC_user_mode ) /* Requested a user access. */
    {
        if ( !(ar & _PAGE_USER) )
            /* Got a supervisor walk? Unconditional fail. */
            goto out;

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) )
            /* Requested a write and only got a read? Fail. */
            goto out;
    }
    else /* Requested a supervisor access. */
    {
        if ( ar & _PAGE_USER ) /* Got a user walk. */
        {
            if ( (walk & PFEC_insn_fetch) && guest_smep_enabled(v) )
                /* User insn fetch and smep? Fail. */
                goto out;

            if ( !(walk & PFEC_insn_fetch) && guest_smap_enabled(v) &&
                 ((walk & PFEC_implicit) ||
                  !(guest_cpu_user_regs()->eflags & X86_EFLAGS_AC)) )
                /* User data access and smap? Fail. */
                goto out;
        }

        if ( (walk & PFEC_write_access) && !(ar & _PAGE_RW) &&
             guest_wp_enabled(v) )
            /* Requested a write, got a read, and CR0.WP is set? Fail. */
            goto out;
    }

    walk_ok = true;

    /*
     * Go back and set accessed and dirty bits only if the walk was a
     * success. Although the PRMs say higher-level _PAGE_ACCESSED bits
     * get set whenever a lower-level PT is used, at least some hardware
     * walkers behave this way.
     */
    switch ( leaf_level )
    {
    default:
        ASSERT_UNREACHABLE();
        break;

    case 1:
        if ( set_ad_bits(&l1p[guest_l1_table_offset(va)].l1, &gw->l1e.l1,
                         (walk & PFEC_write_access)) )
            paging_mark_dirty(d, gw->l1mfn);
        /* Fallthrough */
    case 2:
        if ( set_ad_bits(&l2p[guest_l2_table_offset(va)].l2, &gw->l2e.l2,
                         (walk & PFEC_write_access) && leaf_level == 2) )
            paging_mark_dirty(d, gw->l2mfn);
        /* Fallthrough */
#if GUEST_PAGING_LEVELS == 4 /* 64-bit only... */
    case 3:
        if ( set_ad_bits(&l3p[guest_l3_table_offset(va)].l3, &gw->l3e.l3,
                         (walk & PFEC_write_access) && leaf_level == 3) )
            paging_mark_dirty(d, gw->l3mfn);

        if ( set_ad_bits(&l4p[guest_l4_table_offset(va)].l4, &gw->l4e.l4,
                         false) )
            paging_mark_dirty(d, gw->l4mfn);
#endif
    }

 out:
#if GUEST_PAGING_LEVELS == 4
    if ( l3p )
    {
        unmap_domain_page(l3p);
        put_page(mfn_to_page(mfn_x(gw->l3mfn)));
    }
#endif
#if GUEST_PAGING_LEVELS >= 3
    if ( l2p )
    {
        unmap_domain_page(l2p);
        put_page(mfn_to_page(mfn_x(gw->l2mfn)));
    }
#endif
    if ( l1p )
    {
        unmap_domain_page(l1p);
        put_page(mfn_to_page(mfn_x(gw->l1mfn)));
    }

    return walk_ok;
}
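
/*
 * Illustrative (hypothetical) caller sketch, not part of this file: a
 * typical user maps the guest's top-level table, performs the walk, and
 * then extracts the translated gfn from the filled-in walk_t, e.g. via
 * the guest_walk_to_gfn() helper from asm/guest_pt.h. Roughly:
 *
 *     walk_t gw;
 *     bool ok = guest_walk_tables(v, p2m, va, &gw, pfec, top_mfn, top_map);
 *     gfn_t gfn = ok ? guest_walk_to_gfn(&gw) : INVALID_GFN;
 *
 * On failure, gw.pfec holds the fault code bits to report to the guest.
 */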

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */