1 /******************************************************************************
2 * arch/x86/mm.c
3 *
4 * Copyright (c) 2002-2005 K A Fraser
5 * Copyright (c) 2004 Christian Limpach
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; If not, see <http://www.gnu.org/licenses/>.
19 */
20
21 /*
22 * A description of the x86 page table API:
23 *
24 * Domains trap to do_mmu_update with a list of update requests.
25 * This is a list of (ptr, val) pairs, where the requested operation
26 * is *ptr = val.
27 *
28 * Reference counting of pages:
29 * ----------------------------
30 * Each page has two refcounts: tot_count and type_count.
31 *
32 * TOT_COUNT is the obvious reference count. It counts all uses of a
33 * physical page frame by a domain, including uses as a page directory,
34 * a page table, or simple mappings via a PTE. This count prevents a
35 * domain from releasing a frame back to the free pool when it still holds
36 * a reference to it.
37 *
38 * TYPE_COUNT is more subtle. A frame can be put to one of three
39 * mutually-exclusive uses: it might be used as a page directory, or a
40  * page table, or it may be mapped writable by the domain [of course, a
41  * frame may also be used in none of these three ways].
42 * So, type_count is a count of the number of times a frame is being
43 * referred to in its current incarnation. Therefore, a page can only
44 * change its type when its type count is zero.
45 *
46 * Pinning the page type:
47 * ----------------------
48 * The type of a page can be pinned/unpinned with the commands
49 * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
50 * pinning is not reference counted, so it can't be nested).
51 * This is useful to prevent a page's type count falling to zero, at which
52 * point safety checks would need to be carried out next time the count
53 * is increased again.
54 *
55 * A further note on writable page mappings:
56 * -----------------------------------------
57 * For simplicity, the count of writable mappings for a page may not
58 * correspond to reality. The 'writable count' is incremented for every
59 * PTE which maps the page with the _PAGE_RW flag set. However, for
60 * write access to be possible the page directory entry must also have
61 * its _PAGE_RW bit set. We do not check this as it complicates the
62 * reference counting considerably [consider the case of multiple
63 * directory entries referencing a single page table, some with the RW
64 * bit set, others not -- it starts getting a bit messy].
65 * In normal use, this simplification shouldn't be a problem.
66 * However, the logic can be added if required.
67 *
68 * One more note on read-only page mappings:
69 * -----------------------------------------
70 * We want domains to be able to map pages for read-only access. The
71 * main reason is that page tables and directories should be readable
72 * by a domain, but it would not be safe for them to be writable.
73 * However, domains have free access to rings 1 & 2 of the Intel
74 * privilege model. In terms of page protection, these are considered
75 * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
76 * read-only restrictions are respected in supervisor mode -- if the
77 * bit is clear then any mapped page is writable.
78 *
79 * We get round this by always setting the WP bit and disallowing
80 * updates to it. This is very unlikely to cause a problem for guest
81 * OS's, which will generally use the WP bit to simplify copy-on-write
82 * implementation (in that case, OS wants a fault when it writes to
83 * an application-supplied buffer).
84 */
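/*
 * Illustrative sketch only (not part of this file): how a PV guest kernel
 * might drive the interface described above.  The hypercall wrappers are
 * guest-side helpers, and the variable names are assumptions.
 *
 *     mmu_update_t req;
 *     req.ptr = pte_machine_addr | MMU_NORMAL_PT_UPDATE;   // "*ptr = val"
 *     req.val = new_pte_val;
 *     HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
 *
 *     // Pin a frame as an L1 page table so its type count stays non-zero:
 *     struct mmuext_op op = {
 *         .cmd = MMUEXT_PIN_L1_TABLE,
 *         .arg1.mfn = l1_table_mfn,
 *     };
 *     HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF);
 */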
85
86 #include <xen/init.h>
87 #include <xen/kernel.h>
88 #include <xen/lib.h>
89 #include <xen/mm.h>
90 #include <xen/domain.h>
91 #include <xen/sched.h>
92 #include <xen/err.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <xen/domain_page.h>
97 #include <xen/event.h>
98 #include <xen/iocap.h>
99 #include <xen/guest_access.h>
100 #include <xen/pfn.h>
101 #include <xen/vmap.h>
102 #include <xen/xmalloc.h>
103 #include <xen/efi.h>
104 #include <xen/grant_table.h>
105 #include <xen/hypercall.h>
106 #include <asm/paging.h>
107 #include <asm/shadow.h>
108 #include <asm/page.h>
109 #include <asm/flushtlb.h>
110 #include <asm/io.h>
111 #include <asm/ldt.h>
112 #include <asm/x86_emulate.h>
113 #include <asm/e820.h>
114 #include <asm/hypercall.h>
115 #include <asm/shared.h>
116 #include <asm/mem_sharing.h>
117 #include <public/memory.h>
118 #include <public/sched.h>
119 #include <xsm/xsm.h>
120 #include <xen/trace.h>
121 #include <asm/setup.h>
122 #include <asm/fixmap.h>
123 #include <asm/io_apic.h>
124 #include <asm/pci.h>
125 #include <asm/guest.h>
126
127 #include <asm/hvm/grant_table.h>
128 #include <asm/pv/grant_table.h>
129
130 #include "pv/mm.h"
131
132 /* Override macros from asm/page.h to make them work with mfn_t */
133 #undef mfn_to_page
134 #define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
135 #undef page_to_mfn
136 #define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
137
138 /* Mapping of the fixmap space needed early. */
139 l1_pgentry_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
140 l1_fixmap[L1_PAGETABLE_ENTRIES];
141
142 paddr_t __read_mostly mem_hotplug;
143
144 /* Private domain structs for DOMID_XEN and DOMID_IO. */
145 struct domain *dom_xen, *dom_io, *dom_cow;
146
147 /* Frame table size in pages. */
148 unsigned long max_page;
149 unsigned long total_pages;
150
151 bool __read_mostly machine_to_phys_mapping_valid;
152
153 struct rangeset *__read_mostly mmio_ro_ranges;
154
155 static uint32_t base_disallow_mask;
156 /* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */
157 #define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL)
158
159 #define L2_DISALLOW_MASK base_disallow_mask
160
161 #define l3_disallow_mask(d) (!is_pv_32bit_domain(d) ? \
162 base_disallow_mask : 0xFFFFF198U)
163
164 #define L4_DISALLOW_MASK (base_disallow_mask)
165
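/*
 * Cache-attribute bits (PWT/PCD/PAT) are dropped from the disallow mask for
 * dom_io and for any domain with I/O memory, I/O port or pass-through device
 * access, so that such domains can create e.g. uncacheable MMIO mappings.
 * A plain PV domain with none of these gets the full L1_DISALLOW_MASK.
 */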
166 #define l1_disallow_mask(d) \
167 ((d != dom_io) && \
168 (rangeset_is_empty((d)->iomem_caps) && \
169 rangeset_is_empty((d)->arch.ioport_caps) && \
170 !has_arch_pdevs(d) && \
171 is_pv_domain(d)) ? \
172 L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
173
174 static s8 __read_mostly opt_mmio_relax;
175
176 static int __init parse_mmio_relax(const char *s)
177 {
178 if ( !*s )
179 opt_mmio_relax = 1;
180 else
181 opt_mmio_relax = parse_bool(s, NULL);
182 if ( opt_mmio_relax < 0 && strcmp(s, "all") )
183 {
184 opt_mmio_relax = 0;
185 return -EINVAL;
186 }
187
188 return 0;
189 }
190 custom_param("mmio-relax", parse_mmio_relax);
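/*
 * By default domains may not create cacheable mappings of MMIO pages (see
 * the opt_mmio_relax switch in get_page_from_l1e() below).  "mmio-relax"
 * relaxes this check for the hardware domain only; "mmio-relax=all" relaxes
 * it for all PV domains.
 */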
191
192 static void __init init_frametable_chunk(void *start, void *end)
193 {
194 unsigned long s = (unsigned long)start;
195 unsigned long e = (unsigned long)end;
196 unsigned long step;
197 mfn_t mfn;
198
199 ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
200 for ( ; s < e; s += step << PAGE_SHIFT )
201 {
202 step = 1UL << (cpu_has_page1gb &&
203 !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
204 L3_PAGETABLE_SHIFT - PAGE_SHIFT :
205 L2_PAGETABLE_SHIFT - PAGE_SHIFT);
206 /*
207 * The hardcoded 4 below is arbitrary - just pick whatever you think
208 * is reasonable to waste as a trade-off for using a large page.
209 */
210 while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
211 step >>= PAGETABLE_ORDER;
212 mfn = alloc_boot_pages(step, step);
213 map_pages_to_xen(s, mfn_x(mfn), step, PAGE_HYPERVISOR);
214 }
215
216 memset(start, 0, end - start);
217 memset(end, -1, s - e);
218 }
219
220 void __init init_frametable(void)
221 {
222 unsigned int sidx, eidx, nidx;
223 unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
224 struct page_info *end_pg, *top_pg;
225
226 BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_START);
227 BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1));
228
229 for ( sidx = 0; ; sidx = nidx )
230 {
231 eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
232 nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
233 if ( nidx >= max_idx )
234 break;
235 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
236 pdx_to_page(eidx * PDX_GROUP_COUNT));
237 }
238
239 end_pg = pdx_to_page(max_pdx - 1) + 1;
240 top_pg = mem_hotplug ? pdx_to_page(max_idx * PDX_GROUP_COUNT - 1) + 1
241 : end_pg;
242 init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT), top_pg);
243 memset(end_pg, -1, (unsigned long)top_pg - (unsigned long)end_pg);
244 }
245
246 #ifndef NDEBUG
247 static unsigned int __read_mostly root_pgt_pv_xen_slots
248 = ROOT_PAGETABLE_PV_XEN_SLOTS;
249 static l4_pgentry_t __read_mostly split_l4e;
250 #else
251 #define root_pgt_pv_xen_slots ROOT_PAGETABLE_PV_XEN_SLOTS
252 #endif
253
254 void __init arch_init_memory(void)
255 {
256 unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
257
258 /*
259 * Basic guest-accessible flags:
260 * PRESENT, R/W, USER, A/D, AVAIL[0,1,2], AVAIL_HIGH, NX (if available).
261 */
262 base_disallow_mask =
263 ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED |
264 _PAGE_DIRTY | _PAGE_AVAIL | _PAGE_AVAIL_HIGH | _PAGE_NX);
265
266 /*
267 * Initialise our DOMID_XEN domain.
268 * Any Xen-heap pages that we will allow to be mapped will have
269 * their domain field set to dom_xen.
270 * Hidden PCI devices will also be associated with this domain
271 * (but be [partly] controlled by Dom0 nevertheless).
272 */
273 dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0, NULL);
274 BUG_ON(IS_ERR(dom_xen));
275 INIT_LIST_HEAD(&dom_xen->arch.pdev_list);
276
277 /*
278 * Initialise our DOMID_IO domain.
279 * This domain owns I/O pages that are within the range of the page_info
280  * array. Mappings occur at the privilege level of the caller.
281 */
282 dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0, NULL);
283 BUG_ON(IS_ERR(dom_io));
284
285 /*
286 * Initialise our COW domain.
287 * This domain owns sharable pages.
288 */
289 dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0, NULL);
290 BUG_ON(IS_ERR(dom_cow));
291
292 /*
293 * First 1MB of RAM is historically marked as I/O. If we booted PVH,
294 * reclaim the space. Irrespective, leave MFN 0 as special for the sake
295 * of 0 being a very common default value. Also reserve page 0x1 which is
296 * used by the trampoline code on PVH.
297 */
298 BUG_ON(pvh_boot && trampoline_phys != 0x1000);
299 for ( i = 0;
300 i < (pvh_boot ? (1 + PFN_UP(trampoline_end - trampoline_start))
301 : 0x100);
302 i++ )
303 share_xen_page_with_guest(mfn_to_page(_mfn(i)),
304 dom_io, XENSHARE_writable);
305
306 /* Any areas not specified as RAM by the e820 map are considered I/O. */
307 for ( i = 0, pfn = 0; pfn < max_page; i++ )
308 {
309 while ( (i < e820.nr_map) &&
310 (e820.map[i].type != E820_RAM) &&
311 (e820.map[i].type != E820_UNUSABLE) )
312 i++;
313
314 if ( i >= e820.nr_map )
315 {
316 /* No more RAM regions: mark as I/O right to end of memory map. */
317 rstart_pfn = rend_pfn = max_page;
318 }
319 else
320 {
321 /* Mark as I/O just up as far as next RAM region. */
322 rstart_pfn = min_t(unsigned long, max_page,
323 PFN_UP(e820.map[i].addr));
324 rend_pfn = max_t(unsigned long, rstart_pfn,
325 PFN_DOWN(e820.map[i].addr + e820.map[i].size));
326 }
327
328 /*
329 * Make sure any Xen mappings of RAM holes above 1MB are blown away.
330 * In particular this ensures that RAM holes are respected even in
331 * the statically-initialised 1-16MB mapping area.
332 */
333 iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
334 ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT));
335 if ( iostart_pfn < ioend_pfn )
336 destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
337 (unsigned long)mfn_to_virt(ioend_pfn));
338
339 /* Mark as I/O up to next RAM region. */
340 for ( ; pfn < rstart_pfn; pfn++ )
341 {
342 if ( !mfn_valid(_mfn(pfn)) )
343 continue;
344 share_xen_page_with_guest(
345 mfn_to_page(_mfn(pfn)), dom_io, XENSHARE_writable);
346 }
347
348 /* Skip the RAM region. */
349 pfn = rend_pfn;
350 }
351
352 subarch_init_memory();
353
354 efi_init_memory();
355
356 mem_sharing_init();
357
358 #ifndef NDEBUG
359 if ( highmem_start )
360 {
361 unsigned long split_va = (unsigned long)__va(highmem_start);
362
363 if ( split_va < HYPERVISOR_VIRT_END &&
364 split_va - 1 == (unsigned long)__va(highmem_start - 1) )
365 {
366 root_pgt_pv_xen_slots = l4_table_offset(split_va) -
367 ROOT_PAGETABLE_FIRST_XEN_SLOT;
368 ASSERT(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS);
369 if ( l4_table_offset(split_va) == l4_table_offset(split_va - 1) )
370 {
371 l3_pgentry_t *l3tab = alloc_xen_pagetable();
372
373 if ( l3tab )
374 {
375 const l3_pgentry_t *l3idle =
376 l4e_to_l3e(idle_pg_table[l4_table_offset(split_va)]);
377
378 for ( i = 0; i < l3_table_offset(split_va); ++i )
379 l3tab[i] = l3idle[i];
380 for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
381 l3tab[i] = l3e_empty();
382 split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
383 __PAGE_HYPERVISOR_RW);
384 }
385 else
386 ++root_pgt_pv_xen_slots;
387 }
388 }
389 }
390 #endif
391 }
392
393 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
394 {
395 uint64_t maddr = pfn_to_paddr(mfn);
396 int i;
397
398 for ( i = 0; i < e820.nr_map; i++ )
399 {
400 switch ( e820.map[i].type )
401 {
402 case E820_RAM:
403 if ( mem_type & RAM_TYPE_CONVENTIONAL )
404 break;
405 continue;
406 case E820_RESERVED:
407 if ( mem_type & RAM_TYPE_RESERVED )
408 break;
409 continue;
410 case E820_UNUSABLE:
411 if ( mem_type & RAM_TYPE_UNUSABLE )
412 break;
413 continue;
414 case E820_ACPI:
415 case E820_NVS:
416 if ( mem_type & RAM_TYPE_ACPI )
417 break;
418 continue;
419 default:
420 /* unknown */
421 continue;
422 }
423
424 /* Test the range. */
425 if ( (e820.map[i].addr <= maddr) &&
426 ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
427 return 1;
428 }
429
430 return 0;
431 }
432
433 unsigned long domain_get_maximum_gpfn(struct domain *d)
434 {
435 if ( is_hvm_domain(d) )
436 return p2m_get_hostp2m(d)->max_mapped_pfn;
437 /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
438 return (arch_get_max_pfn(d) ?: 1) - 1;
439 }
440
441 void share_xen_page_with_guest(
442 struct page_info *page, struct domain *d, int readonly)
443 {
444 if ( page_get_owner(page) == d )
445 return;
446
447 set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), INVALID_M2P_ENTRY);
448
449 spin_lock(&d->page_alloc_lock);
450
451 /* The incremented type count pins as writable or read-only. */
452 page->u.inuse.type_info = (readonly ? PGT_none : PGT_writable_page);
453 page->u.inuse.type_info |= PGT_validated | 1;
454
455 page_set_owner(page, d);
456 smp_wmb(); /* install valid domain ptr before updating refcnt. */
457 ASSERT((page->count_info & ~PGC_xen_heap) == 0);
458
459 /* Only add to the allocation list if the domain isn't dying. */
460 if ( !d->is_dying )
461 {
462 page->count_info |= PGC_xen_heap | PGC_allocated | 1;
463 if ( unlikely(d->xenheap_pages++ == 0) )
464 get_knownalive_domain(d);
465 page_list_add_tail(page, &d->xenpage_list);
466 }
467
468 spin_unlock(&d->page_alloc_lock);
469 }
470
471 int __init unshare_xen_page_with_guest(struct page_info *page,
472 struct domain *d)
473 {
474 if ( page_get_owner(page) != d || !is_xen_heap_page(page) )
475 return -EINVAL;
476
477 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
478 put_page(page);
479
480 /* Remove the owner and clear the flags. */
481 page->u.inuse.type_info = 0;
482 page_set_owner(page, NULL);
483
484 return 0;
485 }
486
487 void share_xen_page_with_privileged_guests(
488 struct page_info *page, int readonly)
489 {
490 share_xen_page_with_guest(page, dom_xen, readonly);
491 }
492
493 void free_shared_domheap_page(struct page_info *page)
494 {
495 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
496 put_page(page);
497 if ( !test_and_clear_bit(_PGC_xen_heap, &page->count_info) )
498 ASSERT_UNREACHABLE();
499 page->u.inuse.type_info = 0;
500 page_set_owner(page, NULL);
501 free_domheap_page(page);
502 }
503
504 void make_cr3(struct vcpu *v, mfn_t mfn)
505 {
506 v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT;
507 }
508
509 void write_ptbase(struct vcpu *v)
510 {
511 write_cr3(v->arch.cr3);
512 }
513
514 /*
515 * Should be called after CR3 is updated.
516 *
517 * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
518 * for HVM guests, arch.monitor_table and hvm's guest CR3.
519 *
520 * Update ref counts to shadow tables appropriately.
521 */
522 void update_cr3(struct vcpu *v)
523 {
524 mfn_t cr3_mfn;
525
526 if ( paging_mode_enabled(v->domain) )
527 {
528 paging_update_cr3(v);
529 return;
530 }
531
532 if ( !(v->arch.flags & TF_kernel_mode) )
533 cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user);
534 else
535 cr3_mfn = pagetable_get_mfn(v->arch.guest_table);
536
537 make_cr3(v, cr3_mfn);
538 }
539
540 static inline void set_tlbflush_timestamp(struct page_info *page)
541 {
542 /*
543 * Record TLB information for flush later. We do not stamp page tables
544 * when running in shadow mode:
545 * 1. Pointless, since it's the shadow pt's which must be tracked.
546 * 2. Shadow mode reuses this field for shadowed page tables to store
547 * flags info -- we don't want to conflict with that.
548 */
549 if ( !(page->count_info & PGC_page_table) ||
550 !shadow_mode_enabled(page_get_owner(page)) )
551 page_set_tlbflush_timestamp(page);
552 }
553
554 const char __section(".bss.page_aligned.const") __aligned(PAGE_SIZE)
555 zero_page[PAGE_SIZE];
556
557 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
558 {
559 l1_pgentry_t *pl1e;
560 unsigned int i;
561 struct page_info *page;
562
563 BUG_ON(unlikely(in_irq()));
564
565 spin_lock(&v->arch.pv_vcpu.shadow_ldt_lock);
566
567 if ( v->arch.pv_vcpu.shadow_ldt_mapcnt == 0 )
568 goto out;
569
570 v->arch.pv_vcpu.shadow_ldt_mapcnt = 0;
571 pl1e = pv_ldt_ptes(v);
572
573 for ( i = 0; i < 16; i++ )
574 {
575 if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
576 continue;
577 page = l1e_get_page(pl1e[i]);
578 l1e_write(&pl1e[i], l1e_empty());
579 ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
580 ASSERT_PAGE_IS_DOMAIN(page, v->domain);
581 put_page_and_type(page);
582 }
583
584 /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
585 if ( flush )
586 flush_tlb_mask(v->vcpu_dirty_cpumask);
587
588 out:
589 spin_unlock(&v->arch.pv_vcpu.shadow_ldt_lock);
590 }
591
592
593 static int alloc_segdesc_page(struct page_info *page)
594 {
595 const struct domain *owner = page_get_owner(page);
596 struct desc_struct *descs = __map_domain_page(page);
597 unsigned i;
598
599 for ( i = 0; i < 512; i++ )
600 if ( unlikely(!check_descriptor(owner, &descs[i])) )
601 break;
602
603 unmap_domain_page(descs);
604
605 return i == 512 ? 0 : -EINVAL;
606 }
607
608 static int get_page_and_type_from_mfn(
609 mfn_t mfn, unsigned long type, struct domain *d,
610 int partial, int preemptible)
611 {
612 struct page_info *page = mfn_to_page(mfn);
613 int rc;
614
615 if ( likely(partial >= 0) &&
616 unlikely(!get_page_from_mfn(mfn, d)) )
617 return -EINVAL;
618
619 rc = (preemptible ?
620 get_page_type_preemptible(page, type) :
621 (get_page_type(page, type) ? 0 : -EINVAL));
622
623 if ( unlikely(rc) && partial >= 0 &&
624 (!preemptible || page != current->arch.old_guest_table) )
625 put_page(page);
626
627 return rc;
628 }
629
630 static void put_data_page(
631 struct page_info *page, int writeable)
632 {
633 if ( writeable )
634 put_page_and_type(page);
635 else
636 put_page(page);
637 }
638
639 #ifdef CONFIG_PV_LINEAR_PT
640
641 static bool inc_linear_entries(struct page_info *pg)
642 {
643 typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
644
645 do {
646 /*
647 * The check below checks for the "linear use" count being non-zero
648 * as well as overflow. Signed integer overflow is undefined behavior
649 * according to the C spec. However, as long as linear_pt_count is
650 * smaller in size than 'int', the arithmetic operation of the
651 * increment below won't overflow; rather the result will be truncated
652 * when stored. Ensure that this is always true.
653 */
654 BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
655 oc = nc++;
656 if ( nc <= 0 )
657 return false;
658 nc = cmpxchg(&pg->linear_pt_count, oc, nc);
659 } while ( oc != nc );
660
661 return true;
662 }
663
664 static void dec_linear_entries(struct page_info *pg)
665 {
666 typeof(pg->linear_pt_count) oc;
667
668 oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
669 ASSERT(oc > 0);
670 }
671
672 static bool inc_linear_uses(struct page_info *pg)
673 {
674 typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
675
676 do {
677 /* See the respective comment in inc_linear_entries(). */
678 BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
679 oc = nc--;
680 if ( nc >= 0 )
681 return false;
682 nc = cmpxchg(&pg->linear_pt_count, oc, nc);
683 } while ( oc != nc );
684
685 return true;
686 }
687
688 static void dec_linear_uses(struct page_info *pg)
689 {
690 typeof(pg->linear_pt_count) oc;
691
692 oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
693 ASSERT(oc < 0);
694 }
695
696 /*
697 * We allow root tables to map each other (a.k.a. linear page tables). It
698 * needs some special care with reference counts and access permissions:
699 * 1. The mapping entry must be read-only, or the guest may get write access
700 * to its own PTEs.
701 * 2. We must only bump the reference counts for an *already validated*
702 * L2 table, or we can end up in a deadlock in get_page_type() by waiting
703 * on a validation that is required to complete that validation.
704 * 3. We only need to increment the reference counts for the mapped page
705 * frame if it is mapped by a different root table. This is sufficient and
706 * also necessary to allow validation of a root table mapping itself.
707 */
708 static bool __read_mostly opt_pv_linear_pt = true;
709 boolean_param("pv-linear-pt", opt_pv_linear_pt);
710
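/*
 * Concrete example of what is validated below: a guest may install, as a
 * read-only entry, an L4 slot pointing at another already-validated L4 table
 * (or at the L4 table containing the slot itself).  The pointed-to table's
 * entries then become readable as ordinary data through the virtual address
 * range covered by that slot -- the recursive/"linear" page-table trick PV
 * guests use to inspect their own page tables.  The same applies at L2/L3.
 */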
711 #define define_get_linear_pagetable(level) \
712 static int \
713 get_##level##_linear_pagetable( \
714 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
715 { \
716 unsigned long x, y; \
717 struct page_info *page; \
718 unsigned long pfn; \
719 \
720 if ( !opt_pv_linear_pt ) \
721 { \
722 gdprintk(XENLOG_WARNING, \
723 "Attempt to create linear p.t. (feature disabled)\n"); \
724 return 0; \
725 } \
726 \
727 if ( (level##e_get_flags(pde) & _PAGE_RW) ) \
728 { \
729 gdprintk(XENLOG_WARNING, \
730 "Attempt to create linear p.t. with write perms\n"); \
731 return 0; \
732 } \
733 \
734 if ( (pfn = level##e_get_pfn(pde)) != pde_pfn ) \
735 { \
736 struct page_info *ptpg = mfn_to_page(_mfn(pde_pfn)); \
737 \
738 /* Make sure the page table belongs to the correct domain. */ \
739 if ( unlikely(page_get_owner(ptpg) != d) ) \
740 return 0; \
741 \
742 /* Make sure the mapped frame belongs to the correct domain. */ \
743 if ( unlikely(!get_page_from_mfn(_mfn(pfn), d)) ) \
744 return 0; \
745 \
746 /* \
747 * Ensure that the mapped frame is an already-validated page table \
748  * and does not itself have linear entries, as well as that the \
749  * containing page table is not itself in use as a linear page table \
750 * elsewhere. \
751 * If so, atomically increment the count (checking for overflow). \
752 */ \
753 page = mfn_to_page(_mfn(pfn)); \
754 if ( !inc_linear_entries(ptpg) ) \
755 { \
756 put_page(page); \
757 return 0; \
758 } \
759 if ( !inc_linear_uses(page) ) \
760 { \
761 dec_linear_entries(ptpg); \
762 put_page(page); \
763 return 0; \
764 } \
765 y = page->u.inuse.type_info; \
766 do { \
767 x = y; \
768 if ( unlikely((x & PGT_count_mask) == PGT_count_mask) || \
769 unlikely((x & (PGT_type_mask|PGT_validated)) != \
770 (PGT_##level##_page_table|PGT_validated)) ) \
771 { \
772 dec_linear_uses(page); \
773 dec_linear_entries(ptpg); \
774 put_page(page); \
775 return 0; \
776 } \
777 } \
778 while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x ); \
779 } \
780 \
781 return 1; \
782 }
783
784 #else /* CONFIG_PV_LINEAR_PT */
785
786 #define define_get_linear_pagetable(level) \
787 static int \
788 get_##level##_linear_pagetable( \
789 level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
790 { \
791 return 0; \
792 }
793
794 static void dec_linear_uses(struct page_info *pg)
795 {
796 ASSERT(pg->linear_pt_count == 0);
797 }
798
799 static void dec_linear_entries(struct page_info *pg)
800 {
801 ASSERT(pg->linear_pt_count == 0);
802 }
803
804 #endif /* CONFIG_PV_LINEAR_PT */
805
806 bool is_iomem_page(mfn_t mfn)
807 {
808 struct page_info *page;
809
810 if ( !mfn_valid(mfn) )
811 return true;
812
813 /* Caller must know that it is an iomem page, or a reference is held. */
814 page = mfn_to_page(mfn);
815 ASSERT((page->count_info & PGC_count_mask) != 0);
816
817 return (page_get_owner(page) == dom_io);
818 }
819
820 static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
821 {
822 int err = 0;
823 bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
824 mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
825 unsigned long xen_va =
826 XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
827
828 if ( unlikely(alias) && cacheattr )
829 err = map_pages_to_xen(xen_va, mfn, 1, 0);
830 if ( !err )
831 err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
832 PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
833 if ( unlikely(alias) && !cacheattr && !err )
834 err = map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
835 return err;
836 }
837
838 #ifndef NDEBUG
839 struct mmio_emul_range_ctxt {
840 const struct domain *d;
841 unsigned long mfn;
842 };
843
844 static int print_mmio_emul_range(unsigned long s, unsigned long e, void *arg)
845 {
846 const struct mmio_emul_range_ctxt *ctxt = arg;
847
848 if ( ctxt->mfn > e )
849 return 0;
850
851 if ( ctxt->mfn >= s )
852 {
853 static DEFINE_SPINLOCK(last_lock);
854 static const struct domain *last_d;
855 static unsigned long last_s = ~0UL, last_e;
856 bool print = false;
857
858 spin_lock(&last_lock);
859 if ( last_d != ctxt->d || last_s != s || last_e != e )
860 {
861 last_d = ctxt->d;
862 last_s = s;
863 last_e = e;
864 print = true;
865 }
866 spin_unlock(&last_lock);
867
868 if ( print )
869 printk(XENLOG_G_INFO
870 "d%d: Forcing write emulation on MFNs %lx-%lx\n",
871 ctxt->d->domain_id, s, e);
872 }
873
874 return 1;
875 }
876 #endif
877
878 /*
879 * get_page_from_l1e returns:
880 * 0 => success (page not present also counts as such)
881 * <0 => error code
882 * >0 => the page flags to be flipped
883 */
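/*
 * Typical caller pattern (see alloc_l1_table() and mod_l1_entry() below): a
 * positive return value is a mask of _PAGE_RW / PAGE_CACHE_ATTRS bits which
 * the caller must toggle in the PTE via l1e_flip_flags() before installing
 * it, e.g. to force an MMIO mapping read-only or uncacheable.
 */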
884 int
885 get_page_from_l1e(
886 l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
887 {
888 unsigned long mfn = l1e_get_pfn(l1e);
889 struct page_info *page = mfn_to_page(_mfn(mfn));
890 uint32_t l1f = l1e_get_flags(l1e);
891 struct vcpu *curr = current;
892 struct domain *real_pg_owner;
893 bool write;
894
895 if ( !(l1f & _PAGE_PRESENT) )
896 return 0;
897
898 if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
899 {
900 gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
901 l1f & l1_disallow_mask(l1e_owner));
902 return -EINVAL;
903 }
904
905 if ( !mfn_valid(_mfn(mfn)) ||
906 (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
907 {
908 int flip = 0;
909
910 /* Only needed the reference to confirm dom_io ownership. */
911 if ( mfn_valid(_mfn(mfn)) )
912 put_page(page);
913
914 /* DOMID_IO reverts to caller for privilege checks. */
915 if ( pg_owner == dom_io )
916 pg_owner = curr->domain;
917
918 if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
919 {
920 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
921 {
922 gdprintk(XENLOG_WARNING,
923 "d%d non-privileged attempt to map MMIO space %"PRI_mfn"\n",
924 pg_owner->domain_id, mfn);
925 return -EPERM;
926 }
927 return -EINVAL;
928 }
929
930 if ( pg_owner != l1e_owner &&
931 !iomem_access_permitted(l1e_owner, mfn, mfn) )
932 {
933 if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
934 {
935 gdprintk(XENLOG_WARNING,
936 "d%d attempted to map MMIO space %"PRI_mfn" in d%d to d%d\n",
937 curr->domain->domain_id, mfn, pg_owner->domain_id,
938 l1e_owner->domain_id);
939 return -EPERM;
940 }
941 return -EINVAL;
942 }
943
944 if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
945 {
946 /* MMIO pages must not be mapped cachable unless requested so. */
947 switch ( opt_mmio_relax )
948 {
949 case 0:
950 break;
951 case 1:
952 if ( !is_hardware_domain(l1e_owner) )
953 break;
954 /* fallthrough */
955 case -1:
956 return 0;
957 default:
958 ASSERT_UNREACHABLE();
959 }
960 }
961 else if ( l1f & _PAGE_RW )
962 {
963 #ifndef NDEBUG
964 const unsigned long *ro_map;
965 unsigned int seg, bdf;
966
967 if ( !pci_mmcfg_decode(mfn, &seg, &bdf) ||
968 ((ro_map = pci_get_ro_map(seg)) != NULL &&
969 test_bit(bdf, ro_map)) )
970 printk(XENLOG_G_WARNING
971 "d%d: Forcing read-only access to MFN %lx\n",
972 l1e_owner->domain_id, mfn);
973 else
974 rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL,
975 print_mmio_emul_range,
976 &(struct mmio_emul_range_ctxt){
977 .d = l1e_owner,
978 .mfn = mfn });
979 #endif
980 flip = _PAGE_RW;
981 }
982
983 switch ( l1f & PAGE_CACHE_ATTRS )
984 {
985 case 0: /* WB */
986 flip |= _PAGE_PWT | _PAGE_PCD;
987 break;
988 case _PAGE_PWT: /* WT */
989 case _PAGE_PWT | _PAGE_PAT: /* WP */
990 flip |= _PAGE_PCD | (l1f & _PAGE_PAT);
991 break;
992 }
993
994 return flip;
995 }
996
997 if ( unlikely( (real_pg_owner != pg_owner) &&
998 (real_pg_owner != dom_cow) ) )
999 {
1000 /*
1001 * Let privileged domains transfer the right to map their target
1002 * domain's pages. This is used to allow stub-domain pvfb export to
1003 * dom0, until pvfb supports granted mappings. At that time this
1004 * minor hack can go away.
1005 */
1006 if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) ||
1007 xsm_priv_mapping(XSM_TARGET, pg_owner, real_pg_owner) )
1008 {
1009 gdprintk(XENLOG_WARNING,
1010 "pg_owner d%d l1e_owner d%d, but real_pg_owner d%d\n",
1011 pg_owner->domain_id, l1e_owner->domain_id,
1012 real_pg_owner ? real_pg_owner->domain_id : -1);
1013 goto could_not_pin;
1014 }
1015 pg_owner = real_pg_owner;
1016 }
1017
1018 /*
1019 * Extra paranoid check for shared memory. Writable mappings
1020 * disallowed (unshare first!)
1021 */
1022 if ( (l1f & _PAGE_RW) && (real_pg_owner == dom_cow) )
1023 goto could_not_pin;
1024
1025 /*
1026 * Foreign mappings into guests in shadow external mode don't
1027 * contribute to writeable mapping refcounts. (This allows the
1028 * qemu-dm helper process in dom0 to map the domain's memory without
1029 * messing up the count of "real" writable mappings.)
1030 */
1031 write = (l1f & _PAGE_RW) &&
1032 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner));
1033 if ( write && !get_page_type(page, PGT_writable_page) )
1034 {
1035 gdprintk(XENLOG_WARNING, "Could not get page type PGT_writable_page\n");
1036 goto could_not_pin;
1037 }
1038
1039 if ( pte_flags_to_cacheattr(l1f) !=
1040 ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
1041 {
1042 unsigned long x, nx, y = page->count_info;
1043 unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
1044 int err;
1045
1046 if ( is_xen_heap_page(page) )
1047 {
1048 if ( write )
1049 put_page_type(page);
1050 put_page(page);
1051 gdprintk(XENLOG_WARNING,
1052 "Attempt to change cache attributes of Xen heap page\n");
1053 return -EACCES;
1054 }
1055
1056 do {
1057 x = y;
1058 nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
1059 } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
1060
1061 err = update_xen_mappings(mfn, cacheattr);
1062 if ( unlikely(err) )
1063 {
1064 cacheattr = y & PGC_cacheattr_mask;
1065 do {
1066 x = y;
1067 nx = (x & ~PGC_cacheattr_mask) | cacheattr;
1068 } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
1069
1070 if ( write )
1071 put_page_type(page);
1072 put_page(page);
1073
1074 gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
1075 " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
1076 mfn, get_gpfn_from_mfn(mfn),
1077 l1e_get_intpte(l1e), l1e_owner->domain_id);
1078 return err;
1079 }
1080 }
1081
1082 return 0;
1083
1084 could_not_pin:
1085 gdprintk(XENLOG_WARNING, "Error getting mfn %" PRI_mfn " (pfn %" PRI_pfn
1086 ") from L1 entry %" PRIpte " for l1e_owner d%d, pg_owner d%d\n",
1087 mfn, get_gpfn_from_mfn(mfn),
1088 l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
1089 if ( real_pg_owner != NULL )
1090 put_page(page);
1091 return -EBUSY;
1092 }
1093
1094
1095 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
1096 /*
1097 * get_page_from_l2e returns:
1098 * 1 => page not present
1099 * 0 => success
1100 * <0 => error code
1101 */
1102 define_get_linear_pagetable(l2);
1103 static int
1104 get_page_from_l2e(
1105 l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
1106 {
1107 unsigned long mfn = l2e_get_pfn(l2e);
1108 int rc;
1109
1110 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1111 return 1;
1112
1113 if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
1114 {
1115 gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
1116 l2e_get_flags(l2e) & L2_DISALLOW_MASK);
1117 return -EINVAL;
1118 }
1119
1120 if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
1121 {
1122 rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
1123 if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
1124 rc = 0;
1125 return rc;
1126 }
1127
1128 return -EINVAL;
1129 }
1130
1131
1132 /*
1133 * get_page_from_l3e returns:
1134 * 1 => page not present
1135 * 0 => success
1136 * <0 => error code
1137 */
1138 define_get_linear_pagetable(l3);
1139 static int
1140 get_page_from_l3e(
1141 l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
1142 {
1143 int rc;
1144
1145 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1146 return 1;
1147
1148 if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
1149 {
1150 gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
1151 l3e_get_flags(l3e) & l3_disallow_mask(d));
1152 return -EINVAL;
1153 }
1154
1155 rc = get_page_and_type_from_mfn(
1156 l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
1157 if ( unlikely(rc == -EINVAL) &&
1158 !is_pv_32bit_domain(d) &&
1159 get_l3_linear_pagetable(l3e, pfn, d) )
1160 rc = 0;
1161
1162 return rc;
1163 }
1164
1165 /*
1166 * get_page_from_l4e returns:
1167 * 1 => page not present
1168 * 0 => success
1169 * <0 => error code
1170 */
1171 define_get_linear_pagetable(l4);
1172 static int
1173 get_page_from_l4e(
1174 l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
1175 {
1176 int rc;
1177
1178 if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
1179 return 1;
1180
1181 if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
1182 {
1183 gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
1184 l4e_get_flags(l4e) & L4_DISALLOW_MASK);
1185 return -EINVAL;
1186 }
1187
1188 rc = get_page_and_type_from_mfn(
1189 l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
1190 if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
1191 rc = 0;
1192
1193 return rc;
1194 }
1195
1196 static int _put_page_type(struct page_info *page, bool preemptible,
1197 struct page_info *ptpg);
1198
1199 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1200 {
1201 unsigned long pfn = l1e_get_pfn(l1e);
1202 struct page_info *page;
1203 struct domain *pg_owner;
1204 struct vcpu *v;
1205
1206 if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(_mfn(pfn)) )
1207 return;
1208
1209 page = mfn_to_page(_mfn(pfn));
1210 pg_owner = page_get_owner(page);
1211
1212 /*
1213 * Check if this is a mapping that was established via a grant reference.
1214 * If it was then we should not be here: we require that such mappings are
1215 * explicitly destroyed via the grant-table interface.
1216 *
1217 * The upshot of this is that the guest can end up with active grants that
1218 * it cannot destroy (because it no longer has a PTE to present to the
1219 * grant-table interface). This can lead to subtle hard-to-catch bugs,
1220 * hence a special grant PTE flag can be enabled to catch the bug early.
1221 *
1222 * (Note that the undestroyable active grants are not a security hole in
1223 * Xen. All active grants can safely be cleaned up when the domain dies.)
1224 */
1225 if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1226 !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1227 {
1228 gdprintk(XENLOG_WARNING,
1229 "Attempt to implicitly unmap a granted PTE %" PRIpte "\n",
1230 l1e_get_intpte(l1e));
1231 domain_crash(l1e_owner);
1232 }
1233
1234 /*
1235 * Remember we didn't take a type-count of foreign writable mappings
1236 * to paging-external domains.
1237 */
1238 if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1239 ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1240 {
1241 put_page_and_type(page);
1242 }
1243 else
1244 {
1245 /* We expect this to be rare, so we blow away the entire shadow LDT. */
1246 if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1247 PGT_seg_desc_page)) &&
1248 unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1249 (l1e_owner == pg_owner) )
1250 {
1251 for_each_vcpu ( pg_owner, v )
1252 invalidate_shadow_ldt(v, 1);
1253 }
1254 put_page(page);
1255 }
1256 }
1257
1258
1259 /*
1260 * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1261 * Note also that this automatically deals correctly with linear p.t.'s.
1262 */
1263 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1264 {
1265 if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1266 return 1;
1267
1268 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1269 {
1270 struct page_info *page = l2e_get_page(l2e);
1271 unsigned int i;
1272
1273 for ( i = 0; i < (1u << PAGETABLE_ORDER); i++, page++ )
1274 put_page_and_type(page);
1275 }
1276 else
1277 {
1278 struct page_info *pg = l2e_get_page(l2e);
1279 int rc = _put_page_type(pg, false, mfn_to_page(_mfn(pfn)));
1280
1281 ASSERT(!rc);
1282 put_page(pg);
1283 }
1284
1285 return 0;
1286 }
1287
1288 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1289 int partial, bool defer)
1290 {
1291 struct page_info *pg;
1292 int rc;
1293
1294 if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1295 return 1;
1296
1297 if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1298 {
1299 unsigned long mfn = l3e_get_pfn(l3e);
1300 int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1301
1302 ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1303 do {
1304 put_data_page(mfn_to_page(_mfn(mfn)), writeable);
1305 } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1306
1307 return 0;
1308 }
1309
1310 pg = l3e_get_page(l3e);
1311
1312 if ( unlikely(partial > 0) )
1313 {
1314 ASSERT(!defer);
1315 return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1316 }
1317
1318 if ( defer )
1319 {
1320 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
1321 current->arch.old_guest_table = pg;
1322 return 0;
1323 }
1324
1325 rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1326 if ( likely(!rc) )
1327 put_page(pg);
1328
1329 return rc;
1330 }
1331
1332 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1333 int partial, bool defer)
1334 {
1335 int rc = 1;
1336
1337 if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1338 (l4e_get_pfn(l4e) != pfn) )
1339 {
1340 struct page_info *pg = l4e_get_page(l4e);
1341
1342 if ( unlikely(partial > 0) )
1343 {
1344 ASSERT(!defer);
1345 return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1346 }
1347
1348 if ( defer )
1349 {
1350 current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
1351 current->arch.old_guest_table = pg;
1352 return 0;
1353 }
1354
1355 rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1356 if ( likely(!rc) )
1357 put_page(pg);
1358 }
1359
1360 return rc;
1361 }
1362
1363 static int alloc_l1_table(struct page_info *page)
1364 {
1365 struct domain *d = page_get_owner(page);
1366 l1_pgentry_t *pl1e;
1367 unsigned int i;
1368 int ret = 0;
1369
1370 pl1e = __map_domain_page(page);
1371
1372 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1373 {
1374 switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
1375 {
1376 default:
1377 goto fail;
1378 case 0:
1379 break;
1380 case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
1381 ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
1382 l1e_flip_flags(pl1e[i], ret);
1383 break;
1384 }
1385
1386 pl1e[i] = adjust_guest_l1e(pl1e[i], d);
1387 }
1388
1389 unmap_domain_page(pl1e);
1390 return 0;
1391
1392 fail:
1393 gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
1394 while ( i-- > 0 )
1395 put_page_from_l1e(pl1e[i], d);
1396
1397 unmap_domain_page(pl1e);
1398 return ret;
1399 }
1400
1401 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1402 {
1403 struct page_info *page;
1404 l3_pgentry_t l3e3;
1405
1406 if ( !is_pv_32bit_domain(d) )
1407 return 1;
1408
1409 pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1410
1411 /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1412 l3e3 = pl3e[3];
1413 if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1414 {
1415 gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is empty\n");
1416 return 0;
1417 }
1418
1419 /*
1420 * The Xen-private mappings include linear mappings. The L2 thus cannot
1421 * be shared by multiple L3 tables. The test here is adequate because:
1422 * 1. Cannot appear in slots != 3 because get_page_type() checks the
1423 * PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1424 * 2. Cannot appear in another page table's L3:
1425 * a. alloc_l3_table() calls this function and this check will fail
1426 * b. mod_l3_entry() disallows updates to slot 3 in an existing table
1427 */
1428 page = l3e_get_page(l3e3);
1429 BUG_ON(page->u.inuse.type_info & PGT_pinned);
1430 BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1431 BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1432 if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1433 {
1434 gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n");
1435 return 0;
1436 }
1437
1438 return 1;
1439 }
1440
1441 static int alloc_l2_table(struct page_info *page, unsigned long type,
1442 int preemptible)
1443 {
1444 struct domain *d = page_get_owner(page);
1445 unsigned long pfn = mfn_x(page_to_mfn(page));
1446 l2_pgentry_t *pl2e;
1447 unsigned int i;
1448 int rc = 0;
1449
1450 pl2e = map_domain_page(_mfn(pfn));
1451
1452 for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1453 {
1454 if ( preemptible && i > page->nr_validated_ptes
1455 && hypercall_preempt_check() )
1456 {
1457 page->nr_validated_ptes = i;
1458 rc = -ERESTART;
1459 break;
1460 }
1461
1462 if ( !is_guest_l2_slot(d, type, i) ||
1463 (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1464 continue;
1465
1466 if ( rc < 0 )
1467 {
1468 gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
1469 while ( i-- > 0 )
1470 if ( is_guest_l2_slot(d, type, i) )
1471 put_page_from_l2e(pl2e[i], pfn);
1472 break;
1473 }
1474
1475 pl2e[i] = adjust_guest_l2e(pl2e[i], d);
1476 }
1477
1478 if ( rc >= 0 && (type & PGT_pae_xen_l2) )
1479 init_xen_pae_l2_slots(pl2e, d);
1480
1481 unmap_domain_page(pl2e);
1482 return rc > 0 ? 0 : rc;
1483 }
1484
1485 static int alloc_l3_table(struct page_info *page)
1486 {
1487 struct domain *d = page_get_owner(page);
1488 unsigned long pfn = mfn_x(page_to_mfn(page));
1489 l3_pgentry_t *pl3e;
1490 unsigned int i;
1491 int rc = 0, partial = page->partial_pte;
1492
1493 pl3e = map_domain_page(_mfn(pfn));
1494
1495 /*
1496 * PAE guests allocate full pages, but aren't required to initialize
1497 * more than the first four entries; when running in compatibility
1498 * mode, however, the full page is visible to the MMU, and hence all
1499 * 512 entries must be valid/verified, which is most easily achieved
1500 * by clearing them out.
1501 */
1502 if ( is_pv_32bit_domain(d) )
1503 memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1504
1505 for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1506 i++, partial = 0 )
1507 {
1508 if ( is_pv_32bit_domain(d) && (i == 3) )
1509 {
1510 if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1511 (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1512 rc = -EINVAL;
1513 else
1514 rc = get_page_and_type_from_mfn(
1515 l3e_get_mfn(pl3e[i]),
1516 PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
1517 }
1518 else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
1519 continue;
1520
1521 if ( rc == -ERESTART )
1522 {
1523 page->nr_validated_ptes = i;
1524 page->partial_pte = partial ?: 1;
1525 }
1526 else if ( rc == -EINTR && i )
1527 {
1528 page->nr_validated_ptes = i;
1529 page->partial_pte = 0;
1530 rc = -ERESTART;
1531 }
1532 if ( rc < 0 )
1533 break;
1534
1535 pl3e[i] = adjust_guest_l3e(pl3e[i], d);
1536 }
1537
1538 if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1539 rc = -EINVAL;
1540 if ( rc < 0 && rc != -ERESTART && rc != -EINTR )
1541 {
1542 gdprintk(XENLOG_WARNING, "Failure in alloc_l3_table: slot %#x\n", i);
1543 if ( i )
1544 {
1545 page->nr_validated_ptes = i;
1546 page->partial_pte = 0;
1547 current->arch.old_guest_ptpg = NULL;
1548 current->arch.old_guest_table = page;
1549 }
1550 while ( i-- > 0 )
1551 pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
1552 }
1553
1554 unmap_domain_page(pl3e);
1555 return rc > 0 ? 0 : rc;
1556 }
1557
1558 void init_xen_pae_l2_slots(l2_pgentry_t *l2t, const struct domain *d)
1559 {
1560 memcpy(&l2t[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1561 &compat_idle_pg_table_l2[
1562 l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1563 COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2t));
1564 }
1565
1566 /*
1567 * Fill an L4 with Xen entries.
1568 *
1569 * This function must write all ROOT_PAGETABLE_PV_XEN_SLOTS, to clobber any
1570 * values a guest may have left there from alloc_l4_table().
1571 *
1572 * l4t and l4mfn are mandatory, but l4mfn doesn't need to be the mfn under
1573 * *l4t. All other parameters are optional and will either fill or zero the
1574 * appropriate slots. Pagetables not shared with guests will gain the
1575 * extended directmap.
1576 */
1577 void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
1578 const struct domain *d, mfn_t sl4mfn, bool ro_mpt)
1579 {
1580 /*
1581 * PV vcpus need a shortened directmap. HVM and Idle vcpus get the full
1582 * directmap.
1583 */
1584 bool short_directmap = d && !paging_mode_external(d);
1585
1586 /* Slot 256: RO M2P (if applicable). */
1587 l4t[l4_table_offset(RO_MPT_VIRT_START)] =
1588 ro_mpt ? idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]
1589 : l4e_empty();
1590
1591 /* Slot 257: PCI MMCFG. */
1592 l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
1593 idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
1594
1595 /* Slot 258: Self linear mappings. */
1596 ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
1597 l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
1598 l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
1599
1600 /* Slot 259: Shadow linear mappings (if applicable). */
1601 l4t[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1602 mfn_eq(sl4mfn, INVALID_MFN) ? l4e_empty() :
1603 l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR_RW);
1604
1605 /* Slot 260: Per-domain mappings (if applicable). */
1606 l4t[l4_table_offset(PERDOMAIN_VIRT_START)] =
1607 d ? l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW)
1608 : l4e_empty();
1609
1610 /* Slot 261-: text/data/bss, RW M2P, vmap, frametable, directmap. */
1611 #ifndef NDEBUG
1612 if ( short_directmap &&
1613 unlikely(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS) )
1614 {
1615 /*
1616 * If using highmem-start=, artificially shorten the directmap to
1617 * simulate very large machines.
1618 */
1619 l4_pgentry_t *next;
1620
1621 memcpy(&l4t[l4_table_offset(XEN_VIRT_START)],
1622 &idle_pg_table[l4_table_offset(XEN_VIRT_START)],
1623 (ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots -
1624 l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t));
1625
1626 next = &l4t[ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots];
1627
1628 if ( l4e_get_intpte(split_l4e) )
1629 *next++ = split_l4e;
1630
1631 memset(next, 0,
1632 _p(&l4t[ROOT_PAGETABLE_LAST_XEN_SLOT + 1]) - _p(next));
1633 }
1634 else
1635 #endif
1636 {
1637 unsigned int slots = (short_directmap
1638 ? ROOT_PAGETABLE_PV_XEN_SLOTS
1639 : ROOT_PAGETABLE_XEN_SLOTS);
1640
1641 memcpy(&l4t[l4_table_offset(XEN_VIRT_START)],
1642 &idle_pg_table[l4_table_offset(XEN_VIRT_START)],
1643 (ROOT_PAGETABLE_FIRST_XEN_SLOT + slots -
1644 l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t));
1645 }
1646 }
1647
1648 bool fill_ro_mpt(mfn_t mfn)
1649 {
1650 l4_pgentry_t *l4tab = map_domain_page(mfn);
1651 bool ret = false;
1652
1653 if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
1654 {
1655 l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
1656 idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
1657 ret = true;
1658 }
1659 unmap_domain_page(l4tab);
1660
1661 return ret;
1662 }
1663
1664 void zap_ro_mpt(mfn_t mfn)
1665 {
1666 l4_pgentry_t *l4tab = map_domain_page(mfn);
1667
1668 l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
1669 unmap_domain_page(l4tab);
1670 }
1671
1672 static int alloc_l4_table(struct page_info *page)
1673 {
1674 struct domain *d = page_get_owner(page);
1675 unsigned long pfn = mfn_x(page_to_mfn(page));
1676 l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
1677 unsigned int i;
1678 int rc = 0, partial = page->partial_pte;
1679
1680 for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1681 i++, partial = 0 )
1682 {
1683 if ( !is_guest_l4_slot(d, i) ||
1684 (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
1685 continue;
1686
1687 if ( rc == -ERESTART )
1688 {
1689 page->nr_validated_ptes = i;
1690 page->partial_pte = partial ?: 1;
1691 }
1692 else if ( rc < 0 )
1693 {
1694 if ( rc != -EINTR )
1695 gdprintk(XENLOG_WARNING,
1696 "Failure in alloc_l4_table: slot %#x\n", i);
1697 if ( i )
1698 {
1699 page->nr_validated_ptes = i;
1700 page->partial_pte = 0;
1701 if ( rc == -EINTR )
1702 rc = -ERESTART;
1703 else
1704 {
1705 if ( current->arch.old_guest_table )
1706 page->nr_validated_ptes++;
1707 current->arch.old_guest_ptpg = NULL;
1708 current->arch.old_guest_table = page;
1709 }
1710 }
1711 }
1712 if ( rc < 0 )
1713 {
1714 unmap_domain_page(pl4e);
1715 return rc;
1716 }
1717
1718 pl4e[i] = adjust_guest_l4e(pl4e[i], d);
1719 }
1720
1721 if ( rc >= 0 )
1722 {
1723 init_xen_l4_slots(pl4e, _mfn(pfn),
1724 d, INVALID_MFN, VM_ASSIST(d, m2p_strict));
1725 atomic_inc(&d->arch.pv_domain.nr_l4_pages);
1726 rc = 0;
1727 }
1728 unmap_domain_page(pl4e);
1729
1730 return rc;
1731 }
1732
1733 static void free_l1_table(struct page_info *page)
1734 {
1735 struct domain *d = page_get_owner(page);
1736 l1_pgentry_t *pl1e;
1737 unsigned int i;
1738
1739 pl1e = __map_domain_page(page);
1740
1741 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1742 put_page_from_l1e(pl1e[i], d);
1743
1744 unmap_domain_page(pl1e);
1745 }
1746
1747
1748 static int free_l2_table(struct page_info *page, int preemptible)
1749 {
1750 struct domain *d = page_get_owner(page);
1751 unsigned long pfn = mfn_x(page_to_mfn(page));
1752 l2_pgentry_t *pl2e;
1753 unsigned int i = page->nr_validated_ptes - 1;
1754 int err = 0;
1755
1756 pl2e = map_domain_page(_mfn(pfn));
1757
1758 ASSERT(page->nr_validated_ptes);
1759 do {
1760 if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1761 put_page_from_l2e(pl2e[i], pfn) == 0 &&
1762 preemptible && i && hypercall_preempt_check() )
1763 {
1764 page->nr_validated_ptes = i;
1765 err = -ERESTART;
1766 }
1767 } while ( !err && i-- );
1768
1769 unmap_domain_page(pl2e);
1770
1771 if ( !err )
1772 page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1773
1774 return err;
1775 }
1776
1777 static int free_l3_table(struct page_info *page)
1778 {
1779 struct domain *d = page_get_owner(page);
1780 unsigned long pfn = mfn_x(page_to_mfn(page));
1781 l3_pgentry_t *pl3e;
1782 int rc = 0, partial = page->partial_pte;
1783 unsigned int i = page->nr_validated_ptes - !partial;
1784
1785 pl3e = map_domain_page(_mfn(pfn));
1786
1787 do {
1788 rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
1789 if ( rc < 0 )
1790 break;
1791 partial = 0;
1792 if ( rc > 0 )
1793 continue;
1794 pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
1795 } while ( i-- );
1796
1797 unmap_domain_page(pl3e);
1798
1799 if ( rc == -ERESTART )
1800 {
1801 page->nr_validated_ptes = i;
1802 page->partial_pte = partial ?: -1;
1803 }
1804 else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1805 {
1806 page->nr_validated_ptes = i + 1;
1807 page->partial_pte = 0;
1808 rc = -ERESTART;
1809 }
1810 return rc > 0 ? 0 : rc;
1811 }
1812
1813 static int free_l4_table(struct page_info *page)
1814 {
1815 struct domain *d = page_get_owner(page);
1816 unsigned long pfn = mfn_x(page_to_mfn(page));
1817 l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
1818 int rc = 0, partial = page->partial_pte;
1819 unsigned int i = page->nr_validated_ptes - !partial;
1820
1821 do {
1822 if ( is_guest_l4_slot(d, i) )
1823 rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
1824 if ( rc < 0 )
1825 break;
1826 partial = 0;
1827 } while ( i-- );
1828
1829 if ( rc == -ERESTART )
1830 {
1831 page->nr_validated_ptes = i;
1832 page->partial_pte = partial ?: -1;
1833 }
1834 else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1835 {
1836 page->nr_validated_ptes = i + 1;
1837 page->partial_pte = 0;
1838 rc = -ERESTART;
1839 }
1840
1841 unmap_domain_page(pl4e);
1842
1843 if ( rc >= 0 )
1844 {
1845 atomic_dec(&d->arch.pv_domain.nr_l4_pages);
1846 rc = 0;
1847 }
1848
1849 return rc;
1850 }
1851
1852 int page_lock(struct page_info *page)
1853 {
1854 unsigned long x, nx;
1855
1856 do {
1857 while ( (x = page->u.inuse.type_info) & PGT_locked )
1858 cpu_relax();
1859 nx = x + (1 | PGT_locked);
1860 if ( !(x & PGT_validated) ||
1861 !(x & PGT_count_mask) ||
1862 !(nx & PGT_count_mask) )
1863 return 0;
1864 } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1865
1866 return 1;
1867 }
1868
1869 void page_unlock(struct page_info *page)
1870 {
1871 unsigned long x, nx, y = page->u.inuse.type_info;
1872
1873 do {
1874 x = y;
1875 ASSERT((x & PGT_count_mask) && (x & PGT_locked));
1876
1877 nx = x - (1 | PGT_locked);
1878 /* We must not drop the last reference here. */
1879 ASSERT(nx & PGT_count_mask);
1880 } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1881 }
1882
1883 /*
1884 * PTE flags that a guest may change without re-validating the PTE.
1885 * All other bits affect translation, caching, or Xen's safety.
1886 */
1887 #define FASTPATH_FLAG_WHITELIST \
1888 (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \
1889 _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER)
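/*
 * For example, a guest that merely sets or clears _PAGE_ACCESSED and
 * _PAGE_DIRTY on an otherwise identical, present PTE is handled by the
 * "fast path" checks in mod_l[1-4]_entry() below, avoiding a full
 * get_page_from_l*e() revalidation of the referenced frame.
 */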
1890
1891 /* Update the L1 entry at pl1e to new value nl1e. */
1892 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1893 unsigned long gl1mfn, int preserve_ad,
1894 struct vcpu *pt_vcpu, struct domain *pg_dom)
1895 {
1896 l1_pgentry_t ol1e;
1897 struct domain *pt_dom = pt_vcpu->domain;
1898 int rc = 0;
1899
1900 if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1901 return -EFAULT;
1902
1903 ASSERT(!paging_mode_refcounts(pt_dom));
1904
1905 if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1906 {
1907 struct page_info *page = NULL;
1908
1909 if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
1910 {
1911 gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
1912 l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
1913 return -EINVAL;
1914 }
1915
1916 /* Translate foreign guest address. */
1917 if ( paging_mode_translate(pg_dom) )
1918 {
1919 p2m_type_t p2mt;
1920 p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ?
1921 P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC;
1922
1923 page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), &p2mt, q);
1924
1925 if ( p2m_is_paged(p2mt) )
1926 {
1927 if ( page )
1928 put_page(page);
1929 p2m_mem_paging_populate(pg_dom, l1e_get_pfn(nl1e));
1930 return -ENOENT;
1931 }
1932
1933 if ( p2mt == p2m_ram_paging_in && !page )
1934 return -ENOENT;
1935
1936 /* Did our attempt to unshare fail? */
1937 if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
1938 {
1939 /* We could not have obtained a page ref. */
1940 ASSERT(!page);
1941 /* And mem_sharing_notify has already been called. */
1942 return -ENOMEM;
1943 }
1944
1945 if ( !page )
1946 return -EINVAL;
1947 nl1e = l1e_from_page(page, l1e_get_flags(nl1e));
1948 }
1949
1950 /* Fast path for sufficiently-similar mappings. */
1951 if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) )
1952 {
1953 nl1e = adjust_guest_l1e(nl1e, pt_dom);
1954 rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1955 preserve_ad);
1956 if ( page )
1957 put_page(page);
1958 return rc ? 0 : -EBUSY;
1959 }
1960
1961 switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom) )
1962 {
1963 default:
1964 if ( page )
1965 put_page(page);
1966 return rc;
1967 case 0:
1968 break;
1969 case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
1970 ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
1971 l1e_flip_flags(nl1e, rc);
1972 rc = 0;
1973 break;
1974 }
1975 if ( page )
1976 put_page(page);
1977
1978 nl1e = adjust_guest_l1e(nl1e, pt_dom);
1979 if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1980 preserve_ad)) )
1981 {
1982 ol1e = nl1e;
1983 rc = -EBUSY;
1984 }
1985 }
1986 else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1987 preserve_ad)) )
1988 {
1989 return -EBUSY;
1990 }
1991
1992 put_page_from_l1e(ol1e, pt_dom);
1993 return rc;
1994 }
1995
1996
1997 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1998 static int mod_l2_entry(l2_pgentry_t *pl2e,
1999 l2_pgentry_t nl2e,
2000 unsigned long pfn,
2001 int preserve_ad,
2002 struct vcpu *vcpu)
2003 {
2004 l2_pgentry_t ol2e;
2005 struct domain *d = vcpu->domain;
2006 struct page_info *l2pg = mfn_to_page(_mfn(pfn));
2007 unsigned long type = l2pg->u.inuse.type_info;
2008 int rc = 0;
2009
2010 if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
2011 {
2012 gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n",
2013 pgentry_ptr_to_slot(pl2e));
2014 return -EPERM;
2015 }
2016
2017 if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
2018 return -EFAULT;
2019
2020 if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
2021 {
2022 if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
2023 {
2024 gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
2025 l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
2026 return -EINVAL;
2027 }
2028
2029 /* Fast path for sufficiently-similar mappings. */
2030 if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_FLAG_WHITELIST) )
2031 {
2032 nl2e = adjust_guest_l2e(nl2e, d);
2033 if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) )
2034 return 0;
2035 return -EBUSY;
2036 }
2037
2038 if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
2039 return rc;
2040
2041 nl2e = adjust_guest_l2e(nl2e, d);
2042 if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
2043 preserve_ad)) )
2044 {
2045 ol2e = nl2e;
2046 rc = -EBUSY;
2047 }
2048 }
2049 else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
2050 preserve_ad)) )
2051 {
2052 return -EBUSY;
2053 }
2054
2055 put_page_from_l2e(ol2e, pfn);
2056 return rc;
2057 }
2058
2059 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
2060 static int mod_l3_entry(l3_pgentry_t *pl3e,
2061 l3_pgentry_t nl3e,
2062 unsigned long pfn,
2063 int preserve_ad,
2064 struct vcpu *vcpu)
2065 {
2066 l3_pgentry_t ol3e;
2067 struct domain *d = vcpu->domain;
2068 int rc = 0;
2069
2070 /*
2071 * Disallow updates to final L3 slot. It contains Xen mappings, and it
2072 * would be a pain to ensure they remain continuously valid throughout.
2073 */
2074 if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
2075 return -EINVAL;
2076
2077 if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
2078 return -EFAULT;
2079
2080 if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
2081 {
2082 if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
2083 {
2084 gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
2085 l3e_get_flags(nl3e) & l3_disallow_mask(d));
2086 return -EINVAL;
2087 }
2088
2089 /* Fast path for sufficiently-similar mappings. */
2090 if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_FLAG_WHITELIST) )
2091 {
2092 nl3e = adjust_guest_l3e(nl3e, d);
2093 rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
2094 return rc ? 0 : -EFAULT;
2095 }
2096
2097 rc = get_page_from_l3e(nl3e, pfn, d, 0);
2098 if ( unlikely(rc < 0) )
2099 return rc;
2100 rc = 0;
2101
2102 nl3e = adjust_guest_l3e(nl3e, d);
2103 if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
2104 preserve_ad)) )
2105 {
2106 ol3e = nl3e;
2107 rc = -EFAULT;
2108 }
2109 }
2110 else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
2111 preserve_ad)) )
2112 {
2113 return -EFAULT;
2114 }
2115
2116 if ( likely(rc == 0) )
2117 if ( !create_pae_xen_mappings(d, pl3e) )
2118 BUG();
2119
2120 put_page_from_l3e(ol3e, pfn, 0, 1);
2121 return rc;
2122 }
2123
2124 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
2125 static int mod_l4_entry(l4_pgentry_t *pl4e,
2126 l4_pgentry_t nl4e,
2127 unsigned long pfn,
2128 int preserve_ad,
2129 struct vcpu *vcpu)
2130 {
2131 struct domain *d = vcpu->domain;
2132 l4_pgentry_t ol4e;
2133 int rc = 0;
2134
2135 if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
2136 {
2137 gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n",
2138 pgentry_ptr_to_slot(pl4e));
2139 return -EINVAL;
2140 }
2141
2142 if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
2143 return -EFAULT;
2144
2145 if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
2146 {
2147 if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
2148 {
2149 gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
2150 l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
2151 return -EINVAL;
2152 }
2153
2154 /* Fast path for sufficiently-similar mappings. */
2155 if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_FLAG_WHITELIST) )
2156 {
2157 nl4e = adjust_guest_l4e(nl4e, d);
2158 rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
2159 return rc ? 0 : -EFAULT;
2160 }
2161
2162 rc = get_page_from_l4e(nl4e, pfn, d, 0);
2163 if ( unlikely(rc < 0) )
2164 return rc;
2165 rc = 0;
2166
2167 nl4e = adjust_guest_l4e(nl4e, d);
2168 if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2169 preserve_ad)) )
2170 {
2171 ol4e = nl4e;
2172 rc = -EFAULT;
2173 }
2174 }
2175 else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2176 preserve_ad)) )
2177 {
2178 return -EFAULT;
2179 }
2180
2181 put_page_from_l4e(ol4e, pfn, 0, 1);
2182 return rc;
2183 }
2184
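/*
 * Clear any non-default cache attribute recorded for this page in its
 * count_info, and update Xen's own mapping of the frame accordingly via
 * update_xen_mappings(). Invoked from put_page() once the last general
 * reference has been dropped.
 */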
2185 static int cleanup_page_cacheattr(struct page_info *page)
2186 {
2187 unsigned int cacheattr =
2188 (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2189
2190 if ( likely(cacheattr == 0) )
2191 return 0;
2192
2193 page->count_info &= ~PGC_cacheattr_mask;
2194
2195 BUG_ON(is_xen_heap_page(page));
2196
2197 return update_xen_mappings(mfn_x(page_to_mfn(page)), 0);
2198 }
2199
2200 void put_page(struct page_info *page)
2201 {
2202 unsigned long nx, x, y = page->count_info;
2203
2204 do {
2205 ASSERT((y & PGC_count_mask) != 0);
2206 x = y;
2207 nx = x - 1;
2208 }
2209 while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
2210
2211 if ( unlikely((nx & PGC_count_mask) == 0) )
2212 {
2213 if ( cleanup_page_cacheattr(page) == 0 )
2214 free_domheap_page(page);
2215 else
2216 gdprintk(XENLOG_WARNING,
2217 "Leaking mfn %" PRI_mfn "\n", mfn_x(page_to_mfn(page)));
2218 }
2219 }
2220
2221
2222 struct domain *page_get_owner_and_reference(struct page_info *page)
2223 {
2224 unsigned long x, y = page->count_info;
2225 struct domain *owner;
2226
2227 do {
2228 x = y;
2229 /*
2230 * Count == 0: Page is not allocated, so we cannot take a reference.
2231 * Count == -1: Reference count would wrap, which is invalid.
2232 * Count == -2: Remaining unused ref is reserved for get_page_light().
2233 */
2234 if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
2235 return NULL;
2236 }
2237 while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
2238
2239 owner = page_get_owner(page);
2240 ASSERT(owner);
2241
2242 return owner;
2243 }
2244
2245
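/*
 * Take a general reference on a page, but only if it is owned by @domain;
 * otherwise any reference taken is dropped again and 0 is returned.
 */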
2246 int get_page(struct page_info *page, struct domain *domain)
2247 {
2248 struct domain *owner = page_get_owner_and_reference(page);
2249
2250 if ( likely(owner == domain) )
2251 return 1;
2252
2253 if ( !paging_mode_refcounts(domain) && !domain->is_dying )
2254 gprintk(XENLOG_INFO,
2255 "Error mfn %"PRI_mfn": rd=%d od=%d caf=%08lx taf=%" PRtype_info "\n",
2256 mfn_x(page_to_mfn(page)), domain->domain_id,
2257 owner ? owner->domain_id : DOMID_INVALID,
2258 page->count_info - !!owner, page->u.inuse.type_info);
2259
2260 if ( owner )
2261 put_page(page);
2262
2263 return 0;
2264 }
2265
2266 /*
2267 * Special version of get_page() to be used exclusively when
2268 * - a page is known to already have a non-zero reference count
2269 * - the page does not need its owner to be checked
2270 * - it will not be called more than once without dropping the thus
2271 * acquired reference again.
2272 * Due to get_page() reserving one reference, this call cannot fail.
2273 */
2274 static void get_page_light(struct page_info *page)
2275 {
2276 unsigned long x, nx, y = page->count_info;
2277
2278 do {
2279 x = y;
2280 nx = x + 1;
2281 BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2282 BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2283 y = cmpxchg(&page->count_info, x, nx);
2284 }
2285 while ( unlikely(y != x) );
2286 }
2287
2288 static int alloc_page_type(struct page_info *page, unsigned long type,
2289 int preemptible)
2290 {
2291 struct domain *owner = page_get_owner(page);
2292 int rc;
2293
2294 /* A page table is dirtied when its type count becomes non-zero. */
2295 if ( likely(owner != NULL) )
2296 paging_mark_dirty(owner, page_to_mfn(page));
2297
2298 switch ( type & PGT_type_mask )
2299 {
2300 case PGT_l1_page_table:
2301 rc = alloc_l1_table(page);
2302 break;
2303 case PGT_l2_page_table:
2304 rc = alloc_l2_table(page, type, preemptible);
2305 break;
2306 case PGT_l3_page_table:
2307 ASSERT(preemptible);
2308 rc = alloc_l3_table(page);
2309 break;
2310 case PGT_l4_page_table:
2311 ASSERT(preemptible);
2312 rc = alloc_l4_table(page);
2313 break;
2314 case PGT_seg_desc_page:
2315 rc = alloc_segdesc_page(page);
2316 break;
2317 default:
2318 printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2319 type, page->u.inuse.type_info,
2320 page->count_info);
2321 rc = -EINVAL;
2322 BUG();
2323 }
2324
2325     /* No need for atomic update of type_info here: no one else updates it. */
2326 smp_wmb();
2327 switch ( rc )
2328 {
2329 case 0:
2330 page->u.inuse.type_info |= PGT_validated;
2331 break;
2332 case -EINTR:
2333 ASSERT((page->u.inuse.type_info &
2334 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2335 page->u.inuse.type_info &= ~PGT_count_mask;
2336 break;
2337 default:
2338 ASSERT(rc < 0);
2339 gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn
2340 " (pfn %" PRI_pfn ") for type %" PRtype_info
2341 ": caf=%08lx taf=%" PRtype_info "\n",
2342 mfn_x(page_to_mfn(page)),
2343 get_gpfn_from_mfn(mfn_x(page_to_mfn(page))),
2344 type, page->count_info, page->u.inuse.type_info);
2345 if ( page != current->arch.old_guest_table )
2346 page->u.inuse.type_info = 0;
2347 else
2348 {
2349 ASSERT((page->u.inuse.type_info &
2350 (PGT_count_mask | PGT_validated)) == 1);
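            /*
             * Both a preempted validation (-ERESTART) and a failed one whose
             * page is still referenced as old_guest_table keep a light page
             * reference and mark the page PGT_partial, so that teardown can
             * be picked up and completed later.
             */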
2351 case -ERESTART:
2352 get_page_light(page);
2353 page->u.inuse.type_info |= PGT_partial;
2354 }
2355 break;
2356 }
2357
2358 return rc;
2359 }
2360
2361
2362 int free_page_type(struct page_info *page, unsigned long type,
2363 int preemptible)
2364 {
2365 struct domain *owner = page_get_owner(page);
2366 unsigned long gmfn;
2367 int rc;
2368
2369 if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2370 {
2371 /* A page table is dirtied when its type count becomes zero. */
2372 paging_mark_dirty(owner, page_to_mfn(page));
2373
2374 ASSERT(!shadow_mode_refcounts(owner));
2375
2376 gmfn = mfn_to_gmfn(owner, mfn_x(page_to_mfn(page)));
2377 ASSERT(VALID_M2P(gmfn));
2378 /* Page sharing not supported for shadowed domains */
2379         if ( !SHARED_M2P(gmfn) )
2380 shadow_remove_all_shadows(owner, _mfn(gmfn));
2381 }
2382
2383 if ( !(type & PGT_partial) )
2384 {
2385 page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2386 page->partial_pte = 0;
2387 }
2388
2389 switch ( type & PGT_type_mask )
2390 {
2391 case PGT_l1_page_table:
2392 free_l1_table(page);
2393 rc = 0;
2394 break;
2395 case PGT_l2_page_table:
2396 rc = free_l2_table(page, preemptible);
2397 break;
2398 case PGT_l3_page_table:
2399 ASSERT(preemptible);
2400 rc = free_l3_table(page);
2401 break;
2402 case PGT_l4_page_table:
2403 ASSERT(preemptible);
2404 rc = free_l4_table(page);
2405 break;
2406 default:
2407 gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
2408 type, mfn_x(page_to_mfn(page)));
2409 rc = -EINVAL;
2410 BUG();
2411 }
2412
2413 return rc;
2414 }
2415
2416
2417 static int _put_final_page_type(struct page_info *page, unsigned long type,
2418 bool preemptible, struct page_info *ptpg)
2419 {
2420 int rc = free_page_type(page, type, preemptible);
2421
2422     /* No need for atomic update of type_info here: no one else updates it. */
2423 if ( rc == 0 )
2424 {
2425 if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
2426 {
2427 dec_linear_uses(page);
2428 dec_linear_entries(ptpg);
2429 }
2430 ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
2431 set_tlbflush_timestamp(page);
2432 smp_wmb();
2433 page->u.inuse.type_info--;
2434 }
2435 else if ( rc == -EINTR )
2436 {
2437 ASSERT((page->u.inuse.type_info &
2438 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2439 smp_wmb();
2440 page->u.inuse.type_info |= PGT_validated;
2441 }
2442 else
2443 {
2444 BUG_ON(rc != -ERESTART);
2445 smp_wmb();
2446 get_page_light(page);
2447 page->u.inuse.type_info |= PGT_partial;
2448 }
2449
2450 return rc;
2451 }
2452
2453
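/*
 * Drop one type reference. When the count reaches zero for a validated (or
 * partially validated) page-table page, the page gets devalidated via
 * _put_final_page_type(). ptpg, if non-NULL, identifies the page table that
 * held a linear ("recursive") mapping of this page, whose linear-use
 * accounting is adjusted here.
 */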
2454 static int _put_page_type(struct page_info *page, bool preemptible,
2455 struct page_info *ptpg)
2456 {
2457 unsigned long nx, x, y = page->u.inuse.type_info;
2458 int rc = 0;
2459
2460 for ( ; ; )
2461 {
2462 x = y;
2463 nx = x - 1;
2464
2465 ASSERT((x & PGT_count_mask) != 0);
2466
2467 if ( unlikely((nx & PGT_count_mask) == 0) )
2468 {
2469 if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2470 likely(nx & (PGT_validated|PGT_partial)) )
2471 {
2472 /*
2473 * Page-table pages must be unvalidated when count is zero. The
2474 * 'free' is safe because the refcnt is non-zero and validated
2475 * bit is clear => other ops will spin or fail.
2476 */
2477 nx = x & ~(PGT_validated|PGT_partial);
2478 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2479 x, nx)) != x) )
2480 continue;
2481                 /* We cleared the 'valid bit', so we do the cleanup. */
2482 rc = _put_final_page_type(page, x, preemptible, ptpg);
2483 ptpg = NULL;
2484 if ( x & PGT_partial )
2485 put_page(page);
2486 break;
2487 }
2488
2489 if ( !ptpg || !PGT_type_equal(x, ptpg->u.inuse.type_info) )
2490 {
2491 /*
2492 * set_tlbflush_timestamp() accesses the same union
2493 * linear_pt_count lives in. Pages (including page table ones),
2494 * however, don't need their flush time stamp set except when
2495 * the last reference is being dropped. For page table pages
2496 * this happens in _put_final_page_type().
2497 */
2498 set_tlbflush_timestamp(page);
2499 }
2500 else
2501 BUG_ON(!IS_ENABLED(CONFIG_PV_LINEAR_PT));
2502 }
2503 else if ( unlikely((nx & (PGT_locked | PGT_count_mask)) ==
2504 (PGT_locked | 1)) )
2505 {
2506 /*
2507 * We must not drop the second to last reference when the page is
2508 * locked, as page_unlock() doesn't do any cleanup of the type.
2509 */
2510 cpu_relax();
2511 y = page->u.inuse.type_info;
2512 continue;
2513 }
2514
2515 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2516 break;
2517
2518 if ( preemptible && hypercall_preempt_check() )
2519 return -EINTR;
2520 }
2521
2522 if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
2523 {
2524 ASSERT(!rc);
2525 dec_linear_uses(page);
2526 dec_linear_entries(ptpg);
2527 }
2528
2529 return rc;
2530 }
2531
2532
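/*
 * Acquire a type reference on a page, converting it to the requested type
 * if its type count was zero. A type change flushes stale TLB entries where
 * necessary and, for PV domains with an IOMMU in use, adds or removes the
 * IOMMU mapping as the page gains or loses PGT_writable_page.
 */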
2533 static int __get_page_type(struct page_info *page, unsigned long type,
2534 int preemptible)
2535 {
2536 unsigned long nx, x, y = page->u.inuse.type_info;
2537 int rc = 0, iommu_ret = 0;
2538
2539 ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2540 ASSERT(!in_irq());
2541
2542 for ( ; ; )
2543 {
2544 x = y;
2545 nx = x + 1;
2546 if ( unlikely((nx & PGT_count_mask) == 0) )
2547 {
2548 gdprintk(XENLOG_WARNING,
2549 "Type count overflow on mfn %"PRI_mfn"\n",
2550 mfn_x(page_to_mfn(page)));
2551 return -EINVAL;
2552 }
2553 else if ( unlikely((x & PGT_count_mask) == 0) )
2554 {
2555 struct domain *d = page_get_owner(page);
2556
2557 /*
2558 * Normally we should never let a page go from type count 0
2559 * to type count 1 when it is shadowed. One exception:
2560 * out-of-sync shadowed pages are allowed to become
2561 * writeable.
2562 */
2563 if ( d && shadow_mode_enabled(d)
2564 && (page->count_info & PGC_page_table)
2565 && !((page->shadow_flags & (1u<<29))
2566 && type == PGT_writable_page) )
2567 shadow_remove_all_shadows(d, page_to_mfn(page));
2568
2569 ASSERT(!(x & PGT_pae_xen_l2));
2570 if ( (x & PGT_type_mask) != type )
2571 {
2572 /*
2573                  * On a type change we check whether stale TLB entries need to
2574                  * be flushed. This may be unnecessary (e.g., the page was a
2575                  * GDT/LDT), but such circumstances should be very rare.
2576 */
2577 cpumask_t *mask = this_cpu(scratch_cpumask);
2578
2579 BUG_ON(in_irq());
2580 cpumask_copy(mask, d->domain_dirty_cpumask);
2581
2582 /* Don't flush if the timestamp is old enough */
2583 tlbflush_filter(mask, page->tlbflush_timestamp);
2584
2585 if ( unlikely(!cpumask_empty(mask)) &&
2586 /* Shadow mode: track only writable pages. */
2587 (!shadow_mode_enabled(page_get_owner(page)) ||
2588 ((nx & PGT_type_mask) == PGT_writable_page)) )
2589 {
2590 perfc_incr(need_flush_tlb_flush);
2591 flush_tlb_mask(mask);
2592 }
2593
2594 /* We lose existing type and validity. */
2595 nx &= ~(PGT_type_mask | PGT_validated);
2596 nx |= type;
2597
2598 /*
2599 * No special validation needed for writable pages.
2600 * Page tables and GDT/LDT need to be scanned for validity.
2601 */
2602 if ( type == PGT_writable_page || type == PGT_shared_page )
2603 nx |= PGT_validated;
2604 }
2605 }
2606 else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2607 {
2608 /* Don't log failure if it could be a recursive-mapping attempt. */
2609 if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2610 (type == PGT_l1_page_table) )
2611 return -EINVAL;
2612 if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2613 (type == PGT_l2_page_table) )
2614 return -EINVAL;
2615 if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2616 (type == PGT_l3_page_table) )
2617 return -EINVAL;
2618 gdprintk(XENLOG_WARNING,
2619 "Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2620 "for mfn %" PRI_mfn " (pfn %" PRI_pfn ")\n",
2621 x, type, mfn_x(page_to_mfn(page)),
2622 get_gpfn_from_mfn(mfn_x(page_to_mfn(page))));
2623 return -EINVAL;
2624 }
2625 else if ( unlikely(!(x & PGT_validated)) )
2626 {
2627 if ( !(x & PGT_partial) )
2628 {
2629 /* Someone else is updating validation of this page. Wait... */
2630 while ( (y = page->u.inuse.type_info) == x )
2631 {
2632 if ( preemptible && hypercall_preempt_check() )
2633 return -EINTR;
2634 cpu_relax();
2635 }
2636 continue;
2637 }
2638 /* Type ref count was left at 1 when PGT_partial got set. */
2639 ASSERT((x & PGT_count_mask) == 1);
2640 nx = x & ~PGT_partial;
2641 }
2642
2643 if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2644 break;
2645
2646 if ( preemptible && hypercall_preempt_check() )
2647 return -EINTR;
2648 }
2649
2650 if ( unlikely((x & PGT_type_mask) != type) )
2651 {
2652 /* Special pages should not be accessible from devices. */
2653 struct domain *d = page_get_owner(page);
2654 if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )
2655 {
2656 gfn_t gfn = _gfn(mfn_to_gmfn(d, mfn_x(page_to_mfn(page))));
2657
2658 if ( (x & PGT_type_mask) == PGT_writable_page )
2659 iommu_ret = iommu_unmap_page(d, gfn_x(gfn));
2660 else if ( type == PGT_writable_page )
2661 iommu_ret = iommu_map_page(d, gfn_x(gfn),
2662 mfn_x(page_to_mfn(page)),
2663 IOMMUF_readable|IOMMUF_writable);
2664 }
2665 }
2666
2667 if ( unlikely(!(nx & PGT_validated)) )
2668 {
2669 if ( !(x & PGT_partial) )
2670 {
2671 page->nr_validated_ptes = 0;
2672 page->partial_pte = 0;
2673 }
2674 page->linear_pt_count = 0;
2675 rc = alloc_page_type(page, type, preemptible);
2676 }
2677
2678 if ( (x & PGT_partial) && !(nx & PGT_partial) )
2679 put_page(page);
2680
2681 if ( !rc )
2682 rc = iommu_ret;
2683
2684 return rc;
2685 }
2686
2687 void put_page_type(struct page_info *page)
2688 {
2689 int rc = _put_page_type(page, false, NULL);
2690 ASSERT(rc == 0);
2691 (void)rc;
2692 }
2693
2694 int get_page_type(struct page_info *page, unsigned long type)
2695 {
2696 int rc = __get_page_type(page, type, 0);
2697 if ( likely(rc == 0) )
2698 return 1;
2699 ASSERT(rc != -EINTR && rc != -ERESTART);
2700 return 0;
2701 }
2702
2703 int put_page_type_preemptible(struct page_info *page)
2704 {
2705 return _put_page_type(page, true, NULL);
2706 }
2707
2708 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2709 {
2710 ASSERT(!current->arch.old_guest_table);
2711 return __get_page_type(page, type, 1);
2712 }
2713
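/*
 * Finish dropping the type (and, on success, the general) reference on a
 * page-table page whose teardown was previously preempted. Returns
 * -ERESTART if the operation needs to be continued yet again.
 */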
2714 int put_old_guest_table(struct vcpu *v)
2715 {
2716 int rc;
2717
2718 if ( !v->arch.old_guest_table )
2719 return 0;
2720
2721 switch ( rc = _put_page_type(v->arch.old_guest_table, true,
2722 v->arch.old_guest_ptpg) )
2723 {
2724 case -EINTR:
2725 case -ERESTART:
2726 return -ERESTART;
2727 case 0:
2728 put_page(v->arch.old_guest_table);
2729 }
2730
2731 v->arch.old_guest_table = NULL;
2732
2733 return rc;
2734 }
2735
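/*
 * Drop the vCPU's references to its top-level page tables (guest_table and,
 * where applicable, guest_table_user), preemptibly. May return -ERESTART,
 * in which case the caller must retry.
 */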
2736 int vcpu_destroy_pagetables(struct vcpu *v)
2737 {
2738 unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
2739 struct page_info *page;
2740 l4_pgentry_t *l4tab = NULL;
2741 int rc = put_old_guest_table(v);
2742
2743 if ( rc )
2744 return rc;
2745
2746 if ( is_pv_32bit_vcpu(v) )
2747 {
2748 l4tab = map_domain_page(_mfn(mfn));
2749 mfn = l4e_get_pfn(*l4tab);
2750 }
2751
2752 if ( mfn )
2753 {
2754 page = mfn_to_page(_mfn(mfn));
2755 if ( paging_mode_refcounts(v->domain) )
2756 put_page(page);
2757 else
2758 rc = put_page_and_type_preemptible(page);
2759 }
2760
2761 if ( l4tab )
2762 {
2763 if ( !rc )
2764 l4e_write(l4tab, l4e_empty());
2765 unmap_domain_page(l4tab);
2766 }
2767 else if ( !rc )
2768 {
2769 v->arch.guest_table = pagetable_null();
2770
2771 /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
2772 mfn = pagetable_get_pfn(v->arch.guest_table_user);
2773 if ( mfn )
2774 {
2775 page = mfn_to_page(_mfn(mfn));
2776 if ( paging_mode_refcounts(v->domain) )
2777 put_page(page);
2778 else
2779 rc = put_page_and_type_preemptible(page);
2780 }
2781 if ( !rc )
2782 v->arch.guest_table_user = pagetable_null();
2783 }
2784
2785 v->arch.cr3 = 0;
2786
2787 /*
2788      * put_page_and_type_preemptible() is liable to return -EINTR. Our
2789      * callers expect -ERESTART, so convert it over.
2790 */
2791 return rc != -EINTR ? rc : -ERESTART;
2792 }
2793
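/*
 * Install a new guest CR3 (top-level page table) for the current vCPU. For
 * 32-bit PV guests the new base is slotted into the vCPU's L4 table via
 * mod_l4_entry(); for 64-bit PV guests the new base is validated and
 * referenced and the old one's references are dropped, possibly preemptibly.
 */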
2794 int new_guest_cr3(mfn_t mfn)
2795 {
2796 struct vcpu *curr = current;
2797 struct domain *d = curr->domain;
2798 int rc;
2799 mfn_t old_base_mfn;
2800
2801 if ( is_pv_32bit_domain(d) )
2802 {
2803 mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table);
2804 l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
2805
2806 rc = mod_l4_entry(pl4e,
2807 l4e_from_mfn(mfn,
2808 (_PAGE_PRESENT | _PAGE_RW |
2809 _PAGE_USER | _PAGE_ACCESSED)),
2810 mfn_x(gt_mfn), 0, curr);
2811 unmap_domain_page(pl4e);
2812 switch ( rc )
2813 {
2814 case 0:
2815 break;
2816 case -EINTR:
2817 case -ERESTART:
2818 return -ERESTART;
2819 default:
2820 gdprintk(XENLOG_WARNING,
2821 "Error while installing new compat baseptr %" PRI_mfn "\n",
2822 mfn_x(mfn));
2823 return rc;
2824 }
2825
2826 invalidate_shadow_ldt(curr, 0);
2827 write_ptbase(curr);
2828
2829 return 0;
2830 }
2831
2832 rc = put_old_guest_table(curr);
2833 if ( unlikely(rc) )
2834 return rc;
2835
2836 old_base_mfn = pagetable_get_mfn(curr->arch.guest_table);
2837 /*
2838 * This is particularly important when getting restarted after the
2839 * previous attempt got preempted in the put-old-MFN phase.
2840 */
2841 if ( mfn_eq(old_base_mfn, mfn) )
2842 {
2843 write_ptbase(curr);
2844 return 0;
2845 }
2846
2847 rc = paging_mode_refcounts(d)
2848 ? (get_page_from_mfn(mfn, d) ? 0 : -EINVAL)
2849 : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
2850 switch ( rc )
2851 {
2852 case 0:
2853 break;
2854 case -EINTR:
2855 case -ERESTART:
2856 return -ERESTART;
2857 default:
2858 gdprintk(XENLOG_WARNING,
2859 "Error while installing new baseptr %" PRI_mfn "\n",
2860 mfn_x(mfn));
2861 return rc;
2862 }
2863
2864 invalidate_shadow_ldt(curr, 0);
2865
2866 if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
2867 fill_ro_mpt(mfn);
2868 curr->arch.guest_table = pagetable_from_mfn(mfn);
2869 update_cr3(curr);
2870
2871 write_ptbase(curr);
2872
2873 if ( likely(mfn_x(old_base_mfn) != 0) )
2874 {
2875 struct page_info *page = mfn_to_page(old_base_mfn);
2876
2877 if ( paging_mode_refcounts(d) )
2878 put_page(page);
2879 else
2880 switch ( rc = put_page_and_type_preemptible(page) )
2881 {
2882 case -EINTR:
2883 rc = -ERESTART;
2884 /* fallthrough */
2885 case -ERESTART:
2886 curr->arch.old_guest_ptpg = NULL;
2887 curr->arch.old_guest_table = page;
2888 break;
2889 default:
2890 BUG_ON(rc);
2891 break;
2892 }
2893 }
2894
2895 return rc;
2896 }
2897
2898 static struct domain *get_pg_owner(domid_t domid)
2899 {
2900 struct domain *pg_owner = NULL, *curr = current->domain;
2901
2902 if ( likely(domid == DOMID_SELF) )
2903 {
2904 pg_owner = rcu_lock_current_domain();
2905 goto out;
2906 }
2907
2908 if ( unlikely(domid == curr->domain_id) )
2909 {
2910 gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
2911 goto out;
2912 }
2913
2914 switch ( domid )
2915 {
2916 case DOMID_IO:
2917 pg_owner = rcu_lock_domain(dom_io);
2918 break;
2919 case DOMID_XEN:
2920 pg_owner = rcu_lock_domain(dom_xen);
2921 break;
2922 default:
2923 if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2924 {
2925 gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
2926 break;
2927 }
2928 break;
2929 }
2930
2931 out:
2932 return pg_owner;
2933 }
2934
2935 static void put_pg_owner(struct domain *pg_owner)
2936 {
2937 rcu_unlock_domain(pg_owner);
2938 }
2939
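/*
 * Convert a guest-supplied vCPU bitmap into the set of physical CPUs those
 * vCPUs may have stale TLB state on, i.e. the union of their dirty CPU
 * masks.
 */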
2940 static inline int vcpumask_to_pcpumask(
2941 struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask)
2942 {
2943 unsigned int vcpu_id, vcpu_bias, offs;
2944 unsigned long vmask;
2945 struct vcpu *v;
2946 bool is_native = !is_pv_32bit_domain(d);
2947
2948 cpumask_clear(pmask);
2949 for ( vmask = 0, offs = 0; ; ++offs )
2950 {
2951 vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2952 if ( vcpu_bias >= d->max_vcpus )
2953 return 0;
2954
2955 if ( unlikely(is_native ?
2956 copy_from_guest_offset(&vmask, bmap, offs, 1) :
2957 copy_from_guest_offset((unsigned int *)&vmask, bmap,
2958 offs, 1)) )
2959 {
2960 cpumask_clear(pmask);
2961 return -EFAULT;
2962 }
2963
2964 while ( vmask )
2965 {
2966 vcpu_id = find_first_set_bit(vmask);
2967 vmask &= ~(1UL << vcpu_id);
2968 vcpu_id += vcpu_bias;
2969 if ( (vcpu_id >= d->max_vcpus) )
2970 return 0;
2971 if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2972 cpumask_or(pmask, pmask, v->vcpu_dirty_cpumask);
2973 }
2974 }
2975 }
2976
2977 long do_mmuext_op(
2978 XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
2979 unsigned int count,
2980 XEN_GUEST_HANDLE_PARAM(uint) pdone,
2981 unsigned int foreigndom)
2982 {
2983 struct mmuext_op op;
2984 unsigned long type;
2985 unsigned int i, done = 0;
2986 struct vcpu *curr = current;
2987 struct domain *currd = curr->domain;
2988 struct domain *pg_owner;
2989 int rc = put_old_guest_table(curr);
2990
2991 if ( unlikely(rc) )
2992 {
2993 if ( likely(rc == -ERESTART) )
2994 rc = hypercall_create_continuation(
2995 __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
2996 foreigndom);
2997 return rc;
2998 }
2999
3000 if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
3001 likely(guest_handle_is_null(uops)) )
3002 {
3003 /*
3004 * See the curr->arch.old_guest_table related
3005 * hypercall_create_continuation() below.
3006 */
3007 return (int)foreigndom;
3008 }
3009
3010 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3011 {
3012 count &= ~MMU_UPDATE_PREEMPTED;
3013 if ( unlikely(!guest_handle_is_null(pdone)) )
3014 (void)copy_from_guest(&done, pdone, 1);
3015 }
3016 else
3017 perfc_incr(calls_to_mmuext_op);
3018
3019 if ( unlikely(!guest_handle_okay(uops, count)) )
3020 return -EFAULT;
3021
3022 if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
3023 return -ESRCH;
3024
3025 if ( !is_pv_domain(pg_owner) )
3026 {
3027 put_pg_owner(pg_owner);
3028 return -EINVAL;
3029 }
3030
3031 rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner);
3032 if ( rc )
3033 {
3034 put_pg_owner(pg_owner);
3035 return rc;
3036 }
3037
3038 for ( i = 0; i < count; i++ )
3039 {
3040 if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
3041 {
3042 rc = -ERESTART;
3043 break;
3044 }
3045
3046 if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
3047 {
3048 rc = -EFAULT;
3049 break;
3050 }
3051
3052 if ( is_hvm_domain(currd) )
3053 {
3054 switch ( op.cmd )
3055 {
3056 case MMUEXT_PIN_L1_TABLE:
3057 case MMUEXT_PIN_L2_TABLE:
3058 case MMUEXT_PIN_L3_TABLE:
3059 case MMUEXT_PIN_L4_TABLE:
3060 case MMUEXT_UNPIN_TABLE:
3061 break;
3062 default:
3063 rc = -EOPNOTSUPP;
3064 goto done;
3065 }
3066 }
3067
3068 rc = 0;
3069
3070 switch ( op.cmd )
3071 {
3072 struct page_info *page;
3073 p2m_type_t p2mt;
3074
3075 case MMUEXT_PIN_L1_TABLE:
3076 type = PGT_l1_page_table;
3077 goto pin_page;
3078
3079 case MMUEXT_PIN_L2_TABLE:
3080 type = PGT_l2_page_table;
3081 goto pin_page;
3082
3083 case MMUEXT_PIN_L3_TABLE:
3084 type = PGT_l3_page_table;
3085 goto pin_page;
3086
3087 case MMUEXT_PIN_L4_TABLE:
3088 if ( is_pv_32bit_domain(pg_owner) )
3089 break;
3090 type = PGT_l4_page_table;
3091
3092 pin_page:
3093 /* Ignore pinning of invalid paging levels. */
3094 if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
3095 break;
3096
3097 if ( paging_mode_refcounts(pg_owner) )
3098 break;
3099
3100 page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
3101 if ( unlikely(!page) )
3102 {
3103 rc = -EINVAL;
3104 break;
3105 }
3106
3107 rc = get_page_type_preemptible(page, type);
3108 if ( unlikely(rc) )
3109 {
3110 if ( rc == -EINTR )
3111 rc = -ERESTART;
3112 else if ( rc != -ERESTART )
3113 gdprintk(XENLOG_WARNING,
3114 "Error %d while pinning mfn %" PRI_mfn "\n",
3115 rc, mfn_x(page_to_mfn(page)));
3116 if ( page != curr->arch.old_guest_table )
3117 put_page(page);
3118 break;
3119 }
3120
3121 rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page);
3122 if ( !rc && unlikely(test_and_set_bit(_PGT_pinned,
3123 &page->u.inuse.type_info)) )
3124 {
3125 gdprintk(XENLOG_WARNING,
3126 "mfn %" PRI_mfn " already pinned\n",
3127 mfn_x(page_to_mfn(page)));
3128 rc = -EINVAL;
3129 }
3130
3131 if ( unlikely(rc) )
3132 goto pin_drop;
3133
3134 /* A page is dirtied when its pin status is set. */
3135 paging_mark_dirty(pg_owner, page_to_mfn(page));
3136
3137 /* We can race domain destruction (domain_relinquish_resources). */
3138 if ( unlikely(pg_owner != currd) )
3139 {
3140 bool drop_ref;
3141
3142 spin_lock(&pg_owner->page_alloc_lock);
3143 drop_ref = (pg_owner->is_dying &&
3144 test_and_clear_bit(_PGT_pinned,
3145 &page->u.inuse.type_info));
3146 spin_unlock(&pg_owner->page_alloc_lock);
3147 if ( drop_ref )
3148 {
3149 pin_drop:
3150 if ( type == PGT_l1_page_table )
3151 put_page_and_type(page);
3152 else
3153 {
3154 curr->arch.old_guest_ptpg = NULL;
3155 curr->arch.old_guest_table = page;
3156 }
3157 }
3158 }
3159 break;
3160
3161 case MMUEXT_UNPIN_TABLE:
3162 if ( paging_mode_refcounts(pg_owner) )
3163 break;
3164
3165 page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
3166 if ( unlikely(!page) )
3167 {
3168 gdprintk(XENLOG_WARNING,
3169 "mfn %" PRI_mfn " bad, or bad owner d%d\n",
3170 op.arg1.mfn, pg_owner->domain_id);
3171 rc = -EINVAL;
3172 break;
3173 }
3174
3175 if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
3176 {
3177 put_page(page);
3178 gdprintk(XENLOG_WARNING,
3179 "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn);
3180 rc = -EINVAL;
3181 break;
3182 }
3183
3184 switch ( rc = put_page_and_type_preemptible(page) )
3185 {
3186 case -EINTR:
3187 case -ERESTART:
3188 curr->arch.old_guest_ptpg = NULL;
3189 curr->arch.old_guest_table = page;
3190 rc = 0;
3191 break;
3192 default:
3193 BUG_ON(rc);
3194 break;
3195 }
3196 put_page(page);
3197
3198 /* A page is dirtied when its pin status is cleared. */
3199 paging_mark_dirty(pg_owner, page_to_mfn(page));
3200 break;
3201
3202 case MMUEXT_NEW_BASEPTR:
3203 if ( unlikely(currd != pg_owner) )
3204 rc = -EPERM;
3205 else if ( unlikely(paging_mode_translate(currd)) )
3206 rc = -EINVAL;
3207 else
3208 rc = new_guest_cr3(_mfn(op.arg1.mfn));
3209 break;
3210
3211 case MMUEXT_NEW_USER_BASEPTR: {
3212 unsigned long old_mfn;
3213
3214 if ( unlikely(currd != pg_owner) )
3215 rc = -EPERM;
3216 else if ( unlikely(paging_mode_translate(currd)) )
3217 rc = -EINVAL;
3218 if ( unlikely(rc) )
3219 break;
3220
3221 old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
3222 /*
3223 * This is particularly important when getting restarted after the
3224 * previous attempt got preempted in the put-old-MFN phase.
3225 */
3226 if ( old_mfn == op.arg1.mfn )
3227 break;
3228
3229 if ( op.arg1.mfn != 0 )
3230 {
3231 rc = get_page_and_type_from_mfn(
3232 _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
3233
3234 if ( unlikely(rc) )
3235 {
3236 if ( rc == -EINTR )
3237 rc = -ERESTART;
3238 else if ( rc != -ERESTART )
3239 gdprintk(XENLOG_WARNING,
3240 "Error %d installing new mfn %" PRI_mfn "\n",
3241 rc, op.arg1.mfn);
3242 break;
3243 }
3244
3245 if ( VM_ASSIST(currd, m2p_strict) )
3246 zap_ro_mpt(_mfn(op.arg1.mfn));
3247 }
3248
3249 curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
3250
3251 if ( old_mfn != 0 )
3252 {
3253 page = mfn_to_page(_mfn(old_mfn));
3254
3255 switch ( rc = put_page_and_type_preemptible(page) )
3256 {
3257 case -EINTR:
3258 rc = -ERESTART;
3259 /* fallthrough */
3260 case -ERESTART:
3261 curr->arch.old_guest_ptpg = NULL;
3262 curr->arch.old_guest_table = page;
3263 break;
3264 default:
3265 BUG_ON(rc);
3266 break;
3267 }
3268 }
3269
3270 break;
3271 }
3272
3273 case MMUEXT_TLB_FLUSH_LOCAL:
3274 if ( likely(currd == pg_owner) )
3275 flush_tlb_local();
3276 else
3277 rc = -EPERM;
3278 break;
3279
3280 case MMUEXT_INVLPG_LOCAL:
3281 if ( unlikely(currd != pg_owner) )
3282 rc = -EPERM;
3283 else
3284 paging_invlpg(curr, op.arg1.linear_addr);
3285 break;
3286
3287 case MMUEXT_TLB_FLUSH_MULTI:
3288 case MMUEXT_INVLPG_MULTI:
3289 {
3290 cpumask_t *mask = this_cpu(scratch_cpumask);
3291
3292 if ( unlikely(currd != pg_owner) )
3293 rc = -EPERM;
3294 else if ( unlikely(vcpumask_to_pcpumask(currd,
3295 guest_handle_to_param(op.arg2.vcpumask,
3296 const_void),
3297 mask)) )
3298 rc = -EINVAL;
3299 if ( unlikely(rc) )
3300 break;
3301
3302 if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
3303 flush_tlb_mask(mask);
3304 else if ( __addr_ok(op.arg1.linear_addr) )
3305 flush_tlb_one_mask(mask, op.arg1.linear_addr);
3306 break;
3307 }
3308
3309 case MMUEXT_TLB_FLUSH_ALL:
3310 if ( likely(currd == pg_owner) )
3311 flush_tlb_mask(currd->domain_dirty_cpumask);
3312 else
3313 rc = -EPERM;
3314 break;
3315
3316 case MMUEXT_INVLPG_ALL:
3317 if ( unlikely(currd != pg_owner) )
3318 rc = -EPERM;
3319 else if ( __addr_ok(op.arg1.linear_addr) )
3320 flush_tlb_one_mask(currd->domain_dirty_cpumask,
3321 op.arg1.linear_addr);
3322 break;
3323
3324 case MMUEXT_FLUSH_CACHE:
3325 if ( unlikely(currd != pg_owner) )
3326 rc = -EPERM;
3327 else if ( unlikely(!cache_flush_permitted(currd)) )
3328 rc = -EACCES;
3329 else
3330 wbinvd();
3331 break;
3332
3333 case MMUEXT_FLUSH_CACHE_GLOBAL:
3334 if ( unlikely(currd != pg_owner) )
3335 rc = -EPERM;
3336 else if ( likely(cache_flush_permitted(currd)) )
3337 {
3338 unsigned int cpu;
3339 cpumask_t *mask = this_cpu(scratch_cpumask);
3340
3341 cpumask_clear(mask);
3342 for_each_online_cpu(cpu)
3343 if ( !cpumask_intersects(mask,
3344 per_cpu(cpu_sibling_mask, cpu)) )
3345 __cpumask_set_cpu(cpu, mask);
3346 flush_mask(mask, FLUSH_CACHE);
3347 }
3348 else
3349 rc = -EINVAL;
3350 break;
3351
3352 case MMUEXT_SET_LDT:
3353 {
3354 unsigned int ents = op.arg2.nr_ents;
3355 unsigned long ptr = ents ? op.arg1.linear_addr : 0;
3356
3357 if ( unlikely(currd != pg_owner) )
3358 rc = -EPERM;
3359 else if ( paging_mode_external(currd) )
3360 rc = -EINVAL;
3361 else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
3362 (ents > 8192) )
3363 {
3364 gdprintk(XENLOG_WARNING,
3365 "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents);
3366 rc = -EINVAL;
3367 }
3368 else if ( (curr->arch.pv_vcpu.ldt_ents != ents) ||
3369 (curr->arch.pv_vcpu.ldt_base != ptr) )
3370 {
3371 invalidate_shadow_ldt(curr, 0);
3372 flush_tlb_local();
3373 curr->arch.pv_vcpu.ldt_base = ptr;
3374 curr->arch.pv_vcpu.ldt_ents = ents;
3375 load_LDT(curr);
3376 }
3377 break;
3378 }
3379
3380 case MMUEXT_CLEAR_PAGE:
3381 page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC);
3382 if ( unlikely(p2mt != p2m_ram_rw) && page )
3383 {
3384 put_page(page);
3385 page = NULL;
3386 }
3387 if ( !page || !get_page_type(page, PGT_writable_page) )
3388 {
3389 if ( page )
3390 put_page(page);
3391 gdprintk(XENLOG_WARNING,
3392 "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn);
3393 rc = -EINVAL;
3394 break;
3395 }
3396
3397 /* A page is dirtied when it's being cleared. */
3398 paging_mark_dirty(pg_owner, page_to_mfn(page));
3399
3400 clear_domain_page(page_to_mfn(page));
3401
3402 put_page_and_type(page);
3403 break;
3404
3405 case MMUEXT_COPY_PAGE:
3406 {
3407 struct page_info *src_page, *dst_page;
3408
3409 src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt,
3410 P2M_ALLOC);
3411 if ( unlikely(p2mt != p2m_ram_rw) && src_page )
3412 {
3413 put_page(src_page);
3414 src_page = NULL;
3415 }
3416 if ( unlikely(!src_page) )
3417 {
3418 gdprintk(XENLOG_WARNING,
3419 "Error copying from mfn %" PRI_mfn "\n",
3420 op.arg2.src_mfn);
3421 rc = -EINVAL;
3422 break;
3423 }
3424
3425 dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt,
3426 P2M_ALLOC);
3427 if ( unlikely(p2mt != p2m_ram_rw) && dst_page )
3428 {
3429 put_page(dst_page);
3430 dst_page = NULL;
3431 }
3432 rc = (dst_page &&
3433 get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL;
3434 if ( unlikely(rc) )
3435 {
3436 put_page(src_page);
3437 if ( dst_page )
3438 put_page(dst_page);
3439 gdprintk(XENLOG_WARNING,
3440 "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn);
3441 break;
3442 }
3443
3444 /* A page is dirtied when it's being copied to. */
3445 paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
3446
3447 copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
3448
3449 put_page_and_type(dst_page);
3450 put_page(src_page);
3451 break;
3452 }
3453
3454 case MMUEXT_MARK_SUPER:
3455 case MMUEXT_UNMARK_SUPER:
3456 rc = -EOPNOTSUPP;
3457 break;
3458
3459 default:
3460 rc = -ENOSYS;
3461 break;
3462 }
3463
3464 done:
3465 if ( unlikely(rc) )
3466 break;
3467
3468 guest_handle_add_offset(uops, 1);
3469 }
3470
3471 if ( rc == -ERESTART )
3472 {
3473 ASSERT(i < count);
3474 rc = hypercall_create_continuation(
3475 __HYPERVISOR_mmuext_op, "hihi",
3476 uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3477 }
3478 else if ( curr->arch.old_guest_table )
3479 {
3480 XEN_GUEST_HANDLE_PARAM(void) null;
3481
3482 ASSERT(rc || i == count);
3483 set_xen_guest_handle(null, NULL);
3484 /*
3485 * In order to have a way to communicate the final return value to
3486 * our continuation, we pass this in place of "foreigndom", building
3487 * on the fact that this argument isn't needed anymore.
3488 */
3489 rc = hypercall_create_continuation(
3490 __HYPERVISOR_mmuext_op, "hihi", null,
3491 MMU_UPDATE_PREEMPTED, null, rc);
3492 }
3493
3494 put_pg_owner(pg_owner);
3495
3496 perfc_add(num_mmuext_ops, i);
3497
3498 /* Add incremental work we have done to the @done output parameter. */
3499 if ( unlikely(!guest_handle_is_null(pdone)) )
3500 {
3501 done += i;
3502 copy_to_guest(pdone, &done, 1);
3503 }
3504
3505 return rc;
3506 }
3507
3508 long do_mmu_update(
3509 XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs,
3510 unsigned int count,
3511 XEN_GUEST_HANDLE_PARAM(uint) pdone,
3512 unsigned int foreigndom)
3513 {
3514 struct mmu_update req;
3515 void *va = NULL;
3516 unsigned long gpfn, gmfn, mfn;
3517 struct page_info *page;
3518 unsigned int cmd, i = 0, done = 0, pt_dom;
3519 struct vcpu *curr = current, *v = curr;
3520 struct domain *d = v->domain, *pt_owner = d, *pg_owner;
3521 mfn_t map_mfn = INVALID_MFN;
3522 uint32_t xsm_needed = 0;
3523 uint32_t xsm_checked = 0;
3524 int rc = put_old_guest_table(curr);
3525
3526 if ( unlikely(rc) )
3527 {
3528 if ( likely(rc == -ERESTART) )
3529 rc = hypercall_create_continuation(
3530 __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
3531 foreigndom);
3532 return rc;
3533 }
3534
3535 if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
3536 likely(guest_handle_is_null(ureqs)) )
3537 {
3538 /*
3539 * See the curr->arch.old_guest_table related
3540 * hypercall_create_continuation() below.
3541 */
3542 return (int)foreigndom;
3543 }
3544
3545 if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3546 {
3547 count &= ~MMU_UPDATE_PREEMPTED;
3548 if ( unlikely(!guest_handle_is_null(pdone)) )
3549 (void)copy_from_guest(&done, pdone, 1);
3550 }
3551 else
3552 perfc_incr(calls_to_mmu_update);
3553
3554 if ( unlikely(!guest_handle_okay(ureqs, count)) )
3555 return -EFAULT;
3556
3557 if ( (pt_dom = foreigndom >> 16) != 0 )
3558 {
3559 /* Pagetables belong to a foreign domain (PFD). */
3560 if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
3561 return -ESRCH;
3562
3563 if ( pt_owner == d )
3564 rcu_unlock_domain(pt_owner);
3565 else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL )
3566 {
3567 rc = -EINVAL;
3568 goto out;
3569 }
3570 }
3571
3572 if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
3573 {
3574 rc = -ESRCH;
3575 goto out;
3576 }
3577
3578 for ( i = 0; i < count; i++ )
3579 {
3580 if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
3581 {
3582 rc = -ERESTART;
3583 break;
3584 }
3585
3586 if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3587 {
3588 rc = -EFAULT;
3589 break;
3590 }
3591
3592 cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3593
3594 switch ( cmd )
3595 {
3596 /*
3597 * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3598          * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in)
3599          * the current A/D bits.
3600 */
3601 case MMU_NORMAL_PT_UPDATE:
3602 case MMU_PT_UPDATE_PRESERVE_AD:
3603 {
3604 p2m_type_t p2mt;
3605
3606 rc = -EOPNOTSUPP;
3607 if ( unlikely(paging_mode_refcounts(pt_owner)) )
3608 break;
3609
3610 xsm_needed |= XSM_MMU_NORMAL_UPDATE;
3611 if ( get_pte_flags(req.val) & _PAGE_PRESENT )
3612 {
3613 xsm_needed |= XSM_MMU_UPDATE_READ;
3614 if ( get_pte_flags(req.val) & _PAGE_RW )
3615 xsm_needed |= XSM_MMU_UPDATE_WRITE;
3616 }
3617 if ( xsm_needed != xsm_checked )
3618 {
3619 rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed);
3620 if ( rc )
3621 break;
3622 xsm_checked = xsm_needed;
3623 }
3624 rc = -EINVAL;
3625
3626 req.ptr -= cmd;
3627 gmfn = req.ptr >> PAGE_SHIFT;
3628 page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC);
3629
3630 if ( p2m_is_paged(p2mt) )
3631 {
3632 ASSERT(!page);
3633 p2m_mem_paging_populate(pt_owner, gmfn);
3634 rc = -ENOENT;
3635 break;
3636 }
3637
3638 if ( unlikely(!page) )
3639 {
3640 gdprintk(XENLOG_WARNING,
3641 "Could not get page for normal update\n");
3642 break;
3643 }
3644
3645 mfn = mfn_x(page_to_mfn(page));
3646
3647 if ( !mfn_eq(_mfn(mfn), map_mfn) )
3648 {
3649 if ( va )
3650 unmap_domain_page(va);
3651 va = map_domain_page(_mfn(mfn));
3652 map_mfn = _mfn(mfn);
3653 }
3654 va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK));
3655
3656 if ( page_lock(page) )
3657 {
3658 switch ( page->u.inuse.type_info & PGT_type_mask )
3659 {
3660 case PGT_l1_page_table:
3661 rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn,
3662 cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
3663 pg_owner);
3664 break;
3665 case PGT_l2_page_table:
3666 rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
3667 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3668 break;
3669 case PGT_l3_page_table:
3670 rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
3671 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3672 break;
3673 case PGT_l4_page_table:
3674 rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
3675 cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3676 break;
3677 case PGT_writable_page:
3678 perfc_incr(writable_mmu_updates);
3679 if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
3680 rc = 0;
3681 break;
3682 }
3683 page_unlock(page);
3684 if ( rc == -EINTR )
3685 rc = -ERESTART;
3686 }
3687 else if ( get_page_type(page, PGT_writable_page) )
3688 {
3689 perfc_incr(writable_mmu_updates);
3690 if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
3691 rc = 0;
3692 put_page_type(page);
3693 }
3694
3695 put_page(page);
3696 }
3697 break;
3698
3699 case MMU_MACHPHYS_UPDATE:
3700 if ( unlikely(d != pt_owner) )
3701 {
3702 rc = -EPERM;
3703 break;
3704 }
3705
3706 if ( unlikely(paging_mode_translate(pg_owner)) )
3707 {
3708 rc = -EINVAL;
3709 break;
3710 }
3711
3712 mfn = req.ptr >> PAGE_SHIFT;
3713 gpfn = req.val;
3714
3715 xsm_needed |= XSM_MMU_MACHPHYS_UPDATE;
3716 if ( xsm_needed != xsm_checked )
3717 {
3718 rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed);
3719 if ( rc )
3720 break;
3721 xsm_checked = xsm_needed;
3722 }
3723
3724 if ( unlikely(!get_page_from_mfn(_mfn(mfn), pg_owner)) )
3725 {
3726 gdprintk(XENLOG_WARNING,
3727 "Could not get page for mach->phys update\n");
3728 rc = -EINVAL;
3729 break;
3730 }
3731
3732 set_gpfn_from_mfn(mfn, gpfn);
3733
3734 paging_mark_dirty(pg_owner, _mfn(mfn));
3735
3736 put_page(mfn_to_page(_mfn(mfn)));
3737 break;
3738
3739 default:
3740 rc = -ENOSYS;
3741 break;
3742 }
3743
3744 if ( unlikely(rc) )
3745 break;
3746
3747 guest_handle_add_offset(ureqs, 1);
3748 }
3749
3750 if ( rc == -ERESTART )
3751 {
3752 ASSERT(i < count);
3753 rc = hypercall_create_continuation(
3754 __HYPERVISOR_mmu_update, "hihi",
3755 ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3756 }
3757 else if ( curr->arch.old_guest_table )
3758 {
3759 XEN_GUEST_HANDLE_PARAM(void) null;
3760
3761 ASSERT(rc || i == count);
3762 set_xen_guest_handle(null, NULL);
3763 /*
3764 * In order to have a way to communicate the final return value to
3765 * our continuation, we pass this in place of "foreigndom", building
3766 * on the fact that this argument isn't needed anymore.
3767 */
3768 rc = hypercall_create_continuation(
3769 __HYPERVISOR_mmu_update, "hihi", null,
3770 MMU_UPDATE_PREEMPTED, null, rc);
3771 }
3772
3773 put_pg_owner(pg_owner);
3774
3775 if ( va )
3776 unmap_domain_page(va);
3777
3778 perfc_add(num_page_updates, i);
3779
3780 out:
3781 if ( pt_owner != d )
3782 rcu_unlock_domain(pt_owner);
3783
3784 /* Add incremental work we have done to the @done output parameter. */
3785 if ( unlikely(!guest_handle_is_null(pdone)) )
3786 {
3787 done += i;
3788 copy_to_guest(pdone, &done, 1);
3789 }
3790
3791 return rc;
3792 }
3793
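/*
 * Assign an as-yet unowned page to domain d, giving it a single
 * PGC_allocated reference. Fails if the page already has an owner or extra
 * references, if d is dying, or (unless MEMF_no_refcount is passed) if d is
 * already at its allocation limit.
 */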
3794 int donate_page(
3795 struct domain *d, struct page_info *page, unsigned int memflags)
3796 {
3797 const struct domain *owner = dom_xen;
3798
3799 spin_lock(&d->page_alloc_lock);
3800
3801 if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
3802 goto fail;
3803
3804 if ( d->is_dying )
3805 goto fail;
3806
3807 if ( page->count_info & ~(PGC_allocated | 1) )
3808 goto fail;
3809
3810 if ( !(memflags & MEMF_no_refcount) )
3811 {
3812 if ( d->tot_pages >= d->max_pages )
3813 goto fail;
3814 domain_adjust_tot_pages(d, 1);
3815 }
3816
3817 page->count_info = PGC_allocated | 1;
3818 page_set_owner(page, d);
3819     page_list_add_tail(page, &d->page_list);
3820
3821 spin_unlock(&d->page_alloc_lock);
3822 return 0;
3823
3824 fail:
3825 spin_unlock(&d->page_alloc_lock);
3826 gdprintk(XENLOG_WARNING, "Bad donate mfn %" PRI_mfn
3827 " to d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
3828 mfn_x(page_to_mfn(page)), d->domain_id,
3829 owner ? owner->domain_id : DOMID_INVALID,
3830 page->count_info, page->u.inuse.type_info);
3831 return -EINVAL;
3832 }
3833
3834 int steal_page(
3835 struct domain *d, struct page_info *page, unsigned int memflags)
3836 {
3837 unsigned long x, y;
3838 bool drop_dom_ref = false;
3839 const struct domain *owner = dom_xen;
3840
3841 if ( paging_mode_external(d) )
3842 return -EOPNOTSUPP;
3843
3844 spin_lock(&d->page_alloc_lock);
3845
3846 if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
3847 goto fail;
3848
3849 /*
3850      * We require the page to hold exactly one reference, held via
3851      * PGC_allocated. We temporarily drop it so we can safely swizzle the owner.
3852 */
3853 y = page->count_info;
3854 do {
3855 x = y;
3856 if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3857 goto fail;
3858 y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3859 } while ( y != x );
3860
3861 /*
3862 * With the sole reference dropped temporarily, no-one can update type
3863 * information. Type count also needs to be zero in this case, but e.g.
3864 * PGT_seg_desc_page may still have PGT_validated set, which we need to
3865 * clear before transferring ownership (as validation criteria vary
3866 * depending on domain type).
3867 */
3868 BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
3869 PGT_pinned));
3870 page->u.inuse.type_info = 0;
3871
3872 /* Swizzle the owner then reinstate the PGC_allocated reference. */
3873 page_set_owner(page, NULL);
3874 y = page->count_info;
3875 do {
3876 x = y;
3877 BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3878 } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3879
3880 /* Unlink from original owner. */
3881 if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
3882 drop_dom_ref = true;
3883 page_list_del(page, &d->page_list);
3884
3885 spin_unlock(&d->page_alloc_lock);
3886 if ( unlikely(drop_dom_ref) )
3887 put_domain(d);
3888 return 0;
3889
3890 fail:
3891 spin_unlock(&d->page_alloc_lock);
3892 gdprintk(XENLOG_WARNING, "Bad steal mfn %" PRI_mfn
3893 " from d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
3894 mfn_x(page_to_mfn(page)), d->domain_id,
3895 owner ? owner->domain_id : DOMID_INVALID,
3896 page->count_info, page->u.inuse.type_info);
3897 return -EINVAL;
3898 }
3899
3900 static int __do_update_va_mapping(
3901 unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
3902 {
3903 l1_pgentry_t val = l1e_from_intpte(val64);
3904 struct vcpu *v = current;
3905 struct domain *d = v->domain;
3906 struct page_info *gl1pg;
3907 l1_pgentry_t *pl1e;
3908 unsigned long bmap_ptr;
3909 mfn_t gl1mfn;
3910 cpumask_t *mask = NULL;
3911 int rc;
3912
3913 perfc_incr(calls_to_update_va);
3914
3915 rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val);
3916 if ( rc )
3917 return rc;
3918
3919 rc = -EINVAL;
3920 pl1e = map_guest_l1e(va, &gl1mfn);
3921 if ( unlikely(!pl1e || !get_page_from_mfn(gl1mfn, d)) )
3922 goto out;
3923
3924 gl1pg = mfn_to_page(gl1mfn);
3925 if ( !page_lock(gl1pg) )
3926 {
3927 put_page(gl1pg);
3928 goto out;
3929 }
3930
3931 if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3932 {
3933 page_unlock(gl1pg);
3934 put_page(gl1pg);
3935 goto out;
3936 }
3937
3938 rc = mod_l1_entry(pl1e, val, mfn_x(gl1mfn), 0, v, pg_owner);
3939
3940 page_unlock(gl1pg);
3941 put_page(gl1pg);
3942
3943 out:
3944 if ( pl1e )
3945 unmap_domain_page(pl1e);
3946
3947 switch ( flags & UVMF_FLUSHTYPE_MASK )
3948 {
3949 case UVMF_TLB_FLUSH:
3950 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3951 {
3952 case UVMF_LOCAL:
3953 flush_tlb_local();
3954 break;
3955 case UVMF_ALL:
3956 mask = d->domain_dirty_cpumask;
3957 break;
3958 default:
3959 mask = this_cpu(scratch_cpumask);
3960 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3961 void),
3962 mask);
3963 break;
3964 }
3965 if ( mask )
3966 flush_tlb_mask(mask);
3967 break;
3968
3969 case UVMF_INVLPG:
3970 switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3971 {
3972 case UVMF_LOCAL:
3973 paging_invlpg(v, va);
3974 break;
3975 case UVMF_ALL:
3976 mask = d->domain_dirty_cpumask;
3977 break;
3978 default:
3979 mask = this_cpu(scratch_cpumask);
3980 rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3981 void),
3982 mask);
3983 break;
3984 }
3985 if ( mask )
3986 flush_tlb_one_mask(mask, va);
3987 break;
3988 }
3989
3990 return rc;
3991 }
3992
3993 long do_update_va_mapping(unsigned long va, u64 val64,
3994 unsigned long flags)
3995 {
3996 return __do_update_va_mapping(va, val64, flags, current->domain);
3997 }
3998
3999 long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
4000 unsigned long flags,
4001 domid_t domid)
4002 {
4003 struct domain *pg_owner;
4004 int rc;
4005
4006 if ( (pg_owner = get_pg_owner(domid)) == NULL )
4007 return -ESRCH;
4008
4009 rc = __do_update_va_mapping(va, val64, flags, pg_owner);
4010
4011 put_pg_owner(pg_owner);
4012
4013 return rc;
4014 }
4015
4016 typedef struct e820entry e820entry_t;
4017 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
4018
4019 struct memory_map_context
4020 {
4021 unsigned int n;
4022 unsigned long s;
4023 struct xen_memory_map map;
4024 };
4025
4026 static int _handle_iomem_range(unsigned long s, unsigned long e,
4027 struct memory_map_context *ctxt)
4028 {
4029 if ( s > ctxt->s && !(s >> (paddr_bits - PAGE_SHIFT)) )
4030 {
4031 e820entry_t ent;
4032 XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param;
4033 XEN_GUEST_HANDLE(e820entry_t) buffer;
4034
4035 if ( !guest_handle_is_null(ctxt->map.buffer) )
4036 {
4037 if ( ctxt->n + 1 >= ctxt->map.nr_entries )
4038 return -EINVAL;
4039 ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT;
4040 ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT;
4041 ent.type = E820_RESERVED;
4042 buffer_param = guest_handle_cast(ctxt->map.buffer, e820entry_t);
4043 buffer = guest_handle_from_param(buffer_param, e820entry_t);
4044 if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) )
4045 return -EFAULT;
4046 }
4047 ctxt->n++;
4048 }
4049 ctxt->s = e + 1;
4050
4051 return 0;
4052 }
4053
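/*
 * Report the I/O memory range [s, e] to the guest, splitting it around any
 * IO-APIC pages that fall inside it so those frames are skipped in the
 * reported reserved ranges.
 */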
4054 static int handle_iomem_range(unsigned long s, unsigned long e, void *p)
4055 {
4056 int err = 0;
4057
4058 do {
4059 unsigned long low = -1UL;
4060 unsigned int i;
4061
4062 for ( i = 0; i < nr_ioapics; ++i )
4063 {
4064 unsigned long mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
4065
4066 if ( mfn >= s && mfn <= e && mfn < low )
4067 low = mfn;
4068 }
4069 if ( !(low + 1) )
4070 break;
4071 if ( s < low )
4072 err = _handle_iomem_range(s, low - 1, p);
4073 s = low + 1;
4074 } while ( !err );
4075
4076 return err || s > e ? err : _handle_iomem_range(s, e, p);
4077 }
4078
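/*
 * Insert one page into a translated guest's physmap at @gpfn.  The source
 * frame is chosen by @space/@idx (shared info, a grant-table frame, an
 * existing gfn, or a foreign domain's page); whatever was previously mapped
 * at @gpfn is removed first, and the frame is unhooked from its old gfn
 * before being mapped at the new location.
 */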
4079 int xenmem_add_to_physmap_one(
4080 struct domain *d,
4081 unsigned int space,
4082 union xen_add_to_physmap_batch_extra extra,
4083 unsigned long idx,
4084 gfn_t gpfn)
4085 {
4086 struct page_info *page = NULL;
4087 unsigned long gfn = 0; /* gcc ... */
4088 unsigned long prev_mfn, old_gpfn;
4089 int rc = 0;
4090 mfn_t mfn = INVALID_MFN;
4091 p2m_type_t p2mt;
4092
4093 if ( !paging_mode_translate(d) )
4094 return -EACCES;
4095
4096 switch ( space )
4097 {
4098 case XENMAPSPACE_shared_info:
4099 if ( idx == 0 )
4100 mfn = _mfn(virt_to_mfn(d->shared_info));
4101 break;
4102 case XENMAPSPACE_grant_table:
4103 rc = gnttab_map_frame(d, idx, gpfn, &mfn);
4104 if ( rc )
4105 return rc;
4106 break;
4107 case XENMAPSPACE_gmfn_range:
4108 case XENMAPSPACE_gmfn:
4109 {
4110 p2m_type_t p2mt;
4111
4112 gfn = idx;
4113 idx = mfn_x(get_gfn_unshare(d, idx, &p2mt));
4114 /* If the page is still shared, exit early */
4115 if ( p2m_is_shared(p2mt) )
4116 {
4117 put_gfn(d, gfn);
4118 return -ENOMEM;
4119 }
4120 if ( !get_page_from_mfn(_mfn(idx), d) )
4121 break;
4122 mfn = _mfn(idx);
4123 page = mfn_to_page(mfn);
4124 break;
4125 }
4126 case XENMAPSPACE_gmfn_foreign:
4127 return p2m_add_foreign(d, idx, gfn_x(gpfn), extra.foreign_domid);
4128 default:
4129 break;
4130 }
4131
4132 if ( mfn_eq(mfn, INVALID_MFN) )
4133 {
4134 rc = -EINVAL;
4135 goto put_both;
4136 }
4137
4138 /* Remove previously mapped page if it was present. */
4139 prev_mfn = mfn_x(get_gfn(d, gfn_x(gpfn), &p2mt));
4140 if ( mfn_valid(_mfn(prev_mfn)) )
4141 {
4142 if ( is_xen_heap_mfn(prev_mfn) )
4143 /* Xen heap frames are simply unhooked from this phys slot. */
4144 rc = guest_physmap_remove_page(d, gpfn, _mfn(prev_mfn), PAGE_ORDER_4K);
4145 else
4146 /* Normal domain memory is freed, to avoid leaking memory. */
4147 rc = guest_remove_page(d, gfn_x(gpfn));
4148 }
4149 /* In the XENMAPSPACE_gmfn case we still hold a ref on the old page. */
4150 put_gfn(d, gfn_x(gpfn));
4151
4152 if ( rc )
4153 goto put_both;
4154
4155 /* Unmap from old location, if any. */
4156 old_gpfn = get_gpfn_from_mfn(mfn_x(mfn));
4157 ASSERT(old_gpfn != SHARED_M2P_ENTRY);
4158 if ( (space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range) &&
4159 old_gpfn != gfn )
4160 {
4161 rc = -EXDEV;
4162 goto put_both;
4163 }
4164 if ( old_gpfn != INVALID_M2P_ENTRY )
4165 rc = guest_physmap_remove_page(d, _gfn(old_gpfn), mfn, PAGE_ORDER_4K);
4166
4167 /* Map at new location. */
4168 if ( !rc )
4169 rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
4170
4171 put_both:
4172 /* In the XENMAPSPACE_gmfn case, we took a ref of the gfn at the top. */
4173 if ( space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range )
4174 put_gfn(d, gfn);
4175
4176 if ( page )
4177 put_page(page);
4178
4179 return rc;
4180 }
4181
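/* x86-specific memory-op hypercalls not handled by common code. */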
4182 long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
4183 {
4184 int rc;
4185
4186 switch ( cmd )
4187 {
4188 case XENMEM_set_memory_map:
4189 {
4190 struct xen_foreign_memory_map fmap;
4191 struct domain *d;
4192 struct e820entry *e820;
4193
4194 if ( copy_from_guest(&fmap, arg, 1) )
4195 return -EFAULT;
4196
4197 if ( fmap.map.nr_entries > E820MAX )
4198 return -EINVAL;
4199
4200 d = rcu_lock_domain_by_any_id(fmap.domid);
4201 if ( d == NULL )
4202 return -ESRCH;
4203
4204 rc = xsm_domain_memory_map(XSM_TARGET, d);
4205 if ( rc )
4206 {
4207 rcu_unlock_domain(d);
4208 return rc;
4209 }
4210
4211 e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries);
4212 if ( e820 == NULL )
4213 {
4214 rcu_unlock_domain(d);
4215 return -ENOMEM;
4216 }
4217
4218 if ( copy_from_guest(e820, fmap.map.buffer, fmap.map.nr_entries) )
4219 {
4220 xfree(e820);
4221 rcu_unlock_domain(d);
4222 return -EFAULT;
4223 }
4224
4225 spin_lock(&d->arch.e820_lock);
4226 xfree(d->arch.e820);
4227 d->arch.e820 = e820;
4228 d->arch.nr_e820 = fmap.map.nr_entries;
4229 spin_unlock(&d->arch.e820_lock);
4230
4231 rcu_unlock_domain(d);
4232 return rc;
4233 }
4234
4235 case XENMEM_memory_map:
4236 {
4237 struct xen_memory_map map;
4238 struct domain *d = current->domain;
4239
4240 if ( copy_from_guest(&map, arg, 1) )
4241 return -EFAULT;
4242
4243 spin_lock(&d->arch.e820_lock);
4244
4245 /* Backwards compatibility. */
4246 if ( (d->arch.nr_e820 == 0) || (d->arch.e820 == NULL) )
4247 {
4248 spin_unlock(&d->arch.e820_lock);
4249 return -ENOSYS;
4250 }
4251
4252 map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4253 if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4254 __copy_to_guest(arg, &map, 1) )
4255 {
4256 spin_unlock(&d->arch.e820_lock);
4257 return -EFAULT;
4258 }
4259
4260 spin_unlock(&d->arch.e820_lock);
4261 return 0;
4262 }
4263
4264 case XENMEM_machine_memory_map:
4265 {
4266 struct memory_map_context ctxt;
4267 XEN_GUEST_HANDLE(e820entry_t) buffer;
4268 XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param;
4269 unsigned int i;
4270 bool store;
4271
4272 rc = xsm_machine_memory_map(XSM_PRIV);
4273 if ( rc )
4274 return rc;
4275
4276 if ( copy_from_guest(&ctxt.map, arg, 1) )
4277 return -EFAULT;
4278
4279 store = !guest_handle_is_null(ctxt.map.buffer);
4280
4281 if ( store && ctxt.map.nr_entries < e820.nr_map + 1 )
4282 return -EINVAL;
4283
4284 buffer_param = guest_handle_cast(ctxt.map.buffer, e820entry_t);
4285 buffer = guest_handle_from_param(buffer_param, e820entry_t);
4286 if ( store && !guest_handle_okay(buffer, ctxt.map.nr_entries) )
4287 return -EFAULT;
4288
4289 for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n )
4290 {
4291 unsigned long s = PFN_DOWN(e820.map[i].addr);
4292
4293 if ( s > ctxt.s )
4294 {
4295 rc = rangeset_report_ranges(current->domain->iomem_caps,
4296 ctxt.s, s - 1,
4297 handle_iomem_range, &ctxt);
4298 if ( !rc )
4299 rc = handle_iomem_range(s, s, &ctxt);
4300 if ( rc )
4301 return rc;
4302 }
4303 if ( store )
4304 {
4305 if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) )
4306 return -EINVAL;
4307 if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) )
4308 return -EFAULT;
4309 }
4310 ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size);
4311 }
4312
4313 if ( ctxt.s )
4314 {
4315 rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s,
4316 ~0UL, handle_iomem_range, &ctxt);
4317 if ( !rc && ctxt.s )
4318 rc = handle_iomem_range(~0UL, ~0UL, &ctxt);
4319 if ( rc )
4320 return rc;
4321 }
4322
4323 ctxt.map.nr_entries = ctxt.n;
4324
4325 if ( __copy_to_guest(arg, &ctxt.map, 1) )
4326 return -EFAULT;
4327
4328 return 0;
4329 }
4330
4331 case XENMEM_machphys_mapping:
4332 {
4333 struct xen_machphys_mapping mapping = {
4334 .v_start = MACH2PHYS_VIRT_START,
4335 .v_end = MACH2PHYS_VIRT_END,
4336 .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4337 };
4338
4339 if ( !mem_hotplug && is_hardware_domain(current->domain) )
4340 mapping.max_mfn = max_page - 1;
4341 if ( copy_to_guest(arg, &mapping, 1) )
4342 return -EFAULT;
4343
4344 return 0;
4345 }
4346
4347 case XENMEM_set_pod_target:
4348 case XENMEM_get_pod_target:
4349 {
4350 xen_pod_target_t target;
4351 struct domain *d;
4352 struct p2m_domain *p2m;
4353
4354 if ( copy_from_guest(&target, arg, 1) )
4355 return -EFAULT;
4356
4357 d = rcu_lock_domain_by_any_id(target.domid);
4358 if ( d == NULL )
4359 return -ESRCH;
4360
4361 if ( cmd == XENMEM_set_pod_target )
4362 rc = xsm_set_pod_target(XSM_PRIV, d);
4363 else
4364 rc = xsm_get_pod_target(XSM_PRIV, d);
4365
4366 if ( rc != 0 )
4367 goto pod_target_out_unlock;
4368
4369 if ( cmd == XENMEM_set_pod_target )
4370 {
4371 if ( target.target_pages > d->max_pages )
4372 {
4373 rc = -EINVAL;
4374 goto pod_target_out_unlock;
4375 }
4376
4377 rc = p2m_pod_set_mem_target(d, target.target_pages);
4378 }
4379
4380 if ( rc == -ERESTART )
4381 {
4382 rc = hypercall_create_continuation(
4383 __HYPERVISOR_memory_op, "lh", cmd, arg);
4384 }
4385 else if ( rc >= 0 )
4386 {
4387 p2m = p2m_get_hostp2m(d);
4388 target.tot_pages = d->tot_pages;
4389 target.pod_cache_pages = p2m->pod.count;
4390 target.pod_entries = p2m->pod.entry_count;
4391
4392 if ( __copy_to_guest(arg, &target, 1) )
4393 {
4394 rc = -EFAULT;
4395 goto pod_target_out_unlock;
4396 }
4397 }
4398
4399 pod_target_out_unlock:
4400 rcu_unlock_domain(d);
4401 return rc;
4402 }
4403
4404 default:
4405 return subarch_memory_op(cmd, arg);
4406 }
4407
4408 return 0;
4409 }
4410
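/*
 * Write handler used for read-only MMIO pages: the access is sanity-checked
 * but the data is deliberately discarded, so guest writes are dropped.
 */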
4411 int mmio_ro_emulated_write(
4412 enum x86_segment seg,
4413 unsigned long offset,
4414 void *p_data,
4415 unsigned int bytes,
4416 struct x86_emulate_ctxt *ctxt)
4417 {
4418 struct mmio_ro_emulate_ctxt *mmio_ro_ctxt = ctxt->data;
4419
4420 /* Only allow naturally-aligned stores at the original %cr2 address. */
4421 if ( ((bytes | offset) & (bytes - 1)) || !bytes ||
4422 offset != mmio_ro_ctxt->cr2 )
4423 {
4424 gdprintk(XENLOG_WARNING, "bad access (cr2=%lx, addr=%lx, bytes=%u)\n",
4425 mmio_ro_ctxt->cr2, offset, bytes);
4426 return X86EMUL_UNHANDLEABLE;
4427 }
4428
4429 return X86EMUL_OKAY;
4430 }
4431
4432 int mmcfg_intercept_write(
4433 enum x86_segment seg,
4434 unsigned long offset,
4435 void *p_data,
4436 unsigned int bytes,
4437 struct x86_emulate_ctxt *ctxt)
4438 {
4439 struct mmio_ro_emulate_ctxt *mmio_ctxt = ctxt->data;
4440
4441 /*
4442 * Only allow naturally-aligned stores no wider than 4 bytes to the
4443 * original %cr2 address.
4444 */
4445 if ( ((bytes | offset) & (bytes - 1)) || bytes > 4 || !bytes ||
4446 offset != mmio_ctxt->cr2 )
4447 {
4448 gdprintk(XENLOG_WARNING, "bad write (cr2=%lx, addr=%lx, bytes=%u)\n",
4449 mmio_ctxt->cr2, offset, bytes);
4450 return X86EMUL_UNHANDLEABLE;
4451 }
4452
4453 offset &= 0xfff;
4454 if ( pci_conf_write_intercept(mmio_ctxt->seg, mmio_ctxt->bdf,
4455 offset, bytes, p_data) >= 0 )
4456 pci_mmcfg_write(mmio_ctxt->seg, PCI_BUS(mmio_ctxt->bdf),
4457 PCI_DEVFN2(mmio_ctxt->bdf), offset, bytes,
4458 *(uint32_t *)p_data);
4459
4460 return X86EMUL_OKAY;
4461 }
4462
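/*
 * Allocate/free a page for use in Xen's own page tables.  During early boot
 * pages come from the boot allocator; afterwards the xenheap is used (and
 * freeing is a no-op until then).
 */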
4463 void *alloc_xen_pagetable(void)
4464 {
4465 if ( system_state != SYS_STATE_early_boot )
4466 {
4467 void *ptr = alloc_xenheap_page();
4468
4469 BUG_ON(!hardware_domain && !ptr);
4470 return ptr;
4471 }
4472
4473 return mfn_to_virt(mfn_x(alloc_boot_pages(1, 1)));
4474 }
4475
4476 void free_xen_pagetable(void *v)
4477 {
4478 if ( system_state != SYS_STATE_early_boot )
4479 free_xenheap_page(v);
4480 }
4481
4482 static DEFINE_SPINLOCK(map_pgdir_lock);
4483
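/*
 * Walk Xen's own (idle) page tables to the L3/L2/L1 entry covering a given
 * virtual address, allocating intermediate tables on demand.  Once past
 * boot, map_pgdir_lock guards against a racing allocation: the entry is
 * re-checked under the lock and the spare table freed if we lost the race.
 */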
4484 static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
4485 {
4486 l4_pgentry_t *pl4e;
4487
4488 pl4e = &idle_pg_table[l4_table_offset(v)];
4489 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
4490 {
4491 bool locking = system_state > SYS_STATE_boot;
4492 l3_pgentry_t *pl3e = alloc_xen_pagetable();
4493
4494 if ( !pl3e )
4495 return NULL;
4496 clear_page(pl3e);
4497 if ( locking )
4498 spin_lock(&map_pgdir_lock);
4499 if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
4500 {
4501 l4_pgentry_t l4e = l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR);
4502
4503 l4e_write(pl4e, l4e);
4504 efi_update_l4_pgtable(l4_table_offset(v), l4e);
4505 pl3e = NULL;
4506 }
4507 if ( locking )
4508 spin_unlock(&map_pgdir_lock);
4509 if ( pl3e )
4510 free_xen_pagetable(pl3e);
4511 }
4512
4513 return l4e_to_l3e(*pl4e) + l3_table_offset(v);
4514 }
4515
4516 static l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
4517 {
4518 l3_pgentry_t *pl3e;
4519
4520 pl3e = virt_to_xen_l3e(v);
4521 if ( !pl3e )
4522 return NULL;
4523
4524 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4525 {
4526 bool locking = system_state > SYS_STATE_boot;
4527 l2_pgentry_t *pl2e = alloc_xen_pagetable();
4528
4529 if ( !pl2e )
4530 return NULL;
4531 clear_page(pl2e);
4532 if ( locking )
4533 spin_lock(&map_pgdir_lock);
4534 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4535 {
4536 l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
4537 pl2e = NULL;
4538 }
4539 if ( locking )
4540 spin_unlock(&map_pgdir_lock);
4541 if ( pl2e )
4542 free_xen_pagetable(pl2e);
4543 }
4544
4545 BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
4546 return l3e_to_l2e(*pl3e) + l2_table_offset(v);
4547 }
4548
4549 l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
4550 {
4551 l2_pgentry_t *pl2e;
4552
4553 pl2e = virt_to_xen_l2e(v);
4554 if ( !pl2e )
4555 return NULL;
4556
4557 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4558 {
4559 bool locking = system_state > SYS_STATE_boot;
4560 l1_pgentry_t *pl1e = alloc_xen_pagetable();
4561
4562 if ( !pl1e )
4563 return NULL;
4564 clear_page(pl1e);
4565 if ( locking )
4566 spin_lock(&map_pgdir_lock);
4567 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4568 {
4569 l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR));
4570 pl1e = NULL;
4571 }
4572 if ( locking )
4573 spin_unlock(&map_pgdir_lock);
4574 if ( pl1e )
4575 free_xen_pagetable(pl1e);
4576 }
4577
4578 BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE);
4579 return l2e_to_l1e(*pl2e) + l1_table_offset(v);
4580 }
4581
4582 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4583 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) | _PAGE_PSE) : (f))
4584 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4585
4586 /*
4587 * map_pages_to_xen() can be called with interrupts disabled during
4588 * early bootstrap. In this case it is safe to use flush_area_local()
4589 * and avoid locking because only the local CPU is online.
4590 */
4591 #define flush_area(v,f) (!local_irq_is_enabled() ? \
4592 flush_area_local((const void *)v, f) : \
4593 flush_area_all((const void *)v, f))
4594
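/*
 * Map [virt, virt + nr_mfns * PAGE_SIZE) onto the machine frames starting
 * at @mfn in Xen's page tables, replacing whatever was there and issuing
 * the TLB/cache flushes the old mappings require.  1GB and 2MB superpages
 * are used when alignment, remaining size and @flags allow, and fully
 * populated, uniformly mapped L1/L2 tables are opportunistically re-merged
 * into superpages.  A typical call to establish a single read/write page
 * might look like, e.g.:
 *
 *     map_pages_to_xen(va, mfn, 1, PAGE_HYPERVISOR);
 */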
4595 int map_pages_to_xen(
4596 unsigned long virt,
4597 unsigned long mfn,
4598 unsigned long nr_mfns,
4599 unsigned int flags)
4600 {
4601 bool locking = system_state > SYS_STATE_boot;
4602 l2_pgentry_t *pl2e, ol2e;
4603 l1_pgentry_t *pl1e, ol1e;
4604 unsigned int i;
4605
4606 #define flush_flags(oldf) do { \
4607 unsigned int o_ = (oldf); \
4608 if ( (o_) & _PAGE_GLOBAL ) \
4609 flush_flags |= FLUSH_TLB_GLOBAL; \
4610 if ( (flags & _PAGE_PRESENT) && \
4611 (((o_) ^ flags) & PAGE_CACHE_ATTRS) ) \
4612 { \
4613 flush_flags |= FLUSH_CACHE; \
4614 if ( virt >= DIRECTMAP_VIRT_START && \
4615 virt < HYPERVISOR_VIRT_END ) \
4616 flush_flags |= FLUSH_VA_VALID; \
4617 } \
4618 } while (0)
4619
4620 while ( nr_mfns != 0 )
4621 {
4622 l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
4623
4624 if ( !pl3e )
4625 return -ENOMEM;
4626 ol3e = *pl3e;
4627
4628 if ( cpu_has_page1gb &&
4629 !(((virt >> PAGE_SHIFT) | mfn) &
4630 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4631 nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4632 !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4633 {
4634 /* 1GB-page mapping. */
4635 l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4636
4637 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4638 {
4639 unsigned int flush_flags =
4640 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4641
4642 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4643 {
4644 flush_flags(lNf_to_l1f(l3e_get_flags(ol3e)));
4645 flush_area(virt, flush_flags);
4646 }
4647 else
4648 {
4649 pl2e = l3e_to_l2e(ol3e);
4650 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4651 {
4652 ol2e = pl2e[i];
4653 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4654 continue;
4655 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4656 flush_flags(lNf_to_l1f(l2e_get_flags(ol2e)));
4657 else
4658 {
4659 unsigned int j;
4660
4661 pl1e = l2e_to_l1e(ol2e);
4662 for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4663 flush_flags(l1e_get_flags(pl1e[j]));
4664 }
4665 }
4666 flush_area(virt, flush_flags);
4667 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4668 {
4669 ol2e = pl2e[i];
4670 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4671 !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4672 free_xen_pagetable(l2e_to_l1e(ol2e));
4673 }
4674 free_xen_pagetable(pl2e);
4675 }
4676 }
4677
4678 virt += 1UL << L3_PAGETABLE_SHIFT;
4679 mfn += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4680 nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4681 continue;
4682 }
4683
4684 if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4685 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4686 {
4687 unsigned int flush_flags =
4688 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4689
4690 /* Skip this PTE if there is no change. */
4691 if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4692 L1_PAGETABLE_ENTRIES - 1)) +
4693 (l2_table_offset(virt) << PAGETABLE_ORDER) +
4694 l1_table_offset(virt) == mfn) &&
4695 ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4696 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4697 {
4698 /* We can skip to end of L3 superpage if we got a match. */
4699 i = (1u << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4700 (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4701 if ( i > nr_mfns )
4702 i = nr_mfns;
4703 virt += i << PAGE_SHIFT;
4704 mfn += i;
4705 nr_mfns -= i;
4706 continue;
4707 }
4708
4709 pl2e = alloc_xen_pagetable();
4710 if ( pl2e == NULL )
4711 return -ENOMEM;
4712
4713 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4714 l2e_write(pl2e + i,
4715 l2e_from_pfn(l3e_get_pfn(ol3e) +
4716 (i << PAGETABLE_ORDER),
4717 l3e_get_flags(ol3e)));
4718
4719 if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4720 flush_flags |= FLUSH_TLB_GLOBAL;
4721
4722 if ( locking )
4723 spin_lock(&map_pgdir_lock);
4724 if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
4725 (l3e_get_flags(*pl3e) & _PAGE_PSE) )
4726 {
4727 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4728 __PAGE_HYPERVISOR));
4729 pl2e = NULL;
4730 }
4731 if ( locking )
4732 spin_unlock(&map_pgdir_lock);
4733 flush_area(virt, flush_flags);
4734 if ( pl2e )
4735 free_xen_pagetable(pl2e);
4736 }
4737
4738 pl2e = virt_to_xen_l2e(virt);
4739 if ( !pl2e )
4740 return -ENOMEM;
4741
4742 if ( ((((virt >> PAGE_SHIFT) | mfn) &
4743 ((1u << PAGETABLE_ORDER) - 1)) == 0) &&
4744 (nr_mfns >= (1u << PAGETABLE_ORDER)) &&
4745 !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4746 {
4747 /* Super-page mapping. */
4748 ol2e = *pl2e;
4749 l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4750
4751 if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4752 {
4753 unsigned int flush_flags =
4754 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4755
4756 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4757 {
4758 flush_flags(lNf_to_l1f(l2e_get_flags(ol2e)));
4759 flush_area(virt, flush_flags);
4760 }
4761 else
4762 {
4763 pl1e = l2e_to_l1e(ol2e);
4764 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4765 flush_flags(l1e_get_flags(pl1e[i]));
4766 flush_area(virt, flush_flags);
4767 free_xen_pagetable(pl1e);
4768 }
4769 }
4770
4771 virt += 1UL << L2_PAGETABLE_SHIFT;
4772 mfn += 1UL << PAGETABLE_ORDER;
4773 nr_mfns -= 1UL << PAGETABLE_ORDER;
4774 }
4775 else
4776 {
4777 /* Normal page mapping. */
4778 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4779 {
4780 pl1e = virt_to_xen_l1e(virt);
4781 if ( pl1e == NULL )
4782 return -ENOMEM;
4783 }
4784 else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4785 {
4786 unsigned int flush_flags =
4787 FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4788
4789 /* Skip this PTE if there is no change. */
4790 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4791 l1_table_offset(virt)) == mfn) &&
4792 (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4793 ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4794 {
4795 /* We can skip to end of L2 superpage if we got a match. */
4796 i = (1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4797 (mfn & ((1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4798 if ( i > nr_mfns )
4799 i = nr_mfns;
4800 virt += i << L1_PAGETABLE_SHIFT;
4801 mfn += i;
4802 nr_mfns -= i;
4803 goto check_l3;
4804 }
4805
4806 pl1e = alloc_xen_pagetable();
4807 if ( pl1e == NULL )
4808 return -ENOMEM;
4809
4810 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4811 l1e_write(&pl1e[i],
4812 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4813 lNf_to_l1f(l2e_get_flags(*pl2e))));
4814
4815 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4816 flush_flags |= FLUSH_TLB_GLOBAL;
4817
4818 if ( locking )
4819 spin_lock(&map_pgdir_lock);
4820 if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
4821 (l2e_get_flags(*pl2e) & _PAGE_PSE) )
4822 {
4823 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4824 __PAGE_HYPERVISOR));
4825 pl1e = NULL;
4826 }
4827 if ( locking )
4828 spin_unlock(&map_pgdir_lock);
4829 flush_area(virt, flush_flags);
4830 if ( pl1e )
4831 free_xen_pagetable(pl1e);
4832 }
4833
4834 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4835 ol1e = *pl1e;
4836 l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4837 if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4838 {
4839 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4840
4841 flush_flags(l1e_get_flags(ol1e));
4842 flush_area(virt, flush_flags);
4843 }
4844
4845 virt += 1UL << L1_PAGETABLE_SHIFT;
4846 mfn += 1UL;
4847 nr_mfns -= 1UL;
4848
4849 if ( (flags == PAGE_HYPERVISOR) &&
4850 ((nr_mfns == 0) ||
4851 ((((virt >> PAGE_SHIFT) | mfn) &
4852 ((1u << PAGETABLE_ORDER) - 1)) == 0)) )
4853 {
4854 unsigned long base_mfn;
4855
4856 if ( locking )
4857 spin_lock(&map_pgdir_lock);
4858
4859 ol2e = *pl2e;
4860 /*
4861 * L2E may be already cleared, or set to a superpage, by
4862 * concurrent paging structure modifications on other CPUs.
4863 */
4864 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4865 {
4866 if ( locking )
4867 spin_unlock(&map_pgdir_lock);
4868 continue;
4869 }
4870
4871 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4872 {
4873 if ( locking )
4874 spin_unlock(&map_pgdir_lock);
4875 goto check_l3;
4876 }
4877
4878 pl1e = l2e_to_l1e(ol2e);
4879 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4880 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4881 if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4882 (l1e_get_flags(*pl1e) != flags) )
4883 break;
4884 if ( i == L1_PAGETABLE_ENTRIES )
4885 {
4886 l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4887 l1f_to_lNf(flags)));
4888 if ( locking )
4889 spin_unlock(&map_pgdir_lock);
4890 flush_area(virt - PAGE_SIZE,
4891 FLUSH_TLB_GLOBAL |
4892 FLUSH_ORDER(PAGETABLE_ORDER));
4893 free_xen_pagetable(l2e_to_l1e(ol2e));
4894 }
4895 else if ( locking )
4896 spin_unlock(&map_pgdir_lock);
4897 }
4898 }
4899
4900 check_l3:
4901 if ( cpu_has_page1gb &&
4902 (flags == PAGE_HYPERVISOR) &&
4903 ((nr_mfns == 0) ||
4904 !(((virt >> PAGE_SHIFT) | mfn) &
4905 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4906 {
4907 unsigned long base_mfn;
4908
4909 if ( locking )
4910 spin_lock(&map_pgdir_lock);
4911
4912 ol3e = *pl3e;
4913 /*
4914 * L3E may be already cleared, or set to a superpage, by
4915 * concurrent paging structure modifications on other CPUs.
4916 */
4917 if ( !(l3e_get_flags(ol3e) & _PAGE_PRESENT) ||
4918 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4919 {
4920 if ( locking )
4921 spin_unlock(&map_pgdir_lock);
4922 continue;
4923 }
4924
4925 pl2e = l3e_to_l2e(ol3e);
4926 base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4927 L1_PAGETABLE_ENTRIES - 1);
4928 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4929 if ( (l2e_get_pfn(*pl2e) !=
4930 (base_mfn + (i << PAGETABLE_ORDER))) ||
4931 (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4932 break;
4933 if ( i == L2_PAGETABLE_ENTRIES )
4934 {
4935 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4936 l1f_to_lNf(flags)));
4937 if ( locking )
4938 spin_unlock(&map_pgdir_lock);
4939 flush_area(virt - PAGE_SIZE,
4940 FLUSH_TLB_GLOBAL |
4941 FLUSH_ORDER(2*PAGETABLE_ORDER));
4942 free_xen_pagetable(l3e_to_l2e(ol3e));
4943 }
4944 else if ( locking )
4945 spin_unlock(&map_pgdir_lock);
4946 }
4947 }
4948
4949 #undef flush_flags
4950
4951 return 0;
4952 }
4953
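/*
 * Pre-create the paging structures for a range using only 4kB entries (no
 * superpages), so that later updates over the range cannot fail for lack of
 * intermediate page tables.
 */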
4954 int populate_pt_range(unsigned long virt, unsigned long mfn,
4955 unsigned long nr_mfns)
4956 {
4957 return map_pages_to_xen(virt, mfn, nr_mfns, MAP_SMALL_PAGES);
4958 }
4959
4960 /*
4961 * Alter the permissions of a range of Xen virtual address space.
4962 *
4963 * Does not create new mappings, and does not modify the mfn in existing
4964 * mappings, but will shatter superpages if necessary, and will destroy
4965 * mappings if not passed _PAGE_PRESENT.
4966 *
4967 * The only flags considered are NX, RW and PRESENT. All other input flags
4968 * are ignored.
4969 *
4970 * It is an error to call with present flags over an unpopulated range.
4971 */
4972 int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
4973 {
4974 bool locking = system_state > SYS_STATE_boot;
4975 l2_pgentry_t *pl2e;
4976 l1_pgentry_t *pl1e;
4977 unsigned int i;
4978 unsigned long v = s;
4979
4980 /* Set of valid PTE bits which may be altered. */
4981 #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
4982 nf &= FLAGS_MASK;
4983
4984 ASSERT(IS_ALIGNED(s, PAGE_SIZE));
4985 ASSERT(IS_ALIGNED(e, PAGE_SIZE));
4986
4987 while ( v < e )
4988 {
4989 l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4990
4991 if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4992 {
4993 /* Confirm the caller isn't trying to create new mappings. */
4994 ASSERT(!(nf & _PAGE_PRESENT));
4995
4996 v += 1UL << L3_PAGETABLE_SHIFT;
4997 v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4998 continue;
4999 }
5000
5001 if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
5002 {
5003 if ( l2_table_offset(v) == 0 &&
5004 l1_table_offset(v) == 0 &&
5005 ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
5006 {
5007 /* PAGE1GB: whole superpage is modified. */
5008 l3_pgentry_t nl3e = !(nf & _PAGE_PRESENT) ? l3e_empty()
5009 : l3e_from_pfn(l3e_get_pfn(*pl3e),
5010 (l3e_get_flags(*pl3e) & ~FLAGS_MASK) | nf);
5011
5012 l3e_write_atomic(pl3e, nl3e);
5013 v += 1UL << L3_PAGETABLE_SHIFT;
5014 continue;
5015 }
5016
5017 /* PAGE1GB: shatter the superpage and fall through. */
5018 pl2e = alloc_xen_pagetable();
5019 if ( !pl2e )
5020 return -ENOMEM;
5021 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5022 l2e_write(pl2e + i,
5023 l2e_from_pfn(l3e_get_pfn(*pl3e) +
5024 (i << PAGETABLE_ORDER),
5025 l3e_get_flags(*pl3e)));
5026 if ( locking )
5027 spin_lock(&map_pgdir_lock);
5028 if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
5029 (l3e_get_flags(*pl3e) & _PAGE_PSE) )
5030 {
5031 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
5032 __PAGE_HYPERVISOR));
5033 pl2e = NULL;
5034 }
5035 if ( locking )
5036 spin_unlock(&map_pgdir_lock);
5037 if ( pl2e )
5038 free_xen_pagetable(pl2e);
5039 }
5040
5041 /*
5042 * The L3 entry has been verified to be present, and we've dealt with
5043 * 1G pages as well, so the L2 table cannot require allocation.
5044 */
5045 pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v);
5046
5047 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5048 {
5049 /* Confirm the caller isn't trying to create new mappings. */
5050 ASSERT(!(nf & _PAGE_PRESENT));
5051
5052 v += 1UL << L2_PAGETABLE_SHIFT;
5053 v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
5054 continue;
5055 }
5056
5057 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5058 {
5059 if ( (l1_table_offset(v) == 0) &&
5060 ((e-v) >= (1UL << L2_PAGETABLE_SHIFT)) )
5061 {
5062 /* PSE: whole superpage is modified. */
5063 l2_pgentry_t nl2e = !(nf & _PAGE_PRESENT) ? l2e_empty()
5064 : l2e_from_pfn(l2e_get_pfn(*pl2e),
5065 (l2e_get_flags(*pl2e) & ~FLAGS_MASK) | nf);
5066
5067 l2e_write_atomic(pl2e, nl2e);
5068 v += 1UL << L2_PAGETABLE_SHIFT;
5069 }
5070 else
5071 {
5072 /* PSE: shatter the superpage and try again. */
5073 pl1e = alloc_xen_pagetable();
5074 if ( !pl1e )
5075 return -ENOMEM;
5076 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5077 l1e_write(&pl1e[i],
5078 l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5079 l2e_get_flags(*pl2e) & ~_PAGE_PSE));
5080 if ( locking )
5081 spin_lock(&map_pgdir_lock);
5082 if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
5083 (l2e_get_flags(*pl2e) & _PAGE_PSE) )
5084 {
5085 l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5086 __PAGE_HYPERVISOR));
5087 pl1e = NULL;
5088 }
5089 if ( locking )
5090 spin_unlock(&map_pgdir_lock);
5091 if ( pl1e )
5092 free_xen_pagetable(pl1e);
5093 }
5094 }
5095 else
5096 {
5097 l1_pgentry_t nl1e;
5098
5099 /*
5100 * Ordinary 4kB mapping: The L2 entry has been verified to be
5101 * present, and we've dealt with 2M pages as well, so the L1 table
5102 * cannot require allocation.
5103 */
5104 pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
5105
5106 /* Confirm the caller isn't trying to create new mappings. */
5107 if ( !(l1e_get_flags(*pl1e) & _PAGE_PRESENT) )
5108 ASSERT(!(nf & _PAGE_PRESENT));
5109
5110 nl1e = !(nf & _PAGE_PRESENT) ? l1e_empty()
5111 : l1e_from_pfn(l1e_get_pfn(*pl1e),
5112 (l1e_get_flags(*pl1e) & ~FLAGS_MASK) | nf);
5113
5114 l1e_write_atomic(pl1e, nl1e);
5115 v += PAGE_SIZE;
5116
5117 /*
5118 * If we are not destroying mappings, or not done with the L2E,
5119 * skip the empty&free check.
5120 */
5121 if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) )
5122 continue;
5123 if ( locking )
5124 spin_lock(&map_pgdir_lock);
5125
5126 /*
5127 * L2E may be already cleared, or set to a superpage, by
5128 * concurrent paging structure modifications on other CPUs.
5129 */
5130 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5131 {
5132 if ( locking )
5133 spin_unlock(&map_pgdir_lock);
5134 goto check_l3;
5135 }
5136
5137 if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5138 {
5139 if ( locking )
5140 spin_unlock(&map_pgdir_lock);
5141 continue;
5142 }
5143
5144 pl1e = l2e_to_l1e(*pl2e);
5145 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5146 if ( l1e_get_intpte(pl1e[i]) != 0 )
5147 break;
5148 if ( i == L1_PAGETABLE_ENTRIES )
5149 {
5150 /* Empty: zap the L2E and free the L1 page. */
5151 l2e_write_atomic(pl2e, l2e_empty());
5152 if ( locking )
5153 spin_unlock(&map_pgdir_lock);
5154 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5155 free_xen_pagetable(pl1e);
5156 }
5157 else if ( locking )
5158 spin_unlock(&map_pgdir_lock);
5159 }
5160
5161 check_l3:
5162 /*
5163 * If we are not destroying mappings, or not done with the L3E,
5164 * skip the empty&free check.
5165 */
5166 if ( (nf & _PAGE_PRESENT) ||
5167 ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) )
5168 continue;
5169 if ( locking )
5170 spin_lock(&map_pgdir_lock);
5171
5172 /*
5173 * L3E may be already cleared, or set to a superpage, by
5174 * concurrent paging structure modifications on other CPUs.
5175 */
5176 if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
5177 (l3e_get_flags(*pl3e) & _PAGE_PSE) )
5178 {
5179 if ( locking )
5180 spin_unlock(&map_pgdir_lock);
5181 continue;
5182 }
5183
5184 pl2e = l3e_to_l2e(*pl3e);
5185 for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5186 if ( l2e_get_intpte(pl2e[i]) != 0 )
5187 break;
5188 if ( i == L2_PAGETABLE_ENTRIES )
5189 {
5190 /* Empty: zap the L3E and free the L2 page. */
5191 l3e_write_atomic(pl3e, l3e_empty());
5192 if ( locking )
5193 spin_unlock(&map_pgdir_lock);
5194 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5195 free_xen_pagetable(pl2e);
5196 }
5197 else if ( locking )
5198 spin_unlock(&map_pgdir_lock);
5199 }
5200
5201 flush_area(NULL, FLUSH_TLB_GLOBAL);
5202
5203 #undef FLAGS_MASK
5204 return 0;
5205 }
5206
5207 #undef flush_area
5208
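/* Tear down all mappings in [s, e): modify_xen_mappings() with no PRESENT bit. */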
5209 int destroy_xen_mappings(unsigned long s, unsigned long e)
5210 {
5211 return modify_xen_mappings(s, e, _PAGE_NONE);
5212 }
5213
5214 void __set_fixmap(
5215 enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
5216 {
5217 BUG_ON(idx >= __end_of_fixed_addresses);
5218 map_pages_to_xen(__fix_to_virt(idx), mfn, 1, flags);
5219 }
5220
5221 void *__init arch_vmap_virt_end(void)
5222 {
5223 return fix_to_virt(__end_of_fixed_addresses);
5224 }
5225
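/*
 * Map a physical range for device (MMIO) access: ranges entirely below 1MB
 * reuse the always-present direct mapping, anything else gets a fresh
 * UC- vmap().
 */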
5226 void __iomem *ioremap(paddr_t pa, size_t len)
5227 {
5228 mfn_t mfn = _mfn(PFN_DOWN(pa));
5229 void *va;
5230
5231 WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
5232
5233 /* The low 1MB is always mapped; use the existing mapping directly. */
5234 if ( !((pa + len - 1) >> 20) )
5235 va = __va(pa);
5236 else
5237 {
5238 unsigned int offs = pa & (PAGE_SIZE - 1);
5239 unsigned int nr = PFN_UP(offs + len);
5240
5241 va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_UCMINUS, VMAP_DEFAULT) + offs;
5242 }
5243
5244 return (void __force __iomem *)va;
5245 }
5246
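/*
 * Ensure the per-domain mapping area [va, va + nr * PAGE_SIZE) is set up:
 * the L3/L2 levels are allocated as needed, L1 tables are either handed
 * back through @pl1tab or kept internal, and (unless @ppg is NIL) newly
 * allocated backing pages are recorded through @ppg.
 */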
5247 int create_perdomain_mapping(struct domain *d, unsigned long va,
5248 unsigned int nr, l1_pgentry_t **pl1tab,
5249 struct page_info **ppg)
5250 {
5251 struct page_info *pg;
5252 l3_pgentry_t *l3tab;
5253 l2_pgentry_t *l2tab;
5254 l1_pgentry_t *l1tab;
5255 int rc = 0;
5256
5257 ASSERT(va >= PERDOMAIN_VIRT_START &&
5258 va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS));
5259
5260 if ( !d->arch.perdomain_l3_pg )
5261 {
5262 pg = alloc_domheap_page(d, MEMF_no_owner);
5263 if ( !pg )
5264 return -ENOMEM;
5265 l3tab = __map_domain_page(pg);
5266 clear_page(l3tab);
5267 d->arch.perdomain_l3_pg = pg;
5268 if ( !nr )
5269 {
5270 unmap_domain_page(l3tab);
5271 return 0;
5272 }
5273 }
5274 else if ( !nr )
5275 return 0;
5276 else
5277 l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5278
5279 ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1)));
5280
5281 if ( !(l3e_get_flags(l3tab[l3_table_offset(va)]) & _PAGE_PRESENT) )
5282 {
5283 pg = alloc_domheap_page(d, MEMF_no_owner);
5284 if ( !pg )
5285 {
5286 unmap_domain_page(l3tab);
5287 return -ENOMEM;
5288 }
5289 l2tab = __map_domain_page(pg);
5290 clear_page(l2tab);
5291 l3tab[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR_RW);
5292 }
5293 else
5294 l2tab = map_l2t_from_l3e(l3tab[l3_table_offset(va)]);
5295
5296 unmap_domain_page(l3tab);
5297
5298 if ( !pl1tab && !ppg )
5299 {
5300 unmap_domain_page(l2tab);
5301 return 0;
5302 }
5303
5304 for ( l1tab = NULL; !rc && nr--; )
5305 {
5306 l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
5307
5308 if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5309 {
5310 if ( pl1tab && !IS_NIL(pl1tab) )
5311 {
5312 l1tab = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
5313 if ( !l1tab )
5314 {
5315 rc = -ENOMEM;
5316 break;
5317 }
5318 ASSERT(!pl1tab[l2_table_offset(va)]);
5319 pl1tab[l2_table_offset(va)] = l1tab;
5320 pg = virt_to_page(l1tab);
5321 }
5322 else
5323 {
5324 pg = alloc_domheap_page(d, MEMF_no_owner);
5325 if ( !pg )
5326 {
5327 rc = -ENOMEM;
5328 break;
5329 }
5330 l1tab = __map_domain_page(pg);
5331 }
5332 clear_page(l1tab);
5333 *pl2e = l2e_from_page(pg, __PAGE_HYPERVISOR_RW);
5334 }
5335 else if ( !l1tab )
5336 l1tab = map_l1t_from_l2e(*pl2e);
5337
5338 if ( ppg &&
5339 !(l1e_get_flags(l1tab[l1_table_offset(va)]) & _PAGE_PRESENT) )
5340 {
5341 pg = alloc_domheap_page(d, MEMF_no_owner);
5342 if ( pg )
5343 {
5344 clear_domain_page(page_to_mfn(pg));
5345 if ( !IS_NIL(ppg) )
5346 *ppg++ = pg;
5347 l1tab[l1_table_offset(va)] =
5348 l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0);
5349 l2e_add_flags(*pl2e, _PAGE_AVAIL0);
5350 }
5351 else
5352 rc = -ENOMEM;
5353 }
5354
5355 va += PAGE_SIZE;
5356 if ( rc || !nr || !l1_table_offset(va) )
5357 {
5358 /* Note that this is a no-op for the alloc_xenheap_page() case. */
5359 unmap_domain_page(l1tab);
5360 l1tab = NULL;
5361 }
5362 }
5363
5364 ASSERT(!l1tab);
5365 unmap_domain_page(l2tab);
5366
5367 return rc;
5368 }
5369
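/*
 * Undo up to @nr pages of a per-domain mapping starting at @va, freeing any
 * backing pages allocated by create_perdomain_mapping() (those tagged with
 * _PAGE_AVAIL0).
 */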
5370 void destroy_perdomain_mapping(struct domain *d, unsigned long va,
5371 unsigned int nr)
5372 {
5373 const l3_pgentry_t *l3tab, *pl3e;
5374
5375 ASSERT(va >= PERDOMAIN_VIRT_START &&
5376 va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS));
5377 ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1)));
5378
5379 if ( !d->arch.perdomain_l3_pg )
5380 return;
5381
5382 l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5383 pl3e = l3tab + l3_table_offset(va);
5384
5385 if ( l3e_get_flags(*pl3e) & _PAGE_PRESENT )
5386 {
5387 const l2_pgentry_t *l2tab = map_l2t_from_l3e(*pl3e);
5388 const l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
5389 unsigned int i = l1_table_offset(va);
5390
5391 while ( nr )
5392 {
5393 if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
5394 {
5395 l1_pgentry_t *l1tab = map_l1t_from_l2e(*pl2e);
5396
5397 for ( ; nr && i < L1_PAGETABLE_ENTRIES; --nr, ++i )
5398 {
5399 if ( (l1e_get_flags(l1tab[i]) &
5400 (_PAGE_PRESENT | _PAGE_AVAIL0)) ==
5401 (_PAGE_PRESENT | _PAGE_AVAIL0) )
5402 free_domheap_page(l1e_get_page(l1tab[i]));
5403 l1tab[i] = l1e_empty();
5404 }
5405
5406 unmap_domain_page(l1tab);
5407 }
5408 else if ( nr + i < L1_PAGETABLE_ENTRIES )
5409 break;
5410 else
5411 nr -= L1_PAGETABLE_ENTRIES - i;
5412
5413 ++pl2e;
5414 i = 0;
5415 }
5416
5417 unmap_domain_page(l2tab);
5418 }
5419
5420 unmap_domain_page(l3tab);
5421 }
5422
5423 void free_perdomain_mappings(struct domain *d)
5424 {
5425 l3_pgentry_t *l3tab;
5426 unsigned int i;
5427
5428 if ( !d->arch.perdomain_l3_pg )
5429 return;
5430
5431 l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5432
5433 for ( i = 0; i < PERDOMAIN_SLOTS; ++i)
5434 if ( l3e_get_flags(l3tab[i]) & _PAGE_PRESENT )
5435 {
5436 struct page_info *l2pg = l3e_get_page(l3tab[i]);
5437 l2_pgentry_t *l2tab = __map_domain_page(l2pg);
5438 unsigned int j;
5439
5440 for ( j = 0; j < L2_PAGETABLE_ENTRIES; ++j )
5441 if ( l2e_get_flags(l2tab[j]) & _PAGE_PRESENT )
5442 {
5443 struct page_info *l1pg = l2e_get_page(l2tab[j]);
5444
5445 if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
5446 {
5447 l1_pgentry_t *l1tab = __map_domain_page(l1pg);
5448 unsigned int k;
5449
5450 for ( k = 0; k < L1_PAGETABLE_ENTRIES; ++k )
5451 if ( (l1e_get_flags(l1tab[k]) &
5452 (_PAGE_PRESENT | _PAGE_AVAIL0)) ==
5453 (_PAGE_PRESENT | _PAGE_AVAIL0) )
5454 free_domheap_page(l1e_get_page(l1tab[k]));
5455
5456 unmap_domain_page(l1tab);
5457 }
5458
5459 if ( is_xen_heap_page(l1pg) )
5460 free_xenheap_page(page_to_virt(l1pg));
5461 else
5462 free_domheap_page(l1pg);
5463 }
5464
5465 unmap_domain_page(l2tab);
5466 free_domheap_page(l2pg);
5467 }
5468
5469 unmap_domain_page(l3tab);
5470 free_domheap_page(d->arch.perdomain_l3_pg);
5471 d->arch.perdomain_l3_pg = NULL;
5472 }
5473
5474 #ifdef MEMORY_GUARD
5475
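/*
 * Memory guarding: flip a page-aligned range of Xen virtual addresses
 * between present-RW and not-present, so stray accesses to guarded ranges
 * (e.g. the stack guard page set up below) fault immediately.
 */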
5476 static void __memguard_change_range(void *p, unsigned long l, int guard)
5477 {
5478 unsigned long _p = (unsigned long)p;
5479 unsigned long _l = (unsigned long)l;
5480 unsigned int flags = __PAGE_HYPERVISOR_RW | MAP_SMALL_PAGES;
5481
5482 /* Ensure we are dealing with a page-aligned whole number of pages. */
5483 ASSERT(IS_ALIGNED(_p, PAGE_SIZE));
5484 ASSERT(IS_ALIGNED(_l, PAGE_SIZE));
5485
5486 if ( guard )
5487 flags &= ~_PAGE_PRESENT;
5488
5489 map_pages_to_xen(
5490 _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
5491 }
5492
5493 void memguard_guard_range(void *p, unsigned long l)
5494 {
5495 __memguard_change_range(p, l, 1);
5496 }
5497
5498 void memguard_unguard_range(void *p, unsigned long l)
5499 {
5500 __memguard_change_range(p, l, 0);
5501 }
5502
5503 #endif
5504
5505 void memguard_guard_stack(void *p)
5506 {
5507 BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
5508 p = (void *)((unsigned long)p + STACK_SIZE -
5509 PRIMARY_STACK_SIZE - PAGE_SIZE);
5510 memguard_guard_range(p, PAGE_SIZE);
5511 }
5512
5513 void memguard_unguard_stack(void *p)
5514 {
5515 p = (void *)((unsigned long)p + STACK_SIZE -
5516 PRIMARY_STACK_SIZE - PAGE_SIZE);
5517 memguard_unguard_range(p, PAGE_SIZE);
5518 }
5519
5520 void arch_dump_shared_mem_info(void)
5521 {
5522 printk("Shared frames %u -- Saved frames %u\n",
5523 mem_sharing_get_nr_shared_mfns(),
5524 mem_sharing_get_nr_saved_mfns());
5525 }
5526
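/*
 * Frames known to be unsafe to hand out as ordinary RAM on systems with a
 * Sandy Bridge integrated graphics device; the list is only reported when
 * such an IGD is found.
 */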
5527 const unsigned long *__init get_platform_badpages(unsigned int *array_size)
5528 {
5529 u32 igd_id;
5530 static unsigned long __initdata bad_pages[] = {
5531 0x20050000,
5532 0x20110000,
5533 0x20130000,
5534 0x20138000,
5535 0x40004000,
5536 };
5537
5538 *array_size = ARRAY_SIZE(bad_pages);
5539 igd_id = pci_conf_read32(0, 0, 2, 0, 0);
5540 if ( !IS_SNB_GFX(igd_id) )
5541 return NULL;
5542
5543 return bad_pages;
5544 }
5545
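/*
 * Invalidate the mapping of a single guest virtual address, giving the
 * active paging mode first refusal; if it does not suppress the flush, fall
 * back to a local INVLPG for PV vCPUs or the HVM invlpg hook otherwise.
 */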
5546 void paging_invlpg(struct vcpu *v, unsigned long va)
5547 {
5548 if ( !is_canonical_address(va) )
5549 return;
5550
5551 if ( paging_mode_enabled(v->domain) &&
5552 !paging_get_hostmode(v)->invlpg(v, va) )
5553 return;
5554
5555 if ( is_pv_vcpu(v) )
5556 flush_tlb_one_local(va);
5557 else
5558 hvm_funcs.invlpg(v, va);
5559 }
5560
5561 /* Build a 32bit PSE page table using 4MB pages. */
5562 void write_32bit_pse_identmap(uint32_t *l2)
5563 {
5564 unsigned int i;
5565
5566 for ( i = 0; i < PAGE_SIZE / sizeof(*l2); i++ )
5567 l2[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5568 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5569 }
5570
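/*
 * Highest usable machine frame number: bounded by the memory hotplug
 * ceiling (or max_page), by paddr_bits and, without CONFIG_BIGMEM, by the
 * 32-bit frame-number limit.
 */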
5571 unsigned long get_upper_mfn_bound(void)
5572 {
5573 unsigned long max_mfn;
5574
5575 max_mfn = mem_hotplug ? PFN_DOWN(mem_hotplug) : max_page;
5576 #ifndef CONFIG_BIGMEM
5577 max_mfn = min(max_mfn, 1UL << 32);
5578 #endif
5579 return min(max_mfn, 1UL << (paddr_bits - PAGE_SHIFT)) - 1;
5580 }
5581
5582 /*
5583 * Local variables:
5584 * mode: C
5585 * c-file-style: "BSD"
5586 * c-basic-offset: 4
5587 * tab-width: 4
5588 * indent-tabs-mode: nil
5589 * End:
5590 */
5591