1 /******************************************************************************
2  * arch/x86/mm.c
3  *
4  * Copyright (c) 2002-2005 K A Fraser
5  * Copyright (c) 2004 Christian Limpach
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; If not, see <http://www.gnu.org/licenses/>.
19  */
20 
21 /*
22  * A description of the x86 page table API:
23  *
24  * Domains trap to do_mmu_update with a list of update requests.
25  * This is a list of (ptr, val) pairs, where the requested operation
26  * is *ptr = val.
27  *
28  * Reference counting of pages:
29  * ----------------------------
30  * Each page has two refcounts: tot_count and type_count.
31  *
32  * TOT_COUNT is the obvious reference count. It counts all uses of a
33  * physical page frame by a domain, including uses as a page directory,
34  * a page table, or simple mappings via a PTE. This count prevents a
35  * domain from releasing a frame back to the free pool when it still holds
36  * a reference to it.
37  *
38  * TYPE_COUNT is more subtle. A frame can be put to one of three
39  * mutually-exclusive uses: it might be used as a page directory, or a
40  * page table, or it may be mapped writable by the domain [of course, a
41  * frame need not be used in any of these three ways!].
42  * So, type_count is a count of the number of times a frame is being
43  * referred to in its current incarnation. Therefore, a page can only
44  * change its type when its type count is zero.
45  *
46  * Pinning the page type:
47  * ----------------------
48  * The type of a page can be pinned/unpinned with the commands
49  * MMUEXT_[UN]PIN_L?_TABLE. Each page can be pinned exactly once (that is,
50  * pinning is not reference counted, so it can't be nested).
51  * This is useful to prevent a page's type count falling to zero, at which
52  * point safety checks would need to be carried out next time the count
53  * is increased again.
54  *
55  * A further note on writable page mappings:
56  * -----------------------------------------
57  * For simplicity, the count of writable mappings for a page may not
58  * correspond to reality. The 'writable count' is incremented for every
59  * PTE which maps the page with the _PAGE_RW flag set. However, for
60  * write access to be possible the page directory entry must also have
61  * its _PAGE_RW bit set. We do not check this as it complicates the
62  * reference counting considerably [consider the case of multiple
63  * directory entries referencing a single page table, some with the RW
64  * bit set, others not -- it starts getting a bit messy].
65  * In normal use, this simplification shouldn't be a problem.
66  * However, the logic can be added if required.
67  *
68  * One more note on read-only page mappings:
69  * -----------------------------------------
70  * We want domains to be able to map pages for read-only access. The
71  * main reason is that page tables and directories should be readable
72  * by a domain, but it would not be safe for them to be writable.
73  * However, domains have free access to rings 1 & 2 of the Intel
74  * privilege model. In terms of page protection, these are considered
75  * to be part of 'supervisor mode'. The WP bit in CR0 controls whether
76  * read-only restrictions are respected in supervisor mode -- if the
77  * bit is clear then any mapped page is writable.
78  *
79  * We get round this by always setting the WP bit and disallowing
80  * updates to it. This is very unlikely to cause a problem for guest
81  * OSes, which will generally use the WP bit to simplify copy-on-write
82  * implementation (in that case, the OS wants a fault when it writes to
83  * an application-supplied buffer).
84  */
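
/*
 * Illustrative sketch (not part of this file) of how a PV guest might drive
 * the interface described above.  guest_pte_maddr() is a hypothetical helper
 * standing in for the guest's own PTE-virtual-address-to-machine-address
 * translation; the rest uses the public hypercall interface.
 *
 *     struct mmu_update req = {
 *         // Low bits of ptr select the request type; the rest is the
 *         // machine address of the PTE to be written.
 *         .ptr = guest_pte_maddr(va) | MMU_NORMAL_PT_UPDATE,
 *         .val = new_pte,                 // i.e. *ptr = val, after checks
 *     };
 *     HYPERVISOR_mmu_update(&req, 1, NULL, DOMID_SELF);
 *
 *     // Pin a top-level table so its type count can never fall to zero:
 *     struct mmuext_op pin = {
 *         .cmd = MMUEXT_PIN_L4_TABLE,
 *         .arg1.mfn = l4_mfn,
 *     };
 *     HYPERVISOR_mmuext_op(&pin, 1, NULL, DOMID_SELF);
 */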
85 
86 #include <xen/init.h>
87 #include <xen/kernel.h>
88 #include <xen/lib.h>
89 #include <xen/mm.h>
90 #include <xen/domain.h>
91 #include <xen/sched.h>
92 #include <xen/err.h>
93 #include <xen/perfc.h>
94 #include <xen/irq.h>
95 #include <xen/softirq.h>
96 #include <xen/domain_page.h>
97 #include <xen/event.h>
98 #include <xen/iocap.h>
99 #include <xen/guest_access.h>
100 #include <xen/pfn.h>
101 #include <xen/vmap.h>
102 #include <xen/xmalloc.h>
103 #include <xen/efi.h>
104 #include <xen/grant_table.h>
105 #include <xen/hypercall.h>
106 #include <asm/paging.h>
107 #include <asm/shadow.h>
108 #include <asm/page.h>
109 #include <asm/flushtlb.h>
110 #include <asm/io.h>
111 #include <asm/ldt.h>
112 #include <asm/x86_emulate.h>
113 #include <asm/e820.h>
114 #include <asm/hypercall.h>
115 #include <asm/shared.h>
116 #include <asm/mem_sharing.h>
117 #include <public/memory.h>
118 #include <public/sched.h>
119 #include <xsm/xsm.h>
120 #include <xen/trace.h>
121 #include <asm/setup.h>
122 #include <asm/fixmap.h>
123 #include <asm/io_apic.h>
124 #include <asm/pci.h>
125 #include <asm/guest.h>
126 
127 #include <asm/hvm/grant_table.h>
128 #include <asm/pv/grant_table.h>
129 
130 #include "pv/mm.h"
131 
132 /* Override macros from asm/page.h to make them work with mfn_t */
133 #undef mfn_to_page
134 #define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
135 #undef page_to_mfn
136 #define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
137 
138 /* Mapping of the fixmap space needed early. */
139 l1_pgentry_t __section(".bss.page_aligned") __aligned(PAGE_SIZE)
140     l1_fixmap[L1_PAGETABLE_ENTRIES];
141 
142 paddr_t __read_mostly mem_hotplug;
143 
144 /* Private domain structs for DOMID_XEN and DOMID_IO. */
145 struct domain *dom_xen, *dom_io, *dom_cow;
146 
147 /* Frame table size in pages. */
148 unsigned long max_page;
149 unsigned long total_pages;
150 
151 bool __read_mostly machine_to_phys_mapping_valid;
152 
153 struct rangeset *__read_mostly mmio_ro_ranges;
154 
155 static uint32_t base_disallow_mask;
156 /* Global bit is allowed to be set on L1 PTEs. Intended for user mappings. */
157 #define L1_DISALLOW_MASK ((base_disallow_mask | _PAGE_GNTTAB) & ~_PAGE_GLOBAL)
158 
159 #define L2_DISALLOW_MASK base_disallow_mask
160 
161 #define l3_disallow_mask(d) (!is_pv_32bit_domain(d) ? \
162                              base_disallow_mask : 0xFFFFF198U)
163 
164 #define L4_DISALLOW_MASK (base_disallow_mask)
165 
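/*
 * PV domains with no I/O memory, no I/O port and no passed-through device
 * capabilities are additionally denied control of the cache attribute bits
 * in their L1 entries; everyone else (including dom_io) may set them.
 */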
166 #define l1_disallow_mask(d)                                     \
167     ((d != dom_io) &&                                           \
168      (rangeset_is_empty((d)->iomem_caps) &&                     \
169       rangeset_is_empty((d)->arch.ioport_caps) &&               \
170       !has_arch_pdevs(d) &&                                     \
171       is_pv_domain(d)) ?                                        \
172      L1_DISALLOW_MASK : (L1_DISALLOW_MASK & ~PAGE_CACHE_ATTRS))
173 
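/*
 * "mmio-relax" command line option.  By default guests may not map MMIO
 * pages with caching enabled (the cache attributes get forced in
 * get_page_from_l1e()).  "mmio-relax" / "mmio-relax=<bool>" relaxes this for
 * the hardware domain only; "mmio-relax=all" relaxes it for all domains.
 */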
174 static s8 __read_mostly opt_mmio_relax;
175 
176 static int __init parse_mmio_relax(const char *s)
177 {
178     if ( !*s )
179         opt_mmio_relax = 1;
180     else
181         opt_mmio_relax = parse_bool(s, NULL);
182     if ( opt_mmio_relax < 0 && strcmp(s, "all") )
183     {
184         opt_mmio_relax = 0;
185         return -EINVAL;
186     }
187 
188     return 0;
189 }
190 custom_param("mmio-relax", parse_mmio_relax);
191 
192 static void __init init_frametable_chunk(void *start, void *end)
193 {
194     unsigned long s = (unsigned long)start;
195     unsigned long e = (unsigned long)end;
196     unsigned long step;
197     mfn_t mfn;
198 
199     ASSERT(!(s & ((1 << L2_PAGETABLE_SHIFT) - 1)));
200     for ( ; s < e; s += step << PAGE_SHIFT )
201     {
202         step = 1UL << (cpu_has_page1gb &&
203                        !(s & ((1UL << L3_PAGETABLE_SHIFT) - 1)) ?
204                        L3_PAGETABLE_SHIFT - PAGE_SHIFT :
205                        L2_PAGETABLE_SHIFT - PAGE_SHIFT);
206         /*
207          * The hardcoded 4 below is arbitrary - just pick whatever you think
208          * is reasonable to waste as a trade-off for using a large page.
209          */
210         while ( step && s + (step << PAGE_SHIFT) > e + (4 << PAGE_SHIFT) )
211             step >>= PAGETABLE_ORDER;
212         mfn = alloc_boot_pages(step, step);
213         map_pages_to_xen(s, mfn_x(mfn), step, PAGE_HYPERVISOR);
214     }
215 
216     memset(start, 0, end - start);
217     memset(end, -1, s - e);
218 }
219 
220 void __init init_frametable(void)
221 {
222     unsigned int sidx, eidx, nidx;
223     unsigned int max_idx = (max_pdx + PDX_GROUP_COUNT - 1) / PDX_GROUP_COUNT;
224     struct page_info *end_pg, *top_pg;
225 
226     BUILD_BUG_ON(XEN_VIRT_END > FRAMETABLE_VIRT_START);
227     BUILD_BUG_ON(FRAMETABLE_VIRT_START & ((1UL << L2_PAGETABLE_SHIFT) - 1));
228 
229     for ( sidx = 0; ; sidx = nidx )
230     {
231         eidx = find_next_zero_bit(pdx_group_valid, max_idx, sidx);
232         nidx = find_next_bit(pdx_group_valid, max_idx, eidx);
233         if ( nidx >= max_idx )
234             break;
235         init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT),
236                               pdx_to_page(eidx * PDX_GROUP_COUNT));
237     }
238 
239     end_pg = pdx_to_page(max_pdx - 1) + 1;
240     top_pg = mem_hotplug ? pdx_to_page(max_idx * PDX_GROUP_COUNT - 1) + 1
241                          : end_pg;
242     init_frametable_chunk(pdx_to_page(sidx * PDX_GROUP_COUNT), top_pg);
243     memset(end_pg, -1, (unsigned long)top_pg - (unsigned long)end_pg);
244 }
245 
246 #ifndef NDEBUG
247 static unsigned int __read_mostly root_pgt_pv_xen_slots
248     = ROOT_PAGETABLE_PV_XEN_SLOTS;
249 static l4_pgentry_t __read_mostly split_l4e;
250 #else
251 #define root_pgt_pv_xen_slots ROOT_PAGETABLE_PV_XEN_SLOTS
252 #endif
253 
254 void __init arch_init_memory(void)
255 {
256     unsigned long i, pfn, rstart_pfn, rend_pfn, iostart_pfn, ioend_pfn;
257 
258     /*
259      * Basic guest-accessible flags:
260      *   PRESENT, R/W, USER, A/D, AVAIL[0,1,2], AVAIL_HIGH, NX (if available).
261      */
262     base_disallow_mask =
263         ~(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | _PAGE_ACCESSED |
264           _PAGE_DIRTY | _PAGE_AVAIL | _PAGE_AVAIL_HIGH | _PAGE_NX);
265 
266     /*
267      * Initialise our DOMID_XEN domain.
268      * Any Xen-heap pages that we will allow to be mapped will have
269      * their domain field set to dom_xen.
270      * Hidden PCI devices will also be associated with this domain
271      * (but be [partly] controlled by Dom0 nevertheless).
272      */
273     dom_xen = domain_create(DOMID_XEN, DOMCRF_dummy, 0, NULL);
274     BUG_ON(IS_ERR(dom_xen));
275     INIT_LIST_HEAD(&dom_xen->arch.pdev_list);
276 
277     /*
278      * Initialise our DOMID_IO domain.
279      * This domain owns I/O pages that are within the range of the page_info
280      * array. Mappings occur at the privilege level of the caller.
281      */
282     dom_io = domain_create(DOMID_IO, DOMCRF_dummy, 0, NULL);
283     BUG_ON(IS_ERR(dom_io));
284 
285     /*
286      * Initialise our COW domain.
287      * This domain owns sharable pages.
288      */
289     dom_cow = domain_create(DOMID_COW, DOMCRF_dummy, 0, NULL);
290     BUG_ON(IS_ERR(dom_cow));
291 
292     /*
293      * First 1MB of RAM is historically marked as I/O.  If we booted PVH,
294      * reclaim the space.  Irrespective, leave MFN 0 as special for the sake
295      * of 0 being a very common default value. Also reserve page 0x1 which is
296      * used by the trampoline code on PVH.
297      */
298     BUG_ON(pvh_boot && trampoline_phys != 0x1000);
299     for ( i = 0;
300           i < (pvh_boot ? (1 + PFN_UP(trampoline_end - trampoline_start))
301                         : 0x100);
302           i++ )
303         share_xen_page_with_guest(mfn_to_page(_mfn(i)),
304                                   dom_io, XENSHARE_writable);
305 
306     /* Any areas not specified as RAM by the e820 map are considered I/O. */
307     for ( i = 0, pfn = 0; pfn < max_page; i++ )
308     {
309         while ( (i < e820.nr_map) &&
310                 (e820.map[i].type != E820_RAM) &&
311                 (e820.map[i].type != E820_UNUSABLE) )
312             i++;
313 
314         if ( i >= e820.nr_map )
315         {
316             /* No more RAM regions: mark as I/O right to end of memory map. */
317             rstart_pfn = rend_pfn = max_page;
318         }
319         else
320         {
321             /* Mark as I/O just up to the next RAM region. */
322             rstart_pfn = min_t(unsigned long, max_page,
323                                PFN_UP(e820.map[i].addr));
324             rend_pfn   = max_t(unsigned long, rstart_pfn,
325                                PFN_DOWN(e820.map[i].addr + e820.map[i].size));
326         }
327 
328         /*
329          * Make sure any Xen mappings of RAM holes above 1MB are blown away.
330          * In particular this ensures that RAM holes are respected even in
331          * the statically-initialised 1-16MB mapping area.
332          */
333         iostart_pfn = max_t(unsigned long, pfn, 1UL << (20 - PAGE_SHIFT));
334         ioend_pfn = min(rstart_pfn, 16UL << (20 - PAGE_SHIFT));
335         if ( iostart_pfn < ioend_pfn )
336             destroy_xen_mappings((unsigned long)mfn_to_virt(iostart_pfn),
337                                  (unsigned long)mfn_to_virt(ioend_pfn));
338 
339         /* Mark as I/O up to next RAM region. */
340         for ( ; pfn < rstart_pfn; pfn++ )
341         {
342             if ( !mfn_valid(_mfn(pfn)) )
343                 continue;
344             share_xen_page_with_guest(
345                 mfn_to_page(_mfn(pfn)), dom_io, XENSHARE_writable);
346         }
347 
348         /* Skip the RAM region. */
349         pfn = rend_pfn;
350     }
351 
352     subarch_init_memory();
353 
354     efi_init_memory();
355 
356     mem_sharing_init();
357 
358 #ifndef NDEBUG
359     if ( highmem_start )
360     {
361         unsigned long split_va = (unsigned long)__va(highmem_start);
362 
363         if ( split_va < HYPERVISOR_VIRT_END &&
364              split_va - 1 == (unsigned long)__va(highmem_start - 1) )
365         {
366             root_pgt_pv_xen_slots = l4_table_offset(split_va) -
367                                     ROOT_PAGETABLE_FIRST_XEN_SLOT;
368             ASSERT(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS);
369             if ( l4_table_offset(split_va) == l4_table_offset(split_va - 1) )
370             {
371                 l3_pgentry_t *l3tab = alloc_xen_pagetable();
372 
373                 if ( l3tab )
374                 {
375                     const l3_pgentry_t *l3idle =
376                         l4e_to_l3e(idle_pg_table[l4_table_offset(split_va)]);
377 
378                     for ( i = 0; i < l3_table_offset(split_va); ++i )
379                         l3tab[i] = l3idle[i];
380                     for ( ; i < L3_PAGETABLE_ENTRIES; ++i )
381                         l3tab[i] = l3e_empty();
382                     split_l4e = l4e_from_pfn(virt_to_mfn(l3tab),
383                                              __PAGE_HYPERVISOR_RW);
384                 }
385                 else
386                     ++root_pgt_pv_xen_slots;
387             }
388         }
389     }
390 #endif
391 }
392 
393 int page_is_ram_type(unsigned long mfn, unsigned long mem_type)
394 {
395     uint64_t maddr = pfn_to_paddr(mfn);
396     int i;
397 
398     for ( i = 0; i < e820.nr_map; i++ )
399     {
400         switch ( e820.map[i].type )
401         {
402         case E820_RAM:
403             if ( mem_type & RAM_TYPE_CONVENTIONAL )
404                 break;
405             continue;
406         case E820_RESERVED:
407             if ( mem_type & RAM_TYPE_RESERVED )
408                 break;
409             continue;
410         case E820_UNUSABLE:
411             if ( mem_type & RAM_TYPE_UNUSABLE )
412                 break;
413             continue;
414         case E820_ACPI:
415         case E820_NVS:
416             if ( mem_type & RAM_TYPE_ACPI )
417                 break;
418             continue;
419         default:
420             /* unknown */
421             continue;
422         }
423 
424         /* Test the range. */
425         if ( (e820.map[i].addr <= maddr) &&
426              ((e820.map[i].addr + e820.map[i].size) >= (maddr + PAGE_SIZE)) )
427             return 1;
428     }
429 
430     return 0;
431 }
432 
433 unsigned long domain_get_maximum_gpfn(struct domain *d)
434 {
435     if ( is_hvm_domain(d) )
436         return p2m_get_hostp2m(d)->max_mapped_pfn;
437     /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
438     return (arch_get_max_pfn(d) ?: 1) - 1;
439 }
440 
441 void share_xen_page_with_guest(
442     struct page_info *page, struct domain *d, int readonly)
443 {
444     if ( page_get_owner(page) == d )
445         return;
446 
447     set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), INVALID_M2P_ENTRY);
448 
449     spin_lock(&d->page_alloc_lock);
450 
451     /* The incremented type count pins as writable or read-only. */
452     page->u.inuse.type_info  = (readonly ? PGT_none : PGT_writable_page);
453     page->u.inuse.type_info |= PGT_validated | 1;
454 
455     page_set_owner(page, d);
456     smp_wmb(); /* install valid domain ptr before updating refcnt. */
457     ASSERT((page->count_info & ~PGC_xen_heap) == 0);
458 
459     /* Only add to the allocation list if the domain isn't dying. */
460     if ( !d->is_dying )
461     {
462         page->count_info |= PGC_xen_heap | PGC_allocated | 1;
463         if ( unlikely(d->xenheap_pages++ == 0) )
464             get_knownalive_domain(d);
465         page_list_add_tail(page, &d->xenpage_list);
466     }
467 
468     spin_unlock(&d->page_alloc_lock);
469 }
470 
471 int __init unshare_xen_page_with_guest(struct page_info *page,
472                                        struct domain *d)
473 {
474     if ( page_get_owner(page) != d || !is_xen_heap_page(page) )
475         return -EINVAL;
476 
477     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
478         put_page(page);
479 
480     /* Remove the owner and clear the flags. */
481     page->u.inuse.type_info = 0;
482     page_set_owner(page, NULL);
483 
484     return 0;
485 }
486 
487 void share_xen_page_with_privileged_guests(
488     struct page_info *page, int readonly)
489 {
490     share_xen_page_with_guest(page, dom_xen, readonly);
491 }
492 
493 void free_shared_domheap_page(struct page_info *page)
494 {
495     if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
496         put_page(page);
497     if ( !test_and_clear_bit(_PGC_xen_heap, &page->count_info) )
498         ASSERT_UNREACHABLE();
499     page->u.inuse.type_info = 0;
500     page_set_owner(page, NULL);
501     free_domheap_page(page);
502 }
503 
504 void make_cr3(struct vcpu *v, mfn_t mfn)
505 {
506     v->arch.cr3 = mfn_x(mfn) << PAGE_SHIFT;
507 }
508 
509 void write_ptbase(struct vcpu *v)
510 {
511     write_cr3(v->arch.cr3);
512 }
513 
514 /*
515  * Should be called after CR3 is updated.
516  *
517  * Uses values found in vcpu->arch.(guest_table and guest_table_user), and
518  * for HVM guests, arch.monitor_table and hvm's guest CR3.
519  *
520  * Update ref counts to shadow tables appropriately.
521  */
522 void update_cr3(struct vcpu *v)
523 {
524     mfn_t cr3_mfn;
525 
526     if ( paging_mode_enabled(v->domain) )
527     {
528         paging_update_cr3(v);
529         return;
530     }
531 
532     if ( !(v->arch.flags & TF_kernel_mode) )
533         cr3_mfn = pagetable_get_mfn(v->arch.guest_table_user);
534     else
535         cr3_mfn = pagetable_get_mfn(v->arch.guest_table);
536 
537     make_cr3(v, cr3_mfn);
538 }
539 
540 static inline void set_tlbflush_timestamp(struct page_info *page)
541 {
542     /*
543      * Record TLB information for flush later. We do not stamp page tables
544      * when running in shadow mode:
545      *  1. Pointless, since it's the shadow pt's which must be tracked.
546      *  2. Shadow mode reuses this field for shadowed page tables to store
547      *     flags info -- we don't want to conflict with that.
548      */
549     if ( !(page->count_info & PGC_page_table) ||
550          !shadow_mode_enabled(page_get_owner(page)) )
551         page_set_tlbflush_timestamp(page);
552 }
553 
554 const char __section(".bss.page_aligned.const") __aligned(PAGE_SIZE)
555     zero_page[PAGE_SIZE];
556 
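/*
 * Tear down all of the vCPU's shadow LDT mappings, dropping the page and
 * type references taken when the descriptor frames were mapped, and
 * optionally flush the TLBs of the CPUs the vCPU has recently run on.
 */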
557 static void invalidate_shadow_ldt(struct vcpu *v, int flush)
558 {
559     l1_pgentry_t *pl1e;
560     unsigned int i;
561     struct page_info *page;
562 
563     BUG_ON(unlikely(in_irq()));
564 
565     spin_lock(&v->arch.pv_vcpu.shadow_ldt_lock);
566 
567     if ( v->arch.pv_vcpu.shadow_ldt_mapcnt == 0 )
568         goto out;
569 
570     v->arch.pv_vcpu.shadow_ldt_mapcnt = 0;
571     pl1e = pv_ldt_ptes(v);
572 
573     for ( i = 0; i < 16; i++ )
574     {
575         if ( !(l1e_get_flags(pl1e[i]) & _PAGE_PRESENT) )
576             continue;
577         page = l1e_get_page(pl1e[i]);
578         l1e_write(&pl1e[i], l1e_empty());
579         ASSERT_PAGE_IS_TYPE(page, PGT_seg_desc_page);
580         ASSERT_PAGE_IS_DOMAIN(page, v->domain);
581         put_page_and_type(page);
582     }
583 
584     /* Rid TLBs of stale mappings (guest mappings and shadow mappings). */
585     if ( flush )
586         flush_tlb_mask(v->vcpu_dirty_cpumask);
587 
588  out:
589     spin_unlock(&v->arch.pv_vcpu.shadow_ldt_lock);
590 }
591 
592 
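/*
 * Validate a frame for use as a segment-descriptor (GDT/LDT) page: every
 * one of its 512 descriptors must pass check_descriptor().
 */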
593 static int alloc_segdesc_page(struct page_info *page)
594 {
595     const struct domain *owner = page_get_owner(page);
596     struct desc_struct *descs = __map_domain_page(page);
597     unsigned i;
598 
599     for ( i = 0; i < 512; i++ )
600         if ( unlikely(!check_descriptor(owner, &descs[i])) )
601             break;
602 
603     unmap_domain_page(descs);
604 
605     return i == 512 ? 0 : -EINVAL;
606 }
607 
608 static int get_page_and_type_from_mfn(
609     mfn_t mfn, unsigned long type, struct domain *d,
610     int partial, int preemptible)
611 {
612     struct page_info *page = mfn_to_page(mfn);
613     int rc;
614 
615     if ( likely(partial >= 0) &&
616          unlikely(!get_page_from_mfn(mfn, d)) )
617         return -EINVAL;
618 
619     rc = (preemptible ?
620           get_page_type_preemptible(page, type) :
621           (get_page_type(page, type) ? 0 : -EINVAL));
622 
623     if ( unlikely(rc) && partial >= 0 &&
624          (!preemptible || page != current->arch.old_guest_table) )
625         put_page(page);
626 
627     return rc;
628 }
629 
630 static void put_data_page(
631     struct page_info *page, int writeable)
632 {
633     if ( writeable )
634         put_page_and_type(page);
635     else
636         put_page(page);
637 }
638 
639 #ifdef CONFIG_PV_LINEAR_PT
640 
641 static bool inc_linear_entries(struct page_info *pg)
642 {
643     typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
644 
645     do {
646         /*
647          * The check below checks for the "linear use" count being non-zero
648          * as well as overflow.  Signed integer overflow is undefined behavior
649          * according to the C spec.  However, as long as linear_pt_count is
650          * smaller in size than 'int', the arithmetic operation of the
651          * increment below won't overflow; rather the result will be truncated
652          * when stored.  Ensure that this is always true.
653          */
654         BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
655         oc = nc++;
656         if ( nc <= 0 )
657             return false;
658         nc = cmpxchg(&pg->linear_pt_count, oc, nc);
659     } while ( oc != nc );
660 
661     return true;
662 }
663 
664 static void dec_linear_entries(struct page_info *pg)
665 {
666     typeof(pg->linear_pt_count) oc;
667 
668     oc = arch_fetch_and_add(&pg->linear_pt_count, -1);
669     ASSERT(oc > 0);
670 }
671 
672 static bool inc_linear_uses(struct page_info *pg)
673 {
674     typeof(pg->linear_pt_count) nc = read_atomic(&pg->linear_pt_count), oc;
675 
676     do {
677         /* See the respective comment in inc_linear_entries(). */
678         BUILD_BUG_ON(sizeof(nc) >= sizeof(int));
679         oc = nc--;
680         if ( nc >= 0 )
681             return false;
682         nc = cmpxchg(&pg->linear_pt_count, oc, nc);
683     } while ( oc != nc );
684 
685     return true;
686 }
687 
688 static void dec_linear_uses(struct page_info *pg)
689 {
690     typeof(pg->linear_pt_count) oc;
691 
692     oc = arch_fetch_and_add(&pg->linear_pt_count, 1);
693     ASSERT(oc < 0);
694 }
695 
696 /*
697  * We allow root tables to map each other (a.k.a. linear page tables). It
698  * needs some special care with reference counts and access permissions:
699  *  1. The mapping entry must be read-only, or the guest may get write access
700  *     to its own PTEs.
701  *  2. We must only bump the reference counts for an *already validated*
702  *     L2 table, or we can end up in a deadlock in get_page_type() by waiting
703  *     on a validation that is required to complete that validation.
704  *  3. We only need to increment the reference counts for the mapped page
705  *     frame if it is mapped by a different root table. This is sufficient and
706  *     also necessary to allow validation of a root table mapping itself.
707  */
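/*
 * Illustrative sketch (hypothetical guest-side values, not code from this
 * file): a PV guest typically sets up a linear ("recursive") mapping by
 * pointing a root-table slot back at a root table with the RW bit clear,
 *
 *     new_l4e = (l4_mfn << PAGE_SHIFT) | _PAGE_PRESENT | _PAGE_USER |
 *               _PAGE_ACCESSED;                       // note: no _PAGE_RW
 *
 * submitted via mmu_update.  Rule 1 above is why the _PAGE_RW check in
 * get_<level>_linear_pagetable() below rejects writable variants outright.
 */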
708 static bool __read_mostly opt_pv_linear_pt = true;
709 boolean_param("pv-linear-pt", opt_pv_linear_pt);
710 
711 #define define_get_linear_pagetable(level)                                  \
712 static int                                                                  \
713 get_##level##_linear_pagetable(                                             \
714     level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d)         \
715 {                                                                           \
716     unsigned long x, y;                                                     \
717     struct page_info *page;                                                 \
718     unsigned long pfn;                                                      \
719                                                                             \
720     if ( !opt_pv_linear_pt )                                                \
721     {                                                                       \
722         gdprintk(XENLOG_WARNING,                                            \
723                  "Attempt to create linear p.t. (feature disabled)\n");     \
724         return 0;                                                           \
725     }                                                                       \
726                                                                             \
727     if ( (level##e_get_flags(pde) & _PAGE_RW) )                             \
728     {                                                                       \
729         gdprintk(XENLOG_WARNING,                                            \
730                  "Attempt to create linear p.t. with write perms\n");       \
731         return 0;                                                           \
732     }                                                                       \
733                                                                             \
734     if ( (pfn = level##e_get_pfn(pde)) != pde_pfn )                         \
735     {                                                                       \
736         struct page_info *ptpg = mfn_to_page(_mfn(pde_pfn));                \
737                                                                             \
738         /* Make sure the page table belongs to the correct domain. */       \
739         if ( unlikely(page_get_owner(ptpg) != d) )                          \
740             return 0;                                                       \
741                                                                             \
742         /* Make sure the mapped frame belongs to the correct domain. */     \
743         if ( unlikely(!get_page_from_mfn(_mfn(pfn), d)) )                   \
744             return 0;                                                       \
745                                                                             \
746         /*                                                                  \
747          * Ensure that the mapped frame is an already-validated page table  \
748          * and does not itself have linear entries, as well as that the     \
749          * containing page table is not itself in use as a linear page table\
750          * elsewhere.                                                       \
751          * If so, atomically increment the count (checking for overflow).   \
752          */                                                                 \
753         page = mfn_to_page(_mfn(pfn));                                      \
754         if ( !inc_linear_entries(ptpg) )                                    \
755         {                                                                   \
756             put_page(page);                                                 \
757             return 0;                                                       \
758         }                                                                   \
759         if ( !inc_linear_uses(page) )                                       \
760         {                                                                   \
761             dec_linear_entries(ptpg);                                       \
762             put_page(page);                                                 \
763             return 0;                                                       \
764         }                                                                   \
765         y = page->u.inuse.type_info;                                        \
766         do {                                                                \
767             x = y;                                                          \
768             if ( unlikely((x & PGT_count_mask) == PGT_count_mask) ||        \
769                  unlikely((x & (PGT_type_mask|PGT_validated)) !=            \
770                           (PGT_##level##_page_table|PGT_validated)) )       \
771             {                                                               \
772                 dec_linear_uses(page);                                      \
773                 dec_linear_entries(ptpg);                                   \
774                 put_page(page);                                             \
775                 return 0;                                                   \
776             }                                                               \
777         }                                                                   \
778         while ( (y = cmpxchg(&page->u.inuse.type_info, x, x + 1)) != x );   \
779     }                                                                       \
780                                                                             \
781     return 1;                                                               \
782 }
783 
784 #else /* CONFIG_PV_LINEAR_PT */
785 
786 #define define_get_linear_pagetable(level)                              \
787 static int                                                              \
788 get_##level##_linear_pagetable(                                         \
789         level##_pgentry_t pde, unsigned long pde_pfn, struct domain *d) \
790 {                                                                       \
791         return 0;                                                       \
792 }
793 
794 static void dec_linear_uses(struct page_info *pg)
795 {
796     ASSERT(pg->linear_pt_count == 0);
797 }
798 
799 static void dec_linear_entries(struct page_info *pg)
800 {
801     ASSERT(pg->linear_pt_count == 0);
802 }
803 
804 #endif /* CONFIG_PV_LINEAR_PT */
805 
806 bool is_iomem_page(mfn_t mfn)
807 {
808     struct page_info *page;
809 
810     if ( !mfn_valid(mfn) )
811         return true;
812 
813     /* Caller must know that it is an iomem page, or a reference is held. */
814     page = mfn_to_page(mfn);
815     ASSERT((page->count_info & PGC_count_mask) != 0);
816 
817     return (page_get_owner(page) == dom_io);
818 }
819 
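/*
 * Bring Xen's own (directmap) mapping of an MFN into line with the cache
 * attribute a guest is now using for it.  If the MFN also aliases the Xen
 * image mapping, that alias is removed while a non-WB attribute is in force
 * and re-established once the attribute reverts to WB.
 */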
820 static int update_xen_mappings(unsigned long mfn, unsigned int cacheattr)
821 {
822     int err = 0;
823     bool alias = mfn >= PFN_DOWN(xen_phys_start) &&
824          mfn < PFN_UP(xen_phys_start + xen_virt_end - XEN_VIRT_START);
825     unsigned long xen_va =
826         XEN_VIRT_START + ((mfn - PFN_DOWN(xen_phys_start)) << PAGE_SHIFT);
827 
828     if ( unlikely(alias) && cacheattr )
829         err = map_pages_to_xen(xen_va, mfn, 1, 0);
830     if ( !err )
831         err = map_pages_to_xen((unsigned long)mfn_to_virt(mfn), mfn, 1,
832                      PAGE_HYPERVISOR | cacheattr_to_pte_flags(cacheattr));
833     if ( unlikely(alias) && !cacheattr && !err )
834         err = map_pages_to_xen(xen_va, mfn, 1, PAGE_HYPERVISOR);
835     return err;
836 }
837 
838 #ifndef NDEBUG
839 struct mmio_emul_range_ctxt {
840     const struct domain *d;
841     unsigned long mfn;
842 };
843 
844 static int print_mmio_emul_range(unsigned long s, unsigned long e, void *arg)
845 {
846     const struct mmio_emul_range_ctxt *ctxt = arg;
847 
848     if ( ctxt->mfn > e )
849         return 0;
850 
851     if ( ctxt->mfn >= s )
852     {
853         static DEFINE_SPINLOCK(last_lock);
854         static const struct domain *last_d;
855         static unsigned long last_s = ~0UL, last_e;
856         bool print = false;
857 
858         spin_lock(&last_lock);
859         if ( last_d != ctxt->d || last_s != s || last_e != e )
860         {
861             last_d = ctxt->d;
862             last_s = s;
863             last_e = e;
864             print = true;
865         }
866         spin_unlock(&last_lock);
867 
868         if ( print )
869             printk(XENLOG_G_INFO
870                    "d%d: Forcing write emulation on MFNs %lx-%lx\n",
871                    ctxt->d->domain_id, s, e);
872     }
873 
874     return 1;
875 }
876 #endif
877 
878 /*
879  * get_page_from_l1e returns:
880  *   0  => success (page not present also counts as such)
881  *  <0  => error code
882  *  >0  => the page flags to be flipped
883  */
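/*
 * A positive return value is applied by callers (e.g. alloc_l1_table()
 * below) with l1e_flip_flags(), so that the PTE actually installed has the
 * offending bits -- _PAGE_RW and/or cache attributes for certain MMIO
 * mappings -- flipped.
 */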
884 int
885 get_page_from_l1e(
886     l1_pgentry_t l1e, struct domain *l1e_owner, struct domain *pg_owner)
887 {
888     unsigned long mfn = l1e_get_pfn(l1e);
889     struct page_info *page = mfn_to_page(_mfn(mfn));
890     uint32_t l1f = l1e_get_flags(l1e);
891     struct vcpu *curr = current;
892     struct domain *real_pg_owner;
893     bool write;
894 
895     if ( !(l1f & _PAGE_PRESENT) )
896         return 0;
897 
898     if ( unlikely(l1f & l1_disallow_mask(l1e_owner)) )
899     {
900         gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
901                  l1f & l1_disallow_mask(l1e_owner));
902         return -EINVAL;
903     }
904 
905     if ( !mfn_valid(_mfn(mfn)) ||
906          (real_pg_owner = page_get_owner_and_reference(page)) == dom_io )
907     {
908         int flip = 0;
909 
910         /* Only needed the reference to confirm dom_io ownership. */
911         if ( mfn_valid(_mfn(mfn)) )
912             put_page(page);
913 
914         /* DOMID_IO reverts to caller for privilege checks. */
915         if ( pg_owner == dom_io )
916             pg_owner = curr->domain;
917 
918         if ( !iomem_access_permitted(pg_owner, mfn, mfn) )
919         {
920             if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
921             {
922                 gdprintk(XENLOG_WARNING,
923                          "d%d non-privileged attempt to map MMIO space %"PRI_mfn"\n",
924                          pg_owner->domain_id, mfn);
925                 return -EPERM;
926             }
927             return -EINVAL;
928         }
929 
930         if ( pg_owner != l1e_owner &&
931              !iomem_access_permitted(l1e_owner, mfn, mfn) )
932         {
933             if ( mfn != (PADDR_MASK >> PAGE_SHIFT) ) /* INVALID_MFN? */
934             {
935                 gdprintk(XENLOG_WARNING,
936                          "d%d attempted to map MMIO space %"PRI_mfn" in d%d to d%d\n",
937                          curr->domain->domain_id, mfn, pg_owner->domain_id,
938                          l1e_owner->domain_id);
939                 return -EPERM;
940             }
941             return -EINVAL;
942         }
943 
944         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
945         {
946             /* MMIO pages must not be mapped cacheable unless so requested. */
947             switch ( opt_mmio_relax )
948             {
949             case 0:
950                 break;
951             case 1:
952                 if ( !is_hardware_domain(l1e_owner) )
953                     break;
954                 /* fallthrough */
955             case -1:
956                 return 0;
957             default:
958                 ASSERT_UNREACHABLE();
959             }
960         }
961         else if ( l1f & _PAGE_RW )
962         {
963 #ifndef NDEBUG
964             const unsigned long *ro_map;
965             unsigned int seg, bdf;
966 
967             if ( !pci_mmcfg_decode(mfn, &seg, &bdf) ||
968                  ((ro_map = pci_get_ro_map(seg)) != NULL &&
969                   test_bit(bdf, ro_map)) )
970                 printk(XENLOG_G_WARNING
971                        "d%d: Forcing read-only access to MFN %lx\n",
972                        l1e_owner->domain_id, mfn);
973             else
974                 rangeset_report_ranges(mmio_ro_ranges, 0, ~0UL,
975                                        print_mmio_emul_range,
976                                        &(struct mmio_emul_range_ctxt){
977                                            .d = l1e_owner,
978                                            .mfn = mfn });
979 #endif
980             flip = _PAGE_RW;
981         }
982 
983         switch ( l1f & PAGE_CACHE_ATTRS )
984         {
985         case 0: /* WB */
986             flip |= _PAGE_PWT | _PAGE_PCD;
987             break;
988         case _PAGE_PWT: /* WT */
989         case _PAGE_PWT | _PAGE_PAT: /* WP */
990             flip |= _PAGE_PCD | (l1f & _PAGE_PAT);
991             break;
992         }
993 
994         return flip;
995     }
996 
997     if ( unlikely( (real_pg_owner != pg_owner) &&
998                    (real_pg_owner != dom_cow) ) )
999     {
1000         /*
1001          * Let privileged domains transfer the right to map their target
1002          * domain's pages. This is used to allow stub-domain pvfb export to
1003          * dom0, until pvfb supports granted mappings. At that time this
1004          * minor hack can go away.
1005          */
1006         if ( (real_pg_owner == NULL) || (pg_owner == l1e_owner) ||
1007              xsm_priv_mapping(XSM_TARGET, pg_owner, real_pg_owner) )
1008         {
1009             gdprintk(XENLOG_WARNING,
1010                      "pg_owner d%d l1e_owner d%d, but real_pg_owner d%d\n",
1011                      pg_owner->domain_id, l1e_owner->domain_id,
1012                      real_pg_owner ? real_pg_owner->domain_id : -1);
1013             goto could_not_pin;
1014         }
1015         pg_owner = real_pg_owner;
1016     }
1017 
1018     /*
1019      * Extra paranoid check for shared memory. Writable mappings
1020      * disallowed (unshare first!)
1021      */
1022     if ( (l1f & _PAGE_RW) && (real_pg_owner == dom_cow) )
1023         goto could_not_pin;
1024 
1025     /*
1026      * Foreign mappings into guests in shadow external mode don't
1027      * contribute to writeable mapping refcounts.  (This allows the
1028      * qemu-dm helper process in dom0 to map the domain's memory without
1029      * messing up the count of "real" writable mappings.)
1030      */
1031     write = (l1f & _PAGE_RW) &&
1032             ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner));
1033     if ( write && !get_page_type(page, PGT_writable_page) )
1034     {
1035         gdprintk(XENLOG_WARNING, "Could not get page type PGT_writable_page\n");
1036         goto could_not_pin;
1037     }
1038 
1039     if ( pte_flags_to_cacheattr(l1f) !=
1040          ((page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base) )
1041     {
1042         unsigned long x, nx, y = page->count_info;
1043         unsigned long cacheattr = pte_flags_to_cacheattr(l1f);
1044         int err;
1045 
1046         if ( is_xen_heap_page(page) )
1047         {
1048             if ( write )
1049                 put_page_type(page);
1050             put_page(page);
1051             gdprintk(XENLOG_WARNING,
1052                      "Attempt to change cache attributes of Xen heap page\n");
1053             return -EACCES;
1054         }
1055 
1056         do {
1057             x  = y;
1058             nx = (x & ~PGC_cacheattr_mask) | (cacheattr << PGC_cacheattr_base);
1059         } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
1060 
1061         err = update_xen_mappings(mfn, cacheattr);
1062         if ( unlikely(err) )
1063         {
1064             cacheattr = y & PGC_cacheattr_mask;
1065             do {
1066                 x  = y;
1067                 nx = (x & ~PGC_cacheattr_mask) | cacheattr;
1068             } while ( (y = cmpxchg(&page->count_info, x, nx)) != x );
1069 
1070             if ( write )
1071                 put_page_type(page);
1072             put_page(page);
1073 
1074             gdprintk(XENLOG_WARNING, "Error updating mappings for mfn %" PRI_mfn
1075                      " (pfn %" PRI_pfn ", from L1 entry %" PRIpte ") for d%d\n",
1076                      mfn, get_gpfn_from_mfn(mfn),
1077                      l1e_get_intpte(l1e), l1e_owner->domain_id);
1078             return err;
1079         }
1080     }
1081 
1082     return 0;
1083 
1084  could_not_pin:
1085     gdprintk(XENLOG_WARNING, "Error getting mfn %" PRI_mfn " (pfn %" PRI_pfn
1086              ") from L1 entry %" PRIpte " for l1e_owner d%d, pg_owner d%d\n",
1087              mfn, get_gpfn_from_mfn(mfn),
1088              l1e_get_intpte(l1e), l1e_owner->domain_id, pg_owner->domain_id);
1089     if ( real_pg_owner != NULL )
1090         put_page(page);
1091     return -EBUSY;
1092 }
1093 
1094 
1095 /* NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'. */
1096 /*
1097  * get_page_from_l2e returns:
1098  *   1 => page not present
1099  *   0 => success
1100  *  <0 => error code
1101  */
1102 define_get_linear_pagetable(l2);
1103 static int
1104 get_page_from_l2e(
1105     l2_pgentry_t l2e, unsigned long pfn, struct domain *d)
1106 {
1107     unsigned long mfn = l2e_get_pfn(l2e);
1108     int rc;
1109 
1110     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) )
1111         return 1;
1112 
1113     if ( unlikely((l2e_get_flags(l2e) & L2_DISALLOW_MASK)) )
1114     {
1115         gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
1116                  l2e_get_flags(l2e) & L2_DISALLOW_MASK);
1117         return -EINVAL;
1118     }
1119 
1120     if ( !(l2e_get_flags(l2e) & _PAGE_PSE) )
1121     {
1122         rc = get_page_and_type_from_mfn(_mfn(mfn), PGT_l1_page_table, d, 0, 0);
1123         if ( unlikely(rc == -EINVAL) && get_l2_linear_pagetable(l2e, pfn, d) )
1124             rc = 0;
1125         return rc;
1126     }
1127 
1128     return -EINVAL;
1129 }
1130 
1131 
1132 /*
1133  * get_page_from_l3e returns:
1134  *   1 => page not present
1135  *   0 => success
1136  *  <0 => error code
1137  */
1138 define_get_linear_pagetable(l3);
1139 static int
1140 get_page_from_l3e(
1141     l3_pgentry_t l3e, unsigned long pfn, struct domain *d, int partial)
1142 {
1143     int rc;
1144 
1145     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) )
1146         return 1;
1147 
1148     if ( unlikely((l3e_get_flags(l3e) & l3_disallow_mask(d))) )
1149     {
1150         gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
1151                  l3e_get_flags(l3e) & l3_disallow_mask(d));
1152         return -EINVAL;
1153     }
1154 
1155     rc = get_page_and_type_from_mfn(
1156         l3e_get_mfn(l3e), PGT_l2_page_table, d, partial, 1);
1157     if ( unlikely(rc == -EINVAL) &&
1158          !is_pv_32bit_domain(d) &&
1159          get_l3_linear_pagetable(l3e, pfn, d) )
1160         rc = 0;
1161 
1162     return rc;
1163 }
1164 
1165 /*
1166  * get_page_from_l4e returns:
1167  *   1 => page not present
1168  *   0 => success
1169  *  <0 => error code
1170  */
1171 define_get_linear_pagetable(l4);
1172 static int
1173 get_page_from_l4e(
1174     l4_pgentry_t l4e, unsigned long pfn, struct domain *d, int partial)
1175 {
1176     int rc;
1177 
1178     if ( !(l4e_get_flags(l4e) & _PAGE_PRESENT) )
1179         return 1;
1180 
1181     if ( unlikely((l4e_get_flags(l4e) & L4_DISALLOW_MASK)) )
1182     {
1183         gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
1184                  l4e_get_flags(l4e) & L4_DISALLOW_MASK);
1185         return -EINVAL;
1186     }
1187 
1188     rc = get_page_and_type_from_mfn(
1189         l4e_get_mfn(l4e), PGT_l3_page_table, d, partial, 1);
1190     if ( unlikely(rc == -EINVAL) && get_l4_linear_pagetable(l4e, pfn, d) )
1191         rc = 0;
1192 
1193     return rc;
1194 }
1195 
1196 static int _put_page_type(struct page_info *page, bool preemptible,
1197                           struct page_info *ptpg);
1198 
1199 void put_page_from_l1e(l1_pgentry_t l1e, struct domain *l1e_owner)
1200 {
1201     unsigned long     pfn = l1e_get_pfn(l1e);
1202     struct page_info *page;
1203     struct domain    *pg_owner;
1204     struct vcpu      *v;
1205 
1206     if ( !(l1e_get_flags(l1e) & _PAGE_PRESENT) || is_iomem_page(_mfn(pfn)) )
1207         return;
1208 
1209     page = mfn_to_page(_mfn(pfn));
1210     pg_owner = page_get_owner(page);
1211 
1212     /*
1213      * Check if this is a mapping that was established via a grant reference.
1214      * If it was then we should not be here: we require that such mappings are
1215      * explicitly destroyed via the grant-table interface.
1216      *
1217      * The upshot of this is that the guest can end up with active grants that
1218      * it cannot destroy (because it no longer has a PTE to present to the
1219      * grant-table interface). This can lead to subtle hard-to-catch bugs,
1220      * hence a special grant PTE flag can be enabled to catch the bug early.
1221      *
1222      * (Note that the undestroyable active grants are not a security hole in
1223      * Xen. All active grants can safely be cleaned up when the domain dies.)
1224      */
1225     if ( (l1e_get_flags(l1e) & _PAGE_GNTTAB) &&
1226          !l1e_owner->is_shutting_down && !l1e_owner->is_dying )
1227     {
1228         gdprintk(XENLOG_WARNING,
1229                  "Attempt to implicitly unmap a granted PTE %" PRIpte "\n",
1230                  l1e_get_intpte(l1e));
1231         domain_crash(l1e_owner);
1232     }
1233 
1234     /*
1235      * Remember we didn't take a type-count of foreign writable mappings
1236      * to paging-external domains.
1237      */
1238     if ( (l1e_get_flags(l1e) & _PAGE_RW) &&
1239          ((l1e_owner == pg_owner) || !paging_mode_external(pg_owner)) )
1240     {
1241         put_page_and_type(page);
1242     }
1243     else
1244     {
1245         /* We expect this is rare so we blow the entire shadow LDT. */
1246         if ( unlikely(((page->u.inuse.type_info & PGT_type_mask) ==
1247                        PGT_seg_desc_page)) &&
1248              unlikely(((page->u.inuse.type_info & PGT_count_mask) != 0)) &&
1249              (l1e_owner == pg_owner) )
1250         {
1251             for_each_vcpu ( pg_owner, v )
1252                 invalidate_shadow_ldt(v, 1);
1253         }
1254         put_page(page);
1255     }
1256 }
1257 
1258 
1259 /*
1260  * NB. Virtual address 'l2e' maps to a machine address within frame 'pfn'.
1261  * Note also that this automatically deals correctly with linear p.t.'s.
1262  */
1263 static int put_page_from_l2e(l2_pgentry_t l2e, unsigned long pfn)
1264 {
1265     if ( !(l2e_get_flags(l2e) & _PAGE_PRESENT) || (l2e_get_pfn(l2e) == pfn) )
1266         return 1;
1267 
1268     if ( l2e_get_flags(l2e) & _PAGE_PSE )
1269     {
1270         struct page_info *page = l2e_get_page(l2e);
1271         unsigned int i;
1272 
1273         for ( i = 0; i < (1u << PAGETABLE_ORDER); i++, page++ )
1274             put_page_and_type(page);
1275     }
1276     else
1277     {
1278         struct page_info *pg = l2e_get_page(l2e);
1279         int rc = _put_page_type(pg, false, mfn_to_page(_mfn(pfn)));
1280 
1281         ASSERT(!rc);
1282         put_page(pg);
1283     }
1284 
1285     return 0;
1286 }
1287 
1288 static int put_page_from_l3e(l3_pgentry_t l3e, unsigned long pfn,
1289                              int partial, bool defer)
1290 {
1291     struct page_info *pg;
1292     int rc;
1293 
1294     if ( !(l3e_get_flags(l3e) & _PAGE_PRESENT) || (l3e_get_pfn(l3e) == pfn) )
1295         return 1;
1296 
1297     if ( unlikely(l3e_get_flags(l3e) & _PAGE_PSE) )
1298     {
1299         unsigned long mfn = l3e_get_pfn(l3e);
1300         int writeable = l3e_get_flags(l3e) & _PAGE_RW;
1301 
1302         ASSERT(!(mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)));
1303         do {
1304             put_data_page(mfn_to_page(_mfn(mfn)), writeable);
1305         } while ( ++mfn & ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1) );
1306 
1307         return 0;
1308     }
1309 
1310     pg = l3e_get_page(l3e);
1311 
1312     if ( unlikely(partial > 0) )
1313     {
1314         ASSERT(!defer);
1315         return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1316     }
1317 
1318     if ( defer )
1319     {
1320         current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
1321         current->arch.old_guest_table = pg;
1322         return 0;
1323     }
1324 
1325     rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1326     if ( likely(!rc) )
1327         put_page(pg);
1328 
1329     return rc;
1330 }
1331 
1332 static int put_page_from_l4e(l4_pgentry_t l4e, unsigned long pfn,
1333                              int partial, bool defer)
1334 {
1335     int rc = 1;
1336 
1337     if ( (l4e_get_flags(l4e) & _PAGE_PRESENT) &&
1338          (l4e_get_pfn(l4e) != pfn) )
1339     {
1340         struct page_info *pg = l4e_get_page(l4e);
1341 
1342         if ( unlikely(partial > 0) )
1343         {
1344             ASSERT(!defer);
1345             return _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1346         }
1347 
1348         if ( defer )
1349         {
1350             current->arch.old_guest_ptpg = mfn_to_page(_mfn(pfn));
1351             current->arch.old_guest_table = pg;
1352             return 0;
1353         }
1354 
1355         rc = _put_page_type(pg, true, mfn_to_page(_mfn(pfn)));
1356         if ( likely(!rc) )
1357             put_page(pg);
1358     }
1359 
1360     return rc;
1361 }
1362 
1363 static int alloc_l1_table(struct page_info *page)
1364 {
1365     struct domain *d = page_get_owner(page);
1366     l1_pgentry_t  *pl1e;
1367     unsigned int   i;
1368     int            ret = 0;
1369 
1370     pl1e = __map_domain_page(page);
1371 
1372     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1373     {
1374         switch ( ret = get_page_from_l1e(pl1e[i], d, d) )
1375         {
1376         default:
1377             goto fail;
1378         case 0:
1379             break;
1380         case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
1381             ASSERT(!(ret & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
1382             l1e_flip_flags(pl1e[i], ret);
1383             break;
1384         }
1385 
1386         pl1e[i] = adjust_guest_l1e(pl1e[i], d);
1387     }
1388 
1389     unmap_domain_page(pl1e);
1390     return 0;
1391 
1392  fail:
1393     gdprintk(XENLOG_WARNING, "Failure in alloc_l1_table: slot %#x\n", i);
1394     while ( i-- > 0 )
1395         put_page_from_l1e(pl1e[i], d);
1396 
1397     unmap_domain_page(pl1e);
1398     return ret;
1399 }
1400 
1401 static int create_pae_xen_mappings(struct domain *d, l3_pgentry_t *pl3e)
1402 {
1403     struct page_info *page;
1404     l3_pgentry_t     l3e3;
1405 
1406     if ( !is_pv_32bit_domain(d) )
1407         return 1;
1408 
1409     pl3e = (l3_pgentry_t *)((unsigned long)pl3e & PAGE_MASK);
1410 
1411     /* 3rd L3 slot contains L2 with Xen-private mappings. It *must* exist. */
1412     l3e3 = pl3e[3];
1413     if ( !(l3e_get_flags(l3e3) & _PAGE_PRESENT) )
1414     {
1415         gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is empty\n");
1416         return 0;
1417     }
1418 
1419     /*
1420      * The Xen-private mappings include linear mappings. The L2 thus cannot
1421      * be shared by multiple L3 tables. The test here is adequate because:
1422      *  1. Cannot appear in slots != 3 because get_page_type() checks the
1423      *     PGT_pae_xen_l2 flag, which is asserted iff the L2 appears in slot 3
1424      *  2. Cannot appear in another page table's L3:
1425      *     a. alloc_l3_table() calls this function and this check will fail
1426      *     b. mod_l3_entry() disallows updates to slot 3 in an existing table
1427      */
1428     page = l3e_get_page(l3e3);
1429     BUG_ON(page->u.inuse.type_info & PGT_pinned);
1430     BUG_ON((page->u.inuse.type_info & PGT_count_mask) == 0);
1431     BUG_ON(!(page->u.inuse.type_info & PGT_pae_xen_l2));
1432     if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
1433     {
1434         gdprintk(XENLOG_WARNING, "PAE L3 3rd slot is shared\n");
1435         return 0;
1436     }
1437 
1438     return 1;
1439 }
1440 
1441 static int alloc_l2_table(struct page_info *page, unsigned long type,
1442                           int preemptible)
1443 {
1444     struct domain *d = page_get_owner(page);
1445     unsigned long  pfn = mfn_x(page_to_mfn(page));
1446     l2_pgentry_t  *pl2e;
1447     unsigned int   i;
1448     int            rc = 0;
1449 
1450     pl2e = map_domain_page(_mfn(pfn));
1451 
1452     for ( i = page->nr_validated_ptes; i < L2_PAGETABLE_ENTRIES; i++ )
1453     {
1454         if ( preemptible && i > page->nr_validated_ptes
1455              && hypercall_preempt_check() )
1456         {
1457             page->nr_validated_ptes = i;
1458             rc = -ERESTART;
1459             break;
1460         }
1461 
1462         if ( !is_guest_l2_slot(d, type, i) ||
1463              (rc = get_page_from_l2e(pl2e[i], pfn, d)) > 0 )
1464             continue;
1465 
1466         if ( rc < 0 )
1467         {
1468             gdprintk(XENLOG_WARNING, "Failure in alloc_l2_table: slot %#x\n", i);
1469             while ( i-- > 0 )
1470                 if ( is_guest_l2_slot(d, type, i) )
1471                     put_page_from_l2e(pl2e[i], pfn);
1472             break;
1473         }
1474 
1475         pl2e[i] = adjust_guest_l2e(pl2e[i], d);
1476     }
1477 
1478     if ( rc >= 0 && (type & PGT_pae_xen_l2) )
1479         init_xen_pae_l2_slots(pl2e, d);
1480 
1481     unmap_domain_page(pl2e);
1482     return rc > 0 ? 0 : rc;
1483 }
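/*
 * The loop above shows the preemption pattern shared by the multi-level
 * validators: progress is parked in page->nr_validated_ptes, -ERESTART is
 * handed back to the caller (which creates a hypercall continuation), and
 * a later invocation resumes from the recorded slot.  A hedged sketch of
 * the shape, with ENTRIES standing in for the per-level entry count:
 *
 *     for ( i = page->nr_validated_ptes; i < ENTRIES; i++ )
 *     {
 *         if ( preemptible && i > page->nr_validated_ptes &&
 *              hypercall_preempt_check() )
 *         {
 *             page->nr_validated_ptes = i;    // resume point
 *             return -ERESTART;
 *         }
 *         ... validate entry i ...
 *     }
 */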
1484 
1485 static int alloc_l3_table(struct page_info *page)
1486 {
1487     struct domain *d = page_get_owner(page);
1488     unsigned long  pfn = mfn_x(page_to_mfn(page));
1489     l3_pgentry_t  *pl3e;
1490     unsigned int   i;
1491     int            rc = 0, partial = page->partial_pte;
1492 
1493     pl3e = map_domain_page(_mfn(pfn));
1494 
1495     /*
1496      * PAE guests allocate full pages, but aren't required to initialize
1497      * more than the first four entries; when running in compatibility
1498      * mode, however, the full page is visible to the MMU, and hence all
1499      * 512 entries must be valid/verified, which is most easily achieved
1500      * by clearing them out.
1501      */
1502     if ( is_pv_32bit_domain(d) )
1503         memset(pl3e + 4, 0, (L3_PAGETABLE_ENTRIES - 4) * sizeof(*pl3e));
1504 
1505     for ( i = page->nr_validated_ptes; i < L3_PAGETABLE_ENTRIES;
1506           i++, partial = 0 )
1507     {
1508         if ( is_pv_32bit_domain(d) && (i == 3) )
1509         {
1510             if ( !(l3e_get_flags(pl3e[i]) & _PAGE_PRESENT) ||
1511                  (l3e_get_flags(pl3e[i]) & l3_disallow_mask(d)) )
1512                 rc = -EINVAL;
1513             else
1514                 rc = get_page_and_type_from_mfn(
1515                     l3e_get_mfn(pl3e[i]),
1516                     PGT_l2_page_table | PGT_pae_xen_l2, d, partial, 1);
1517         }
1518         else if ( (rc = get_page_from_l3e(pl3e[i], pfn, d, partial)) > 0 )
1519             continue;
1520 
1521         if ( rc == -ERESTART )
1522         {
1523             page->nr_validated_ptes = i;
1524             page->partial_pte = partial ?: 1;
1525         }
1526         else if ( rc == -EINTR && i )
1527         {
1528             page->nr_validated_ptes = i;
1529             page->partial_pte = 0;
1530             rc = -ERESTART;
1531         }
1532         if ( rc < 0 )
1533             break;
1534 
1535         pl3e[i] = adjust_guest_l3e(pl3e[i], d);
1536     }
1537 
1538     if ( rc >= 0 && !create_pae_xen_mappings(d, pl3e) )
1539         rc = -EINVAL;
1540     if ( rc < 0 && rc != -ERESTART && rc != -EINTR )
1541     {
1542         gdprintk(XENLOG_WARNING, "Failure in alloc_l3_table: slot %#x\n", i);
1543         if ( i )
1544         {
1545             page->nr_validated_ptes = i;
1546             page->partial_pte = 0;
1547             current->arch.old_guest_ptpg = NULL;
1548             current->arch.old_guest_table = page;
1549         }
1550         while ( i-- > 0 )
1551             pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
1552     }
1553 
1554     unmap_domain_page(pl3e);
1555     return rc > 0 ? 0 : rc;
1556 }
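/*
 * A cautious reading of page->partial_pte as used above and in
 * free_l3_table()/free_l4_table() below: 0 means no entry is in a
 * half-done state; a positive value marks the entry at nr_validated_ptes
 * as interrupted while its reference was being acquired; a negative value
 * marks it as interrupted while the reference was being dropped.  The
 * "partial ?: 1" and "partial ?: -1" forms keep an already-recorded state
 * intact across a further preemption.
 */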
1557 
1558 void init_xen_pae_l2_slots(l2_pgentry_t *l2t, const struct domain *d)
1559 {
1560     memcpy(&l2t[COMPAT_L2_PAGETABLE_FIRST_XEN_SLOT(d)],
1561            &compat_idle_pg_table_l2[
1562                l2_table_offset(HIRO_COMPAT_MPT_VIRT_START)],
1563            COMPAT_L2_PAGETABLE_XEN_SLOTS(d) * sizeof(*l2t));
1564 }
1565 
1566 /*
1567  * Fill an L4 with Xen entries.
1568  *
1569  * This function must write all ROOT_PAGETABLE_PV_XEN_SLOTS, to clobber any
1570  * values a guest may have left there from alloc_l4_table().
1571  *
1572  * l4t and l4mfn are mandatory, but l4mfn doesn't need to be the mfn under
1573  * *l4t.  All other parameters are optional and will either fill or zero the
1574  * appropriate slots.  Pagetables not shared with guests will gain the
1575  * extended directmap.
1576  */
1577 void init_xen_l4_slots(l4_pgentry_t *l4t, mfn_t l4mfn,
1578                        const struct domain *d, mfn_t sl4mfn, bool ro_mpt)
1579 {
1580     /*
1581      * PV vcpus need a shortened directmap.  HVM and Idle vcpus get the full
1582      * directmap.
1583      */
1584     bool short_directmap = d && !paging_mode_external(d);
1585 
1586     /* Slot 256: RO M2P (if applicable). */
1587     l4t[l4_table_offset(RO_MPT_VIRT_START)] =
1588         ro_mpt ? idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)]
1589                : l4e_empty();
1590 
1591     /* Slot 257: PCI MMCFG. */
1592     l4t[l4_table_offset(PCI_MCFG_VIRT_START)] =
1593         idle_pg_table[l4_table_offset(PCI_MCFG_VIRT_START)];
1594 
1595     /* Slot 258: Self linear mappings. */
1596     ASSERT(!mfn_eq(l4mfn, INVALID_MFN));
1597     l4t[l4_table_offset(LINEAR_PT_VIRT_START)] =
1598         l4e_from_mfn(l4mfn, __PAGE_HYPERVISOR_RW);
1599 
1600     /* Slot 259: Shadow linear mappings (if applicable). */
1601     l4t[l4_table_offset(SH_LINEAR_PT_VIRT_START)] =
1602         mfn_eq(sl4mfn, INVALID_MFN) ? l4e_empty() :
1603         l4e_from_mfn(sl4mfn, __PAGE_HYPERVISOR_RW);
1604 
1605     /* Slot 260: Per-domain mappings (if applicable). */
1606     l4t[l4_table_offset(PERDOMAIN_VIRT_START)] =
1607         d ? l4e_from_page(d->arch.perdomain_l3_pg, __PAGE_HYPERVISOR_RW)
1608           : l4e_empty();
1609 
1610     /* Slot 261-: text/data/bss, RW M2P, vmap, frametable, directmap. */
1611 #ifndef NDEBUG
1612     if ( short_directmap &&
1613          unlikely(root_pgt_pv_xen_slots < ROOT_PAGETABLE_PV_XEN_SLOTS) )
1614     {
1615         /*
1616          * If using highmem-start=, artificially shorten the directmap to
1617          * simulate very large machines.
1618          */
1619         l4_pgentry_t *next;
1620 
1621         memcpy(&l4t[l4_table_offset(XEN_VIRT_START)],
1622                &idle_pg_table[l4_table_offset(XEN_VIRT_START)],
1623                (ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots -
1624                 l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t));
1625 
1626         next = &l4t[ROOT_PAGETABLE_FIRST_XEN_SLOT + root_pgt_pv_xen_slots];
1627 
1628         if ( l4e_get_intpte(split_l4e) )
1629             *next++ = split_l4e;
1630 
1631         memset(next, 0,
1632                _p(&l4t[ROOT_PAGETABLE_LAST_XEN_SLOT + 1]) - _p(next));
1633     }
1634     else
1635 #endif
1636     {
1637         unsigned int slots = (short_directmap
1638                               ? ROOT_PAGETABLE_PV_XEN_SLOTS
1639                               : ROOT_PAGETABLE_XEN_SLOTS);
1640 
1641         memcpy(&l4t[l4_table_offset(XEN_VIRT_START)],
1642                &idle_pg_table[l4_table_offset(XEN_VIRT_START)],
1643                (ROOT_PAGETABLE_FIRST_XEN_SLOT + slots -
1644                 l4_table_offset(XEN_VIRT_START)) * sizeof(*l4t));
1645     }
1646 }
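/*
 * For quick reference, the L4 slots written by init_xen_l4_slots() above:
 *
 *     256   RO M2P (or empty, depending on ro_mpt)
 *     257   PCI MMCFG
 *     258   self linear pagetable mapping
 *     259   shadow linear mapping (or empty)
 *     260   per-domain mappings (or empty)
 *     261+  Xen text/data/bss, RW M2P, vmap, frametable, directmap
 */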
1647 
1648 bool fill_ro_mpt(mfn_t mfn)
1649 {
1650     l4_pgentry_t *l4tab = map_domain_page(mfn);
1651     bool ret = false;
1652 
1653     if ( !l4e_get_intpte(l4tab[l4_table_offset(RO_MPT_VIRT_START)]) )
1654     {
1655         l4tab[l4_table_offset(RO_MPT_VIRT_START)] =
1656             idle_pg_table[l4_table_offset(RO_MPT_VIRT_START)];
1657         ret = true;
1658     }
1659     unmap_domain_page(l4tab);
1660 
1661     return ret;
1662 }
1663 
1664 void zap_ro_mpt(mfn_t mfn)
1665 {
1666     l4_pgentry_t *l4tab = map_domain_page(mfn);
1667 
1668     l4tab[l4_table_offset(RO_MPT_VIRT_START)] = l4e_empty();
1669     unmap_domain_page(l4tab);
1670 }
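/*
 * fill_ro_mpt() and zap_ro_mpt() only toggle the RO M2P slot of an L4.
 * A usage sketch matching new_guest_cr3() further down: install the RO
 * M2P eagerly unless the guest asked for strict M2P handling, e.g.
 *
 *     if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
 *         fill_ro_mpt(mfn);
 */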
1671 
1672 static int alloc_l4_table(struct page_info *page)
1673 {
1674     struct domain *d = page_get_owner(page);
1675     unsigned long  pfn = mfn_x(page_to_mfn(page));
1676     l4_pgentry_t  *pl4e = map_domain_page(_mfn(pfn));
1677     unsigned int   i;
1678     int            rc = 0, partial = page->partial_pte;
1679 
1680     for ( i = page->nr_validated_ptes; i < L4_PAGETABLE_ENTRIES;
1681           i++, partial = 0 )
1682     {
1683         if ( !is_guest_l4_slot(d, i) ||
1684              (rc = get_page_from_l4e(pl4e[i], pfn, d, partial)) > 0 )
1685             continue;
1686 
1687         if ( rc == -ERESTART )
1688         {
1689             page->nr_validated_ptes = i;
1690             page->partial_pte = partial ?: 1;
1691         }
1692         else if ( rc < 0 )
1693         {
1694             if ( rc != -EINTR )
1695                 gdprintk(XENLOG_WARNING,
1696                          "Failure in alloc_l4_table: slot %#x\n", i);
1697             if ( i )
1698             {
1699                 page->nr_validated_ptes = i;
1700                 page->partial_pte = 0;
1701                 if ( rc == -EINTR )
1702                     rc = -ERESTART;
1703                 else
1704                 {
1705                     if ( current->arch.old_guest_table )
1706                         page->nr_validated_ptes++;
1707                     current->arch.old_guest_ptpg = NULL;
1708                     current->arch.old_guest_table = page;
1709                 }
1710             }
1711         }
1712         if ( rc < 0 )
1713         {
1714             unmap_domain_page(pl4e);
1715             return rc;
1716         }
1717 
1718         pl4e[i] = adjust_guest_l4e(pl4e[i], d);
1719     }
1720 
1721     if ( rc >= 0 )
1722     {
1723         init_xen_l4_slots(pl4e, _mfn(pfn),
1724                           d, INVALID_MFN, VM_ASSIST(d, m2p_strict));
1725         atomic_inc(&d->arch.pv_domain.nr_l4_pages);
1726         rc = 0;
1727     }
1728     unmap_domain_page(pl4e);
1729 
1730     return rc;
1731 }
1732 
1733 static void free_l1_table(struct page_info *page)
1734 {
1735     struct domain *d = page_get_owner(page);
1736     l1_pgentry_t *pl1e;
1737     unsigned int  i;
1738 
1739     pl1e = __map_domain_page(page);
1740 
1741     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
1742         put_page_from_l1e(pl1e[i], d);
1743 
1744     unmap_domain_page(pl1e);
1745 }
1746 
1747 
1748 static int free_l2_table(struct page_info *page, int preemptible)
1749 {
1750     struct domain *d = page_get_owner(page);
1751     unsigned long pfn = mfn_x(page_to_mfn(page));
1752     l2_pgentry_t *pl2e;
1753     unsigned int  i = page->nr_validated_ptes - 1;
1754     int err = 0;
1755 
1756     pl2e = map_domain_page(_mfn(pfn));
1757 
1758     ASSERT(page->nr_validated_ptes);
1759     do {
1760         if ( is_guest_l2_slot(d, page->u.inuse.type_info, i) &&
1761              put_page_from_l2e(pl2e[i], pfn) == 0 &&
1762              preemptible && i && hypercall_preempt_check() )
1763         {
1764             page->nr_validated_ptes = i;
1765             err = -ERESTART;
1766         }
1767     } while ( !err && i-- );
1768 
1769     unmap_domain_page(pl2e);
1770 
1771     if ( !err )
1772         page->u.inuse.type_info &= ~PGT_pae_xen_l2;
1773 
1774     return err;
1775 }
1776 
1777 static int free_l3_table(struct page_info *page)
1778 {
1779     struct domain *d = page_get_owner(page);
1780     unsigned long pfn = mfn_x(page_to_mfn(page));
1781     l3_pgentry_t *pl3e;
1782     int rc = 0, partial = page->partial_pte;
1783     unsigned int  i = page->nr_validated_ptes - !partial;
1784 
1785     pl3e = map_domain_page(_mfn(pfn));
1786 
1787     do {
1788         rc = put_page_from_l3e(pl3e[i], pfn, partial, 0);
1789         if ( rc < 0 )
1790             break;
1791         partial = 0;
1792         if ( rc > 0 )
1793             continue;
1794         pl3e[i] = unadjust_guest_l3e(pl3e[i], d);
1795     } while ( i-- );
1796 
1797     unmap_domain_page(pl3e);
1798 
1799     if ( rc == -ERESTART )
1800     {
1801         page->nr_validated_ptes = i;
1802         page->partial_pte = partial ?: -1;
1803     }
1804     else if ( rc == -EINTR && i < L3_PAGETABLE_ENTRIES - 1 )
1805     {
1806         page->nr_validated_ptes = i + 1;
1807         page->partial_pte = 0;
1808         rc = -ERESTART;
1809     }
1810     return rc > 0 ? 0 : rc;
1811 }
1812 
1813 static int free_l4_table(struct page_info *page)
1814 {
1815     struct domain *d = page_get_owner(page);
1816     unsigned long pfn = mfn_x(page_to_mfn(page));
1817     l4_pgentry_t *pl4e = map_domain_page(_mfn(pfn));
1818     int rc = 0, partial = page->partial_pte;
1819     unsigned int  i = page->nr_validated_ptes - !partial;
1820 
1821     do {
1822         if ( is_guest_l4_slot(d, i) )
1823             rc = put_page_from_l4e(pl4e[i], pfn, partial, 0);
1824         if ( rc < 0 )
1825             break;
1826         partial = 0;
1827     } while ( i-- );
1828 
1829     if ( rc == -ERESTART )
1830     {
1831         page->nr_validated_ptes = i;
1832         page->partial_pte = partial ?: -1;
1833     }
1834     else if ( rc == -EINTR && i < L4_PAGETABLE_ENTRIES - 1 )
1835     {
1836         page->nr_validated_ptes = i + 1;
1837         page->partial_pte = 0;
1838         rc = -ERESTART;
1839     }
1840 
1841     unmap_domain_page(pl4e);
1842 
1843     if ( rc >= 0 )
1844     {
1845         atomic_dec(&d->arch.pv_domain.nr_l4_pages);
1846         rc = 0;
1847     }
1848 
1849     return rc;
1850 }
1851 
1852 int page_lock(struct page_info *page)
1853 {
1854     unsigned long x, nx;
1855 
1856     do {
1857         while ( (x = page->u.inuse.type_info) & PGT_locked )
1858             cpu_relax();
1859         nx = x + (1 | PGT_locked);
1860         if ( !(x & PGT_validated) ||
1861              !(x & PGT_count_mask) ||
1862              !(nx & PGT_count_mask) )
1863             return 0;
1864     } while ( cmpxchg(&page->u.inuse.type_info, x, nx) != x );
1865 
1866     return 1;
1867 }
1868 
1869 void page_unlock(struct page_info *page)
1870 {
1871     unsigned long x, nx, y = page->u.inuse.type_info;
1872 
1873     do {
1874         x = y;
1875         ASSERT((x & PGT_count_mask) && (x & PGT_locked));
1876 
1877         nx = x - (1 | PGT_locked);
1878         /* We must not drop the last reference here. */
1879         ASSERT(nx & PGT_count_mask);
1880     } while ( (y = cmpxchg(&page->u.inuse.type_info, x, nx)) != x );
1881 }
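/*
 * A minimal usage sketch for the pair above, assuming the caller already
 * holds a general reference on the page:
 *
 *     if ( page_lock(page) )
 *     {
 *         ... update a PTE within the page ...
 *         page_unlock(page);
 *     }
 *     else
 *         ... the page is not a validated, type-referenced pagetable ...
 */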
1882 
1883 /*
1884  * PTE flags that a guest may change without re-validating the PTE.
1885  * All other bits affect translation, caching, or Xen's safety.
1886  */
1887 #define FASTPATH_FLAG_WHITELIST                                     \
1888     (_PAGE_NX_BIT | _PAGE_AVAIL_HIGH | _PAGE_AVAIL | _PAGE_GLOBAL | \
1889      _PAGE_DIRTY | _PAGE_ACCESSED | _PAGE_USER)
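/*
 * Example: an update that only toggles whitelisted bits (say
 * _PAGE_ACCESSED or _PAGE_DIRTY) on an otherwise identical entry takes
 * the fast path in the mod_lN_entry() functions below and is written
 * without re-running the get_page_from_lNe() checks; changing the
 * referenced frame or any non-whitelisted flag forces full revalidation.
 */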
1890 
1891 /* Update the L1 entry at pl1e to new value nl1e. */
1892 static int mod_l1_entry(l1_pgentry_t *pl1e, l1_pgentry_t nl1e,
1893                         unsigned long gl1mfn, int preserve_ad,
1894                         struct vcpu *pt_vcpu, struct domain *pg_dom)
1895 {
1896     l1_pgentry_t ol1e;
1897     struct domain *pt_dom = pt_vcpu->domain;
1898     int rc = 0;
1899 
1900     if ( unlikely(__copy_from_user(&ol1e, pl1e, sizeof(ol1e)) != 0) )
1901         return -EFAULT;
1902 
1903     ASSERT(!paging_mode_refcounts(pt_dom));
1904 
1905     if ( l1e_get_flags(nl1e) & _PAGE_PRESENT )
1906     {
1907         struct page_info *page = NULL;
1908 
1909         if ( unlikely(l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom)) )
1910         {
1911             gdprintk(XENLOG_WARNING, "Bad L1 flags %x\n",
1912                     l1e_get_flags(nl1e) & l1_disallow_mask(pt_dom));
1913             return -EINVAL;
1914         }
1915 
1916         /* Translate foreign guest address. */
1917         if ( paging_mode_translate(pg_dom) )
1918         {
1919             p2m_type_t p2mt;
1920             p2m_query_t q = l1e_get_flags(nl1e) & _PAGE_RW ?
1921                             P2M_ALLOC | P2M_UNSHARE : P2M_ALLOC;
1922 
1923             page = get_page_from_gfn(pg_dom, l1e_get_pfn(nl1e), &p2mt, q);
1924 
1925             if ( p2m_is_paged(p2mt) )
1926             {
1927                 if ( page )
1928                     put_page(page);
1929                 p2m_mem_paging_populate(pg_dom, l1e_get_pfn(nl1e));
1930                 return -ENOENT;
1931             }
1932 
1933             if ( p2mt == p2m_ram_paging_in && !page )
1934                 return -ENOENT;
1935 
1936             /* Did our attempt to unshare fail? */
1937             if ( (q & P2M_UNSHARE) && p2m_is_shared(p2mt) )
1938             {
1939                 /* We could not have obtained a page ref. */
1940                 ASSERT(!page);
1941                 /* And mem_sharing_notify has already been called. */
1942                 return -ENOMEM;
1943             }
1944 
1945             if ( !page )
1946                 return -EINVAL;
1947             nl1e = l1e_from_page(page, l1e_get_flags(nl1e));
1948         }
1949 
1950         /* Fast path for sufficiently-similar mappings. */
1951         if ( !l1e_has_changed(ol1e, nl1e, ~FASTPATH_FLAG_WHITELIST) )
1952         {
1953             nl1e = adjust_guest_l1e(nl1e, pt_dom);
1954             rc = UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1955                               preserve_ad);
1956             if ( page )
1957                 put_page(page);
1958             return rc ? 0 : -EBUSY;
1959         }
1960 
1961         switch ( rc = get_page_from_l1e(nl1e, pt_dom, pg_dom) )
1962         {
1963         default:
1964             if ( page )
1965                 put_page(page);
1966             return rc;
1967         case 0:
1968             break;
1969         case _PAGE_RW ... _PAGE_RW | PAGE_CACHE_ATTRS:
1970             ASSERT(!(rc & ~(_PAGE_RW | PAGE_CACHE_ATTRS)));
1971             l1e_flip_flags(nl1e, rc);
1972             rc = 0;
1973             break;
1974         }
1975         if ( page )
1976             put_page(page);
1977 
1978         nl1e = adjust_guest_l1e(nl1e, pt_dom);
1979         if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1980                                     preserve_ad)) )
1981         {
1982             ol1e = nl1e;
1983             rc = -EBUSY;
1984         }
1985     }
1986     else if ( unlikely(!UPDATE_ENTRY(l1, pl1e, ol1e, nl1e, gl1mfn, pt_vcpu,
1987                                      preserve_ad)) )
1988     {
1989         return -EBUSY;
1990     }
1991 
1992     put_page_from_l1e(ol1e, pt_dom);
1993     return rc;
1994 }
1995 
1996 
1997 /* Update the L2 entry at pl2e to new value nl2e. pl2e is within frame pfn. */
1998 static int mod_l2_entry(l2_pgentry_t *pl2e,
1999                         l2_pgentry_t nl2e,
2000                         unsigned long pfn,
2001                         int preserve_ad,
2002                         struct vcpu *vcpu)
2003 {
2004     l2_pgentry_t ol2e;
2005     struct domain *d = vcpu->domain;
2006     struct page_info *l2pg = mfn_to_page(_mfn(pfn));
2007     unsigned long type = l2pg->u.inuse.type_info;
2008     int rc = 0;
2009 
2010     if ( unlikely(!is_guest_l2_slot(d, type, pgentry_ptr_to_slot(pl2e))) )
2011     {
2012         gdprintk(XENLOG_WARNING, "L2 update in Xen-private area, slot %#lx\n",
2013                  pgentry_ptr_to_slot(pl2e));
2014         return -EPERM;
2015     }
2016 
2017     if ( unlikely(__copy_from_user(&ol2e, pl2e, sizeof(ol2e)) != 0) )
2018         return -EFAULT;
2019 
2020     if ( l2e_get_flags(nl2e) & _PAGE_PRESENT )
2021     {
2022         if ( unlikely(l2e_get_flags(nl2e) & L2_DISALLOW_MASK) )
2023         {
2024             gdprintk(XENLOG_WARNING, "Bad L2 flags %x\n",
2025                     l2e_get_flags(nl2e) & L2_DISALLOW_MASK);
2026             return -EINVAL;
2027         }
2028 
2029         /* Fast path for sufficiently-similar mappings. */
2030         if ( !l2e_has_changed(ol2e, nl2e, ~FASTPATH_FLAG_WHITELIST) )
2031         {
2032             nl2e = adjust_guest_l2e(nl2e, d);
2033             if ( UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu, preserve_ad) )
2034                 return 0;
2035             return -EBUSY;
2036         }
2037 
2038         if ( unlikely((rc = get_page_from_l2e(nl2e, pfn, d)) < 0) )
2039             return rc;
2040 
2041         nl2e = adjust_guest_l2e(nl2e, d);
2042         if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
2043                                     preserve_ad)) )
2044         {
2045             ol2e = nl2e;
2046             rc = -EBUSY;
2047         }
2048     }
2049     else if ( unlikely(!UPDATE_ENTRY(l2, pl2e, ol2e, nl2e, pfn, vcpu,
2050                                      preserve_ad)) )
2051     {
2052         return -EBUSY;
2053     }
2054 
2055     put_page_from_l2e(ol2e, pfn);
2056     return rc;
2057 }
2058 
2059 /* Update the L3 entry at pl3e to new value nl3e. pl3e is within frame pfn. */
2060 static int mod_l3_entry(l3_pgentry_t *pl3e,
2061                         l3_pgentry_t nl3e,
2062                         unsigned long pfn,
2063                         int preserve_ad,
2064                         struct vcpu *vcpu)
2065 {
2066     l3_pgentry_t ol3e;
2067     struct domain *d = vcpu->domain;
2068     int rc = 0;
2069 
2070     /*
2071      * Disallow updates to final L3 slot. It contains Xen mappings, and it
2072      * would be a pain to ensure they remain continuously valid throughout.
2073      */
2074     if ( is_pv_32bit_domain(d) && (pgentry_ptr_to_slot(pl3e) >= 3) )
2075         return -EINVAL;
2076 
2077     if ( unlikely(__copy_from_user(&ol3e, pl3e, sizeof(ol3e)) != 0) )
2078         return -EFAULT;
2079 
2080     if ( l3e_get_flags(nl3e) & _PAGE_PRESENT )
2081     {
2082         if ( unlikely(l3e_get_flags(nl3e) & l3_disallow_mask(d)) )
2083         {
2084             gdprintk(XENLOG_WARNING, "Bad L3 flags %x\n",
2085                     l3e_get_flags(nl3e) & l3_disallow_mask(d));
2086             return -EINVAL;
2087         }
2088 
2089         /* Fast path for sufficiently-similar mappings. */
2090         if ( !l3e_has_changed(ol3e, nl3e, ~FASTPATH_FLAG_WHITELIST) )
2091         {
2092             nl3e = adjust_guest_l3e(nl3e, d);
2093             rc = UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu, preserve_ad);
2094             return rc ? 0 : -EFAULT;
2095         }
2096 
2097         rc = get_page_from_l3e(nl3e, pfn, d, 0);
2098         if ( unlikely(rc < 0) )
2099             return rc;
2100         rc = 0;
2101 
2102         nl3e = adjust_guest_l3e(nl3e, d);
2103         if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
2104                                     preserve_ad)) )
2105         {
2106             ol3e = nl3e;
2107             rc = -EFAULT;
2108         }
2109     }
2110     else if ( unlikely(!UPDATE_ENTRY(l3, pl3e, ol3e, nl3e, pfn, vcpu,
2111                                      preserve_ad)) )
2112     {
2113         return -EFAULT;
2114     }
2115 
2116     if ( likely(rc == 0) )
2117         if ( !create_pae_xen_mappings(d, pl3e) )
2118             BUG();
2119 
2120     put_page_from_l3e(ol3e, pfn, 0, 1);
2121     return rc;
2122 }
2123 
2124 /* Update the L4 entry at pl4e to new value nl4e. pl4e is within frame pfn. */
2125 static int mod_l4_entry(l4_pgentry_t *pl4e,
2126                         l4_pgentry_t nl4e,
2127                         unsigned long pfn,
2128                         int preserve_ad,
2129                         struct vcpu *vcpu)
2130 {
2131     struct domain *d = vcpu->domain;
2132     l4_pgentry_t ol4e;
2133     int rc = 0;
2134 
2135     if ( unlikely(!is_guest_l4_slot(d, pgentry_ptr_to_slot(pl4e))) )
2136     {
2137         gdprintk(XENLOG_WARNING, "L4 update in Xen-private area, slot %#lx\n",
2138                  pgentry_ptr_to_slot(pl4e));
2139         return -EINVAL;
2140     }
2141 
2142     if ( unlikely(__copy_from_user(&ol4e, pl4e, sizeof(ol4e)) != 0) )
2143         return -EFAULT;
2144 
2145     if ( l4e_get_flags(nl4e) & _PAGE_PRESENT )
2146     {
2147         if ( unlikely(l4e_get_flags(nl4e) & L4_DISALLOW_MASK) )
2148         {
2149             gdprintk(XENLOG_WARNING, "Bad L4 flags %x\n",
2150                     l4e_get_flags(nl4e) & L4_DISALLOW_MASK);
2151             return -EINVAL;
2152         }
2153 
2154         /* Fast path for sufficiently-similar mappings. */
2155         if ( !l4e_has_changed(ol4e, nl4e, ~FASTPATH_FLAG_WHITELIST) )
2156         {
2157             nl4e = adjust_guest_l4e(nl4e, d);
2158             rc = UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu, preserve_ad);
2159             return rc ? 0 : -EFAULT;
2160         }
2161 
2162         rc = get_page_from_l4e(nl4e, pfn, d, 0);
2163         if ( unlikely(rc < 0) )
2164             return rc;
2165         rc = 0;
2166 
2167         nl4e = adjust_guest_l4e(nl4e, d);
2168         if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2169                                     preserve_ad)) )
2170         {
2171             ol4e = nl4e;
2172             rc = -EFAULT;
2173         }
2174     }
2175     else if ( unlikely(!UPDATE_ENTRY(l4, pl4e, ol4e, nl4e, pfn, vcpu,
2176                                      preserve_ad)) )
2177     {
2178         return -EFAULT;
2179     }
2180 
2181     put_page_from_l4e(ol4e, pfn, 0, 1);
2182     return rc;
2183 }
2184 
2185 static int cleanup_page_cacheattr(struct page_info *page)
2186 {
2187     unsigned int cacheattr =
2188         (page->count_info & PGC_cacheattr_mask) >> PGC_cacheattr_base;
2189 
2190     if ( likely(cacheattr == 0) )
2191         return 0;
2192 
2193     page->count_info &= ~PGC_cacheattr_mask;
2194 
2195     BUG_ON(is_xen_heap_page(page));
2196 
2197     return update_xen_mappings(mfn_x(page_to_mfn(page)), 0);
2198 }
2199 
2200 void put_page(struct page_info *page)
2201 {
2202     unsigned long nx, x, y = page->count_info;
2203 
2204     do {
2205         ASSERT((y & PGC_count_mask) != 0);
2206         x  = y;
2207         nx = x - 1;
2208     }
2209     while ( unlikely((y = cmpxchg(&page->count_info, x, nx)) != x) );
2210 
2211     if ( unlikely((nx & PGC_count_mask) == 0) )
2212     {
2213         if ( cleanup_page_cacheattr(page) == 0 )
2214             free_domheap_page(page);
2215         else
2216             gdprintk(XENLOG_WARNING,
2217                      "Leaking mfn %" PRI_mfn "\n", mfn_x(page_to_mfn(page)));
2218     }
2219 }
2220 
2221 
2222 struct domain *page_get_owner_and_reference(struct page_info *page)
2223 {
2224     unsigned long x, y = page->count_info;
2225     struct domain *owner;
2226 
2227     do {
2228         x = y;
2229         /*
2230          * Count ==  0: Page is not allocated, so we cannot take a reference.
2231          * Count == -1: Reference count would wrap, which is invalid.
2232          * Count == -2: Remaining unused ref is reserved for get_page_light().
2233          */
2234         if ( unlikely(((x + 2) & PGC_count_mask) <= 2) )
2235             return NULL;
2236     }
2237     while ( (y = cmpxchg(&page->count_info, x, x + 1)) != x );
2238 
2239     owner = page_get_owner(page);
2240     ASSERT(owner);
2241 
2242     return owner;
2243 }
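/*
 * Worked example of the check above, writing cnt for x & PGC_count_mask:
 * cnt == 0 gives (cnt + 2) & PGC_count_mask == 2, cnt == -1 (all count
 * bits set) wraps to 1, and cnt == -2 wraps to 0 -- all three are <= 2
 * and hence refuse the reference, matching the three cases listed in the
 * comment.
 */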
2244 
2245 
2246 int get_page(struct page_info *page, struct domain *domain)
2247 {
2248     struct domain *owner = page_get_owner_and_reference(page);
2249 
2250     if ( likely(owner == domain) )
2251         return 1;
2252 
2253     if ( !paging_mode_refcounts(domain) && !domain->is_dying )
2254         gprintk(XENLOG_INFO,
2255                 "Error mfn %"PRI_mfn": rd=%d od=%d caf=%08lx taf=%" PRtype_info "\n",
2256                 mfn_x(page_to_mfn(page)), domain->domain_id,
2257                 owner ? owner->domain_id : DOMID_INVALID,
2258                 page->count_info - !!owner, page->u.inuse.type_info);
2259 
2260     if ( owner )
2261         put_page(page);
2262 
2263     return 0;
2264 }
2265 
2266 /*
2267  * Special version of get_page() to be used exclusively when
2268  * - a page is known to already have a non-zero reference count
2269  * - the page does not need its owner to be checked
2270  * - it will not be called more than once without dropping the thus
2271  *   acquired reference again.
2272  * Due to get_page() reserving one reference, this call cannot fail.
2273  */
2274 static void get_page_light(struct page_info *page)
2275 {
2276     unsigned long x, nx, y = page->count_info;
2277 
2278     do {
2279         x  = y;
2280         nx = x + 1;
2281         BUG_ON(!(x & PGC_count_mask)); /* Not allocated? */
2282         BUG_ON(!(nx & PGC_count_mask)); /* Overflow? */
2283         y = cmpxchg(&page->count_info, x, nx);
2284     }
2285     while ( unlikely(y != x) );
2286 }
2287 
2288 static int alloc_page_type(struct page_info *page, unsigned long type,
2289                            int preemptible)
2290 {
2291     struct domain *owner = page_get_owner(page);
2292     int rc;
2293 
2294     /* A page table is dirtied when its type count becomes non-zero. */
2295     if ( likely(owner != NULL) )
2296         paging_mark_dirty(owner, page_to_mfn(page));
2297 
2298     switch ( type & PGT_type_mask )
2299     {
2300     case PGT_l1_page_table:
2301         rc = alloc_l1_table(page);
2302         break;
2303     case PGT_l2_page_table:
2304         rc = alloc_l2_table(page, type, preemptible);
2305         break;
2306     case PGT_l3_page_table:
2307         ASSERT(preemptible);
2308         rc = alloc_l3_table(page);
2309         break;
2310     case PGT_l4_page_table:
2311         ASSERT(preemptible);
2312         rc = alloc_l4_table(page);
2313         break;
2314     case PGT_seg_desc_page:
2315         rc = alloc_segdesc_page(page);
2316         break;
2317     default:
2318         printk("Bad type in alloc_page_type %lx t=%" PRtype_info " c=%lx\n",
2319                type, page->u.inuse.type_info,
2320                page->count_info);
2321         rc = -EINVAL;
2322         BUG();
2323     }
2324 
2325     /* No need for atomic update of type_info here: no one else updates it. */
2326     smp_wmb();
2327     switch ( rc )
2328     {
2329     case 0:
2330         page->u.inuse.type_info |= PGT_validated;
2331         break;
2332     case -EINTR:
2333         ASSERT((page->u.inuse.type_info &
2334                 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2335         page->u.inuse.type_info &= ~PGT_count_mask;
2336         break;
2337     default:
2338         ASSERT(rc < 0);
2339         gdprintk(XENLOG_WARNING, "Error while validating mfn %" PRI_mfn
2340                  " (pfn %" PRI_pfn ") for type %" PRtype_info
2341                  ": caf=%08lx taf=%" PRtype_info "\n",
2342                  mfn_x(page_to_mfn(page)),
2343                  get_gpfn_from_mfn(mfn_x(page_to_mfn(page))),
2344                  type, page->count_info, page->u.inuse.type_info);
2345         if ( page != current->arch.old_guest_table )
2346             page->u.inuse.type_info = 0;
2347         else
2348         {
2349             ASSERT((page->u.inuse.type_info &
2350                     (PGT_count_mask | PGT_validated)) == 1);
2351     case -ERESTART:
2352             get_page_light(page);
2353             page->u.inuse.type_info |= PGT_partial;
2354         }
2355         break;
2356     }
2357 
2358     return rc;
2359 }
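/*
 * Note on the switch above: the "case -ERESTART:" label deliberately sits
 * inside the else branch of the default case, so -ERESTART jumps straight
 * to taking the light reference and setting PGT_partial, while other
 * errors only reach that code when the page is the current vCPU's
 * old_guest_table.
 */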
2360 
2361 
2362 int free_page_type(struct page_info *page, unsigned long type,
2363                    int preemptible)
2364 {
2365     struct domain *owner = page_get_owner(page);
2366     unsigned long gmfn;
2367     int rc;
2368 
2369     if ( likely(owner != NULL) && unlikely(paging_mode_enabled(owner)) )
2370     {
2371         /* A page table is dirtied when its type count becomes zero. */
2372         paging_mark_dirty(owner, page_to_mfn(page));
2373 
2374         ASSERT(!shadow_mode_refcounts(owner));
2375 
2376         gmfn = mfn_to_gmfn(owner, mfn_x(page_to_mfn(page)));
2377         ASSERT(VALID_M2P(gmfn));
2378         /* Page sharing not supported for shadowed domains */
2379         if ( !SHARED_M2P(gmfn) )
2380             shadow_remove_all_shadows(owner, _mfn(gmfn));
2381     }
2382 
2383     if ( !(type & PGT_partial) )
2384     {
2385         page->nr_validated_ptes = 1U << PAGETABLE_ORDER;
2386         page->partial_pte = 0;
2387     }
2388 
2389     switch ( type & PGT_type_mask )
2390     {
2391     case PGT_l1_page_table:
2392         free_l1_table(page);
2393         rc = 0;
2394         break;
2395     case PGT_l2_page_table:
2396         rc = free_l2_table(page, preemptible);
2397         break;
2398     case PGT_l3_page_table:
2399         ASSERT(preemptible);
2400         rc = free_l3_table(page);
2401         break;
2402     case PGT_l4_page_table:
2403         ASSERT(preemptible);
2404         rc = free_l4_table(page);
2405         break;
2406     default:
2407         gdprintk(XENLOG_WARNING, "type %" PRtype_info " mfn %" PRI_mfn "\n",
2408                  type, mfn_x(page_to_mfn(page)));
2409         rc = -EINVAL;
2410         BUG();
2411     }
2412 
2413     return rc;
2414 }
2415 
2416 
2417 static int _put_final_page_type(struct page_info *page, unsigned long type,
2418                                 bool preemptible, struct page_info *ptpg)
2419 {
2420     int rc = free_page_type(page, type, preemptible);
2421 
2422     /* No need for atomic update of type_info here: no one else updates it. */
2423     if ( rc == 0 )
2424     {
2425         if ( ptpg && PGT_type_equal(type, ptpg->u.inuse.type_info) )
2426         {
2427             dec_linear_uses(page);
2428             dec_linear_entries(ptpg);
2429         }
2430         ASSERT(!page->linear_pt_count || page_get_owner(page)->is_dying);
2431         set_tlbflush_timestamp(page);
2432         smp_wmb();
2433         page->u.inuse.type_info--;
2434     }
2435     else if ( rc == -EINTR )
2436     {
2437         ASSERT((page->u.inuse.type_info &
2438                 (PGT_count_mask|PGT_validated|PGT_partial)) == 1);
2439         smp_wmb();
2440         page->u.inuse.type_info |= PGT_validated;
2441     }
2442     else
2443     {
2444         BUG_ON(rc != -ERESTART);
2445         smp_wmb();
2446         get_page_light(page);
2447         page->u.inuse.type_info |= PGT_partial;
2448     }
2449 
2450     return rc;
2451 }
2452 
2453 
2454 static int _put_page_type(struct page_info *page, bool preemptible,
2455                           struct page_info *ptpg)
2456 {
2457     unsigned long nx, x, y = page->u.inuse.type_info;
2458     int rc = 0;
2459 
2460     for ( ; ; )
2461     {
2462         x  = y;
2463         nx = x - 1;
2464 
2465         ASSERT((x & PGT_count_mask) != 0);
2466 
2467         if ( unlikely((nx & PGT_count_mask) == 0) )
2468         {
2469             if ( unlikely((nx & PGT_type_mask) <= PGT_l4_page_table) &&
2470                  likely(nx & (PGT_validated|PGT_partial)) )
2471             {
2472                 /*
2473                  * Page-table pages must be unvalidated when count is zero. The
2474                  * 'free' is safe because the refcnt is non-zero and validated
2475                  * bit is clear => other ops will spin or fail.
2476                  */
2477                 nx = x & ~(PGT_validated|PGT_partial);
2478                 if ( unlikely((y = cmpxchg(&page->u.inuse.type_info,
2479                                            x, nx)) != x) )
2480                     continue;
2481                 /* We cleared the 'valid bit' so we do the clean up. */
2482                 rc = _put_final_page_type(page, x, preemptible, ptpg);
2483                 ptpg = NULL;
2484                 if ( x & PGT_partial )
2485                     put_page(page);
2486                 break;
2487             }
2488 
2489             if ( !ptpg || !PGT_type_equal(x, ptpg->u.inuse.type_info) )
2490             {
2491                 /*
2492                  * set_tlbflush_timestamp() accesses the same union
2493                  * linear_pt_count lives in. Pages (including page table ones),
2494                  * however, don't need their flush time stamp set except when
2495                  * the last reference is being dropped. For page table pages
2496                  * this happens in _put_final_page_type().
2497                  */
2498                 set_tlbflush_timestamp(page);
2499             }
2500             else
2501                 BUG_ON(!IS_ENABLED(CONFIG_PV_LINEAR_PT));
2502         }
2503         else if ( unlikely((nx & (PGT_locked | PGT_count_mask)) ==
2504                            (PGT_locked | 1)) )
2505         {
2506             /*
2507              * We must not drop the second to last reference when the page is
2508              * locked, as page_unlock() doesn't do any cleanup of the type.
2509              */
2510             cpu_relax();
2511             y = page->u.inuse.type_info;
2512             continue;
2513         }
2514 
2515         if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2516             break;
2517 
2518         if ( preemptible && hypercall_preempt_check() )
2519             return -EINTR;
2520     }
2521 
2522     if ( ptpg && PGT_type_equal(x, ptpg->u.inuse.type_info) )
2523     {
2524         ASSERT(!rc);
2525         dec_linear_uses(page);
2526         dec_linear_entries(ptpg);
2527     }
2528 
2529     return rc;
2530 }
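/*
 * When a non-NULL ptpg of matching type is passed in, the reference being
 * dropped was held on behalf of a linear (recursive) pagetable entry in
 * ptpg, so the linear-mapping accounting is unwound as well via
 * dec_linear_uses()/dec_linear_entries() -- either here or, for the final
 * type reference, in _put_final_page_type().
 */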
2531 
2532 
2533 static int __get_page_type(struct page_info *page, unsigned long type,
2534                            int preemptible)
2535 {
2536     unsigned long nx, x, y = page->u.inuse.type_info;
2537     int rc = 0, iommu_ret = 0;
2538 
2539     ASSERT(!(type & ~(PGT_type_mask | PGT_pae_xen_l2)));
2540     ASSERT(!in_irq());
2541 
2542     for ( ; ; )
2543     {
2544         x  = y;
2545         nx = x + 1;
2546         if ( unlikely((nx & PGT_count_mask) == 0) )
2547         {
2548             gdprintk(XENLOG_WARNING,
2549                      "Type count overflow on mfn %"PRI_mfn"\n",
2550                      mfn_x(page_to_mfn(page)));
2551             return -EINVAL;
2552         }
2553         else if ( unlikely((x & PGT_count_mask) == 0) )
2554         {
2555             struct domain *d = page_get_owner(page);
2556 
2557             /*
2558              * Normally we should never let a page go from type count 0
2559              * to type count 1 when it is shadowed. One exception:
2560              * out-of-sync shadowed pages are allowed to become
2561              * writeable.
2562              */
2563             if ( d && shadow_mode_enabled(d)
2564                  && (page->count_info & PGC_page_table)
2565                  && !((page->shadow_flags & (1u<<29))
2566                       && type == PGT_writable_page) )
2567                shadow_remove_all_shadows(d, page_to_mfn(page));
2568 
2569             ASSERT(!(x & PGT_pae_xen_l2));
2570             if ( (x & PGT_type_mask) != type )
2571             {
2572                 /*
2573                  * On a type change we check whether stale TLB entries need
2574                  * flushing. This may be unnecessary (e.g., the page was a
2575                  * GDT/LDT) but such circumstances should be very rare.
2576                  */
2577                 cpumask_t *mask = this_cpu(scratch_cpumask);
2578 
2579                 BUG_ON(in_irq());
2580                 cpumask_copy(mask, d->domain_dirty_cpumask);
2581 
2582                 /* Don't flush if the timestamp is old enough */
2583                 tlbflush_filter(mask, page->tlbflush_timestamp);
2584 
2585                 if ( unlikely(!cpumask_empty(mask)) &&
2586                      /* Shadow mode: track only writable pages. */
2587                      (!shadow_mode_enabled(page_get_owner(page)) ||
2588                       ((nx & PGT_type_mask) == PGT_writable_page)) )
2589                 {
2590                     perfc_incr(need_flush_tlb_flush);
2591                     flush_tlb_mask(mask);
2592                 }
2593 
2594                 /* We lose existing type and validity. */
2595                 nx &= ~(PGT_type_mask | PGT_validated);
2596                 nx |= type;
2597 
2598                 /*
2599                  * No special validation needed for writable pages.
2600                  * Page tables and GDT/LDT need to be scanned for validity.
2601                  */
2602                 if ( type == PGT_writable_page || type == PGT_shared_page )
2603                     nx |= PGT_validated;
2604             }
2605         }
2606         else if ( unlikely((x & (PGT_type_mask|PGT_pae_xen_l2)) != type) )
2607         {
2608             /* Don't log failure if it could be a recursive-mapping attempt. */
2609             if ( ((x & PGT_type_mask) == PGT_l2_page_table) &&
2610                  (type == PGT_l1_page_table) )
2611                 return -EINVAL;
2612             if ( ((x & PGT_type_mask) == PGT_l3_page_table) &&
2613                  (type == PGT_l2_page_table) )
2614                 return -EINVAL;
2615             if ( ((x & PGT_type_mask) == PGT_l4_page_table) &&
2616                  (type == PGT_l3_page_table) )
2617                 return -EINVAL;
2618             gdprintk(XENLOG_WARNING,
2619                      "Bad type (saw %" PRtype_info " != exp %" PRtype_info ") "
2620                      "for mfn %" PRI_mfn " (pfn %" PRI_pfn ")\n",
2621                      x, type, mfn_x(page_to_mfn(page)),
2622                      get_gpfn_from_mfn(mfn_x(page_to_mfn(page))));
2623             return -EINVAL;
2624         }
2625         else if ( unlikely(!(x & PGT_validated)) )
2626         {
2627             if ( !(x & PGT_partial) )
2628             {
2629                 /* Someone else is updating validation of this page. Wait... */
2630                 while ( (y = page->u.inuse.type_info) == x )
2631                 {
2632                     if ( preemptible && hypercall_preempt_check() )
2633                         return -EINTR;
2634                     cpu_relax();
2635                 }
2636                 continue;
2637             }
2638             /* Type ref count was left at 1 when PGT_partial got set. */
2639             ASSERT((x & PGT_count_mask) == 1);
2640             nx = x & ~PGT_partial;
2641         }
2642 
2643         if ( likely((y = cmpxchg(&page->u.inuse.type_info, x, nx)) == x) )
2644             break;
2645 
2646         if ( preemptible && hypercall_preempt_check() )
2647             return -EINTR;
2648     }
2649 
2650     if ( unlikely((x & PGT_type_mask) != type) )
2651     {
2652         /* Special pages should not be accessible from devices. */
2653         struct domain *d = page_get_owner(page);
2654         if ( d && is_pv_domain(d) && unlikely(need_iommu(d)) )
2655         {
2656             gfn_t gfn = _gfn(mfn_to_gmfn(d, mfn_x(page_to_mfn(page))));
2657 
2658             if ( (x & PGT_type_mask) == PGT_writable_page )
2659                 iommu_ret = iommu_unmap_page(d, gfn_x(gfn));
2660             else if ( type == PGT_writable_page )
2661                 iommu_ret = iommu_map_page(d, gfn_x(gfn),
2662                                            mfn_x(page_to_mfn(page)),
2663                                            IOMMUF_readable|IOMMUF_writable);
2664         }
2665     }
2666 
2667     if ( unlikely(!(nx & PGT_validated)) )
2668     {
2669         if ( !(x & PGT_partial) )
2670         {
2671             page->nr_validated_ptes = 0;
2672             page->partial_pte = 0;
2673         }
2674         page->linear_pt_count = 0;
2675         rc = alloc_page_type(page, type, preemptible);
2676     }
2677 
2678     if ( (x & PGT_partial) && !(nx & PGT_partial) )
2679         put_page(page);
2680 
2681     if ( !rc )
2682         rc = iommu_ret;
2683 
2684     return rc;
2685 }
2686 
2687 void put_page_type(struct page_info *page)
2688 {
2689     int rc = _put_page_type(page, false, NULL);
2690     ASSERT(rc == 0);
2691     (void)rc;
2692 }
2693 
2694 int get_page_type(struct page_info *page, unsigned long type)
2695 {
2696     int rc = __get_page_type(page, type, 0);
2697     if ( likely(rc == 0) )
2698         return 1;
2699     ASSERT(rc != -EINTR && rc != -ERESTART);
2700     return 0;
2701 }
2702 
2703 int put_page_type_preemptible(struct page_info *page)
2704 {
2705     return _put_page_type(page, true, NULL);
2706 }
2707 
2708 int get_page_type_preemptible(struct page_info *page, unsigned long type)
2709 {
2710     ASSERT(!current->arch.old_guest_table);
2711     return __get_page_type(page, type, 1);
2712 }
2713 
2714 int put_old_guest_table(struct vcpu *v)
2715 {
2716     int rc;
2717 
2718     if ( !v->arch.old_guest_table )
2719         return 0;
2720 
2721     switch ( rc = _put_page_type(v->arch.old_guest_table, true,
2722                                  v->arch.old_guest_ptpg) )
2723     {
2724     case -EINTR:
2725     case -ERESTART:
2726         return -ERESTART;
2727     case 0:
2728         put_page(v->arch.old_guest_table);
2729     }
2730 
2731     v->arch.old_guest_table = NULL;
2732 
2733     return rc;
2734 }
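/*
 * Typical caller pattern (cf. do_mmuext_op() below): flush any pagetable
 * whose type release was previously preempted before starting new work,
 * and turn a further preemption into a hypercall continuation.  A
 * condensed sketch:
 *
 *     rc = put_old_guest_table(curr);
 *     if ( rc == -ERESTART )
 *         rc = hypercall_create_continuation(...);
 *     if ( rc )
 *         return rc;
 */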
2735 
2736 int vcpu_destroy_pagetables(struct vcpu *v)
2737 {
2738     unsigned long mfn = pagetable_get_pfn(v->arch.guest_table);
2739     struct page_info *page;
2740     l4_pgentry_t *l4tab = NULL;
2741     int rc = put_old_guest_table(v);
2742 
2743     if ( rc )
2744         return rc;
2745 
2746     if ( is_pv_32bit_vcpu(v) )
2747     {
2748         l4tab = map_domain_page(_mfn(mfn));
2749         mfn = l4e_get_pfn(*l4tab);
2750     }
2751 
2752     if ( mfn )
2753     {
2754         page = mfn_to_page(_mfn(mfn));
2755         if ( paging_mode_refcounts(v->domain) )
2756             put_page(page);
2757         else
2758             rc = put_page_and_type_preemptible(page);
2759     }
2760 
2761     if ( l4tab )
2762     {
2763         if ( !rc )
2764             l4e_write(l4tab, l4e_empty());
2765         unmap_domain_page(l4tab);
2766     }
2767     else if ( !rc )
2768     {
2769         v->arch.guest_table = pagetable_null();
2770 
2771         /* Drop ref to guest_table_user (from MMUEXT_NEW_USER_BASEPTR) */
2772         mfn = pagetable_get_pfn(v->arch.guest_table_user);
2773         if ( mfn )
2774         {
2775             page = mfn_to_page(_mfn(mfn));
2776             if ( paging_mode_refcounts(v->domain) )
2777                 put_page(page);
2778             else
2779                 rc = put_page_and_type_preemptible(page);
2780         }
2781         if ( !rc )
2782             v->arch.guest_table_user = pagetable_null();
2783     }
2784 
2785     v->arch.cr3 = 0;
2786 
2787     /*
2788      * put_page_and_type_preemptible() is liable to return -EINTR. Our
2789      * callers expect -ERESTART, so convert it before returning.
2790      */
2791     return rc != -EINTR ? rc : -ERESTART;
2792 }
2793 
2794 int new_guest_cr3(mfn_t mfn)
2795 {
2796     struct vcpu *curr = current;
2797     struct domain *d = curr->domain;
2798     int rc;
2799     mfn_t old_base_mfn;
2800 
2801     if ( is_pv_32bit_domain(d) )
2802     {
2803         mfn_t gt_mfn = pagetable_get_mfn(curr->arch.guest_table);
2804         l4_pgentry_t *pl4e = map_domain_page(gt_mfn);
2805 
2806         rc = mod_l4_entry(pl4e,
2807                           l4e_from_mfn(mfn,
2808                                        (_PAGE_PRESENT | _PAGE_RW |
2809                                         _PAGE_USER | _PAGE_ACCESSED)),
2810                           mfn_x(gt_mfn), 0, curr);
2811         unmap_domain_page(pl4e);
2812         switch ( rc )
2813         {
2814         case 0:
2815             break;
2816         case -EINTR:
2817         case -ERESTART:
2818             return -ERESTART;
2819         default:
2820             gdprintk(XENLOG_WARNING,
2821                      "Error while installing new compat baseptr %" PRI_mfn "\n",
2822                      mfn_x(mfn));
2823             return rc;
2824         }
2825 
2826         invalidate_shadow_ldt(curr, 0);
2827         write_ptbase(curr);
2828 
2829         return 0;
2830     }
2831 
2832     rc = put_old_guest_table(curr);
2833     if ( unlikely(rc) )
2834         return rc;
2835 
2836     old_base_mfn = pagetable_get_mfn(curr->arch.guest_table);
2837     /*
2838      * This is particularly important when getting restarted after the
2839      * previous attempt got preempted in the put-old-MFN phase.
2840      */
2841     if ( mfn_eq(old_base_mfn, mfn) )
2842     {
2843         write_ptbase(curr);
2844         return 0;
2845     }
2846 
2847     rc = paging_mode_refcounts(d)
2848          ? (get_page_from_mfn(mfn, d) ? 0 : -EINVAL)
2849          : get_page_and_type_from_mfn(mfn, PGT_root_page_table, d, 0, 1);
2850     switch ( rc )
2851     {
2852     case 0:
2853         break;
2854     case -EINTR:
2855     case -ERESTART:
2856         return -ERESTART;
2857     default:
2858         gdprintk(XENLOG_WARNING,
2859                  "Error while installing new baseptr %" PRI_mfn "\n",
2860                  mfn_x(mfn));
2861         return rc;
2862     }
2863 
2864     invalidate_shadow_ldt(curr, 0);
2865 
2866     if ( !VM_ASSIST(d, m2p_strict) && !paging_mode_refcounts(d) )
2867         fill_ro_mpt(mfn);
2868     curr->arch.guest_table = pagetable_from_mfn(mfn);
2869     update_cr3(curr);
2870 
2871     write_ptbase(curr);
2872 
2873     if ( likely(mfn_x(old_base_mfn) != 0) )
2874     {
2875         struct page_info *page = mfn_to_page(old_base_mfn);
2876 
2877         if ( paging_mode_refcounts(d) )
2878             put_page(page);
2879         else
2880             switch ( rc = put_page_and_type_preemptible(page) )
2881             {
2882             case -EINTR:
2883                 rc = -ERESTART;
2884                 /* fallthrough */
2885             case -ERESTART:
2886                 curr->arch.old_guest_ptpg = NULL;
2887                 curr->arch.old_guest_table = page;
2888                 break;
2889             default:
2890                 BUG_ON(rc);
2891                 break;
2892             }
2893     }
2894 
2895     return rc;
2896 }
2897 
2898 static struct domain *get_pg_owner(domid_t domid)
2899 {
2900     struct domain *pg_owner = NULL, *curr = current->domain;
2901 
2902     if ( likely(domid == DOMID_SELF) )
2903     {
2904         pg_owner = rcu_lock_current_domain();
2905         goto out;
2906     }
2907 
2908     if ( unlikely(domid == curr->domain_id) )
2909     {
2910         gdprintk(XENLOG_WARNING, "Cannot specify itself as foreign domain\n");
2911         goto out;
2912     }
2913 
2914     switch ( domid )
2915     {
2916     case DOMID_IO:
2917         pg_owner = rcu_lock_domain(dom_io);
2918         break;
2919     case DOMID_XEN:
2920         pg_owner = rcu_lock_domain(dom_xen);
2921         break;
2922     default:
2923         if ( (pg_owner = rcu_lock_domain_by_id(domid)) == NULL )
2924         {
2925             gdprintk(XENLOG_WARNING, "Unknown domain d%d\n", domid);
2926             break;
2927         }
2928         break;
2929     }
2930 
2931  out:
2932     return pg_owner;
2933 }
2934 
2935 static void put_pg_owner(struct domain *pg_owner)
2936 {
2937     rcu_unlock_domain(pg_owner);
2938 }
2939 
2940 static inline int vcpumask_to_pcpumask(
2941     struct domain *d, XEN_GUEST_HANDLE_PARAM(const_void) bmap, cpumask_t *pmask)
2942 {
2943     unsigned int vcpu_id, vcpu_bias, offs;
2944     unsigned long vmask;
2945     struct vcpu *v;
2946     bool is_native = !is_pv_32bit_domain(d);
2947 
2948     cpumask_clear(pmask);
2949     for ( vmask = 0, offs = 0; ; ++offs )
2950     {
2951         vcpu_bias = offs * (is_native ? BITS_PER_LONG : 32);
2952         if ( vcpu_bias >= d->max_vcpus )
2953             return 0;
2954 
2955         if ( unlikely(is_native ?
2956                       copy_from_guest_offset(&vmask, bmap, offs, 1) :
2957                       copy_from_guest_offset((unsigned int *)&vmask, bmap,
2958                                              offs, 1)) )
2959         {
2960             cpumask_clear(pmask);
2961             return -EFAULT;
2962         }
2963 
2964         while ( vmask )
2965         {
2966             vcpu_id = find_first_set_bit(vmask);
2967             vmask &= ~(1UL << vcpu_id);
2968             vcpu_id += vcpu_bias;
2969             if ( (vcpu_id >= d->max_vcpus) )
2970                 return 0;
2971             if ( ((v = d->vcpu[vcpu_id]) != NULL) )
2972                 cpumask_or(pmask, pmask, v->vcpu_dirty_cpumask);
2973         }
2974     }
2975 }
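/*
 * vcpumask_to_pcpumask() turns a guest-supplied bitmap of vCPUs into the
 * set of physical CPUs that may hold state for them, by OR-ing together
 * each selected vCPU's vcpu_dirty_cpumask.  For example, a bitmap with
 * bits 0 and 2 set selects d->vcpu[0] and d->vcpu[2]; compat (32-bit PV)
 * guests supply the bitmap in 32-bit chunks, 64-bit guests in longs.
 */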
2976 
2977 long do_mmuext_op(
2978     XEN_GUEST_HANDLE_PARAM(mmuext_op_t) uops,
2979     unsigned int count,
2980     XEN_GUEST_HANDLE_PARAM(uint) pdone,
2981     unsigned int foreigndom)
2982 {
2983     struct mmuext_op op;
2984     unsigned long type;
2985     unsigned int i, done = 0;
2986     struct vcpu *curr = current;
2987     struct domain *currd = curr->domain;
2988     struct domain *pg_owner;
2989     int rc = put_old_guest_table(curr);
2990 
2991     if ( unlikely(rc) )
2992     {
2993         if ( likely(rc == -ERESTART) )
2994             rc = hypercall_create_continuation(
2995                      __HYPERVISOR_mmuext_op, "hihi", uops, count, pdone,
2996                      foreigndom);
2997         return rc;
2998     }
2999 
3000     if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
3001          likely(guest_handle_is_null(uops)) )
3002     {
3003         /*
3004          * See the curr->arch.old_guest_table related
3005          * hypercall_create_continuation() below.
3006          */
3007         return (int)foreigndom;
3008     }
3009 
3010     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3011     {
3012         count &= ~MMU_UPDATE_PREEMPTED;
3013         if ( unlikely(!guest_handle_is_null(pdone)) )
3014             (void)copy_from_guest(&done, pdone, 1);
3015     }
3016     else
3017         perfc_incr(calls_to_mmuext_op);
3018 
3019     if ( unlikely(!guest_handle_okay(uops, count)) )
3020         return -EFAULT;
3021 
3022     if ( (pg_owner = get_pg_owner(foreigndom)) == NULL )
3023         return -ESRCH;
3024 
3025     if ( !is_pv_domain(pg_owner) )
3026     {
3027         put_pg_owner(pg_owner);
3028         return -EINVAL;
3029     }
3030 
3031     rc = xsm_mmuext_op(XSM_TARGET, currd, pg_owner);
3032     if ( rc )
3033     {
3034         put_pg_owner(pg_owner);
3035         return rc;
3036     }
3037 
3038     for ( i = 0; i < count; i++ )
3039     {
3040         if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
3041         {
3042             rc = -ERESTART;
3043             break;
3044         }
3045 
3046         if ( unlikely(__copy_from_guest(&op, uops, 1) != 0) )
3047         {
3048             rc = -EFAULT;
3049             break;
3050         }
3051 
3052         if ( is_hvm_domain(currd) )
3053         {
3054             switch ( op.cmd )
3055             {
3056             case MMUEXT_PIN_L1_TABLE:
3057             case MMUEXT_PIN_L2_TABLE:
3058             case MMUEXT_PIN_L3_TABLE:
3059             case MMUEXT_PIN_L4_TABLE:
3060             case MMUEXT_UNPIN_TABLE:
3061                 break;
3062             default:
3063                 rc = -EOPNOTSUPP;
3064                 goto done;
3065             }
3066         }
3067 
3068         rc = 0;
3069 
3070         switch ( op.cmd )
3071         {
3072             struct page_info *page;
3073             p2m_type_t p2mt;
3074 
3075         case MMUEXT_PIN_L1_TABLE:
3076             type = PGT_l1_page_table;
3077             goto pin_page;
3078 
3079         case MMUEXT_PIN_L2_TABLE:
3080             type = PGT_l2_page_table;
3081             goto pin_page;
3082 
3083         case MMUEXT_PIN_L3_TABLE:
3084             type = PGT_l3_page_table;
3085             goto pin_page;
3086 
3087         case MMUEXT_PIN_L4_TABLE:
3088             if ( is_pv_32bit_domain(pg_owner) )
3089                 break;
3090             type = PGT_l4_page_table;
3091 
3092         pin_page:
3093             /* Ignore pinning of invalid paging levels. */
3094             if ( (op.cmd - MMUEXT_PIN_L1_TABLE) > (CONFIG_PAGING_LEVELS - 1) )
3095                 break;
3096 
3097             if ( paging_mode_refcounts(pg_owner) )
3098                 break;
3099 
3100             page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
3101             if ( unlikely(!page) )
3102             {
3103                 rc = -EINVAL;
3104                 break;
3105             }
3106 
3107             rc = get_page_type_preemptible(page, type);
3108             if ( unlikely(rc) )
3109             {
3110                 if ( rc == -EINTR )
3111                     rc = -ERESTART;
3112                 else if ( rc != -ERESTART )
3113                     gdprintk(XENLOG_WARNING,
3114                              "Error %d while pinning mfn %" PRI_mfn "\n",
3115                              rc, mfn_x(page_to_mfn(page)));
3116                 if ( page != curr->arch.old_guest_table )
3117                     put_page(page);
3118                 break;
3119             }
3120 
3121             rc = xsm_memory_pin_page(XSM_HOOK, currd, pg_owner, page);
3122             if ( !rc && unlikely(test_and_set_bit(_PGT_pinned,
3123                                                   &page->u.inuse.type_info)) )
3124             {
3125                 gdprintk(XENLOG_WARNING,
3126                          "mfn %" PRI_mfn " already pinned\n",
3127                          mfn_x(page_to_mfn(page)));
3128                 rc = -EINVAL;
3129             }
3130 
3131             if ( unlikely(rc) )
3132                 goto pin_drop;
3133 
3134             /* A page is dirtied when its pin status is set. */
3135             paging_mark_dirty(pg_owner, page_to_mfn(page));
3136 
3137             /* We can race domain destruction (domain_relinquish_resources). */
3138             if ( unlikely(pg_owner != currd) )
3139             {
3140                 bool drop_ref;
3141 
3142                 spin_lock(&pg_owner->page_alloc_lock);
3143                 drop_ref = (pg_owner->is_dying &&
3144                             test_and_clear_bit(_PGT_pinned,
3145                                                &page->u.inuse.type_info));
3146                 spin_unlock(&pg_owner->page_alloc_lock);
3147                 if ( drop_ref )
3148                 {
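                    /*
                     * Also reached via 'goto pin_drop' from the error paths
                     * above: drop the reference taken for pinning, deferring
                     * non-L1 tables to old_guest_table so their teardown can
                     * be preempted.
                     */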
3149         pin_drop:
3150                     if ( type == PGT_l1_page_table )
3151                         put_page_and_type(page);
3152                     else
3153                     {
3154                         curr->arch.old_guest_ptpg = NULL;
3155                         curr->arch.old_guest_table = page;
3156                     }
3157                 }
3158             }
3159             break;
3160 
3161         case MMUEXT_UNPIN_TABLE:
3162             if ( paging_mode_refcounts(pg_owner) )
3163                 break;
3164 
3165             page = get_page_from_gfn(pg_owner, op.arg1.mfn, NULL, P2M_ALLOC);
3166             if ( unlikely(!page) )
3167             {
3168                 gdprintk(XENLOG_WARNING,
3169                          "mfn %" PRI_mfn " bad, or bad owner d%d\n",
3170                          op.arg1.mfn, pg_owner->domain_id);
3171                 rc = -EINVAL;
3172                 break;
3173             }
3174 
3175             if ( !test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
3176             {
3177                 put_page(page);
3178                 gdprintk(XENLOG_WARNING,
3179                          "mfn %" PRI_mfn " not pinned\n", op.arg1.mfn);
3180                 rc = -EINVAL;
3181                 break;
3182             }
3183 
3184             switch ( rc = put_page_and_type_preemptible(page) )
3185             {
3186             case -EINTR:
3187             case -ERESTART:
3188                 curr->arch.old_guest_ptpg = NULL;
3189                 curr->arch.old_guest_table = page;
3190                 rc = 0;
3191                 break;
3192             default:
3193                 BUG_ON(rc);
3194                 break;
3195             }
3196             put_page(page);
3197 
3198             /* A page is dirtied when its pin status is cleared. */
3199             paging_mark_dirty(pg_owner, page_to_mfn(page));
3200             break;
3201 
3202         case MMUEXT_NEW_BASEPTR:
3203             if ( unlikely(currd != pg_owner) )
3204                 rc = -EPERM;
3205             else if ( unlikely(paging_mode_translate(currd)) )
3206                 rc = -EINVAL;
3207             else
3208                 rc = new_guest_cr3(_mfn(op.arg1.mfn));
3209             break;
3210 
3211         case MMUEXT_NEW_USER_BASEPTR: {
3212             unsigned long old_mfn;
3213 
3214             if ( unlikely(currd != pg_owner) )
3215                 rc = -EPERM;
3216             else if ( unlikely(paging_mode_translate(currd)) )
3217                 rc = -EINVAL;
3218             if ( unlikely(rc) )
3219                 break;
3220 
3221             old_mfn = pagetable_get_pfn(curr->arch.guest_table_user);
3222             /*
3223              * This is particularly important when getting restarted after the
3224              * previous attempt got preempted in the put-old-MFN phase.
3225              */
3226             if ( old_mfn == op.arg1.mfn )
3227                 break;
3228 
3229             if ( op.arg1.mfn != 0 )
3230             {
3231                 rc = get_page_and_type_from_mfn(
3232                     _mfn(op.arg1.mfn), PGT_root_page_table, currd, 0, 1);
3233 
3234                 if ( unlikely(rc) )
3235                 {
3236                     if ( rc == -EINTR )
3237                         rc = -ERESTART;
3238                     else if ( rc != -ERESTART )
3239                         gdprintk(XENLOG_WARNING,
3240                                  "Error %d installing new mfn %" PRI_mfn "\n",
3241                                  rc, op.arg1.mfn);
3242                     break;
3243                 }
3244 
3245                 if ( VM_ASSIST(currd, m2p_strict) )
3246                     zap_ro_mpt(_mfn(op.arg1.mfn));
3247             }
3248 
3249             curr->arch.guest_table_user = pagetable_from_pfn(op.arg1.mfn);
3250 
3251             if ( old_mfn != 0 )
3252             {
3253                 page = mfn_to_page(_mfn(old_mfn));
3254 
3255                 switch ( rc = put_page_and_type_preemptible(page) )
3256                 {
3257                 case -EINTR:
3258                     rc = -ERESTART;
3259                     /* fallthrough */
3260                 case -ERESTART:
3261                     curr->arch.old_guest_ptpg = NULL;
3262                     curr->arch.old_guest_table = page;
3263                     break;
3264                 default:
3265                     BUG_ON(rc);
3266                     break;
3267                 }
3268             }
3269 
3270             break;
3271         }
3272 
3273         case MMUEXT_TLB_FLUSH_LOCAL:
3274             if ( likely(currd == pg_owner) )
3275                 flush_tlb_local();
3276             else
3277                 rc = -EPERM;
3278             break;
3279 
3280         case MMUEXT_INVLPG_LOCAL:
3281             if ( unlikely(currd != pg_owner) )
3282                 rc = -EPERM;
3283             else
3284                 paging_invlpg(curr, op.arg1.linear_addr);
3285             break;
3286 
3287         case MMUEXT_TLB_FLUSH_MULTI:
3288         case MMUEXT_INVLPG_MULTI:
3289         {
3290             cpumask_t *mask = this_cpu(scratch_cpumask);
3291 
3292             if ( unlikely(currd != pg_owner) )
3293                 rc = -EPERM;
3294             else if ( unlikely(vcpumask_to_pcpumask(currd,
3295                                    guest_handle_to_param(op.arg2.vcpumask,
3296                                                          const_void),
3297                                    mask)) )
3298                 rc = -EINVAL;
3299             if ( unlikely(rc) )
3300                 break;
3301 
3302             if ( op.cmd == MMUEXT_TLB_FLUSH_MULTI )
3303                 flush_tlb_mask(mask);
3304             else if ( __addr_ok(op.arg1.linear_addr) )
3305                 flush_tlb_one_mask(mask, op.arg1.linear_addr);
3306             break;
3307         }
3308 
3309         case MMUEXT_TLB_FLUSH_ALL:
3310             if ( likely(currd == pg_owner) )
3311                 flush_tlb_mask(currd->domain_dirty_cpumask);
3312             else
3313                 rc = -EPERM;
3314             break;
3315 
3316         case MMUEXT_INVLPG_ALL:
3317             if ( unlikely(currd != pg_owner) )
3318                 rc = -EPERM;
3319             else if ( __addr_ok(op.arg1.linear_addr) )
3320                 flush_tlb_one_mask(currd->domain_dirty_cpumask,
3321                                    op.arg1.linear_addr);
3322             break;
3323 
3324         case MMUEXT_FLUSH_CACHE:
3325             if ( unlikely(currd != pg_owner) )
3326                 rc = -EPERM;
3327             else if ( unlikely(!cache_flush_permitted(currd)) )
3328                 rc = -EACCES;
3329             else
3330                 wbinvd();
3331             break;
3332 
3333         case MMUEXT_FLUSH_CACHE_GLOBAL:
3334             if ( unlikely(currd != pg_owner) )
3335                 rc = -EPERM;
3336             else if ( likely(cache_flush_permitted(currd)) )
3337             {
3338                 unsigned int cpu;
3339                 cpumask_t *mask = this_cpu(scratch_cpumask);
3340 
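                /*
                 * Build a mask containing one CPU per sibling (hyperthread)
                 * group; a cache flush issued on one thread also covers the
                 * caches shared with its siblings.
                 */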
3341                 cpumask_clear(mask);
3342                 for_each_online_cpu(cpu)
3343                     if ( !cpumask_intersects(mask,
3344                                              per_cpu(cpu_sibling_mask, cpu)) )
3345                         __cpumask_set_cpu(cpu, mask);
3346                 flush_mask(mask, FLUSH_CACHE);
3347             }
3348             else
3349                 rc = -EINVAL;
3350             break;
3351 
3352         case MMUEXT_SET_LDT:
3353         {
3354             unsigned int ents = op.arg2.nr_ents;
3355             unsigned long ptr = ents ? op.arg1.linear_addr : 0;
3356 
3357             if ( unlikely(currd != pg_owner) )
3358                 rc = -EPERM;
3359             else if ( paging_mode_external(currd) )
3360                 rc = -EINVAL;
3361             else if ( ((ptr & (PAGE_SIZE - 1)) != 0) || !__addr_ok(ptr) ||
3362                       (ents > 8192) )
3363             {
3364                 gdprintk(XENLOG_WARNING,
3365                          "Bad args to SET_LDT: ptr=%lx, ents=%x\n", ptr, ents);
3366                 rc = -EINVAL;
3367             }
3368             else if ( (curr->arch.pv_vcpu.ldt_ents != ents) ||
3369                       (curr->arch.pv_vcpu.ldt_base != ptr) )
3370             {
3371                 invalidate_shadow_ldt(curr, 0);
3372                 flush_tlb_local();
3373                 curr->arch.pv_vcpu.ldt_base = ptr;
3374                 curr->arch.pv_vcpu.ldt_ents = ents;
3375                 load_LDT(curr);
3376             }
3377             break;
3378         }
3379 
3380         case MMUEXT_CLEAR_PAGE:
3381             page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt, P2M_ALLOC);
3382             if ( unlikely(p2mt != p2m_ram_rw) && page )
3383             {
3384                 put_page(page);
3385                 page = NULL;
3386             }
3387             if ( !page || !get_page_type(page, PGT_writable_page) )
3388             {
3389                 if ( page )
3390                     put_page(page);
3391                 gdprintk(XENLOG_WARNING,
3392                          "Error clearing mfn %" PRI_mfn "\n", op.arg1.mfn);
3393                 rc = -EINVAL;
3394                 break;
3395             }
3396 
3397             /* A page is dirtied when it's being cleared. */
3398             paging_mark_dirty(pg_owner, page_to_mfn(page));
3399 
3400             clear_domain_page(page_to_mfn(page));
3401 
3402             put_page_and_type(page);
3403             break;
3404 
3405         case MMUEXT_COPY_PAGE:
3406         {
3407             struct page_info *src_page, *dst_page;
3408 
3409             src_page = get_page_from_gfn(pg_owner, op.arg2.src_mfn, &p2mt,
3410                                          P2M_ALLOC);
3411             if ( unlikely(p2mt != p2m_ram_rw) && src_page )
3412             {
3413                 put_page(src_page);
3414                 src_page = NULL;
3415             }
3416             if ( unlikely(!src_page) )
3417             {
3418                 gdprintk(XENLOG_WARNING,
3419                          "Error copying from mfn %" PRI_mfn "\n",
3420                          op.arg2.src_mfn);
3421                 rc = -EINVAL;
3422                 break;
3423             }
3424 
3425             dst_page = get_page_from_gfn(pg_owner, op.arg1.mfn, &p2mt,
3426                                          P2M_ALLOC);
3427             if ( unlikely(p2mt != p2m_ram_rw) && dst_page )
3428             {
3429                 put_page(dst_page);
3430                 dst_page = NULL;
3431             }
3432             rc = (dst_page &&
3433                   get_page_type(dst_page, PGT_writable_page)) ? 0 : -EINVAL;
3434             if ( unlikely(rc) )
3435             {
3436                 put_page(src_page);
3437                 if ( dst_page )
3438                     put_page(dst_page);
3439                 gdprintk(XENLOG_WARNING,
3440                          "Error copying to mfn %" PRI_mfn "\n", op.arg1.mfn);
3441                 break;
3442             }
3443 
3444             /* A page is dirtied when it's being copied to. */
3445             paging_mark_dirty(pg_owner, page_to_mfn(dst_page));
3446 
3447             copy_domain_page(page_to_mfn(dst_page), page_to_mfn(src_page));
3448 
3449             put_page_and_type(dst_page);
3450             put_page(src_page);
3451             break;
3452         }
3453 
3454         case MMUEXT_MARK_SUPER:
3455         case MMUEXT_UNMARK_SUPER:
3456             rc = -EOPNOTSUPP;
3457             break;
3458 
3459         default:
3460             rc = -ENOSYS;
3461             break;
3462         }
3463 
3464  done:
3465         if ( unlikely(rc) )
3466             break;
3467 
3468         guest_handle_add_offset(uops, 1);
3469     }
3470 
3471     if ( rc == -ERESTART )
3472     {
3473         ASSERT(i < count);
3474         rc = hypercall_create_continuation(
3475             __HYPERVISOR_mmuext_op, "hihi",
3476             uops, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3477     }
3478     else if ( curr->arch.old_guest_table )
3479     {
3480         XEN_GUEST_HANDLE_PARAM(void) null;
3481 
3482         ASSERT(rc || i == count);
3483         set_xen_guest_handle(null, NULL);
3484         /*
3485          * In order to have a way to communicate the final return value to
3486          * our continuation, we pass this in place of "foreigndom", building
3487          * on the fact that this argument isn't needed anymore.
3488          */
3489         rc = hypercall_create_continuation(
3490                 __HYPERVISOR_mmuext_op, "hihi", null,
3491                 MMU_UPDATE_PREEMPTED, null, rc);
3492     }
3493 
3494     put_pg_owner(pg_owner);
3495 
3496     perfc_add(num_mmuext_ops, i);
3497 
3498     /* Add incremental work we have done to the @done output parameter. */
3499     if ( unlikely(!guest_handle_is_null(pdone)) )
3500     {
3501         done += i;
3502         copy_to_guest(pdone, &done, 1);
3503     }
3504 
3505     return rc;
3506 }
3507 
3508 long do_mmu_update(
3509     XEN_GUEST_HANDLE_PARAM(mmu_update_t) ureqs,
3510     unsigned int count,
3511     XEN_GUEST_HANDLE_PARAM(uint) pdone,
3512     unsigned int foreigndom)
3513 {
3514     struct mmu_update req;
3515     void *va = NULL;
3516     unsigned long gpfn, gmfn, mfn;
3517     struct page_info *page;
3518     unsigned int cmd, i = 0, done = 0, pt_dom;
3519     struct vcpu *curr = current, *v = curr;
3520     struct domain *d = v->domain, *pt_owner = d, *pg_owner;
3521     mfn_t map_mfn = INVALID_MFN;
3522     uint32_t xsm_needed = 0;
3523     uint32_t xsm_checked = 0;
3524     int rc = put_old_guest_table(curr);
3525 
3526     if ( unlikely(rc) )
3527     {
3528         if ( likely(rc == -ERESTART) )
3529             rc = hypercall_create_continuation(
3530                      __HYPERVISOR_mmu_update, "hihi", ureqs, count, pdone,
3531                      foreigndom);
3532         return rc;
3533     }
3534 
3535     if ( unlikely(count == MMU_UPDATE_PREEMPTED) &&
3536          likely(guest_handle_is_null(ureqs)) )
3537     {
3538         /*
3539          * See the curr->arch.old_guest_table related
3540          * hypercall_create_continuation() below.
3541          */
3542         return (int)foreigndom;
3543     }
3544 
3545     if ( unlikely(count & MMU_UPDATE_PREEMPTED) )
3546     {
3547         count &= ~MMU_UPDATE_PREEMPTED;
3548         if ( unlikely(!guest_handle_is_null(pdone)) )
3549             (void)copy_from_guest(&done, pdone, 1);
3550     }
3551     else
3552         perfc_incr(calls_to_mmu_update);
3553 
3554     if ( unlikely(!guest_handle_okay(ureqs, count)) )
3555         return -EFAULT;
3556 
3557     if ( (pt_dom = foreigndom >> 16) != 0 )
3558     {
3559         /* Pagetables belong to a foreign domain (PFD). */
3560         if ( (pt_owner = rcu_lock_domain_by_id(pt_dom - 1)) == NULL )
3561             return -ESRCH;
3562 
3563         if ( pt_owner == d )
3564             rcu_unlock_domain(pt_owner);
3565         else if ( !pt_owner->vcpu || (v = pt_owner->vcpu[0]) == NULL )
3566         {
3567             rc = -EINVAL;
3568             goto out;
3569         }
3570     }
3571 
3572     if ( (pg_owner = get_pg_owner((uint16_t)foreigndom)) == NULL )
3573     {
3574         rc = -ESRCH;
3575         goto out;
3576     }
3577 
3578     for ( i = 0; i < count; i++ )
3579     {
3580         if ( curr->arch.old_guest_table || (i && hypercall_preempt_check()) )
3581         {
3582             rc = -ERESTART;
3583             break;
3584         }
3585 
3586         if ( unlikely(__copy_from_guest(&req, ureqs, 1) != 0) )
3587         {
3588             rc = -EFAULT;
3589             break;
3590         }
3591 
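        /*
         * The low bits of req.ptr encode the command; the remaining bits form
         * the machine address of the PTE to be updated.
         */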
3592         cmd = req.ptr & (sizeof(l1_pgentry_t)-1);
3593 
3594         switch ( cmd )
3595         {
3596             /*
3597              * MMU_NORMAL_PT_UPDATE: Normal update to any level of page table.
3598              * MMU_PT_UPDATE_PRESERVE_AD: As above, but also preserve (OR in)
3599              * the current A/D bits.
3600              */
3601         case MMU_NORMAL_PT_UPDATE:
3602         case MMU_PT_UPDATE_PRESERVE_AD:
3603         {
3604             p2m_type_t p2mt;
3605 
3606             rc = -EOPNOTSUPP;
3607             if ( unlikely(paging_mode_refcounts(pt_owner)) )
3608                 break;
3609 
3610             xsm_needed |= XSM_MMU_NORMAL_UPDATE;
3611             if ( get_pte_flags(req.val) & _PAGE_PRESENT )
3612             {
3613                 xsm_needed |= XSM_MMU_UPDATE_READ;
3614                 if ( get_pte_flags(req.val) & _PAGE_RW )
3615                     xsm_needed |= XSM_MMU_UPDATE_WRITE;
3616             }
3617             if ( xsm_needed != xsm_checked )
3618             {
3619                 rc = xsm_mmu_update(XSM_TARGET, d, pt_owner, pg_owner, xsm_needed);
3620                 if ( rc )
3621                     break;
3622                 xsm_checked = xsm_needed;
3623             }
3624             rc = -EINVAL;
3625 
3626             req.ptr -= cmd;
3627             gmfn = req.ptr >> PAGE_SHIFT;
3628             page = get_page_from_gfn(pt_owner, gmfn, &p2mt, P2M_ALLOC);
3629 
3630             if ( p2m_is_paged(p2mt) )
3631             {
3632                 ASSERT(!page);
3633                 p2m_mem_paging_populate(pt_owner, gmfn);
3634                 rc = -ENOENT;
3635                 break;
3636             }
3637 
3638             if ( unlikely(!page) )
3639             {
3640                 gdprintk(XENLOG_WARNING,
3641                          "Could not get page for normal update\n");
3642                 break;
3643             }
3644 
3645             mfn = mfn_x(page_to_mfn(page));
3646 
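            /*
             * map_mfn caches the currently mapped page-table frame, so only
             * re-map when this request's PTE lives in a different frame from
             * the previous one.
             */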
3647             if ( !mfn_eq(_mfn(mfn), map_mfn) )
3648             {
3649                 if ( va )
3650                     unmap_domain_page(va);
3651                 va = map_domain_page(_mfn(mfn));
3652                 map_mfn = _mfn(mfn);
3653             }
3654             va = _p(((unsigned long)va & PAGE_MASK) + (req.ptr & ~PAGE_MASK));
3655 
3656             if ( page_lock(page) )
3657             {
3658                 switch ( page->u.inuse.type_info & PGT_type_mask )
3659                 {
3660                 case PGT_l1_page_table:
3661                     rc = mod_l1_entry(va, l1e_from_intpte(req.val), mfn,
3662                                       cmd == MMU_PT_UPDATE_PRESERVE_AD, v,
3663                                       pg_owner);
3664                     break;
3665                 case PGT_l2_page_table:
3666                     rc = mod_l2_entry(va, l2e_from_intpte(req.val), mfn,
3667                                       cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3668                     break;
3669                 case PGT_l3_page_table:
3670                     rc = mod_l3_entry(va, l3e_from_intpte(req.val), mfn,
3671                                       cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3672                     break;
3673                 case PGT_l4_page_table:
3674                     rc = mod_l4_entry(va, l4e_from_intpte(req.val), mfn,
3675                                       cmd == MMU_PT_UPDATE_PRESERVE_AD, v);
3676                     break;
3677                 case PGT_writable_page:
3678                     perfc_incr(writable_mmu_updates);
3679                     if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
3680                         rc = 0;
3681                     break;
3682                 }
3683                 page_unlock(page);
3684                 if ( rc == -EINTR )
3685                     rc = -ERESTART;
3686             }
3687             else if ( get_page_type(page, PGT_writable_page) )
3688             {
3689                 perfc_incr(writable_mmu_updates);
3690                 if ( paging_write_guest_entry(v, va, req.val, _mfn(mfn)) )
3691                     rc = 0;
3692                 put_page_type(page);
3693             }
3694 
3695             put_page(page);
3696         }
3697         break;
3698 
3699         case MMU_MACHPHYS_UPDATE:
3700             if ( unlikely(d != pt_owner) )
3701             {
3702                 rc = -EPERM;
3703                 break;
3704             }
3705 
3706             if ( unlikely(paging_mode_translate(pg_owner)) )
3707             {
3708                 rc = -EINVAL;
3709                 break;
3710             }
3711 
3712             mfn = req.ptr >> PAGE_SHIFT;
3713             gpfn = req.val;
3714 
3715             xsm_needed |= XSM_MMU_MACHPHYS_UPDATE;
3716             if ( xsm_needed != xsm_checked )
3717             {
3718                 rc = xsm_mmu_update(XSM_TARGET, d, NULL, pg_owner, xsm_needed);
3719                 if ( rc )
3720                     break;
3721                 xsm_checked = xsm_needed;
3722             }
3723 
3724             if ( unlikely(!get_page_from_mfn(_mfn(mfn), pg_owner)) )
3725             {
3726                 gdprintk(XENLOG_WARNING,
3727                          "Could not get page for mach->phys update\n");
3728                 rc = -EINVAL;
3729                 break;
3730             }
3731 
3732             set_gpfn_from_mfn(mfn, gpfn);
3733 
3734             paging_mark_dirty(pg_owner, _mfn(mfn));
3735 
3736             put_page(mfn_to_page(_mfn(mfn)));
3737             break;
3738 
3739         default:
3740             rc = -ENOSYS;
3741             break;
3742         }
3743 
3744         if ( unlikely(rc) )
3745             break;
3746 
3747         guest_handle_add_offset(ureqs, 1);
3748     }
3749 
3750     if ( rc == -ERESTART )
3751     {
3752         ASSERT(i < count);
3753         rc = hypercall_create_continuation(
3754             __HYPERVISOR_mmu_update, "hihi",
3755             ureqs, (count - i) | MMU_UPDATE_PREEMPTED, pdone, foreigndom);
3756     }
3757     else if ( curr->arch.old_guest_table )
3758     {
3759         XEN_GUEST_HANDLE_PARAM(void) null;
3760 
3761         ASSERT(rc || i == count);
3762         set_xen_guest_handle(null, NULL);
3763         /*
3764          * In order to have a way to communicate the final return value to
3765          * our continuation, we pass this in place of "foreigndom", building
3766          * on the fact that this argument isn't needed anymore.
3767          */
3768         rc = hypercall_create_continuation(
3769                 __HYPERVISOR_mmu_update, "hihi", null,
3770                 MMU_UPDATE_PREEMPTED, null, rc);
3771     }
3772 
3773     put_pg_owner(pg_owner);
3774 
3775     if ( va )
3776         unmap_domain_page(va);
3777 
3778     perfc_add(num_page_updates, i);
3779 
3780  out:
3781     if ( pt_owner != d )
3782         rcu_unlock_domain(pt_owner);
3783 
3784     /* Add incremental work we have done to the @done output parameter. */
3785     if ( unlikely(!guest_handle_is_null(pdone)) )
3786     {
3787         done += i;
3788         copy_to_guest(pdone, &done, 1);
3789     }
3790 
3791     return rc;
3792 }
3793 
3794 int donate_page(
3795     struct domain *d, struct page_info *page, unsigned int memflags)
3796 {
3797     const struct domain *owner = dom_xen;
3798 
3799     spin_lock(&d->page_alloc_lock);
3800 
3801     if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != NULL) )
3802         goto fail;
3803 
3804     if ( d->is_dying )
3805         goto fail;
3806 
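    /*
     * Refuse pages holding more than one reference, or any state other than
     * PGC_allocated.
     */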
3807     if ( page->count_info & ~(PGC_allocated | 1) )
3808         goto fail;
3809 
3810     if ( !(memflags & MEMF_no_refcount) )
3811     {
3812         if ( d->tot_pages >= d->max_pages )
3813             goto fail;
3814         domain_adjust_tot_pages(d, 1);
3815     }
3816 
3817     page->count_info = PGC_allocated | 1;
3818     page_set_owner(page, d);
3819     page_list_add_tail(page, &d->page_list);
3820 
3821     spin_unlock(&d->page_alloc_lock);
3822     return 0;
3823 
3824  fail:
3825     spin_unlock(&d->page_alloc_lock);
3826     gdprintk(XENLOG_WARNING, "Bad donate mfn %" PRI_mfn
3827              " to d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
3828              mfn_x(page_to_mfn(page)), d->domain_id,
3829              owner ? owner->domain_id : DOMID_INVALID,
3830              page->count_info, page->u.inuse.type_info);
3831     return -EINVAL;
3832 }
3833 
3834 int steal_page(
3835     struct domain *d, struct page_info *page, unsigned int memflags)
3836 {
3837     unsigned long x, y;
3838     bool drop_dom_ref = false;
3839     const struct domain *owner = dom_xen;
3840 
3841     if ( paging_mode_external(d) )
3842         return -EOPNOTSUPP;
3843 
3844     spin_lock(&d->page_alloc_lock);
3845 
3846     if ( is_xen_heap_page(page) || ((owner = page_get_owner(page)) != d) )
3847         goto fail;
3848 
3849     /*
3850      * We require there is just one reference (PGC_allocated). We temporarily
3851      * drop this reference now so that we can safely swizzle the owner.
3852      */
3853     y = page->count_info;
3854     do {
3855         x = y;
3856         if ( (x & (PGC_count_mask|PGC_allocated)) != (1 | PGC_allocated) )
3857             goto fail;
3858         y = cmpxchg(&page->count_info, x, x & ~PGC_count_mask);
3859     } while ( y != x );
3860 
3861     /*
3862      * With the sole reference dropped temporarily, no-one can update type
3863      * information. Type count also needs to be zero in this case, but e.g.
3864      * PGT_seg_desc_page may still have PGT_validated set, which we need to
3865      * clear before transferring ownership (as validation criteria vary
3866      * depending on domain type).
3867      */
3868     BUG_ON(page->u.inuse.type_info & (PGT_count_mask | PGT_locked |
3869                                       PGT_pinned));
3870     page->u.inuse.type_info = 0;
3871 
3872     /* Swizzle the owner then reinstate the PGC_allocated reference. */
3873     page_set_owner(page, NULL);
3874     y = page->count_info;
3875     do {
3876         x = y;
3877         BUG_ON((x & (PGC_count_mask|PGC_allocated)) != PGC_allocated);
3878     } while ( (y = cmpxchg(&page->count_info, x, x | 1)) != x );
3879 
3880     /* Unlink from original owner. */
3881     if ( !(memflags & MEMF_no_refcount) && !domain_adjust_tot_pages(d, -1) )
3882         drop_dom_ref = true;
3883     page_list_del(page, &d->page_list);
3884 
3885     spin_unlock(&d->page_alloc_lock);
3886     if ( unlikely(drop_dom_ref) )
3887         put_domain(d);
3888     return 0;
3889 
3890  fail:
3891     spin_unlock(&d->page_alloc_lock);
3892     gdprintk(XENLOG_WARNING, "Bad steal mfn %" PRI_mfn
3893              " from d%d (owner d%d) caf=%08lx taf=%" PRtype_info "\n",
3894              mfn_x(page_to_mfn(page)), d->domain_id,
3895              owner ? owner->domain_id : DOMID_INVALID,
3896              page->count_info, page->u.inuse.type_info);
3897     return -EINVAL;
3898 }
3899 
3900 static int __do_update_va_mapping(
3901     unsigned long va, u64 val64, unsigned long flags, struct domain *pg_owner)
3902 {
3903     l1_pgentry_t   val = l1e_from_intpte(val64);
3904     struct vcpu   *v   = current;
3905     struct domain *d   = v->domain;
3906     struct page_info *gl1pg;
3907     l1_pgentry_t  *pl1e;
3908     unsigned long  bmap_ptr;
3909     mfn_t          gl1mfn;
3910     cpumask_t     *mask = NULL;
3911     int            rc;
3912 
3913     perfc_incr(calls_to_update_va);
3914 
3915     rc = xsm_update_va_mapping(XSM_TARGET, d, pg_owner, val);
3916     if ( rc )
3917         return rc;
3918 
3919     rc = -EINVAL;
3920     pl1e = map_guest_l1e(va, &gl1mfn);
3921     if ( unlikely(!pl1e || !get_page_from_mfn(gl1mfn, d)) )
3922         goto out;
3923 
3924     gl1pg = mfn_to_page(gl1mfn);
3925     if ( !page_lock(gl1pg) )
3926     {
3927         put_page(gl1pg);
3928         goto out;
3929     }
3930 
3931     if ( (gl1pg->u.inuse.type_info & PGT_type_mask) != PGT_l1_page_table )
3932     {
3933         page_unlock(gl1pg);
3934         put_page(gl1pg);
3935         goto out;
3936     }
3937 
3938     rc = mod_l1_entry(pl1e, val, mfn_x(gl1mfn), 0, v, pg_owner);
3939 
3940     page_unlock(gl1pg);
3941     put_page(gl1pg);
3942 
3943  out:
3944     if ( pl1e )
3945         unmap_domain_page(pl1e);
3946 
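    /*
     * The low flag bits select the flush type; the remaining bits either
     * select a local/all flush or are interpreted as the guest address of a
     * vCPU bitmap identifying the vCPUs whose TLB entries must be flushed.
     */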
3947     switch ( flags & UVMF_FLUSHTYPE_MASK )
3948     {
3949     case UVMF_TLB_FLUSH:
3950         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3951         {
3952         case UVMF_LOCAL:
3953             flush_tlb_local();
3954             break;
3955         case UVMF_ALL:
3956             mask = d->domain_dirty_cpumask;
3957             break;
3958         default:
3959             mask = this_cpu(scratch_cpumask);
3960             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3961                                                                      void),
3962                                       mask);
3963             break;
3964         }
3965         if ( mask )
3966             flush_tlb_mask(mask);
3967         break;
3968 
3969     case UVMF_INVLPG:
3970         switch ( (bmap_ptr = flags & ~UVMF_FLUSHTYPE_MASK) )
3971         {
3972         case UVMF_LOCAL:
3973             paging_invlpg(v, va);
3974             break;
3975         case UVMF_ALL:
3976             mask = d->domain_dirty_cpumask;
3977             break;
3978         default:
3979             mask = this_cpu(scratch_cpumask);
3980             rc = vcpumask_to_pcpumask(d, const_guest_handle_from_ptr(bmap_ptr,
3981                                                                      void),
3982                                       mask);
3983             break;
3984         }
3985         if ( mask )
3986             flush_tlb_one_mask(mask, va);
3987         break;
3988     }
3989 
3990     return rc;
3991 }
3992 
3993 long do_update_va_mapping(unsigned long va, u64 val64,
3994                           unsigned long flags)
3995 {
3996     return __do_update_va_mapping(va, val64, flags, current->domain);
3997 }
3998 
3999 long do_update_va_mapping_otherdomain(unsigned long va, u64 val64,
4000                                       unsigned long flags,
4001                                       domid_t domid)
4002 {
4003     struct domain *pg_owner;
4004     int rc;
4005 
4006     if ( (pg_owner = get_pg_owner(domid)) == NULL )
4007         return -ESRCH;
4008 
4009     rc = __do_update_va_mapping(va, val64, flags, pg_owner);
4010 
4011     put_pg_owner(pg_owner);
4012 
4013     return rc;
4014 }
4015 
4016 typedef struct e820entry e820entry_t;
4017 DEFINE_XEN_GUEST_HANDLE(e820entry_t);
4018 
4019 struct memory_map_context
4020 {
4021     unsigned int n;
4022     unsigned long s;
4023     struct xen_memory_map map;
4024 };
4025 
4026 static int _handle_iomem_range(unsigned long s, unsigned long e,
4027                                struct memory_map_context *ctxt)
4028 {
4029     if ( s > ctxt->s && !(s >> (paddr_bits - PAGE_SHIFT)) )
4030     {
4031         e820entry_t ent;
4032         XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param;
4033         XEN_GUEST_HANDLE(e820entry_t) buffer;
4034 
4035         if ( !guest_handle_is_null(ctxt->map.buffer) )
4036         {
4037             if ( ctxt->n + 1 >= ctxt->map.nr_entries )
4038                 return -EINVAL;
4039             ent.addr = (uint64_t)ctxt->s << PAGE_SHIFT;
4040             ent.size = (uint64_t)(s - ctxt->s) << PAGE_SHIFT;
4041             ent.type = E820_RESERVED;
4042             buffer_param = guest_handle_cast(ctxt->map.buffer, e820entry_t);
4043             buffer = guest_handle_from_param(buffer_param, e820entry_t);
4044             if ( __copy_to_guest_offset(buffer, ctxt->n, &ent, 1) )
4045                 return -EFAULT;
4046         }
4047         ctxt->n++;
4048     }
4049     ctxt->s = e + 1;
4050 
4051     return 0;
4052 }
4053 
4054 static int handle_iomem_range(unsigned long s, unsigned long e, void *p)
4055 {
4056     int err = 0;
4057 
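    /*
     * Split the range at any IO-APIC frames, forwarding each remaining piece
     * to _handle_iomem_range() so the IO-APIC pages themselves are skipped.
     */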
4058     do {
4059         unsigned long low = -1UL;
4060         unsigned int i;
4061 
4062         for ( i = 0; i < nr_ioapics; ++i )
4063         {
4064             unsigned long mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
4065 
4066             if ( mfn >= s && mfn <= e && mfn < low )
4067                 low = mfn;
4068         }
4069         if ( !(low + 1) )
4070             break;
4071         if ( s < low )
4072             err = _handle_iomem_range(s, low - 1, p);
4073         s = low + 1;
4074     } while ( !err );
4075 
4076     return err || s > e ? err : _handle_iomem_range(s, e, p);
4077 }
4078 
4079 int xenmem_add_to_physmap_one(
4080     struct domain *d,
4081     unsigned int space,
4082     union xen_add_to_physmap_batch_extra extra,
4083     unsigned long idx,
4084     gfn_t gpfn)
4085 {
4086     struct page_info *page = NULL;
4087     unsigned long gfn = 0; /* gcc ... */
4088     unsigned long prev_mfn, old_gpfn;
4089     int rc = 0;
4090     mfn_t mfn = INVALID_MFN;
4091     p2m_type_t p2mt;
4092 
4093     if ( !paging_mode_translate(d) )
4094         return -EACCES;
4095 
4096     switch ( space )
4097     {
4098         case XENMAPSPACE_shared_info:
4099             if ( idx == 0 )
4100                 mfn = _mfn(virt_to_mfn(d->shared_info));
4101             break;
4102         case XENMAPSPACE_grant_table:
4103             rc = gnttab_map_frame(d, idx, gpfn, &mfn);
4104             if ( rc )
4105                 return rc;
4106             break;
4107         case XENMAPSPACE_gmfn_range:
4108         case XENMAPSPACE_gmfn:
4109         {
4110             p2m_type_t p2mt;
4111 
4112             gfn = idx;
4113             idx = mfn_x(get_gfn_unshare(d, idx, &p2mt));
4114             /* If the page is still shared, exit early */
4115             if ( p2m_is_shared(p2mt) )
4116             {
4117                 put_gfn(d, gfn);
4118                 return -ENOMEM;
4119             }
4120             if ( !get_page_from_mfn(_mfn(idx), d) )
4121                 break;
4122             mfn = _mfn(idx);
4123             page = mfn_to_page(mfn);
4124             break;
4125         }
4126         case XENMAPSPACE_gmfn_foreign:
4127             return p2m_add_foreign(d, idx, gfn_x(gpfn), extra.foreign_domid);
4128         default:
4129             break;
4130     }
4131 
4132     if ( mfn_eq(mfn, INVALID_MFN) )
4133     {
4134         rc = -EINVAL;
4135         goto put_both;
4136     }
4137 
4138     /* Remove previously mapped page if it was present. */
4139     prev_mfn = mfn_x(get_gfn(d, gfn_x(gpfn), &p2mt));
4140     if ( mfn_valid(_mfn(prev_mfn)) )
4141     {
4142         if ( is_xen_heap_mfn(prev_mfn) )
4143             /* Xen heap frames are simply unhooked from this phys slot. */
4144             rc = guest_physmap_remove_page(d, gpfn, _mfn(prev_mfn), PAGE_ORDER_4K);
4145         else
4146             /* Normal domain memory is freed, to avoid leaking memory. */
4147             rc = guest_remove_page(d, gfn_x(gpfn));
4148     }
4149     /* In the XENMAPSPACE_gmfn case we still hold a ref on the old page. */
4150     put_gfn(d, gfn_x(gpfn));
4151 
4152     if ( rc )
4153         goto put_both;
4154 
4155     /* Unmap from old location, if any. */
4156     old_gpfn = get_gpfn_from_mfn(mfn_x(mfn));
4157     ASSERT( old_gpfn != SHARED_M2P_ENTRY );
4158     if ( (space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range) &&
4159          old_gpfn != gfn )
4160     {
4161         rc = -EXDEV;
4162         goto put_both;
4163     }
4164     if ( old_gpfn != INVALID_M2P_ENTRY )
4165         rc = guest_physmap_remove_page(d, _gfn(old_gpfn), mfn, PAGE_ORDER_4K);
4166 
4167     /* Map at new location. */
4168     if ( !rc )
4169         rc = guest_physmap_add_page(d, gpfn, mfn, PAGE_ORDER_4K);
4170 
4171  put_both:
4172     /* In the XENMAPSPACE_gmfn{,_range} cases we took a ref on the gfn at the top. */
4173     if ( space == XENMAPSPACE_gmfn || space == XENMAPSPACE_gmfn_range )
4174         put_gfn(d, gfn);
4175 
4176     if ( page )
4177         put_page(page);
4178 
4179     return rc;
4180 }
4181 
4182 long arch_memory_op(unsigned long cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
4183 {
4184     int rc;
4185 
4186     switch ( cmd )
4187     {
4188     case XENMEM_set_memory_map:
4189     {
4190         struct xen_foreign_memory_map fmap;
4191         struct domain *d;
4192         struct e820entry *e820;
4193 
4194         if ( copy_from_guest(&fmap, arg, 1) )
4195             return -EFAULT;
4196 
4197         if ( fmap.map.nr_entries > E820MAX )
4198             return -EINVAL;
4199 
4200         d = rcu_lock_domain_by_any_id(fmap.domid);
4201         if ( d == NULL )
4202             return -ESRCH;
4203 
4204         rc = xsm_domain_memory_map(XSM_TARGET, d);
4205         if ( rc )
4206         {
4207             rcu_unlock_domain(d);
4208             return rc;
4209         }
4210 
4211         e820 = xmalloc_array(e820entry_t, fmap.map.nr_entries);
4212         if ( e820 == NULL )
4213         {
4214             rcu_unlock_domain(d);
4215             return -ENOMEM;
4216         }
4217 
4218         if ( copy_from_guest(e820, fmap.map.buffer, fmap.map.nr_entries) )
4219         {
4220             xfree(e820);
4221             rcu_unlock_domain(d);
4222             return -EFAULT;
4223         }
4224 
4225         spin_lock(&d->arch.e820_lock);
4226         xfree(d->arch.e820);
4227         d->arch.e820 = e820;
4228         d->arch.nr_e820 = fmap.map.nr_entries;
4229         spin_unlock(&d->arch.e820_lock);
4230 
4231         rcu_unlock_domain(d);
4232         return rc;
4233     }
4234 
4235     case XENMEM_memory_map:
4236     {
4237         struct xen_memory_map map;
4238         struct domain *d = current->domain;
4239 
4240         if ( copy_from_guest(&map, arg, 1) )
4241             return -EFAULT;
4242 
4243         spin_lock(&d->arch.e820_lock);
4244 
4245         /* Backwards compatibility. */
4246         if ( (d->arch.nr_e820 == 0) || (d->arch.e820 == NULL) )
4247         {
4248             spin_unlock(&d->arch.e820_lock);
4249             return -ENOSYS;
4250         }
4251 
4252         map.nr_entries = min(map.nr_entries, d->arch.nr_e820);
4253         if ( copy_to_guest(map.buffer, d->arch.e820, map.nr_entries) ||
4254              __copy_to_guest(arg, &map, 1) )
4255         {
4256             spin_unlock(&d->arch.e820_lock);
4257             return -EFAULT;
4258         }
4259 
4260         spin_unlock(&d->arch.e820_lock);
4261         return 0;
4262     }
4263 
4264     case XENMEM_machine_memory_map:
4265     {
4266         struct memory_map_context ctxt;
4267         XEN_GUEST_HANDLE(e820entry_t) buffer;
4268         XEN_GUEST_HANDLE_PARAM(e820entry_t) buffer_param;
4269         unsigned int i;
4270         bool store;
4271 
4272         rc = xsm_machine_memory_map(XSM_PRIV);
4273         if ( rc )
4274             return rc;
4275 
4276         if ( copy_from_guest(&ctxt.map, arg, 1) )
4277             return -EFAULT;
4278 
4279         store = !guest_handle_is_null(ctxt.map.buffer);
4280 
4281         if ( store && ctxt.map.nr_entries < e820.nr_map + 1 )
4282             return -EINVAL;
4283 
4284         buffer_param = guest_handle_cast(ctxt.map.buffer, e820entry_t);
4285         buffer = guest_handle_from_param(buffer_param, e820entry_t);
4286         if ( store && !guest_handle_okay(buffer, ctxt.map.nr_entries) )
4287             return -EFAULT;
4288 
4289         for ( i = 0, ctxt.n = 0, ctxt.s = 0; i < e820.nr_map; ++i, ++ctxt.n )
4290         {
4291             unsigned long s = PFN_DOWN(e820.map[i].addr);
4292 
4293             if ( s > ctxt.s )
4294             {
4295                 rc = rangeset_report_ranges(current->domain->iomem_caps,
4296                                             ctxt.s, s - 1,
4297                                             handle_iomem_range, &ctxt);
4298                 if ( !rc )
4299                     rc = handle_iomem_range(s, s, &ctxt);
4300                 if ( rc )
4301                     return rc;
4302             }
4303             if ( store )
4304             {
4305                 if ( ctxt.map.nr_entries <= ctxt.n + (e820.nr_map - i) )
4306                     return -EINVAL;
4307                 if ( __copy_to_guest_offset(buffer, ctxt.n, e820.map + i, 1) )
4308                     return -EFAULT;
4309             }
4310             ctxt.s = PFN_UP(e820.map[i].addr + e820.map[i].size);
4311         }
4312 
4313         if ( ctxt.s )
4314         {
4315             rc = rangeset_report_ranges(current->domain->iomem_caps, ctxt.s,
4316                                         ~0UL, handle_iomem_range, &ctxt);
4317             if ( !rc && ctxt.s )
4318                 rc = handle_iomem_range(~0UL, ~0UL, &ctxt);
4319             if ( rc )
4320                 return rc;
4321         }
4322 
4323         ctxt.map.nr_entries = ctxt.n;
4324 
4325         if ( __copy_to_guest(arg, &ctxt.map, 1) )
4326             return -EFAULT;
4327 
4328         return 0;
4329     }
4330 
4331     case XENMEM_machphys_mapping:
4332     {
4333         struct xen_machphys_mapping mapping = {
4334             .v_start = MACH2PHYS_VIRT_START,
4335             .v_end   = MACH2PHYS_VIRT_END,
4336             .max_mfn = MACH2PHYS_NR_ENTRIES - 1
4337         };
4338 
4339         if ( !mem_hotplug && is_hardware_domain(current->domain) )
4340             mapping.max_mfn = max_page - 1;
4341         if ( copy_to_guest(arg, &mapping, 1) )
4342             return -EFAULT;
4343 
4344         return 0;
4345     }
4346 
4347     case XENMEM_set_pod_target:
4348     case XENMEM_get_pod_target:
4349     {
4350         xen_pod_target_t target;
4351         struct domain *d;
4352         struct p2m_domain *p2m;
4353 
4354         if ( copy_from_guest(&target, arg, 1) )
4355             return -EFAULT;
4356 
4357         d = rcu_lock_domain_by_any_id(target.domid);
4358         if ( d == NULL )
4359             return -ESRCH;
4360 
4361         if ( cmd == XENMEM_set_pod_target )
4362             rc = xsm_set_pod_target(XSM_PRIV, d);
4363         else
4364             rc = xsm_get_pod_target(XSM_PRIV, d);
4365 
4366         if ( rc != 0 )
4367             goto pod_target_out_unlock;
4368 
4369         if ( cmd == XENMEM_set_pod_target )
4370         {
4371             if ( target.target_pages > d->max_pages )
4372             {
4373                 rc = -EINVAL;
4374                 goto pod_target_out_unlock;
4375             }
4376 
4377             rc = p2m_pod_set_mem_target(d, target.target_pages);
4378         }
4379 
4380         if ( rc == -ERESTART )
4381         {
4382             rc = hypercall_create_continuation(
4383                 __HYPERVISOR_memory_op, "lh", cmd, arg);
4384         }
4385         else if ( rc >= 0 )
4386         {
4387             p2m = p2m_get_hostp2m(d);
4388             target.tot_pages       = d->tot_pages;
4389             target.pod_cache_pages = p2m->pod.count;
4390             target.pod_entries     = p2m->pod.entry_count;
4391 
4392             if ( __copy_to_guest(arg, &target, 1) )
4393             {
4394                 rc = -EFAULT;
4395                 goto pod_target_out_unlock;
4396             }
4397         }
4398 
4399     pod_target_out_unlock:
4400         rcu_unlock_domain(d);
4401         return rc;
4402     }
4403 
4404     default:
4405         return subarch_memory_op(cmd, arg);
4406     }
4407 
4408     return 0;
4409 }
4410 
4411 int mmio_ro_emulated_write(
4412     enum x86_segment seg,
4413     unsigned long offset,
4414     void *p_data,
4415     unsigned int bytes,
4416     struct x86_emulate_ctxt *ctxt)
4417 {
4418     struct mmio_ro_emulate_ctxt *mmio_ro_ctxt = ctxt->data;
4419 
4420     /* Only allow naturally-aligned stores at the original %cr2 address. */
4421     if ( ((bytes | offset) & (bytes - 1)) || !bytes ||
4422          offset != mmio_ro_ctxt->cr2 )
4423     {
4424         gdprintk(XENLOG_WARNING, "bad access (cr2=%lx, addr=%lx, bytes=%u)\n",
4425                 mmio_ro_ctxt->cr2, offset, bytes);
4426         return X86EMUL_UNHANDLEABLE;
4427     }
4428 
4429     return X86EMUL_OKAY;
4430 }
4431 
4432 int mmcfg_intercept_write(
4433     enum x86_segment seg,
4434     unsigned long offset,
4435     void *p_data,
4436     unsigned int bytes,
4437     struct x86_emulate_ctxt *ctxt)
4438 {
4439     struct mmio_ro_emulate_ctxt *mmio_ctxt = ctxt->data;
4440 
4441     /*
4442      * Only allow naturally-aligned stores no wider than 4 bytes to the
4443      * original %cr2 address.
4444      */
4445     if ( ((bytes | offset) & (bytes - 1)) || bytes > 4 || !bytes ||
4446          offset != mmio_ctxt->cr2 )
4447     {
4448         gdprintk(XENLOG_WARNING, "bad write (cr2=%lx, addr=%lx, bytes=%u)\n",
4449                 mmio_ctxt->cr2, offset, bytes);
4450         return X86EMUL_UNHANDLEABLE;
4451     }
4452 
4453     offset &= 0xfff;
4454     if ( pci_conf_write_intercept(mmio_ctxt->seg, mmio_ctxt->bdf,
4455                                   offset, bytes, p_data) >= 0 )
4456         pci_mmcfg_write(mmio_ctxt->seg, PCI_BUS(mmio_ctxt->bdf),
4457                         PCI_DEVFN2(mmio_ctxt->bdf), offset, bytes,
4458                         *(uint32_t *)p_data);
4459 
4460     return X86EMUL_OKAY;
4461 }
4462 
4463 void *alloc_xen_pagetable(void)
4464 {
4465     if ( system_state != SYS_STATE_early_boot )
4466     {
4467         void *ptr = alloc_xenheap_page();
4468 
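        /*
         * Failure is only fatal while the hardware domain has yet to be
         * constructed (i.e. during early Xen setup); later callers are
         * expected to cope with a NULL return.
         */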
4469         BUG_ON(!hardware_domain && !ptr);
4470         return ptr;
4471     }
4472 
4473     return mfn_to_virt(mfn_x(alloc_boot_pages(1, 1)));
4474 }
4475 
4476 void free_xen_pagetable(void *v)
4477 {
4478     if ( system_state != SYS_STATE_early_boot )
4479         free_xenheap_page(v);
4480 }
4481 
4482 static DEFINE_SPINLOCK(map_pgdir_lock);
4483 
4484 static l3_pgentry_t *virt_to_xen_l3e(unsigned long v)
4485 {
4486     l4_pgentry_t *pl4e;
4487 
4488     pl4e = &idle_pg_table[l4_table_offset(v)];
4489     if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
4490     {
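        /*
         * Allocate the new L3 table before taking the lock, then re-check the
         * L4 entry under map_pgdir_lock and free the allocation if another
         * CPU populated the slot first.  The L2/L1 helpers below follow the
         * same pattern.
         */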
4491         bool locking = system_state > SYS_STATE_boot;
4492         l3_pgentry_t *pl3e = alloc_xen_pagetable();
4493 
4494         if ( !pl3e )
4495             return NULL;
4496         clear_page(pl3e);
4497         if ( locking )
4498             spin_lock(&map_pgdir_lock);
4499         if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
4500         {
4501             l4_pgentry_t l4e = l4e_from_paddr(__pa(pl3e), __PAGE_HYPERVISOR);
4502 
4503             l4e_write(pl4e, l4e);
4504             efi_update_l4_pgtable(l4_table_offset(v), l4e);
4505             pl3e = NULL;
4506         }
4507         if ( locking )
4508             spin_unlock(&map_pgdir_lock);
4509         if ( pl3e )
4510             free_xen_pagetable(pl3e);
4511     }
4512 
4513     return l4e_to_l3e(*pl4e) + l3_table_offset(v);
4514 }
4515 
4516 static l2_pgentry_t *virt_to_xen_l2e(unsigned long v)
4517 {
4518     l3_pgentry_t *pl3e;
4519 
4520     pl3e = virt_to_xen_l3e(v);
4521     if ( !pl3e )
4522         return NULL;
4523 
4524     if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4525     {
4526         bool locking = system_state > SYS_STATE_boot;
4527         l2_pgentry_t *pl2e = alloc_xen_pagetable();
4528 
4529         if ( !pl2e )
4530             return NULL;
4531         clear_page(pl2e);
4532         if ( locking )
4533             spin_lock(&map_pgdir_lock);
4534         if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4535         {
4536             l3e_write(pl3e, l3e_from_paddr(__pa(pl2e), __PAGE_HYPERVISOR));
4537             pl2e = NULL;
4538         }
4539         if ( locking )
4540             spin_unlock(&map_pgdir_lock);
4541         if ( pl2e )
4542             free_xen_pagetable(pl2e);
4543     }
4544 
4545     BUG_ON(l3e_get_flags(*pl3e) & _PAGE_PSE);
4546     return l3e_to_l2e(*pl3e) + l2_table_offset(v);
4547 }
4548 
4549 l1_pgentry_t *virt_to_xen_l1e(unsigned long v)
4550 {
4551     l2_pgentry_t *pl2e;
4552 
4553     pl2e = virt_to_xen_l2e(v);
4554     if ( !pl2e )
4555         return NULL;
4556 
4557     if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4558     {
4559         bool locking = system_state > SYS_STATE_boot;
4560         l1_pgentry_t *pl1e = alloc_xen_pagetable();
4561 
4562         if ( !pl1e )
4563             return NULL;
4564         clear_page(pl1e);
4565         if ( locking )
4566             spin_lock(&map_pgdir_lock);
4567         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4568         {
4569             l2e_write(pl2e, l2e_from_paddr(__pa(pl1e), __PAGE_HYPERVISOR));
4570             pl1e = NULL;
4571         }
4572         if ( locking )
4573             spin_unlock(&map_pgdir_lock);
4574         if ( pl1e )
4575             free_xen_pagetable(pl1e);
4576     }
4577 
4578     BUG_ON(l2e_get_flags(*pl2e) & _PAGE_PSE);
4579     return l2e_to_l1e(*pl2e) + l1_table_offset(v);
4580 }
4581 
4582 /* Convert to/from superpage-mapping flags for map_pages_to_xen(). */
4583 #define l1f_to_lNf(f) (((f) & _PAGE_PRESENT) ? ((f) |  _PAGE_PSE) : (f))
4584 #define lNf_to_l1f(f) (((f) & _PAGE_PRESENT) ? ((f) & ~_PAGE_PSE) : (f))
4585 
4586 /*
4587  * map_pages_to_xen() can be called with interrupts disabled during
4588  * early bootstrap. In this case it is safe to use flush_area_local()
4589  * and avoid locking because only the local CPU is online.
4590  */
4591 #define flush_area(v,f) (!local_irq_is_enabled() ?              \
4592                          flush_area_local((const void *)v, f) : \
4593                          flush_area_all((const void *)v, f))
4594 
4595 int map_pages_to_xen(
4596     unsigned long virt,
4597     unsigned long mfn,
4598     unsigned long nr_mfns,
4599     unsigned int flags)
4600 {
4601     bool locking = system_state > SYS_STATE_boot;
4602     l2_pgentry_t *pl2e, ol2e;
4603     l1_pgentry_t *pl1e, ol1e;
4604     unsigned int  i;
4605 
4606 #define flush_flags(oldf) do {                 \
4607     unsigned int o_ = (oldf);                  \
4608     if ( (o_) & _PAGE_GLOBAL )                 \
4609         flush_flags |= FLUSH_TLB_GLOBAL;       \
4610     if ( (flags & _PAGE_PRESENT) &&            \
4611          (((o_) ^ flags) & PAGE_CACHE_ATTRS) ) \
4612     {                                          \
4613         flush_flags |= FLUSH_CACHE;            \
4614         if ( virt >= DIRECTMAP_VIRT_START &&   \
4615              virt < HYPERVISOR_VIRT_END )      \
4616             flush_flags |= FLUSH_VA_VALID;     \
4617     }                                          \
4618 } while (0)
4619 
4620     while ( nr_mfns != 0 )
4621     {
4622         l3_pgentry_t ol3e, *pl3e = virt_to_xen_l3e(virt);
4623 
4624         if ( !pl3e )
4625             return -ENOMEM;
4626         ol3e = *pl3e;
4627 
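        /*
         * Use a 1GB superpage when supported, virt and mfn are both
         * 1GB-aligned, at least a full 1GB worth of frames remains to be
         * mapped, and neither PAT nor MAP_SMALL_PAGES was requested.
         */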
4628         if ( cpu_has_page1gb &&
4629              !(((virt >> PAGE_SHIFT) | mfn) &
4630                ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1)) &&
4631              nr_mfns >= (1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) &&
4632              !(flags & (_PAGE_PAT | MAP_SMALL_PAGES)) )
4633         {
4634             /* 1GB-page mapping. */
4635             l3e_write_atomic(pl3e, l3e_from_pfn(mfn, l1f_to_lNf(flags)));
4636 
4637             if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) )
4638             {
4639                 unsigned int flush_flags =
4640                     FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4641 
4642                 if ( l3e_get_flags(ol3e) & _PAGE_PSE )
4643                 {
4644                     flush_flags(lNf_to_l1f(l3e_get_flags(ol3e)));
4645                     flush_area(virt, flush_flags);
4646                 }
4647                 else
4648                 {
4649                     pl2e = l3e_to_l2e(ol3e);
4650                     for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4651                     {
4652                         ol2e = pl2e[i];
4653                         if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4654                             continue;
4655                         if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4656                             flush_flags(lNf_to_l1f(l2e_get_flags(ol2e)));
4657                         else
4658                         {
4659                             unsigned int j;
4660 
4661                             pl1e = l2e_to_l1e(ol2e);
4662                             for ( j = 0; j < L1_PAGETABLE_ENTRIES; j++ )
4663                                 flush_flags(l1e_get_flags(pl1e[j]));
4664                         }
4665                     }
4666                     flush_area(virt, flush_flags);
4667                     for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4668                     {
4669                         ol2e = pl2e[i];
4670                         if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) &&
4671                              !(l2e_get_flags(ol2e) & _PAGE_PSE) )
4672                             free_xen_pagetable(l2e_to_l1e(ol2e));
4673                     }
4674                     free_xen_pagetable(pl2e);
4675                 }
4676             }
4677 
4678             virt    += 1UL << L3_PAGETABLE_SHIFT;
4679             mfn     += 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4680             nr_mfns -= 1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT);
4681             continue;
4682         }
4683 
4684         if ( (l3e_get_flags(ol3e) & _PAGE_PRESENT) &&
4685              (l3e_get_flags(ol3e) & _PAGE_PSE) )
4686         {
4687             unsigned int flush_flags =
4688                 FLUSH_TLB | FLUSH_ORDER(2 * PAGETABLE_ORDER);
4689 
4690             /* Skip this PTE if there is no change. */
4691             if ( ((l3e_get_pfn(ol3e) & ~(L2_PAGETABLE_ENTRIES *
4692                                          L1_PAGETABLE_ENTRIES - 1)) +
4693                   (l2_table_offset(virt) << PAGETABLE_ORDER) +
4694                   l1_table_offset(virt) == mfn) &&
4695                  ((lNf_to_l1f(l3e_get_flags(ol3e)) ^ flags) &
4696                   ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0 )
4697             {
4698                 /* We can skip to the end of the L3 superpage if we got a match. */
4699                 i = (1u << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4700                     (mfn & ((1 << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4701                 if ( i > nr_mfns )
4702                     i = nr_mfns;
4703                 virt    += i << PAGE_SHIFT;
4704                 mfn     += i;
4705                 nr_mfns -= i;
4706                 continue;
4707             }
4708 
4709             pl2e = alloc_xen_pagetable();
4710             if ( pl2e == NULL )
4711                 return -ENOMEM;
4712 
4713             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
4714                 l2e_write(pl2e + i,
4715                           l2e_from_pfn(l3e_get_pfn(ol3e) +
4716                                        (i << PAGETABLE_ORDER),
4717                                        l3e_get_flags(ol3e)));
4718 
4719             if ( l3e_get_flags(ol3e) & _PAGE_GLOBAL )
4720                 flush_flags |= FLUSH_TLB_GLOBAL;
4721 
4722             if ( locking )
4723                 spin_lock(&map_pgdir_lock);
4724             if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
4725                  (l3e_get_flags(*pl3e) & _PAGE_PSE) )
4726             {
4727                 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
4728                                                     __PAGE_HYPERVISOR));
4729                 pl2e = NULL;
4730             }
4731             if ( locking )
4732                 spin_unlock(&map_pgdir_lock);
4733             flush_area(virt, flush_flags);
4734             if ( pl2e )
4735                 free_xen_pagetable(pl2e);
4736         }
4737 
4738         pl2e = virt_to_xen_l2e(virt);
4739         if ( !pl2e )
4740             return -ENOMEM;
4741 
4742         if ( ((((virt >> PAGE_SHIFT) | mfn) &
4743                ((1u << PAGETABLE_ORDER) - 1)) == 0) &&
4744              (nr_mfns >= (1u << PAGETABLE_ORDER)) &&
4745              !(flags & (_PAGE_PAT|MAP_SMALL_PAGES)) )
4746         {
4747             /* Super-page mapping. */
4748             ol2e = *pl2e;
4749             l2e_write_atomic(pl2e, l2e_from_pfn(mfn, l1f_to_lNf(flags)));
4750 
4751             if ( (l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4752             {
4753                 unsigned int flush_flags =
4754                     FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4755 
4756                 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4757                 {
4758                     flush_flags(lNf_to_l1f(l2e_get_flags(ol2e)));
4759                     flush_area(virt, flush_flags);
4760                 }
4761                 else
4762                 {
4763                     pl1e = l2e_to_l1e(ol2e);
4764                     for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4765                         flush_flags(l1e_get_flags(pl1e[i]));
4766                     flush_area(virt, flush_flags);
4767                     free_xen_pagetable(pl1e);
4768                 }
4769             }
4770 
4771             virt    += 1UL << L2_PAGETABLE_SHIFT;
4772             mfn     += 1UL << PAGETABLE_ORDER;
4773             nr_mfns -= 1UL << PAGETABLE_ORDER;
4774         }
4775         else
4776         {
4777             /* Normal page mapping. */
4778             if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
4779             {
4780                 pl1e = virt_to_xen_l1e(virt);
4781                 if ( pl1e == NULL )
4782                     return -ENOMEM;
4783             }
4784             else if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
4785             {
4786                 unsigned int flush_flags =
4787                     FLUSH_TLB | FLUSH_ORDER(PAGETABLE_ORDER);
4788 
4789                 /* Skip this PTE if there is no change. */
4790                 if ( (((l2e_get_pfn(*pl2e) & ~(L1_PAGETABLE_ENTRIES - 1)) +
4791                        l1_table_offset(virt)) == mfn) &&
4792                      (((lNf_to_l1f(l2e_get_flags(*pl2e)) ^ flags) &
4793                        ~(_PAGE_ACCESSED|_PAGE_DIRTY)) == 0) )
4794                 {
4795                     /* We can skip to the end of the L2 superpage if we got a match. */
4796                     i = (1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) -
4797                         (mfn & ((1u << (L2_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1));
4798                     if ( i > nr_mfns )
4799                         i = nr_mfns;
4800                     virt    += i << L1_PAGETABLE_SHIFT;
4801                     mfn     += i;
4802                     nr_mfns -= i;
4803                     goto check_l3;
4804                 }
4805 
4806                 pl1e = alloc_xen_pagetable();
4807                 if ( pl1e == NULL )
4808                     return -ENOMEM;
4809 
4810                 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
4811                     l1e_write(&pl1e[i],
4812                               l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
4813                                            lNf_to_l1f(l2e_get_flags(*pl2e))));
4814 
4815                 if ( l2e_get_flags(*pl2e) & _PAGE_GLOBAL )
4816                     flush_flags |= FLUSH_TLB_GLOBAL;
4817 
4818                 if ( locking )
4819                     spin_lock(&map_pgdir_lock);
4820                 if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
4821                      (l2e_get_flags(*pl2e) & _PAGE_PSE) )
4822                 {
4823                     l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
4824                                                         __PAGE_HYPERVISOR));
4825                     pl1e = NULL;
4826                 }
4827                 if ( locking )
4828                     spin_unlock(&map_pgdir_lock);
4829                 flush_area(virt, flush_flags);
4830                 if ( pl1e )
4831                     free_xen_pagetable(pl1e);
4832             }
4833 
4834             pl1e  = l2e_to_l1e(*pl2e) + l1_table_offset(virt);
4835             ol1e  = *pl1e;
4836             l1e_write_atomic(pl1e, l1e_from_pfn(mfn, flags));
4837             if ( (l1e_get_flags(ol1e) & _PAGE_PRESENT) )
4838             {
4839                 unsigned int flush_flags = FLUSH_TLB | FLUSH_ORDER(0);
4840 
4841                 flush_flags(l1e_get_flags(ol1e));
4842                 flush_area(virt, flush_flags);
4843             }
4844 
4845             virt    += 1UL << L1_PAGETABLE_SHIFT;
4846             mfn     += 1UL;
4847             nr_mfns -= 1UL;
4848 
4849             if ( (flags == PAGE_HYPERVISOR) &&
4850                  ((nr_mfns == 0) ||
4851                   ((((virt >> PAGE_SHIFT) | mfn) &
4852                     ((1u << PAGETABLE_ORDER) - 1)) == 0)) )
4853             {
4854                 unsigned long base_mfn;
4855 
4856                 if ( locking )
4857                     spin_lock(&map_pgdir_lock);
4858 
4859                 ol2e = *pl2e;
4860                 /*
4861                  * The L2E may already be cleared, or set to a superpage, by
4862                  * concurrent paging structure modifications on other CPUs.
4863                  */
4864                 if ( !(l2e_get_flags(ol2e) & _PAGE_PRESENT) )
4865                 {
4866                     if ( locking )
4867                         spin_unlock(&map_pgdir_lock);
4868                     continue;
4869                 }
4870 
4871                 if ( l2e_get_flags(ol2e) & _PAGE_PSE )
4872                 {
4873                     if ( locking )
4874                         spin_unlock(&map_pgdir_lock);
4875                     goto check_l3;
4876                 }
4877 
4878                 pl1e = l2e_to_l1e(ol2e);
4879                 base_mfn = l1e_get_pfn(*pl1e) & ~(L1_PAGETABLE_ENTRIES - 1);
4880                 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++, pl1e++ )
4881                     if ( (l1e_get_pfn(*pl1e) != (base_mfn + i)) ||
4882                          (l1e_get_flags(*pl1e) != flags) )
4883                         break;
4884                 if ( i == L1_PAGETABLE_ENTRIES )
4885                 {
4886                     l2e_write_atomic(pl2e, l2e_from_pfn(base_mfn,
4887                                                         l1f_to_lNf(flags)));
4888                     if ( locking )
4889                         spin_unlock(&map_pgdir_lock);
4890                     flush_area(virt - PAGE_SIZE,
4891                                FLUSH_TLB_GLOBAL |
4892                                FLUSH_ORDER(PAGETABLE_ORDER));
4893                     free_xen_pagetable(l2e_to_l1e(ol2e));
4894                 }
4895                 else if ( locking )
4896                     spin_unlock(&map_pgdir_lock);
4897             }
4898         }
4899 
4900  check_l3:
4901         if ( cpu_has_page1gb &&
4902              (flags == PAGE_HYPERVISOR) &&
4903              ((nr_mfns == 0) ||
4904               !(((virt >> PAGE_SHIFT) | mfn) &
4905                 ((1UL << (L3_PAGETABLE_SHIFT - PAGE_SHIFT)) - 1))) )
4906         {
4907             unsigned long base_mfn;
4908 
4909             if ( locking )
4910                 spin_lock(&map_pgdir_lock);
4911 
4912             ol3e = *pl3e;
4913             /*
4914              * The L3E may already be cleared, or set to a superpage, by
4915              * concurrent paging structure modifications on other CPUs.
4916              */
4917             if ( !(l3e_get_flags(ol3e) & _PAGE_PRESENT) ||
4918                 (l3e_get_flags(ol3e) & _PAGE_PSE) )
4919             {
4920                 if ( locking )
4921                     spin_unlock(&map_pgdir_lock);
4922                 continue;
4923             }
4924 
4925             pl2e = l3e_to_l2e(ol3e);
4926             base_mfn = l2e_get_pfn(*pl2e) & ~(L2_PAGETABLE_ENTRIES *
4927                                               L1_PAGETABLE_ENTRIES - 1);
4928             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
4929                 if ( (l2e_get_pfn(*pl2e) !=
4930                       (base_mfn + (i << PAGETABLE_ORDER))) ||
4931                      (l2e_get_flags(*pl2e) != l1f_to_lNf(flags)) )
4932                     break;
4933             if ( i == L2_PAGETABLE_ENTRIES )
4934             {
4935                 l3e_write_atomic(pl3e, l3e_from_pfn(base_mfn,
4936                                                     l1f_to_lNf(flags)));
4937                 if ( locking )
4938                     spin_unlock(&map_pgdir_lock);
4939                 flush_area(virt - PAGE_SIZE,
4940                            FLUSH_TLB_GLOBAL |
4941                            FLUSH_ORDER(2 * PAGETABLE_ORDER));
4942                 free_xen_pagetable(l3e_to_l2e(ol3e));
4943             }
4944             else if ( locking )
4945                 spin_unlock(&map_pgdir_lock);
4946         }
4947     }
4948 
4949 #undef flush_flags
4950 
4951     return 0;
4952 }
4953 
4954 int populate_pt_range(unsigned long virt, unsigned long mfn,
4955                       unsigned long nr_mfns)
4956 {
4957     return map_pages_to_xen(virt, mfn, nr_mfns, MAP_SMALL_PAGES);
4958 }
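/*
 * Editorial note: MAP_SMALL_PAGES carries no _PAGE_PRESENT bit, so the
 * call above makes map_pages_to_xen() allocate intermediate page tables
 * down to L1 and write non-present L1 entries, i.e. it populates the
 * paging structures for the range without creating usable mappings.
 */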
4959 
4960 /*
4961  * Alter the permissions of a range of Xen virtual address space.
4962  *
4963  * Does not create new mappings, and does not modify the mfn in existing
4964  * mappings, but will shatter superpages if necessary, and will destroy
4965  * mappings if not passed _PAGE_PRESENT.
4966  *
4967  * The only flags considered are NX, RW and PRESENT.  All other input flags
4968  * are ignored.
4969  *
4970  * It is an error to call with present flags over an unpopulated range.
4971  */
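/*
 * Editorial example of the interface described above: passing
 * _PAGE_NONE (as destroy_xen_mappings() does below) removes the
 * mappings entirely, whereas e.g.
 *
 *   modify_xen_mappings(s, e, _PAGE_PRESENT);
 *
 * keeps the range mapped but clears RW and NX on every entry
 * (read-only, executable), shattering superpages where necessary.
 */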
4972 int modify_xen_mappings(unsigned long s, unsigned long e, unsigned int nf)
4973 {
4974     bool locking = system_state > SYS_STATE_boot;
4975     l2_pgentry_t *pl2e;
4976     l1_pgentry_t *pl1e;
4977     unsigned int  i;
4978     unsigned long v = s;
4979 
4980     /* Set of valid PTE bits which may be altered. */
4981 #define FLAGS_MASK (_PAGE_NX|_PAGE_RW|_PAGE_PRESENT)
4982     nf &= FLAGS_MASK;
4983 
4984     ASSERT(IS_ALIGNED(s, PAGE_SIZE));
4985     ASSERT(IS_ALIGNED(e, PAGE_SIZE));
4986 
4987     while ( v < e )
4988     {
4989         l3_pgentry_t *pl3e = virt_to_xen_l3e(v);
4990 
4991         if ( !pl3e || !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) )
4992         {
4993             /* Confirm the caller isn't trying to create new mappings. */
4994             ASSERT(!(nf & _PAGE_PRESENT));
4995 
4996             v += 1UL << L3_PAGETABLE_SHIFT;
4997             v &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
4998             continue;
4999         }
5000 
5001         if ( l3e_get_flags(*pl3e) & _PAGE_PSE )
5002         {
5003             if ( l2_table_offset(v) == 0 &&
5004                  l1_table_offset(v) == 0 &&
5005                  ((e - v) >= (1UL << L3_PAGETABLE_SHIFT)) )
5006             {
5007                 /* PAGE1GB: whole superpage is modified. */
5008                 l3_pgentry_t nl3e = !(nf & _PAGE_PRESENT) ? l3e_empty()
5009                     : l3e_from_pfn(l3e_get_pfn(*pl3e),
5010                                    (l3e_get_flags(*pl3e) & ~FLAGS_MASK) | nf);
5011 
5012                 l3e_write_atomic(pl3e, nl3e);
5013                 v += 1UL << L3_PAGETABLE_SHIFT;
5014                 continue;
5015             }
5016 
5017             /* PAGE1GB: shatter the superpage and fall through. */
5018             pl2e = alloc_xen_pagetable();
5019             if ( !pl2e )
5020                 return -ENOMEM;
5021             for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5022                 l2e_write(pl2e + i,
5023                           l2e_from_pfn(l3e_get_pfn(*pl3e) +
5024                                        (i << PAGETABLE_ORDER),
5025                                        l3e_get_flags(*pl3e)));
5026             if ( locking )
5027                 spin_lock(&map_pgdir_lock);
5028             if ( (l3e_get_flags(*pl3e) & _PAGE_PRESENT) &&
5029                  (l3e_get_flags(*pl3e) & _PAGE_PSE) )
5030             {
5031                 l3e_write_atomic(pl3e, l3e_from_pfn(virt_to_mfn(pl2e),
5032                                                     __PAGE_HYPERVISOR));
5033                 pl2e = NULL;
5034             }
5035             if ( locking )
5036                 spin_unlock(&map_pgdir_lock);
5037             if ( pl2e )
5038                 free_xen_pagetable(pl2e);
5039         }
5040 
5041         /*
5042          * The L3 entry has been verified to be present, and we've dealt with
5043          * 1G pages as well, so the L2 table cannot require allocation.
5044          */
5045         pl2e = l3e_to_l2e(*pl3e) + l2_table_offset(v);
5046 
5047         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5048         {
5049             /* Confirm the caller isn't trying to create new mappings. */
5050             ASSERT(!(nf & _PAGE_PRESENT));
5051 
5052             v += 1UL << L2_PAGETABLE_SHIFT;
5053             v &= ~((1UL << L2_PAGETABLE_SHIFT) - 1);
5054             continue;
5055         }
5056 
5057         if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5058         {
5059             if ( (l1_table_offset(v) == 0) &&
5060                  ((e - v) >= (1UL << L2_PAGETABLE_SHIFT)) )
5061             {
5062                 /* PSE: whole superpage is modified. */
5063                 l2_pgentry_t nl2e = !(nf & _PAGE_PRESENT) ? l2e_empty()
5064                     : l2e_from_pfn(l2e_get_pfn(*pl2e),
5065                                    (l2e_get_flags(*pl2e) & ~FLAGS_MASK) | nf);
5066 
5067                 l2e_write_atomic(pl2e, nl2e);
5068                 v += 1UL << L2_PAGETABLE_SHIFT;
5069             }
5070             else
5071             {
5072                 /* PSE: shatter the superpage and try again. */
5073                 pl1e = alloc_xen_pagetable();
5074                 if ( !pl1e )
5075                     return -ENOMEM;
5076                 for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5077                     l1e_write(&pl1e[i],
5078                               l1e_from_pfn(l2e_get_pfn(*pl2e) + i,
5079                                            l2e_get_flags(*pl2e) & ~_PAGE_PSE));
5080                 if ( locking )
5081                     spin_lock(&map_pgdir_lock);
5082                 if ( (l2e_get_flags(*pl2e) & _PAGE_PRESENT) &&
5083                      (l2e_get_flags(*pl2e) & _PAGE_PSE) )
5084                 {
5085                     l2e_write_atomic(pl2e, l2e_from_pfn(virt_to_mfn(pl1e),
5086                                                         __PAGE_HYPERVISOR));
5087                     pl1e = NULL;
5088                 }
5089                 if ( locking )
5090                     spin_unlock(&map_pgdir_lock);
5091                 if ( pl1e )
5092                     free_xen_pagetable(pl1e);
5093             }
5094         }
5095         else
5096         {
5097             l1_pgentry_t nl1e;
5098 
5099             /*
5100              * Ordinary 4kB mapping: The L2 entry has been verified to be
5101              * present, and we've dealt with 2M pages as well, so the L1 table
5102              * cannot require allocation.
5103              */
5104             pl1e = l2e_to_l1e(*pl2e) + l1_table_offset(v);
5105 
5106             /* Confirm the caller isn't trying to create new mappings. */
5107             if ( !(l1e_get_flags(*pl1e) & _PAGE_PRESENT) )
5108                 ASSERT(!(nf & _PAGE_PRESENT));
5109 
5110             nl1e = !(nf & _PAGE_PRESENT) ? l1e_empty()
5111                 : l1e_from_pfn(l1e_get_pfn(*pl1e),
5112                                (l1e_get_flags(*pl1e) & ~FLAGS_MASK) | nf);
5113 
5114             l1e_write_atomic(pl1e, nl1e);
5115             v += PAGE_SIZE;
5116 
5117             /*
5118              * If we are not destroying mappings, or not done with the L2E,
5119              * skip the empty&free check.
5120              */
5121             if ( (nf & _PAGE_PRESENT) || ((v != e) && (l1_table_offset(v) != 0)) )
5122                 continue;
5123             if ( locking )
5124                 spin_lock(&map_pgdir_lock);
5125 
5126             /*
5127              * The L2E may already be cleared, or set to a superpage, by
5128              * concurrent paging structure modifications on other CPUs.
5129              */
5130             if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5131             {
5132                 if ( locking )
5133                     spin_unlock(&map_pgdir_lock);
5134                 goto check_l3;
5135             }
5136 
5137             if ( l2e_get_flags(*pl2e) & _PAGE_PSE )
5138             {
5139                 if ( locking )
5140                     spin_unlock(&map_pgdir_lock);
5141                 continue;
5142             }
5143 
5144             pl1e = l2e_to_l1e(*pl2e);
5145             for ( i = 0; i < L1_PAGETABLE_ENTRIES; i++ )
5146                 if ( l1e_get_intpte(pl1e[i]) != 0 )
5147                     break;
5148             if ( i == L1_PAGETABLE_ENTRIES )
5149             {
5150                 /* Empty: zap the L2E and free the L1 page. */
5151                 l2e_write_atomic(pl2e, l2e_empty());
5152                 if ( locking )
5153                     spin_unlock(&map_pgdir_lock);
5154                 flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5155                 free_xen_pagetable(pl1e);
5156             }
5157             else if ( locking )
5158                 spin_unlock(&map_pgdir_lock);
5159         }
5160 
5161  check_l3:
5162         /*
5163          * If we are not destroying mappings, or not done with the L3E,
5164          * skip the empty&free check.
5165          */
5166         if ( (nf & _PAGE_PRESENT) ||
5167              ((v != e) && (l2_table_offset(v) + l1_table_offset(v) != 0)) )
5168             continue;
5169         if ( locking )
5170             spin_lock(&map_pgdir_lock);
5171 
5172         /*
5173          * The L3E may already be cleared, or set to a superpage, by
5174          * concurrent paging structure modifications on other CPUs.
5175          */
5176         if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
5177               (l3e_get_flags(*pl3e) & _PAGE_PSE) )
5178         {
5179             if ( locking )
5180                 spin_unlock(&map_pgdir_lock);
5181             continue;
5182         }
5183 
5184         pl2e = l3e_to_l2e(*pl3e);
5185         for ( i = 0; i < L2_PAGETABLE_ENTRIES; i++ )
5186             if ( l2e_get_intpte(pl2e[i]) != 0 )
5187                 break;
5188         if ( i == L2_PAGETABLE_ENTRIES )
5189         {
5190             /* Empty: zap the L3E and free the L2 page. */
5191             l3e_write_atomic(pl3e, l3e_empty());
5192             if ( locking )
5193                 spin_unlock(&map_pgdir_lock);
5194             flush_area(NULL, FLUSH_TLB_GLOBAL); /* flush before free */
5195             free_xen_pagetable(pl2e);
5196         }
5197         else if ( locking )
5198             spin_unlock(&map_pgdir_lock);
5199     }
5200 
5201     flush_area(NULL, FLUSH_TLB_GLOBAL);
5202 
5203 #undef FLAGS_MASK
5204     return 0;
5205 }
5206 
5207 #undef flush_area
5208 
5209 int destroy_xen_mappings(unsigned long s, unsigned long e)
5210 {
5211     return modify_xen_mappings(s, e, _PAGE_NONE);
5212 }
5213 
5214 void __set_fixmap(
5215     enum fixed_addresses idx, unsigned long mfn, unsigned long flags)
5216 {
5217     BUG_ON(idx >= __end_of_fixed_addresses);
5218     map_pages_to_xen(__fix_to_virt(idx), mfn, 1, flags);
5219 }
5220 
5221 void *__init arch_vmap_virt_end(void)
5222 {
5223     return fix_to_virt(__end_of_fixed_addresses);
5224 }
5225 
5226 void __iomem *ioremap(paddr_t pa, size_t len)
5227 {
5228     mfn_t mfn = _mfn(PFN_DOWN(pa));
5229     void *va;
5230 
5231     WARN_ON(page_is_ram_type(mfn_x(mfn), RAM_TYPE_CONVENTIONAL));
5232 
5233     /* The first MB is always mapped. */
5234     if ( !((pa + len - 1) >> 20) )
5235         va = __va(pa);
5236     else
5237     {
5238         unsigned int offs = pa & (PAGE_SIZE - 1);
5239         unsigned int nr = PFN_UP(offs + len);
5240 
5241         va = __vmap(&mfn, nr, 1, 1, PAGE_HYPERVISOR_UCMINUS, VMAP_DEFAULT) + offs;
5242     }
5243 
5244     return (void __force __iomem *)va;
5245 }
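/*
 * Illustrative usage (the MMIO address below is hypothetical): map a
 * device register page with the UC- attribute used above and access it
 * through the returned pointer, e.g.
 *
 *   void __iomem *regs = ioremap(0xfed00000, PAGE_SIZE);
 *   ...
 *   iounmap(regs);
 */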
5246 
5247 int create_perdomain_mapping(struct domain *d, unsigned long va,
5248                              unsigned int nr, l1_pgentry_t **pl1tab,
5249                              struct page_info **ppg)
5250 {
5251     struct page_info *pg;
5252     l3_pgentry_t *l3tab;
5253     l2_pgentry_t *l2tab;
5254     l1_pgentry_t *l1tab;
5255     int rc = 0;
5256 
5257     ASSERT(va >= PERDOMAIN_VIRT_START &&
5258            va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS));
5259 
5260     if ( !d->arch.perdomain_l3_pg )
5261     {
5262         pg = alloc_domheap_page(d, MEMF_no_owner);
5263         if ( !pg )
5264             return -ENOMEM;
5265         l3tab = __map_domain_page(pg);
5266         clear_page(l3tab);
5267         d->arch.perdomain_l3_pg = pg;
5268         if ( !nr )
5269         {
5270             unmap_domain_page(l3tab);
5271             return 0;
5272         }
5273     }
5274     else if ( !nr )
5275         return 0;
5276     else
5277         l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5278 
5279     ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1)));
5280 
5281     if ( !(l3e_get_flags(l3tab[l3_table_offset(va)]) & _PAGE_PRESENT) )
5282     {
5283         pg = alloc_domheap_page(d, MEMF_no_owner);
5284         if ( !pg )
5285         {
5286             unmap_domain_page(l3tab);
5287             return -ENOMEM;
5288         }
5289         l2tab = __map_domain_page(pg);
5290         clear_page(l2tab);
5291         l3tab[l3_table_offset(va)] = l3e_from_page(pg, __PAGE_HYPERVISOR_RW);
5292     }
5293     else
5294         l2tab = map_l2t_from_l3e(l3tab[l3_table_offset(va)]);
5295 
5296     unmap_domain_page(l3tab);
5297 
5298     if ( !pl1tab && !ppg )
5299     {
5300         unmap_domain_page(l2tab);
5301         return 0;
5302     }
5303 
5304     for ( l1tab = NULL; !rc && nr--; )
5305     {
5306         l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
5307 
5308         if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) )
5309         {
5310             if ( pl1tab && !IS_NIL(pl1tab) )
5311             {
5312                 l1tab = alloc_xenheap_pages(0, MEMF_node(domain_to_node(d)));
5313                 if ( !l1tab )
5314                 {
5315                     rc = -ENOMEM;
5316                     break;
5317                 }
5318                 ASSERT(!pl1tab[l2_table_offset(va)]);
5319                 pl1tab[l2_table_offset(va)] = l1tab;
5320                 pg = virt_to_page(l1tab);
5321             }
5322             else
5323             {
5324                 pg = alloc_domheap_page(d, MEMF_no_owner);
5325                 if ( !pg )
5326                 {
5327                     rc = -ENOMEM;
5328                     break;
5329                 }
5330                 l1tab = __map_domain_page(pg);
5331             }
5332             clear_page(l1tab);
5333             *pl2e = l2e_from_page(pg, __PAGE_HYPERVISOR_RW);
5334         }
5335         else if ( !l1tab )
5336             l1tab = map_l1t_from_l2e(*pl2e);
5337 
5338         if ( ppg &&
5339              !(l1e_get_flags(l1tab[l1_table_offset(va)]) & _PAGE_PRESENT) )
5340         {
5341             pg = alloc_domheap_page(d, MEMF_no_owner);
5342             if ( pg )
5343             {
5344                 clear_domain_page(page_to_mfn(pg));
5345                 if ( !IS_NIL(ppg) )
5346                     *ppg++ = pg;
5347                 l1tab[l1_table_offset(va)] =
5348                     l1e_from_page(pg, __PAGE_HYPERVISOR_RW | _PAGE_AVAIL0);
5349                 l2e_add_flags(*pl2e, _PAGE_AVAIL0);
5350             }
5351             else
5352                 rc = -ENOMEM;
5353         }
5354 
5355         va += PAGE_SIZE;
5356         if ( rc || !nr || !l1_table_offset(va) )
5357         {
5358             /* Note that this is a no-op for the alloc_xenheap_page() case. */
5359             unmap_domain_page(l1tab);
5360             l1tab = NULL;
5361         }
5362     }
5363 
5364     ASSERT(!l1tab);
5365     unmap_domain_page(l2tab);
5366 
5367     return rc;
5368 }
5369 
5370 void destroy_perdomain_mapping(struct domain *d, unsigned long va,
5371                                unsigned int nr)
5372 {
5373     const l3_pgentry_t *l3tab, *pl3e;
5374 
5375     ASSERT(va >= PERDOMAIN_VIRT_START &&
5376            va < PERDOMAIN_VIRT_SLOT(PERDOMAIN_SLOTS));
5377     ASSERT(!l3_table_offset(va ^ (va + nr * PAGE_SIZE - 1)));
5378 
5379     if ( !d->arch.perdomain_l3_pg )
5380         return;
5381 
5382     l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5383     pl3e = l3tab + l3_table_offset(va);
5384 
5385     if ( l3e_get_flags(*pl3e) & _PAGE_PRESENT )
5386     {
5387         const l2_pgentry_t *l2tab = map_l2t_from_l3e(*pl3e);
5388         const l2_pgentry_t *pl2e = l2tab + l2_table_offset(va);
5389         unsigned int i = l1_table_offset(va);
5390 
5391         while ( nr )
5392         {
5393             if ( l2e_get_flags(*pl2e) & _PAGE_PRESENT )
5394             {
5395                 l1_pgentry_t *l1tab = map_l1t_from_l2e(*pl2e);
5396 
5397                 for ( ; nr && i < L1_PAGETABLE_ENTRIES; --nr, ++i )
5398                 {
5399                     if ( (l1e_get_flags(l1tab[i]) &
5400                           (_PAGE_PRESENT | _PAGE_AVAIL0)) ==
5401                          (_PAGE_PRESENT | _PAGE_AVAIL0) )
5402                         free_domheap_page(l1e_get_page(l1tab[i]));
5403                     l1tab[i] = l1e_empty();
5404                 }
5405 
5406                 unmap_domain_page(l1tab);
5407             }
5408             else if ( nr + i < L1_PAGETABLE_ENTRIES )
5409                 break;
5410             else
5411                 nr -= L1_PAGETABLE_ENTRIES - i;
5412 
5413             ++pl2e;
5414             i = 0;
5415         }
5416 
5417         unmap_domain_page(l2tab);
5418     }
5419 
5420     unmap_domain_page(l3tab);
5421 }
5422 
5423 void free_perdomain_mappings(struct domain *d)
5424 {
5425     l3_pgentry_t *l3tab;
5426     unsigned int i;
5427 
5428     if ( !d->arch.perdomain_l3_pg )
5429         return;
5430 
5431     l3tab = __map_domain_page(d->arch.perdomain_l3_pg);
5432 
5433     for ( i = 0; i < PERDOMAIN_SLOTS; ++i )
5434         if ( l3e_get_flags(l3tab[i]) & _PAGE_PRESENT )
5435         {
5436             struct page_info *l2pg = l3e_get_page(l3tab[i]);
5437             l2_pgentry_t *l2tab = __map_domain_page(l2pg);
5438             unsigned int j;
5439 
5440             for ( j = 0; j < L2_PAGETABLE_ENTRIES; ++j )
5441                 if ( l2e_get_flags(l2tab[j]) & _PAGE_PRESENT )
5442                 {
5443                     struct page_info *l1pg = l2e_get_page(l2tab[j]);
5444 
5445                     if ( l2e_get_flags(l2tab[j]) & _PAGE_AVAIL0 )
5446                     {
5447                         l1_pgentry_t *l1tab = __map_domain_page(l1pg);
5448                         unsigned int k;
5449 
5450                         for ( k = 0; k < L1_PAGETABLE_ENTRIES; ++k )
5451                             if ( (l1e_get_flags(l1tab[k]) &
5452                                   (_PAGE_PRESENT | _PAGE_AVAIL0)) ==
5453                                  (_PAGE_PRESENT | _PAGE_AVAIL0) )
5454                                 free_domheap_page(l1e_get_page(l1tab[k]));
5455 
5456                         unmap_domain_page(l1tab);
5457                     }
5458 
5459                     if ( is_xen_heap_page(l1pg) )
5460                         free_xenheap_page(page_to_virt(l1pg));
5461                     else
5462                         free_domheap_page(l1pg);
5463                 }
5464 
5465             unmap_domain_page(l2tab);
5466             free_domheap_page(l2pg);
5467         }
5468 
5469     unmap_domain_page(l3tab);
5470     free_domheap_page(d->arch.perdomain_l3_pg);
5471     d->arch.perdomain_l3_pg = NULL;
5472 }
5473 
5474 #ifdef MEMORY_GUARD
5475 
5476 static void __memguard_change_range(void *p, unsigned long l, int guard)
5477 {
5478     unsigned long _p = (unsigned long)p;
5479     unsigned long _l = (unsigned long)l;
5480     unsigned int flags = __PAGE_HYPERVISOR_RW | MAP_SMALL_PAGES;
5481 
5482     /* Ensure we are dealing with a page-aligned whole number of pages. */
5483     ASSERT(IS_ALIGNED(_p, PAGE_SIZE));
5484     ASSERT(IS_ALIGNED(_l, PAGE_SIZE));
5485 
5486     if ( guard )
5487         flags &= ~_PAGE_PRESENT;
5488 
5489     map_pages_to_xen(
5490         _p, virt_to_maddr(p) >> PAGE_SHIFT, _l >> PAGE_SHIFT, flags);
5491 }
5492 
5493 void memguard_guard_range(void *p, unsigned long l)
5494 {
5495     __memguard_change_range(p, l, 1);
5496 }
5497 
5498 void memguard_unguard_range(void *p, unsigned long l)
5499 {
5500     __memguard_change_range(p, l, 0);
5501 }
5502 
5503 #endif
5504 
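/*
 * Editorial note on the two helpers below: the arithmetic
 * p + STACK_SIZE - PRIMARY_STACK_SIZE - PAGE_SIZE selects the single
 * page immediately beneath the primary stack at the top of the stack
 * area, so a primary-stack overrun faults on this guard page instead
 * of silently corrupting the memory below it.
 */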
5505 void memguard_guard_stack(void *p)
5506 {
5507     BUILD_BUG_ON((PRIMARY_STACK_SIZE + PAGE_SIZE) > STACK_SIZE);
5508     p = (void *)((unsigned long)p + STACK_SIZE -
5509                  PRIMARY_STACK_SIZE - PAGE_SIZE);
5510     memguard_guard_range(p, PAGE_SIZE);
5511 }
5512 
5513 void memguard_unguard_stack(void *p)
5514 {
5515     p = (void *)((unsigned long)p + STACK_SIZE -
5516                  PRIMARY_STACK_SIZE - PAGE_SIZE);
5517     memguard_unguard_range(p, PAGE_SIZE);
5518 }
5519 
5520 void arch_dump_shared_mem_info(void)
5521 {
5522     printk("Shared frames %u -- Saved frames %u\n",
5523             mem_sharing_get_nr_shared_mfns(),
5524             mem_sharing_get_nr_saved_mfns());
5525 }
5526 
5527 const unsigned long *__init get_platform_badpages(unsigned int *array_size)
5528 {
5529     u32 igd_id;
5530     static unsigned long __initdata bad_pages[] = {
5531         0x20050000,
5532         0x20110000,
5533         0x20130000,
5534         0x20138000,
5535         0x40004000,
5536     };
5537 
5538     *array_size = ARRAY_SIZE(bad_pages);
5539     igd_id = pci_conf_read32(0, 0, 2, 0, 0);
5540     if ( !IS_SNB_GFX(igd_id) )
5541         return NULL;
5542 
5543     return bad_pages;
5544 }
5545 
5546 void paging_invlpg(struct vcpu *v, unsigned long va)
5547 {
5548     if ( !is_canonical_address(va) )
5549         return;
5550 
5551     if ( paging_mode_enabled(v->domain) &&
5552          !paging_get_hostmode(v)->invlpg(v, va) )
5553         return;
5554 
5555     if ( is_pv_vcpu(v) )
5556         flush_tlb_one_local(va);
5557     else
5558         hvm_funcs.invlpg(v, va);
5559 }
5560 
5561 /* Build a 32-bit PSE page table using 4MB pages. */
5562 void write_32bit_pse_identmap(uint32_t *l2)
5563 {
5564     unsigned int i;
5565 
5566     for ( i = 0; i < PAGE_SIZE / sizeof(*l2); i++ )
5567         l2[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
5568                  _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
5569 }
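/*
 * Illustrative result (derived from the loop above): entry i identity-
 * maps the 4MB region starting at (i << 22), so e.g. l2[1] covers
 * 0x400000-0x7fffff with a user/writable, accessed/dirty 4MB page.
 */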
5570 
5571 unsigned long get_upper_mfn_bound(void)
5572 {
5573     unsigned long max_mfn;
5574 
5575     max_mfn = mem_hotplug ? PFN_DOWN(mem_hotplug) : max_page;
5576 #ifndef CONFIG_BIGMEM
5577     max_mfn = min(max_mfn, 1UL << 32);
5578 #endif
5579     return min(max_mfn, 1UL << (paddr_bits - PAGE_SHIFT)) - 1;
5580 }
5581 
5582 /*
5583  * Local variables:
5584  * mode: C
5585  * c-file-style: "BSD"
5586  * c-basic-offset: 4
5587  * tab-width: 4
5588  * indent-tabs-mode: nil
5589  * End:
5590  */
5591