/******************************************************************************
 * arch/x86/mm/p2m-pod.c
 *
 * Populate-on-demand p2m entries.
 *
 * Copyright (c) 2009-2011 Citrix Systems, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/domain_page.h>
#include <xen/event.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/trace.h>
#include <asm/page.h>
#include <asm/paging.h>
#include <asm/p2m.h>

#include "mm-locks.h"

/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
#undef page_to_mfn
#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))

#define superpage_aligned(_x)  (((_x)&(SUPERPAGE_PAGES-1))==0)

/* Enforce lock ordering when grabbing the "external" page_alloc lock */
static inline void lock_page_alloc(struct p2m_domain *p2m)
{
    page_alloc_mm_pre_lock();
    spin_lock(&(p2m->domain->page_alloc_lock));
    page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level);
}

static inline void unlock_page_alloc(struct p2m_domain *p2m)
{
    page_alloc_mm_unlock(p2m->domain->arch.page_alloc_unlock_level);
    spin_unlock(&(p2m->domain->page_alloc_lock));
}

/*
 * Populate-on-demand functionality
 */

static int
p2m_pod_cache_add(struct p2m_domain *p2m,
                  struct page_info *page,
                  unsigned int order)
{
    unsigned long i;
    struct page_info *p;
    struct domain *d = p2m->domain;

#ifndef NDEBUG
    mfn_t mfn;

    mfn = page_to_mfn(page);

    /* Check to make sure this is a contiguous region */
    if ( mfn_x(mfn) & ((1UL << order) - 1) )
    {
        printk("%s: mfn %lx not aligned order %u! (mask %lx)\n",
               __func__, mfn_x(mfn), order, ((1UL << order) - 1));
        return -1;
    }

    for ( i = 0; i < 1UL << order ; i++)
    {
        struct domain * od;

        p = mfn_to_page(_mfn(mfn_x(mfn) + i));
        od = page_get_owner(p);
        if ( od != d )
        {
            printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
                   __func__, mfn_x(mfn), d->domain_id,
                   od ? od->domain_id : -1);
            return -1;
        }
    }
#endif

    ASSERT(pod_locked_by_me(p2m));

    /*
     * Pages from domain_alloc and returned by the balloon driver aren't
     * guaranteed to be zero; but by reclaiming zero pages, we implicitly
     * promise to provide zero pages.  So we scrub pages before using.
     */
    for ( i = 0; i < (1UL << order); i++ )
        clear_domain_page(mfn_add(page_to_mfn(page), i));

    /* First, take all pages off the domain list */
    lock_page_alloc(p2m);
    for ( i = 0; i < 1UL << order ; i++ )
    {
        p = page + i;
        page_list_del(p, &d->page_list);
    }

    unlock_page_alloc(p2m);

    /* Then add to the appropriate populate-on-demand list. */
    switch ( order )
    {
    case PAGE_ORDER_1G:
        for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
            page_list_add_tail(page + i, &p2m->pod.super);
        break;
    case PAGE_ORDER_2M:
        page_list_add_tail(page, &p2m->pod.super);
        break;
    case PAGE_ORDER_4K:
        page_list_add_tail(page, &p2m->pod.single);
        break;
    default:
        BUG();
    }
    p2m->pod.count += 1UL << order;

    return 0;
}

/* Get a page of size order from the populate-on-demand cache.  Will break
 * down 2-meg pages into singleton pages automatically.  Returns null if
 * a superpage is requested and no superpages are available.
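 *
 * The caller must hold the PoD lock (enforced by the ASSERT below); pages
 * handed out are put back on the domain's page_list before being returned.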
 */
static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
                                            unsigned int order)
{
    struct page_info *p = NULL;
    unsigned long i;

    ASSERT(pod_locked_by_me(p2m));

    if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
    {
        return NULL;
    }
    else if ( order == PAGE_ORDER_4K && page_list_empty(&p2m->pod.single) )
    {
        unsigned long mfn;
        struct page_info *q;

        BUG_ON( page_list_empty(&p2m->pod.super) );

        /*
         * Break up a superpage to make single pages.  NB count doesn't
         * need to be adjusted.
         */
        p = page_list_remove_head(&p2m->pod.super);
        mfn = mfn_x(page_to_mfn(p));

        for ( i = 0; i < SUPERPAGE_PAGES; i++ )
        {
            q = mfn_to_page(_mfn(mfn+i));
            page_list_add_tail(q, &p2m->pod.single);
        }
    }

    switch ( order )
    {
    case PAGE_ORDER_2M:
        BUG_ON( page_list_empty(&p2m->pod.super) );
        p = page_list_remove_head(&p2m->pod.super);
        p2m->pod.count -= 1UL << order;
        break;
    case PAGE_ORDER_4K:
        BUG_ON( page_list_empty(&p2m->pod.single) );
        p = page_list_remove_head(&p2m->pod.single);
        p2m->pod.count -= 1UL;
        break;
    default:
        BUG();
    }

    /* Put the pages back on the domain page_list */
    lock_page_alloc(p2m);
    for ( i = 0 ; i < (1UL << order); i++ )
    {
        BUG_ON(page_get_owner(p + i) != p2m->domain);
        page_list_add_tail(p + i, &p2m->domain->page_list);
    }
    unlock_page_alloc(p2m);

    return p;
}

/* Set the size of the cache, allocating or freeing as necessary. */
static int
p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
{
    struct domain *d = p2m->domain;
    int ret = 0;

    ASSERT(pod_locked_by_me(p2m));

    /* Increasing the target */
    while ( pod_target > p2m->pod.count )
    {
        struct page_info * page;
        int order;

        if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
            order = PAGE_ORDER_2M;
        else
            order = PAGE_ORDER_4K;
    retry:
        page = alloc_domheap_pages(d, order, 0);
        if ( unlikely(page == NULL) )
        {
            if ( order == PAGE_ORDER_2M )
            {
                /* If we can't allocate a superpage, try singleton pages */
                order = PAGE_ORDER_4K;
                goto retry;
            }

            printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n",
                   __func__, pod_target, p2m->pod.count);
            ret = -ENOMEM;
            goto out;
        }

        p2m_pod_cache_add(p2m, page, order);

        if ( preemptible && pod_target != p2m->pod.count &&
             hypercall_preempt_check() )
        {
            ret = -ERESTART;
            goto out;
        }
    }

    /* Decreasing the target */
    /*
     * We hold the pod lock here, so we don't need to worry about
     * cache disappearing under our feet.
     */
    while ( pod_target < p2m->pod.count )
    {
        struct page_info * page;
        unsigned int order;
        unsigned long i;

        if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
             && !page_list_empty(&p2m->pod.super) )
            order = PAGE_ORDER_2M;
        else
            order = PAGE_ORDER_4K;

        page = p2m_pod_cache_get(p2m, order);

        ASSERT(page != NULL);

        /* Then free them */
        for ( i = 0 ; i < (1UL << order) ; i++ )
        {
            /* Copied from common/memory.c:guest_remove_page() */
            if ( unlikely(!get_page(page + i, d)) )
            {
                gdprintk(XENLOG_INFO, "Bad page free for domain %u\n",
                         d->domain_id);
                ret = -EINVAL;
                goto out;
            }

            if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
                put_page_and_type(page + i);

            if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
                put_page(page + i);

            put_page(page + i);

            if ( preemptible && pod_target != p2m->pod.count &&
                 hypercall_preempt_check() )
            {
                ret = -ERESTART;
                goto out;
            }
        }
    }

out:
    return ret;
}

/*
 * The "right behavior" here requires some careful thought.  First, some
 * definitions:
 * + M: static_max
 * + B: number of pages the balloon driver has ballooned down to.
 * + P: Number of populated pages.
 * + T: Old target
 * + T': New target
 *
 * The following equations should hold:
 *  0 <= P <= T <= B <= M
 *  d->arch.p2m->pod.entry_count == B - P
 *  d->tot_pages == P + d->arch.p2m->pod.count
 *
 * Now we have the following potential cases to cover:
 *     B < T': Set the PoD cache size equal to the number of outstanding
 *             PoD entries; the balloon driver will deflate the balloon to
 *             supply the rest.
 *     T' < B: Don't reduce the cache size; let the balloon driver take
 *             care of it.
 */
int
p2m_pod_set_mem_target(struct domain *d, unsigned long target)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    int ret = 0;
    unsigned long populated, pod_target;

    pod_lock(p2m);

    /* P == B: Nothing to do (unless the guest is being created). */
    populated = d->tot_pages - p2m->pod.count;
    if ( populated > 0 && p2m->pod.entry_count == 0 )
        goto out;

    /* Don't do anything if the domain is being torn down */
    if ( d->is_dying )
        goto out;

    /*
     * T' < B: Don't reduce the cache size; let the balloon driver
     * take care of it.
     */
    if ( target < d->tot_pages )
        goto out;

    pod_target = target - populated;

    /*
     * B < T': Set the cache size equal to # of outstanding entries,
     * let the balloon driver fill in the rest.
     */
    if ( populated > 0 && pod_target > p2m->pod.entry_count )
        pod_target = p2m->pod.entry_count;

    ASSERT( pod_target >= p2m->pod.count );

    ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);

out:
    pod_unlock(p2m);

    return ret;
}

int p2m_pod_empty_cache(struct domain *d)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    struct page_info *page;
    unsigned int i;

    /* After this barrier no new PoD activities can happen. */
    BUG_ON(!d->is_dying);
    spin_barrier(&p2m->pod.lock.lock);

    lock_page_alloc(p2m);

    while ( (page = page_list_remove_head(&p2m->pod.super)) )
    {
        for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
        {
            BUG_ON(page_get_owner(page + i) != d);
            page_list_add_tail(page + i, &d->page_list);
        }

        p2m->pod.count -= SUPERPAGE_PAGES;

        if ( hypercall_preempt_check() )
            goto out;
    }

    for ( i = 0; (page = page_list_remove_head(&p2m->pod.single)); ++i )
    {
        BUG_ON(page_get_owner(page) != d);
        page_list_add_tail(page, &d->page_list);

        p2m->pod.count -= 1;

        if ( i && !(i & 511) && hypercall_preempt_check() )
            goto out;
    }

    BUG_ON(p2m->pod.count != 0);

 out:
    unlock_page_alloc(p2m);

    return p2m->pod.count ? -ERESTART : 0;
}

int p2m_pod_offline_or_broken_hit(struct page_info *p)
{
    struct domain *d;
    struct p2m_domain *p2m;
    struct page_info *q, *tmp;
    unsigned long mfn, bmfn;

    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
        return 0;

    pod_lock(p2m);
    bmfn = mfn_x(page_to_mfn(p));
    page_list_for_each_safe(q, tmp, &p2m->pod.super)
    {
        mfn = mfn_x(page_to_mfn(q));
        if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
        {
            unsigned long i;

            page_list_del(q, &p2m->pod.super);
            for ( i = 0; i < SUPERPAGE_PAGES; i++)
            {
                q = mfn_to_page(_mfn(mfn + i));
                page_list_add_tail(q, &p2m->pod.single);
            }
            page_list_del(p, &p2m->pod.single);
            p2m->pod.count--;
            goto pod_hit;
        }
    }

    page_list_for_each_safe(q, tmp, &p2m->pod.single)
    {
        mfn = mfn_x(page_to_mfn(q));
        if ( mfn == bmfn )
        {
            page_list_del(p, &p2m->pod.single);
            p2m->pod.count--;
            goto pod_hit;
        }
    }

    pod_unlock(p2m);
    return 0;

pod_hit:
    lock_page_alloc(p2m);
    /* Insertion must be at list head (see iommu_populate_page_table()). */
    page_list_add(p, &d->arch.relmem_list);
    unlock_page_alloc(p2m);
    pod_unlock(p2m);
    return 1;
}

void p2m_pod_offline_or_broken_replace(struct page_info *p)
{
    struct domain *d;
    struct p2m_domain *p2m;
    nodeid_t node = phys_to_nid(page_to_maddr(p));

    if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
        return;

    free_domheap_page(p);

    p = alloc_domheap_page(d, MEMF_node(node));
    if ( unlikely(!p) )
        return;

    pod_lock(p2m);
    p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
    pod_unlock(p2m);
    return;
}

static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn);

/*
 * This function is needed for two reasons:
 * + To properly handle clearing of PoD entries
 * + To "steal back" memory being freed for the PoD cache, rather than
 *   releasing it.
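 *
 * Memory is only "stolen" while the number of outstanding PoD entries
 * exceeds the number of pages in the PoD cache (the steal_for_cache
 * condition below); otherwise freed pages are released as usual.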
 *
 * Once both of these functions have been completed, we can return and
 * allow decrease_reservation() to handle everything else.
 */
int
p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
{
    int ret = 0;
    unsigned long i, n;
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    bool_t steal_for_cache;
    long pod, nonpod, ram;

    gfn_lock(p2m, gfn, order);
    pod_lock(p2m);

    /*
     * If we don't have any outstanding PoD entries, let things take their
     * course.
     */
    if ( p2m->pod.entry_count == 0 )
        goto out_unlock;

    if ( unlikely(d->is_dying) )
        goto out_unlock;

    pod = nonpod = ram = 0;

    /* Figure out if we need to steal some freed memory for our cache */
    steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );

    for ( i = 0; i < (1UL << order); i += n )
    {
        p2m_access_t a;
        p2m_type_t t;
        unsigned int cur_order;

        p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
        n = 1UL << min(order, cur_order);
        if ( t == p2m_populate_on_demand )
            pod += n;
        else
        {
            nonpod += n;
            if ( p2m_is_ram(t) )
                ram += n;
        }
    }

    /* No populate-on-demand?  Don't need to steal anything?  Then we're done! */
    if ( !pod && !steal_for_cache )
        goto out_unlock;

    if ( !nonpod )
    {
        /*
         * All PoD: Mark the whole region invalid and tell caller
         * we're done.
         */
        if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
                           p2m->default_access) )
        {
            /*
             * If this fails, we can't tell how much of the range was changed.
             * Best to crash the domain unless we're sure a partial change is
             * impossible.
             */
            if ( order != 0 )
                domain_crash(d);
            goto out_unlock;
        }
        p2m->pod.entry_count -= 1UL << order;
        BUG_ON(p2m->pod.entry_count < 0);
        ret = 1;
        goto out_entry_check;
    }

    /*
     * Try to grab entire superpages if possible.  Since the common case is for
     * drivers to pass back singleton pages, see if we can take the whole page
     * back and mark the rest PoD.
     * No need to do this though if
     * - order >= SUPERPAGE_ORDER (the loop below will take care of this)
     * - not all of the pages were RAM (now knowing order < SUPERPAGE_ORDER)
     */
    if ( steal_for_cache && order < SUPERPAGE_ORDER && ram == (1UL << order) &&
         p2m_pod_zero_check_superpage(p2m, _gfn(gfn_x(gfn) &
                                                ~(SUPERPAGE_PAGES - 1))) )
    {
        pod = 1UL << order;
        ram = nonpod = 0;
        ASSERT(steal_for_cache == (p2m->pod.entry_count > p2m->pod.count));
    }

    /*
     * Process as long as:
     * + There are PoD entries to handle, or
     * + There is ram left, and we want to steal it
     */
    for ( i = 0;
          i < (1UL << order) && (pod > 0 || (steal_for_cache && ram > 0));
          i += n )
    {
        mfn_t mfn;
        p2m_type_t t;
        p2m_access_t a;
        unsigned int cur_order;

        mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
        if ( order < cur_order )
            cur_order = order;
        n = 1UL << cur_order;
        if ( t == p2m_populate_on_demand )
        {
            /* This shouldn't be able to fail */
            if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
                               p2m_invalid, p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unlock;
            }
            p2m->pod.entry_count -= n;
            BUG_ON(p2m->pod.entry_count < 0);
            pod -= n;
        }
        else if ( steal_for_cache && p2m_is_ram(t) )
        {
            /*
             * If we need less than 1 << cur_order, we may end up stealing
             * more memory here than we actually need.  This will be rectified
             * below, however; and stealing too much and then freeing what we
             * need may allow us to free smaller pages from the cache, and
             * avoid breaking up superpages.
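             *
             * (The excess is handed back at the out_entry_check label below,
             * where the cache is trimmed back down to pod.entry_count.)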
             */
            struct page_info *page;
            unsigned long j;

            ASSERT(mfn_valid(mfn));

            page = mfn_to_page(mfn);

            /* This shouldn't be able to fail */
            if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
                               p2m_invalid, p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unlock;
            }
            p2m_tlb_flush_sync(p2m);
            for ( j = 0; j < n; ++j )
                set_gpfn_from_mfn(mfn_x(mfn) + j, INVALID_M2P_ENTRY);
            p2m_pod_cache_add(p2m, page, cur_order);

            steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );

            nonpod -= n;
            ram -= n;
        }
    }

    /*
     * If there are no more non-PoD entries, tell decrease_reservation() that
     * there's nothing left to do.
     */
    if ( nonpod == 0 )
        ret = 1;

out_entry_check:
    /* If we've reduced our "liabilities" beyond our "assets", free some */
    if ( p2m->pod.entry_count < p2m->pod.count )
    {
        p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
    }

out_unlock:
    pod_unlock(p2m);
    gfn_unlock(p2m, gfn, order);
    return ret;
}

void p2m_pod_dump_data(struct domain *d)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);

    printk("    PoD entries=%ld cachesize=%ld\n",
           p2m->pod.entry_count, p2m->pod.count);
}

/*
 * Search for all-zero superpages to be reclaimed as superpages for the
 * PoD cache.  Must be called with the pod lock held; locks the superpage
 * in the p2m.
 */
static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn)
{
    mfn_t mfn, mfn0 = INVALID_MFN;
    p2m_type_t type, type0 = 0;
    unsigned long * map = NULL;
    int ret = 0, reset = 0;
    unsigned long i, n;
    unsigned int j;
    int max_ref = 1;
    struct domain *d = p2m->domain;

    ASSERT(pod_locked_by_me(p2m));

    if ( !superpage_aligned(gfn_x(gfn)) )
        goto out;

    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
    if ( paging_mode_shadow(d) )
        max_ref++;

    /*
     * NOTE: this is why we don't enforce deadlock constraints between p2m
     * and pod locks.
     */
    gfn_lock(p2m, gfn, SUPERPAGE_ORDER);

    /*
     * Look up the mfns, checking to make sure they're the same mfn
     * and aligned, and mapping them.
     */
    for ( i = 0; i < SUPERPAGE_PAGES; i += n )
    {
        p2m_access_t a;
        unsigned int cur_order;
        unsigned long k;
        const struct page_info *page;

        mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &type, &a, 0,
                             &cur_order, NULL);

        /*
         * Conditions that must be met for superpage-superpage:
         * + All gfns are ram types
         * + All gfns have the same type
         * + All of the mfns are allocated to a domain
         * + None of the mfns are used as pagetables, or allocated via xenheap
         * + The first mfn is 2-meg aligned
         * + All the other mfns are in sequence
         * Adding for good measure:
         * + None of the mfns are likely to be mapped elsewhere (refcount
         *   2 or less for shadow, 1 for hap)
         */
        if ( !p2m_is_ram(type) )
            goto out;

        if ( i == 0 )
        {
            if ( !superpage_aligned(mfn_x(mfn)) )
                goto out;
            mfn0 = mfn;
            type0 = type;
        }
        else if ( type != type0 || !mfn_eq(mfn, mfn_add(mfn0, i)) )
            goto out;

        n = 1UL << min(cur_order, SUPERPAGE_ORDER + 0U);
        for ( k = 0, page = mfn_to_page(mfn); k < n; ++k, ++page )
            if ( !(page->count_info & PGC_allocated) ||
                 (page->count_info & (PGC_page_table | PGC_xen_heap)) ||
                 (page->count_info & PGC_count_mask) > max_ref )
                goto out;
    }

    /* Now, do a quick check to see if it may be zero before unmapping. */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        /* Quick zero-check */
        map = map_domain_page(mfn_add(mfn0, i));

        for ( j = 0; j < 16; j++ )
            if ( *(map + j) != 0 )
                break;

        unmap_domain_page(map);

        if ( j < 16 )
            goto out;
    }

    /* Try to remove the page, restoring old mapping if it fails.
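     *
     * Replacing the mapping with a PoD entry (and flushing the TLB) before
     * the final zero-check closes the window in which the guest could dirty
     * the page after we have inspected it.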
     */
    if ( p2m_set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_2M,
                       p2m_populate_on_demand, p2m->default_access) )
        goto out;

    p2m_tlb_flush_sync(p2m);

    /*
     * Make sure none of the MFNs are used elsewhere... for example, mapped
     * via the grant table interface, or by qemu.  Allow one refcount for
     * being allocated to the domain.
     */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        mfn = mfn_add(mfn0, i);
        if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
        {
            reset = 1;
            goto out_reset;
        }
    }

    /* Finally, do a full zero-check */
    for ( i = 0; i < SUPERPAGE_PAGES; i++ )
    {
        map = map_domain_page(mfn_add(mfn0, i));

        for ( j = 0; j < (PAGE_SIZE / sizeof(*map)); j++ )
            if ( *(map+j) != 0 )
            {
                reset = 1;
                break;
            }

        unmap_domain_page(map);

        if ( reset )
            goto out_reset;
    }

    if ( tb_init_done )
    {
        struct {
            u64 gfn, mfn;
            int d:16,order:16;
        } t;

        t.gfn = gfn_x(gfn);
        t.mfn = mfn_x(mfn);
        t.d = d->domain_id;
        t.order = 9;

        __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
    }

    /*
     * Finally!  We've passed all the checks, and can add the mfn superpage
     * back on the PoD cache, and account for the new p2m PoD entries.
     */
    p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M);
    p2m->pod.entry_count += SUPERPAGE_PAGES;

    ret = SUPERPAGE_PAGES;

out_reset:
    /*
     * This p2m_set_entry() call shouldn't be able to fail, since the same order
     * on the same gfn succeeded above.  If that turns out to be false, crashing
     * the domain should be the safest way of making sure we don't leak memory.
     */
    if ( reset && p2m_set_entry(p2m, gfn, mfn0, PAGE_ORDER_2M,
                                type0, p2m->default_access) )
    {
        ASSERT_UNREACHABLE();
        domain_crash(d);
    }

out:
    gfn_unlock(p2m, gfn, SUPERPAGE_ORDER);
    return ret;
}

static void
p2m_pod_zero_check(struct p2m_domain *p2m, const gfn_t *gfns, int count)
{
    mfn_t mfns[count];
    p2m_type_t types[count];
    unsigned long *map[count];
    struct domain *d = p2m->domain;

    int i, j;
    int max_ref = 1;

    /* Allow an extra refcount for one shadow pt mapping in shadowed domains */
    if ( paging_mode_shadow(d) )
        max_ref++;

    /* First, get the gfn list, translate to mfns, and map the pages. */
    for ( i = 0; i < count; i++ )
    {
        p2m_access_t a;
        struct page_info *pg;

        mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a,
                                 0, NULL, NULL);
        pg = mfn_to_page(mfns[i]);

        /*
         * If this is ram, and not a pagetable or from the xen heap, and
         * probably not mapped elsewhere, map it; otherwise, skip.
         */
        if ( p2m_is_ram(types[i]) && (pg->count_info & PGC_allocated) &&
             !(pg->count_info & (PGC_page_table | PGC_xen_heap)) &&
             ((pg->count_info & PGC_count_mask) <= max_ref) )
            map[i] = map_domain_page(mfns[i]);
        else
            map[i] = NULL;
    }

    /*
     * Then, go through and check for zeroed pages, removing write permission
     * for those with zeroes.
     */
    for ( i = 0; i < count; i++ )
    {
        if ( !map[i] )
            continue;

        /* Quick zero-check */
        for ( j = 0; j < 16; j++ )
            if ( *(map[i] + j) != 0 )
                break;

        if ( j < 16 )
        {
            unmap_domain_page(map[i]);
            map[i] = NULL;
            continue;
        }

        /* Try to remove the page, restoring old mapping if it fails. */
        if ( p2m_set_entry(p2m, gfns[i], INVALID_MFN, PAGE_ORDER_4K,
                           p2m_populate_on_demand, p2m->default_access) )
            goto skip;

        /*
         * See if the page was successfully unmapped.  (Allow one refcount
         * for being allocated to a domain.)
         */
        if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
        {
            /*
             * If the previous p2m_set_entry call succeeded, this one shouldn't
             * be able to fail.  If it does, crashing the domain should be safe.
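             *
             * (This is the restore path for a page that turned out to still
             * be referenced elsewhere, e.g. by a grant mapping or by qemu.)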
             */
            if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
                               types[i], p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unmap;
            }

        skip:
            unmap_domain_page(map[i]);
            map[i] = NULL;

            continue;
        }
    }

    p2m_tlb_flush_sync(p2m);

    /* Now check each page for real */
    for ( i = 0; i < count; i++ )
    {
        if ( !map[i] )
            continue;

        for ( j = 0; j < (PAGE_SIZE / sizeof(*map[i])); j++ )
            if ( *(map[i] + j) != 0 )
                break;

        unmap_domain_page(map[i]);
        map[i] = NULL;

        /*
         * See comment in p2m_pod_zero_check_superpage() re gnttab
         * check timing.
         */
        if ( j < (PAGE_SIZE / sizeof(*map[i])) )
        {
            /*
             * If the previous p2m_set_entry call succeeded, this one shouldn't
             * be able to fail.  If it does, crashing the domain should be safe.
             */
            if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
                               types[i], p2m->default_access) )
            {
                ASSERT_UNREACHABLE();
                domain_crash(d);
                goto out_unmap;
            }
        }
        else
        {
            if ( tb_init_done )
            {
                struct {
                    u64 gfn, mfn;
                    int d:16,order:16;
                } t;

                t.gfn = gfn_x(gfns[i]);
                t.mfn = mfn_x(mfns[i]);
                t.d = d->domain_id;
                t.order = 0;

                __trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
            }

            /* Add to cache, and account for the new p2m PoD entry */
            p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K);
            p2m->pod.entry_count++;
        }
    }

    return;

out_unmap:
    /*
     * Something went wrong, probably crashing the domain.  Unmap
     * everything and return.
     */
    for ( i = 0; i < count; i++ )
        if ( map[i] )
            unmap_domain_page(map[i]);
}

#define POD_SWEEP_LIMIT 1024
#define POD_SWEEP_STRIDE  16
static void
p2m_pod_emergency_sweep(struct p2m_domain *p2m)
{
    gfn_t gfns[POD_SWEEP_STRIDE];
    unsigned long i, j = 0, start, limit;
    p2m_type_t t;

    if ( gfn_eq(p2m->pod.reclaim_single, _gfn(0)) )
        p2m->pod.reclaim_single = p2m->pod.max_guest;

    start = gfn_x(p2m->pod.reclaim_single);
    limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;

    /* FIXME: Figure out how to avoid superpages */
    /*
     * NOTE: Promote to globally locking the p2m. This will get complicated
     * in a fine-grained scenario. If we lock each gfn individually we must be
     * careful about spinlock recursion limits and POD_SWEEP_STRIDE.
     */
    p2m_lock(p2m);
    for ( i = gfn_x(p2m->pod.reclaim_single); i > 0 ; i-- )
    {
        p2m_access_t a;
        (void)p2m->get_entry(p2m, _gfn(i), &t, &a, 0, NULL, NULL);
        if ( p2m_is_ram(t) )
        {
            gfns[j] = _gfn(i);
            j++;
            BUG_ON(j > POD_SWEEP_STRIDE);
            if ( j == POD_SWEEP_STRIDE )
            {
                p2m_pod_zero_check(p2m, gfns, j);
                j = 0;
            }
        }
        /*
         * Stop if we're past our limit and we have found *something*.
         *
         * NB that this is a zero-sum game; we're increasing our cache size
         * by re-increasing our 'debt'.  Since we hold the pod lock,
         * (entry_count - count) must remain the same.
         */
        if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) )
            break;
    }

    if ( j )
        p2m_pod_zero_check(p2m, gfns, j);

    p2m_unlock(p2m);
    p2m->pod.reclaim_single = _gfn(i ? i - 1 : i);
}

static void pod_eager_reclaim(struct p2m_domain *p2m)
{
    struct pod_mrp_list *mrp = &p2m->pod.mrp;
    unsigned int i = 0;

    /*
     * Always check one page for reclamation.
     *
     * If the PoD pool is empty, keep checking until some space is found,
     * or all entries have been exhausted.
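     *
     * The entries scanned here are the ring of most-recently-populated
     * GFNs maintained by pod_eager_record() below.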
     */
    do {
        unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list);
        gfn_t gfn = _gfn(mrp->list[idx]);

        if ( !gfn_eq(gfn, INVALID_GFN) )
        {
            if ( gfn_x(gfn) & POD_LAST_SUPERPAGE )
            {
                gfn = _gfn(gfn_x(gfn) & ~POD_LAST_SUPERPAGE);

                if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 )
                {
                    unsigned int x;

                    for ( x = 0; x < SUPERPAGE_PAGES; ++x, gfn = gfn_add(gfn, 1) )
                        p2m_pod_zero_check(p2m, &gfn, 1);
                }
            }
            else
                p2m_pod_zero_check(p2m, &gfn, 1);

            mrp->list[idx] = gfn_x(INVALID_GFN);
        }

    } while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) );
}

static void pod_eager_record(struct p2m_domain *p2m, gfn_t gfn,
                             unsigned int order)
{
    struct pod_mrp_list *mrp = &p2m->pod.mrp;

    ASSERT(!gfn_eq(gfn, INVALID_GFN));

    mrp->list[mrp->idx++] =
        gfn_x(gfn) | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0);
    mrp->idx %= ARRAY_SIZE(mrp->list);
}

bool p2m_pod_demand_populate(struct p2m_domain *p2m, gfn_t gfn,
                             unsigned int order)
{
    struct domain *d = p2m->domain;
    struct page_info *p = NULL; /* Compiler warnings */
    gfn_t gfn_aligned = _gfn((gfn_x(gfn) >> order) << order);
    mfn_t mfn;
    unsigned long i;

    ASSERT(gfn_locked_by_me(p2m, gfn));
    pod_lock(p2m);

    /*
     * This check is done with the pod lock held.  This will make sure that
     * even if d->is_dying changes under our feet, p2m_pod_empty_cache()
     * won't start until we're done.
     */
    if ( unlikely(d->is_dying) )
        goto out_fail;

    /*
     * Because PoD does not have cache list for 1GB pages, it has to remap
     * 1GB region to 2MB chunks for a retry.
     */
    if ( order == PAGE_ORDER_1G )
    {
        pod_unlock(p2m);
        /*
         * Note that we are supposed to call p2m_set_entry() 512 times to
         * split 1GB into 512 2MB pages here.  But we only do it once here
         * because p2m_set_entry() should automatically shatter the 1GB page
         * into 512 2MB pages.  The remaining 511 calls are unnecessary.
         *
         * NOTE: In a fine-grained p2m locking scenario this operation
         * may need to promote its locking from gfn->1g superpage.
         */
        return !p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_2M,
                              p2m_populate_on_demand, p2m->default_access);
    }

    /* Only reclaim if we're in actual need of more cache. */
    if ( p2m->pod.entry_count > p2m->pod.count )
        pod_eager_reclaim(p2m);

    /*
     * Only sweep if we're actually out of memory.  Doing anything else
     * causes unnecessary time and fragmentation of superpages in the p2m.
     */
    if ( p2m->pod.count == 0 )
        p2m_pod_emergency_sweep(p2m);

    /* If the sweep failed, give up. */
    if ( p2m->pod.count == 0 )
        goto out_of_memory;

    /* Keep track of the highest gfn demand-populated by a guest fault */
    p2m->pod.max_guest = gfn_max(gfn, p2m->pod.max_guest);

    /*
     * Get a page from the cache.  A NULL return value indicates that the
     * 2-meg range should be marked singleton PoD, and retried.
     */
    if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
        goto remap_and_retry;

    mfn = page_to_mfn(p);

    BUG_ON((mfn_x(mfn) & ((1UL << order) - 1)) != 0);

    if ( p2m_set_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw,
                       p2m->default_access) )
    {
        p2m_pod_cache_add(p2m, p, order);
        goto out_fail;
    }

    for ( i = 0; i < (1UL << order); i++ )
    {
        set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_x(gfn_aligned) + i);
        paging_mark_dirty(d, mfn_add(mfn, i));
    }

    p2m->pod.entry_count -= (1UL << order);
    BUG_ON(p2m->pod.entry_count < 0);

    pod_eager_record(p2m, gfn_aligned, order);

    if ( tb_init_done )
    {
        struct {
            u64 gfn, mfn;
            int d:16,order:16;
        } t;

        t.gfn = gfn_x(gfn);
        t.mfn = mfn_x(mfn);
        t.d = d->domain_id;
        t.order = order;

        __trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
    }

    pod_unlock(p2m);
    return true;

out_of_memory:
    pod_unlock(p2m);

    printk("%s: Dom%d out of PoD memory! (tot=%"PRIu32" ents=%ld dom%d)\n",
           __func__, d->domain_id, d->tot_pages, p2m->pod.entry_count,
           current->domain->domain_id);
    domain_crash(d);
    return false;

out_fail:
    pod_unlock(p2m);
    return false;

remap_and_retry:
    BUG_ON(order != PAGE_ORDER_2M);
    pod_unlock(p2m);

    /*
     * Remap this 2-meg region in singleton chunks.  See the comment on the
     * 1G page splitting path above for why a single call suffices.
     *
     * NOTE: In a p2m fine-grained lock scenario this might
     * need promoting the gfn lock from gfn->2M superpage.
     */
    if ( p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_4K,
                       p2m_populate_on_demand, p2m->default_access) )
        return false;

    if ( tb_init_done )
    {
        struct {
            u64 gfn;
            int d:16;
        } t;

        t.gfn = gfn_x(gfn);
        t.d = d->domain_id;

        __trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
    }

    return true;
}

int
guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
                                      unsigned int order)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);
    gfn_t gfn = _gfn(gfn_l);
    unsigned long i, n, pod_count = 0;
    int rc = 0;

    if ( !paging_mode_translate(d) )
        return -EINVAL;

    gfn_lock(p2m, gfn, order);

    P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);

    /* Make sure all gpfns are unused */
    for ( i = 0; i < (1UL << order); i += n )
    {
        p2m_type_t ot;
        p2m_access_t a;
        unsigned int cur_order;

        p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, &cur_order, NULL);
        n = 1UL << min(order, cur_order);
        if ( p2m_is_ram(ot) )
        {
            P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot);
            rc = -EBUSY;
            goto out;
        }
        else if ( ot == p2m_populate_on_demand )
        {
            /* Count how many PoD entries we'll be replacing if successful */
            pod_count += n;
        }
    }

    /* Now, actually do the two-way mapping */
    rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order,
                       p2m_populate_on_demand, p2m->default_access);
    if ( rc == 0 )
    {
        pod_lock(p2m);
        p2m->pod.entry_count += 1UL << order;
        p2m->pod.entry_count -= pod_count;
        BUG_ON(p2m->pod.entry_count < 0);
        pod_unlock(p2m);
    }

out:
    gfn_unlock(p2m, gfn, order);

    return rc;
}