/******************************************************************************
* arch/x86/mm/p2m-pod.c
*
* Populate-on-demand p2m entries.
*
* Copyright (c) 2009-2011 Citrix Systems, Inc.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/event.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/trace.h>
#include <asm/page.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include "mm-locks.h"
/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
#undef page_to_mfn
#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
#define superpage_aligned(_x) (((_x)&(SUPERPAGE_PAGES-1))==0)
/* Enforce lock ordering when grabbing the "external" page_alloc lock */
static inline void lock_page_alloc(struct p2m_domain *p2m)
{
page_alloc_mm_pre_lock();
spin_lock(&(p2m->domain->page_alloc_lock));
page_alloc_mm_post_lock(p2m->domain->arch.page_alloc_unlock_level);
}
static inline void unlock_page_alloc(struct p2m_domain *p2m)
{
page_alloc_mm_unlock(p2m->domain->arch.page_alloc_unlock_level);
spin_unlock(&(p2m->domain->page_alloc_lock));
}
/*
* Populate-on-demand functionality
*/
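/*
 * Add 2^order contiguous, order-aligned pages to the PoD cache: scrub them,
 * take them off the domain's page list, and queue them on pod.super (2M
 * chunks) or pod.single (4k pages), updating pod.count.  The caller must
 * hold the pod lock.
 */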
static int
p2m_pod_cache_add(struct p2m_domain *p2m,
struct page_info *page,
unsigned int order)
{
unsigned long i;
struct page_info *p;
struct domain *d = p2m->domain;
#ifndef NDEBUG
mfn_t mfn;
mfn = page_to_mfn(page);
/* Check to make sure this is a contiguous region */
if ( mfn_x(mfn) & ((1UL << order) - 1) )
{
printk("%s: mfn %lx not aligned order %u! (mask %lx)\n",
__func__, mfn_x(mfn), order, ((1UL << order) - 1));
return -1;
}
for ( i = 0; i < 1UL << order ; i++)
{
struct domain * od;
p = mfn_to_page(_mfn(mfn_x(mfn) + i));
od = page_get_owner(p);
if ( od != d )
{
printk("%s: mfn %lx expected owner d%d, got owner d%d!\n",
__func__, mfn_x(mfn), d->domain_id,
od ? od->domain_id : -1);
return -1;
}
}
#endif
ASSERT(pod_locked_by_me(p2m));
/*
* Pages from domain_alloc and returned by the balloon driver aren't
* guaranteed to be zero; but by reclaiming zero pages, we implicitly
* promise to provide zero pages. So we scrub pages before using.
*/
for ( i = 0; i < (1UL << order); i++ )
clear_domain_page(mfn_add(page_to_mfn(page), i));
/* First, take all pages off the domain list */
lock_page_alloc(p2m);
for ( i = 0; i < 1UL << order ; i++ )
{
p = page + i;
page_list_del(p, &d->page_list);
}
unlock_page_alloc(p2m);
/* Then add to the appropriate populate-on-demand list. */
switch ( order )
{
case PAGE_ORDER_1G:
for ( i = 0; i < (1UL << PAGE_ORDER_1G); i += 1UL << PAGE_ORDER_2M )
page_list_add_tail(page + i, &p2m->pod.super);
break;
case PAGE_ORDER_2M:
page_list_add_tail(page, &p2m->pod.super);
break;
case PAGE_ORDER_4K:
page_list_add_tail(page, &p2m->pod.single);
break;
default:
BUG();
}
p2m->pod.count += 1UL << order;
return 0;
}
/* Get a page of size order from the populate-on-demand cache. Will break
* down 2-meg pages into singleton pages automatically. Returns null if
* a superpage is requested and no superpages are available. */
static struct page_info * p2m_pod_cache_get(struct p2m_domain *p2m,
unsigned int order)
{
struct page_info *p = NULL;
unsigned long i;
ASSERT(pod_locked_by_me(p2m));
if ( order == PAGE_ORDER_2M && page_list_empty(&p2m->pod.super) )
{
return NULL;
}
else if ( order == PAGE_ORDER_4K && page_list_empty(&p2m->pod.single) )
{
unsigned long mfn;
struct page_info *q;
BUG_ON( page_list_empty(&p2m->pod.super) );
/*
* Break up a superpage to make single pages. NB count doesn't
* need to be adjusted.
*/
p = page_list_remove_head(&p2m->pod.super);
mfn = mfn_x(page_to_mfn(p));
for ( i = 0; i < SUPERPAGE_PAGES; i++ )
{
q = mfn_to_page(_mfn(mfn+i));
page_list_add_tail(q, &p2m->pod.single);
}
}
switch ( order )
{
case PAGE_ORDER_2M:
BUG_ON( page_list_empty(&p2m->pod.super) );
p = page_list_remove_head(&p2m->pod.super);
p2m->pod.count -= 1UL << order;
break;
case PAGE_ORDER_4K:
BUG_ON( page_list_empty(&p2m->pod.single) );
p = page_list_remove_head(&p2m->pod.single);
p2m->pod.count -= 1UL;
break;
default:
BUG();
}
/* Put the pages back on the domain page_list */
lock_page_alloc(p2m);
for ( i = 0 ; i < (1UL << order); i++ )
{
BUG_ON(page_get_owner(p + i) != p2m->domain);
page_list_add_tail(p + i, &p2m->domain->page_list);
}
unlock_page_alloc(p2m);
return p;
}
/* Set the size of the cache, allocating or freeing as necessary. */
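/*
 * Returns 0 on success, -ENOMEM if a page cannot be allocated while growing
 * the cache, -EINVAL if a page being freed is in an unexpected state, or
 * -ERESTART if 'preemptible' is set and a hypercall preemption is pending
 * before the target has been reached.
 */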
static int
p2m_pod_set_cache_target(struct p2m_domain *p2m, unsigned long pod_target, int preemptible)
{
struct domain *d = p2m->domain;
int ret = 0;
ASSERT(pod_locked_by_me(p2m));
/* Increasing the target */
while ( pod_target > p2m->pod.count )
{
struct page_info * page;
int order;
if ( (pod_target - p2m->pod.count) >= SUPERPAGE_PAGES )
order = PAGE_ORDER_2M;
else
order = PAGE_ORDER_4K;
retry:
page = alloc_domheap_pages(d, order, 0);
if ( unlikely(page == NULL) )
{
if ( order == PAGE_ORDER_2M )
{
/* If we can't allocate a superpage, try singleton pages */
order = PAGE_ORDER_4K;
goto retry;
}
printk("%s: Unable to allocate page for PoD cache (target=%lu cache=%ld)\n",
__func__, pod_target, p2m->pod.count);
ret = -ENOMEM;
goto out;
}
p2m_pod_cache_add(p2m, page, order);
if ( preemptible && pod_target != p2m->pod.count &&
hypercall_preempt_check() )
{
ret = -ERESTART;
goto out;
}
}
/* Decreasing the target */
/*
* We hold the pod lock here, so we don't need to worry about
* cache disappearing under our feet.
*/
while ( pod_target < p2m->pod.count )
{
struct page_info * page;
unsigned int order;
unsigned long i;
if ( (p2m->pod.count - pod_target) > SUPERPAGE_PAGES
&& !page_list_empty(&p2m->pod.super) )
order = PAGE_ORDER_2M;
else
order = PAGE_ORDER_4K;
page = p2m_pod_cache_get(p2m, order);
ASSERT(page != NULL);
/* Then free them */
for ( i = 0 ; i < (1UL << order) ; i++ )
{
/* Copied from common/memory.c:guest_remove_page() */
if ( unlikely(!get_page(page + i, d)) )
{
gdprintk(XENLOG_INFO, "Bad page free for domain %u\n", d->domain_id);
ret = -EINVAL;
goto out;
}
if ( test_and_clear_bit(_PGT_pinned, &(page+i)->u.inuse.type_info) )
put_page_and_type(page + i);
if ( test_and_clear_bit(_PGC_allocated, &(page+i)->count_info) )
put_page(page + i);
put_page(page + i);
if ( preemptible && pod_target != p2m->pod.count &&
hypercall_preempt_check() )
{
ret = -ERESTART;
goto out;
}
}
}
out:
return ret;
}
/*
* The "right behavior" here requires some careful thought. First, some
* definitions:
* + M: static_max
* + B: number of pages the balloon driver has ballooned down to.
* + P: Number of populated pages.
* + T: Old target
* + T': New target
*
* The following equations should hold:
* 0 <= P <= T <= B <= M
* d->arch.p2m->pod.entry_count == B - P
* d->tot_pages == P + d->arch.p2m->pod.count
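*
* For example (illustrative numbers only): a guest with B = 1048576 and
* P = 786432 has pod.entry_count == 262144, and d->tot_pages is 786432
* plus whatever is currently in the PoD cache.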
*
* Now we have the following potential cases to cover:
*     B < T':   Set the PoD cache size equal to the number of outstanding
*               PoD entries; the balloon driver will deflate the balloon to
*               give the remainder of the ram back to the guest OS.
*  T < T' <= B: Increase the PoD cache size to cover the new target.
*  T' < T:      Don't reduce the cache size here; let the balloon driver
*               hand memory back, and let p2m_pod_decrease_reservation()
*               shrink the cache as the freed PoD ranges come in.
*/
int
p2m_pod_set_mem_target(struct domain *d, unsigned long target)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
int ret = 0;
unsigned long populated, pod_target;
pod_lock(p2m);
/* P == B: Nothing to do (unless the guest is being created). */
populated = d->tot_pages - p2m->pod.count;
if ( populated > 0 && p2m->pod.entry_count == 0 )
goto out;
/* Don't do anything if the domain is being torn down */
if ( d->is_dying )
goto out;
/*
* T' < B: Don't reduce the cache size; let the balloon driver
* take care of it.
*/
if ( target < d->tot_pages )
goto out;
pod_target = target - populated;
/*
* B < T': Set the cache size equal to # of outstanding entries,
* let the balloon driver fill in the rest.
*/
if ( populated > 0 && pod_target > p2m->pod.entry_count )
pod_target = p2m->pod.entry_count;
ASSERT( pod_target >= p2m->pod.count );
ret = p2m_pod_set_cache_target(p2m, pod_target, 1/*preemptible*/);
out:
pod_unlock(p2m);
return ret;
}
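/*
 * Domain-destruction helper: move every page still held in the PoD cache
 * back onto the domain's page list so the normal relinquish path can free
 * it.  May be preempted; returns -ERESTART while pages remain, 0 once the
 * cache is empty.
 */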
int p2m_pod_empty_cache(struct domain *d)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
struct page_info *page;
unsigned int i;
/* After this barrier no new PoD activities can happen. */
BUG_ON(!d->is_dying);
spin_barrier(&p2m->pod.lock.lock);
lock_page_alloc(p2m);
while ( (page = page_list_remove_head(&p2m->pod.super)) )
{
for ( i = 0 ; i < SUPERPAGE_PAGES ; i++ )
{
BUG_ON(page_get_owner(page + i) != d);
page_list_add_tail(page + i, &d->page_list);
}
p2m->pod.count -= SUPERPAGE_PAGES;
if ( hypercall_preempt_check() )
goto out;
}
for ( i = 0; (page = page_list_remove_head(&p2m->pod.single)); ++i )
{
BUG_ON(page_get_owner(page) != d);
page_list_add_tail(page, &d->page_list);
p2m->pod.count -= 1;
if ( i && !(i & 511) && hypercall_preempt_check() )
goto out;
}
BUG_ON(p2m->pod.count != 0);
out:
unlock_page_alloc(p2m);
return p2m->pod.count ? -ERESTART : 0;
}
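/*
 * Called when a page is offlined or marked broken.  If the page sits in its
 * owner's PoD cache, remove it (splitting a cached superpage into singletons
 * if necessary), move it to the relmem list, and return 1 so the caller
 * knows to supply a replacement; otherwise return 0.
 */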
int
p2m_pod_offline_or_broken_hit(struct page_info *p)
{
struct domain *d;
struct p2m_domain *p2m;
struct page_info *q, *tmp;
unsigned long mfn, bmfn;
if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
return 0;
pod_lock(p2m);
bmfn = mfn_x(page_to_mfn(p));
page_list_for_each_safe(q, tmp, &p2m->pod.super)
{
mfn = mfn_x(page_to_mfn(q));
if ( (bmfn >= mfn) && ((bmfn - mfn) < SUPERPAGE_PAGES) )
{
unsigned long i;
page_list_del(q, &p2m->pod.super);
for ( i = 0; i < SUPERPAGE_PAGES; i++)
{
q = mfn_to_page(_mfn(mfn + i));
page_list_add_tail(q, &p2m->pod.single);
}
page_list_del(p, &p2m->pod.single);
p2m->pod.count--;
goto pod_hit;
}
}
page_list_for_each_safe(q, tmp, &p2m->pod.single)
{
mfn = mfn_x(page_to_mfn(q));
if ( mfn == bmfn )
{
page_list_del(p, &p2m->pod.single);
p2m->pod.count--;
goto pod_hit;
}
}
pod_unlock(p2m);
return 0;
pod_hit:
lock_page_alloc(p2m);
/* Insertion must be at list head (see iommu_populate_page_table()). */
page_list_add(p, &d->arch.relmem_list);
unlock_page_alloc(p2m);
pod_unlock(p2m);
return 1;
}
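/*
 * Free the offlined/broken page and try to refill the owner's PoD cache
 * with a fresh page allocated from the same NUMA node.
 */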
void
p2m_pod_offline_or_broken_replace(struct page_info *p)
{
struct domain *d;
struct p2m_domain *p2m;
nodeid_t node = phys_to_nid(page_to_maddr(p));
if ( !(d = page_get_owner(p)) || !(p2m = p2m_get_hostp2m(d)) )
return;
free_domheap_page(p);
p = alloc_domheap_page(d, MEMF_node(node));
if ( unlikely(!p) )
return;
pod_lock(p2m);
p2m_pod_cache_add(p2m, p, PAGE_ORDER_4K);
pod_unlock(p2m);
return;
}
static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn);
/*
* This function is needed for two reasons:
* + To properly handle clearing of PoD entries
* + To "steal back" memory being freed for the PoD cache, rather than
* releasing it.
*
* Once both of these functions have been completed, we can return and
* allow decrease_reservation() to handle everything else.
*/
int
p2m_pod_decrease_reservation(struct domain *d, gfn_t gfn, unsigned int order)
{
int ret = 0;
unsigned long i, n;
struct p2m_domain *p2m = p2m_get_hostp2m(d);
bool_t steal_for_cache;
long pod, nonpod, ram;
gfn_lock(p2m, gfn, order);
pod_lock(p2m);
/*
* If we don't have any outstanding PoD entries, let things take their
* course.
*/
if ( p2m->pod.entry_count == 0 )
goto out_unlock;
if ( unlikely(d->is_dying) )
goto out_unlock;
pod = nonpod = ram = 0;
/* Figure out if we need to steal some freed memory for our cache */
steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
for ( i = 0; i < (1UL << order); i += n )
{
p2m_access_t a;
p2m_type_t t;
unsigned int cur_order;
p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
n = 1UL << min(order, cur_order);
if ( t == p2m_populate_on_demand )
pod += n;
else
{
nonpod += n;
if ( p2m_is_ram(t) )
ram += n;
}
}
/* No populate-on-demand? Don't need to steal anything? Then we're done!*/
if ( !pod && !steal_for_cache )
goto out_unlock;
if ( !nonpod )
{
/*
* All PoD: Mark the whole region invalid and tell caller
* we're done.
*/
if ( p2m_set_entry(p2m, gfn, INVALID_MFN, order, p2m_invalid,
p2m->default_access) )
{
/*
* If this fails, we can't tell how much of the range was changed.
* Best to crash the domain unless we're sure a partial change is
* impossible.
*/
if ( order != 0 )
domain_crash(d);
goto out_unlock;
}
p2m->pod.entry_count -= 1UL << order;
BUG_ON(p2m->pod.entry_count < 0);
ret = 1;
goto out_entry_check;
}
/*
* Try to grab entire superpages if possible. Since the common case is for
* drivers to pass back singleton pages, see if we can take the whole page
* back and mark the rest PoD.
* No need to do this though if
* - order >= SUPERPAGE_ORDER (the loop below will take care of this)
* - not all of the pages were RAM (now knowing order < SUPERPAGE_ORDER)
*/
if ( steal_for_cache && order < SUPERPAGE_ORDER && ram == (1UL << order) &&
p2m_pod_zero_check_superpage(p2m, _gfn(gfn_x(gfn) & ~(SUPERPAGE_PAGES - 1))) )
{
pod = 1UL << order;
ram = nonpod = 0;
ASSERT(steal_for_cache == (p2m->pod.entry_count > p2m->pod.count));
}
/*
* Process as long as:
* + There are PoD entries to handle, or
* + There is ram left, and we want to steal it
*/
for ( i = 0;
i < (1UL << order) && (pod > 0 || (steal_for_cache && ram > 0));
i += n )
{
mfn_t mfn;
p2m_type_t t;
p2m_access_t a;
unsigned int cur_order;
mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &t, &a, 0, &cur_order, NULL);
if ( order < cur_order )
cur_order = order;
n = 1UL << cur_order;
if ( t == p2m_populate_on_demand )
{
/* This shouldn't be able to fail */
if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
p2m_invalid, p2m->default_access) )
{
ASSERT_UNREACHABLE();
domain_crash(d);
goto out_unlock;
}
p2m->pod.entry_count -= n;
BUG_ON(p2m->pod.entry_count < 0);
pod -= n;
}
else if ( steal_for_cache && p2m_is_ram(t) )
{
/*
* If we need less than 1 << cur_order, we may end up stealing
* more memory here than we actually need. This will be rectified
* below, however; and stealing too much and then freeing what we
* need may allow us to free smaller pages from the cache, and
* avoid breaking up superpages.
*/
struct page_info *page;
unsigned long j;
ASSERT(mfn_valid(mfn));
page = mfn_to_page(mfn);
/* This shouldn't be able to fail */
if ( p2m_set_entry(p2m, gfn_add(gfn, i), INVALID_MFN, cur_order,
p2m_invalid, p2m->default_access) )
{
ASSERT_UNREACHABLE();
domain_crash(d);
goto out_unlock;
}
p2m_tlb_flush_sync(p2m);
for ( j = 0; j < n; ++j )
set_gpfn_from_mfn(mfn_x(mfn), INVALID_M2P_ENTRY);
p2m_pod_cache_add(p2m, page, cur_order);
steal_for_cache = ( p2m->pod.entry_count > p2m->pod.count );
nonpod -= n;
ram -= n;
}
}
/*
* If there are no more non-PoD entries, tell decrease_reservation() that
* there's nothing left to do.
*/
if ( nonpod == 0 )
ret = 1;
out_entry_check:
/* If we've reduced our "liabilities" beyond our "assets", free some */
if ( p2m->pod.entry_count < p2m->pod.count )
{
p2m_pod_set_cache_target(p2m, p2m->pod.entry_count, 0/*can't preempt*/);
}
out_unlock:
pod_unlock(p2m);
gfn_unlock(p2m, gfn, order);
return ret;
}
void p2m_pod_dump_data(struct domain *d)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
printk(" PoD entries=%ld cachesize=%ld\n",
p2m->pod.entry_count, p2m->pod.count);
}
/*
* Search for all-zero superpages to be reclaimed as superpages for the
* PoD cache. Must be called w/ pod lock held, must lock the superpage
* in the p2m.
*/
static int
p2m_pod_zero_check_superpage(struct p2m_domain *p2m, gfn_t gfn)
{
mfn_t mfn, mfn0 = INVALID_MFN;
p2m_type_t type, type0 = 0;
unsigned long * map = NULL;
int ret=0, reset = 0;
unsigned long i, n;
unsigned int j;
int max_ref = 1;
struct domain *d = p2m->domain;
ASSERT(pod_locked_by_me(p2m));
if ( !superpage_aligned(gfn_x(gfn)) )
goto out;
/* Allow an extra refcount for one shadow pt mapping in shadowed domains */
if ( paging_mode_shadow(d) )
max_ref++;
/*
* NOTE: this is why we don't enforce deadlock constraints between p2m
* and pod locks.
*/
gfn_lock(p2m, gfn, SUPERPAGE_ORDER);
/*
* Look up the mfns, checking to make sure they're the same mfn
* and aligned, and mapping them.
*/
for ( i = 0; i < SUPERPAGE_PAGES; i += n )
{
p2m_access_t a;
unsigned int cur_order;
unsigned long k;
const struct page_info *page;
mfn = p2m->get_entry(p2m, gfn_add(gfn, i), &type, &a, 0,
&cur_order, NULL);
/*
* Conditions that must be met for superpage-superpage:
* + All gfns are ram types
* + All gfns have the same type
* + All of the mfns are allocated to a domain
* + None of the mfns are used as pagetables, or allocated via xenheap
* + The first mfn is 2-meg aligned
* + All the other mfns are in sequence
* Adding for good measure:
* + None of the mfns are likely to be mapped elsewhere (refcount
* 2 or less for shadow, 1 for hap)
*/
if ( !p2m_is_ram(type) )
goto out;
if ( i == 0 )
{
if ( !superpage_aligned(mfn_x(mfn)) )
goto out;
mfn0 = mfn;
type0 = type;
}
else if ( type != type0 || !mfn_eq(mfn, mfn_add(mfn0, i)) )
goto out;
n = 1UL << min(cur_order, SUPERPAGE_ORDER + 0U);
for ( k = 0, page = mfn_to_page(mfn); k < n; ++k, ++page )
if ( !(page->count_info & PGC_allocated) ||
(page->count_info & (PGC_page_table | PGC_xen_heap)) ||
(page->count_info & PGC_count_mask) > max_ref )
goto out;
}
/* Now, do a quick check to see if it may be zero before unmapping. */
for ( i = 0; i < SUPERPAGE_PAGES; i++ )
{
/* Quick zero-check */
map = map_domain_page(mfn_add(mfn0, i));
for ( j = 0; j < 16; j++ )
if ( *(map + j) != 0 )
break;
unmap_domain_page(map);
if ( j < 16 )
goto out;
}
/* Try to remove the page, restoring old mapping if it fails. */
if ( p2m_set_entry(p2m, gfn, INVALID_MFN, PAGE_ORDER_2M,
p2m_populate_on_demand, p2m->default_access) )
goto out;
p2m_tlb_flush_sync(p2m);
/*
* Make sure none of the MFNs are used elsewhere... for example, mapped
* via the grant table interface, or by qemu. Allow one refcount for
* being allocated to the domain.
*/
for ( i = 0; i < SUPERPAGE_PAGES; i++ )
{
mfn = mfn_add(mfn0, i);
if ( (mfn_to_page(mfn)->count_info & PGC_count_mask) > 1 )
{
reset = 1;
goto out_reset;
}
}
/* Finally, do a full zero-check */
for ( i = 0; i < SUPERPAGE_PAGES; i++ )
{
map = map_domain_page(mfn_add(mfn0, i));
for ( j = 0; j < (PAGE_SIZE / sizeof(*map)); j++ )
if ( *(map+j) != 0 )
{
reset = 1;
break;
}
unmap_domain_page(map);
if ( reset )
goto out_reset;
}
if ( tb_init_done )
{
struct {
u64 gfn, mfn;
int d:16,order:16;
} t;
t.gfn = gfn_x(gfn);
t.mfn = mfn_x(mfn);
t.d = d->domain_id;
t.order = 9;
__trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
}
/*
* Finally! We've passed all the checks, and can add the mfn superpage
* back on the PoD cache, and account for the new p2m PoD entries.
*/
p2m_pod_cache_add(p2m, mfn_to_page(mfn0), PAGE_ORDER_2M);
p2m->pod.entry_count += SUPERPAGE_PAGES;
ret = SUPERPAGE_PAGES;
out_reset:
/*
* This p2m_set_entry() call shouldn't be able to fail, since the same order
* on the same gfn succeeded above. If that turns out to be false, crashing
* the domain should be the safest way of making sure we don't leak memory.
*/
if ( reset && p2m_set_entry(p2m, gfn, mfn0, PAGE_ORDER_2M,
type0, p2m->default_access) )
{
ASSERT_UNREACHABLE();
domain_crash(d);
}
out:
gfn_unlock(p2m, gfn, SUPERPAGE_ORDER);
return ret;
}
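/*
 * Check a batch of singleton gfns: any entry mapping an allocated,
 * otherwise-unreferenced, all-zero RAM page is converted back into a PoD
 * entry and its backing frame returned to the PoD cache.
 */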
static void
p2m_pod_zero_check(struct p2m_domain *p2m, const gfn_t *gfns, int count)
{
mfn_t mfns[count];
p2m_type_t types[count];
unsigned long *map[count];
struct domain *d = p2m->domain;
int i, j;
int max_ref = 1;
/* Allow an extra refcount for one shadow pt mapping in shadowed domains */
if ( paging_mode_shadow(d) )
max_ref++;
/* First, get the gfn list, translate to mfns, and map the pages. */
for ( i = 0; i < count; i++ )
{
p2m_access_t a;
struct page_info *pg;
mfns[i] = p2m->get_entry(p2m, gfns[i], types + i, &a,
0, NULL, NULL);
pg = mfn_to_page(mfns[i]);
/*
* If this is ram, and not a pagetable or from the xen heap, and
* probably not mapped elsewhere, map it; otherwise, skip.
*/
if ( p2m_is_ram(types[i]) && (pg->count_info & PGC_allocated) &&
!(pg->count_info & (PGC_page_table | PGC_xen_heap)) &&
((pg->count_info & PGC_count_mask) <= max_ref) )
map[i] = map_domain_page(mfns[i]);
else
map[i] = NULL;
}
/*
* Then, go through and check for zeroed pages, removing write permission
* for those with zeroes.
*/
for ( i = 0; i < count; i++ )
{
if ( !map[i] )
continue;
/* Quick zero-check */
for ( j = 0; j < 16; j++ )
if ( *(map[i] + j) != 0 )
break;
if ( j < 16 )
{
unmap_domain_page(map[i]);
map[i] = NULL;
continue;
}
/* Try to remove the page, restoring old mapping if it fails. */
if ( p2m_set_entry(p2m, gfns[i], INVALID_MFN, PAGE_ORDER_4K,
p2m_populate_on_demand, p2m->default_access) )
goto skip;
/*
* See if the page was successfully unmapped. (Allow one refcount
* for being allocated to a domain.)
*/
if ( (mfn_to_page(mfns[i])->count_info & PGC_count_mask) > 1 )
{
/*
* If the previous p2m_set_entry call succeeded, this one shouldn't
* be able to fail. If it does, crashing the domain should be safe.
*/
if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
types[i], p2m->default_access) )
{
ASSERT_UNREACHABLE();
domain_crash(d);
goto out_unmap;
}
skip:
unmap_domain_page(map[i]);
map[i] = NULL;
continue;
}
}
p2m_tlb_flush_sync(p2m);
/* Now check each page for real */
for ( i = 0; i < count; i++ )
{
if ( !map[i] )
continue;
for ( j = 0; j < (PAGE_SIZE / sizeof(*map[i])); j++ )
if ( *(map[i] + j) != 0 )
break;
unmap_domain_page(map[i]);
map[i] = NULL;
/*
* See comment in p2m_pod_zero_check_superpage() re gnttab
* check timing.
*/
if ( j < (PAGE_SIZE / sizeof(*map[i])) )
{
/*
* If the previous p2m_set_entry call succeeded, this one shouldn't
* be able to fail. If it does, crashing the domain should be safe.
*/
if ( p2m_set_entry(p2m, gfns[i], mfns[i], PAGE_ORDER_4K,
types[i], p2m->default_access) )
{
ASSERT_UNREACHABLE();
domain_crash(d);
goto out_unmap;
}
}
else
{
if ( tb_init_done )
{
struct {
u64 gfn, mfn;
int d:16,order:16;
} t;
t.gfn = gfn_x(gfns[i]);
t.mfn = mfn_x(mfns[i]);
t.d = d->domain_id;
t.order = 0;
__trace_var(TRC_MEM_POD_ZERO_RECLAIM, 0, sizeof(t), &t);
}
/* Add to cache, and account for the new p2m PoD entry */
p2m_pod_cache_add(p2m, mfn_to_page(mfns[i]), PAGE_ORDER_4K);
p2m->pod.entry_count++;
}
}
return;
out_unmap:
/*
* Something went wrong, probably crashing the domain. Unmap
* everything and return.
*/
for ( i = 0; i < count; i++ )
if ( map[i] )
unmap_domain_page(map[i]);
}
#define POD_SWEEP_LIMIT 1024
#define POD_SWEEP_STRIDE 16
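/*
 * Last-resort reclaim: sweep backwards from pod.reclaim_single through the
 * guest's gfn space, zero-checking RAM pages in batches of POD_SWEEP_STRIDE.
 * Once the sweep has covered POD_SWEEP_LIMIT gfns it stops as soon as it has
 * reclaimed something (or a preemption is pending).
 */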
static void
p2m_pod_emergency_sweep(struct p2m_domain *p2m)
{
gfn_t gfns[POD_SWEEP_STRIDE];
unsigned long i, j = 0, start, limit;
p2m_type_t t;
if ( gfn_eq(p2m->pod.reclaim_single, _gfn(0)) )
p2m->pod.reclaim_single = p2m->pod.max_guest;
start = gfn_x(p2m->pod.reclaim_single);
limit = (start > POD_SWEEP_LIMIT) ? (start - POD_SWEEP_LIMIT) : 0;
/* FIXME: Figure out how to avoid superpages */
/*
* NOTE: Promote to globally locking the p2m. This will get complicated
* in a fine-grained scenario. If we lock each gfn individually we must be
* careful about spinlock recursion limits and POD_SWEEP_STRIDE.
*/
p2m_lock(p2m);
for ( i = gfn_x(p2m->pod.reclaim_single); i > 0 ; i-- )
{
p2m_access_t a;
(void)p2m->get_entry(p2m, _gfn(i), &t, &a, 0, NULL, NULL);
if ( p2m_is_ram(t) )
{
gfns[j] = _gfn(i);
j++;
BUG_ON(j > POD_SWEEP_STRIDE);
if ( j == POD_SWEEP_STRIDE )
{
p2m_pod_zero_check(p2m, gfns, j);
j = 0;
}
}
/*
* Stop if we're past our limit and we have found *something*.
*
* NB that this is a zero-sum game; we're increasing our cache size
* by re-increasing our 'debt'. Since we hold the pod lock,
* (entry_count - count) must remain the same.
*/
if ( i < limit && (p2m->pod.count > 0 || hypercall_preempt_check()) )
break;
}
if ( j )
p2m_pod_zero_check(p2m, gfns, j);
p2m_unlock(p2m);
p2m->pod.reclaim_single = _gfn(i ? i - 1 : i);
}
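/*
 * Try to reclaim zeroed pages among the gfns most recently populated on
 * demand (recorded in pod.mrp by pod_eager_record() below).
 */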
static void pod_eager_reclaim(struct p2m_domain *p2m)
{
struct pod_mrp_list *mrp = &p2m->pod.mrp;
unsigned int i = 0;
/*
* Always check one page for reclamation.
*
* If the PoD pool is empty, keep checking until some space is found, or
* all entries have been exhausted.
*/
do
{
unsigned int idx = (mrp->idx + i++) % ARRAY_SIZE(mrp->list);
gfn_t gfn = _gfn(mrp->list[idx]);
if ( !gfn_eq(gfn, INVALID_GFN) )
{
if ( gfn_x(gfn) & POD_LAST_SUPERPAGE )
{
gfn = _gfn(gfn_x(gfn) & ~POD_LAST_SUPERPAGE);
if ( p2m_pod_zero_check_superpage(p2m, gfn) == 0 )
{
unsigned int x;
for ( x = 0; x < SUPERPAGE_PAGES; ++x, gfn = gfn_add(gfn, 1) )
p2m_pod_zero_check(p2m, &gfn, 1);
}
}
else
p2m_pod_zero_check(p2m, &gfn, 1);
mrp->list[idx] = gfn_x(INVALID_GFN);
}
} while ( (p2m->pod.count == 0) && (i < ARRAY_SIZE(mrp->list)) );
}
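/* Record a freshly populated gfn so pod_eager_reclaim() can revisit it. */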
static void pod_eager_record(struct p2m_domain *p2m, gfn_t gfn,
unsigned int order)
{
struct pod_mrp_list *mrp = &p2m->pod.mrp;
ASSERT(!gfn_eq(gfn, INVALID_GFN));
mrp->list[mrp->idx++] =
gfn_x(gfn) | (order == PAGE_ORDER_2M ? POD_LAST_SUPERPAGE : 0);
mrp->idx %= ARRAY_SIZE(mrp->list);
}
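/*
 * Handle a demand-populate fault on a PoD entry: take a page of the given
 * order from the PoD cache (sweeping/reclaiming first if the cache is empty)
 * and map it read-write at the order-aligned gfn.  Returns true on success,
 * or when the region has been remapped at a smaller order for the caller to
 * retry; returns false (crashing the domain when out of PoD memory) on
 * failure.
 */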
bool
p2m_pod_demand_populate(struct p2m_domain *p2m, gfn_t gfn,
unsigned int order)
{
struct domain *d = p2m->domain;
struct page_info *p = NULL; /* Compiler warnings */
gfn_t gfn_aligned = _gfn((gfn_x(gfn) >> order) << order);
mfn_t mfn;
unsigned long i;
ASSERT(gfn_locked_by_me(p2m, gfn));
pod_lock(p2m);
/*
* This check is done with the pod lock held. This will make sure that
* even if d->is_dying changes under our feet, p2m_pod_empty_cache()
* won't start until we're done.
*/
if ( unlikely(d->is_dying) )
goto out_fail;
/*
* Because PoD does not keep a cache list of 1GB pages, it has to remap
* the 1GB region as 2MB chunks and retry.
*/
if ( order == PAGE_ORDER_1G )
{
pod_unlock(p2m);
/*
* Note that we are supposed to call p2m_set_entry() 512 times to
* split 1GB into 512 2MB pages here. But we only do it once because
* p2m_set_entry() should automatically shatter the 1GB page into
* 512 2MB pages. The remaining 511 calls are unnecessary.
*
* NOTE: In a fine-grained p2m locking scenario this operation
* may need to promote its locking from gfn->1g superpage
*/
return !p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_2M,
p2m_populate_on_demand, p2m->default_access);
}
/* Only reclaim if we're in actual need of more cache. */
if ( p2m->pod.entry_count > p2m->pod.count )
pod_eager_reclaim(p2m);
/*
* Only sweep if we're actually out of memory. Doing anything else
* causes unnecessary time and fragmentation of superpages in the p2m.
*/
if ( p2m->pod.count == 0 )
p2m_pod_emergency_sweep(p2m);
/* If the sweep failed, give up. */
if ( p2m->pod.count == 0 )
goto out_of_memory;
/* Keep track of the highest gfn demand-populated by a guest fault */
p2m->pod.max_guest = gfn_max(gfn, p2m->pod.max_guest);
/*
* Get a page from the cache. A NULL return value indicates that the
* 2-meg range should be marked singleton PoD, and retried.
*/
if ( (p = p2m_pod_cache_get(p2m, order)) == NULL )
goto remap_and_retry;
mfn = page_to_mfn(p);
BUG_ON((mfn_x(mfn) & ((1UL << order) - 1)) != 0);
if ( p2m_set_entry(p2m, gfn_aligned, mfn, order, p2m_ram_rw,
p2m->default_access) )
{
p2m_pod_cache_add(p2m, p, order);
goto out_fail;
}
for( i = 0; i < (1UL << order); i++ )
{
set_gpfn_from_mfn(mfn_x(mfn) + i, gfn_x(gfn_aligned) + i);
paging_mark_dirty(d, mfn_add(mfn, i));
}
p2m->pod.entry_count -= (1UL << order);
BUG_ON(p2m->pod.entry_count < 0);
pod_eager_record(p2m, gfn_aligned, order);
if ( tb_init_done )
{
struct {
u64 gfn, mfn;
int d:16,order:16;
} t;
t.gfn = gfn_x(gfn);
t.mfn = mfn_x(mfn);
t.d = d->domain_id;
t.order = order;
__trace_var(TRC_MEM_POD_POPULATE, 0, sizeof(t), &t);
}
pod_unlock(p2m);
return true;
out_of_memory:
pod_unlock(p2m);
printk("%s: Dom%d out of PoD memory! (tot=%"PRIu32" ents=%ld dom%d)\n",
__func__, d->domain_id, d->tot_pages, p2m->pod.entry_count,
current->domain->domain_id);
domain_crash(d);
return false;
out_fail:
pod_unlock(p2m);
return false;
remap_and_retry:
BUG_ON(order != PAGE_ORDER_2M);
pod_unlock(p2m);
/*
* Remap this 2-meg region in singleton chunks. See the comment on the
* 1G page splitting path above for why a single call suffices.
*
* NOTE: In a p2m fine-grained lock scenario this might
* need promoting the gfn lock from gfn->2M superpage.
*/
if ( p2m_set_entry(p2m, gfn_aligned, INVALID_MFN, PAGE_ORDER_4K,
p2m_populate_on_demand, p2m->default_access) )
return false;
if ( tb_init_done )
{
struct {
u64 gfn;
int d:16;
} t;
t.gfn = gfn_x(gfn);
t.d = d->domain_id;
__trace_var(TRC_MEM_POD_SUPERPAGE_SPLINTER, 0, sizeof(t), &t);
}
return true;
}
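/*
 * Mark a gfn range as populate-on-demand.  Fails with -EBUSY if any gfn in
 * the range is already backed by RAM; otherwise replaces the entries and
 * adjusts pod.entry_count for the new entries (minus any PoD entries that
 * were already present).
 */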
int
guest_physmap_mark_populate_on_demand(struct domain *d, unsigned long gfn_l,
unsigned int order)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
gfn_t gfn = _gfn(gfn_l);
unsigned long i, n, pod_count = 0;
int rc = 0;
if ( !paging_mode_translate(d) )
return -EINVAL;
gfn_lock(p2m, gfn, order);
P2M_DEBUG("mark pod gfn=%#lx\n", gfn_l);
/* Make sure all gpfns are unused */
for ( i = 0; i < (1UL << order); i += n )
{
p2m_type_t ot;
p2m_access_t a;
unsigned int cur_order;
p2m->get_entry(p2m, gfn_add(gfn, i), &ot, &a, 0, &cur_order, NULL);
n = 1UL << min(order, cur_order);
if ( p2m_is_ram(ot) )
{
P2M_DEBUG("gfn_to_mfn returned type %d!\n", ot);
rc = -EBUSY;
goto out;
}
else if ( ot == p2m_populate_on_demand )
{
/* Count how many PoD entries we'll be replacing if successful */
pod_count += n;
}
}
/* Now, actually do the two-way mapping */
rc = p2m_set_entry(p2m, gfn, INVALID_MFN, order,
p2m_populate_on_demand, p2m->default_access);
if ( rc == 0 )
{
pod_lock(p2m);
p2m->pod.entry_count += 1UL << order;
p2m->pod.entry_count -= pod_count;
BUG_ON(p2m->pod.entry_count < 0);
pod_unlock(p2m);
}
out:
gfn_unlock(p2m, gfn, order);
return rc;
}