/******************************************************************************
* arch/x86/mm/mem_sharing.c
*
* Memory sharing support.
*
* Copyright (c) 2011 GridCentric, Inc. (Adin Scannell & Andres Lagar-Cavilla)
* Copyright (c) 2009 Citrix Systems, Inc. (Grzegorz Milos)
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/types.h>
#include <xen/domain_page.h>
#include <xen/spinlock.h>
#include <xen/rwlock.h>
#include <xen/mm.h>
#include <xen/grant_table.h>
#include <xen/sched.h>
#include <xen/rcupdate.h>
#include <xen/guest_access.h>
#include <xen/vm_event.h>
#include <asm/page.h>
#include <asm/string.h>
#include <asm/p2m.h>
#include <asm/altp2m.h>
#include <asm/atomic.h>
#include <asm/event.h>
#include <xsm/xsm.h>
#include "mm-locks.h"
static shr_handle_t next_handle = 1;
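/* Per-CPU state used by mem_sharing_page_lock()/unlock() to stash the
* mm-locks ordering level and the page-lock recursion count. */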
typedef struct pg_lock_data {
int mm_unlock_level;
unsigned short recurse_count;
} pg_lock_data_t;
static DEFINE_PER_CPU(pg_lock_data_t, __pld);
#define MEM_SHARING_DEBUG(_f, _a...) \
debugtrace_printk("mem_sharing_debug: %s(): " _f, __func__, ##_a)
/* Reverse map defines */
#define RMAP_HASHTAB_ORDER 0
#define RMAP_HASHTAB_SIZE \
((PAGE_SIZE << RMAP_HASHTAB_ORDER) / sizeof(struct list_head))
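/* The rmap starts life as a plain list; rmap_list_to_hash_table() clears
* hash_table.flag (which overlays the list head and is therefore never NULL
* while the list form is in use) when it installs the bucket array. */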
#define RMAP_USES_HASHTAB(page) \
((page)->sharing->hash_table.flag == NULL)
#define RMAP_HEAVY_SHARED_PAGE RMAP_HASHTAB_SIZE
/* A bit of hysteresis. We don't want to keep flipping back and forth
* between the list and hash table representations. */
#define RMAP_LIGHT_SHARED_PAGE (RMAP_HEAVY_SHARED_PAGE >> 2)
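/* With 4KiB pages, an order-0 bucket array and 16-byte list heads on x86-64,
* this works out to 256 buckets: we switch to the hash table at 256 gfns and
* drop back to the plain list at or below 64. */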
#if MEM_SHARING_AUDIT
static struct list_head shr_audit_list;
static spinlock_t shr_audit_lock;
static DEFINE_RCU_READ_LOCK(shr_audit_read_lock);
/* RCU delayed free of audit list entry */
static void _free_pg_shared_info(struct rcu_head *head)
{
xfree(container_of(head, struct page_sharing_info, rcu_head));
}
static inline void audit_add_list(struct page_info *page)
{
INIT_LIST_HEAD(&page->sharing->entry);
spin_lock(&shr_audit_lock);
list_add_rcu(&page->sharing->entry, &shr_audit_list);
spin_unlock(&shr_audit_lock);
}
/* Removes from the audit list and cleans up the page sharing metadata. */
static inline void page_sharing_dispose(struct page_info *page)
{
/* Unlikely given our thresholds, but we should be careful. */
if ( unlikely(RMAP_USES_HASHTAB(page)) )
free_xenheap_pages(page->sharing->hash_table.bucket,
RMAP_HASHTAB_ORDER);
spin_lock(&shr_audit_lock);
list_del_rcu(&page->sharing->entry);
spin_unlock(&shr_audit_lock);
INIT_RCU_HEAD(&page->sharing->rcu_head);
call_rcu(&page->sharing->rcu_head, _free_pg_shared_info);
}
#else
#define audit_add_list(p) ((void)0)
static inline void page_sharing_dispose(struct page_info *page)
{
/* Unlikely given our thresholds, but we should be careful. */
if ( unlikely(RMAP_USES_HASHTAB(page)) )
free_xenheap_pages(page->sharing->hash_table.bucket,
RMAP_HASHTAB_ORDER);
xfree(page->sharing);
}
#endif /* MEM_SHARING_AUDIT */
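/* Page lock wrappers: the sharing code uses the generic page_lock(), but
* must also tell the mm-locks ordering machinery about it and keep
* preemption disabled while the lock is held. */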
static inline int mem_sharing_page_lock(struct page_info *pg)
{
int rc;
pg_lock_data_t *pld = &(this_cpu(__pld));
page_sharing_mm_pre_lock();
rc = page_lock(pg);
if ( rc )
{
preempt_disable();
page_sharing_mm_post_lock(&pld->mm_unlock_level,
&pld->recurse_count);
}
return rc;
}
static inline void mem_sharing_page_unlock(struct page_info *pg)
{
pg_lock_data_t *pld = &(this_cpu(__pld));
page_sharing_mm_unlock(pld->mm_unlock_level,
&pld->recurse_count);
preempt_enable();
page_unlock(pg);
}
static inline shr_handle_t get_next_handle(void)
{
/* Get the next handle, get_page()-style: a lock-free cmpxchg loop. */
uint64_t x, y = next_handle;
do {
x = y;
}
while ( (y = cmpxchg(&next_handle, x, x + 1)) != x );
return x + 1;
}
#define mem_sharing_enabled(d) \
(is_hvm_domain(d) && (d)->arch.hvm_domain.mem_sharing_enabled)
#undef mfn_to_page
#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
#undef page_to_mfn
#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))
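/* Sharing statistics: nr_saved_mfns counts host frames freed thanks to
* sharing; nr_shared_mfns counts distinct shared frames currently in use. */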
static atomic_t nr_saved_mfns = ATOMIC_INIT(0);
static atomic_t nr_shared_mfns = ATOMIC_INIT(0);
/** Reverse map **/
/* Every shared frame keeps a reverse map (rmap) of <domain, gfn> tuples that
* this shared frame backs. For pages with a low degree of sharing, an O(n)
* search linked list is good enough. For pages with a higher degree of
* sharing, we use a hash table instead. */
typedef struct gfn_info
{
unsigned long gfn;
domid_t domain;
struct list_head list;
} gfn_info_t;
static inline void
rmap_init(struct page_info *page)
{
/* We always start off as a doubly linked list. */
INIT_LIST_HEAD(&page->sharing->gfns);
}
/* Exceedingly simple "hash function" */
#define HASH(domain, gfn) \
(((gfn) + (domain)) % RMAP_HASHTAB_SIZE)
/* Conversions. Tuned by the thresholds. Should only happen twice
* (once each) during the lifetime of a shared page */
static inline int
rmap_list_to_hash_table(struct page_info *page)
{
unsigned int i;
struct list_head *pos, *tmp, *b =
alloc_xenheap_pages(RMAP_HASHTAB_ORDER, 0);
if ( b == NULL )
return -ENOMEM;
for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ )
INIT_LIST_HEAD(b + i);
list_for_each_safe(pos, tmp, &page->sharing->gfns)
{
gfn_info_t *gfn_info = list_entry(pos, gfn_info_t, list);
struct list_head *bucket = b + HASH(gfn_info->domain, gfn_info->gfn);
list_del(pos);
list_add(pos, bucket);
}
page->sharing->hash_table.bucket = b;
page->sharing->hash_table.flag = NULL;
return 0;
}
static inline void
rmap_hash_table_to_list(struct page_info *page)
{
unsigned int i;
struct list_head *bucket = page->sharing->hash_table.bucket;
INIT_LIST_HEAD(&page->sharing->gfns);
for ( i = 0; i < RMAP_HASHTAB_SIZE; i++ )
{
struct list_head *pos, *tmp, *head = bucket + i;
list_for_each_safe(pos, tmp, head)
{
list_del(pos);
list_add(pos, &page->sharing->gfns);
}
}
free_xenheap_pages(bucket, RMAP_HASHTAB_ORDER);
}
/* Generic accessors to the rmap */
static inline unsigned long
rmap_count(struct page_info *pg)
{
unsigned long count;
unsigned long t = read_atomic(&pg->u.inuse.type_info);
count = t & PGT_count_mask;
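/* page_lock() adds one to the type count as well as setting PGT_locked,
* so discount that extra reference while the lock is held. */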
if ( t & PGT_locked )
count--;
return count;
}
/* The page type count is always decreased after removing from the rmap.
* Use a convert flag to avoid mutating the rmap if in the middle of an
* iterator, or if the page will be soon destroyed anyways. */
static inline void
rmap_del(gfn_info_t *gfn_info, struct page_info *page, int convert)
{
if ( RMAP_USES_HASHTAB(page) && convert &&
(rmap_count(page) <= RMAP_LIGHT_SHARED_PAGE) )
rmap_hash_table_to_list(page);
/* Regardless of rmap type, same removal operation */
list_del(&gfn_info->list);
}
/* The page type count is always increased before adding to the rmap. */
static inline void
rmap_add(gfn_info_t *gfn_info, struct page_info *page)
{
struct list_head *head;
if ( !RMAP_USES_HASHTAB(page) &&
(rmap_count(page) >= RMAP_HEAVY_SHARED_PAGE) )
/* The conversion may fail with ENOMEM. We'll be less efficient,
* but no reason to panic. */
(void)rmap_list_to_hash_table(page);
head = (RMAP_USES_HASHTAB(page)) ?
page->sharing->hash_table.bucket +
HASH(gfn_info->domain, gfn_info->gfn) :
&page->sharing->gfns;
INIT_LIST_HEAD(&gfn_info->list);
list_add(&gfn_info->list, head);
}
static inline gfn_info_t *
rmap_retrieve(uint16_t domain_id, unsigned long gfn,
struct page_info *page)
{
gfn_info_t *gfn_info;
struct list_head *le, *head;
head = (RMAP_USES_HASHTAB(page)) ?
page->sharing->hash_table.bucket + HASH(domain_id, gfn) :
&page->sharing->gfns;
list_for_each(le, head)
{
gfn_info = list_entry(le, gfn_info_t, list);
if ( (gfn_info->gfn == gfn) && (gfn_info->domain == domain_id) )
return gfn_info;
}
/* Nothing was found */
return NULL;
}
/* Returns true if the rmap has only one entry. O(1) complexity. */
static inline int rmap_has_one_entry(struct page_info *page)
{
return (rmap_count(page) == 1);
}
/* Returns true if the rmap has any entries. O(1) complexity. */
static inline int rmap_has_entries(struct page_info *page)
{
return (rmap_count(page) != 0);
}
/* The iterator hides the details of how the rmap is implemented. This
* involves splitting the list_for_each_safe macro into two steps. */
struct rmap_iterator {
struct list_head *curr;
struct list_head *next;
unsigned int bucket;
};
static inline void
rmap_seed_iterator(struct page_info *page, struct rmap_iterator *ri)
{
ri->curr = (RMAP_USES_HASHTAB(page)) ?
page->sharing->hash_table.bucket :
&page->sharing->gfns;
ri->next = ri->curr->next;
ri->bucket = 0;
}
static inline gfn_info_t *
rmap_iterate(struct page_info *page, struct rmap_iterator *ri)
{
struct list_head *head = (RMAP_USES_HASHTAB(page)) ?
page->sharing->hash_table.bucket + ri->bucket :
&page->sharing->gfns;
retry:
if ( ri->next == head )
{
if ( RMAP_USES_HASHTAB(page) )
{
ri->bucket++;
if ( ri->bucket >= RMAP_HASHTAB_SIZE )
/* No more hash table buckets */
return NULL;
head = page->sharing->hash_table.bucket + ri->bucket;
ri->curr = head;
ri->next = ri->curr->next;
goto retry;
} else
/* List exhausted */
return NULL;
}
ri->curr = ri->next;
ri->next = ri->curr->next;
return list_entry(ri->curr, gfn_info_t, list);
}
static inline gfn_info_t *mem_sharing_gfn_alloc(struct page_info *page,
struct domain *d,
unsigned long gfn)
{
gfn_info_t *gfn_info = xmalloc(gfn_info_t);
if ( gfn_info == NULL )
return NULL;
gfn_info->gfn = gfn;
gfn_info->domain = d->domain_id;
rmap_add(gfn_info, page);
/* Increment our number of shared pages. */
atomic_inc(&d->shr_pages);
return gfn_info;
}
static inline void mem_sharing_gfn_destroy(struct page_info *page,
struct domain *d,
gfn_info_t *gfn_info)
{
/* Decrement the number of pages. */
atomic_dec(&d->shr_pages);
/* Free the gfn_info structure. */
rmap_del(gfn_info, page, 1);
xfree(gfn_info);
}
static struct page_info* mem_sharing_lookup(unsigned long mfn)
{
if ( mfn_valid(_mfn(mfn)) )
{
struct page_info* page = mfn_to_page(_mfn(mfn));
if ( page_get_owner(page) == dom_cow )
{
/* Count has to be at least two, because we're called
* with the mfn locked (1) and this is supposed to be
* a shared page (1). */
unsigned long t = read_atomic(&page->u.inuse.type_info);
ASSERT((t & PGT_type_mask) == PGT_shared_page);
ASSERT((t & PGT_count_mask) >= 2);
ASSERT(get_gpfn_from_mfn(mfn) == SHARED_M2P_ENTRY);
return page;
}
}
return NULL;
}
static int audit(void)
{
#if MEM_SHARING_AUDIT
int errors = 0;
unsigned long count_expected;
unsigned long count_found = 0;
struct list_head *ae;
count_expected = atomic_read(&nr_shared_mfns);
rcu_read_lock(&shr_audit_read_lock);
list_for_each_rcu(ae, &shr_audit_list)
{
struct page_sharing_info *pg_shared_info;
unsigned long nr_gfns = 0;
struct page_info *pg;
mfn_t mfn;
gfn_info_t *g;
struct rmap_iterator ri;
pg_shared_info = list_entry(ae, struct page_sharing_info, entry);
pg = pg_shared_info->pg;
mfn = page_to_mfn(pg);
/* If we can't lock it, it's definitely not a shared page */
if ( !mem_sharing_page_lock(pg) )
{
MEM_SHARING_DEBUG("mfn %lx in audit list, but cannot be locked (%lx)!\n",
mfn_x(mfn), pg->u.inuse.type_info);
errors++;
continue;
}
/* Check if the MFN has correct type, owner and handle. */
if ( (pg->u.inuse.type_info & PGT_type_mask) != PGT_shared_page )
{
MEM_SHARING_DEBUG("mfn %lx in audit list, but not PGT_shared_page (%lx)!\n",
mfn_x(mfn), pg->u.inuse.type_info & PGT_type_mask);
errors++;
continue;
}
/* Check the page owner. */
if ( page_get_owner(pg) != dom_cow )
{
MEM_SHARING_DEBUG("mfn %lx shared, but wrong owner (%hu)!\n",
mfn_x(mfn), page_get_owner(pg)->domain_id);
errors++;
}
/* Check the m2p entry */
if ( get_gpfn_from_mfn(mfn_x(mfn)) != SHARED_M2P_ENTRY )
{
MEM_SHARING_DEBUG("mfn %lx shared, but wrong m2p entry (%lx)!\n",
mfn_x(mfn), get_gpfn_from_mfn(mfn_x(mfn)));
errors++;
}
/* Check we have a list */
if ( (!pg->sharing) || !rmap_has_entries(pg) )
{
MEM_SHARING_DEBUG("mfn %lx shared, but empty gfn list!\n",
mfn_x(mfn));
errors++;
continue;
}
/* We've found a page that is shared */
count_found++;
/* Check if all GFNs map to the MFN, and the p2m types */
rmap_seed_iterator(pg, &ri);
while ( (g = rmap_iterate(pg, &ri)) != NULL )
{
struct domain *d;
p2m_type_t t;
mfn_t o_mfn;
d = get_domain_by_id(g->domain);
if ( d == NULL )
{
MEM_SHARING_DEBUG("Unknown dom: %hu, for PFN=%lx, MFN=%lx\n",
g->domain, g->gfn, mfn_x(mfn));
errors++;
continue;
}
o_mfn = get_gfn_query_unlocked(d, g->gfn, &t);
if ( mfn_x(o_mfn) != mfn_x(mfn) )
{
MEM_SHARING_DEBUG("Incorrect P2M for d=%hu, PFN=%lx."
"Expecting MFN=%lx, got %lx\n",
g->domain, g->gfn, mfn_x(mfn), mfn_x(o_mfn));
errors++;
}
if ( t != p2m_ram_shared )
{
MEM_SHARING_DEBUG("Incorrect P2M type for d=%hu, PFN=%lx MFN=%lx."
"Expecting t=%d, got %d\n",
g->domain, g->gfn, mfn_x(mfn), p2m_ram_shared, t);
errors++;
}
put_domain(d);
nr_gfns++;
}
/* The type count has an extra ref because we have locked the page */
if ( (nr_gfns + 1) != (pg->u.inuse.type_info & PGT_count_mask) )
{
MEM_SHARING_DEBUG("Mismatched counts for MFN=%lx."
"nr_gfns in list %lu, in type_info %lx\n",
mfn_x(mfn), nr_gfns,
(pg->u.inuse.type_info & PGT_count_mask));
errors++;
}
mem_sharing_page_unlock(pg);
}
rcu_read_unlock(&shr_audit_read_lock);
if ( count_found != count_expected )
{
MEM_SHARING_DEBUG("Expected %ld shared mfns, found %ld.",
count_expected, count_found);
errors++;
}
return errors;
#else
return -EOPNOTSUPP;
#endif
}
int mem_sharing_notify_enomem(struct domain *d, unsigned long gfn,
bool_t allow_sleep)
{
struct vcpu *v = current;
int rc;
vm_event_request_t req = {
.reason = VM_EVENT_REASON_MEM_SHARING,
.vcpu_id = v->vcpu_id,
.u.mem_sharing.gfn = gfn,
.u.mem_sharing.p2mt = p2m_ram_shared
};
if ( (rc = __vm_event_claim_slot(d,
d->vm_event_share, allow_sleep)) < 0 )
return rc;
if ( v->domain == d )
{
req.flags = VM_EVENT_FLAG_VCPU_PAUSED;
vm_event_vcpu_pause(v);
}
vm_event_put_request(d, d->vm_event_share, &req);
return 0;
}
unsigned int mem_sharing_get_nr_saved_mfns(void)
{
return ((unsigned int)atomic_read(&nr_saved_mfns));
}
unsigned int mem_sharing_get_nr_shared_mfns(void)
{
return (unsigned int)atomic_read(&nr_shared_mfns);
}
/* Functions that change a page's type and ownership */
static int page_make_sharable(struct domain *d,
struct page_info *page,
int expected_refcnt)
{
bool_t drop_dom_ref;
spin_lock(&d->page_alloc_lock);
if ( d->is_dying )
{
spin_unlock(&d->page_alloc_lock);
return -EBUSY;
}
/* Change page type and count atomically */
if ( !get_page_and_type(page, d, PGT_shared_page) )
{
spin_unlock(&d->page_alloc_lock);
return -EINVAL;
}
/* Check it wasn't already sharable and undo if it was */
if ( (page->u.inuse.type_info & PGT_count_mask) != 1 )
{
spin_unlock(&d->page_alloc_lock);
put_page_and_type(page);
return -EEXIST;
}
/* Check that the reference count is 2 + expected_refcnt: one from
* PGC_allocated, one from the get_page_and_type() above, plus whatever
* extra references the caller expects to still be outstanding. */
if ( page->count_info != (PGC_allocated | (2 + expected_refcnt)) )
{
spin_unlock(&d->page_alloc_lock);
/* Return type count back to zero */
put_page_and_type(page);
return -E2BIG;
}
page_set_owner(page, dom_cow);
drop_dom_ref = !domain_adjust_tot_pages(d, -1);
page_list_del(page, &d->page_list);
spin_unlock(&d->page_alloc_lock);
if ( drop_dom_ref )
put_domain(d);
return 0;
}
static int page_make_private(struct domain *d, struct page_info *page)
{
unsigned long expected_type;
if ( !get_page(page, dom_cow) )
return -EINVAL;
spin_lock(&d->page_alloc_lock);
if ( d->is_dying )
{
spin_unlock(&d->page_alloc_lock);
put_page(page);
return -EBUSY;
}
/* We can only change the type if count is one */
/* Because we are locking pages individually, we need to drop
* the lock here, while the page is typed. We cannot risk the
* race of page_unlock and then put_page_type. */
expected_type = (PGT_shared_page | PGT_validated | PGT_locked | 2);
if ( page->u.inuse.type_info != expected_type )
{
spin_unlock(&d->page_alloc_lock);
put_page(page);
return -EEXIST;
}
/* Drop the final typecount */
put_page_and_type(page);
/* Now that we've dropped the type, we can unlock */
mem_sharing_page_unlock(page);
/* Change the owner */
ASSERT(page_get_owner(page) == dom_cow);
page_set_owner(page, d);
if ( domain_adjust_tot_pages(d, 1) == 1 )
get_knownalive_domain(d);
page_list_add_tail(page, &d->page_list);
spin_unlock(&d->page_alloc_lock);
put_page(page);
return 0;
}
static inline struct page_info *__grab_shared_page(mfn_t mfn)
{
struct page_info *pg = NULL;
if ( !mfn_valid(mfn) )
return NULL;
pg = mfn_to_page(mfn);
/* If the page is not validated we can't lock it, and if it's
* not validated it's obviously not shared. */
if ( !mem_sharing_page_lock(pg) )
return NULL;
if ( mem_sharing_lookup(mfn_x(mfn)) == NULL )
{
mem_sharing_page_unlock(pg);
return NULL;
}
return pg;
}
static int debug_mfn(mfn_t mfn)
{
struct page_info *page;
int num_refs;
if ( (page = __grab_shared_page(mfn)) == NULL)
{
gdprintk(XENLOG_ERR, "Invalid MFN=%lx\n", mfn_x(mfn));
return -EINVAL;
}
MEM_SHARING_DEBUG(
"Debug page: MFN=%lx is ci=%lx, ti=%lx, owner_id=%d\n",
mfn_x(page_to_mfn(page)),
page->count_info,
page->u.inuse.type_info,
page_get_owner(page)->domain_id);
/* -1 because the page is locked and that's an additional type ref */
num_refs = ((int) (page->u.inuse.type_info & PGT_count_mask)) - 1;
mem_sharing_page_unlock(page);
return num_refs;
}
static int debug_gfn(struct domain *d, gfn_t gfn)
{
p2m_type_t p2mt;
mfn_t mfn;
int num_refs;
mfn = get_gfn_query(d, gfn_x(gfn), &p2mt);
MEM_SHARING_DEBUG("Debug for dom%d, gfn=%" PRI_gfn "\n",
d->domain_id, gfn_x(gfn));
num_refs = debug_mfn(mfn);
put_gfn(d, gfn_x(gfn));
return num_refs;
}
static int debug_gref(struct domain *d, grant_ref_t ref)
{
int rc;
uint16_t status;
gfn_t gfn;
rc = mem_sharing_gref_to_gfn(d->grant_table, ref, &gfn, &status);
if ( rc )
{
MEM_SHARING_DEBUG("Asked to debug [dom=%d,gref=%u]: error %d.\n",
d->domain_id, ref, rc);
return rc;
}
MEM_SHARING_DEBUG(
"==> Grant [dom=%d,ref=%d], status=%x. ",
d->domain_id, ref, status);
return debug_gfn(d, gfn);
}
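/* Nominate a gfn for sharing: type its frame PGT_shared_page, hand ownership
* to dom_cow, set up the sharing metadata and rmap, switch the p2m entry to
* p2m_ram_shared, and return a handle used to validate later share requests. */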
static int nominate_page(struct domain *d, gfn_t gfn,
int expected_refcnt, shr_handle_t *phandle)
{
struct p2m_domain *hp2m = p2m_get_hostp2m(d);
p2m_type_t p2mt;
p2m_access_t p2ma;
mfn_t mfn;
struct page_info *page = NULL; /* gcc... */
int ret;
*phandle = 0UL;
mfn = get_gfn_type_access(hp2m, gfn_x(gfn), &p2mt, &p2ma, 0, NULL);
/* Check if mfn is valid */
ret = -EINVAL;
if ( !mfn_valid(mfn) )
goto out;
/* Return the handle if the page is already shared */
if ( p2m_is_shared(p2mt) ) {
struct page_info *pg = __grab_shared_page(mfn);
if ( !pg )
{
gprintk(XENLOG_ERR,
"Shared p2m entry gfn %" PRI_gfn ", but could not grab mfn %" PRI_mfn " dom%d\n",
gfn_x(gfn), mfn_x(mfn), d->domain_id);
BUG();
}
*phandle = pg->sharing->handle;
ret = 0;
mem_sharing_page_unlock(pg);
goto out;
}
/* Check p2m type */
if ( !p2m_is_sharable(p2mt) )
goto out;
/* Check if there are mem_access/remapped altp2m entries for this page */
if ( altp2m_active(d) )
{
unsigned int i;
struct p2m_domain *ap2m;
mfn_t amfn;
p2m_type_t ap2mt;
p2m_access_t ap2ma;
altp2m_list_lock(d);
for ( i = 0; i < MAX_ALTP2M; i++ )
{
ap2m = d->arch.altp2m_p2m[i];
if ( !ap2m )
continue;
amfn = get_gfn_type_access(ap2m, gfn_x(gfn), &ap2mt, &ap2ma, 0, NULL);
if ( mfn_valid(amfn) && (!mfn_eq(amfn, mfn) || ap2ma != p2ma) )
{
altp2m_list_unlock(d);
goto out;
}
}
altp2m_list_unlock(d);
}
/* Try to convert the mfn to the sharable type */
page = mfn_to_page(mfn);
ret = page_make_sharable(d, page, expected_refcnt);
if ( ret )
goto out;
/* Now that the page is validated, we can lock it. There is no
* race because we're holding the p2m entry, so no one else
* could be nominating this gfn */
ret = -ENOENT;
if ( !mem_sharing_page_lock(page) )
goto out;
/* Initialize the shared state */
ret = -ENOMEM;
if ( (page->sharing =
xmalloc(struct page_sharing_info)) == NULL )
{
/* Making a page private atomically unlocks it */
BUG_ON(page_make_private(d, page) != 0);
goto out;
}
page->sharing->pg = page;
rmap_init(page);
/* Create the handle */
page->sharing->handle = get_next_handle();
/* Create the local gfn info */
if ( mem_sharing_gfn_alloc(page, d, gfn_x(gfn)) == NULL )
{
xfree(page->sharing);
page->sharing = NULL;
BUG_ON(page_make_private(d, page) != 0);
goto out;
}
/* Change the p2m type, should never fail with p2m locked. */
BUG_ON(p2m_change_type_one(d, gfn_x(gfn), p2mt, p2m_ram_shared));
/* Account for this page. */
atomic_inc(&nr_shared_mfns);
/* Update m2p entry to SHARED_M2P_ENTRY */
set_gpfn_from_mfn(mfn_x(mfn), SHARED_M2P_ENTRY);
*phandle = page->sharing->handle;
audit_add_list(page);
mem_sharing_page_unlock(page);
ret = 0;
out:
put_gfn(d, gfn_x(gfn));
return ret;
}
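/* Merge two nominated frames, identified by their handles: every <domain,gfn>
* in the client page's rmap is moved over to the source frame's rmap and
* remapped to it, after which the now-unused client frame is freed. */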
static int share_pages(struct domain *sd, gfn_t sgfn, shr_handle_t sh,
struct domain *cd, gfn_t cgfn, shr_handle_t ch)
{
struct page_info *spage, *cpage, *firstpg, *secondpg;
gfn_info_t *gfn;
struct domain *d;
int ret = -EINVAL;
mfn_t smfn, cmfn;
p2m_type_t smfn_type, cmfn_type;
struct two_gfns tg;
struct rmap_iterator ri;
get_two_gfns(sd, gfn_x(sgfn), &smfn_type, NULL, &smfn,
cd, gfn_x(cgfn), &cmfn_type, NULL, &cmfn,
0, &tg);
/* This tricky business is to avoid two callers deadlocking if
* grabbing pages in opposite client/source order */
if ( mfn_x(smfn) == mfn_x(cmfn) )
{
/* The pages are already the same. We could return some
* kind of error here, but no matter how you look at it,
* the pages are already 'shared'. It possibly represents
* a big problem somewhere else, but as far as sharing is
* concerned: great success! */
ret = 0;
goto err_out;
}
else if ( mfn_x(smfn) < mfn_x(cmfn) )
{
ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
spage = firstpg = __grab_shared_page(smfn);
if ( spage == NULL )
goto err_out;
ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
cpage = secondpg = __grab_shared_page(cmfn);
if ( cpage == NULL )
{
mem_sharing_page_unlock(spage);
goto err_out;
}
} else {
ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
cpage = firstpg = __grab_shared_page(cmfn);
if ( cpage == NULL )
goto err_out;
ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
spage = secondpg = __grab_shared_page(smfn);
if ( spage == NULL )
{
mem_sharing_page_unlock(cpage);
goto err_out;
}
}
ASSERT(smfn_type == p2m_ram_shared);
ASSERT(cmfn_type == p2m_ram_shared);
/* Check that the handles match */
if ( spage->sharing->handle != sh )
{
ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
mem_sharing_page_unlock(secondpg);
mem_sharing_page_unlock(firstpg);
goto err_out;
}
if ( cpage->sharing->handle != ch )
{
ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
mem_sharing_page_unlock(secondpg);
mem_sharing_page_unlock(firstpg);
goto err_out;
}
/* Merge the lists together */
rmap_seed_iterator(cpage, &ri);
while ( (gfn = rmap_iterate(cpage, &ri)) != NULL)
{
/* Get the source page and type, this should never fail:
* we are under shr lock, and got a successful lookup */
BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page));
/* Move the gfn_info from client list to source list.
* Don't change the type of rmap for the client page. */
rmap_del(gfn, cpage, 0);
rmap_add(gfn, spage);
put_page_and_type(cpage);
d = get_domain_by_id(gfn->domain);
BUG_ON(!d);
BUG_ON(set_shared_p2m_entry(d, gfn->gfn, smfn));
put_domain(d);
}
ASSERT(list_empty(&cpage->sharing->gfns));
/* Clear the rest of the shared state */
page_sharing_dispose(cpage);
cpage->sharing = NULL;
mem_sharing_page_unlock(secondpg);
mem_sharing_page_unlock(firstpg);
/* Free the client page */
if ( test_and_clear_bit(_PGC_allocated, &cpage->count_info) )
put_page(cpage);
/* We managed to free a domain page. */
atomic_dec(&nr_shared_mfns);
atomic_inc(&nr_saved_mfns);
ret = 0;
err_out:
put_two_gfns(&tg);
return ret;
}
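/* Map an already-shared source frame into a hole in the client's physmap,
* without requiring the client gfn to have been nominated first. */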
int mem_sharing_add_to_physmap(struct domain *sd, unsigned long sgfn, shr_handle_t sh,
struct domain *cd, unsigned long cgfn)
{
struct page_info *spage;
int ret = -EINVAL;
mfn_t smfn, cmfn;
p2m_type_t smfn_type, cmfn_type;
struct gfn_info *gfn_info;
struct p2m_domain *p2m = p2m_get_hostp2m(cd);
p2m_access_t a;
struct two_gfns tg;
get_two_gfns(sd, sgfn, &smfn_type, NULL, &smfn,
cd, cgfn, &cmfn_type, &a, &cmfn,
0, &tg);
/* Get the source shared page, check and lock */
ret = XENMEM_SHARING_OP_S_HANDLE_INVALID;
spage = __grab_shared_page(smfn);
if ( spage == NULL )
goto err_out;
ASSERT(smfn_type == p2m_ram_shared);
/* Check that the handles match */
if ( spage->sharing->handle != sh )
goto err_unlock;
/* Make sure the target page is a hole in the physmap. These are typically
* p2m_mmio_dm, but also accept p2m_invalid and paged out pages. See the
* definition of p2m_is_hole in p2m.h. */
if ( !p2m_is_hole(cmfn_type) )
{
ret = XENMEM_SHARING_OP_C_HANDLE_INVALID;
goto err_unlock;
}
/* This is simpler than regular sharing */
BUG_ON(!get_page_and_type(spage, dom_cow, PGT_shared_page));
if ( (gfn_info = mem_sharing_gfn_alloc(spage, cd, cgfn)) == NULL )
{
put_page_and_type(spage);
ret = -ENOMEM;
goto err_unlock;
}
ret = p2m_set_entry(p2m, _gfn(cgfn), smfn, PAGE_ORDER_4K,
p2m_ram_shared, a);
/* Tempted to turn this into an assert */
if ( ret )
{
mem_sharing_gfn_destroy(spage, cd, gfn_info);
put_page_and_type(spage);
} else {
/* There is a chance we're plugging a hole where a paged out page was */
if ( p2m_is_paging(cmfn_type) && (cmfn_type != p2m_ram_paging_out) )
{
atomic_dec(&cd->paged_pages);
/* Further, there is a chance this was a valid page. Don't leak it. */
if ( mfn_valid(cmfn) )
{
struct page_info *cpage = mfn_to_page(cmfn);
ASSERT(cpage != NULL);
if ( test_and_clear_bit(_PGC_allocated, &cpage->count_info) )
put_page(cpage);
}
}
/* Only count a saved frame if the p2m entry was actually installed. */
atomic_inc(&nr_saved_mfns);
}
err_unlock:
mem_sharing_page_unlock(spage);
err_out:
put_two_gfns(&tg);
return ret;
}
/* A note on the rationale for unshare error handling:
* 1. Unshare can only fail with ENOMEM. Any other error conditions BUG_ON()'s
* 2. We notify a potential dom0 helper through a vm_event ring. But we
* allow the notification not to go to sleep. If the event ring is already
* full of ENOMEM warnings, the helper is already on the ball.
* 3. We cannot go to sleep until the unshare is resolved, because we might
* be buried deep into locks (e.g. something -> copy_to_user -> __hvm_copy)
* 4. So, we make sure we:
* 4.1. return an error
* 4.2. do not corrupt shared memory
* 4.3. do not corrupt guest memory
* 4.4. let the guest deal with it if the error propagation will reach it
*/
int __mem_sharing_unshare_page(struct domain *d,
unsigned long gfn,
uint16_t flags)
{
p2m_type_t p2mt;
mfn_t mfn;
struct page_info *page, *old_page;
int last_gfn;
gfn_info_t *gfn_info = NULL;
mfn = get_gfn(d, gfn, &p2mt);
/* Has someone already unshared it? */
if ( !p2m_is_shared(p2mt) ) {
put_gfn(d, gfn);
return 0;
}
page = __grab_shared_page(mfn);
if ( page == NULL )
{
gdprintk(XENLOG_ERR, "Domain p2m is shared, but page is not: "
"%lx\n", gfn);
BUG();
}
gfn_info = rmap_retrieve(d->domain_id, gfn, page);
if ( unlikely(gfn_info == NULL) )
{
gdprintk(XENLOG_ERR, "Could not find gfn_info for shared gfn: "
"%lx\n", gfn);
BUG();
}
/* Do the accounting first. If anything fails below, we have bigger
* fish to fry. First, remove the gfn from the list. */
last_gfn = rmap_has_one_entry(page);
if ( last_gfn )
{
/* Clean up shared state. Get rid of the tuple
* before destroying the rmap. */
mem_sharing_gfn_destroy(page, d, gfn_info);
page_sharing_dispose(page);
page->sharing = NULL;
atomic_dec(&nr_shared_mfns);
}
else
atomic_dec(&nr_saved_mfns);
/* If the GFN is getting destroyed drop the references to MFN
* (possibly freeing the page), and exit early */
if ( flags & MEM_SHARING_DESTROY_GFN )
{
if ( !last_gfn )
mem_sharing_gfn_destroy(page, d, gfn_info);
put_page_and_type(page);
mem_sharing_page_unlock(page);
if ( last_gfn &&
test_and_clear_bit(_PGC_allocated, &page->count_info) )
put_page(page);
put_gfn(d, gfn);
return 0;
}
if ( last_gfn )
{
/* Making a page private atomically unlocks it */
BUG_ON(page_make_private(d, page) != 0);
goto private_page_found;
}
old_page = page;
page = alloc_domheap_page(d, 0);
if ( !page )
{
/* Undo dec of nr_saved_mfns, as the retry will decrease again. */
atomic_inc(&nr_saved_mfns);
mem_sharing_page_unlock(old_page);
put_gfn(d, gfn);
/* Caller is responsible for placing an event
* in the ring */
return -ENOMEM;
}
copy_domain_page(page_to_mfn(page), page_to_mfn(old_page));
BUG_ON(set_shared_p2m_entry(d, gfn, page_to_mfn(page)));
mem_sharing_gfn_destroy(old_page, d, gfn_info);
mem_sharing_page_unlock(old_page);
put_page_and_type(old_page);
private_page_found:
if ( p2m_change_type_one(d, gfn, p2m_ram_shared, p2m_ram_rw) )
{
gdprintk(XENLOG_ERR, "Could not change p2m type d %hu gfn %lx.\n",
d->domain_id, gfn);
BUG();
}
/* Update m2p entry */
set_gpfn_from_mfn(mfn_x(page_to_mfn(page)), gfn);
/* Now that the gfn<->mfn map is properly established,
* marking dirty is feasible */
paging_mark_dirty(d, page_to_mfn(page));
/* We do not need to unlock a private page */
put_gfn(d, gfn);
return 0;
}
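/* Domain teardown helper: walk the p2m and force-unshare (destroy) every
* remaining shared entry, checking for preemption as we go. */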
int relinquish_shared_pages(struct domain *d)
{
int rc = 0;
struct p2m_domain *p2m = p2m_get_hostp2m(d);
unsigned long gfn, count = 0;
if ( p2m == NULL )
return 0;
p2m_lock(p2m);
for ( gfn = p2m->next_shared_gfn_to_relinquish;
gfn <= p2m->max_mapped_pfn; gfn++ )
{
p2m_access_t a;
p2m_type_t t;
mfn_t mfn;
int set_rc;
if ( atomic_read(&d->shr_pages) == 0 )
break;
mfn = p2m->get_entry(p2m, _gfn(gfn), &t, &a, 0, NULL, NULL);
if ( mfn_valid(mfn) && (t == p2m_ram_shared) )
{
/* Does not fail with ENOMEM given the DESTROY flag */
BUG_ON(__mem_sharing_unshare_page(d, gfn,
MEM_SHARING_DESTROY_GFN));
/* Clear out the p2m entry so no one else may try to
* unshare. Must succeed: we just read the old entry and
* we hold the p2m lock. */
set_rc = p2m->set_entry(p2m, _gfn(gfn), _mfn(0), PAGE_ORDER_4K,
p2m_invalid, p2m_access_rwx, -1);
ASSERT(set_rc == 0);
count += 0x10;
}
else
++count;
/* Preempt every 2MiB (shared) or 32MiB (unshared) - arbitrary. */
if ( count >= 0x2000 )
{
if ( hypercall_preempt_check() )
{
p2m->next_shared_gfn_to_relinquish = gfn + 1;
rc = -ERESTART;
break;
}
count = 0;
}
}
p2m_unlock(p2m);
return rc;
}
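/* Share an identical range of gfns between two (paused) domains by
* nominating and sharing each gfn in turn; range->opaque carries the
* continuation point across hypercall preemption. */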
static int range_share(struct domain *d, struct domain *cd,
struct mem_sharing_op_range *range)
{
int rc = 0;
shr_handle_t sh, ch;
unsigned long start = range->opaque ?: range->first_gfn;
while ( range->last_gfn >= start )
{
/*
* We only break out if we run out of memory as individual pages may
* legitimately be unsharable and we just want to skip over those.
*/
rc = nominate_page(d, _gfn(start), 0, &sh);
if ( rc == -ENOMEM )
break;
if ( !rc )
{
rc = nominate_page(cd, _gfn(start), 0, &ch);
if ( rc == -ENOMEM )
break;
if ( !rc )
{
/* If we get here this should be guaranteed to succeed. */
rc = share_pages(d, _gfn(start), sh, cd, _gfn(start), ch);
ASSERT(!rc);
}
}
/* Check for continuation if it's not the last iteration. */
if ( range->last_gfn >= ++start && hypercall_preempt_check() )
{
rc = 1;
break;
}
}
range->opaque = start;
/*
* The last page may fail with -EINVAL, and for range sharing we don't
* care about that.
*/
if ( range->last_gfn < start && rc == -EINVAL )
rc = 0;
return rc;
}
int mem_sharing_memop(XEN_GUEST_HANDLE_PARAM(xen_mem_sharing_op_t) arg)
{
int rc;
xen_mem_sharing_op_t mso;
struct domain *d;
rc = -EFAULT;
if ( copy_from_guest(&mso, arg, 1) )
return rc;
if ( mso.op == XENMEM_sharing_op_audit )
return audit();
rc = rcu_lock_live_remote_domain_by_id(mso.domain, &d);
if ( rc )
return rc;
rc = xsm_mem_sharing(XSM_DM_PRIV, d);
if ( rc )
goto out;
/* Only HAP is supported */
rc = -ENODEV;
if ( !hap_enabled(d) || !d->arch.hvm_domain.mem_sharing_enabled )
goto out;
switch ( mso.op )
{
case XENMEM_sharing_op_nominate_gfn:
{
shr_handle_t handle;
rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
goto out;
rc = nominate_page(d, _gfn(mso.u.nominate.u.gfn), 0, &handle);
mso.u.nominate.handle = handle;
}
break;
case XENMEM_sharing_op_nominate_gref:
{
grant_ref_t gref = mso.u.nominate.u.grant_ref;
gfn_t gfn;
shr_handle_t handle;
rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
goto out;
rc = mem_sharing_gref_to_gfn(d->grant_table, gref, &gfn, NULL);
if ( rc < 0 )
goto out;
rc = nominate_page(d, gfn, 3, &handle);
mso.u.nominate.handle = handle;
}
break;
case XENMEM_sharing_op_share:
{
gfn_t sgfn, cgfn;
struct domain *cd;
shr_handle_t sh, ch;
rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
goto out;
rc = rcu_lock_live_remote_domain_by_id(mso.u.share.client_domain,
&cd);
if ( rc )
goto out;
rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mso.op);
if ( rc )
{
rcu_unlock_domain(cd);
goto out;
}
if ( !mem_sharing_enabled(cd) )
{
rcu_unlock_domain(cd);
rc = -EINVAL;
goto out;
}
if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.source_gfn) )
{
grant_ref_t gref = (grant_ref_t)
(XENMEM_SHARING_OP_FIELD_GET_GREF(
mso.u.share.source_gfn));
rc = mem_sharing_gref_to_gfn(d->grant_table, gref, &sgfn,
NULL);
if ( rc < 0 )
{
rcu_unlock_domain(cd);
goto out;
}
}
else
sgfn = _gfn(mso.u.share.source_gfn);
if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.client_gfn) )
{
grant_ref_t gref = (grant_ref_t)
(XENMEM_SHARING_OP_FIELD_GET_GREF(
mso.u.share.client_gfn));
rc = mem_sharing_gref_to_gfn(cd->grant_table, gref, &cgfn,
NULL);
if ( rc < 0 )
{
rcu_unlock_domain(cd);
goto out;
}
}
else
cgfn = _gfn(mso.u.share.client_gfn);
sh = mso.u.share.source_handle;
ch = mso.u.share.client_handle;
rc = share_pages(d, sgfn, sh, cd, cgfn, ch);
rcu_unlock_domain(cd);
}
break;
case XENMEM_sharing_op_add_physmap:
{
unsigned long sgfn, cgfn;
struct domain *cd;
shr_handle_t sh;
rc = -EINVAL;
if ( !mem_sharing_enabled(d) )
goto out;
rc = rcu_lock_live_remote_domain_by_id(mso.u.share.client_domain,
&cd);
if ( rc )
goto out;
rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd, mso.op);
if ( rc )
{
rcu_unlock_domain(cd);
goto out;
}
if ( !mem_sharing_enabled(cd) )
{
rcu_unlock_domain(cd);
rc = -EINVAL;
goto out;
}
if ( XENMEM_SHARING_OP_FIELD_IS_GREF(mso.u.share.source_gfn) )
{
/* Cannot add a gref to the physmap */
rcu_unlock_domain(cd);
rc = -EINVAL;
goto out;
}
sgfn = mso.u.share.source_gfn;
sh = mso.u.share.source_handle;
cgfn = mso.u.share.client_gfn;
rc = mem_sharing_add_to_physmap(d, sgfn, sh, cd, cgfn);
rcu_unlock_domain(cd);
}
break;
case XENMEM_sharing_op_range_share:
{
unsigned long max_sgfn, max_cgfn;
struct domain *cd;
rc = -EINVAL;
if ( mso.u.range._pad[0] || mso.u.range._pad[1] ||
mso.u.range._pad[2] )
goto out;
/*
* We use opaque for the hypercall continuation value.
* Ideally the user sets this to 0 in the beginning but
* there is no good way of enforcing that here, so we just check
* that it's at least in range.
*/
if ( mso.u.range.opaque &&
(mso.u.range.opaque < mso.u.range.first_gfn ||
mso.u.range.opaque > mso.u.range.last_gfn) )
goto out;
if ( !mem_sharing_enabled(d) )
goto out;
rc = rcu_lock_live_remote_domain_by_id(mso.u.range.client_domain,
&cd);
if ( rc )
goto out;
/*
* We reuse XENMEM_sharing_op_share XSM check here as this is
* essentially the same concept repeated over multiple pages.
*/
rc = xsm_mem_sharing_op(XSM_DM_PRIV, d, cd,
XENMEM_sharing_op_share);
if ( rc )
{
rcu_unlock_domain(cd);
goto out;
}
if ( !mem_sharing_enabled(cd) )
{
rcu_unlock_domain(cd);
rc = -EINVAL;
goto out;
}
/*
* Sanity check only, the client should keep the domains paused for
* the duration of this op.
*/
if ( !atomic_read(&d->pause_count) ||
!atomic_read(&cd->pause_count) )
{
rcu_unlock_domain(cd);
rc = -EINVAL;
goto out;
}
max_sgfn = domain_get_maximum_gpfn(d);
max_cgfn = domain_get_maximum_gpfn(cd);
if ( max_sgfn < mso.u.range.first_gfn ||
max_sgfn < mso.u.range.last_gfn ||
max_cgfn < mso.u.range.first_gfn ||
max_cgfn < mso.u.range.last_gfn )
{
rcu_unlock_domain(cd);
rc = -EINVAL;
goto out;
}
rc = range_share(d, cd, &mso.u.range);
rcu_unlock_domain(cd);
if ( rc > 0 )
{
if ( __copy_to_guest(arg, &mso, 1) )
rc = -EFAULT;
else
rc = hypercall_create_continuation(__HYPERVISOR_memory_op,
"lh", XENMEM_sharing_op,
arg);
}
else
mso.u.range.opaque = 0;
}
break;
case XENMEM_sharing_op_debug_gfn:
rc = debug_gfn(d, _gfn(mso.u.debug.u.gfn));
break;
case XENMEM_sharing_op_debug_gref:
rc = debug_gref(d, mso.u.debug.u.gref);
break;
default:
rc = -ENOSYS;
break;
}
if ( !rc && __copy_to_guest(arg, &mso, 1) )
rc = -EFAULT;
out:
rcu_unlock_domain(d);
return rc;
}
int mem_sharing_domctl(struct domain *d, struct xen_domctl_mem_sharing_op *mec)
{
int rc;
/* Only HAP is supported */
if ( !hap_enabled(d) )
return -ENODEV;
switch(mec->op)
{
case XEN_DOMCTL_MEM_SHARING_CONTROL:
{
rc = 0;
if ( unlikely(need_iommu(d) && mec->u.enable) )
rc = -EXDEV;
else
d->arch.hvm_domain.mem_sharing_enabled = mec->u.enable;
}
break;
default:
rc = -ENOSYS;
}
return rc;
}
void __init mem_sharing_init(void)
{
printk("Initing memory sharing.\n");
#if MEM_SHARING_AUDIT
spin_lock_init(&shr_audit_lock);
INIT_LIST_HEAD(&shr_audit_list);
#endif
}