/*
* ept-p2m.c: use the EPT page table as p2m
* Copyright (c) 2007, Intel Corporation.
*
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
*
* This program is distributed in the hope it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along with
 * this program; If not, see <http://www.gnu.org/licenses/>.
*/
#include <xen/domain_page.h>
#include <xen/sched.h>
#include <xen/iommu.h>
#include <xen/keyhandler.h>
#include <xen/softirq.h>
#include <asm/altp2m.h>
#include <asm/current.h>
#include <asm/paging.h>
#include <asm/types.h>
#include <asm/domain.h>
#include <asm/p2m.h>
#include <asm/mtrr.h>
#include <asm/hvm/cacheattr.h>
#include <asm/hvm/nestedhvm.h>
#include <asm/hvm/vmx/vmx.h>
#include <asm/hvm/vmx/vmcs.h>
#include "mm-locks.h"
#define atomic_read_ept_entry(__pepte) \
( (ept_entry_t) { .epte = read_atomic(&(__pepte)->epte) } )
#define is_epte_present(ept_entry) ((ept_entry)->epte & 0x7)
#define is_epte_superpage(ept_entry) ((ept_entry)->sp)
static inline bool_t is_epte_valid(ept_entry_t *e)
{
/* suppress_ve alone is not considered valid, so mask it off */
return ((e->epte & ~(1ul << 63)) != 0 && e->sa_p2mt != p2m_invalid);
}
/* Returns: 0 on success, -errno otherwise. */
static int atomic_write_ept_entry(ept_entry_t *entryptr, ept_entry_t new,
int level)
{
int rc;
unsigned long oldmfn = mfn_x(INVALID_MFN);
bool_t check_foreign = (new.mfn != entryptr->mfn ||
new.sa_p2mt != entryptr->sa_p2mt);
if ( level )
{
ASSERT(!is_epte_superpage(&new) || !p2m_is_foreign(new.sa_p2mt));
write_atomic(&entryptr->epte, new.epte);
return 0;
}
if ( unlikely(p2m_is_foreign(new.sa_p2mt)) )
{
rc = -EINVAL;
if ( !is_epte_present(&new) )
goto out;
if ( check_foreign )
{
struct domain *fdom;
if ( !mfn_valid(_mfn(new.mfn)) )
goto out;
rc = -ESRCH;
fdom = page_get_owner(mfn_to_page(new.mfn));
if ( fdom == NULL )
goto out;
/* get refcount on the page */
rc = -EBUSY;
if ( !get_page(mfn_to_page(new.mfn), fdom) )
goto out;
}
}
if ( unlikely(p2m_is_foreign(entryptr->sa_p2mt)) && check_foreign )
oldmfn = entryptr->mfn;
write_atomic(&entryptr->epte, new.epte);
if ( unlikely(oldmfn != mfn_x(INVALID_MFN)) )
put_page(mfn_to_page(oldmfn));
rc = 0;
out:
if ( rc )
gdprintk(XENLOG_ERR, "epte o:%"PRIx64" n:%"PRIx64" rc:%d\n",
entryptr->epte, new.epte, rc);
return rc;
}
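
/*
 * Derive the r/w/x permission and A/D bits of an EPT entry from the p2m
 * type, further restricted by the p2m access permissions.
 */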
static void ept_p2m_type_to_flags(struct p2m_domain *p2m, ept_entry_t *entry,
p2m_type_t type, p2m_access_t access)
{
/*
* First apply type permissions.
*
     * The A/D bits are also set manually to avoid the overhead of the MMU
     * having to set them later. Both bits are safe to update directly, as
     * they are ignored by the processor if EPT A/D bits are not enabled.
     *
     * The A bit is set for all present p2m types in middle and leaf EPT
     * entries. The D bit is set for all writable types in leaf EPT entries,
     * except for the log-dirty type with PML.
*/
    switch ( type )
{
case p2m_invalid:
case p2m_mmio_dm:
case p2m_populate_on_demand:
case p2m_ram_paging_out:
case p2m_ram_paged:
case p2m_ram_paging_in:
default:
entry->r = entry->w = entry->x = 0;
break;
case p2m_ram_rw:
entry->r = entry->w = entry->x = 1;
entry->a = entry->d = !!cpu_has_vmx_ept_ad;
break;
case p2m_ioreq_server:
entry->r = 1;
entry->w = !(p2m->ioreq.flags & XEN_DMOP_IOREQ_MEM_ACCESS_WRITE);
entry->x = 0;
entry->a = !!cpu_has_vmx_ept_ad;
entry->d = entry->w && entry->a;
break;
case p2m_mmio_direct:
entry->r = entry->x = 1;
entry->w = !rangeset_contains_singleton(mmio_ro_ranges,
entry->mfn);
ASSERT(entry->w || !is_epte_superpage(entry));
entry->a = !!cpu_has_vmx_ept_ad;
entry->d = entry->w && cpu_has_vmx_ept_ad;
break;
case p2m_ram_logdirty:
entry->r = entry->x = 1;
        /*
         * With PML enabled there is no need to write-protect a 4K page;
         * clearing its D-bit is sufficient. Superpages still need to be
         * write-protected, so that an EPT violation splits them into 4K
         * pages.
         */
if ( vmx_domain_pml_enabled(p2m->domain) &&
!is_epte_superpage(entry) )
entry->w = 1;
else
entry->w = 0;
entry->a = !!cpu_has_vmx_ept_ad;
        /* For both the PML and non-PML cases the D bit is cleared anyway. */
entry->d = 0;
break;
case p2m_ram_ro:
case p2m_ram_shared:
entry->r = entry->x = 1;
entry->w = 0;
entry->a = !!cpu_has_vmx_ept_ad;
entry->d = 0;
break;
case p2m_grant_map_rw:
case p2m_map_foreign:
entry->r = entry->w = 1;
entry->x = 0;
entry->a = entry->d = !!cpu_has_vmx_ept_ad;
break;
case p2m_grant_map_ro:
entry->r = 1;
entry->w = entry->x = 0;
entry->a = !!cpu_has_vmx_ept_ad;
entry->d = 0;
break;
}
/* Then restrict with access permissions */
    switch ( access )
{
case p2m_access_n:
case p2m_access_n2rwx:
entry->r = entry->w = entry->x = 0;
break;
case p2m_access_r:
entry->w = entry->x = 0;
break;
case p2m_access_w:
entry->r = entry->x = 0;
break;
case p2m_access_x:
entry->r = entry->w = 0;
break;
case p2m_access_rx:
case p2m_access_rx2rw:
entry->w = 0;
break;
case p2m_access_wx:
entry->r = 0;
break;
case p2m_access_rw:
entry->x = 0;
break;
case p2m_access_rwx:
break;
}
}
#define GUEST_TABLE_MAP_FAILED 0
#define GUEST_TABLE_NORMAL_PAGE 1
#define GUEST_TABLE_SUPER_PAGE 2
#define GUEST_TABLE_POD_PAGE 3
/* Fill in middle levels of ept table */
static int ept_set_middle_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry)
{
mfn_t mfn;
ept_entry_t *table;
unsigned int i;
mfn = p2m_alloc_ptp(p2m, 0);
if ( mfn_eq(mfn, INVALID_MFN) )
return 0;
ept_entry->epte = 0;
ept_entry->mfn = mfn_x(mfn);
ept_entry->access = p2m->default_access;
ept_entry->r = ept_entry->w = ept_entry->x = 1;
/* Manually set A bit to avoid overhead of MMU having to write it later. */
ept_entry->a = !!cpu_has_vmx_ept_ad;
ept_entry->suppress_ve = 1;
table = map_domain_page(mfn);
for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
table[i].suppress_ve = 1;
unmap_domain_page(table);
return 1;
}
/* free ept sub tree behind an entry */
static void ept_free_entry(struct p2m_domain *p2m, ept_entry_t *ept_entry, int level)
{
/* End if the entry is a leaf entry. */
if ( level == 0 || !is_epte_present(ept_entry) ||
is_epte_superpage(ept_entry) )
return;
if ( level > 1 )
{
ept_entry_t *epte = map_domain_page(_mfn(ept_entry->mfn));
for ( int i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
ept_free_entry(p2m, epte + i, level - 1);
unmap_domain_page(epte);
}
p2m_tlb_flush_sync(p2m);
p2m_free_ptp(p2m, mfn_to_page(ept_entry->mfn));
}
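
/*
 * Split a superpage entry into a subtree of smaller mappings, down to the
 * target level. Returns 0 on allocation failure, 1 on success; even on
 * failure the partially built subtree is linked into *ept_entry so that
 * the caller can free it.
 */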
static bool_t ept_split_super_page(struct p2m_domain *p2m,
ept_entry_t *ept_entry,
unsigned int level, unsigned int target)
{
ept_entry_t new_ept, *table;
uint64_t trunk;
unsigned int i;
bool_t rv = 1;
/* End if the entry is a leaf entry or reaches the target level. */
if ( level <= target )
return 1;
ASSERT(is_epte_superpage(ept_entry));
if ( !ept_set_middle_entry(p2m, &new_ept) )
return 0;
table = map_domain_page(_mfn(new_ept.mfn));
trunk = 1UL << ((level - 1) * EPT_TABLE_ORDER);
for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
{
ept_entry_t *epte = table + i;
*epte = *ept_entry;
epte->sp = (level > 1);
epte->mfn += i * trunk;
epte->snp = (iommu_enabled && iommu_snoop);
epte->suppress_ve = 1;
ept_p2m_type_to_flags(p2m, epte, epte->sa_p2mt, epte->access);
if ( (level - 1) == target )
continue;
ASSERT(is_epte_superpage(epte));
if ( !(rv = ept_split_super_page(p2m, epte, level - 1, target)) )
break;
}
unmap_domain_page(table);
    /* Even on failure we should install the newly allocated ept page. */
*ept_entry = new_ept;
return rv;
}
/*
 * Take the currently mapped table, find the corresponding gfn entry,
 * and map the next table, if available. If the entry is empty and
 * read_only is set, no new table is allocated and the walk fails.
 *
 * Return values:
 *  GUEST_TABLE_MAP_FAILED: Failed to map. Either read_only was set and
 *   the entry was empty, or allocating a new page failed.
 *  GUEST_TABLE_NORMAL_PAGE: next level mapped normally.
 *  GUEST_TABLE_SUPER_PAGE:
 *   The next entry points to a superpage, and the caller indicates
 *   that they are going to the superpage level, or are only doing
 *   a read.
 *  GUEST_TABLE_POD_PAGE:
 *   The next entry is marked populate-on-demand.
 */
static int ept_next_level(struct p2m_domain *p2m, bool_t read_only,
ept_entry_t **table, unsigned long *gfn_remainder,
int next_level)
{
unsigned long mfn;
ept_entry_t *ept_entry, e;
u32 shift, index;
shift = next_level * EPT_TABLE_ORDER;
index = *gfn_remainder >> shift;
    /* index must fall within the page */
ASSERT(index < EPT_PAGETABLE_ENTRIES);
ept_entry = (*table) + index;
/* ept_next_level() is called (sometimes) without a lock. Read
* the entry once, and act on the "cached" entry after that to
* avoid races. */
e = atomic_read_ept_entry(ept_entry);
if ( !is_epte_present(&e) )
{
if ( e.sa_p2mt == p2m_populate_on_demand )
return GUEST_TABLE_POD_PAGE;
if ( read_only )
return GUEST_TABLE_MAP_FAILED;
if ( !ept_set_middle_entry(p2m, ept_entry) )
return GUEST_TABLE_MAP_FAILED;
else
e = atomic_read_ept_entry(ept_entry); /* Refresh */
}
/* The only time sp would be set here is if we had hit a superpage */
if ( is_epte_superpage(&e) )
return GUEST_TABLE_SUPER_PAGE;
mfn = e.mfn;
unmap_domain_page(*table);
*table = map_domain_page(_mfn(mfn));
*gfn_remainder &= (1UL << shift) - 1;
return GUEST_TABLE_NORMAL_PAGE;
}
/*
* Invalidate (via setting the EMT field to an invalid value) all valid
* present entries in the given page table, optionally marking the entries
* also for their subtrees needing P2M type re-calculation.
*/
static bool_t ept_invalidate_emt(mfn_t mfn, bool_t recalc, int level)
{
int rc;
ept_entry_t *epte = map_domain_page(mfn);
unsigned int i;
bool_t changed = 0;
for ( i = 0; i < EPT_PAGETABLE_ENTRIES; i++ )
{
ept_entry_t e = atomic_read_ept_entry(&epte[i]);
if ( !is_epte_valid(&e) || !is_epte_present(&e) ||
(e.emt == MTRR_NUM_TYPES && (e.recalc || !recalc)) )
continue;
e.emt = MTRR_NUM_TYPES;
if ( recalc )
e.recalc = 1;
rc = atomic_write_ept_entry(&epte[i], e, level);
ASSERT(rc == 0);
changed = 1;
}
unmap_domain_page(epte);
return changed;
}
/*
 * Just like ept_invalidate_emt() except that
 * - not all entries at the targeted level may need processing,
 * - the re-calculation flag is always set.
 * The passed in range is guaranteed not to cross a page (table)
 * boundary at the targeted level.
 */
static int ept_invalidate_emt_range(struct p2m_domain *p2m,
unsigned int target,
unsigned long first_gfn,
unsigned long last_gfn)
{
ept_entry_t *table;
unsigned long gfn_remainder = first_gfn;
unsigned int i, index;
int wrc, rc = 0, ret = GUEST_TABLE_MAP_FAILED;
table = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
for ( i = p2m->ept.wl; i > target; --i )
{
ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
if ( ret == GUEST_TABLE_MAP_FAILED )
goto out;
if ( ret != GUEST_TABLE_NORMAL_PAGE )
break;
}
if ( i > target )
{
/* We need to split the original page. */
ept_entry_t split_ept_entry;
index = gfn_remainder >> (i * EPT_TABLE_ORDER);
split_ept_entry = atomic_read_ept_entry(&table[index]);
ASSERT(is_epte_superpage(&split_ept_entry));
if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
{
ept_free_entry(p2m, &split_ept_entry, i);
rc = -ENOMEM;
goto out;
}
wrc = atomic_write_ept_entry(&table[index], split_ept_entry, i);
ASSERT(wrc == 0);
for ( ; i > target; --i )
if ( !ept_next_level(p2m, 1, &table, &gfn_remainder, i) )
break;
ASSERT(i == target);
}
index = gfn_remainder >> (i * EPT_TABLE_ORDER);
i = (last_gfn >> (i * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
for ( ; index <= i; ++index )
{
ept_entry_t e = atomic_read_ept_entry(&table[index]);
if ( is_epte_valid(&e) && is_epte_present(&e) &&
(e.emt != MTRR_NUM_TYPES || !e.recalc) )
{
e.emt = MTRR_NUM_TYPES;
e.recalc = 1;
wrc = atomic_write_ept_entry(&table[index], e, target);
ASSERT(wrc == 0);
rc = 1;
}
}
out:
unmap_domain_page(table);
return rc;
}
/*
 * Resolve deliberately mis-configured (EMT field set to an invalid value)
 * entries in the page table hierarchy for the given GFN:
 * - calculate the correct value for the EMT field,
 * - if marked so, re-calculate the P2M type,
 * - propagate EMT and the re-calculation flag down to the next page table
 *   level for entries not involved in the translation of the given GFN.
 * Returns:
 * - a negative errno value on error,
 * - zero if no adjustment was done,
 * - a positive value if at least one adjustment was done.
 */
static int resolve_misconfig(struct p2m_domain *p2m, unsigned long gfn)
{
struct ept_data *ept = &p2m->ept;
unsigned int level = ept->wl;
unsigned long mfn = ept->mfn;
ept_entry_t *epte;
int wrc, rc = 0;
if ( !mfn )
return 0;
for ( ; ; --level )
{
ept_entry_t e;
unsigned int i;
epte = map_domain_page(_mfn(mfn));
i = (gfn >> (level * EPT_TABLE_ORDER)) & (EPT_PAGETABLE_ENTRIES - 1);
e = atomic_read_ept_entry(&epte[i]);
if ( level == 0 || is_epte_superpage(&e) )
{
uint8_t ipat = 0;
if ( e.emt != MTRR_NUM_TYPES )
break;
if ( level == 0 )
{
for ( gfn -= i, i = 0; i < EPT_PAGETABLE_ENTRIES; ++i )
{
p2m_type_t nt;
e = atomic_read_ept_entry(&epte[i]);
if ( e.emt == MTRR_NUM_TYPES )
e.emt = 0;
if ( !is_epte_valid(&e) || !is_epte_present(&e) )
continue;
e.emt = epte_get_entry_emt(p2m->domain, gfn + i,
_mfn(e.mfn), 0, &ipat,
e.sa_p2mt == p2m_mmio_direct);
e.ipat = ipat;
nt = p2m_recalc_type(e.recalc, e.sa_p2mt, p2m, gfn + i);
if ( nt != e.sa_p2mt )
{
if ( e.sa_p2mt == p2m_ioreq_server )
{
ASSERT(p2m->ioreq.entry_count > 0);
p2m->ioreq.entry_count--;
}
e.sa_p2mt = nt;
ept_p2m_type_to_flags(p2m, &e, e.sa_p2mt, e.access);
}
e.recalc = 0;
wrc = atomic_write_ept_entry(&epte[i], e, level);
ASSERT(wrc == 0);
}
}
else
{
int emt = epte_get_entry_emt(p2m->domain, gfn, _mfn(e.mfn),
level * EPT_TABLE_ORDER, &ipat,
e.sa_p2mt == p2m_mmio_direct);
bool_t recalc = e.recalc;
if ( recalc && p2m_is_changeable(e.sa_p2mt) )
{
unsigned long mask = ~0UL << (level * EPT_TABLE_ORDER);
ASSERT(e.sa_p2mt != p2m_ioreq_server);
switch ( p2m_is_logdirty_range(p2m, gfn & mask,
gfn | ~mask) )
{
case 0:
e.sa_p2mt = p2m_ram_rw;
e.recalc = 0;
break;
case 1:
e.sa_p2mt = p2m_ram_logdirty;
e.recalc = 0;
break;
default: /* Force split. */
emt = -1;
break;
}
}
if ( unlikely(emt < 0) )
{
if ( ept_split_super_page(p2m, &e, level, level - 1) )
{
wrc = atomic_write_ept_entry(&epte[i], e, level);
ASSERT(wrc == 0);
unmap_domain_page(epte);
mfn = e.mfn;
continue;
}
ept_free_entry(p2m, &e, level);
rc = -ENOMEM;
break;
}
e.emt = emt;
e.ipat = ipat;
e.recalc = 0;
if ( recalc && p2m_is_changeable(e.sa_p2mt) )
ept_p2m_type_to_flags(p2m, &e, e.sa_p2mt, e.access);
wrc = atomic_write_ept_entry(&epte[i], e, level);
ASSERT(wrc == 0);
}
rc = 1;
break;
}
if ( e.emt == MTRR_NUM_TYPES )
{
ASSERT(is_epte_present(&e));
ept_invalidate_emt(_mfn(e.mfn), e.recalc, level);
smp_wmb();
e.emt = 0;
e.recalc = 0;
wrc = atomic_write_ept_entry(&epte[i], e, level);
ASSERT(wrc == 0);
unmap_domain_page(epte);
rc = 1;
}
else if ( is_epte_present(&e) && !e.emt )
unmap_domain_page(epte);
else
break;
mfn = e.mfn;
}
unmap_domain_page(epte);
if ( rc )
{
struct vcpu *v;
for_each_vcpu ( p2m->domain, v )
v->arch.hvm_vmx.ept_spurious_misconfig = 1;
}
return rc;
}
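
/*
 * Handler for EPT misconfiguration VM exits: resolve the deliberately
 * mis-configured entries covering @gpa. Returns whether the exit was
 * handled, treating a spurious exit (the entries were already fixed up,
 * e.g. by another vCPU) as handled as well.
 */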
bool_t ept_handle_misconfig(uint64_t gpa)
{
struct vcpu *curr = current;
struct p2m_domain *p2m = p2m_get_hostp2m(curr->domain);
bool_t spurious;
int rc;
p2m_lock(p2m);
spurious = curr->arch.hvm_vmx.ept_spurious_misconfig;
rc = resolve_misconfig(p2m, PFN_DOWN(gpa));
curr->arch.hvm_vmx.ept_spurious_misconfig = 0;
p2m_unlock(p2m);
return spurious ? (rc >= 0) : (rc > 0);
}
/*
* ept_set_entry() computes 'need_modify_vtd_table' for itself,
* by observing whether any gfn->mfn translations are modified.
*
* Returns: 0 for success, -errno for failure
*/
static int
ept_set_entry(struct p2m_domain *p2m, gfn_t gfn_, mfn_t mfn,
unsigned int order, p2m_type_t p2mt, p2m_access_t p2ma,
int sve)
{
ept_entry_t *table, *ept_entry = NULL;
unsigned long gfn = gfn_x(gfn_);
unsigned long gfn_remainder = gfn;
unsigned int i, target = order / EPT_TABLE_ORDER;
unsigned long fn_mask = !mfn_eq(mfn, INVALID_MFN) ? (gfn | mfn_x(mfn)) : gfn;
int ret, rc = 0;
bool_t entry_written = 0;
bool_t direct_mmio = (p2mt == p2m_mmio_direct);
uint8_t ipat = 0;
bool_t need_modify_vtd_table = 1;
bool_t vtd_pte_present = 0;
unsigned int iommu_flags = p2m_get_iommu_flags(p2mt, mfn);
bool_t needs_sync = 1;
ept_entry_t old_entry = { .epte = 0 };
ept_entry_t new_entry = { .epte = 0 };
struct ept_data *ept = &p2m->ept;
struct domain *d = p2m->domain;
ASSERT(ept);
    /*
     * The caller must make sure that:
     * 1. the gfn and mfn passed in are valid and aligned to the order,
     * 2. the gfn does not exceed the guest physical address width,
     * 3. the order passed in is valid.
     */
if ( (fn_mask & ((1UL << order) - 1)) ||
((u64)gfn >> ((ept->wl + 1) * EPT_TABLE_ORDER)) ||
(order % EPT_TABLE_ORDER) )
return -EINVAL;
    /* Carry out any possibly pending earlier changes first. */
ret = resolve_misconfig(p2m, gfn);
if ( ret < 0 )
return ret;
ASSERT((target == 2 && hap_has_1gb) ||
(target == 1 && hap_has_2mb) ||
(target == 0));
ASSERT(!p2m_is_foreign(p2mt) || target == 0);
table = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
ret = GUEST_TABLE_MAP_FAILED;
for ( i = ept->wl; i > target; i-- )
{
ret = ept_next_level(p2m, 0, &table, &gfn_remainder, i);
if ( !ret )
{
rc = -ENOENT;
goto out;
}
else if ( ret != GUEST_TABLE_NORMAL_PAGE )
break;
}
ASSERT(ret != GUEST_TABLE_POD_PAGE || i != target);
ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER));
    /* In case VT-d shares the EPT page table, this flag is needed by VT-d. */
vtd_pte_present = is_epte_present(ept_entry);
/*
* If we're here with i > target, we must be at a leaf node, and
* we need to break up the superpage.
*
* If we're here with i == target and i > 0, we need to check to see
* if we're replacing a non-leaf entry (i.e., pointing to an N-1 table)
* with a leaf entry (a 1GiB or 2MiB page), and handle things appropriately.
*/
if ( i == target )
{
/* We reached the target level. */
/* No need to flush if the old entry wasn't valid */
if ( !is_epte_present(ept_entry) )
needs_sync = 0;
/* If we're replacing a non-leaf entry with a leaf entry (1GiB or 2MiB),
* the intermediate tables will be freed below after the ept flush
*
* Read-then-write is OK because we hold the p2m lock. */
old_entry = *ept_entry;
}
else
{
/* We need to split the original page. */
ept_entry_t split_ept_entry;
ASSERT(is_epte_superpage(ept_entry));
split_ept_entry = atomic_read_ept_entry(ept_entry);
if ( !ept_split_super_page(p2m, &split_ept_entry, i, target) )
{
ept_free_entry(p2m, &split_ept_entry, i);
rc = -ENOMEM;
goto out;
}
/* now install the newly split ept sub-tree */
        /*
         * NB: please make sure the domain is paused and there is no
         * in-flight VT-d DMA.
         */
rc = atomic_write_ept_entry(ept_entry, split_ept_entry, i);
ASSERT(rc == 0);
/* then move to the level we want to make real changes */
for ( ; i > target; i-- )
if ( !ept_next_level(p2m, 0, &table, &gfn_remainder, i) )
break;
/* We just installed the pages we need. */
ASSERT(i == target);
ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER));
}
if ( mfn_valid(mfn) || p2m_allows_invalid_mfn(p2mt) )
{
int emt = epte_get_entry_emt(p2m->domain, gfn, mfn,
i * EPT_TABLE_ORDER, &ipat, direct_mmio);
if ( emt >= 0 )
new_entry.emt = emt;
else /* ept_handle_misconfig() will need to take care of this. */
new_entry.emt = MTRR_NUM_TYPES;
new_entry.ipat = ipat;
new_entry.sp = !!i;
new_entry.sa_p2mt = p2mt;
new_entry.access = p2ma;
new_entry.snp = (iommu_enabled && iommu_snoop);
/* the caller should take care of the previous page */
new_entry.mfn = mfn_x(mfn);
/* Safe to read-then-write because we hold the p2m lock */
if ( ept_entry->mfn == new_entry.mfn &&
p2m_get_iommu_flags(ept_entry->sa_p2mt, _mfn(ept_entry->mfn)) ==
iommu_flags )
need_modify_vtd_table = 0;
ept_p2m_type_to_flags(p2m, &new_entry, p2mt, p2ma);
}
if ( sve != -1 )
new_entry.suppress_ve = !!sve;
else
new_entry.suppress_ve = is_epte_valid(&old_entry) ?
old_entry.suppress_ve : 1;
    /*
     * p2m_ioreq_server is only used for 4K pages, so the count is only
     * maintained for leaf (level-0) EPT entries.
     */
if ( p2mt == p2m_ioreq_server )
{
ASSERT(i == 0);
p2m->ioreq.entry_count++;
}
if ( ept_entry->sa_p2mt == p2m_ioreq_server )
{
ASSERT(i == 0);
ASSERT(p2m->ioreq.entry_count > 0);
p2m->ioreq.entry_count--;
}
rc = atomic_write_ept_entry(ept_entry, new_entry, target);
if ( unlikely(rc) )
old_entry.epte = 0;
else
{
entry_written = 1;
if ( p2mt != p2m_invalid &&
(gfn + (1UL << order) - 1 > p2m->max_mapped_pfn) )
/* Track the highest gfn for which we have ever had a valid mapping */
p2m->max_mapped_pfn = gfn + (1UL << order) - 1;
}
out:
if ( needs_sync )
ept_sync_domain(p2m);
    /* For the host p2m, the VT-d page table may need to be changed. */
if ( rc == 0 && p2m_is_hostp2m(p2m) && need_iommu(d) &&
need_modify_vtd_table )
{
if ( iommu_hap_pt_share )
rc = iommu_pte_flush(d, gfn, &ept_entry->epte, order, vtd_pte_present);
else
{
if ( iommu_flags )
for ( i = 0; i < (1 << order); i++ )
{
rc = iommu_map_page(d, gfn + i, mfn_x(mfn) + i, iommu_flags);
if ( unlikely(rc) )
{
while ( i-- )
/* If statement to satisfy __must_check. */
if ( iommu_unmap_page(p2m->domain, gfn + i) )
continue;
break;
}
}
else
for ( i = 0; i < (1 << order); i++ )
{
ret = iommu_unmap_page(d, gfn + i);
if ( !rc )
rc = ret;
}
}
}
unmap_domain_page(table);
/* Release the old intermediate tables, if any. This has to be the
last thing we do, after the ept_sync_domain() and removal
from the iommu tables, so as to avoid a potential
use-after-free. */
if ( is_epte_present(&old_entry) )
ept_free_entry(p2m, &old_entry, target);
if ( entry_written && p2m_is_hostp2m(p2m) )
p2m_altp2m_propagate_change(d, _gfn(gfn), mfn, order, p2mt, p2ma);
return rc;
}
/* Read ept p2m entries */
static mfn_t ept_get_entry(struct p2m_domain *p2m,
gfn_t gfn_, p2m_type_t *t, p2m_access_t* a,
p2m_query_t q, unsigned int *page_order,
bool_t *sve)
{
ept_entry_t *table =
map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
unsigned long gfn = gfn_x(gfn_);
unsigned long gfn_remainder = gfn;
ept_entry_t *ept_entry;
u32 index;
int i;
int ret = 0;
bool_t recalc = 0;
mfn_t mfn = INVALID_MFN;
struct ept_data *ept = &p2m->ept;
*t = p2m_mmio_dm;
*a = p2m_access_n;
if ( sve )
*sve = 1;
    /* This gfn is higher than the highest the p2m map currently holds. */
if ( gfn > p2m->max_mapped_pfn )
{
for ( i = ept->wl; i > 0; --i )
if ( (gfn & ~((1UL << (i * EPT_TABLE_ORDER)) - 1)) >
p2m->max_mapped_pfn )
break;
goto out;
}
/* Should check if gfn obeys GAW here. */
for ( i = ept->wl; i > 0; i-- )
{
retry:
if ( table[gfn_remainder >> (i * EPT_TABLE_ORDER)].recalc )
recalc = 1;
ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
if ( !ret )
goto out;
else if ( ret == GUEST_TABLE_POD_PAGE )
{
if ( !(q & P2M_ALLOC) )
{
*t = p2m_populate_on_demand;
goto out;
}
/* Populate this superpage */
ASSERT(i <= 2);
index = gfn_remainder >> ( i * EPT_TABLE_ORDER);
ept_entry = table + index;
if ( p2m_pod_demand_populate(p2m, gfn_, i * EPT_TABLE_ORDER) )
goto retry;
else
goto out;
}
else if ( ret == GUEST_TABLE_SUPER_PAGE )
break;
}
index = gfn_remainder >> (i * EPT_TABLE_ORDER);
ept_entry = table + index;
if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
{
if ( !(q & P2M_ALLOC) )
{
*t = p2m_populate_on_demand;
goto out;
}
ASSERT(i == 0);
if ( !p2m_pod_demand_populate(p2m, gfn_, PAGE_ORDER_4K) )
goto out;
}
if ( is_epte_valid(ept_entry) )
{
*t = p2m_recalc_type(recalc || ept_entry->recalc,
ept_entry->sa_p2mt, p2m, gfn);
*a = ept_entry->access;
if ( sve )
*sve = ept_entry->suppress_ve;
mfn = _mfn(ept_entry->mfn);
if ( i )
{
            /*
             * We may have hit a superpage; adjust the mfn to the 4K page
             * within it, so that callers see a 4K-granular p2m.
             */
unsigned long split_mfn = mfn_x(mfn) +
(gfn_remainder &
((1 << (i * EPT_TABLE_ORDER)) - 1));
mfn = _mfn(split_mfn);
}
}
out:
if ( page_order )
*page_order = i * EPT_TABLE_ORDER;
unmap_domain_page(table);
return mfn;
}
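
/*
 * Debug helper: print the EPT entries used to translate @gfn in @d's
 * host p2m.
 */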
void ept_walk_table(struct domain *d, unsigned long gfn)
{
struct p2m_domain *p2m = p2m_get_hostp2m(d);
struct ept_data *ept = &p2m->ept;
ept_entry_t *table =
map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
unsigned long gfn_remainder = gfn;
int i;
gprintk(XENLOG_ERR, "Walking EPT tables for GFN %lx:\n", gfn);
    /* This gfn is higher than the highest the p2m map currently holds. */
if ( gfn > p2m->max_mapped_pfn )
{
gprintk(XENLOG_ERR, " gfn exceeds max_mapped_pfn %lx\n",
p2m->max_mapped_pfn);
goto out;
}
for ( i = ept->wl; i >= 0; i-- )
{
ept_entry_t *ept_entry, *next;
u32 index;
/* Stolen from ept_next_level */
index = gfn_remainder >> (i*EPT_TABLE_ORDER);
ept_entry = table + index;
gprintk(XENLOG_ERR, " epte %"PRIx64"\n", ept_entry->epte);
if ( (i == 0) || !is_epte_present(ept_entry) ||
is_epte_superpage(ept_entry) )
goto out;
else
{
gfn_remainder &= (1UL << (i*EPT_TABLE_ORDER)) - 1;
next = map_domain_page(_mfn(ept_entry->mfn));
unmap_domain_page(table);
table = next;
}
}
out:
unmap_domain_page(table);
return;
}
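
/*
 * Global type change: invalidate the EMT fields at the top level and mark
 * the tree for re-calculation; the actual type change is carried out
 * lazily by resolve_misconfig() on the resulting EPT misconfig exits.
 */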
static void ept_change_entry_type_global(struct p2m_domain *p2m,
p2m_type_t ot, p2m_type_t nt)
{
unsigned long mfn = p2m->ept.mfn;
if ( !mfn )
return;
if ( ept_invalidate_emt(_mfn(mfn), 1, p2m->ept.wl) )
ept_sync_domain(p2m);
}
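
/*
 * Range type change: like the global variant, but only entries covering
 * [first_gfn, last_gfn] are invalidated/marked for re-calculation, level
 * by level as aligned boundaries are reached.
 */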
static int ept_change_entry_type_range(struct p2m_domain *p2m,
p2m_type_t ot, p2m_type_t nt,
unsigned long first_gfn,
unsigned long last_gfn)
{
unsigned int i, wl = p2m->ept.wl;
unsigned long mask = (1 << EPT_TABLE_ORDER) - 1;
int rc = 0, sync = 0;
if ( !p2m->ept.mfn )
return -EINVAL;
for ( i = 0; i <= wl; )
{
if ( first_gfn & mask )
{
unsigned long end_gfn = min(first_gfn | mask, last_gfn);
rc = ept_invalidate_emt_range(p2m, i, first_gfn, end_gfn);
sync |= rc;
if ( rc < 0 || end_gfn >= last_gfn )
break;
first_gfn = end_gfn + 1;
}
else if ( (last_gfn & mask) != mask )
{
unsigned long start_gfn = max(first_gfn, last_gfn & ~mask);
rc = ept_invalidate_emt_range(p2m, i, start_gfn, last_gfn);
sync |= rc;
if ( rc < 0 || start_gfn <= first_gfn )
break;
last_gfn = start_gfn - 1;
}
else
{
++i;
mask |= mask << EPT_TABLE_ORDER;
}
}
if ( sync )
ept_sync_domain(p2m);
return rc < 0 ? rc : 0;
}
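
/*
 * Invalidate all EMT fields (without requesting a P2M type re-calculation)
 * so that effective memory types are re-computed lazily after a change
 * that may affect them.
 */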
static void ept_memory_type_changed(struct p2m_domain *p2m)
{
unsigned long mfn = p2m->ept.mfn;
if ( !mfn )
return;
if ( ept_invalidate_emt(_mfn(mfn), 0, p2m->ept.wl) )
ept_sync_domain(p2m);
}
static void __ept_sync_domain(void *info)
{
/*
* The invalidation will be done before VMENTER (see
* vmx_vmenter_helper()).
*/
}
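
/*
 * Request an INVEPT on all PCPUs (carried out just before the next VM
 * entry). For a nested p2m the host p2m's invalidate mask is used; a
 * change to the host p2m additionally flushes the nested p2ms.
 */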
static void ept_sync_domain_prepare(struct p2m_domain *p2m)
{
struct domain *d = p2m->domain;
struct ept_data *ept = &p2m->ept;
if ( nestedhvm_enabled(d) )
{
if ( p2m_is_nestedp2m(p2m) )
ept = &p2m_get_hostp2m(d)->ept;
else
p2m_flush_nestedp2m(d);
}
    /*
     * Need to invalidate on all PCPUs because either:
     *
     * a) A VCPU has run and some translations may be cached.
     * b) A VCPU has not run and the initial invalidation in case
     *    of an EP4TA reuse is still needed.
     */
cpumask_setall(ept->invalidate);
}
static void ept_sync_domain_mask(struct p2m_domain *p2m, const cpumask_t *mask)
{
on_selected_cpus(mask, __ept_sync_domain, p2m, 1);
}
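
/*
 * Flush the EPT translations for this p2m: either immediately on the
 * domain's dirty CPUs, or deferred via need_flush when flushes are being
 * batched (defer_flush).
 */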
void ept_sync_domain(struct p2m_domain *p2m)
{
struct domain *d = p2m->domain;
/* Only if using EPT and this domain has some VCPUs to dirty. */
if ( !paging_mode_hap(d) || !d->vcpu || !d->vcpu[0] )
return;
ept_sync_domain_prepare(p2m);
if ( p2m->defer_flush )
{
p2m->need_flush = 1;
return;
}
ept_sync_domain_mask(p2m, d->domain_dirty_cpumask);
}
static void ept_tlb_flush(struct p2m_domain *p2m)
{
ept_sync_domain_mask(p2m, p2m->domain->domain_dirty_cpumask);
}
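
/*
 * Switch the domain to hardware log-dirty tracking via Page Modification
 * Logging; if enabling PML fails, ept_p2m_type_to_flags() falls back to
 * write protection.
 */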
static void ept_enable_pml(struct p2m_domain *p2m)
{
/* Domain must have been paused */
ASSERT(atomic_read(&p2m->domain->pause_count));
/*
* No need to return whether vmx_domain_enable_pml has succeeded, as
* ept_p2m_type_to_flags will do the check, and write protection will be
* used if PML is not enabled.
*/
if ( vmx_domain_enable_pml(p2m->domain) )
return;
/* Enable EPT A/D bit for PML */
p2m->ept.ad = 1;
vmx_domain_update_eptp(p2m->domain);
}
static void ept_disable_pml(struct p2m_domain *p2m)
{
/* Domain must have been paused */
ASSERT(atomic_read(&p2m->domain->pause_count));
vmx_domain_disable_pml(p2m->domain);
/* Disable EPT A/D bit */
p2m->ept.ad = 0;
vmx_domain_update_eptp(p2m->domain);
}
static void ept_flush_pml_buffers(struct p2m_domain *p2m)
{
/* Domain must have been paused */
ASSERT(atomic_read(&p2m->domain->pause_count));
vmx_domain_flush_pml_buffers(p2m->domain);
}
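
/* Set up the EPT-specific p2m method table and state for this p2m. */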
int ept_p2m_init(struct p2m_domain *p2m)
{
struct ept_data *ept = &p2m->ept;
p2m->set_entry = ept_set_entry;
p2m->get_entry = ept_get_entry;
p2m->recalc = resolve_misconfig;
p2m->change_entry_type_global = ept_change_entry_type_global;
p2m->change_entry_type_range = ept_change_entry_type_range;
p2m->memory_type_changed = ept_memory_type_changed;
p2m->audit_p2m = NULL;
p2m->tlb_flush = ept_tlb_flush;
/* Set the memory type used when accessing EPT paging structures. */
ept->mt = EPT_DEFAULT_MT;
    /*
     * Set the EPT page-walk length; the field holds the actual walk
     * length minus 1, i.e. 3 for a 4-level walk.
     */
ept->wl = 3;
if ( cpu_has_vmx_pml )
{
p2m->enable_hardware_log_dirty = ept_enable_pml;
p2m->disable_hardware_log_dirty = ept_disable_pml;
p2m->flush_hardware_cached_dirty = ept_flush_pml_buffers;
}
if ( !zalloc_cpumask_var(&ept->invalidate) )
return -ENOMEM;
/*
* Assume an initial invalidation is required, in case an EP4TA is
* reused.
*/
cpumask_setall(ept->invalidate);
return 0;
}
void ept_p2m_uninit(struct p2m_domain *p2m)
{
struct ept_data *ept = &p2m->ept;
free_cpumask_var(ept->invalidate);
}
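
/* Translate an EMT field value into a short mnemonic for the 'D' key dump. */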
static const char *memory_type_to_str(unsigned int x)
{
static const char memory_types[8][3] = {
[MTRR_TYPE_UNCACHABLE] = "UC",
[MTRR_TYPE_WRCOMB] = "WC",
[MTRR_TYPE_WRTHROUGH] = "WT",
[MTRR_TYPE_WRPROT] = "WP",
[MTRR_TYPE_WRBACK] = "WB",
[MTRR_NUM_TYPES] = "??"
};
ASSERT(x < ARRAY_SIZE(memory_types));
return memory_types[x][0] ? memory_types[x] : "?";
}
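
/* 'D' keyhandler: dump the leaf EPT mappings of every HAP-enabled domain. */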
static void ept_dump_p2m_table(unsigned char key)
{
struct domain *d;
ept_entry_t *table, *ept_entry;
int order;
int i;
int ret = 0;
unsigned long gfn, gfn_remainder;
unsigned long record_counter = 0;
struct p2m_domain *p2m;
struct ept_data *ept;
for_each_domain(d)
{
if ( !hap_enabled(d) )
continue;
p2m = p2m_get_hostp2m(d);
ept = &p2m->ept;
printk("\ndomain%d EPT p2m table:\n", d->domain_id);
for ( gfn = 0; gfn <= p2m->max_mapped_pfn; gfn += 1UL << order )
{
char c = 0;
gfn_remainder = gfn;
table = map_domain_page(pagetable_get_mfn(p2m_get_pagetable(p2m)));
for ( i = ept->wl; i > 0; i-- )
{
ept_entry = table + (gfn_remainder >> (i * EPT_TABLE_ORDER));
if ( ept_entry->emt == MTRR_NUM_TYPES )
c = '?';
ret = ept_next_level(p2m, 1, &table, &gfn_remainder, i);
if ( ret != GUEST_TABLE_NORMAL_PAGE )
break;
}
order = i * EPT_TABLE_ORDER;
ept_entry = table + (gfn_remainder >> order);
if ( ret != GUEST_TABLE_MAP_FAILED && is_epte_valid(ept_entry) )
{
if ( ept_entry->sa_p2mt == p2m_populate_on_demand )
printk("gfn: %13lx order: %2d PoD\n", gfn, order);
else
printk("gfn: %13lx order: %2d mfn: %13lx %c%c%c %c%c%c\n",
gfn, order, ept_entry->mfn + 0UL,
ept_entry->r ? 'r' : ' ',
ept_entry->w ? 'w' : ' ',
ept_entry->x ? 'x' : ' ',
memory_type_to_str(ept_entry->emt)[0],
memory_type_to_str(ept_entry->emt)[1]
?: ept_entry->emt + '0',
c ?: ept_entry->ipat ? '!' : ' ');
if ( !(record_counter++ % 100) )
process_pending_softirqs();
}
unmap_domain_page(table);
}
}
}
void setup_ept_dump(void)
{
register_keyhandler('D', ept_dump_p2m_table, "dump VT-x EPT tables", 0);
}
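
/*
 * Initialise the EPT state of altp2m index @i: reset its remap range and
 * publish its EPTP in the per-domain altp2m_eptp[] array.
 */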
void p2m_init_altp2m_ept(struct domain *d, unsigned int i)
{
struct p2m_domain *p2m = d->arch.altp2m_p2m[i];
struct ept_data *ept;
p2m->min_remapped_gfn = gfn_x(INVALID_GFN);
p2m->max_remapped_gfn = 0;
ept = &p2m->ept;
ept->mfn = pagetable_get_pfn(p2m_get_pagetable(p2m));
d->arch.altp2m_eptp[i] = ept->eptp;
}
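
/*
 * Return the altp2m index whose EPTP matches @eptp, or INVALID_ALTP2M if
 * there is none.
 */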
unsigned int p2m_find_altp2m_by_eptp(struct domain *d, uint64_t eptp)
{
struct p2m_domain *p2m;
struct ept_data *ept;
unsigned int i;
altp2m_list_lock(d);
for ( i = 0; i < MAX_ALTP2M; i++ )
{
if ( d->arch.altp2m_eptp[i] == mfn_x(INVALID_MFN) )
continue;
p2m = d->arch.altp2m_p2m[i];
ept = &p2m->ept;
if ( eptp == ept->eptp )
goto out;
}
i = INVALID_ALTP2M;
out:
altp2m_list_unlock(d);
return i;
}
/*
* Local variables:
* mode: C
* c-file-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/