/*
* Xen domain builder -- i386 and x86_64 bits.
*
* Most architecture-specific code for x86 goes here.
* - prepare page tables.
* - fill architecture-specific structs.
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation;
* version 2.1 of the License.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
 * License along with this library; If not, see <http://www.gnu.org/licenses/>.
*
 * written 2006 by Gerd Hoffmann.
*
*/
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>

#include <xen/xen.h>
#include <xen/foreign/x86_32.h>
#include <xen/foreign/x86_64.h>
#include <xen/hvm/hvm_info_table.h>
#include <xen/arch-x86/hvm/start_info.h>
#include <xen/io/protocols.h>

#include "xg_private.h"
#include "xc_dom.h"
#include "xenctrl.h"
/* ------------------------------------------------------------------------ */
#define SUPERPAGE_BATCH_SIZE 512
#define SUPERPAGE_2MB_SHIFT 9
#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
#define SUPERPAGE_1GB_SHIFT 18
#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)
#define X86_CR0_PE 0x01
#define X86_CR0_ET 0x10
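/*
 * Indices of the "special" pages reserved for an HVM guest.  They occupy
 * the pfn range [X86_HVM_END_SPECIAL_REGION - X86_HVM_NR_SPECIAL_PAGES,
 * X86_HVM_END_SPECIAL_REGION); special_pfn() maps an index to its pfn.
 * NR_IOREQ_SERVER_PAGES further pages immediately below that block are
 * reserved for secondary ioreq servers (see ioreq_server_pfn()).
 */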
#define SPECIALPAGE_PAGING 0
#define SPECIALPAGE_ACCESS 1
#define SPECIALPAGE_SHARING 2
#define SPECIALPAGE_BUFIOREQ 3
#define SPECIALPAGE_XENSTORE 4
#define SPECIALPAGE_IOREQ 5
#define SPECIALPAGE_IDENT_PT 6
#define SPECIALPAGE_CONSOLE 7
#define special_pfn(x) \
(X86_HVM_END_SPECIAL_REGION - X86_HVM_NR_SPECIAL_PAGES + (x))
#define NR_IOREQ_SERVER_PAGES 8
#define ioreq_server_pfn(x) (special_pfn(0) - NR_IOREQ_SERVER_PAGES + (x))
#define bits_to_mask(bits) (((xen_vaddr_t)1 << (bits))-1)
#define round_down(addr, mask) ((addr) & ~(mask))
#define round_up(addr, mask) ((addr) | (mask))
#define round_pg_up(addr) (((addr) + PAGE_SIZE_X86 - 1) & ~(PAGE_SIZE_X86 - 1))
#define HVMLOADER_MODULE_MAX_COUNT 2
#define HVMLOADER_MODULE_CMDLINE_SIZE MAX_GUEST_CMDLINE
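/* Paging mode parameters: number of page table levels, the usable virtual
 * address mask, and the default protection bits for entries at each level. */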
struct xc_dom_params {
unsigned levels;
xen_vaddr_t vaddr_mask;
x86_pgentry_t lvl_prot[4];
};
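/* Bookkeeping for one virtual mapping: the overall area plus, for each page
 * table level, the covered range, the pfn of its first table page and the
 * number of table pages at that level. */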
struct xc_dom_x86_mapping_lvl {
xen_vaddr_t from;
xen_vaddr_t to;
xen_pfn_t pfn;
unsigned int pgtables;
};
struct xc_dom_x86_mapping {
struct xc_dom_x86_mapping_lvl area;
struct xc_dom_x86_mapping_lvl lvls[4];
};
struct xc_dom_image_x86 {
unsigned n_mappings;
#define MAPPING_MAX 2
struct xc_dom_x86_mapping maps[MAPPING_MAX];
struct xc_dom_params *params;
};
/* get guest IO ABI protocol */
const char *xc_domain_get_native_protocol(xc_interface *xch,
uint32_t domid)
{
int ret;
uint32_t guest_width;
const char *protocol;
ret = xc_domain_get_guest_width(xch, domid, &guest_width);
if ( ret )
return NULL;
switch (guest_width) {
case 4: /* 32 bit guest */
protocol = XEN_IO_PROTO_ABI_X86_32;
break;
case 8: /* 64 bit guest */
protocol = XEN_IO_PROTO_ABI_X86_64;
break;
default:
protocol = NULL;
}
return protocol;
}
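/*
 * Compute the page table pages needed to map [from, to] starting at guest
 * pfn 'pfn', and record the result in a new entry of domx86->maps.  Tables
 * already provided by earlier mappings (including the shared top level
 * table) are not counted again.
 */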
static int count_pgtables(struct xc_dom_image *dom, xen_vaddr_t from,
xen_vaddr_t to, xen_pfn_t pfn)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
struct xc_dom_x86_mapping *map, *map_cmp;
xen_pfn_t pfn_end;
xen_vaddr_t mask;
unsigned bits;
int l, m;
if ( domx86->n_mappings == MAPPING_MAX )
{
xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
"%s: too many mappings\n", __FUNCTION__);
return -ENOMEM;
}
map = domx86->maps + domx86->n_mappings;
pfn_end = pfn + ((to - from) >> PAGE_SHIFT_X86);
if ( pfn_end >= dom->p2m_size )
{
xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
"%s: not enough memory for initial mapping (%#"PRIpfn" > %#"PRIpfn")",
__FUNCTION__, pfn_end, dom->p2m_size);
return -ENOMEM;
}
for ( m = 0; m < domx86->n_mappings; m++ )
{
map_cmp = domx86->maps + m;
if ( from < map_cmp->area.to && to > map_cmp->area.from )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: overlapping mappings\n", __FUNCTION__);
return -EINVAL;
}
}
memset(map, 0, sizeof(*map));
map->area.from = from & domx86->params->vaddr_mask;
map->area.to = to & domx86->params->vaddr_mask;
for ( l = domx86->params->levels - 1; l >= 0; l-- )
{
map->lvls[l].pfn = dom->pfn_alloc_end + map->area.pgtables;
if ( l == domx86->params->levels - 1 )
{
/* Top level page table in first mapping only. */
if ( domx86->n_mappings == 0 )
{
map->lvls[l].from = 0;
map->lvls[l].to = domx86->params->vaddr_mask;
map->lvls[l].pgtables = 1;
map->area.pgtables++;
}
continue;
}
bits = PAGE_SHIFT_X86 + (l + 1) * PGTBL_LEVEL_SHIFT_X86;
mask = bits_to_mask(bits);
map->lvls[l].from = map->area.from & ~mask;
map->lvls[l].to = map->area.to | mask;
if ( domx86->params->levels == PGTBL_LEVELS_I386 &&
domx86->n_mappings == 0 && to < 0xc0000000 && l == 1 )
{
DOMPRINTF("%s: PAE: extra l2 page table for l3#3", __FUNCTION__);
map->lvls[l].to = domx86->params->vaddr_mask;
}
for ( m = 0; m < domx86->n_mappings; m++ )
{
map_cmp = domx86->maps + m;
if ( map_cmp->lvls[l].from == map_cmp->lvls[l].to )
continue;
if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
map->lvls[l].to <= map_cmp->lvls[l].to )
{
map->lvls[l].from = 0;
map->lvls[l].to = 0;
break;
}
assert(map->lvls[l].from >= map_cmp->lvls[l].from ||
map->lvls[l].to <= map_cmp->lvls[l].to);
if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
map->lvls[l].from <= map_cmp->lvls[l].to )
map->lvls[l].from = map_cmp->lvls[l].to + 1;
if ( map->lvls[l].to >= map_cmp->lvls[l].from &&
map->lvls[l].to <= map_cmp->lvls[l].to )
map->lvls[l].to = map_cmp->lvls[l].from - 1;
}
if ( map->lvls[l].from < map->lvls[l].to )
map->lvls[l].pgtables =
((map->lvls[l].to - map->lvls[l].from) >> bits) + 1;
DOMPRINTF("%s: 0x%016" PRIx64 "/%d: 0x%016" PRIx64 " -> 0x%016" PRIx64
", %d table(s)", __FUNCTION__, mask, bits,
map->lvls[l].from, map->lvls[l].to, map->lvls[l].pgtables);
map->area.pgtables += map->lvls[l].pgtables;
}
return 0;
}
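/*
 * Allocate the page table segment of a PV guest.  The tables must map
 * themselves (plus the boot stack and 512kB of padding), so iterate:
 * round the end of the allocation up to a 4MB boundary, count the tables
 * needed to map up to there, and stop once the allocation including the
 * tables fits below that boundary.
 */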
static int alloc_pgtables(struct xc_dom_image *dom)
{
int pages, extra_pages;
xen_vaddr_t try_virt_end;
struct xc_dom_image_x86 *domx86 = dom->arch_private;
struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
extra_pages = dom->alloc_bootstack ? 1 : 0;
extra_pages += (512 * 1024) / PAGE_SIZE_X86; /* 512kB padding */
pages = extra_pages;
for ( ; ; )
{
try_virt_end = round_up(dom->virt_alloc_end + pages * PAGE_SIZE_X86,
bits_to_mask(22)); /* 4MB alignment */
if ( count_pgtables(dom, dom->parms.virt_base, try_virt_end, 0) )
return -1;
pages = map->area.pgtables + extra_pages;
if ( dom->virt_alloc_end + pages * PAGE_SIZE_X86 <= try_virt_end + 1 )
break;
}
map->area.pfn = 0;
domx86->n_mappings++;
dom->virt_pgtab_end = try_virt_end + 1;
return xc_dom_alloc_segment(dom, &dom->pgtables_seg, "page tables", 0,
map->area.pgtables * PAGE_SIZE_X86);
}
/* ------------------------------------------------------------------------ */
/* i386 pagetables */
static struct xc_dom_params x86_32_params = {
.levels = PGTBL_LEVELS_I386,
.vaddr_mask = bits_to_mask(VIRT_BITS_I386),
.lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
.lvl_prot[1] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
.lvl_prot[2] = _PAGE_PRESENT,
};
static int alloc_pgtables_x86_32_pae(struct xc_dom_image *dom)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
domx86->params = &x86_32_params;
return alloc_pgtables(dom);
}
#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86)
#define pgentry_to_pfn(entry) ((xen_pfn_t)((entry) >> PAGE_SHIFT_X86))
/*
* Move the l3 page table page below 4G for guests which do not
* support the extended-cr3 format. The l3 is currently empty so we
* do not need to preserve the current contents.
*/
static xen_pfn_t move_l3_below_4G(struct xc_dom_image *dom,
xen_pfn_t l3pfn,
xen_pfn_t l3mfn)
{
xen_pfn_t new_l3mfn;
struct xc_mmu *mmu;
void *l3tab;
mmu = xc_alloc_mmu_updates(dom->xch, dom->guest_domid);
if ( mmu == NULL )
{
DOMPRINTF("%s: failed at %d", __FUNCTION__, __LINE__);
return l3mfn;
}
xc_dom_unmap_one(dom, l3pfn);
new_l3mfn = xc_make_page_below_4G(dom->xch, dom->guest_domid, l3mfn);
if ( !new_l3mfn )
goto out;
dom->p2m_host[l3pfn] = new_l3mfn;
if ( xc_dom_update_guest_p2m(dom) != 0 )
goto out;
if ( xc_add_mmu_update(dom->xch, mmu,
(((unsigned long long)new_l3mfn)
<< XC_DOM_PAGE_SHIFT(dom)) |
MMU_MACHPHYS_UPDATE, l3pfn) )
goto out;
if ( xc_flush_mmu_updates(dom->xch, mmu) )
goto out;
/*
* This ensures that the entire pgtables_seg is mapped by a single
* mmap region. arch_setup_bootlate() relies on this to be able to
* unmap and pin the pagetables.
*/
if ( xc_dom_seg_to_ptr(dom, &dom->pgtables_seg) == NULL )
goto out;
l3tab = xc_dom_pfn_to_ptr(dom, l3pfn, 1);
if ( l3tab == NULL )
{
DOMPRINTF("%s: xc_dom_pfn_to_ptr(dom, l3pfn, 1) => NULL",
__FUNCTION__);
goto out; /* our one call site will call xc_dom_panic and fail */
}
memset(l3tab, 0, XC_DOM_PAGE_SIZE(dom));
DOMPRINTF("%s: successfully relocated L3 below 4G. "
"(L3 PFN %#"PRIpfn" MFN %#"PRIpfn"=>%#"PRIpfn")",
__FUNCTION__, l3pfn, l3mfn, new_l3mfn);
l3mfn = new_l3mfn;
out:
free(mmu);
return l3mfn;
}
static x86_pgentry_t *get_pg_table_x86(struct xc_dom_image *dom, int m, int l)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
struct xc_dom_x86_mapping *map;
x86_pgentry_t *pg;
map = domx86->maps + m;
pg = xc_dom_pfn_to_ptr(dom, map->lvls[l].pfn, 0);
if ( pg )
return pg;
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: xc_dom_pfn_to_ptr failed", __FUNCTION__);
return NULL;
}
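/*
 * Protection bits for an entry at level l pointing at pfn.  Leaf (level 0)
 * entries covering page table pages have _PAGE_RW stripped, as PV guests
 * must not get writable mappings of their own page tables.
 */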
static x86_pgentry_t get_pg_prot_x86(struct xc_dom_image *dom, int l,
xen_pfn_t pfn)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
struct xc_dom_x86_mapping *map;
xen_pfn_t pfn_s, pfn_e;
x86_pgentry_t prot;
unsigned m;
prot = domx86->params->lvl_prot[l];
if ( l > 0 )
return prot;
for ( m = 0; m < domx86->n_mappings; m++ )
{
map = domx86->maps + m;
pfn_s = map->lvls[domx86->params->levels - 1].pfn;
pfn_e = map->area.pgtables + pfn_s;
if ( pfn >= pfn_s && pfn < pfn_e )
return prot & ~_PAGE_RW;
}
return prot;
}
static int setup_pgtables_x86(struct xc_dom_image *dom)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
struct xc_dom_x86_mapping *map1, *map2;
struct xc_dom_x86_mapping_lvl *lvl;
xen_vaddr_t from, to;
xen_pfn_t pfn, p, p_s, p_e;
x86_pgentry_t *pg;
unsigned m1, m2;
int l;
for ( l = domx86->params->levels - 1; l >= 0; l-- )
for ( m1 = 0; m1 < domx86->n_mappings; m1++ )
{
map1 = domx86->maps + m1;
from = map1->lvls[l].from;
to = map1->lvls[l].to;
pg = get_pg_table_x86(dom, m1, l);
if ( !pg )
return -1;
for ( m2 = 0; m2 < domx86->n_mappings; m2++ )
{
map2 = domx86->maps + m2;
lvl = (l > 0) ? map2->lvls + l - 1 : &map2->area;
if ( l > 0 && lvl->pgtables == 0 )
continue;
if ( lvl->from >= to || lvl->to <= from )
continue;
p_s = (max(from, lvl->from) - from) >>
(PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
p_e = (min(to, lvl->to) - from) >>
(PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
pfn = ((max(from, lvl->from) - lvl->from) >>
(PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86)) + lvl->pfn;
for ( p = p_s; p <= p_e; p++ )
{
pg[p] = pfn_to_paddr(xc_dom_p2m(dom, pfn)) |
get_pg_prot_x86(dom, l, pfn);
pfn++;
}
}
}
return 0;
}
static int setup_pgtables_x86_32_pae(struct xc_dom_image *dom)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
xen_pfn_t l3mfn, l3pfn;
l3pfn = domx86->maps[0].lvls[2].pfn;
l3mfn = xc_dom_p2m(dom, l3pfn);
if ( dom->parms.pae == XEN_PAE_YES )
{
if ( l3mfn >= 0x100000 )
l3mfn = move_l3_below_4G(dom, l3pfn, l3mfn);
if ( l3mfn >= 0x100000 )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,"%s: cannot move L3"
" below 4G. extended-cr3 not supported by guest. "
"(L3 PFN %#"PRIpfn" MFN %#"PRIpfn")",
__FUNCTION__, l3pfn, l3mfn);
return -EINVAL;
}
}
return setup_pgtables_x86(dom);
}
/* ------------------------------------------------------------------------ */
/* x86_64 pagetables */
static struct xc_dom_params x86_64_params = {
.levels = PGTBL_LEVELS_X86_64,
.vaddr_mask = bits_to_mask(VIRT_BITS_X86_64),
.lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
.lvl_prot[1] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
.lvl_prot[2] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
.lvl_prot[3] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
};
static int alloc_pgtables_x86_64(struct xc_dom_image *dom)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
domx86->params = &x86_64_params;
return alloc_pgtables(dom);
}
static int setup_pgtables_x86_64(struct xc_dom_image *dom)
{
return setup_pgtables_x86(dom);
}
/* ------------------------------------------------------------------------ */
static int alloc_p2m_list(struct xc_dom_image *dom, size_t p2m_alloc_size)
{
if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach",
0, p2m_alloc_size) )
return -1;
dom->p2m_guest = xc_dom_seg_to_ptr(dom, &dom->p2m_seg);
if ( dom->p2m_guest == NULL )
return -1;
return 0;
}
static int alloc_p2m_list_x86_32(struct xc_dom_image *dom)
{
size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
p2m_alloc_size = round_pg_up(p2m_alloc_size);
return alloc_p2m_list(dom, p2m_alloc_size);
}
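/*
 * 64-bit PV guests may request that the p2m list lives at parms.p2m_base in
 * a dedicated virtual area.  In that case also count and reserve the page
 * tables needed to map it; the table pfns are shifted past the p2m pages,
 * which come first within the same segment.
 */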
static int alloc_p2m_list_x86_64(struct xc_dom_image *dom)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
xen_vaddr_t from, to;
unsigned lvl;
p2m_alloc_size = round_pg_up(p2m_alloc_size);
if ( dom->parms.p2m_base != UNSET_ADDR )
{
from = dom->parms.p2m_base;
to = from + p2m_alloc_size - 1;
if ( count_pgtables(dom, from, to, dom->pfn_alloc_end) )
return -1;
map->area.pfn = dom->pfn_alloc_end;
for ( lvl = 0; lvl < 4; lvl++ )
map->lvls[lvl].pfn += p2m_alloc_size >> PAGE_SHIFT_X86;
domx86->n_mappings++;
p2m_alloc_size += map->area.pgtables << PAGE_SHIFT_X86;
}
return alloc_p2m_list(dom, p2m_alloc_size);
}
/* ------------------------------------------------------------------------ */
static int alloc_magic_pages_pv(struct xc_dom_image *dom)
{
dom->start_info_pfn = xc_dom_alloc_page(dom, "start info");
if ( dom->start_info_pfn == INVALID_PFN )
return -1;
dom->xenstore_pfn = xc_dom_alloc_page(dom, "xenstore");
if ( dom->xenstore_pfn == INVALID_PFN )
return -1;
xc_clear_domain_page(dom->xch, dom->guest_domid,
xc_dom_p2m(dom, dom->xenstore_pfn));
dom->console_pfn = xc_dom_alloc_page(dom, "console");
if ( dom->console_pfn == INVALID_PFN )
return -1;
xc_clear_domain_page(dom->xch, dom->guest_domid,
xc_dom_p2m(dom, dom->console_pfn));
dom->alloc_bootstack = 1;
return 0;
}
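/*
 * Fill the hvm_info_table at HVM_INFO_OFFSET inside the HVM info page:
 * header, default vCPU settings, memory layout (low/high memory ends and
 * the start of the reserved special/ioreq region), plus a checksum chosen
 * so that the table's bytes sum to zero.
 */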
static void build_hvm_info(void *hvm_info_page, struct xc_dom_image *dom)
{
struct hvm_info_table *hvm_info = (struct hvm_info_table *)
(((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
uint8_t sum;
int i;
memset(hvm_info_page, 0, PAGE_SIZE);
/* Fill in the header. */
memcpy(hvm_info->signature, "HVM INFO", sizeof(hvm_info->signature));
hvm_info->length = sizeof(struct hvm_info_table);
/* Sensible defaults: these can be overridden by the caller. */
hvm_info->apic_mode = 1;
hvm_info->nr_vcpus = 1;
memset(hvm_info->vcpu_online, 0xff, sizeof(hvm_info->vcpu_online));
/* Memory parameters. */
hvm_info->low_mem_pgend = dom->lowmem_end >> PAGE_SHIFT;
hvm_info->high_mem_pgend = dom->highmem_end >> PAGE_SHIFT;
hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0);
/* Finish with the checksum. */
for ( i = 0, sum = 0; i < hvm_info->length; i++ )
sum += ((uint8_t *)hvm_info)[i];
hvm_info->checksum = -sum;
}
static int alloc_magic_pages_hvm(struct xc_dom_image *dom)
{
unsigned long i;
uint32_t *ident_pt, domid = dom->guest_domid;
int rc;
xen_pfn_t special_array[X86_HVM_NR_SPECIAL_PAGES];
xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES];
xc_interface *xch = dom->xch;
size_t start_info_size = sizeof(struct hvm_start_info);
/* Allocate and clear special pages. */
for ( i = 0; i < X86_HVM_NR_SPECIAL_PAGES; i++ )
special_array[i] = special_pfn(i);
rc = xc_domain_populate_physmap_exact(xch, domid, X86_HVM_NR_SPECIAL_PAGES,
0, 0, special_array);
if ( rc != 0 )
{
DOMPRINTF("Could not allocate special pages.");
goto error_out;
}
if ( xc_clear_domain_pages(xch, domid, special_pfn(0),
X86_HVM_NR_SPECIAL_PAGES) )
goto error_out;
xc_hvm_param_set(xch, domid, HVM_PARAM_STORE_PFN,
special_pfn(SPECIALPAGE_XENSTORE));
xc_hvm_param_set(xch, domid, HVM_PARAM_BUFIOREQ_PFN,
special_pfn(SPECIALPAGE_BUFIOREQ));
xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_PFN,
special_pfn(SPECIALPAGE_IOREQ));
xc_hvm_param_set(xch, domid, HVM_PARAM_CONSOLE_PFN,
special_pfn(SPECIALPAGE_CONSOLE));
xc_hvm_param_set(xch, domid, HVM_PARAM_PAGING_RING_PFN,
special_pfn(SPECIALPAGE_PAGING));
xc_hvm_param_set(xch, domid, HVM_PARAM_MONITOR_RING_PFN,
special_pfn(SPECIALPAGE_ACCESS));
xc_hvm_param_set(xch, domid, HVM_PARAM_SHARING_RING_PFN,
special_pfn(SPECIALPAGE_SHARING));
start_info_size +=
sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;
start_info_size +=
HVMLOADER_MODULE_CMDLINE_SIZE * HVMLOADER_MODULE_MAX_COUNT;
if ( !dom->device_model )
{
if ( dom->cmdline )
{
dom->cmdline_size = ROUNDUP(strlen(dom->cmdline) + 1, 8);
start_info_size += dom->cmdline_size;
}
}
else
{
/*
* Allocate and clear additional ioreq server pages. The default
* server will use the IOREQ and BUFIOREQ special pages above.
*/
for ( i = 0; i < NR_IOREQ_SERVER_PAGES; i++ )
ioreq_server_array[i] = ioreq_server_pfn(i);
rc = xc_domain_populate_physmap_exact(xch, domid, NR_IOREQ_SERVER_PAGES, 0,
0, ioreq_server_array);
if ( rc != 0 )
{
DOMPRINTF("Could not allocate ioreq server pages.");
goto error_out;
}
if ( xc_clear_domain_pages(xch, domid, ioreq_server_pfn(0),
NR_IOREQ_SERVER_PAGES) )
goto error_out;
/* Tell the domain where the pages are and how many there are */
xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_SERVER_PFN,
ioreq_server_pfn(0));
xc_hvm_param_set(xch, domid, HVM_PARAM_NR_IOREQ_SERVER_PAGES,
NR_IOREQ_SERVER_PAGES);
}
rc = xc_dom_alloc_segment(dom, &dom->start_info_seg,
"HVM start info", 0, start_info_size);
if ( rc != 0 )
{
DOMPRINTF("Unable to reserve memory for the start info");
goto out;
}
/*
* Identity-map page table is required for running with CR0.PG=0 when
* using Intel EPT. Create a 32-bit non-PAE page directory of superpages.
*/
if ( (ident_pt = xc_map_foreign_range(
xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
special_pfn(SPECIALPAGE_IDENT_PT))) == NULL )
goto error_out;
for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
_PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
munmap(ident_pt, PAGE_SIZE);
xc_hvm_param_set(xch, domid, HVM_PARAM_IDENT_PT,
special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT);
dom->console_pfn = special_pfn(SPECIALPAGE_CONSOLE);
xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn);
dom->xenstore_pfn = special_pfn(SPECIALPAGE_XENSTORE);
xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn);
dom->parms.virt_hypercall = -1;
rc = 0;
goto out;
error_out:
rc = -1;
out:
return rc;
}
/* ------------------------------------------------------------------------ */
static int start_info_x86_32(struct xc_dom_image *dom)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
start_info_x86_32_t *start_info =
xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
xen_pfn_t shinfo =
xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;
DOMPRINTF_CALLED(dom->xch);
if ( start_info == NULL )
{
DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
return -1; /* our caller throws away our return value :-/ */
}
memset(start_info, 0, sizeof(*start_info));
strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
start_info->magic[sizeof(start_info->magic) - 1] = '\0';
start_info->nr_pages = dom->total_pages;
start_info->shared_info = shinfo << PAGE_SHIFT_X86;
start_info->pt_base = dom->pgtables_seg.vstart;
start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
start_info->mfn_list = dom->p2m_seg.vstart;
start_info->flags = dom->flags;
start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
start_info->store_evtchn = dom->xenstore_evtchn;
start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
start_info->console.domU.evtchn = dom->console_evtchn;
if ( dom->modules[0].blob )
{
start_info->mod_start = dom->initrd_start;
start_info->mod_len = dom->initrd_len;
}
if ( dom->cmdline )
{
strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE);
start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
}
return 0;
}
static int start_info_x86_64(struct xc_dom_image *dom)
{
struct xc_dom_image_x86 *domx86 = dom->arch_private;
start_info_x86_64_t *start_info =
xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
xen_pfn_t shinfo =
xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;
DOMPRINTF_CALLED(dom->xch);
if ( start_info == NULL )
{
DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
return -1; /* our caller throws away our return value :-/ */
}
memset(start_info, 0, sizeof(*start_info));
strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
start_info->magic[sizeof(start_info->magic) - 1] = '\0';
start_info->nr_pages = dom->total_pages;
start_info->shared_info = shinfo << PAGE_SHIFT_X86;
start_info->pt_base = dom->pgtables_seg.vstart;
start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
start_info->mfn_list = dom->p2m_seg.vstart;
if ( dom->parms.p2m_base != UNSET_ADDR )
{
start_info->first_p2m_pfn = dom->p2m_seg.pfn;
start_info->nr_p2m_frames = dom->p2m_seg.pages;
}
start_info->flags = dom->flags;
start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
start_info->store_evtchn = dom->xenstore_evtchn;
start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
start_info->console.domU.evtchn = dom->console_evtchn;
if ( dom->modules[0].blob )
{
start_info->mod_start = dom->initrd_start;
start_info->mod_len = dom->initrd_len;
}
if ( dom->cmdline )
{
strncpy((char *)start_info->cmd_line, dom->cmdline, MAX_GUEST_CMDLINE);
start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
}
return 0;
}
static int shared_info_x86_32(struct xc_dom_image *dom, void *ptr)
{
shared_info_x86_32_t *shared_info = ptr;
int i;
DOMPRINTF_CALLED(dom->xch);
memset(shared_info, 0, sizeof(*shared_info));
for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
return 0;
}
static int shared_info_x86_64(struct xc_dom_image *dom, void *ptr)
{
shared_info_x86_64_t *shared_info = ptr;
int i;
DOMPRINTF_CALLED(dom->xch);
memset(shared_info, 0, sizeof(*shared_info));
for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
shared_info->vcpu_info[i].evtchn_upcall_mask = 1;
return 0;
}
/* ------------------------------------------------------------------------ */
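/*
 * Initial vCPU state for PV guests: entry point in EIP/RIP, boot stack in
 * ESP/RSP, start_info in ESI/RSI, flat kernel segments, and CR3 pointing
 * at the top level page table.
 */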
static int vcpu_x86_32(struct xc_dom_image *dom)
{
vcpu_guest_context_any_t any_ctx;
vcpu_guest_context_x86_32_t *ctxt = &any_ctx.x32;
xen_pfn_t cr3_pfn;
int rc;
DOMPRINTF_CALLED(dom->xch);
/* clear everything */
memset(ctxt, 0, sizeof(*ctxt));
ctxt->user_regs.eip = dom->parms.virt_entry;
ctxt->user_regs.esp =
dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
ctxt->user_regs.esi =
dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */
ctxt->flags = VGCF_in_kernel_X86_32 | VGCF_online_X86_32;
if ( dom->parms.pae == XEN_PAE_EXTCR3 ||
dom->parms.pae == XEN_PAE_BIMODAL )
ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3);
cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_32(cr3_pfn);
DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
__FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);
ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_32;
ctxt->user_regs.es = FLAT_KERNEL_DS_X86_32;
ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_32;
ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_32;
ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_32;
ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_32;
ctxt->kernel_ss = ctxt->user_regs.ss;
ctxt->kernel_sp = ctxt->user_regs.esp;
rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
if ( rc != 0 )
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
return rc;
}
static int vcpu_x86_64(struct xc_dom_image *dom)
{
vcpu_guest_context_any_t any_ctx;
vcpu_guest_context_x86_64_t *ctxt = &any_ctx.x64;
xen_pfn_t cr3_pfn;
int rc;
DOMPRINTF_CALLED(dom->xch);
/* clear everything */
memset(ctxt, 0, sizeof(*ctxt));
ctxt->user_regs.rip = dom->parms.virt_entry;
ctxt->user_regs.rsp =
dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
ctxt->user_regs.rsi =
dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
ctxt->user_regs.rflags = 1 << 9; /* Interrupt Enable */
ctxt->flags = VGCF_in_kernel_X86_64 | VGCF_online_X86_64;
cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_64(cr3_pfn);
DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
__FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);
ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_64;
ctxt->user_regs.es = FLAT_KERNEL_DS_X86_64;
ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_64;
ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_64;
ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_64;
ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_64;
ctxt->kernel_ss = ctxt->user_regs.ss;
ctxt->kernel_sp = ctxt->user_regs.esp;
rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
if ( rc != 0 )
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);
return rc;
}
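/*
 * Initial vCPU state for HVM guests: build a minimal HVM context made of
 * the header retrieved from Xen, one CPU record for the BSP (flat 32-bit
 * protected mode, paging disabled, RIP at the guest's physical entry point)
 * and an END record, then load it with SETHVMCONTEXT.
 */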
static int vcpu_hvm(struct xc_dom_image *dom)
{
struct {
struct hvm_save_descriptor header_d;
HVM_SAVE_TYPE(HEADER) header;
struct hvm_save_descriptor cpu_d;
HVM_SAVE_TYPE(CPU) cpu;
struct hvm_save_descriptor end_d;
HVM_SAVE_TYPE(END) end;
} bsp_ctx;
uint8_t *full_ctx = NULL;
int rc;
DOMPRINTF_CALLED(dom->xch);
/*
* Get the full HVM context in order to have the header, it is not
* possible to get the header with getcontext_partial, and crafting one
* from userspace is also not an option since cpuid is trapped and
* modified by Xen.
*/
rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, NULL, 0);
if ( rc <= 0 )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: unable to fetch HVM context size (rc=%d)",
__func__, rc);
goto out;
}
full_ctx = calloc(1, rc);
if ( full_ctx == NULL )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: unable to allocate memory for HVM context (rc=%d)",
__func__, rc);
rc = -ENOMEM;
goto out;
}
rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, full_ctx, rc);
if ( rc <= 0 )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: unable to fetch HVM context (rc=%d)",
__func__, rc);
goto out;
}
/* Copy the header to our partial context. */
memset(&bsp_ctx, 0, sizeof(bsp_ctx));
memcpy(&bsp_ctx, full_ctx,
sizeof(struct hvm_save_descriptor) + HVM_SAVE_LENGTH(HEADER));
/* Set the CPU descriptor. */
bsp_ctx.cpu_d.typecode = HVM_SAVE_CODE(CPU);
bsp_ctx.cpu_d.instance = 0;
bsp_ctx.cpu_d.length = HVM_SAVE_LENGTH(CPU);
/* Set the cached part of the relevant segment registers. */
bsp_ctx.cpu.cs_base = 0;
bsp_ctx.cpu.ds_base = 0;
bsp_ctx.cpu.ss_base = 0;
bsp_ctx.cpu.tr_base = 0;
bsp_ctx.cpu.cs_limit = ~0u;
bsp_ctx.cpu.ds_limit = ~0u;
bsp_ctx.cpu.ss_limit = ~0u;
bsp_ctx.cpu.tr_limit = 0x67;
bsp_ctx.cpu.cs_arbytes = 0xc9b;
bsp_ctx.cpu.ds_arbytes = 0xc93;
bsp_ctx.cpu.ss_arbytes = 0xc93;
bsp_ctx.cpu.tr_arbytes = 0x8b;
/* Set the control registers. */
bsp_ctx.cpu.cr0 = X86_CR0_PE | X86_CR0_ET;
/* Set the IP. */
bsp_ctx.cpu.rip = dom->parms.phys_entry;
if ( dom->start_info_seg.pfn )
bsp_ctx.cpu.rbx = dom->start_info_seg.pfn << PAGE_SHIFT;
/* Set the end descriptor. */
bsp_ctx.end_d.typecode = HVM_SAVE_CODE(END);
bsp_ctx.end_d.instance = 0;
bsp_ctx.end_d.length = HVM_SAVE_LENGTH(END);
rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid,
(uint8_t *)&bsp_ctx, sizeof(bsp_ctx));
if ( rc != 0 )
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc);
out:
free(full_ctx);
return rc;
}
/* ------------------------------------------------------------------------ */
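/* Switch the hypervisor's idea of the guest's address size (32 or 64 bit)
 * according to the guest type string; a no-op for unknown types. */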
static int x86_compat(xc_interface *xch, uint32_t domid, char *guest_type)
{
static const struct {
char *guest;
uint32_t size;
} types[] = {
{ "xen-3.0-x86_32p", 32 },
{ "xen-3.0-x86_64", 64 },
};
DECLARE_DOMCTL;
int i,rc;
memset(&domctl, 0, sizeof(domctl));
domctl.domain = domid;
domctl.cmd = XEN_DOMCTL_set_address_size;
for ( i = 0; i < ARRAY_SIZE(types); i++ )
if ( !strcmp(types[i].guest, guest_type) )
domctl.u.address_size.size = types[i].size;
if ( domctl.u.address_size.size == 0 )
/* nothing to do */
return 0;
xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "", __FUNCTION__,
guest_type, domctl.u.address_size.size);
rc = do_domctl(xch, &domctl);
if ( rc != 0 )
xc_dom_printf(xch, "%s: warning: failed (rc=%d)",
__FUNCTION__, rc);
return rc;
}
static int meminit_pv(struct xc_dom_image *dom)
{
int rc;
xen_pfn_t pfn, allocsz, mfn, total, pfn_base;
int i, j, k;
xen_vmemrange_t dummy_vmemrange[1];
unsigned int dummy_vnode_to_pnode[1];
xen_vmemrange_t *vmemranges;
unsigned int *vnode_to_pnode;
unsigned int nr_vmemranges, nr_vnodes;
rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
if ( rc )
return rc;
    /* Try to claim pages for early warning of insufficient memory available. */
if ( dom->claim_enabled )
{
rc = xc_domain_claim_pages(dom->xch, dom->guest_domid,
dom->total_pages);
if ( rc )
return rc;
}
    /* Set up dummy vNUMA information if it's not provided. Note
* that this is a valid state if libxl doesn't provide any
* vNUMA information.
*
* The dummy values make libxc allocate all pages from
* arbitrary physical nodes. This is the expected behaviour if
* no vNUMA configuration is provided to libxc.
*
* Note that the following hunk is just for the convenience of
* allocation code. No defaulting happens in libxc.
*/
if ( dom->nr_vmemranges == 0 )
{
nr_vmemranges = 1;
vmemranges = dummy_vmemrange;
vmemranges[0].start = 0;
vmemranges[0].end = (uint64_t)dom->total_pages << PAGE_SHIFT;
vmemranges[0].flags = 0;
vmemranges[0].nid = 0;
nr_vnodes = 1;
vnode_to_pnode = dummy_vnode_to_pnode;
vnode_to_pnode[0] = XC_NUMA_NO_NODE;
}
else
{
nr_vmemranges = dom->nr_vmemranges;
nr_vnodes = dom->nr_vnodes;
vmemranges = dom->vmemranges;
vnode_to_pnode = dom->vnode_to_pnode;
}
total = dom->p2m_size = 0;
for ( i = 0; i < nr_vmemranges; i++ )
{
total += ((vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT);
dom->p2m_size = max(dom->p2m_size,
(xen_pfn_t)(vmemranges[i].end >> PAGE_SHIFT));
}
if ( total != dom->total_pages )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: vNUMA page count mismatch (0x%"PRIpfn" != 0x%"PRIpfn")",
__func__, total, dom->total_pages);
return -EINVAL;
}
dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * dom->p2m_size);
if ( dom->p2m_host == NULL )
return -EINVAL;
for ( pfn = 0; pfn < dom->p2m_size; pfn++ )
dom->p2m_host[pfn] = INVALID_PFN;
/* allocate guest memory */
for ( i = 0; i < nr_vmemranges; i++ )
{
unsigned int memflags;
uint64_t pages, super_pages;
unsigned int pnode = vnode_to_pnode[vmemranges[i].nid];
xen_pfn_t extents[SUPERPAGE_BATCH_SIZE];
xen_pfn_t pfn_base_idx;
memflags = 0;
if ( pnode != XC_NUMA_NO_NODE )
memflags |= XENMEMF_exact_node(pnode);
pages = (vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT;
super_pages = pages >> SUPERPAGE_2MB_SHIFT;
pfn_base = vmemranges[i].start >> PAGE_SHIFT;
for ( pfn = pfn_base; pfn < pfn_base+pages; pfn++ )
dom->p2m_host[pfn] = pfn;
pfn_base_idx = pfn_base;
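        /*
         * Try to populate the range with 2MB superpages, in batches of at
         * most SUPERPAGE_BATCH_SIZE extents; pfns not covered by
         * successfully allocated superpages are populated with 4kB pages
         * afterwards.
         */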
while ( super_pages ) {
uint64_t count = min_t(uint64_t, super_pages, SUPERPAGE_BATCH_SIZE);
super_pages -= count;
for ( pfn = pfn_base_idx, j = 0;
pfn < pfn_base_idx + (count << SUPERPAGE_2MB_SHIFT);
pfn += SUPERPAGE_2MB_NR_PFNS, j++ )
extents[j] = dom->p2m_host[pfn];
rc = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
SUPERPAGE_2MB_SHIFT, memflags,
extents);
if ( rc < 0 )
return rc;
/* Expand the returned mfns into the p2m array. */
pfn = pfn_base_idx;
for ( j = 0; j < rc; j++ )
{
mfn = extents[j];
for ( k = 0; k < SUPERPAGE_2MB_NR_PFNS; k++, pfn++ )
dom->p2m_host[pfn] = mfn + k;
}
pfn_base_idx = pfn;
}
for ( j = pfn_base_idx - pfn_base; j < pages; j += allocsz )
{
allocsz = min_t(uint64_t, 1024 * 1024, pages - j);
rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
allocsz, 0, memflags, &dom->p2m_host[pfn_base + j]);
if ( rc )
{
if ( pnode != XC_NUMA_NO_NODE )
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: failed to allocate 0x%"PRIx64" pages (v=%d, p=%d)",
__func__, pages, i, pnode);
else
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: failed to allocate 0x%"PRIx64" pages",
__func__, pages);
return rc;
}
}
rc = 0;
}
    /* Ensure no unclaimed pages are left unused.
     * OK to call even if the earlier claim call was not made. */
xc_domain_claim_pages(dom->xch, dom->guest_domid, 0 /* cancel claim */);
return rc;
}
/*
* Check whether there exists mmio hole in the specified memory range.
* Returns 1 if exists, else returns 0.
*/
static int check_mmio_hole(uint64_t start, uint64_t memsize,
uint64_t mmio_start, uint64_t mmio_size)
{
if ( start + memsize <= mmio_start || start >= mmio_start + mmio_size )
return 0;
else
return 1;
}
static int meminit_hvm(struct xc_dom_image *dom)
{
unsigned long i, vmemid, nr_pages = dom->total_pages;
unsigned long p2m_size;
unsigned long target_pages = dom->target_pages;
unsigned long cur_pages, cur_pfn;
int rc;
unsigned long stat_normal_pages = 0, stat_2mb_pages = 0,
stat_1gb_pages = 0;
unsigned int memflags = 0;
int claim_enabled = dom->claim_enabled;
uint64_t total_pages;
xen_vmemrange_t dummy_vmemrange[2];
unsigned int dummy_vnode_to_pnode[1];
xen_vmemrange_t *vmemranges;
unsigned int *vnode_to_pnode;
unsigned int nr_vmemranges, nr_vnodes;
xc_interface *xch = dom->xch;
uint32_t domid = dom->guest_domid;
if ( nr_pages > target_pages )
memflags |= XENMEMF_populate_on_demand;
if ( dom->nr_vmemranges == 0 )
{
/* Build dummy vnode information
*
* Guest physical address space layout:
* [0, hole_start) [hole_start, 4G) [4G, highmem_end)
*
* Of course if there is no high memory, the second vmemrange
* has no effect on the actual result.
*/
dummy_vmemrange[0].start = 0;
dummy_vmemrange[0].end = dom->lowmem_end;
dummy_vmemrange[0].flags = 0;
dummy_vmemrange[0].nid = 0;
nr_vmemranges = 1;
if ( dom->highmem_end > (1ULL << 32) )
{
dummy_vmemrange[1].start = 1ULL << 32;
dummy_vmemrange[1].end = dom->highmem_end;
dummy_vmemrange[1].flags = 0;
dummy_vmemrange[1].nid = 0;
nr_vmemranges++;
}
dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
nr_vnodes = 1;
vmemranges = dummy_vmemrange;
vnode_to_pnode = dummy_vnode_to_pnode;
}
else
{
if ( nr_pages > target_pages )
{
DOMPRINTF("Cannot enable vNUMA and PoD at the same time");
goto error_out;
}
nr_vmemranges = dom->nr_vmemranges;
nr_vnodes = dom->nr_vnodes;
vmemranges = dom->vmemranges;
vnode_to_pnode = dom->vnode_to_pnode;
}
total_pages = 0;
p2m_size = 0;
for ( i = 0; i < nr_vmemranges; i++ )
{
DOMPRINTF("range: start=0x%"PRIx64" end=0x%"PRIx64, vmemranges[i].start, vmemranges[i].end);
total_pages += ((vmemranges[i].end - vmemranges[i].start)
>> PAGE_SHIFT);
p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ?
p2m_size : (vmemranges[i].end >> PAGE_SHIFT);
}
if ( total_pages != nr_pages )
{
DOMPRINTF("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%lx)",
total_pages, nr_pages);
goto error_out;
}
dom->p2m_size = p2m_size;
dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) *
dom->p2m_size);
if ( dom->p2m_host == NULL )
{
DOMPRINTF("Could not allocate p2m");
goto error_out;
}
for ( i = 0; i < p2m_size; i++ )
dom->p2m_host[i] = ((xen_pfn_t)-1);
for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
{
uint64_t pfn;
for ( pfn = vmemranges[vmemid].start >> PAGE_SHIFT;
pfn < vmemranges[vmemid].end >> PAGE_SHIFT;
pfn++ )
dom->p2m_host[pfn] = pfn;
}
/*
* Try to claim pages for early warning of insufficient memory available.
     * This should go before xc_domain_set_pod_target, because that function
* actually allocates memory for the guest. Claiming after memory has been
* allocated is pointless.
*/
if ( claim_enabled ) {
rc = xc_domain_claim_pages(xch, domid,
target_pages - dom->vga_hole_size);
if ( rc != 0 )
{
DOMPRINTF("Could not allocate memory for HVM guest as we cannot claim memory!");
goto error_out;
}
}
if ( memflags & XENMEMF_populate_on_demand )
{
/*
* Subtract VGA_HOLE_SIZE from target_pages for the VGA
* "hole". Xen will adjust the PoD cache size so that domain
* tot_pages will be target_pages - VGA_HOLE_SIZE after
* this call.
*/
rc = xc_domain_set_pod_target(xch, domid,
target_pages - dom->vga_hole_size,
NULL, NULL, NULL);
if ( rc != 0 )
{
DOMPRINTF("Could not set PoD target for HVM guest.\n");
goto error_out;
}
}
/*
* Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
*
     * We attempt to allocate 1GB pages where possible, falling back to 2MB
     * pages if a 1GB allocation fails, and finally to 4KB pages if both
     * fail.
     *
     * In 2MB mode we allocate pages in batches of no more than 8MB so that
     * the operation can be preempted and dom0 remains responsive.
*/
if ( dom->device_model )
{
rc = xc_domain_populate_physmap_exact(
xch, domid, 0xa0, 0, memflags, &dom->p2m_host[0x00]);
if ( rc != 0 )
{
DOMPRINTF("Could not populate low memory (< 0xA0).\n");
goto error_out;
}
}
stat_normal_pages = 0;
for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
{
unsigned int new_memflags = memflags;
uint64_t end_pages;
unsigned int vnode = vmemranges[vmemid].nid;
unsigned int pnode = vnode_to_pnode[vnode];
if ( pnode != XC_NUMA_NO_NODE )
new_memflags |= XENMEMF_exact_node(pnode);
end_pages = vmemranges[vmemid].end >> PAGE_SHIFT;
/*
         * Consider the VGA hole to belong to the vmemrange that covers
         * 0xA0000-0xC0000.  Note that 0x00000-0xA0000 is populated just
         * before this loop.
*/
if ( vmemranges[vmemid].start == 0 && dom->device_model )
{
cur_pages = 0xc0;
stat_normal_pages += 0xc0;
}
else
cur_pages = vmemranges[vmemid].start >> PAGE_SHIFT;
rc = 0;
while ( (rc == 0) && (end_pages > cur_pages) )
{
/* Clip count to maximum 1GB extent. */
unsigned long count = end_pages - cur_pages;
unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;
if ( count > max_pages )
count = max_pages;
cur_pfn = dom->p2m_host[cur_pages];
/* Take care the corner cases of super page tails */
if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
(count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
(count > SUPERPAGE_1GB_NR_PFNS) )
count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);
            /* Attempt to allocate a 1GB superpage.  Because each pass
             * allocates at most 1GB, we don't have to clip superpage
             * boundaries.
             */
if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
/* Check if there exists MMIO hole in the 1GB memory
* range */
!check_mmio_hole(cur_pfn << PAGE_SHIFT,
SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT,
dom->mmio_start, dom->mmio_size) )
{
long done;
unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
xen_pfn_t sp_extents[nr_extents];
for ( i = 0; i < nr_extents; i++ )
                    sp_extents[i] =
                        dom->p2m_host[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];
                done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                  SUPERPAGE_1GB_SHIFT,
                                                  new_memflags, sp_extents);
                if ( done > 0 )
                {
stat_1gb_pages += done;
done <<= SUPERPAGE_1GB_SHIFT;
cur_pages += done;
count -= done;
}
}
if ( count != 0 )
{
/* Clip count to maximum 8MB extent. */
max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
if ( count > max_pages )
count = max_pages;
/* Clip partial superpage extents to superpage
* boundaries. */
if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
(count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
(count > SUPERPAGE_2MB_NR_PFNS) )
count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
/* Attempt to allocate superpage extents. */
if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
{
long done;
unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
xen_pfn_t sp_extents[nr_extents];
for ( i = 0; i < nr_extents; i++ )
                        sp_extents[i] =
                            dom->p2m_host[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];
                    done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                      SUPERPAGE_2MB_SHIFT,
                                                      new_memflags, sp_extents);
                    if ( done > 0 )
                    {
stat_2mb_pages += done;
done <<= SUPERPAGE_2MB_SHIFT;
cur_pages += done;
count -= done;
}
}
}
/* Fall back to 4kB extents. */
if ( count != 0 )
{
rc = xc_domain_populate_physmap_exact(
xch, domid, count, 0, new_memflags, &dom->p2m_host[cur_pages]);
cur_pages += count;
stat_normal_pages += count;
}
}
if ( rc != 0 )
{
DOMPRINTF("Could not allocate memory for HVM guest.");
goto error_out;
}
}
DPRINTF("PHYSICAL MEMORY ALLOCATION:\n");
DPRINTF(" 4KB PAGES: 0x%016lx\n", stat_normal_pages);
DPRINTF(" 2MB PAGES: 0x%016lx\n", stat_2mb_pages);
DPRINTF(" 1GB PAGES: 0x%016lx\n", stat_1gb_pages);
rc = 0;
goto out;
error_out:
rc = -1;
out:
/* ensure no unclaimed pages are left unused */
xc_domain_claim_pages(xch, domid, 0 /* cancels the claim */);
return rc;
}
/* ------------------------------------------------------------------------ */
static int bootearly(struct xc_dom_image *dom)
{
if ( dom->container_type == XC_DOM_PV_CONTAINER &&
elf_xen_feature_get(XENFEAT_auto_translated_physmap, dom->f_active) )
{
DOMPRINTF("PV Autotranslate guests no longer supported");
errno = EOPNOTSUPP;
return -1;
}
return 0;
}
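/*
 * Final PV setup: drop our own mappings of the page table and p2m segments,
 * pin the top level page table (L2/L3/L4 depending on guest type) and
 * initialise the shared info page.
 */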
static int bootlate_pv(struct xc_dom_image *dom)
{
static const struct {
char *guest;
unsigned long pgd_type;
} types[] = {
{ "xen-3.0-x86_32", MMUEXT_PIN_L2_TABLE},
{ "xen-3.0-x86_32p", MMUEXT_PIN_L3_TABLE},
{ "xen-3.0-x86_64", MMUEXT_PIN_L4_TABLE},
};
unsigned long pgd_type = 0;
shared_info_t *shared_info;
xen_pfn_t shinfo;
int i, rc;
for ( i = 0; i < ARRAY_SIZE(types); i++ )
if ( !strcmp(types[i].guest, dom->guest_type) )
pgd_type = types[i].pgd_type;
/* Drop references to all initial page tables before pinning. */
xc_dom_unmap_one(dom, dom->pgtables_seg.pfn);
xc_dom_unmap_one(dom, dom->p2m_seg.pfn);
rc = pin_table(dom->xch, pgd_type,
xc_dom_p2m(dom, dom->pgtables_seg.pfn),
dom->guest_domid);
if ( rc != 0 )
{
xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
"%s: pin_table failed (pfn 0x%" PRIpfn ", rc=%d)",
__FUNCTION__, dom->pgtables_seg.pfn, rc);
return rc;
}
shinfo = dom->shared_info_mfn;
/* setup shared_info page */
DOMPRINTF("%s: shared_info: pfn 0x%" PRIpfn ", mfn 0x%" PRIpfn "",
__FUNCTION__, dom->shared_info_pfn, dom->shared_info_mfn);
shared_info = xc_map_foreign_range(dom->xch, dom->guest_domid,
PAGE_SIZE_X86,
PROT_READ | PROT_WRITE,
shinfo);
if ( shared_info == NULL )
return -1;
dom->arch_hooks->shared_info(dom, shared_info);
munmap(shared_info, PAGE_SIZE_X86);
return 0;
}
static int alloc_pgtables_hvm(struct xc_dom_image *dom)
{
DOMPRINTF("%s: doing nothing", __func__);
return 0;
}
/*
* The memory layout of the start_info page and the modules, and where the
* addresses are stored:
*
* /----------------------------------\
* | struct hvm_start_info |
* +----------------------------------+ <- start_info->modlist_paddr
* | struct hvm_modlist_entry[0] |
* +----------------------------------+
* | struct hvm_modlist_entry[1] |
* +----------------------------------+ <- modlist[0].cmdline_paddr
* | cmdline of module 0 |
* | char[HVMLOADER_MODULE_NAME_SIZE] |
* +----------------------------------+ <- modlist[1].cmdline_paddr
* | cmdline of module 1 |
* +----------------------------------+
*/
static void add_module_to_list(struct xc_dom_image *dom,
struct xc_hvm_firmware_module *module,
const char *cmdline,
struct hvm_modlist_entry *modlist,
struct hvm_start_info *start_info)
{
uint32_t index = start_info->nr_modules;
void *modules_cmdline_start = modlist + HVMLOADER_MODULE_MAX_COUNT;
uint64_t modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
((uintptr_t)modlist - (uintptr_t)start_info);
uint64_t modules_cmdline_paddr = modlist_paddr +
sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;
if ( module->length == 0 )
return;
assert(start_info->nr_modules < HVMLOADER_MODULE_MAX_COUNT);
modlist[index].paddr = module->guest_addr_out;
modlist[index].size = module->length;
if ( cmdline )
{
assert(strnlen(cmdline, HVMLOADER_MODULE_CMDLINE_SIZE)
< HVMLOADER_MODULE_CMDLINE_SIZE);
strncpy(modules_cmdline_start + HVMLOADER_MODULE_CMDLINE_SIZE * index,
cmdline, HVMLOADER_MODULE_CMDLINE_SIZE);
}
modlist[index].cmdline_paddr =
modules_cmdline_paddr + HVMLOADER_MODULE_CMDLINE_SIZE * index;
start_info->nr_modules++;
}
static int bootlate_hvm(struct xc_dom_image *dom)
{
uint32_t domid = dom->guest_domid;
xc_interface *xch = dom->xch;
struct hvm_start_info *start_info;
size_t start_info_size;
struct hvm_modlist_entry *modlist;
unsigned int i;
start_info_size = sizeof(*start_info) + dom->cmdline_size;
start_info_size += sizeof(struct hvm_modlist_entry) * dom->num_modules;
if ( start_info_size >
dom->start_info_seg.pages << XC_DOM_PAGE_SHIFT(dom) )
{
DOMPRINTF("Trying to map beyond start_info_seg");
return -1;
}
start_info = xc_map_foreign_range(xch, domid, start_info_size,
PROT_READ | PROT_WRITE,
dom->start_info_seg.pfn);
if ( start_info == NULL )
{
DOMPRINTF("Unable to map HVM start info page");
return -1;
}
modlist = (void*)(start_info + 1) + dom->cmdline_size;
if ( !dom->device_model )
{
if ( dom->cmdline )
{
char *cmdline = (void*)(start_info + 1);
strncpy(cmdline, dom->cmdline, dom->cmdline_size);
start_info->cmdline_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
((uintptr_t)cmdline - (uintptr_t)start_info);
}
for ( i = 0; i < dom->num_modules; i++ )
{
struct xc_hvm_firmware_module mod;
DOMPRINTF("Adding module %u", i);
mod.guest_addr_out =
dom->modules[i].seg.vstart - dom->parms.virt_base;
mod.length =
dom->modules[i].seg.vend - dom->modules[i].seg.vstart;
add_module_to_list(dom, &mod, dom->modules[i].cmdline,
modlist, start_info);
}
/* ACPI module 0 is the RSDP */
start_info->rsdp_paddr = dom->acpi_modules[0].guest_addr_out ? : 0;
}
else
{
add_module_to_list(dom, &dom->system_firmware_module, "firmware",
modlist, start_info);
}
if ( start_info->nr_modules )
{
start_info->modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
((uintptr_t)modlist - (uintptr_t)start_info);
}
start_info->magic = XEN_HVM_START_MAGIC_VALUE;
munmap(start_info, start_info_size);
if ( dom->device_model )
{
void *hvm_info_page;
if ( (hvm_info_page = xc_map_foreign_range(
xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
HVM_INFO_PFN)) == NULL )
return -1;
build_hvm_info(hvm_info_page, dom);
munmap(hvm_info_page, PAGE_SIZE);
}
return 0;
}
bool xc_dom_translated(const struct xc_dom_image *dom)
{
/* HVM guests are translated. PV guests are not. */
return dom->container_type == XC_DOM_HVM_CONTAINER;
}
/* ------------------------------------------------------------------------ */
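/*
 * Architecture hook tables for 32-bit PAE PV, 64-bit PV and HVM guests,
 * registered below at library initialisation.
 */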
static struct xc_dom_arch xc_dom_32_pae = {
.guest_type = "xen-3.0-x86_32p",
.native_protocol = XEN_IO_PROTO_ABI_X86_32,
.page_shift = PAGE_SHIFT_X86,
.sizeof_pfn = 4,
.p2m_base_supported = 0,
.arch_private_size = sizeof(struct xc_dom_image_x86),
.alloc_magic_pages = alloc_magic_pages_pv,
.alloc_pgtables = alloc_pgtables_x86_32_pae,
.alloc_p2m_list = alloc_p2m_list_x86_32,
.setup_pgtables = setup_pgtables_x86_32_pae,
.start_info = start_info_x86_32,
.shared_info = shared_info_x86_32,
.vcpu = vcpu_x86_32,
.meminit = meminit_pv,
.bootearly = bootearly,
.bootlate = bootlate_pv,
};
static struct xc_dom_arch xc_dom_64 = {
.guest_type = "xen-3.0-x86_64",
.native_protocol = XEN_IO_PROTO_ABI_X86_64,
.page_shift = PAGE_SHIFT_X86,
.sizeof_pfn = 8,
.p2m_base_supported = 1,
.arch_private_size = sizeof(struct xc_dom_image_x86),
.alloc_magic_pages = alloc_magic_pages_pv,
.alloc_pgtables = alloc_pgtables_x86_64,
.alloc_p2m_list = alloc_p2m_list_x86_64,
.setup_pgtables = setup_pgtables_x86_64,
.start_info = start_info_x86_64,
.shared_info = shared_info_x86_64,
.vcpu = vcpu_x86_64,
.meminit = meminit_pv,
.bootearly = bootearly,
.bootlate = bootlate_pv,
};
static struct xc_dom_arch xc_hvm_32 = {
.guest_type = "hvm-3.0-x86_32",
.native_protocol = XEN_IO_PROTO_ABI_X86_32,
.page_shift = PAGE_SHIFT_X86,
.sizeof_pfn = 4,
.alloc_magic_pages = alloc_magic_pages_hvm,
.alloc_pgtables = alloc_pgtables_hvm,
.setup_pgtables = NULL,
.start_info = NULL,
.shared_info = NULL,
.vcpu = vcpu_hvm,
.meminit = meminit_hvm,
.bootearly = bootearly,
.bootlate = bootlate_hvm,
};
static void __init register_arch_hooks(void)
{
xc_dom_register_arch_hooks(&xc_dom_32_pae);
xc_dom_register_arch_hooks(&xc_dom_64);
xc_dom_register_arch_hooks(&xc_hvm_32);
}
/*
* Local variables:
* mode: C
* c-file-style: "BSD"
* c-basic-offset: 4
* tab-width: 4
* indent-tabs-mode: nil
* End:
*/