/*
 * Xen domain builder -- i386 and x86_64 bits.
 *
 * Most architecture-specific code for x86 goes here.
 *   - prepare page tables.
 *   - fill architecture-specific structs.
 *
 * This library is free software; you can redistribute it and/or
 * modify it under the terms of the GNU Lesser General Public
 * License as published by the Free Software Foundation;
 * version 2.1 of the License.
 *
 * This library is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * Lesser General Public License for more details.
 *
 * You should have received a copy of the GNU Lesser General Public
 * License along with this library; If not, see <http://www.gnu.org/licenses/>.
 *
 * written 2006 by Gerd Hoffmann <kraxel@suse.de>.
 *
 */

#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>

#include <xen/xen.h>
#include <xen/foreign/x86_32.h>
#include <xen/foreign/x86_64.h>
#include <xen/hvm/hvm_info_table.h>
#include <xen/hvm/params.h>
#include <xen/io/protocols.h>
#include <xen/arch-x86/hvm/start_info.h>

#include "xg_private.h"
#include "xc_dom.h"
#include "xenctrl.h"

/* ------------------------------------------------------------------------ */

#define SUPERPAGE_BATCH_SIZE 512

#define SUPERPAGE_2MB_SHIFT   9
#define SUPERPAGE_2MB_NR_PFNS (1UL << SUPERPAGE_2MB_SHIFT)
#define SUPERPAGE_1GB_SHIFT   18
#define SUPERPAGE_1GB_NR_PFNS (1UL << SUPERPAGE_1GB_SHIFT)

#define X86_CR0_PE 0x01
#define X86_CR0_ET 0x10

#define SPECIALPAGE_PAGING   0
#define SPECIALPAGE_ACCESS   1
#define SPECIALPAGE_SHARING  2
#define SPECIALPAGE_BUFIOREQ 3
#define SPECIALPAGE_XENSTORE 4
#define SPECIALPAGE_IOREQ    5
#define SPECIALPAGE_IDENT_PT 6
#define SPECIALPAGE_CONSOLE  7
#define special_pfn(x) \
    (X86_HVM_END_SPECIAL_REGION - X86_HVM_NR_SPECIAL_PAGES + (x))

#define NR_IOREQ_SERVER_PAGES 8
#define ioreq_server_pfn(x) (special_pfn(0) - NR_IOREQ_SERVER_PAGES + (x))

#define bits_to_mask(bits)     (((xen_vaddr_t)1 << (bits)) - 1)
#define round_down(addr, mask) ((addr) & ~(mask))
#define round_up(addr, mask)   ((addr) | (mask))
#define round_pg_up(addr) (((addr) + PAGE_SIZE_X86 - 1) & ~(PAGE_SIZE_X86 - 1))

#define HVMLOADER_MODULE_MAX_COUNT 2
#define HVMLOADER_MODULE_CMDLINE_SIZE MAX_GUEST_CMDLINE

struct xc_dom_params {
    unsigned levels;
    xen_vaddr_t vaddr_mask;
    x86_pgentry_t lvl_prot[4];
};

struct xc_dom_x86_mapping_lvl {
    xen_vaddr_t from;
    xen_vaddr_t to;
    xen_pfn_t pfn;
    unsigned int pgtables;
};

struct xc_dom_x86_mapping {
    struct xc_dom_x86_mapping_lvl area;
    struct xc_dom_x86_mapping_lvl lvls[4];
};

struct xc_dom_image_x86 {
    unsigned n_mappings;
#define MAPPING_MAX 2
    struct xc_dom_x86_mapping maps[MAPPING_MAX];
    struct xc_dom_params *params;
};

/* get guest IO ABI protocol */
const char *xc_domain_get_native_protocol(xc_interface *xch,
                                          uint32_t domid)
{
    int ret;
    uint32_t guest_width;
    const char *protocol;

    ret = xc_domain_get_guest_width(xch, domid, &guest_width);
    if ( ret )
        return NULL;

    switch ( guest_width )
    {
    case 4: /* 32 bit guest */
        protocol = XEN_IO_PROTO_ABI_X86_32;
        break;
    case 8: /* 64 bit guest */
        protocol = XEN_IO_PROTO_ABI_X86_64;
        break;
    default:
        protocol = NULL;
    }

    return protocol;
}
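/*
 * Worked example (added note, figures illustrative): with 64-bit 4-level
 * paging, a level-l table covers PAGE_SHIFT_X86 + (l + 1) *
 * PGTBL_LEVEL_SHIFT_X86 address bits, so an L1 table spans 2MB, an L2
 * table 1GB and an L3 table 512GB.  Mapping roughly [1MB, 513MB) therefore
 * costs one L4, one L3, one L2 and 257 L1 tables (the range rounded out to
 * 2MB boundaries covers 514MB).  count_pgtables() below derives exactly
 * these per-level counts, while skipping tables that an earlier mapping
 * already accounts for.
 */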
static int count_pgtables(struct xc_dom_image *dom, xen_vaddr_t from,
                          xen_vaddr_t to, xen_pfn_t pfn)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    struct xc_dom_x86_mapping *map, *map_cmp;
    xen_pfn_t pfn_end;
    xen_vaddr_t mask;
    unsigned bits;
    int l, m;

    if ( domx86->n_mappings == MAPPING_MAX )
    {
        xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
                     "%s: too many mappings\n", __FUNCTION__);
        return -ENOMEM;
    }
    map = domx86->maps + domx86->n_mappings;

    pfn_end = pfn + ((to - from) >> PAGE_SHIFT_X86);
    if ( pfn_end >= dom->p2m_size )
    {
        xc_dom_panic(dom->xch, XC_OUT_OF_MEMORY,
                     "%s: not enough memory for initial mapping (%#"PRIpfn" > %#"PRIpfn")",
                     __FUNCTION__, pfn_end, dom->p2m_size);
        return -ENOMEM;
    }
    for ( m = 0; m < domx86->n_mappings; m++ )
    {
        map_cmp = domx86->maps + m;
        if ( from < map_cmp->area.to && to > map_cmp->area.from )
        {
            xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                         "%s: overlapping mappings\n", __FUNCTION__);
            return -EINVAL;
        }
    }

    memset(map, 0, sizeof(*map));
    map->area.from = from & domx86->params->vaddr_mask;
    map->area.to = to & domx86->params->vaddr_mask;

    for ( l = domx86->params->levels - 1; l >= 0; l-- )
    {
        map->lvls[l].pfn = dom->pfn_alloc_end + map->area.pgtables;
        if ( l == domx86->params->levels - 1 )
        {
            /* Top level page table in first mapping only. */
            if ( domx86->n_mappings == 0 )
            {
                map->lvls[l].from = 0;
                map->lvls[l].to = domx86->params->vaddr_mask;
                map->lvls[l].pgtables = 1;
                map->area.pgtables++;
            }
            continue;
        }

        bits = PAGE_SHIFT_X86 + (l + 1) * PGTBL_LEVEL_SHIFT_X86;
        mask = bits_to_mask(bits);
        map->lvls[l].from = map->area.from & ~mask;
        map->lvls[l].to = map->area.to | mask;

        if ( domx86->params->levels == PGTBL_LEVELS_I386 &&
             domx86->n_mappings == 0 && to < 0xc0000000 && l == 1 )
        {
            DOMPRINTF("%s: PAE: extra l2 page table for l3#3", __FUNCTION__);
            map->lvls[l].to = domx86->params->vaddr_mask;
        }

        for ( m = 0; m < domx86->n_mappings; m++ )
        {
            map_cmp = domx86->maps + m;
            if ( map_cmp->lvls[l].from == map_cmp->lvls[l].to )
                continue;
            if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
                 map->lvls[l].to <= map_cmp->lvls[l].to )
            {
                map->lvls[l].from = 0;
                map->lvls[l].to = 0;
                break;
            }
            assert(map->lvls[l].from >= map_cmp->lvls[l].from ||
                   map->lvls[l].to <= map_cmp->lvls[l].to);
            if ( map->lvls[l].from >= map_cmp->lvls[l].from &&
                 map->lvls[l].from <= map_cmp->lvls[l].to )
                map->lvls[l].from = map_cmp->lvls[l].to + 1;
            if ( map->lvls[l].to >= map_cmp->lvls[l].from &&
                 map->lvls[l].to <= map_cmp->lvls[l].to )
                map->lvls[l].to = map_cmp->lvls[l].from - 1;
        }
        if ( map->lvls[l].from < map->lvls[l].to )
            map->lvls[l].pgtables =
                ((map->lvls[l].to - map->lvls[l].from) >> bits) + 1;
        DOMPRINTF("%s: 0x%016" PRIx64 "/%d: 0x%016" PRIx64 " -> 0x%016" PRIx64
                  ", %d table(s)", __FUNCTION__, mask, bits,
                  map->lvls[l].from, map->lvls[l].to, map->lvls[l].pgtables);
        map->area.pgtables += map->lvls[l].pgtables;
    }

    return 0;
}

static int alloc_pgtables(struct xc_dom_image *dom)
{
    int pages, extra_pages;
    xen_vaddr_t try_virt_end;
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;

    extra_pages = dom->alloc_bootstack ? 1 : 0;
    extra_pages += (512 * 1024) / PAGE_SIZE_X86; /* 512kB padding */
    pages = extra_pages;
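    /*
     * Sizing note (added): the number of page-table pages depends on the
     * end of the bootstrap mapping, which itself grows as page-table pages
     * are added, so the loop below re-counts until the estimate becomes
     * self-consistent.  It converges quickly, since the page-table count
     * grows far more slowly than the area it maps.
     */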
    for ( ; ; )
    {
        try_virt_end = round_up(dom->virt_alloc_end + pages * PAGE_SIZE_X86,
                                bits_to_mask(22)); /* 4MB alignment */

        if ( count_pgtables(dom, dom->parms.virt_base, try_virt_end, 0) )
            return -1;

        pages = map->area.pgtables + extra_pages;
        if ( dom->virt_alloc_end + pages * PAGE_SIZE_X86 <= try_virt_end + 1 )
            break;
    }
    map->area.pfn = 0;
    domx86->n_mappings++;
    dom->virt_pgtab_end = try_virt_end + 1;

    return xc_dom_alloc_segment(dom, &dom->pgtables_seg, "page tables", 0,
                                map->area.pgtables * PAGE_SIZE_X86);
}

/* ------------------------------------------------------------------------ */
/* i386 pagetables                                                          */

static struct xc_dom_params x86_32_params = {
    .levels = PGTBL_LEVELS_I386,
    .vaddr_mask = bits_to_mask(VIRT_BITS_I386),
    .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
    .lvl_prot[1] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
    .lvl_prot[2] = _PAGE_PRESENT,
};

static int alloc_pgtables_x86_32_pae(struct xc_dom_image *dom)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;

    domx86->params = &x86_32_params;
    return alloc_pgtables(dom);
}

#define pfn_to_paddr(pfn) ((xen_paddr_t)(pfn) << PAGE_SHIFT_X86)
#define pgentry_to_pfn(entry) ((xen_pfn_t)((entry) >> PAGE_SHIFT_X86))

/*
 * Move the l3 page table page below 4G for guests which do not
 * support the extended-cr3 format.  The l3 is currently empty so we
 * do not need to preserve the current contents.
 */
static xen_pfn_t move_l3_below_4G(struct xc_dom_image *dom,
                                  xen_pfn_t l3pfn,
                                  xen_pfn_t l3mfn)
{
    xen_pfn_t new_l3mfn;
    struct xc_mmu *mmu;
    void *l3tab;

    mmu = xc_alloc_mmu_updates(dom->xch, dom->guest_domid);
    if ( mmu == NULL )
    {
        DOMPRINTF("%s: failed at %d", __FUNCTION__, __LINE__);
        return l3mfn;
    }

    xc_dom_unmap_one(dom, l3pfn);

    new_l3mfn = xc_make_page_below_4G(dom->xch, dom->guest_domid, l3mfn);
    if ( !new_l3mfn )
        goto out;

    dom->p2m_host[l3pfn] = new_l3mfn;
    if ( xc_dom_update_guest_p2m(dom) != 0 )
        goto out;

    if ( xc_add_mmu_update(dom->xch, mmu,
                           (((unsigned long long)new_l3mfn)
                            << XC_DOM_PAGE_SHIFT(dom)) |
                           MMU_MACHPHYS_UPDATE, l3pfn) )
        goto out;

    if ( xc_flush_mmu_updates(dom->xch, mmu) )
        goto out;

    /*
     * This ensures that the entire pgtables_seg is mapped by a single
     * mmap region.  arch_setup_bootlate() relies on this to be able to
     * unmap and pin the pagetables.
     */
    if ( xc_dom_seg_to_ptr(dom, &dom->pgtables_seg) == NULL )
        goto out;

    l3tab = xc_dom_pfn_to_ptr(dom, l3pfn, 1);
    if ( l3tab == NULL )
    {
        DOMPRINTF("%s: xc_dom_pfn_to_ptr(dom, l3pfn, 1) => NULL",
                  __FUNCTION__);
        goto out; /* our one call site will call xc_dom_panic and fail */
    }
    memset(l3tab, 0, XC_DOM_PAGE_SIZE(dom));

    DOMPRINTF("%s: successfully relocated L3 below 4G. "
              "(L3 PFN %#"PRIpfn" MFN %#"PRIpfn"=>%#"PRIpfn")",
              __FUNCTION__, l3pfn, l3mfn, new_l3mfn);

    l3mfn = new_l3mfn;

 out:
    free(mmu);

    return l3mfn;
}

static x86_pgentry_t *get_pg_table_x86(struct xc_dom_image *dom, int m, int l)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    struct xc_dom_x86_mapping *map;
    x86_pgentry_t *pg;

    map = domx86->maps + m;
    pg = xc_dom_pfn_to_ptr(dom, map->lvls[l].pfn, 0);
    if ( pg )
        return pg;

    xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                 "%s: xc_dom_pfn_to_ptr failed", __FUNCTION__);
    return NULL;
}

static x86_pgentry_t get_pg_prot_x86(struct xc_dom_image *dom, int l,
                                     xen_pfn_t pfn)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    struct xc_dom_x86_mapping *map;
    xen_pfn_t pfn_s, pfn_e;
    x86_pgentry_t prot;
    unsigned m;

    prot = domx86->params->lvl_prot[l];
    if ( l > 0 )
        return prot;

    for ( m = 0; m < domx86->n_mappings; m++ )
    {
        map = domx86->maps + m;
        pfn_s = map->lvls[domx86->params->levels - 1].pfn;
        pfn_e = map->area.pgtables + pfn_s;
        if ( pfn >= pfn_s && pfn < pfn_e )
            return prot & ~_PAGE_RW;
    }

    return prot;
}

static int setup_pgtables_x86(struct xc_dom_image *dom)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    struct xc_dom_x86_mapping *map1, *map2;
    struct xc_dom_x86_mapping_lvl *lvl;
    xen_vaddr_t from, to;
    xen_pfn_t pfn, p, p_s, p_e;
    x86_pgentry_t *pg;
    unsigned m1, m2;
    int l;

    for ( l = domx86->params->levels - 1; l >= 0; l-- )
        for ( m1 = 0; m1 < domx86->n_mappings; m1++ )
        {
            map1 = domx86->maps + m1;
            from = map1->lvls[l].from;
            to = map1->lvls[l].to;
            pg = get_pg_table_x86(dom, m1, l);
            if ( !pg )
                return -1;

            for ( m2 = 0; m2 < domx86->n_mappings; m2++ )
            {
                map2 = domx86->maps + m2;
                lvl = (l > 0) ? map2->lvls + l - 1 : &map2->area;
                if ( l > 0 && lvl->pgtables == 0 )
                    continue;
                if ( lvl->from >= to || lvl->to <= from )
                    continue;
                p_s = (max(from, lvl->from) - from) >>
                      (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
                p_e = (min(to, lvl->to) - from) >>
                      (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86);
                pfn = ((max(from, lvl->from) - lvl->from) >>
                       (PAGE_SHIFT_X86 + l * PGTBL_LEVEL_SHIFT_X86)) +
                      lvl->pfn;

                for ( p = p_s; p <= p_e; p++ )
                {
                    pg[p] = pfn_to_paddr(xc_dom_p2m(dom, pfn)) |
                            get_pg_prot_x86(dom, l, pfn);
                    pfn++;
                }
            }
        }

    return 0;
}

static int setup_pgtables_x86_32_pae(struct xc_dom_image *dom)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    xen_pfn_t l3mfn, l3pfn;

    l3pfn = domx86->maps[0].lvls[2].pfn;
    l3mfn = xc_dom_p2m(dom, l3pfn);
    if ( dom->parms.pae == XEN_PAE_YES )
    {
        if ( l3mfn >= 0x100000 )
            l3mfn = move_l3_below_4G(dom, l3pfn, l3mfn);

        if ( l3mfn >= 0x100000 )
        {
            xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                         "%s: cannot move L3 below 4G. extended-cr3 not "
                         "supported by guest. (L3 PFN %#"PRIpfn" MFN %#"PRIpfn")",
                         __FUNCTION__, l3pfn, l3mfn);
            return -EINVAL;
        }
    }

    return setup_pgtables_x86(dom);
}

/* ------------------------------------------------------------------------ */
/* x86_64 pagetables                                                        */

static struct xc_dom_params x86_64_params = {
    .levels = PGTBL_LEVELS_X86_64,
    .vaddr_mask = bits_to_mask(VIRT_BITS_X86_64),
    .lvl_prot[0] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED,
    .lvl_prot[1] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
    .lvl_prot[2] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
    .lvl_prot[3] = _PAGE_PRESENT|_PAGE_RW|_PAGE_ACCESSED|_PAGE_DIRTY|_PAGE_USER,
};

static int alloc_pgtables_x86_64(struct xc_dom_image *dom)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;

    domx86->params = &x86_64_params;
    return alloc_pgtables(dom);
}

static int setup_pgtables_x86_64(struct xc_dom_image *dom)
{
    return setup_pgtables_x86(dom);
}

/* ------------------------------------------------------------------------ */

static int alloc_p2m_list(struct xc_dom_image *dom, size_t p2m_alloc_size)
{
    if ( xc_dom_alloc_segment(dom, &dom->p2m_seg, "phys2mach",
                              0, p2m_alloc_size) )
        return -1;
    dom->p2m_guest = xc_dom_seg_to_ptr(dom, &dom->p2m_seg);
    if ( dom->p2m_guest == NULL )
        return -1;

    return 0;
}

static int alloc_p2m_list_x86_32(struct xc_dom_image *dom)
{
    size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;

    p2m_alloc_size = round_pg_up(p2m_alloc_size);
    return alloc_p2m_list(dom, p2m_alloc_size);
}

static int alloc_p2m_list_x86_64(struct xc_dom_image *dom)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    struct xc_dom_x86_mapping *map = domx86->maps + domx86->n_mappings;
    size_t p2m_alloc_size = dom->p2m_size * dom->arch_hooks->sizeof_pfn;
    xen_vaddr_t from, to;
    unsigned lvl;

    p2m_alloc_size = round_pg_up(p2m_alloc_size);
    if ( dom->parms.p2m_base != UNSET_ADDR )
    {
        from = dom->parms.p2m_base;
        to = from + p2m_alloc_size - 1;
        if ( count_pgtables(dom, from, to, dom->pfn_alloc_end) )
            return -1;

        map->area.pfn = dom->pfn_alloc_end;
        for ( lvl = 0; lvl < 4; lvl++ )
            map->lvls[lvl].pfn += p2m_alloc_size >> PAGE_SHIFT_X86;
        domx86->n_mappings++;
        p2m_alloc_size += map->area.pgtables << PAGE_SHIFT_X86;
    }

    return alloc_p2m_list(dom, p2m_alloc_size);
}

/* ------------------------------------------------------------------------ */

static int alloc_magic_pages_pv(struct xc_dom_image *dom)
{
    dom->start_info_pfn = xc_dom_alloc_page(dom, "start info");
    if ( dom->start_info_pfn == INVALID_PFN )
        return -1;

    dom->xenstore_pfn = xc_dom_alloc_page(dom, "xenstore");
    if ( dom->xenstore_pfn == INVALID_PFN )
        return -1;
    xc_clear_domain_page(dom->xch, dom->guest_domid,
                         xc_dom_p2m(dom, dom->xenstore_pfn));

    dom->console_pfn = xc_dom_alloc_page(dom, "console");
    if ( dom->console_pfn == INVALID_PFN )
        return -1;
    xc_clear_domain_page(dom->xch, dom->guest_domid,
                         xc_dom_p2m(dom, dom->console_pfn));

    dom->alloc_bootstack = 1;

    return 0;
}

static void build_hvm_info(void *hvm_info_page, struct xc_dom_image *dom)
{
    struct hvm_info_table *hvm_info = (struct hvm_info_table *)
        (((unsigned char *)hvm_info_page) + HVM_INFO_OFFSET);
    uint8_t sum;
    int i;

    memset(hvm_info_page, 0, PAGE_SIZE);

    /* Fill in the header. */
    memcpy(hvm_info->signature, "HVM INFO", sizeof(hvm_info->signature));
    hvm_info->length = sizeof(struct hvm_info_table);

    /* Sensible defaults: these can be overridden by the caller. */
    hvm_info->apic_mode = 1;
    hvm_info->nr_vcpus = 1;
    memset(hvm_info->vcpu_online, 0xff, sizeof(hvm_info->vcpu_online));

    /* Memory parameters. */
    hvm_info->low_mem_pgend = dom->lowmem_end >> PAGE_SHIFT;
    hvm_info->high_mem_pgend = dom->highmem_end >> PAGE_SHIFT;
    hvm_info->reserved_mem_pgstart = ioreq_server_pfn(0);

    /* Finish with the checksum. */
    for ( i = 0, sum = 0; i < hvm_info->length; i++ )
        sum += ((uint8_t *)hvm_info)[i];
    hvm_info->checksum = -sum;
}
static int alloc_magic_pages_hvm(struct xc_dom_image *dom)
{
    unsigned long i;
    uint32_t *ident_pt, domid = dom->guest_domid;
    int rc;
    xen_pfn_t special_array[X86_HVM_NR_SPECIAL_PAGES];
    xen_pfn_t ioreq_server_array[NR_IOREQ_SERVER_PAGES];
    xc_interface *xch = dom->xch;
    size_t start_info_size = sizeof(struct hvm_start_info);

    /* Allocate and clear special pages. */
    for ( i = 0; i < X86_HVM_NR_SPECIAL_PAGES; i++ )
        special_array[i] = special_pfn(i);

    rc = xc_domain_populate_physmap_exact(xch, domid, X86_HVM_NR_SPECIAL_PAGES,
                                          0, 0, special_array);
    if ( rc != 0 )
    {
        DOMPRINTF("Could not allocate special pages.");
        goto error_out;
    }

    if ( xc_clear_domain_pages(xch, domid, special_pfn(0),
                               X86_HVM_NR_SPECIAL_PAGES) )
        goto error_out;

    xc_hvm_param_set(xch, domid, HVM_PARAM_STORE_PFN,
                     special_pfn(SPECIALPAGE_XENSTORE));
    xc_hvm_param_set(xch, domid, HVM_PARAM_BUFIOREQ_PFN,
                     special_pfn(SPECIALPAGE_BUFIOREQ));
    xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_PFN,
                     special_pfn(SPECIALPAGE_IOREQ));
    xc_hvm_param_set(xch, domid, HVM_PARAM_CONSOLE_PFN,
                     special_pfn(SPECIALPAGE_CONSOLE));
    xc_hvm_param_set(xch, domid, HVM_PARAM_PAGING_RING_PFN,
                     special_pfn(SPECIALPAGE_PAGING));
    xc_hvm_param_set(xch, domid, HVM_PARAM_MONITOR_RING_PFN,
                     special_pfn(SPECIALPAGE_ACCESS));
    xc_hvm_param_set(xch, domid, HVM_PARAM_SHARING_RING_PFN,
                     special_pfn(SPECIALPAGE_SHARING));

    start_info_size +=
        sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;
    start_info_size +=
        HVMLOADER_MODULE_CMDLINE_SIZE * HVMLOADER_MODULE_MAX_COUNT;

    if ( !dom->device_model )
    {
        if ( dom->cmdline )
        {
            dom->cmdline_size = ROUNDUP(strlen(dom->cmdline) + 1, 8);
            start_info_size += dom->cmdline_size;
        }
    }
    else
    {
        /*
         * Allocate and clear additional ioreq server pages.  The default
         * server will use the IOREQ and BUFIOREQ special pages above.
         */
        for ( i = 0; i < NR_IOREQ_SERVER_PAGES; i++ )
            ioreq_server_array[i] = ioreq_server_pfn(i);

        rc = xc_domain_populate_physmap_exact(xch, domid,
                                              NR_IOREQ_SERVER_PAGES,
                                              0, 0, ioreq_server_array);
        if ( rc != 0 )
        {
            DOMPRINTF("Could not allocate ioreq server pages.");
            goto error_out;
        }

        if ( xc_clear_domain_pages(xch, domid, ioreq_server_pfn(0),
                                   NR_IOREQ_SERVER_PAGES) )
            goto error_out;

        /* Tell the domain where the pages are and how many there are */
        xc_hvm_param_set(xch, domid, HVM_PARAM_IOREQ_SERVER_PFN,
                         ioreq_server_pfn(0));
        xc_hvm_param_set(xch, domid, HVM_PARAM_NR_IOREQ_SERVER_PAGES,
                         NR_IOREQ_SERVER_PAGES);
    }

    rc = xc_dom_alloc_segment(dom, &dom->start_info_seg,
                              "HVM start info", 0, start_info_size);
    if ( rc != 0 )
    {
        DOMPRINTF("Unable to reserve memory for the start info");
        goto out;
    }

    /*
     * Identity-map page table is required for running with CR0.PG=0 when
     * using Intel EPT.  Create a 32-bit non-PAE page directory of
     * superpages.
     */
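    /*
     * Added arithmetic note: each of the 1024 32-bit entries written below
     * is a 4MB PSE superpage mapping, where entry i maps guest-physical
     * [i << 22, (i + 1) << 22); e.g. entry 1 becomes 0x400000 | flags.
     * Together the entries identity-map the whole 4GB 32-bit space.
     */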
    if ( (ident_pt = xc_map_foreign_range(
              xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
              special_pfn(SPECIALPAGE_IDENT_PT))) == NULL )
        goto error_out;
    for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
        ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
                       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
    munmap(ident_pt, PAGE_SIZE);
    xc_hvm_param_set(xch, domid, HVM_PARAM_IDENT_PT,
                     special_pfn(SPECIALPAGE_IDENT_PT) << PAGE_SHIFT);

    dom->console_pfn = special_pfn(SPECIALPAGE_CONSOLE);
    xc_clear_domain_page(dom->xch, dom->guest_domid, dom->console_pfn);
    dom->xenstore_pfn = special_pfn(SPECIALPAGE_XENSTORE);
    xc_clear_domain_page(dom->xch, dom->guest_domid, dom->xenstore_pfn);
    dom->parms.virt_hypercall = -1;

    rc = 0;
    goto out;

 error_out:
    rc = -1;

 out:
    return rc;
}

/* ------------------------------------------------------------------------ */

static int start_info_x86_32(struct xc_dom_image *dom)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    start_info_x86_32_t *start_info =
        xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
    xen_pfn_t shinfo =
        xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;

    DOMPRINTF_CALLED(dom->xch);

    if ( start_info == NULL )
    {
        DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
        return -1; /* our caller throws away our return value :-/ */
    }

    memset(start_info, 0, sizeof(*start_info));
    strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
    start_info->magic[sizeof(start_info->magic) - 1] = '\0';
    start_info->nr_pages = dom->total_pages;
    start_info->shared_info = shinfo << PAGE_SHIFT_X86;
    start_info->pt_base = dom->pgtables_seg.vstart;
    start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
    start_info->mfn_list = dom->p2m_seg.vstart;

    start_info->flags = dom->flags;
    start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
    start_info->store_evtchn = dom->xenstore_evtchn;
    start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
    start_info->console.domU.evtchn = dom->console_evtchn;

    if ( dom->modules[0].blob )
    {
        start_info->mod_start = dom->initrd_start;
        start_info->mod_len = dom->initrd_len;
    }

    if ( dom->cmdline )
    {
        strncpy((char *)start_info->cmd_line, dom->cmdline,
                MAX_GUEST_CMDLINE);
        start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
    }

    return 0;
}
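/*
 * Added note: start_info_x86_64() below differs from the 32-bit variant
 * above mainly in that, when the kernel supplied a p2m_base (the
 * UNSET_ADDR check), it also publishes first_p2m_pfn/nr_p2m_frames so the
 * guest can locate the virtually-mapped p2m list outside the initial
 * mapping.
 */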
static int start_info_x86_64(struct xc_dom_image *dom)
{
    struct xc_dom_image_x86 *domx86 = dom->arch_private;
    start_info_x86_64_t *start_info =
        xc_dom_pfn_to_ptr(dom, dom->start_info_pfn, 1);
    xen_pfn_t shinfo =
        xc_dom_translated(dom) ? dom->shared_info_pfn : dom->shared_info_mfn;

    DOMPRINTF_CALLED(dom->xch);

    if ( start_info == NULL )
    {
        DOMPRINTF("%s: xc_dom_pfn_to_ptr failed on start_info", __FUNCTION__);
        return -1; /* our caller throws away our return value :-/ */
    }

    memset(start_info, 0, sizeof(*start_info));
    strncpy(start_info->magic, dom->guest_type, sizeof(start_info->magic));
    start_info->magic[sizeof(start_info->magic) - 1] = '\0';
    start_info->nr_pages = dom->total_pages;
    start_info->shared_info = shinfo << PAGE_SHIFT_X86;
    start_info->pt_base = dom->pgtables_seg.vstart;
    start_info->nr_pt_frames = domx86->maps[0].area.pgtables;
    start_info->mfn_list = dom->p2m_seg.vstart;
    if ( dom->parms.p2m_base != UNSET_ADDR )
    {
        start_info->first_p2m_pfn = dom->p2m_seg.pfn;
        start_info->nr_p2m_frames = dom->p2m_seg.pages;
    }

    start_info->flags = dom->flags;
    start_info->store_mfn = xc_dom_p2m(dom, dom->xenstore_pfn);
    start_info->store_evtchn = dom->xenstore_evtchn;
    start_info->console.domU.mfn = xc_dom_p2m(dom, dom->console_pfn);
    start_info->console.domU.evtchn = dom->console_evtchn;

    if ( dom->modules[0].blob )
    {
        start_info->mod_start = dom->initrd_start;
        start_info->mod_len = dom->initrd_len;
    }

    if ( dom->cmdline )
    {
        strncpy((char *)start_info->cmd_line, dom->cmdline,
                MAX_GUEST_CMDLINE);
        start_info->cmd_line[MAX_GUEST_CMDLINE - 1] = '\0';
    }

    return 0;
}

static int shared_info_x86_32(struct xc_dom_image *dom, void *ptr)
{
    shared_info_x86_32_t *shared_info = ptr;
    int i;

    DOMPRINTF_CALLED(dom->xch);

    memset(shared_info, 0, sizeof(*shared_info));
    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
        shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    return 0;
}

static int shared_info_x86_64(struct xc_dom_image *dom, void *ptr)
{
    shared_info_x86_64_t *shared_info = ptr;
    int i;

    DOMPRINTF_CALLED(dom->xch);

    memset(shared_info, 0, sizeof(*shared_info));
    for ( i = 0; i < XEN_LEGACY_MAX_VCPUS; i++ )
        shared_info->vcpu_info[i].evtchn_upcall_mask = 1;

    return 0;
}

/* ------------------------------------------------------------------------ */

static int vcpu_x86_32(struct xc_dom_image *dom)
{
    vcpu_guest_context_any_t any_ctx;
    vcpu_guest_context_x86_32_t *ctxt = &any_ctx.x32;
    xen_pfn_t cr3_pfn;
    int rc;

    DOMPRINTF_CALLED(dom->xch);

    /* clear everything */
    memset(ctxt, 0, sizeof(*ctxt));

    ctxt->user_regs.eip = dom->parms.virt_entry;
    ctxt->user_regs.esp =
        dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
    ctxt->user_regs.esi =
        dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
    ctxt->user_regs.eflags = 1 << 9; /* Interrupt Enable */

    ctxt->flags = VGCF_in_kernel_X86_32 | VGCF_online_X86_32;
    if ( dom->parms.pae == XEN_PAE_EXTCR3 ||
         dom->parms.pae == XEN_PAE_BIMODAL )
        ctxt->vm_assist |= (1UL << VMASST_TYPE_pae_extended_cr3);

    cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
    ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_32(cr3_pfn);
    DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
              __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);

    ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_32;
    ctxt->user_regs.es = FLAT_KERNEL_DS_X86_32;
    ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_32;
    ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_32;
    ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_32;
    ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_32;

    ctxt->kernel_ss = ctxt->user_regs.ss;
    ctxt->kernel_sp = ctxt->user_regs.esp;

    rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
    if ( rc != 0 )
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);

    return rc;
}
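/*
 * Added note for vcpu_x86_32() above and vcpu_x86_64() below: the initial
 * stack pointer is virt_base + (bootstack_pfn + 1) * PAGE_SIZE_X86, i.e.
 * the first byte after the bootstack page, since the stack grows
 * downwards; esi/rsi carries the virtual address of the start_info page,
 * as the PV boot ABI expects.
 */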
static int vcpu_x86_64(struct xc_dom_image *dom)
{
    vcpu_guest_context_any_t any_ctx;
    vcpu_guest_context_x86_64_t *ctxt = &any_ctx.x64;
    xen_pfn_t cr3_pfn;
    int rc;

    DOMPRINTF_CALLED(dom->xch);

    /* clear everything */
    memset(ctxt, 0, sizeof(*ctxt));

    ctxt->user_regs.rip = dom->parms.virt_entry;
    ctxt->user_regs.rsp =
        dom->parms.virt_base + (dom->bootstack_pfn + 1) * PAGE_SIZE_X86;
    ctxt->user_regs.rsi =
        dom->parms.virt_base + (dom->start_info_pfn) * PAGE_SIZE_X86;
    ctxt->user_regs.rflags = 1 << 9; /* Interrupt Enable */

    ctxt->flags = VGCF_in_kernel_X86_64 | VGCF_online_X86_64;
    cr3_pfn = xc_dom_p2m(dom, dom->pgtables_seg.pfn);
    ctxt->ctrlreg[3] = xen_pfn_to_cr3_x86_64(cr3_pfn);
    DOMPRINTF("%s: cr3: pfn 0x%" PRIpfn " mfn 0x%" PRIpfn "",
              __FUNCTION__, dom->pgtables_seg.pfn, cr3_pfn);

    ctxt->user_regs.ds = FLAT_KERNEL_DS_X86_64;
    ctxt->user_regs.es = FLAT_KERNEL_DS_X86_64;
    ctxt->user_regs.fs = FLAT_KERNEL_DS_X86_64;
    ctxt->user_regs.gs = FLAT_KERNEL_DS_X86_64;
    ctxt->user_regs.ss = FLAT_KERNEL_SS_X86_64;
    ctxt->user_regs.cs = FLAT_KERNEL_CS_X86_64;

    ctxt->kernel_ss = ctxt->user_regs.ss;
    ctxt->kernel_sp = ctxt->user_regs.esp;

    rc = xc_vcpu_setcontext(dom->xch, dom->guest_domid, 0, &any_ctx);
    if ( rc != 0 )
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: SETVCPUCONTEXT failed (rc=%d)", __func__, rc);

    return rc;
}

static int vcpu_hvm(struct xc_dom_image *dom)
{
    struct {
        struct hvm_save_descriptor header_d;
        HVM_SAVE_TYPE(HEADER) header;
        struct hvm_save_descriptor cpu_d;
        HVM_SAVE_TYPE(CPU) cpu;
        struct hvm_save_descriptor end_d;
        HVM_SAVE_TYPE(END) end;
    } bsp_ctx;
    uint8_t *full_ctx = NULL;
    int rc;

    DOMPRINTF_CALLED(dom->xch);

    /*
     * Get the full HVM context in order to have the header, it is not
     * possible to get the header with getcontext_partial, and crafting one
     * from userspace is also not an option since cpuid is trapped and
     * modified by Xen.
     */
    rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, NULL, 0);
    if ( rc <= 0 )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: unable to fetch HVM context size (rc=%d)",
                     __func__, rc);
        goto out;
    }

    full_ctx = calloc(1, rc);
    if ( full_ctx == NULL )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: unable to allocate memory for HVM context (rc=%d)",
                     __func__, rc);
        rc = -ENOMEM;
        goto out;
    }

    rc = xc_domain_hvm_getcontext(dom->xch, dom->guest_domid, full_ctx, rc);
    if ( rc <= 0 )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: unable to fetch HVM context (rc=%d)",
                     __func__, rc);
        goto out;
    }

    /* Copy the header to our partial context. */
    memset(&bsp_ctx, 0, sizeof(bsp_ctx));
    memcpy(&bsp_ctx, full_ctx,
           sizeof(struct hvm_save_descriptor) + HVM_SAVE_LENGTH(HEADER));

    /* Set the CPU descriptor. */
    bsp_ctx.cpu_d.typecode = HVM_SAVE_CODE(CPU);
    bsp_ctx.cpu_d.instance = 0;
    bsp_ctx.cpu_d.length = HVM_SAVE_LENGTH(CPU);

    /* Set the cached part of the relevant segment registers. */
    bsp_ctx.cpu.cs_base = 0;
    bsp_ctx.cpu.ds_base = 0;
    bsp_ctx.cpu.ss_base = 0;
    bsp_ctx.cpu.tr_base = 0;
    bsp_ctx.cpu.cs_limit = ~0u;
    bsp_ctx.cpu.ds_limit = ~0u;
    bsp_ctx.cpu.ss_limit = ~0u;
    bsp_ctx.cpu.tr_limit = 0x67;
    bsp_ctx.cpu.cs_arbytes = 0xc9b;
    bsp_ctx.cpu.ds_arbytes = 0xc93;
    bsp_ctx.cpu.ss_arbytes = 0xc93;
    bsp_ctx.cpu.tr_arbytes = 0x8b;

    /* Set the control registers. */
    bsp_ctx.cpu.cr0 = X86_CR0_PE | X86_CR0_ET;

    /* Set the IP. */
    bsp_ctx.cpu.rip = dom->parms.phys_entry;

    if ( dom->start_info_seg.pfn )
        bsp_ctx.cpu.rbx = dom->start_info_seg.pfn << PAGE_SHIFT;

    /* Set the end descriptor. */
    bsp_ctx.end_d.typecode = HVM_SAVE_CODE(END);
    bsp_ctx.end_d.instance = 0;
    bsp_ctx.end_d.length = HVM_SAVE_LENGTH(END);

    rc = xc_domain_hvm_setcontext(dom->xch, dom->guest_domid,
                                  (uint8_t *)&bsp_ctx, sizeof(bsp_ctx));
    if ( rc != 0 )
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: SETHVMCONTEXT failed (rc=%d)", __func__, rc);

 out:
    free(full_ctx);
    return rc;
}
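/*
 * Added note: the blob passed to xc_domain_hvm_setcontext() is a stream of
 * (hvm_save_descriptor, payload) pairs, which is exactly what the bsp_ctx
 * struct above lays out: the HEADER record copied from the real context,
 * one CPU record for vcpu 0, then an END record whose payload is empty and
 * merely terminates the stream.
 */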
/* ------------------------------------------------------------------------ */

static int x86_compat(xc_interface *xch, uint32_t domid, char *guest_type)
{
    static const struct {
        char *guest;
        uint32_t size;
    } types[] = {
        { "xen-3.0-x86_32p", 32 },
        { "xen-3.0-x86_64",  64 },
    };
    DECLARE_DOMCTL;
    int i, rc;

    memset(&domctl, 0, sizeof(domctl));
    domctl.domain = domid;
    domctl.cmd = XEN_DOMCTL_set_address_size;
    for ( i = 0; i < ARRAY_SIZE(types); i++ )
        if ( !strcmp(types[i].guest, guest_type) )
            domctl.u.address_size.size = types[i].size;
    if ( domctl.u.address_size.size == 0 )
        /* nothing to do */
        return 0;

    xc_dom_printf(xch, "%s: guest %s, address size %" PRId32 "",
                  __FUNCTION__, guest_type, domctl.u.address_size.size);
    rc = do_domctl(xch, &domctl);
    if ( rc != 0 )
        xc_dom_printf(xch, "%s: warning: failed (rc=%d)",
                      __FUNCTION__, rc);
    return rc;
}

static int meminit_pv(struct xc_dom_image *dom)
{
    int rc;
    xen_pfn_t pfn, allocsz, mfn, total, pfn_base;
    int i, j, k;
    xen_vmemrange_t dummy_vmemrange[1];
    unsigned int dummy_vnode_to_pnode[1];
    xen_vmemrange_t *vmemranges;
    unsigned int *vnode_to_pnode;
    unsigned int nr_vmemranges, nr_vnodes;

    rc = x86_compat(dom->xch, dom->guest_domid, dom->guest_type);
    if ( rc )
        return rc;

    /* try to claim pages for early warning of insufficient memory avail */
    if ( dom->claim_enabled )
    {
        rc = xc_domain_claim_pages(dom->xch, dom->guest_domid,
                                   dom->total_pages);
        if ( rc )
            return rc;
    }

    /*
     * Set up dummy vNUMA information if it's not provided.  Note
     * that this is a valid state if libxl doesn't provide any
     * vNUMA information.
     *
     * The dummy values make libxc allocate all pages from
     * arbitrary physical nodes.  This is the expected behaviour if
     * no vNUMA configuration is provided to libxc.
     *
     * Note that the following hunk is just for the convenience of
     * allocation code.  No defaulting happens in libxc.
     */
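    /*
     * Illustrative example (added): for a 512MB guest with no vNUMA
     * configuration, the branch below synthesizes a single vmemrange
     * covering [0, 512MB) on virtual node 0, with vnode 0 mapped to
     * XC_NUMA_NO_NODE so pages may come from any physical node.
     */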
    if ( dom->nr_vmemranges == 0 )
    {
        nr_vmemranges = 1;
        vmemranges = dummy_vmemrange;
        vmemranges[0].start = 0;
        vmemranges[0].end = (uint64_t)dom->total_pages << PAGE_SHIFT;
        vmemranges[0].flags = 0;
        vmemranges[0].nid = 0;

        nr_vnodes = 1;
        vnode_to_pnode = dummy_vnode_to_pnode;
        vnode_to_pnode[0] = XC_NUMA_NO_NODE;
    }
    else
    {
        nr_vmemranges = dom->nr_vmemranges;
        nr_vnodes = dom->nr_vnodes;
        vmemranges = dom->vmemranges;
        vnode_to_pnode = dom->vnode_to_pnode;
    }

    total = dom->p2m_size = 0;
    for ( i = 0; i < nr_vmemranges; i++ )
    {
        total += ((vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT);
        dom->p2m_size = max(dom->p2m_size,
                            (xen_pfn_t)(vmemranges[i].end >> PAGE_SHIFT));
    }
    if ( total != dom->total_pages )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: vNUMA page count mismatch (0x%"PRIpfn" != 0x%"PRIpfn")",
                     __func__, total, dom->total_pages);
        return -EINVAL;
    }

    dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * dom->p2m_size);
    if ( dom->p2m_host == NULL )
        return -EINVAL;
    for ( pfn = 0; pfn < dom->p2m_size; pfn++ )
        dom->p2m_host[pfn] = INVALID_PFN;

    /* allocate guest memory */
    for ( i = 0; i < nr_vmemranges; i++ )
    {
        unsigned int memflags;
        uint64_t pages, super_pages;
        unsigned int pnode = vnode_to_pnode[vmemranges[i].nid];
        xen_pfn_t extents[SUPERPAGE_BATCH_SIZE];
        xen_pfn_t pfn_base_idx;

        memflags = 0;
        if ( pnode != XC_NUMA_NO_NODE )
            memflags |= XENMEMF_exact_node(pnode);

        pages = (vmemranges[i].end - vmemranges[i].start) >> PAGE_SHIFT;
        super_pages = pages >> SUPERPAGE_2MB_SHIFT;
        pfn_base = vmemranges[i].start >> PAGE_SHIFT;

        for ( pfn = pfn_base; pfn < pfn_base + pages; pfn++ )
            dom->p2m_host[pfn] = pfn;

        pfn_base_idx = pfn_base;
        while ( super_pages )
        {
            uint64_t count = min_t(uint64_t, super_pages,
                                   SUPERPAGE_BATCH_SIZE);
            super_pages -= count;

            for ( pfn = pfn_base_idx, j = 0;
                  pfn < pfn_base_idx + (count << SUPERPAGE_2MB_SHIFT);
                  pfn += SUPERPAGE_2MB_NR_PFNS, j++ )
                extents[j] = dom->p2m_host[pfn];
            rc = xc_domain_populate_physmap(dom->xch, dom->guest_domid, count,
                                            SUPERPAGE_2MB_SHIFT, memflags,
                                            extents);
            if ( rc < 0 )
                return rc;

            /* Expand the returned mfns into the p2m array. */
            pfn = pfn_base_idx;
            for ( j = 0; j < rc; j++ )
            {
                mfn = extents[j];
                for ( k = 0; k < SUPERPAGE_2MB_NR_PFNS; k++, pfn++ )
                    dom->p2m_host[pfn] = mfn + k;
            }
            pfn_base_idx = pfn;
        }

        for ( j = pfn_base_idx - pfn_base; j < pages; j += allocsz )
        {
            allocsz = min_t(uint64_t, 1024 * 1024, pages - j);
            rc = xc_domain_populate_physmap_exact(dom->xch, dom->guest_domid,
                     allocsz, 0, memflags, &dom->p2m_host[pfn_base + j]);

            if ( rc )
            {
                if ( pnode != XC_NUMA_NO_NODE )
                    xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                                 "%s: failed to allocate 0x%"PRIx64" pages (v=%d, p=%d)",
                                 __func__, pages, i, pnode);
                else
                    xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                                 "%s: failed to allocate 0x%"PRIx64" pages",
                                 __func__, pages);
                return rc;
            }
        }
        rc = 0;
    }

    /*
     * Ensure no unclaimed pages are left unused.
     * OK to call even if we hadn't done the earlier claim call.
     */
    xc_domain_claim_pages(dom->xch, dom->guest_domid, 0 /* cancel claim */);

    return rc;
}
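/*
 * Worked example (added) for the superpage path in meminit_pv() above: a
 * 1GB vmemrange is 0x40000 4kB pages, i.e. 0x200 2MB superpage candidates,
 * populated in batches of at most SUPERPAGE_BATCH_SIZE (512); whatever Xen
 * declines to back with superpages is picked up by the trailing 4kB loop.
 */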
/*
 * Check whether there exists an mmio hole in the specified memory range.
 * Returns 1 if one exists, else returns 0.
 */
static int check_mmio_hole(uint64_t start, uint64_t memsize,
                           uint64_t mmio_start, uint64_t mmio_size)
{
    if ( start + memsize <= mmio_start || start >= mmio_start + mmio_size )
        return 0;
    else
        return 1;
}

static int meminit_hvm(struct xc_dom_image *dom)
{
    unsigned long i, vmemid, nr_pages = dom->total_pages;
    unsigned long p2m_size;
    unsigned long target_pages = dom->target_pages;
    unsigned long cur_pages, cur_pfn;
    int rc;
    unsigned long stat_normal_pages = 0, stat_2mb_pages = 0,
        stat_1gb_pages = 0;
    unsigned int memflags = 0;
    int claim_enabled = dom->claim_enabled;
    uint64_t total_pages;
    xen_vmemrange_t dummy_vmemrange[2];
    unsigned int dummy_vnode_to_pnode[1];
    xen_vmemrange_t *vmemranges;
    unsigned int *vnode_to_pnode;
    unsigned int nr_vmemranges, nr_vnodes;
    xc_interface *xch = dom->xch;
    uint32_t domid = dom->guest_domid;

    if ( nr_pages > target_pages )
        memflags |= XENMEMF_populate_on_demand;

    if ( dom->nr_vmemranges == 0 )
    {
        /*
         * Build dummy vnode information
         *
         * Guest physical address space layout:
         * [0, hole_start) [hole_start, 4G) [4G, highmem_end)
         *
         * Of course if there is no high memory, the second vmemrange
         * has no effect on the actual result.
         */
        dummy_vmemrange[0].start = 0;
        dummy_vmemrange[0].end   = dom->lowmem_end;
        dummy_vmemrange[0].flags = 0;
        dummy_vmemrange[0].nid   = 0;
        nr_vmemranges = 1;

        if ( dom->highmem_end > (1ULL << 32) )
        {
            dummy_vmemrange[1].start = 1ULL << 32;
            dummy_vmemrange[1].end   = dom->highmem_end;
            dummy_vmemrange[1].flags = 0;
            dummy_vmemrange[1].nid   = 0;

            nr_vmemranges++;
        }

        dummy_vnode_to_pnode[0] = XC_NUMA_NO_NODE;
        nr_vnodes = 1;
        vmemranges = dummy_vmemrange;
        vnode_to_pnode = dummy_vnode_to_pnode;
    }
    else
    {
        if ( nr_pages > target_pages )
        {
            DOMPRINTF("Cannot enable vNUMA and PoD at the same time");
            goto error_out;
        }

        nr_vmemranges = dom->nr_vmemranges;
        nr_vnodes = dom->nr_vnodes;
        vmemranges = dom->vmemranges;
        vnode_to_pnode = dom->vnode_to_pnode;
    }

    total_pages = 0;
    p2m_size = 0;
    for ( i = 0; i < nr_vmemranges; i++ )
    {
        DOMPRINTF("range: start=0x%"PRIx64" end=0x%"PRIx64,
                  vmemranges[i].start, vmemranges[i].end);
        total_pages += ((vmemranges[i].end - vmemranges[i].start)
                        >> PAGE_SHIFT);
        p2m_size = p2m_size > (vmemranges[i].end >> PAGE_SHIFT) ?
            p2m_size : (vmemranges[i].end >> PAGE_SHIFT);
    }

    if ( total_pages != nr_pages )
    {
        DOMPRINTF("vNUMA memory pages mismatch (0x%"PRIx64" != 0x%lx)",
                  total_pages, nr_pages);
        goto error_out;
    }

    dom->p2m_size = p2m_size;
    dom->p2m_host = xc_dom_malloc(dom, sizeof(xen_pfn_t) * dom->p2m_size);
    if ( dom->p2m_host == NULL )
    {
        DOMPRINTF("Could not allocate p2m");
        goto error_out;
    }

    for ( i = 0; i < p2m_size; i++ )
        dom->p2m_host[i] = ((xen_pfn_t)-1);
    for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
    {
        uint64_t pfn;

        for ( pfn = vmemranges[vmemid].start >> PAGE_SHIFT;
              pfn < vmemranges[vmemid].end >> PAGE_SHIFT;
              pfn++ )
            dom->p2m_host[pfn] = pfn;
    }

    /*
     * Try to claim pages for early warning of insufficient memory available.
     * This should go before xc_domain_set_pod_target, because that function
     * actually allocates memory for the guest.  Claiming after memory has
     * been allocated is pointless.
     */
    if ( claim_enabled )
    {
        rc = xc_domain_claim_pages(xch, domid,
                                   target_pages - dom->vga_hole_size);
        if ( rc != 0 )
        {
            DOMPRINTF("Could not allocate memory for HVM guest as we cannot claim memory!");
            goto error_out;
        }
    }

    if ( memflags & XENMEMF_populate_on_demand )
    {
        /*
         * Subtract VGA_HOLE_SIZE from target_pages for the VGA
         * "hole".  Xen will adjust the PoD cache size so that domain
         * tot_pages will be target_pages - VGA_HOLE_SIZE after
         * this call.
         */
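        /*
         * Illustrative numbers (added): for a guest created with 2GB of
         * maxmem but a 1GB memory target, nr_pages covers the full 2GB
         * layout while target_pages covers 1GB, so the physmap is
         * populated on demand against a PoD cache of roughly 1GB minus
         * the VGA hole.
         */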
        rc = xc_domain_set_pod_target(xch, domid,
                                      target_pages - dom->vga_hole_size,
                                      NULL, NULL, NULL);
        if ( rc != 0 )
        {
            DOMPRINTF("Could not set PoD target for HVM guest.\n");
            goto error_out;
        }
    }

    /*
     * Allocate memory for HVM guest, skipping VGA hole 0xA0000-0xC0000.
     *
     * We attempt to allocate 1GB pages if possible.  It falls back on 2MB
     * pages if 1GB allocation fails.  4KB pages will be used eventually if
     * both fail.
     *
     * Under 2MB mode, we allocate pages in batches of no more than 8MB to
     * ensure that we can be preempted and hence dom0 remains responsive.
     */
    if ( dom->device_model )
    {
        rc = xc_domain_populate_physmap_exact(
            xch, domid, 0xa0, 0, memflags, &dom->p2m_host[0x00]);
        if ( rc != 0 )
        {
            DOMPRINTF("Could not populate low memory (< 0xA0).\n");
            goto error_out;
        }
    }

    stat_normal_pages = 0;
    for ( vmemid = 0; vmemid < nr_vmemranges; vmemid++ )
    {
        unsigned int new_memflags = memflags;
        uint64_t end_pages;
        unsigned int vnode = vmemranges[vmemid].nid;
        unsigned int pnode = vnode_to_pnode[vnode];

        if ( pnode != XC_NUMA_NO_NODE )
            new_memflags |= XENMEMF_exact_node(pnode);

        end_pages = vmemranges[vmemid].end >> PAGE_SHIFT;
        /*
         * Consider the VGA hole as belonging to the vmemrange that covers
         * 0xA0000-0xC0000.  Note that 0x00000-0xA0000 is populated just
         * before this loop.
         */
        if ( vmemranges[vmemid].start == 0 && dom->device_model )
        {
            cur_pages = 0xc0;
            stat_normal_pages += 0xc0;
        }
        else
            cur_pages = vmemranges[vmemid].start >> PAGE_SHIFT;

        rc = 0;
        while ( (rc == 0) && (end_pages > cur_pages) )
        {
            /* Clip count to maximum 1GB extent. */
            unsigned long count = end_pages - cur_pages;
            unsigned long max_pages = SUPERPAGE_1GB_NR_PFNS;

            if ( count > max_pages )
                count = max_pages;

            cur_pfn = dom->p2m_host[cur_pages];

            /* Take care of the corner cases of super page tails. */
            if ( ((cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
                 (count > (-cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1))) )
                count = -cur_pfn & (SUPERPAGE_1GB_NR_PFNS-1);
            else if ( ((count & (SUPERPAGE_1GB_NR_PFNS-1)) != 0) &&
                      (count > SUPERPAGE_1GB_NR_PFNS) )
                count &= ~(SUPERPAGE_1GB_NR_PFNS - 1);

            /*
             * Attempt to allocate 1GB super page.  Because in each pass
             * we only allocate at most 1GB, we don't have to clip
             * super page boundaries.
             */
            if ( ((count | cur_pfn) & (SUPERPAGE_1GB_NR_PFNS - 1)) == 0 &&
                 /* Check if there exists MMIO hole in the 1GB memory
                  * range */
                 !check_mmio_hole(cur_pfn << PAGE_SHIFT,
                                  SUPERPAGE_1GB_NR_PFNS << PAGE_SHIFT,
                                  dom->mmio_start, dom->mmio_size) )
            {
                long done;
                unsigned long nr_extents = count >> SUPERPAGE_1GB_SHIFT;
                xen_pfn_t sp_extents[nr_extents];

                for ( i = 0; i < nr_extents; i++ )
                    sp_extents[i] =
                        dom->p2m_host[cur_pages+(i<<SUPERPAGE_1GB_SHIFT)];

                done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                  SUPERPAGE_1GB_SHIFT,
                                                  new_memflags, sp_extents);

                if ( done > 0 )
                {
                    stat_1gb_pages += done;
                    done <<= SUPERPAGE_1GB_SHIFT;
                    cur_pages += done;
                    count -= done;
                }
            }

            if ( count != 0 )
            {
                /* Clip count to maximum 8MB extent. */
                max_pages = SUPERPAGE_2MB_NR_PFNS * 4;
                if ( count > max_pages )
                    count = max_pages;

                /* Clip partial superpage extents to superpage boundaries. */
                if ( ((cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
                     (count > (-cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1))) )
                    count = -cur_pfn & (SUPERPAGE_2MB_NR_PFNS-1);
                else if ( ((count & (SUPERPAGE_2MB_NR_PFNS-1)) != 0) &&
                          (count > SUPERPAGE_2MB_NR_PFNS) )
                    count &= ~(SUPERPAGE_2MB_NR_PFNS - 1); /* clip non-s.p. tail */
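                /*
                 * Alignment example (added): "-cur_pfn & (NR_PFNS - 1)" is
                 * the distance to the next superpage boundary; e.g. for
                 * cur_pfn == 0x3fe and 2MB superpages (0x200 pfns) it
                 * yields 0x2, the number of 4kB pages needed to reach
                 * pfn 0x400.
                 */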
                /* Attempt to allocate superpage extents. */
                if ( ((count | cur_pfn) & (SUPERPAGE_2MB_NR_PFNS - 1)) == 0 )
                {
                    long done;
                    unsigned long nr_extents = count >> SUPERPAGE_2MB_SHIFT;
                    xen_pfn_t sp_extents[nr_extents];

                    for ( i = 0; i < nr_extents; i++ )
                        sp_extents[i] =
                            dom->p2m_host[cur_pages+(i<<SUPERPAGE_2MB_SHIFT)];

                    done = xc_domain_populate_physmap(xch, domid, nr_extents,
                                                      SUPERPAGE_2MB_SHIFT,
                                                      new_memflags,
                                                      sp_extents);

                    if ( done > 0 )
                    {
                        stat_2mb_pages += done;
                        done <<= SUPERPAGE_2MB_SHIFT;
                        cur_pages += done;
                        count -= done;
                    }
                }
            }

            /* Fall back to 4kB extents. */
            if ( count != 0 )
            {
                rc = xc_domain_populate_physmap_exact(
                    xch, domid, count, 0, new_memflags,
                    &dom->p2m_host[cur_pages]);
                cur_pages += count;
                stat_normal_pages += count;
            }
        }

        if ( rc != 0 )
        {
            DOMPRINTF("Could not allocate memory for HVM guest.");
            goto error_out;
        }
    }

    DPRINTF("PHYSICAL MEMORY ALLOCATION:\n");
    DPRINTF(" 4KB PAGES: 0x%016lx\n", stat_normal_pages);
    DPRINTF(" 2MB PAGES: 0x%016lx\n", stat_2mb_pages);
    DPRINTF(" 1GB PAGES: 0x%016lx\n", stat_1gb_pages);

    rc = 0;
    goto out;

 error_out:
    rc = -1;

 out:
    /* ensure no unclaimed pages are left unused */
    xc_domain_claim_pages(xch, domid, 0 /* cancels the claim */);

    return rc;
}

/* ------------------------------------------------------------------------ */

static int bootearly(struct xc_dom_image *dom)
{
    if ( dom->container_type == XC_DOM_PV_CONTAINER &&
         elf_xen_feature_get(XENFEAT_auto_translated_physmap,
                             dom->f_active) )
    {
        DOMPRINTF("PV Autotranslate guests no longer supported");
        errno = EOPNOTSUPP;
        return -1;
    }

    return 0;
}
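/*
 * Background (added note): a PV top-level page table must be pinned
 * (MMUEXT_PIN_L2/L3/L4_TABLE, chosen per guest type in bootlate_pv()
 * below) before the guest can run on it; pinning makes Xen validate every
 * entry and treat the frames as page tables from then on, which also
 * requires that no foreign writable mappings remain -- hence the unmap
 * calls before pin_table().
 */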
static int bootlate_pv(struct xc_dom_image *dom)
{
    static const struct {
        char *guest;
        unsigned long pgd_type;
    } types[] = {
        { "xen-3.0-x86_32",  MMUEXT_PIN_L2_TABLE },
        { "xen-3.0-x86_32p", MMUEXT_PIN_L3_TABLE },
        { "xen-3.0-x86_64",  MMUEXT_PIN_L4_TABLE },
    };
    unsigned long pgd_type = 0;
    shared_info_t *shared_info;
    xen_pfn_t shinfo;
    int i, rc;

    for ( i = 0; i < ARRAY_SIZE(types); i++ )
        if ( !strcmp(types[i].guest, dom->guest_type) )
            pgd_type = types[i].pgd_type;

    /* Drop references to all initial page tables before pinning. */
    xc_dom_unmap_one(dom, dom->pgtables_seg.pfn);
    xc_dom_unmap_one(dom, dom->p2m_seg.pfn);
    rc = pin_table(dom->xch, pgd_type,
                   xc_dom_p2m(dom, dom->pgtables_seg.pfn),
                   dom->guest_domid);
    if ( rc != 0 )
    {
        xc_dom_panic(dom->xch, XC_INTERNAL_ERROR,
                     "%s: pin_table failed (pfn 0x%" PRIpfn ", rc=%d)",
                     __FUNCTION__, dom->pgtables_seg.pfn, rc);
        return rc;
    }
    shinfo = dom->shared_info_mfn;

    /* setup shared_info page */
    DOMPRINTF("%s: shared_info: pfn 0x%" PRIpfn ", mfn 0x%" PRIpfn "",
              __FUNCTION__, dom->shared_info_pfn, dom->shared_info_mfn);
    shared_info = xc_map_foreign_range(dom->xch, dom->guest_domid,
                                       PAGE_SIZE_X86,
                                       PROT_READ | PROT_WRITE,
                                       shinfo);
    if ( shared_info == NULL )
        return -1;
    dom->arch_hooks->shared_info(dom, shared_info);
    munmap(shared_info, PAGE_SIZE_X86);

    return 0;
}

static int alloc_pgtables_hvm(struct xc_dom_image *dom)
{
    DOMPRINTF("%s: doing nothing", __func__);
    return 0;
}

/*
 * The memory layout of the start_info page and the modules, and where the
 * addresses are stored:
 *
 * /----------------------------------\
 * | struct hvm_start_info            |
 * +----------------------------------+ <- start_info->modlist_paddr
 * | struct hvm_modlist_entry[0]      |
 * +----------------------------------+
 * | struct hvm_modlist_entry[1]      |
 * +----------------------------------+ <- modlist[0].cmdline_paddr
 * | cmdline of module 0              |
 * | char[HVMLOADER_MODULE_NAME_SIZE] |
 * +----------------------------------+ <- modlist[1].cmdline_paddr
 * | cmdline of module 1              |
 * +----------------------------------+
 */
static void add_module_to_list(struct xc_dom_image *dom,
                               struct xc_hvm_firmware_module *module,
                               const char *cmdline,
                               struct hvm_modlist_entry *modlist,
                               struct hvm_start_info *start_info)
{
    uint32_t index = start_info->nr_modules;
    void *modules_cmdline_start = modlist + HVMLOADER_MODULE_MAX_COUNT;
    uint64_t modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
        ((uintptr_t)modlist - (uintptr_t)start_info);
    uint64_t modules_cmdline_paddr = modlist_paddr +
        sizeof(struct hvm_modlist_entry) * HVMLOADER_MODULE_MAX_COUNT;

    if ( module->length == 0 )
        return;

    assert(start_info->nr_modules < HVMLOADER_MODULE_MAX_COUNT);

    modlist[index].paddr = module->guest_addr_out;
    modlist[index].size = module->length;

    if ( cmdline )
    {
        assert(strnlen(cmdline, HVMLOADER_MODULE_CMDLINE_SIZE)
               < HVMLOADER_MODULE_CMDLINE_SIZE);
        strncpy(modules_cmdline_start + HVMLOADER_MODULE_CMDLINE_SIZE * index,
                cmdline, HVMLOADER_MODULE_CMDLINE_SIZE);
    }

    modlist[index].cmdline_paddr = modules_cmdline_paddr +
                                   HVMLOADER_MODULE_CMDLINE_SIZE * index;

    start_info->nr_modules++;
}
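/*
 * Added example of the layout arithmetic above: with
 * HVMLOADER_MODULE_MAX_COUNT == 2, module 0's command line starts at
 * modlist_paddr + 2 * sizeof(struct hvm_modlist_entry) and module 1's
 * follows HVMLOADER_MODULE_CMDLINE_SIZE bytes later, matching the diagram
 * before add_module_to_list().
 */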
static int bootlate_hvm(struct xc_dom_image *dom)
{
    uint32_t domid = dom->guest_domid;
    xc_interface *xch = dom->xch;
    struct hvm_start_info *start_info;
    size_t start_info_size;
    struct hvm_modlist_entry *modlist;
    unsigned int i;

    start_info_size = sizeof(*start_info) + dom->cmdline_size;
    start_info_size += sizeof(struct hvm_modlist_entry) * dom->num_modules;

    if ( start_info_size >
         dom->start_info_seg.pages << XC_DOM_PAGE_SHIFT(dom) )
    {
        DOMPRINTF("Trying to map beyond start_info_seg");
        return -1;
    }

    start_info = xc_map_foreign_range(xch, domid, start_info_size,
                                      PROT_READ | PROT_WRITE,
                                      dom->start_info_seg.pfn);
    if ( start_info == NULL )
    {
        DOMPRINTF("Unable to map HVM start info page");
        return -1;
    }

    modlist = (void*)(start_info + 1) + dom->cmdline_size;

    if ( !dom->device_model )
    {
        if ( dom->cmdline )
        {
            char *cmdline = (void*)(start_info + 1);

            strncpy(cmdline, dom->cmdline, dom->cmdline_size);
            start_info->cmdline_paddr =
                (dom->start_info_seg.pfn << PAGE_SHIFT) +
                ((uintptr_t)cmdline - (uintptr_t)start_info);
        }

        for ( i = 0; i < dom->num_modules; i++ )
        {
            struct xc_hvm_firmware_module mod;

            DOMPRINTF("Adding module %u", i);
            mod.guest_addr_out =
                dom->modules[i].seg.vstart - dom->parms.virt_base;
            mod.length =
                dom->modules[i].seg.vend - dom->modules[i].seg.vstart;

            add_module_to_list(dom, &mod, dom->modules[i].cmdline,
                               modlist, start_info);
        }

        /* ACPI module 0 is the RSDP */
        start_info->rsdp_paddr = dom->acpi_modules[0].guest_addr_out ? : 0;
    }
    else
    {
        add_module_to_list(dom, &dom->system_firmware_module, "firmware",
                           modlist, start_info);
    }

    if ( start_info->nr_modules )
    {
        start_info->modlist_paddr = (dom->start_info_seg.pfn << PAGE_SHIFT) +
            ((uintptr_t)modlist - (uintptr_t)start_info);
    }

    start_info->magic = XEN_HVM_START_MAGIC_VALUE;

    munmap(start_info, start_info_size);

    if ( dom->device_model )
    {
        void *hvm_info_page;

        if ( (hvm_info_page = xc_map_foreign_range(
                  xch, domid, PAGE_SIZE, PROT_READ | PROT_WRITE,
                  HVM_INFO_PFN)) == NULL )
            return -1;
        build_hvm_info(hvm_info_page, dom);
        munmap(hvm_info_page, PAGE_SIZE);
    }

    return 0;
}

bool xc_dom_translated(const struct xc_dom_image *dom)
{
    /* HVM guests are translated.  PV guests are not. */
    return dom->container_type == XC_DOM_HVM_CONTAINER;
}

/* ------------------------------------------------------------------------ */

static struct xc_dom_arch xc_dom_32_pae = {
    .guest_type = "xen-3.0-x86_32p",
    .native_protocol = XEN_IO_PROTO_ABI_X86_32,
    .page_shift = PAGE_SHIFT_X86,
    .sizeof_pfn = 4,
    .p2m_base_supported = 0,
    .arch_private_size = sizeof(struct xc_dom_image_x86),
    .alloc_magic_pages = alloc_magic_pages_pv,
    .alloc_pgtables = alloc_pgtables_x86_32_pae,
    .alloc_p2m_list = alloc_p2m_list_x86_32,
    .setup_pgtables = setup_pgtables_x86_32_pae,
    .start_info = start_info_x86_32,
    .shared_info = shared_info_x86_32,
    .vcpu = vcpu_x86_32,
    .meminit = meminit_pv,
    .bootearly = bootearly,
    .bootlate = bootlate_pv,
};

static struct xc_dom_arch xc_dom_64 = {
    .guest_type = "xen-3.0-x86_64",
    .native_protocol = XEN_IO_PROTO_ABI_X86_64,
    .page_shift = PAGE_SHIFT_X86,
    .sizeof_pfn = 8,
    .p2m_base_supported = 1,
    .arch_private_size = sizeof(struct xc_dom_image_x86),
    .alloc_magic_pages = alloc_magic_pages_pv,
    .alloc_pgtables = alloc_pgtables_x86_64,
    .alloc_p2m_list = alloc_p2m_list_x86_64,
    .setup_pgtables = setup_pgtables_x86_64,
    .start_info = start_info_x86_64,
    .shared_info = shared_info_x86_64,
    .vcpu = vcpu_x86_64,
    .meminit = meminit_pv,
    .bootearly = bootearly,
    .bootlate = bootlate_pv,
};

static struct xc_dom_arch xc_hvm_32 = {
    .guest_type = "hvm-3.0-x86_32",
    .native_protocol = XEN_IO_PROTO_ABI_X86_32,
    .page_shift = PAGE_SHIFT_X86,
    .sizeof_pfn = 4,
    .alloc_magic_pages = alloc_magic_pages_hvm,
    .alloc_pgtables = alloc_pgtables_hvm,
    .setup_pgtables = NULL,
    .start_info = NULL,
    .shared_info = NULL,
    .vcpu = vcpu_hvm,
    .meminit = meminit_hvm,
    .bootearly = bootearly,
    .bootlate = bootlate_hvm,
};

static void __init register_arch_hooks(void)
{
    xc_dom_register_arch_hooks(&xc_dom_32_pae);
    xc_dom_register_arch_hooks(&xc_dom_64);
    xc_dom_register_arch_hooks(&xc_hvm_32);
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */