/******************************************************************************
 * arch/x86/mm/hap/hap.c
 *
 * hardware assisted paging
 * Copyright (c) 2007 Advanced Micro Devices (Wei Huang)
 * Parts of this code are Copyright (c) 2007 by XenSource Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/sched.h>
#include <xen/lib.h>
#include <xen/trace.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/domain_page.h>
#include <xen/guest_access.h>
#include <xen/keyhandler.h>
#include <asm/event.h>
#include <asm/page.h>
#include <asm/current.h>
#include <asm/flushtlb.h>
#include <asm/shared.h>
#include <asm/hap.h>
#include <asm/paging.h>
#include <asm/p2m.h>
#include <asm/domain.h>
#include <xen/numa.h>
#include <asm/hvm/nestedhvm.h>

#include "private.h"

/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(_m) __mfn_to_page(mfn_x(_m))
#undef page_to_mfn
#define page_to_mfn(_pg) _mfn(__page_to_mfn(_pg))

/************************************************/
/*          HAP VRAM TRACKING SUPPORT           */
/************************************************/

/*
 * hap_track_dirty_vram()
 * Create the domain's dv_dirty_vram struct on demand.
 * Create a dirty vram range on demand when some [begin_pfn:begin_pfn+nr] is
 * first encountered.
 * Collect the guest_dirty bitmask, a bit mask of the dirty vram pages, by
 * calling paging_log_dirty_range(), which interrogates each vram
 * page's p2m type looking for pages that have been made writable.
 */
int hap_track_dirty_vram(struct domain *d,
                         unsigned long begin_pfn,
                         unsigned long nr,
                         XEN_GUEST_HANDLE_PARAM(void) guest_dirty_bitmap)
{
    long rc = 0;
    struct sh_dirty_vram *dirty_vram;
    uint8_t *dirty_bitmap = NULL;

    if ( nr )
    {
        int size = (nr + BITS_PER_BYTE - 1) / BITS_PER_BYTE;

        if ( !paging_mode_log_dirty(d) )
        {
            rc = paging_log_dirty_enable(d, 0);
            if ( rc )
                goto out;
        }

        rc = -ENOMEM;
        dirty_bitmap = vzalloc(size);
        if ( !dirty_bitmap )
            goto out;

        paging_lock(d);

        dirty_vram = d->arch.hvm_domain.dirty_vram;
        if ( !dirty_vram )
        {
            rc = -ENOMEM;
            if ( (dirty_vram = xzalloc(struct sh_dirty_vram)) == NULL )
            {
                paging_unlock(d);
                goto out;
            }

            d->arch.hvm_domain.dirty_vram = dirty_vram;
        }

        if ( begin_pfn != dirty_vram->begin_pfn ||
             begin_pfn + nr != dirty_vram->end_pfn )
        {
            unsigned long ostart = dirty_vram->begin_pfn;
            unsigned long oend = dirty_vram->end_pfn;

            dirty_vram->begin_pfn = begin_pfn;
            dirty_vram->end_pfn = begin_pfn + nr;

            paging_unlock(d);

            if ( oend > ostart )
                p2m_change_type_range(d, ostart, oend,
                                      p2m_ram_logdirty, p2m_ram_rw);

            /*
             * Switch vram to log dirty mode, either by setting l1e entries
             * of P2M table to be read-only, or via hardware-assisted
             * log-dirty.
             */
            p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
                                  p2m_ram_rw, p2m_ram_logdirty);

            flush_tlb_mask(d->domain_dirty_cpumask);

            memset(dirty_bitmap, 0xff, size); /* consider all pages dirty */
        }
        else
        {
            paging_unlock(d);
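            /*
             * Pause the domain while the bitmap is collected, so that no
             * vcpu can dirty further vram pages mid-scan and the snapshot
             * handed back to the caller stays internally consistent.
             */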
            domain_pause(d);

            /* Flush dirty GFNs potentially cached by hardware. */
            p2m_flush_hardware_cached_dirty(d);

            /* get the bitmap */
            paging_log_dirty_range(d, begin_pfn, nr, dirty_bitmap);

            domain_unpause(d);
        }

        rc = -EFAULT;
        if ( copy_to_guest(guest_dirty_bitmap, dirty_bitmap, size) == 0 )
            rc = 0;
    }
    else
    {
        paging_lock(d);

        dirty_vram = d->arch.hvm_domain.dirty_vram;
        if ( dirty_vram )
        {
            /*
             * If zero pages are specified while tracking dirty vram,
             * stop tracking.
             */
            begin_pfn = dirty_vram->begin_pfn;
            nr = dirty_vram->end_pfn - dirty_vram->begin_pfn;
            xfree(dirty_vram);
            d->arch.hvm_domain.dirty_vram = NULL;
        }

        paging_unlock(d);

        if ( nr )
            p2m_change_type_range(d, begin_pfn, begin_pfn + nr,
                                  p2m_ram_logdirty, p2m_ram_rw);
    }
 out:
    vfree(dirty_bitmap);

    return rc;
}

/************************************************/
/*            HAP LOG DIRTY SUPPORT             */
/************************************************/

/*
 * hap code to call when log_dirty is enabled.  Returns 0 if no problem
 * found.
 *
 * NB: A domain that has a device assigned should not set log_global,
 * because there is no way to track memory updates from the device.
 */
static int hap_enable_log_dirty(struct domain *d, bool_t log_global)
{
    struct p2m_domain *p2m = p2m_get_hostp2m(d);

    /*
     * Refuse to turn on global log-dirty mode if
     * there are outstanding p2m_ioreq_server pages.
     */
    if ( log_global && read_atomic(&p2m->ioreq.entry_count) )
        return -EBUSY;

    /* turn on PG_log_dirty bit in paging mode */
    paging_lock(d);
    d->arch.paging.mode |= PG_log_dirty;
    paging_unlock(d);

    /* Enable hardware-assisted log-dirty if it is supported. */
    p2m_enable_hardware_log_dirty(d);

    if ( log_global )
    {
        /*
         * Switch to log dirty mode, either by setting l1e entries of P2M
         * table to be read-only, or via hardware-assisted log-dirty.
         */
        p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
        flush_tlb_mask(d->domain_dirty_cpumask);
    }

    return 0;
}

static int hap_disable_log_dirty(struct domain *d)
{
    paging_lock(d);
    d->arch.paging.mode &= ~PG_log_dirty;
    paging_unlock(d);

    /* Disable hardware-assisted log-dirty if it is supported. */
    p2m_disable_hardware_log_dirty(d);

    /*
     * switch to normal mode, either by setting l1e entries of P2M table to
     * normal mode, or via hardware-assisted log-dirty.
     */
    p2m_change_entry_type_global(d, p2m_ram_logdirty, p2m_ram_rw);

    return 0;
}

static void hap_clean_dirty_bitmap(struct domain *d)
{
    /*
     * Switch to log-dirty mode, either by setting l1e entries of P2M table
     * to be read-only, or via hardware-assisted log-dirty.
     */
    p2m_change_entry_type_global(d, p2m_ram_rw, p2m_ram_logdirty);
    flush_tlb_mask(d->domain_dirty_cpumask);
}

/************************************************/
/*             HAP SUPPORT FUNCTIONS            */
/************************************************/

static struct page_info *hap_alloc(struct domain *d)
{
    struct page_info *pg;

    ASSERT(paging_locked_by_me(d));

    pg = page_list_remove_head(&d->arch.paging.hap.freelist);
    if ( unlikely(!pg) )
        return NULL;

    d->arch.paging.hap.free_pages--;

    clear_domain_page(page_to_mfn(pg));

    return pg;
}

static void hap_free(struct domain *d, mfn_t mfn)
{
    struct page_info *pg = mfn_to_page(mfn);

    ASSERT(paging_locked_by_me(d));

    d->arch.paging.hap.free_pages++;
    page_list_add_tail(pg, &d->arch.paging.hap.freelist);
}
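/*
 * Note that pages are scrubbed when they leave the pool (the
 * clear_domain_page() call in hap_alloc() above), not when they are
 * returned to it, so entries sitting on the freelist may still hold stale
 * page-table contents until they are reallocated.
 */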
static struct page_info *hap_alloc_p2m_page(struct domain *d)
{
    struct page_info *pg;

    /*
     * This is called both from the p2m code (which never holds the
     * paging lock) and the log-dirty code (which always does).
     */
    paging_lock_recursive(d);

    pg = hap_alloc(d);

    if ( likely(pg != NULL) )
    {
        d->arch.paging.hap.total_pages--;
        d->arch.paging.hap.p2m_pages++;
        ASSERT(!page_get_owner(pg) && !(pg->count_info & PGC_count_mask));
    }
    else if ( !d->arch.paging.p2m_alloc_failed )
    {
        d->arch.paging.p2m_alloc_failed = 1;
        dprintk(XENLOG_ERR, "d%i failed to allocate from HAP pool\n",
                d->domain_id);
    }

    paging_unlock(d);

    return pg;
}

static void hap_free_p2m_page(struct domain *d, struct page_info *pg)
{
    struct domain *owner = page_get_owner(pg);

    /*
     * This is called both from the p2m code (which never holds the
     * paging lock) and the log-dirty code (which always does).
     */
    paging_lock_recursive(d);

    /* Should still have no owner and count zero. */
    if ( owner || (pg->count_info & PGC_count_mask) )
    {
        HAP_ERROR("d%d: Odd p2m page %"PRI_mfn" d=%d c=%lx t=%"PRtype_info"\n",
                  d->domain_id,
                  mfn_x(page_to_mfn(pg)),
                  owner ? owner->domain_id : DOMID_INVALID,
                  pg->count_info, pg->u.inuse.type_info);
        WARN();
        pg->count_info &= ~PGC_count_mask;
        page_set_owner(pg, NULL);
    }

    d->arch.paging.hap.p2m_pages--;
    d->arch.paging.hap.total_pages++;
    hap_free(d, page_to_mfn(pg));

    paging_unlock(d);
}

/* Return the size of the pool, rounded up to the nearest MB */
static unsigned int hap_get_allocation(struct domain *d)
{
    unsigned int pg = d->arch.paging.hap.total_pages
        + d->arch.paging.hap.p2m_pages;

    return ((pg >> (20 - PAGE_SHIFT))
            + ((pg & ((1 << (20 - PAGE_SHIFT)) - 1)) ? 1 : 0));
}
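/*
 * Accounting summary for the counters used above (an informal invariant,
 * not asserted in any single place):
 *
 *     total_pages == free_pages + pages in use as HAP page tables
 *     p2m_pages   == pages currently lent out via hap_alloc_p2m_page()
 *
 * hap_get_allocation() therefore reports the whole pool, including the
 * lent-out p2m pages.  With 4k pages (PAGE_SHIFT == 12) there are 256
 * pages per MB, so e.g. a combined pool of 300 pages is reported as
 * (300 >> 8) + 1 == 2MB.
 */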
/*
 * Set the pool of pages to the required number of pages.
 * Returns 0 for success, non-zero for failure.
 */
int hap_set_allocation(struct domain *d, unsigned int pages, bool *preempted)
{
    struct page_info *pg;

    ASSERT(paging_locked_by_me(d));

    if ( pages < d->arch.paging.hap.p2m_pages )
        pages = 0;
    else
        pages -= d->arch.paging.hap.p2m_pages;

    for ( ; ; )
    {
        if ( d->arch.paging.hap.total_pages < pages )
        {
            /* Need to allocate more memory from domheap */
            pg = alloc_domheap_page(d, MEMF_no_owner);
            if ( pg == NULL )
            {
                HAP_PRINTK("failed to allocate hap pages.\n");
                return -ENOMEM;
            }
            d->arch.paging.hap.free_pages++;
            d->arch.paging.hap.total_pages++;
            page_list_add_tail(pg, &d->arch.paging.hap.freelist);
        }
        else if ( d->arch.paging.hap.total_pages > pages )
        {
            /* Need to return memory to domheap */
            if ( page_list_empty(&d->arch.paging.hap.freelist) )
            {
                HAP_PRINTK("failed to free enough hap pages.\n");
                return -ENOMEM;
            }
            pg = page_list_remove_head(&d->arch.paging.hap.freelist);
            ASSERT(pg);
            d->arch.paging.hap.free_pages--;
            d->arch.paging.hap.total_pages--;
            free_domheap_page(pg);
        }
        else
            break;

        /* Check to see if we need to yield and try again */
        if ( preempted && general_preempt_check() )
        {
            *preempted = true;
            return 0;
        }
    }

    return 0;
}

static mfn_t hap_make_monitor_table(struct vcpu *v)
{
    struct domain *d = v->domain;
    struct page_info *pg;
    l4_pgentry_t *l4e;
    mfn_t m4mfn;

    ASSERT(pagetable_get_pfn(v->arch.monitor_table) == 0);

    if ( (pg = hap_alloc(d)) == NULL )
        goto oom;

    m4mfn = page_to_mfn(pg);
    l4e = map_domain_page(m4mfn);

    init_xen_l4_slots(l4e, m4mfn, d, INVALID_MFN, false);
    unmap_domain_page(l4e);

    return m4mfn;

 oom:
    HAP_ERROR("out of memory building monitor pagetable\n");
    domain_crash(d);
    return INVALID_MFN;
}

static void hap_destroy_monitor_table(struct vcpu* v, mfn_t mmfn)
{
    struct domain *d = v->domain;

    /* Put the memory back in the pool */
    hap_free(d, mmfn);
}

/************************************************/
/*          HAP DOMAIN LEVEL FUNCTIONS          */
/************************************************/

void hap_domain_init(struct domain *d)
{
    static const struct log_dirty_ops hap_ops = {
        .enable  = hap_enable_log_dirty,
        .disable = hap_disable_log_dirty,
        .clean   = hap_clean_dirty_bitmap,
    };

    INIT_PAGE_LIST_HEAD(&d->arch.paging.hap.freelist);

    /* Use HAP logdirty mechanism. */
    paging_log_dirty_init(d, &hap_ops);
}

/* return 0 for success, -errno for failure */
int hap_enable(struct domain *d, u32 mode)
{
    unsigned int old_pages;
    unsigned int i;
    int rv = 0;

    domain_pause(d);

    old_pages = d->arch.paging.hap.total_pages;
    if ( old_pages == 0 )
    {
        paging_lock(d);
        rv = hap_set_allocation(d, 256, NULL);
        if ( rv != 0 )
        {
            hap_set_allocation(d, 0, NULL);
            paging_unlock(d);
            goto out;
        }
        paging_unlock(d);
    }

    /* Allow p2m and log-dirty code to borrow our memory */
    d->arch.paging.alloc_page = hap_alloc_p2m_page;
    d->arch.paging.free_page = hap_free_p2m_page;

    /* allocate P2M table */
    if ( mode & PG_translate )
    {
        rv = p2m_alloc_table(p2m_get_hostp2m(d));
        if ( rv != 0 )
            goto out;
    }

    for ( i = 0; i < MAX_NESTEDP2M; i++ )
    {
        rv = p2m_alloc_table(d->arch.nested_p2m[i]);
        if ( rv != 0 )
            goto out;
    }

    if ( hvm_altp2m_supported() )
    {
        /* Init alternate p2m data */
        if ( (d->arch.altp2m_eptp = alloc_xenheap_page()) == NULL )
        {
            rv = -ENOMEM;
            goto out;
        }

        for ( i = 0; i < MAX_EPTP; i++ )
            d->arch.altp2m_eptp[i] = mfn_x(INVALID_MFN);

        for ( i = 0; i < MAX_ALTP2M; i++ )
        {
            rv = p2m_alloc_table(d->arch.altp2m_p2m[i]);
            if ( rv != 0 )
                goto out;
        }

        d->arch.altp2m_active = 0;
    }

    /* Now let other users see the new mode */
    d->arch.paging.mode = mode | PG_HAP_enable;

 out:
    domain_unpause(d);
    return rv;
}

void hap_final_teardown(struct domain *d)
{
    unsigned int i;

    if ( hvm_altp2m_supported() )
    {
        d->arch.altp2m_active = 0;

        if ( d->arch.altp2m_eptp )
        {
            free_xenheap_page(d->arch.altp2m_eptp);
            d->arch.altp2m_eptp = NULL;
        }

        for ( i = 0; i < MAX_ALTP2M; i++ )
            p2m_teardown(d->arch.altp2m_p2m[i]);
    }

    /* Destroy nestedp2m's first */
    for ( i = 0; i < MAX_NESTEDP2M; i++ )
        p2m_teardown(d->arch.nested_p2m[i]);

    if ( d->arch.paging.hap.total_pages != 0 )
        hap_teardown(d, NULL);

    p2m_teardown(p2m_get_hostp2m(d));

    /* Free any memory that the p2m teardown released */
    paging_lock(d);
    hap_set_allocation(d, 0, NULL);
    ASSERT(d->arch.paging.hap.p2m_pages == 0);
    paging_unlock(d);
}

void hap_teardown(struct domain *d, bool *preempted)
{
    struct vcpu *v;
    mfn_t mfn;

    ASSERT(d->is_dying);
    ASSERT(d != current->domain);

    paging_lock(d); /* Keep various asserts happy */

    if ( paging_mode_enabled(d) )
    {
        /* release the monitor table held by each vcpu */
        for_each_vcpu ( d, v )
        {
            if ( paging_get_hostmode(v) && paging_mode_external(d) )
            {
                mfn = pagetable_get_mfn(v->arch.monitor_table);
                if ( mfn_valid(mfn) && (mfn_x(mfn) != 0) )
                    hap_destroy_monitor_table(v, mfn);
                v->arch.monitor_table = pagetable_null();
            }
        }
    }

    if ( d->arch.paging.hap.total_pages != 0 )
    {
        hap_set_allocation(d, 0, preempted);

        if ( preempted && *preempted )
            goto out;

        ASSERT(d->arch.paging.hap.total_pages == 0);
    }

    d->arch.paging.mode &= ~PG_log_dirty;

    xfree(d->arch.hvm_domain.dirty_vram);
    d->arch.hvm_domain.dirty_vram = NULL;

 out:
    paging_unlock(d);
}
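/*
 * hap_domctl() below is reached from the toolstack via the shadow-op
 * domctl.  A minimal caller sketch (libxc's xc_shadow_control() wraps this
 * domctl; the exact prototype varies between releases):
 *
 *     unsigned long mb = 16;   // request a 16MB HAP pool
 *     xc_shadow_control(xch, domid, XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION,
 *                       NULL, 0, &mb, 0, NULL);
 *     // on return, mb holds the actual allocation, rounded up to 1MB
 *
 * Because of the continuation logic, the hypercall may internally restart
 * until the pool has been fully resized.
 */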
int hap_domctl(struct domain *d, struct xen_domctl_shadow_op *sc,
               XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
{
    int rc;
    bool preempted = false;

    switch ( sc->op )
    {
    case XEN_DOMCTL_SHADOW_OP_SET_ALLOCATION:
        paging_lock(d);
        rc = hap_set_allocation(d, sc->mb << (20 - PAGE_SHIFT), &preempted);
        paging_unlock(d);

        if ( preempted )
            /* Not finished.  Set up to re-run the call. */
            rc = hypercall_create_continuation(__HYPERVISOR_domctl, "h",
                                               u_domctl);
        else
            /* Finished.  Return the new allocation */
            sc->mb = hap_get_allocation(d);

        return rc;

    case XEN_DOMCTL_SHADOW_OP_GET_ALLOCATION:
        sc->mb = hap_get_allocation(d);
        /* Fall through... */
    case XEN_DOMCTL_SHADOW_OP_OFF:
        return 0;

    default:
        HAP_PRINTK("Bad hap domctl op %u\n", sc->op);
        return -EINVAL;
    }
}

static const struct paging_mode hap_paging_real_mode;
static const struct paging_mode hap_paging_protected_mode;
static const struct paging_mode hap_paging_pae_mode;
static const struct paging_mode hap_paging_long_mode;

void hap_vcpu_init(struct vcpu *v)
{
    v->arch.paging.mode = &hap_paging_real_mode;
    v->arch.paging.nestedmode = &hap_paging_real_mode;
}

/************************************************/
/*          HAP PAGING MODE FUNCTIONS           */
/************************************************/

/*
 * HAP guests can handle page faults (in the guest page tables) without
 * needing any action from Xen, so we should not be intercepting them.
 */
static int hap_page_fault(struct vcpu *v, unsigned long va,
                          struct cpu_user_regs *regs)
{
    struct domain *d = v->domain;

    HAP_ERROR("Intercepted a guest #PF (%pv) with HAP enabled\n", v);
    domain_crash(d);
    return 0;
}

/*
 * HAP guests can handle invlpg without needing any action from Xen, so
 * we should not be intercepting it.  However, we need to correctly handle
 * getting here from instruction emulation.
 */
static bool_t hap_invlpg(struct vcpu *v, unsigned long va)
{
    /*
     * Emulate INVLPGA:
     * Must perform the flush right now; otherwise another vcpu may use a
     * stale nested p2m when we emulate the next VMRUN.
     */
    if ( nestedhvm_enabled(v->domain) && vcpu_nestedhvm(v).nv_p2m )
        p2m_flush(v, vcpu_nestedhvm(v).nv_p2m);

    return 1;
}

static void hap_update_cr3(struct vcpu *v, int do_locking)
{
    v->arch.hvm_vcpu.hw_cr[3] = v->arch.hvm_vcpu.guest_cr[3];
    hvm_update_guest_cr(v, 3);
}

const struct paging_mode *
hap_paging_get_mode(struct vcpu *v)
{
    return (!hvm_paging_enabled(v)  ? &hap_paging_real_mode :
            hvm_long_mode_active(v) ? &hap_paging_long_mode :
            hvm_pae_enabled(v)      ? &hap_paging_pae_mode  :
                                      &hap_paging_protected_mode);
}

static void hap_update_paging_modes(struct vcpu *v)
{
    struct domain *d = v->domain;
    unsigned long cr3_gfn = v->arch.hvm_vcpu.guest_cr[3] >> PAGE_SHIFT;
    p2m_type_t t;

    /*
     * We hold onto the cr3 as it may be modified later, and
     * we need to respect lock ordering.  No need for
     * checks here as they are performed by vmx_load_pdptrs
     * (the potential user of the cr3).
     */
    (void)get_gfn(d, cr3_gfn, &t);
    paging_lock(d);

    v->arch.paging.mode = hap_paging_get_mode(v);

    if ( pagetable_is_null(v->arch.monitor_table) )
    {
        mfn_t mmfn = hap_make_monitor_table(v);
        v->arch.monitor_table = pagetable_from_mfn(mmfn);
        make_cr3(v, mmfn);
        hvm_update_host_cr3(v);
    }

    /* CR3 is effectively updated by a mode change.  Flush ASIDs, etc. */
    hap_update_cr3(v, 0);

    paging_unlock(d);
    put_gfn(d, cr3_gfn);
}
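/*
 * The p2m entry writer below skips the (expensive) nested-p2m flush when
 * the MFN is unchanged and the permissions have only become more
 * permissive, e.g. a read-only log-dirty entry turned back into
 * read-write: any stale, stricter mapping still cached by a nested p2m
 * can at worst cause a spurious fault that is resolved by a refill, never
 * a wrong translation being visible to the guest.
 */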
static void
hap_write_p2m_entry(struct domain *d, unsigned long gfn, l1_pgentry_t *p,
                    l1_pgentry_t new, unsigned int level)
{
    uint32_t old_flags;
    bool_t flush_nestedp2m = 0;

    /*
     * We always use the host p2m here, regardless of whether the vcpu
     * is in host or guest mode.  The vcpu can be in guest mode via a
     * hypercall which passes a domain and (mostly) operates on the
     * first vcpu.
     */
    paging_lock(d);
    old_flags = l1e_get_flags(*p);

    if ( nestedhvm_enabled(d) && (old_flags & _PAGE_PRESENT)
         && !p2m_get_hostp2m(d)->defer_nested_flush )
    {
        /*
         * We are replacing a valid entry so we need to flush nested p2ms,
         * unless the only change is an increase in access rights.
         */
        mfn_t omfn = l1e_get_mfn(*p);
        mfn_t nmfn = l1e_get_mfn(new);

        flush_nestedp2m = !(mfn_x(omfn) == mfn_x(nmfn)
            && perms_strictly_increased(old_flags, l1e_get_flags(new)) );
    }

    safe_write_pte(p, new);
    if ( old_flags & _PAGE_PRESENT )
        flush_tlb_mask(d->domain_dirty_cpumask);

    paging_unlock(d);

    if ( flush_nestedp2m )
        p2m_flush_nestedp2m(d);
}

static unsigned long hap_gva_to_gfn_real_mode(
    struct vcpu *v, struct p2m_domain *p2m, unsigned long gva, uint32_t *pfec)
{
    return ((paddr_t)gva >> PAGE_SHIFT);
}

static unsigned long hap_p2m_ga_to_gfn_real_mode(
    struct vcpu *v, struct p2m_domain *p2m, unsigned long cr3,
    paddr_t ga, uint32_t *pfec, unsigned int *page_order)
{
    if ( page_order )
        *page_order = PAGE_ORDER_4K;
    return (ga >> PAGE_SHIFT);
}

/* Entry points into this mode of the hap code. */
static const struct paging_mode hap_paging_real_mode = {
    .page_fault             = hap_page_fault,
    .invlpg                 = hap_invlpg,
    .gva_to_gfn             = hap_gva_to_gfn_real_mode,
    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_real_mode,
    .update_cr3             = hap_update_cr3,
    .update_paging_modes    = hap_update_paging_modes,
    .write_p2m_entry        = hap_write_p2m_entry,
    .guest_levels           = 1
};

static const struct paging_mode hap_paging_protected_mode = {
    .page_fault             = hap_page_fault,
    .invlpg                 = hap_invlpg,
    .gva_to_gfn             = hap_gva_to_gfn_2_levels,
    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_2_levels,
    .update_cr3             = hap_update_cr3,
    .update_paging_modes    = hap_update_paging_modes,
    .write_p2m_entry        = hap_write_p2m_entry,
    .guest_levels           = 2
};

static const struct paging_mode hap_paging_pae_mode = {
    .page_fault             = hap_page_fault,
    .invlpg                 = hap_invlpg,
    .gva_to_gfn             = hap_gva_to_gfn_3_levels,
    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_3_levels,
    .update_cr3             = hap_update_cr3,
    .update_paging_modes    = hap_update_paging_modes,
    .write_p2m_entry        = hap_write_p2m_entry,
    .guest_levels           = 3
};

static const struct paging_mode hap_paging_long_mode = {
    .page_fault             = hap_page_fault,
    .invlpg                 = hap_invlpg,
    .gva_to_gfn             = hap_gva_to_gfn_4_levels,
    .p2m_ga_to_gfn          = hap_p2m_ga_to_gfn_4_levels,
    .update_cr3             = hap_update_cr3,
    .update_paging_modes    = hap_update_paging_modes,
    .write_p2m_entry        = hap_write_p2m_entry,
    .guest_levels           = 4
};

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: nil
 * End:
 */