/* * hvm.c: Common hardware virtual machine abstractions. * * Copyright (c) 2004, Intel Corporation. * Copyright (c) 2005, International Business Machines Corporation. * Copyright (c) 2008, Citrix Systems, Inc. * * This program is free software; you can redistribute it and/or modify it * under the terms and conditions of the GNU General Public License, * version 2, as published by the Free Software Foundation. * * This program is distributed in the hope it will be useful, but WITHOUT * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for * more details. * * You should have received a copy of the GNU General Public License along with * this program; If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include bool_t __read_mostly hvm_enabled; #ifdef DBG_LEVEL_0 unsigned int opt_hvm_debug_level __read_mostly; integer_param("hvm_debug", opt_hvm_debug_level); #endif struct hvm_function_table hvm_funcs __read_mostly; /* * The I/O permission bitmap is globally shared by all HVM guests except * the hardware domain which needs a more permissive one. */ #define HVM_IOBITMAP_SIZE (3 * PAGE_SIZE) unsigned long __section(".bss.page_aligned") __aligned(PAGE_SIZE) hvm_io_bitmap[HVM_IOBITMAP_SIZE / BYTES_PER_LONG]; /* Xen command-line option to enable HAP */ static bool_t __initdata opt_hap_enabled = 1; boolean_param("hap", opt_hap_enabled); #ifndef opt_hvm_fep /* Permit use of the Forced Emulation Prefix in HVM guests */ bool_t __read_mostly opt_hvm_fep; boolean_param("hvm_fep", opt_hvm_fep); #endif static const char __initconst warning_hvm_fep[] = "WARNING: HVM FORCED EMULATION PREFIX IS AVAILABLE\n" "This option is *ONLY* intended to aid testing of Xen.\n" "It has implications on the security of the system.\n" "Please *DO NOT* use this in production.\n"; /* Xen command-line option to enable altp2m */ static bool_t __initdata opt_altp2m_enabled = 0; boolean_param("altp2m", opt_altp2m_enabled); static int cpu_callback( struct notifier_block *nfb, unsigned long action, void *hcpu) { unsigned int cpu = (unsigned long)hcpu; int rc = 0; switch ( action ) { case CPU_UP_PREPARE: rc = hvm_funcs.cpu_up_prepare(cpu); break; case CPU_DYING: hvm_cpu_down(); break; case CPU_UP_CANCELED: case CPU_DEAD: hvm_funcs.cpu_dead(cpu); break; default: break; } return !rc ? 
       NOTIFY_DONE : notifier_from_errno(rc);
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

static int __init hvm_enable(void)
{
    const struct hvm_function_table *fns = NULL;

    if ( cpu_has_vmx )
        fns = start_vmx();
    else if ( cpu_has_svm )
        fns = start_svm();

    if ( fns == NULL )
        return 0;

    hvm_funcs = *fns;
    hvm_enabled = 1;

    printk("HVM: %s enabled\n", fns->name);
    if ( !fns->hap_supported )
        printk("HVM: Hardware Assisted Paging (HAP) not detected\n");
    else if ( !opt_hap_enabled )
    {
        hvm_funcs.hap_supported = 0;
        printk("HVM: Hardware Assisted Paging (HAP) detected but disabled\n");
    }
    else
    {
        printk("HVM: Hardware Assisted Paging (HAP) detected\n");
        printk("HVM: HAP page sizes: 4kB");
        if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_2MB )
        {
            printk(", 2MB%s", opt_hap_2mb ? "" : " [disabled]");
            if ( !opt_hap_2mb )
                hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_2MB;
        }
        if ( fns->hap_capabilities & HVM_HAP_SUPERPAGE_1GB )
        {
            printk(", 1GB%s", opt_hap_1gb ? "" : " [disabled]");
            if ( !opt_hap_1gb )
                hvm_funcs.hap_capabilities &= ~HVM_HAP_SUPERPAGE_1GB;
        }
        printk("\n");
    }

    if ( !opt_altp2m_enabled )
        hvm_funcs.altp2m_supported = 0;

    if ( opt_hvm_fep )
        warning_add(warning_hvm_fep);

    /*
     * Allow direct access to the PC debug ports 0x80 and 0xed (they are
     * often used for I/O delays, but the vmexits simply slow things down).
     */
    memset(hvm_io_bitmap, ~0, sizeof(hvm_io_bitmap));
    if ( hvm_port80_allowed )
        __clear_bit(0x80, hvm_io_bitmap);
    __clear_bit(0xed, hvm_io_bitmap);

    register_cpu_notifier(&cpu_nfb);

    return 0;
}
presmp_initcall(hvm_enable);

/*
 * Need to re-inject a given event? We avoid re-injecting software exceptions
 * and interrupts because the faulting/trapping instruction can simply be
 * re-executed (neither VMX nor SVM update RIP when they VMEXIT during
 * INT3/INTO/INTn).
 */
int hvm_event_needs_reinjection(uint8_t type, uint8_t vector)
{
    switch ( type )
    {
    case X86_EVENTTYPE_EXT_INTR:
    case X86_EVENTTYPE_NMI:
        return 1;
    case X86_EVENTTYPE_HW_EXCEPTION:
        /*
         * SVM uses type 3 ("HW Exception") for #OF and #BP. We explicitly
         * check for these vectors, as they are really SW Exceptions. SVM has
         * not updated RIP to point after the trapping instruction (INT3/INTO).
         */
        return (vector != 3) && (vector != 4);
    default:
        /* Software exceptions/interrupts can be re-executed (e.g., INT n). */
        break;
    }

    return 0;
}

/*
 * Combine two hardware exceptions: @vec2 was raised during delivery of @vec1.
 * This means we can assume that @vec2 is contributory or a page fault.
 */
uint8_t hvm_combine_hw_exceptions(uint8_t vec1, uint8_t vec2)
{
    const unsigned int contributory_exceptions =
        (1 << TRAP_divide_error) |
        (1 << TRAP_invalid_tss) |
        (1 << TRAP_no_segment) |
        (1 << TRAP_stack_error) |
        (1 << TRAP_gp_fault);
    const unsigned int page_faults =
        (1 << TRAP_page_fault) |
        (1 << TRAP_virtualisation);

    /* Exception during double-fault delivery always causes a triple fault. */
    if ( vec1 == TRAP_double_fault )
    {
        hvm_triple_fault();
        return TRAP_double_fault; /* dummy return */
    }

    /* Exception during page-fault delivery always causes a double fault. */
    if ( (1u << vec1) & page_faults )
        return TRAP_double_fault;

    /* Discard the first exception if it's benign or if we now have a #PF. */
    if ( !((1u << vec1) & contributory_exceptions) ||
         ((1u << vec2) & page_faults) )
        return vec2;

    /* Cannot combine the exceptions: double fault.
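     *
     * For illustration, the rules above resolve as follows (@vec1 is the
     * exception that was being delivered, @vec2 the one raised during its
     * delivery):
     *   #DB (benign) then #GP  -> deliver #GP alone
     *   #GP then #PF           -> deliver #PF alone
     *   #GP then #NP           -> #DF
     *   #PF then anything      -> #DF
     *   #DF then anything      -> triple fault (handled above)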
*/ return TRAP_double_fault; } void hvm_set_rdtsc_exiting(struct domain *d, bool_t enable) { struct vcpu *v; for_each_vcpu ( d, v ) hvm_funcs.set_rdtsc_exiting(v, enable); } void hvm_get_guest_pat(struct vcpu *v, u64 *guest_pat) { if ( !hvm_funcs.get_guest_pat(v, guest_pat) ) *guest_pat = v->arch.hvm_vcpu.pat_cr; } int hvm_set_guest_pat(struct vcpu *v, u64 guest_pat) { int i; uint8_t *value = (uint8_t *)&guest_pat; for ( i = 0; i < 8; i++ ) switch ( value[i] ) { case PAT_TYPE_UC_MINUS: case PAT_TYPE_UNCACHABLE: case PAT_TYPE_WRBACK: case PAT_TYPE_WRCOMB: case PAT_TYPE_WRPROT: case PAT_TYPE_WRTHROUGH: break; default: HVM_DBG_LOG(DBG_LEVEL_MSR, "invalid guest PAT: %"PRIx64"\n", guest_pat); return 0; } if ( !hvm_funcs.set_guest_pat(v, guest_pat) ) v->arch.hvm_vcpu.pat_cr = guest_pat; return 1; } bool hvm_set_guest_bndcfgs(struct vcpu *v, u64 val) { if ( !hvm_funcs.set_guest_bndcfgs || !is_canonical_address(val) || (val & IA32_BNDCFGS_RESERVED) ) return false; /* * While MPX instructions are supposed to be gated on XCR0.BND*, let's * nevertheless force the relevant XCR0 bits on when the feature is being * enabled in BNDCFGS. */ if ( (val & IA32_BNDCFGS_ENABLE) && !(v->arch.xcr0_accum & (XSTATE_BNDREGS | XSTATE_BNDCSR)) ) { uint64_t xcr0 = get_xcr0(); int rc; if ( v != current ) return false; rc = handle_xsetbv(XCR_XFEATURE_ENABLED_MASK, xcr0 | XSTATE_BNDREGS | XSTATE_BNDCSR); if ( rc ) { HVM_DBG_LOG(DBG_LEVEL_1, "Failed to force XCR0.BND*: %d", rc); return false; } if ( handle_xsetbv(XCR_XFEATURE_ENABLED_MASK, xcr0) ) /* nothing, best effort only */; } return hvm_funcs.set_guest_bndcfgs(v, val); } /* * Get the ratio to scale host TSC frequency to gtsc_khz. zero will be * returned if TSC scaling is unavailable or ratio cannot be handled * by host CPU. Otherwise, a non-zero ratio will be returned. */ u64 hvm_get_tsc_scaling_ratio(u32 gtsc_khz) { u8 ratio_frac_bits = hvm_funcs.tsc_scaling.ratio_frac_bits; u64 max_ratio = hvm_funcs.tsc_scaling.max_ratio; u64 ratio, dummy; if ( !hvm_tsc_scaling_supported ) return 0; /* * Return early if the quotient is too large to fit in the integral * part of TSC scaling ratio. This also avoids #DE from the following * divq when the quotient can not fit in a 64-bit integer. */ if ( gtsc_khz / cpu_khz > (max_ratio >> ratio_frac_bits) ) return 0; /* ratio = (gtsc_khz << hvm_funcs.tsc_scaling.ratio_frac_bits) / cpu_khz */ asm ( "shldq %[frac],%[gkhz],%[zero] ; " "shlq %[frac],%[gkhz] ; " "divq %[hkhz] " : "=d" (dummy), "=a" (ratio) : [frac] "c" (ratio_frac_bits), [gkhz] "a" ((u64) gtsc_khz), [zero] "d" (0ULL), [hkhz] "rm" ((u64) cpu_khz) ); return ratio > max_ratio ? 
0 : ratio; } u64 hvm_scale_tsc(const struct domain *d, u64 tsc) { u64 ratio = d->arch.hvm_domain.tsc_scaling_ratio; u64 dummy; if ( ratio == hvm_default_tsc_scaling_ratio ) return tsc; /* tsc = (tsc * ratio) >> hvm_funcs.tsc_scaling.ratio_frac_bits */ asm ( "mulq %[ratio]; shrdq %[frac],%%rdx,%[tsc]" : [tsc] "+a" (tsc), "=&d" (dummy) : [frac] "c" (hvm_funcs.tsc_scaling.ratio_frac_bits), [ratio] "rm" (ratio) ); return tsc; } static void hvm_set_guest_tsc_fixed(struct vcpu *v, u64 guest_tsc, u64 at_tsc) { uint64_t tsc; uint64_t delta_tsc; if ( v->domain->arch.vtsc ) { tsc = hvm_get_guest_time_fixed(v, at_tsc); tsc = gtime_to_gtsc(v->domain, tsc); } else { tsc = at_tsc ?: rdtsc(); if ( hvm_tsc_scaling_supported ) tsc = hvm_scale_tsc(v->domain, tsc); } delta_tsc = guest_tsc - tsc; v->arch.hvm_vcpu.cache_tsc_offset = delta_tsc; hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, at_tsc); } #define hvm_set_guest_tsc(v, t) hvm_set_guest_tsc_fixed(v, t, 0) static void hvm_set_guest_tsc_msr(struct vcpu *v, u64 guest_tsc) { uint64_t tsc_offset = v->arch.hvm_vcpu.cache_tsc_offset; hvm_set_guest_tsc(v, guest_tsc); v->arch.hvm_vcpu.msr_tsc_adjust += v->arch.hvm_vcpu.cache_tsc_offset - tsc_offset; } static void hvm_set_guest_tsc_adjust(struct vcpu *v, u64 tsc_adjust) { v->arch.hvm_vcpu.cache_tsc_offset += tsc_adjust - v->arch.hvm_vcpu.msr_tsc_adjust; hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, 0); v->arch.hvm_vcpu.msr_tsc_adjust = tsc_adjust; } u64 hvm_get_guest_tsc_fixed(struct vcpu *v, uint64_t at_tsc) { uint64_t tsc; if ( v->domain->arch.vtsc ) { tsc = hvm_get_guest_time_fixed(v, at_tsc); tsc = gtime_to_gtsc(v->domain, tsc); } else { tsc = at_tsc ?: rdtsc(); if ( hvm_tsc_scaling_supported ) tsc = hvm_scale_tsc(v->domain, tsc); } return tsc + v->arch.hvm_vcpu.cache_tsc_offset; } void hvm_migrate_timers(struct vcpu *v) { rtc_migrate_timers(v); pt_migrate(v); } static int hvm_migrate_pirq(struct domain *d, struct hvm_pirq_dpci *pirq_dpci, void *arg) { struct vcpu *v = arg; if ( (pirq_dpci->flags & HVM_IRQ_DPCI_MACH_MSI) && /* Needn't migrate pirq if this pirq is delivered to guest directly.*/ !pirq_dpci->gmsi.posted && (pirq_dpci->gmsi.dest_vcpu_id == v->vcpu_id) ) { struct irq_desc *desc = pirq_spin_lock_irq_desc(dpci_pirq(pirq_dpci), NULL); if ( !desc ) return 0; ASSERT(MSI_IRQ(desc - irq_desc)); irq_set_affinity(desc, cpumask_of(v->processor)); spin_unlock_irq(&desc->lock); } return 0; } void hvm_migrate_pirqs(struct vcpu *v) { struct domain *d = v->domain; if ( !iommu_enabled || !hvm_domain_irq(d)->dpci ) return; spin_lock(&d->event_lock); pt_pirq_iterate(d, hvm_migrate_pirq, v); spin_unlock(&d->event_lock); } static bool hvm_get_pending_event(struct vcpu *v, struct x86_event *info) { info->cr2 = v->arch.hvm_vcpu.guest_cr[2]; return hvm_funcs.get_pending_event(v, info); } void hvm_do_resume(struct vcpu *v) { check_wakeup_from_wait(); pt_restore_timer(v); if ( !handle_hvm_io_completion(v) ) return; if ( unlikely(v->arch.vm_event) ) hvm_vm_event_do_resume(v); /* Inject pending hw/sw event */ if ( v->arch.hvm_vcpu.inject_event.vector >= 0 ) { smp_rmb(); if ( !hvm_event_pending(v) ) hvm_inject_event(&v->arch.hvm_vcpu.inject_event); v->arch.hvm_vcpu.inject_event.vector = HVM_EVENT_VECTOR_UNSET; } if ( unlikely(v->arch.vm_event) && v->arch.monitor.next_interrupt_enabled ) { struct x86_event info; if ( hvm_get_pending_event(v, &info) ) { hvm_monitor_interrupt(info.vector, info.type, info.error_code, info.cr2); v->arch.monitor.next_interrupt_enabled = false; } } } static int 
hvm_print_line( int dir, unsigned int port, unsigned int bytes, uint32_t *val) { struct domain *cd = current->domain; char c = *val; BUG_ON(bytes != 1); /* Accept only printable characters, newline, and horizontal tab. */ if ( !isprint(c) && (c != '\n') && (c != '\t') ) return X86EMUL_OKAY; spin_lock(&cd->pbuf_lock); if ( c != '\n' ) cd->pbuf[cd->pbuf_idx++] = c; if ( (cd->pbuf_idx == (DOMAIN_PBUF_SIZE - 1)) || (c == '\n') ) { cd->pbuf[cd->pbuf_idx] = '\0'; guest_printk(cd, XENLOG_G_DEBUG "%s\n", cd->pbuf); cd->pbuf_idx = 0; } spin_unlock(&cd->pbuf_lock); return X86EMUL_OKAY; } int hvm_domain_initialise(struct domain *d, unsigned long domcr_flags, struct xen_arch_domainconfig *config) { unsigned int nr_gsis; int rc; if ( !hvm_enabled ) { gdprintk(XENLOG_WARNING, "Attempt to create a HVM guest " "on a non-VT/AMDV platform.\n"); return -EINVAL; } spin_lock_init(&d->arch.hvm_domain.irq_lock); spin_lock_init(&d->arch.hvm_domain.uc_lock); spin_lock_init(&d->arch.hvm_domain.write_map.lock); INIT_LIST_HEAD(&d->arch.hvm_domain.write_map.list); INIT_LIST_HEAD(&d->arch.hvm_domain.g2m_ioport_list); rc = create_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0, NULL, NULL); if ( rc ) goto fail; hvm_init_cacheattr_region_list(d); rc = paging_enable(d, PG_refcounts|PG_translate|PG_external); if ( rc != 0 ) goto fail0; nr_gsis = is_hardware_domain(d) ? nr_irqs_gsi : NR_HVM_DOMU_IRQS; d->arch.hvm_domain.pl_time = xzalloc(struct pl_time); d->arch.hvm_domain.params = xzalloc_array(uint64_t, HVM_NR_PARAMS); d->arch.hvm_domain.io_handler = xzalloc_array(struct hvm_io_handler, NR_IO_HANDLERS); d->arch.hvm_domain.irq = xzalloc_bytes(hvm_irq_size(nr_gsis)); rc = -ENOMEM; if ( !d->arch.hvm_domain.pl_time || !d->arch.hvm_domain.irq || !d->arch.hvm_domain.params || !d->arch.hvm_domain.io_handler ) goto fail1; /* Set the number of GSIs */ hvm_domain_irq(d)->nr_gsis = nr_gsis; BUILD_BUG_ON(NR_HVM_DOMU_IRQS < NR_ISAIRQS); ASSERT(hvm_domain_irq(d)->nr_gsis >= NR_ISAIRQS); /* need link to containing domain */ d->arch.hvm_domain.pl_time->domain = d; /* Set the default IO Bitmap. */ if ( is_hardware_domain(d) ) { d->arch.hvm_domain.io_bitmap = _xmalloc(HVM_IOBITMAP_SIZE, PAGE_SIZE); if ( d->arch.hvm_domain.io_bitmap == NULL ) { rc = -ENOMEM; goto fail1; } memset(d->arch.hvm_domain.io_bitmap, ~0, HVM_IOBITMAP_SIZE); } else d->arch.hvm_domain.io_bitmap = hvm_io_bitmap; register_g2m_portio_handler(d); hvm_ioreq_init(d); hvm_init_guest_time(d); d->arch.hvm_domain.params[HVM_PARAM_TRIPLE_FAULT_REASON] = SHUTDOWN_reboot; vpic_init(d); rc = vioapic_init(d); if ( rc != 0 ) goto fail1; stdvga_init(d); rtc_init(d); register_portio_handler(d, 0xe9, 1, hvm_print_line); if ( hvm_tsc_scaling_supported ) d->arch.hvm_domain.tsc_scaling_ratio = hvm_default_tsc_scaling_ratio; rc = hvm_funcs.domain_initialise(d); if ( rc != 0 ) goto fail2; return 0; fail2: rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); fail1: if ( is_hardware_domain(d) ) xfree(d->arch.hvm_domain.io_bitmap); xfree(d->arch.hvm_domain.io_handler); xfree(d->arch.hvm_domain.params); xfree(d->arch.hvm_domain.pl_time); xfree(d->arch.hvm_domain.irq); fail0: hvm_destroy_cacheattr_region_list(d); destroy_perdomain_mapping(d, PERDOMAIN_VIRT_START, 0); fail: return rc; } void hvm_domain_relinquish_resources(struct domain *d) { if ( hvm_funcs.nhvm_domain_relinquish_resources ) hvm_funcs.nhvm_domain_relinquish_resources(d); viridian_domain_deinit(d); hvm_destroy_all_ioreq_servers(d); msixtbl_pt_cleanup(d); /* Stop all asynchronous timer actions. 
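     * (The RTC timer is stopped unconditionally; the PM timer and HPET are
     * only torn down if vcpu0 was ever created, as the code below checks.)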
*/ rtc_deinit(d); if ( d->vcpu != NULL && d->vcpu[0] != NULL ) { pmtimer_deinit(d); hpet_deinit(d); } } void hvm_domain_destroy(struct domain *d) { struct list_head *ioport_list, *tmp; struct g2m_ioport *ioport; xfree(d->arch.hvm_domain.io_handler); d->arch.hvm_domain.io_handler = NULL; xfree(d->arch.hvm_domain.params); d->arch.hvm_domain.params = NULL; hvm_destroy_cacheattr_region_list(d); hvm_funcs.domain_destroy(d); rtc_deinit(d); stdvga_deinit(d); vioapic_deinit(d); xfree(d->arch.hvm_domain.pl_time); d->arch.hvm_domain.pl_time = NULL; xfree(d->arch.hvm_domain.irq); d->arch.hvm_domain.irq = NULL; list_for_each_safe ( ioport_list, tmp, &d->arch.hvm_domain.g2m_ioport_list ) { ioport = list_entry(ioport_list, struct g2m_ioport, list); list_del(&ioport->list); xfree(ioport); } } static int hvm_save_tsc_adjust(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct hvm_tsc_adjust ctxt; int err = 0; for_each_vcpu ( d, v ) { ctxt.tsc_adjust = v->arch.hvm_vcpu.msr_tsc_adjust; err = hvm_save_entry(TSC_ADJUST, v->vcpu_id, h, &ctxt); if ( err ) break; } return err; } static int hvm_load_tsc_adjust(struct domain *d, hvm_domain_context_t *h) { unsigned int vcpuid = hvm_load_instance(h); struct vcpu *v; struct hvm_tsc_adjust ctxt; if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } if ( hvm_load_entry(TSC_ADJUST, h, &ctxt) != 0 ) return -EINVAL; v->arch.hvm_vcpu.msr_tsc_adjust = ctxt.tsc_adjust; return 0; } HVM_REGISTER_SAVE_RESTORE(TSC_ADJUST, hvm_save_tsc_adjust, hvm_load_tsc_adjust, 1, HVMSR_PER_VCPU); static int hvm_save_cpu_ctxt(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct hvm_hw_cpu ctxt; struct segment_register seg; for_each_vcpu ( d, v ) { /* We don't need to save state for a vcpu that is down; the restore * code will leave it down if there is nothing saved. 
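         * (The restore side, hvm_load_cpu_ctxt(), only wakes a vcpu for
         * which a CPU record is actually present in the stream.)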
*/ if ( v->pause_flags & VPF_down ) continue; memset(&ctxt, 0, sizeof(ctxt)); /* Architecture-specific vmcs/vmcb bits */ hvm_funcs.save_cpu_ctxt(v, &ctxt); ctxt.tsc = hvm_get_guest_tsc_fixed(v, d->arch.hvm_domain.sync_tsc); ctxt.msr_tsc_aux = hvm_msr_tsc_aux(v); hvm_get_segment_register(v, x86_seg_idtr, &seg); ctxt.idtr_limit = seg.limit; ctxt.idtr_base = seg.base; hvm_get_segment_register(v, x86_seg_gdtr, &seg); ctxt.gdtr_limit = seg.limit; ctxt.gdtr_base = seg.base; hvm_get_segment_register(v, x86_seg_cs, &seg); ctxt.cs_sel = seg.sel; ctxt.cs_limit = seg.limit; ctxt.cs_base = seg.base; ctxt.cs_arbytes = seg.attr; hvm_get_segment_register(v, x86_seg_ds, &seg); ctxt.ds_sel = seg.sel; ctxt.ds_limit = seg.limit; ctxt.ds_base = seg.base; ctxt.ds_arbytes = seg.attr; hvm_get_segment_register(v, x86_seg_es, &seg); ctxt.es_sel = seg.sel; ctxt.es_limit = seg.limit; ctxt.es_base = seg.base; ctxt.es_arbytes = seg.attr; hvm_get_segment_register(v, x86_seg_ss, &seg); ctxt.ss_sel = seg.sel; ctxt.ss_limit = seg.limit; ctxt.ss_base = seg.base; ctxt.ss_arbytes = seg.attr; hvm_get_segment_register(v, x86_seg_fs, &seg); ctxt.fs_sel = seg.sel; ctxt.fs_limit = seg.limit; ctxt.fs_base = seg.base; ctxt.fs_arbytes = seg.attr; hvm_get_segment_register(v, x86_seg_gs, &seg); ctxt.gs_sel = seg.sel; ctxt.gs_limit = seg.limit; ctxt.gs_base = seg.base; ctxt.gs_arbytes = seg.attr; hvm_get_segment_register(v, x86_seg_tr, &seg); ctxt.tr_sel = seg.sel; ctxt.tr_limit = seg.limit; ctxt.tr_base = seg.base; ctxt.tr_arbytes = seg.attr; hvm_get_segment_register(v, x86_seg_ldtr, &seg); ctxt.ldtr_sel = seg.sel; ctxt.ldtr_limit = seg.limit; ctxt.ldtr_base = seg.base; ctxt.ldtr_arbytes = seg.attr; if ( v->fpu_initialised ) { memcpy(ctxt.fpu_regs, v->arch.fpu_ctxt, sizeof(ctxt.fpu_regs)); ctxt.flags = XEN_X86_FPU_INITIALISED; } ctxt.rax = v->arch.user_regs.rax; ctxt.rbx = v->arch.user_regs.rbx; ctxt.rcx = v->arch.user_regs.rcx; ctxt.rdx = v->arch.user_regs.rdx; ctxt.rbp = v->arch.user_regs.rbp; ctxt.rsi = v->arch.user_regs.rsi; ctxt.rdi = v->arch.user_regs.rdi; ctxt.rsp = v->arch.user_regs.rsp; ctxt.rip = v->arch.user_regs.rip; ctxt.rflags = v->arch.user_regs.rflags; ctxt.r8 = v->arch.user_regs.r8; ctxt.r9 = v->arch.user_regs.r9; ctxt.r10 = v->arch.user_regs.r10; ctxt.r11 = v->arch.user_regs.r11; ctxt.r12 = v->arch.user_regs.r12; ctxt.r13 = v->arch.user_regs.r13; ctxt.r14 = v->arch.user_regs.r14; ctxt.r15 = v->arch.user_regs.r15; ctxt.dr0 = v->arch.debugreg[0]; ctxt.dr1 = v->arch.debugreg[1]; ctxt.dr2 = v->arch.debugreg[2]; ctxt.dr3 = v->arch.debugreg[3]; ctxt.dr6 = v->arch.debugreg[6]; ctxt.dr7 = v->arch.debugreg[7]; if ( hvm_save_entry(CPU, v->vcpu_id, h, &ctxt) != 0 ) return 1; } return 0; } /* Return a string indicating the error, or NULL for valid. 
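 *
 * Illustrative use by a caller validating a guest-requested EFER value
 * (this mirrors hvm_set_efer() further down; a negative cr0_pg means the
 * value is checked against the guest's view rather than a saved CR0 image):
 *
 *     errstr = hvm_efer_valid(v, value, -1);
 *     if ( errstr )
 *         return X86EMUL_EXCEPTION;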
*/ const char *hvm_efer_valid(const struct vcpu *v, uint64_t value, signed int cr0_pg) { const struct domain *d = v->domain; const struct cpuid_policy *p; if ( cr0_pg < 0 && !is_hardware_domain(d) ) p = d->arch.cpuid; else p = &host_cpuid_policy; if ( (value & EFER_SCE) && !p->extd.syscall ) return "SCE without feature"; if ( (value & (EFER_LME | EFER_LMA)) && !p->extd.lm ) return "LME/LMA without feature"; if ( (value & EFER_LMA) && (!(value & EFER_LME) || !cr0_pg) ) return "LMA/LME/CR0.PG inconsistency"; if ( (value & EFER_NX) && !p->extd.nx ) return "NX without feature"; if ( (value & EFER_SVME) && (!p->extd.svm || !nestedhvm_enabled(d)) ) return "SVME without nested virt"; if ( (value & EFER_LMSLE) && !cpu_has_lmsl ) return "LMSLE without support"; if ( (value & EFER_FFXSE) && !p->extd.ffxsr ) return "FFXSE without feature"; return NULL; } /* These reserved bits in lower 32 remain 0 after any load of CR0 */ #define HVM_CR0_GUEST_RESERVED_BITS \ (~((unsigned long) \ (X86_CR0_PE | X86_CR0_MP | X86_CR0_EM | \ X86_CR0_TS | X86_CR0_ET | X86_CR0_NE | \ X86_CR0_WP | X86_CR0_AM | X86_CR0_NW | \ X86_CR0_CD | X86_CR0_PG))) /* These bits in CR4 can be set by the guest. */ unsigned long hvm_cr4_guest_valid_bits(const struct vcpu *v, bool restore) { const struct domain *d = v->domain; const struct cpuid_policy *p; bool mce, vmxe; if ( !restore && !is_hardware_domain(d) ) p = d->arch.cpuid; else p = &host_cpuid_policy; /* Logic broken out simply to aid readability below. */ mce = p->basic.mce || p->basic.mca; vmxe = p->basic.vmx && (restore || nestedhvm_enabled(d)); return ((p->basic.vme ? X86_CR4_VME | X86_CR4_PVI : 0) | (p->basic.tsc ? X86_CR4_TSD : 0) | (p->basic.de ? X86_CR4_DE : 0) | (p->basic.pse ? X86_CR4_PSE : 0) | (p->basic.pae ? X86_CR4_PAE : 0) | (mce ? X86_CR4_MCE : 0) | (p->basic.pge ? X86_CR4_PGE : 0) | X86_CR4_PCE | (p->basic.fxsr ? X86_CR4_OSFXSR : 0) | (p->basic.sse ? X86_CR4_OSXMMEXCPT : 0) | (vmxe ? X86_CR4_VMXE : 0) | (p->feat.fsgsbase ? X86_CR4_FSGSBASE : 0) | (p->basic.pcid ? X86_CR4_PCIDE : 0) | (p->basic.xsave ? X86_CR4_OSXSAVE : 0) | (p->feat.smep ? X86_CR4_SMEP : 0) | (p->feat.smap ? X86_CR4_SMAP : 0) | (p->feat.umip ? X86_CR4_UMIP : 0) | (p->feat.pku ? X86_CR4_PKE : 0)); } static int hvm_load_cpu_ctxt(struct domain *d, hvm_domain_context_t *h) { int vcpuid; struct vcpu *v; struct hvm_hw_cpu ctxt; struct segment_register seg; const char *errstr; struct xsave_struct *xsave_area; /* Which vcpu is this? */ vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%u has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } if ( hvm_load_entry_zeroextend(CPU, h, &ctxt) != 0 ) return -EINVAL; if ( ctxt.pad0 != 0 ) return -EINVAL; /* Sanity check some control registers. 
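     * (Reserved CR0 bits, an invalid CR0.PE/PG combination, CR4 bits the
     * guest may not set, and an inconsistent EFER are all rejected here,
     * before any state is loaded into the vcpu.)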
*/ if ( (ctxt.cr0 & HVM_CR0_GUEST_RESERVED_BITS) || !(ctxt.cr0 & X86_CR0_ET) || ((ctxt.cr0 & (X86_CR0_PE|X86_CR0_PG)) == X86_CR0_PG) ) { printk(XENLOG_G_ERR "HVM%d restore: bad CR0 %#" PRIx64 "\n", d->domain_id, ctxt.cr0); return -EINVAL; } if ( ctxt.cr4 & ~hvm_cr4_guest_valid_bits(v, 1) ) { printk(XENLOG_G_ERR "HVM%d restore: bad CR4 %#" PRIx64 "\n", d->domain_id, ctxt.cr4); return -EINVAL; } errstr = hvm_efer_valid(v, ctxt.msr_efer, MASK_EXTR(ctxt.cr0, X86_CR0_PG)); if ( errstr ) { printk(XENLOG_G_ERR "%pv: HVM restore: bad EFER %#" PRIx64 " - %s\n", v, ctxt.msr_efer, errstr); return -EINVAL; } if ( (ctxt.flags & ~XEN_X86_FPU_INITIALISED) != 0 ) { gprintk(XENLOG_ERR, "bad flags value in CPU context: %#x\n", ctxt.flags); return -EINVAL; } /* Older Xen versions used to save the segment arbytes directly * from the VMCS on Intel hosts. Detect this and rearrange them * into the struct segment_register format. */ #define UNFOLD_ARBYTES(_r) \ if ( (_r & 0xf000) && !(_r & 0x0f00) ) \ _r = ((_r & 0xff) | ((_r >> 4) & 0xf00)) UNFOLD_ARBYTES(ctxt.cs_arbytes); UNFOLD_ARBYTES(ctxt.ds_arbytes); UNFOLD_ARBYTES(ctxt.es_arbytes); UNFOLD_ARBYTES(ctxt.fs_arbytes); UNFOLD_ARBYTES(ctxt.gs_arbytes); UNFOLD_ARBYTES(ctxt.ss_arbytes); UNFOLD_ARBYTES(ctxt.tr_arbytes); UNFOLD_ARBYTES(ctxt.ldtr_arbytes); #undef UNFOLD_ARBYTES /* Architecture-specific vmcs/vmcb bits */ if ( hvm_funcs.load_cpu_ctxt(v, &ctxt) < 0 ) return -EINVAL; if ( hvm_funcs.tsc_scaling.setup ) hvm_funcs.tsc_scaling.setup(v); v->arch.hvm_vcpu.msr_tsc_aux = ctxt.msr_tsc_aux; hvm_set_guest_tsc_fixed(v, ctxt.tsc, d->arch.hvm_domain.sync_tsc); seg.limit = ctxt.idtr_limit; seg.base = ctxt.idtr_base; hvm_set_segment_register(v, x86_seg_idtr, &seg); seg.limit = ctxt.gdtr_limit; seg.base = ctxt.gdtr_base; hvm_set_segment_register(v, x86_seg_gdtr, &seg); seg.sel = ctxt.cs_sel; seg.limit = ctxt.cs_limit; seg.base = ctxt.cs_base; seg.attr = ctxt.cs_arbytes; hvm_set_segment_register(v, x86_seg_cs, &seg); seg.sel = ctxt.ds_sel; seg.limit = ctxt.ds_limit; seg.base = ctxt.ds_base; seg.attr = ctxt.ds_arbytes; hvm_set_segment_register(v, x86_seg_ds, &seg); seg.sel = ctxt.es_sel; seg.limit = ctxt.es_limit; seg.base = ctxt.es_base; seg.attr = ctxt.es_arbytes; hvm_set_segment_register(v, x86_seg_es, &seg); seg.sel = ctxt.ss_sel; seg.limit = ctxt.ss_limit; seg.base = ctxt.ss_base; seg.attr = ctxt.ss_arbytes; hvm_set_segment_register(v, x86_seg_ss, &seg); seg.sel = ctxt.fs_sel; seg.limit = ctxt.fs_limit; seg.base = ctxt.fs_base; seg.attr = ctxt.fs_arbytes; hvm_set_segment_register(v, x86_seg_fs, &seg); seg.sel = ctxt.gs_sel; seg.limit = ctxt.gs_limit; seg.base = ctxt.gs_base; seg.attr = ctxt.gs_arbytes; hvm_set_segment_register(v, x86_seg_gs, &seg); seg.sel = ctxt.tr_sel; seg.limit = ctxt.tr_limit; seg.base = ctxt.tr_base; seg.attr = ctxt.tr_arbytes; hvm_set_segment_register(v, x86_seg_tr, &seg); seg.sel = ctxt.ldtr_sel; seg.limit = ctxt.ldtr_limit; seg.base = ctxt.ldtr_base; seg.attr = ctxt.ldtr_arbytes; hvm_set_segment_register(v, x86_seg_ldtr, &seg); /* Cover xsave-absent save file restoration on xsave-capable host. */ xsave_area = xsave_enabled(v) ? 
NULL : v->arch.xsave_area; v->fpu_initialised = !!(ctxt.flags & XEN_X86_FPU_INITIALISED); if ( v->fpu_initialised ) { memcpy(v->arch.fpu_ctxt, ctxt.fpu_regs, sizeof(ctxt.fpu_regs)); if ( xsave_area ) xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE; } else if ( xsave_area ) { xsave_area->xsave_hdr.xstate_bv = 0; xsave_area->fpu_sse.mxcsr = MXCSR_DEFAULT; } if ( xsave_area ) xsave_area->xsave_hdr.xcomp_bv = 0; v->arch.user_regs.rax = ctxt.rax; v->arch.user_regs.rbx = ctxt.rbx; v->arch.user_regs.rcx = ctxt.rcx; v->arch.user_regs.rdx = ctxt.rdx; v->arch.user_regs.rbp = ctxt.rbp; v->arch.user_regs.rsi = ctxt.rsi; v->arch.user_regs.rdi = ctxt.rdi; v->arch.user_regs.rsp = ctxt.rsp; v->arch.user_regs.rip = ctxt.rip; v->arch.user_regs.rflags = ctxt.rflags | X86_EFLAGS_MBS; v->arch.user_regs.r8 = ctxt.r8; v->arch.user_regs.r9 = ctxt.r9; v->arch.user_regs.r10 = ctxt.r10; v->arch.user_regs.r11 = ctxt.r11; v->arch.user_regs.r12 = ctxt.r12; v->arch.user_regs.r13 = ctxt.r13; v->arch.user_regs.r14 = ctxt.r14; v->arch.user_regs.r15 = ctxt.r15; v->arch.debugreg[0] = ctxt.dr0; v->arch.debugreg[1] = ctxt.dr1; v->arch.debugreg[2] = ctxt.dr2; v->arch.debugreg[3] = ctxt.dr3; v->arch.debugreg[6] = ctxt.dr6; v->arch.debugreg[7] = ctxt.dr7; v->arch.vgc_flags = VGCF_online; /* Auxiliary processors should be woken immediately. */ v->is_initialised = 1; clear_bit(_VPF_down, &v->pause_flags); vcpu_wake(v); return 0; } HVM_REGISTER_SAVE_RESTORE(CPU, hvm_save_cpu_ctxt, hvm_load_cpu_ctxt, 1, HVMSR_PER_VCPU); #define HVM_CPU_XSAVE_SIZE(xcr0) (offsetof(struct hvm_hw_cpu_xsave, \ save_area) + \ xstate_ctxt_size(xcr0)) static int hvm_save_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; struct hvm_hw_cpu_xsave *ctxt; if ( !cpu_has_xsave ) return 0; /* do nothing */ for_each_vcpu ( d, v ) { unsigned int size = HVM_CPU_XSAVE_SIZE(v->arch.xcr0_accum); if ( !xsave_enabled(v) ) continue; if ( _hvm_init_entry(h, CPU_XSAVE_CODE, v->vcpu_id, size) ) return 1; ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; h->cur += size; ctxt->xfeature_mask = xfeature_mask; ctxt->xcr0 = v->arch.xcr0; ctxt->xcr0_accum = v->arch.xcr0_accum; expand_xsave_states(v, &ctxt->save_area, size - offsetof(typeof(*ctxt), save_area)); } return 0; } /* * Structure layout conformity checks, documenting correctness of the cast in * the invocation of validate_xstate() below. * Leverage CONFIG_COMPAT machinery to perform this. */ #define xen_xsave_hdr xsave_hdr #define compat_xsave_hdr hvm_hw_cpu_xsave_hdr CHECK_FIELD_(struct, xsave_hdr, xstate_bv); CHECK_FIELD_(struct, xsave_hdr, xcomp_bv); CHECK_FIELD_(struct, xsave_hdr, reserved); #undef compat_xsave_hdr #undef xen_xsave_hdr static int hvm_load_cpu_xsave_states(struct domain *d, hvm_domain_context_t *h) { unsigned int vcpuid, size; int err; struct vcpu *v; struct hvm_hw_cpu_xsave *ctxt; const struct hvm_save_descriptor *desc; unsigned int i, desc_start, desc_length; /* Which vcpu is this? */ vcpuid = hvm_load_instance(h); if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } /* Fails since we can't restore an img saved on xsave-capable host. 
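     * (That is: the stream contains an XSAVE record, so the source host had
     * XSAVE, but this host does not, so the state cannot be reinstated.)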
*/ if ( !cpu_has_xsave ) return -EOPNOTSUPP; /* Customized checking for entry since our entry is of variable length */ desc = (struct hvm_save_descriptor *)&h->data[h->cur]; if ( sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING "HVM%d.%d restore: not enough data left to read xsave descriptor\n", d->domain_id, vcpuid); return -ENODATA; } if ( desc->length + sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING "HVM%d.%d restore: not enough data left to read %u xsave bytes\n", d->domain_id, vcpuid, desc->length); return -ENODATA; } if ( desc->length < offsetof(struct hvm_hw_cpu_xsave, save_area) + XSTATE_AREA_MIN_SIZE ) { printk(XENLOG_G_WARNING "HVM%d.%d restore mismatch: xsave length %u < %zu\n", d->domain_id, vcpuid, desc->length, offsetof(struct hvm_hw_cpu_xsave, save_area) + XSTATE_AREA_MIN_SIZE); return -EINVAL; } h->cur += sizeof (*desc); desc_start = h->cur; ctxt = (struct hvm_hw_cpu_xsave *)&h->data[h->cur]; h->cur += desc->length; err = validate_xstate(ctxt->xcr0, ctxt->xcr0_accum, (const void *)&ctxt->save_area.xsave_hdr); if ( err ) { printk(XENLOG_G_WARNING "HVM%d.%d restore: inconsistent xsave state (feat=%#"PRIx64 " accum=%#"PRIx64" xcr0=%#"PRIx64" bv=%#"PRIx64" err=%d)\n", d->domain_id, vcpuid, ctxt->xfeature_mask, ctxt->xcr0_accum, ctxt->xcr0, ctxt->save_area.xsave_hdr.xstate_bv, err); return err; } size = HVM_CPU_XSAVE_SIZE(ctxt->xcr0_accum); desc_length = desc->length; if ( desc_length > size ) { /* * Xen 4.3.0, 4.2.3 and older used to send longer-than-needed * xsave regions. Permit loading the record if the extra data * is all zero. */ for ( i = size; i < desc->length; i++ ) { if ( h->data[desc_start + i] ) { printk(XENLOG_G_WARNING "HVM%d.%u restore mismatch: xsave length %#x > %#x (non-zero data at %#x)\n", d->domain_id, vcpuid, desc->length, size, i); return -EOPNOTSUPP; } } printk(XENLOG_G_WARNING "HVM%d.%u restore mismatch: xsave length %#x > %#x\n", d->domain_id, vcpuid, desc->length, size); /* Rewind desc_length to ignore the extraneous zeros. */ desc_length = size; } if ( xsave_area_compressed((const void *)&ctxt->save_area) ) { printk(XENLOG_G_WARNING "HVM%d.%u restore: compressed xsave state not supported\n", d->domain_id, vcpuid); return -EOPNOTSUPP; } else if ( desc_length != size ) { printk(XENLOG_G_WARNING "HVM%d.%u restore mismatch: xsave length %#x != %#x\n", d->domain_id, vcpuid, desc_length, size); return -EINVAL; } /* Checking finished */ v->arch.xcr0 = ctxt->xcr0; v->arch.xcr0_accum = ctxt->xcr0_accum; if ( ctxt->xcr0_accum & XSTATE_NONLAZY ) v->arch.nonlazy_xstate_used = 1; compress_xsave_states(v, &ctxt->save_area, size - offsetof(struct hvm_hw_cpu_xsave, save_area)); return 0; } #define HVM_CPU_MSR_SIZE(cnt) offsetof(struct hvm_msr, msr[cnt]) static const uint32_t msrs_to_send[] = { MSR_INTEL_MISC_FEATURES_ENABLES, }; static unsigned int __read_mostly msr_count_max = ARRAY_SIZE(msrs_to_send); static int hvm_save_cpu_msrs(struct domain *d, hvm_domain_context_t *h) { struct vcpu *v; for_each_vcpu ( d, v ) { struct hvm_save_descriptor *d = _p(&h->data[h->cur]); struct hvm_msr *ctxt; unsigned int i; if ( _hvm_init_entry(h, CPU_MSR_CODE, v->vcpu_id, HVM_CPU_MSR_SIZE(msr_count_max)) ) return 1; ctxt = (struct hvm_msr *)&h->data[h->cur]; ctxt->count = 0; for ( i = 0; i < ARRAY_SIZE(msrs_to_send); ++i ) { uint64_t val; int rc = guest_rdmsr(v, msrs_to_send[i], &val); /* * It is the programmers responsibility to ensure that * msrs_to_send[] contain generally-read/write MSRs. 
* X86EMUL_EXCEPTION here implies a missing feature, and that the * guest doesn't have access to the MSR. */ if ( rc == X86EMUL_EXCEPTION ) continue; if ( rc != X86EMUL_OKAY ) { ASSERT_UNREACHABLE(); return -ENXIO; } if ( !val ) continue; /* Skip empty MSRs. */ ctxt->msr[ctxt->count].index = msrs_to_send[i]; ctxt->msr[ctxt->count++].val = val; } if ( hvm_funcs.save_msr ) hvm_funcs.save_msr(v, ctxt); ASSERT(ctxt->count <= msr_count_max); for ( i = 0; i < ctxt->count; ++i ) ctxt->msr[i]._rsvd = 0; if ( ctxt->count ) { /* Rewrite length to indicate how much space we actually used. */ d->length = HVM_CPU_MSR_SIZE(ctxt->count); h->cur += HVM_CPU_MSR_SIZE(ctxt->count); } else /* or rewind and remove the descriptor from the stream. */ h->cur -= sizeof(struct hvm_save_descriptor); } return 0; } static int hvm_load_cpu_msrs(struct domain *d, hvm_domain_context_t *h) { unsigned int i, vcpuid = hvm_load_instance(h); struct vcpu *v; const struct hvm_save_descriptor *desc; struct hvm_msr *ctxt; int err = 0; if ( vcpuid >= d->max_vcpus || (v = d->vcpu[vcpuid]) == NULL ) { dprintk(XENLOG_G_ERR, "HVM restore: dom%d has no vcpu%u\n", d->domain_id, vcpuid); return -EINVAL; } /* Customized checking for entry since our entry is of variable length */ desc = (struct hvm_save_descriptor *)&h->data[h->cur]; if ( sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING "HVM%d.%d restore: not enough data left to read MSR descriptor\n", d->domain_id, vcpuid); return -ENODATA; } if ( desc->length + sizeof (*desc) > h->size - h->cur) { printk(XENLOG_G_WARNING "HVM%d.%d restore: not enough data left to read %u MSR bytes\n", d->domain_id, vcpuid, desc->length); return -ENODATA; } if ( desc->length < HVM_CPU_MSR_SIZE(1) ) { printk(XENLOG_G_WARNING "HVM%d.%d restore mismatch: MSR length %u < %zu\n", d->domain_id, vcpuid, desc->length, HVM_CPU_MSR_SIZE(1)); return -EINVAL; } h->cur += sizeof(*desc); ctxt = (struct hvm_msr *)&h->data[h->cur]; h->cur += desc->length; if ( desc->length != HVM_CPU_MSR_SIZE(ctxt->count) ) { printk(XENLOG_G_WARNING "HVM%d.%d restore mismatch: MSR length %u != %zu\n", d->domain_id, vcpuid, desc->length, HVM_CPU_MSR_SIZE(ctxt->count)); return -EOPNOTSUPP; } for ( i = 0; i < ctxt->count; ++i ) if ( ctxt->msr[i]._rsvd ) return -EOPNOTSUPP; /* Checking finished */ if ( hvm_funcs.load_msr ) err = hvm_funcs.load_msr(v, ctxt); for ( i = 0; !err && i < ctxt->count; ++i ) { switch ( ctxt->msr[i].index ) { int rc; case MSR_INTEL_MISC_FEATURES_ENABLES: rc = guest_wrmsr(v, ctxt->msr[i].index, ctxt->msr[i].val); if ( rc != X86EMUL_OKAY ) err = -ENXIO; break; default: if ( !ctxt->msr[i]._rsvd ) err = -ENXIO; break; } } return err; } /* We need variable length data chunks for XSAVE area and MSRs, hence * a custom declaration rather than HVM_REGISTER_SAVE_RESTORE. 
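 *
 * The sizes registered below are per-vcpu upper bounds: the XSAVE record is
 * sized for the host's full xfeature_mask and the MSR record for
 * msr_count_max entries.  The save handlers above emit only what each vcpu
 * actually needs (an MSR record with no populated entries is dropped
 * altogether).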
*/ static int __init hvm_register_CPU_save_and_restore(void) { hvm_register_savevm(CPU_XSAVE_CODE, "CPU_XSAVE", hvm_save_cpu_xsave_states, hvm_load_cpu_xsave_states, HVM_CPU_XSAVE_SIZE(xfeature_mask) + sizeof(struct hvm_save_descriptor), HVMSR_PER_VCPU); if ( hvm_funcs.init_msr ) msr_count_max += hvm_funcs.init_msr(); if ( msr_count_max ) hvm_register_savevm(CPU_MSR_CODE, "CPU_MSR", hvm_save_cpu_msrs, hvm_load_cpu_msrs, HVM_CPU_MSR_SIZE(msr_count_max) + sizeof(struct hvm_save_descriptor), HVMSR_PER_VCPU); return 0; } __initcall(hvm_register_CPU_save_and_restore); int hvm_vcpu_initialise(struct vcpu *v) { int rc; struct domain *d = v->domain; hvm_asid_flush_vcpu(v); spin_lock_init(&v->arch.hvm_vcpu.tm_lock); INIT_LIST_HEAD(&v->arch.hvm_vcpu.tm_list); rc = hvm_vcpu_cacheattr_init(v); /* teardown: vcpu_cacheattr_destroy */ if ( rc != 0 ) goto fail1; /* NB: vlapic_init must be called before hvm_funcs.vcpu_initialise */ rc = vlapic_init(v); if ( rc != 0 ) /* teardown: vlapic_destroy */ goto fail2; if ( (rc = hvm_funcs.vcpu_initialise(v)) != 0 ) /* teardown: hvm_funcs.vcpu_destroy */ goto fail3; softirq_tasklet_init( &v->arch.hvm_vcpu.assert_evtchn_irq_tasklet, (void(*)(unsigned long))hvm_assert_evtchn_irq, (unsigned long)v); v->arch.hvm_vcpu.inject_event.vector = HVM_EVENT_VECTOR_UNSET; rc = setup_compat_arg_xlat(v); /* teardown: free_compat_arg_xlat() */ if ( rc != 0 ) goto fail4; if ( nestedhvm_enabled(d) && (rc = nestedhvm_vcpu_initialise(v)) < 0 ) /* teardown: nestedhvm_vcpu_destroy */ goto fail5; rc = hvm_all_ioreq_servers_add_vcpu(d, v); if ( rc != 0 ) goto fail6; if ( v->vcpu_id == 0 ) { /* NB. All these really belong in hvm_domain_initialise(). */ pmtimer_init(v); hpet_init(d); /* Init guest TSC to start from zero. */ hvm_set_guest_tsc(v, 0); } hvm_update_guest_vendor(v); return 0; fail6: nestedhvm_vcpu_destroy(v); fail5: free_compat_arg_xlat(v); fail4: hvm_funcs.vcpu_destroy(v); fail3: vlapic_destroy(v); fail2: hvm_vcpu_cacheattr_destroy(v); fail1: return rc; } void hvm_vcpu_destroy(struct vcpu *v) { viridian_vcpu_deinit(v); hvm_all_ioreq_servers_remove_vcpu(v->domain, v); if ( hvm_altp2m_supported() ) altp2m_vcpu_destroy(v); nestedhvm_vcpu_destroy(v); free_compat_arg_xlat(v); tasklet_kill(&v->arch.hvm_vcpu.assert_evtchn_irq_tasklet); hvm_funcs.vcpu_destroy(v); vlapic_destroy(v); hvm_vcpu_cacheattr_destroy(v); } void hvm_vcpu_down(struct vcpu *v) { struct domain *d = v->domain; int online_count = 0; /* Doesn't halt us immediately, but we'll never return to guest context. */ set_bit(_VPF_down, &v->pause_flags); vcpu_sleep_nosync(v); /* Any other VCPUs online? ... */ domain_lock(d); for_each_vcpu ( d, v ) if ( !(v->pause_flags & VPF_down) ) online_count++; domain_unlock(d); /* ... Shut down the domain if not. */ if ( online_count == 0 ) { gdprintk(XENLOG_INFO, "All CPUs offline -- powering off.\n"); domain_shutdown(d, SHUTDOWN_poweroff); } } void hvm_hlt(unsigned int eflags) { struct vcpu *curr = current; if ( hvm_event_pending(curr) ) return; /* * If we halt with interrupts disabled, that's a pretty sure sign that we * want to shut down. In a real processor, NMIs are the only way to break * out of this. 
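     * Treat it as the vcpu going offline: hvm_vcpu_down() above marks the
     * vcpu down and, once the last vcpu of the domain goes down, powers the
     * domain off.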
*/ if ( unlikely(!(eflags & X86_EFLAGS_IF)) ) return hvm_vcpu_down(curr); do_sched_op(SCHEDOP_block, guest_handle_from_ptr(NULL, void)); HVMTRACE_1D(HLT, /* pending = */ vcpu_runnable(curr)); } void hvm_triple_fault(void) { struct vcpu *v = current; struct domain *d = v->domain; u8 reason = d->arch.hvm_domain.params[HVM_PARAM_TRIPLE_FAULT_REASON]; gprintk(XENLOG_INFO, "Triple fault - invoking HVM shutdown action %d\n", reason); vcpu_show_execution_state(v); domain_shutdown(d, reason); } void hvm_inject_event(const struct x86_event *event) { struct vcpu *curr = current; const uint8_t vector = event->vector; const bool has_ec = ((event->type == X86_EVENTTYPE_HW_EXCEPTION) && (vector < 32) && ((TRAP_HAVE_EC & (1u << vector)))); ASSERT(vector == event->vector); /* Confirm no truncation. */ if ( has_ec ) ASSERT(event->error_code != X86_EVENT_NO_EC); else ASSERT(event->error_code == X86_EVENT_NO_EC); if ( nestedhvm_enabled(curr->domain) && !nestedhvm_vmswitch_in_progress(curr) && nestedhvm_vcpu_in_guestmode(curr) && nhvm_vmcx_guest_intercepts_event( curr, event->vector, event->error_code) ) { enum nestedhvm_vmexits nsret; nsret = nhvm_vcpu_vmexit_event(curr, event); switch ( nsret ) { case NESTEDHVM_VMEXIT_DONE: case NESTEDHVM_VMEXIT_ERROR: /* L1 guest will crash L2 guest */ return; case NESTEDHVM_VMEXIT_HOST: case NESTEDHVM_VMEXIT_CONTINUE: case NESTEDHVM_VMEXIT_FATALERROR: default: gdprintk(XENLOG_ERR, "unexpected nestedhvm error %i\n", nsret); return; } } hvm_funcs.inject_event(event); } int hvm_hap_nested_page_fault(paddr_t gpa, unsigned long gla, struct npfec npfec) { unsigned long gfn = gpa >> PAGE_SHIFT; p2m_type_t p2mt; p2m_access_t p2ma; mfn_t mfn; struct vcpu *curr = current; struct domain *currd = curr->domain; struct p2m_domain *p2m, *hostp2m; int rc, fall_through = 0, paged = 0; int sharing_enomem = 0; vm_event_request_t *req_ptr = NULL; bool_t ap2m_active, sync = 0; /* On Nested Virtualization, walk the guest page table. * If this succeeds, all is fine. * If this fails, inject a nested page fault into the guest. */ if ( nestedhvm_enabled(currd) && nestedhvm_vcpu_in_guestmode(curr) && nestedhvm_paging_mode_hap(curr) ) { int rv; /* The vcpu is in guest mode and the l1 guest * uses hap. That means 'gpa' is in l2 guest * physical address space. * Fix the nested p2m or inject nested page fault * into l1 guest if not fixable. The algorithm is * the same as for shadow paging. */ rv = nestedhvm_hap_nested_page_fault(curr, &gpa, npfec.read_access, npfec.write_access, npfec.insn_fetch); switch (rv) { case NESTEDHVM_PAGEFAULT_DONE: case NESTEDHVM_PAGEFAULT_RETRY: return 1; case NESTEDHVM_PAGEFAULT_L1_ERROR: /* An error occured while translating gpa from * l2 guest address to l1 guest address. */ return 0; case NESTEDHVM_PAGEFAULT_INJECT: return -1; case NESTEDHVM_PAGEFAULT_MMIO: if ( !handle_mmio() ) hvm_inject_hw_exception(TRAP_gp_fault, 0); return 1; case NESTEDHVM_PAGEFAULT_L0_ERROR: /* gpa is now translated to l1 guest address, update gfn. */ gfn = gpa >> PAGE_SHIFT; break; } } /* * No need to do the P2M lookup for internally handled MMIO, benefiting * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses, * - newer Windows (like Server 2012) for HPET accesses. 
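     * hvm_mmio_internal() only matches ranges Xen emulates itself (e.g. the
     * vLAPIC page), so such an access can go straight to the emulation code
     * without taking a reference on the gfn.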
*/ if ( !nestedhvm_vcpu_in_guestmode(curr) && hvm_mmio_internal(gpa) ) { if ( !handle_mmio_with_translation(gla, gpa >> PAGE_SHIFT, npfec) ) hvm_inject_hw_exception(TRAP_gp_fault, 0); rc = 1; goto out; } ap2m_active = altp2m_active(currd); /* * Take a lock on the host p2m speculatively, to avoid potential * locking order problems later and to handle unshare etc. */ hostp2m = p2m_get_hostp2m(currd); mfn = get_gfn_type_access(hostp2m, gfn, &p2mt, &p2ma, P2M_ALLOC | (npfec.write_access ? P2M_UNSHARE : 0), NULL); if ( ap2m_active ) { if ( p2m_altp2m_lazy_copy(curr, gpa, gla, npfec, &p2m) ) { /* entry was lazily copied from host -- retry */ __put_gfn(hostp2m, gfn); rc = 1; goto out; } mfn = get_gfn_type_access(p2m, gfn, &p2mt, &p2ma, 0, NULL); } else p2m = hostp2m; /* Check access permissions first, then handle faults */ if ( !mfn_eq(mfn, INVALID_MFN) ) { bool_t violation; /* If the access is against the permissions, then send to vm_event */ switch (p2ma) { case p2m_access_n: case p2m_access_n2rwx: default: violation = npfec.read_access || npfec.write_access || npfec.insn_fetch; break; case p2m_access_r: violation = npfec.write_access || npfec.insn_fetch; break; case p2m_access_w: violation = npfec.read_access || npfec.insn_fetch; break; case p2m_access_x: violation = npfec.read_access || npfec.write_access; break; case p2m_access_rx: case p2m_access_rx2rw: violation = npfec.write_access; break; case p2m_access_wx: violation = npfec.read_access; break; case p2m_access_rw: violation = npfec.insn_fetch; break; case p2m_access_rwx: violation = 0; break; } if ( violation ) { /* Should #VE be emulated for this fault? */ if ( p2m_is_altp2m(p2m) && !cpu_has_vmx_virt_exceptions ) { bool_t sve; p2m->get_entry(p2m, _gfn(gfn), &p2mt, &p2ma, 0, NULL, &sve); if ( !sve && altp2m_vcpu_emulate_ve(curr) ) { rc = 1; goto out_put_gfn; } } sync = p2m_mem_access_check(gpa, gla, npfec, &req_ptr); if ( !sync ) fall_through = 1; else { /* Rights not promoted (aka. sync event), work here is done */ rc = 1; goto out_put_gfn; } } } /* * If this GFN is emulated MMIO or marked as read-only, pass the fault * to the mmio handler. */ if ( (p2mt == p2m_mmio_dm) || (npfec.write_access && (p2m_is_discard_write(p2mt) || (p2mt == p2m_ioreq_server))) ) { if ( !handle_mmio_with_translation(gla, gpa >> PAGE_SHIFT, npfec) ) hvm_inject_hw_exception(TRAP_gp_fault, 0); rc = 1; goto out_put_gfn; } /* Check if the page has been paged out */ if ( p2m_is_paged(p2mt) || (p2mt == p2m_ram_paging_out) ) paged = 1; /* Mem sharing: unshare the page and try again */ if ( npfec.write_access && (p2mt == p2m_ram_shared) ) { ASSERT(p2m_is_hostp2m(p2m)); sharing_enomem = (mem_sharing_unshare_page(currd, gfn, 0) < 0); rc = 1; goto out_put_gfn; } /* Spurious fault? PoD and log-dirty also take this path. */ if ( p2m_is_ram(p2mt) ) { rc = 1; /* * Page log dirty is always done with order 0. If this mfn resides in * a large page, we do not change other pages type within that large * page. */ if ( npfec.write_access ) { paging_mark_dirty(currd, mfn); /* * If p2m is really an altp2m, unlock here to avoid lock ordering * violation when the change below is propagated from host p2m. */ if ( ap2m_active ) __put_gfn(p2m, gfn); p2m_change_type_one(currd, gfn, p2m_ram_logdirty, p2m_ram_rw); __put_gfn(ap2m_active ? 
hostp2m : p2m, gfn); goto out; } goto out_put_gfn; } if ( (p2mt == p2m_mmio_direct) && is_hardware_domain(currd) && npfec.write_access && npfec.present && (hvm_emulate_one_mmio(mfn_x(mfn), gla) == X86EMUL_OKAY) ) { rc = 1; goto out_put_gfn; } /* If we fell through, the vcpu will retry now that access restrictions have * been removed. It may fault again if the p2m entry type still requires so. * Otherwise, this is an error condition. */ rc = fall_through; out_put_gfn: __put_gfn(p2m, gfn); if ( ap2m_active ) __put_gfn(hostp2m, gfn); out: /* All of these are delayed until we exit, since we might * sleep on event ring wait queues, and we must not hold * locks in such circumstance */ if ( paged ) p2m_mem_paging_populate(currd, gfn); if ( sharing_enomem ) { int rv; if ( (rv = mem_sharing_notify_enomem(currd, gfn, 1)) < 0 ) { gdprintk(XENLOG_ERR, "Domain %hu attempt to unshare " "gfn %lx, ENOMEM and no helper (rc %d)\n", currd->domain_id, gfn, rv); /* Crash the domain */ rc = 0; } } if ( req_ptr ) { if ( monitor_traps(curr, sync, req_ptr) < 0 ) rc = 0; xfree(req_ptr); } return rc; } int hvm_handle_xsetbv(u32 index, u64 new_bv) { int rc; hvm_monitor_crX(XCR0, new_bv, current->arch.xcr0); rc = handle_xsetbv(index, new_bv); if ( rc ) hvm_inject_hw_exception(TRAP_gp_fault, 0); return rc; } int hvm_set_efer(uint64_t value) { struct vcpu *v = current; const char *errstr; value &= ~EFER_LMA; errstr = hvm_efer_valid(v, value, -1); if ( errstr ) { printk(XENLOG_G_WARNING "%pv: Invalid EFER update: %#"PRIx64" -> %#"PRIx64" - %s\n", v, v->arch.hvm_vcpu.guest_efer, value, errstr); return X86EMUL_EXCEPTION; } if ( ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_LME) && hvm_paging_enabled(v) ) { gdprintk(XENLOG_WARNING, "Trying to change EFER.LME with paging enabled\n"); return X86EMUL_EXCEPTION; } if ( (value & EFER_LME) && !(v->arch.hvm_vcpu.guest_efer & EFER_LME) ) { struct segment_register cs; hvm_get_segment_register(v, x86_seg_cs, &cs); /* * %cs may be loaded with both .D and .L set in legacy mode, and both * are captured in the VMCS/VMCB. * * If a guest does this and then tries to transition into long mode, * the vmentry from setting LME fails due to invalid guest state, * because %cr0.PG is still clear. * * When LME becomes set, clobber %cs.L to keep the guest firmly in * compatibility mode until it reloads %cs itself. */ if ( cs.l ) { cs.l = 0; hvm_set_segment_register(v, x86_seg_cs, &cs); } } if ( nestedhvm_enabled(v->domain) && cpu_has_svm && ((value & EFER_SVME) == 0 ) && ((value ^ v->arch.hvm_vcpu.guest_efer) & EFER_SVME) ) { /* Cleared EFER.SVME: Flush all nestedp2m tables */ p2m_flush_nestedp2m(v->domain); nestedhvm_vcpu_reset(v); } value |= v->arch.hvm_vcpu.guest_efer & EFER_LMA; v->arch.hvm_vcpu.guest_efer = value; hvm_update_guest_efer(v); return X86EMUL_OKAY; } /* Exit UC mode only if all VCPUs agree on MTRR/PAT and are not in no_fill. 
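 *
 * Returns 1 when every other initialised vcpu has left no-fill cache mode
 * and has MTRR/PAT state identical to @v's, i.e. when it is safe for the
 * caller to take the whole domain out of UC mode.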
*/ static bool_t domain_exit_uc_mode(struct vcpu *v) { struct domain *d = v->domain; struct vcpu *vs; for_each_vcpu ( d, vs ) { if ( (vs == v) || !vs->is_initialised ) continue; if ( (vs->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) || mtrr_pat_not_equal(vs, v) ) return 0; } return 1; } static void hvm_set_uc_mode(struct vcpu *v, bool_t is_in_uc_mode) { v->domain->arch.hvm_domain.is_in_uc_mode = is_in_uc_mode; shadow_blow_tables_per_domain(v->domain); } int hvm_mov_to_cr(unsigned int cr, unsigned int gpr) { struct vcpu *curr = current; unsigned long val, *reg; int rc; if ( (reg = decode_register(gpr, guest_cpu_user_regs(), 0)) == NULL ) { gdprintk(XENLOG_ERR, "invalid gpr: %u\n", gpr); goto exit_and_crash; } val = *reg; HVMTRACE_LONG_2D(CR_WRITE, cr, TRC_PAR_LONG(val)); HVM_DBG_LOG(DBG_LEVEL_1, "CR%u, value = %lx", cr, val); switch ( cr ) { case 0: rc = hvm_set_cr0(val, 1); break; case 3: rc = hvm_set_cr3(val, 1); break; case 4: rc = hvm_set_cr4(val, 1); break; case 8: vlapic_set_reg(vcpu_vlapic(curr), APIC_TASKPRI, ((val & 0x0f) << 4)); rc = X86EMUL_OKAY; break; default: gdprintk(XENLOG_ERR, "invalid cr: %d\n", cr); goto exit_and_crash; } if ( rc == X86EMUL_EXCEPTION ) hvm_inject_hw_exception(TRAP_gp_fault, 0); return rc; exit_and_crash: domain_crash(curr->domain); return X86EMUL_UNHANDLEABLE; } int hvm_mov_from_cr(unsigned int cr, unsigned int gpr) { struct vcpu *curr = current; unsigned long val = 0, *reg; if ( (reg = decode_register(gpr, guest_cpu_user_regs(), 0)) == NULL ) { gdprintk(XENLOG_ERR, "invalid gpr: %u\n", gpr); goto exit_and_crash; } switch ( cr ) { case 0: case 2: case 3: case 4: val = curr->arch.hvm_vcpu.guest_cr[cr]; break; case 8: val = (vlapic_get_reg(vcpu_vlapic(curr), APIC_TASKPRI) & 0xf0) >> 4; break; default: gdprintk(XENLOG_ERR, "invalid cr: %u\n", cr); goto exit_and_crash; } *reg = val; HVMTRACE_LONG_2D(CR_READ, cr, TRC_PAR_LONG(val)); HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR%u, value = %lx", cr, val); return X86EMUL_OKAY; exit_and_crash: domain_crash(curr->domain); return X86EMUL_UNHANDLEABLE; } void hvm_shadow_handle_cd(struct vcpu *v, unsigned long value) { if ( value & X86_CR0_CD ) { /* Entering no fill cache mode. */ spin_lock(&v->domain->arch.hvm_domain.uc_lock); v->arch.hvm_vcpu.cache_mode = NO_FILL_CACHE_MODE; if ( !v->domain->arch.hvm_domain.is_in_uc_mode ) { domain_pause_nosync(v->domain); /* Flush physical caches. */ flush_all(FLUSH_CACHE); hvm_set_uc_mode(v, 1); domain_unpause(v->domain); } spin_unlock(&v->domain->arch.hvm_domain.uc_lock); } else if ( !(value & X86_CR0_CD) && (v->arch.hvm_vcpu.cache_mode == NO_FILL_CACHE_MODE) ) { /* Exit from no fill cache mode. */ spin_lock(&v->domain->arch.hvm_domain.uc_lock); v->arch.hvm_vcpu.cache_mode = NORMAL_CACHE_MODE; if ( domain_exit_uc_mode(v) ) hvm_set_uc_mode(v, 0); spin_unlock(&v->domain->arch.hvm_domain.uc_lock); } } static void hvm_update_cr(struct vcpu *v, unsigned int cr, unsigned long value) { v->arch.hvm_vcpu.guest_cr[cr] = value; nestedhvm_set_cr(v, cr, value); hvm_update_guest_cr(v, cr); } int hvm_set_cr0(unsigned long value, bool_t may_defer) { struct vcpu *v = current; struct domain *d = v->domain; unsigned long gfn, old_value = v->arch.hvm_vcpu.guest_cr[0]; struct page_info *page; HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR0 value = %lx", value); if ( (u32)value != value ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to set upper 32 bits in CR0: %lx", value); return X86EMUL_EXCEPTION; } value &= ~HVM_CR0_GUEST_RESERVED_BITS; /* ET is reserved and should be always be 1. 
*/ value |= X86_CR0_ET; if ( !nestedhvm_vmswitch_in_progress(v) && (value & (X86_CR0_PE | X86_CR0_PG)) == X86_CR0_PG ) return X86EMUL_EXCEPTION; if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & monitor_ctrlreg_bitmask(VM_EVENT_X86_CR0)) ) { ASSERT(v->arch.vm_event); if ( hvm_monitor_crX(CR0, value, old_value) ) { /* The actual write will occur in hvm_do_resume(), if permitted. */ v->arch.vm_event->write_data.do_write.cr0 = 1; v->arch.vm_event->write_data.cr0 = value; return X86EMUL_OKAY; } } if ( (value & X86_CR0_PG) && !(old_value & X86_CR0_PG) ) { if ( v->arch.hvm_vcpu.guest_efer & EFER_LME ) { if ( !(v->arch.hvm_vcpu.guest_cr[4] & X86_CR4_PAE) && !nestedhvm_vmswitch_in_progress(v) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Enable paging before PAE enable"); return X86EMUL_EXCEPTION; } HVM_DBG_LOG(DBG_LEVEL_1, "Enabling long mode"); v->arch.hvm_vcpu.guest_efer |= EFER_LMA; hvm_update_guest_efer(v); } if ( !paging_mode_hap(d) ) { /* The guest CR3 must be pointing to the guest physical. */ gfn = v->arch.hvm_vcpu.guest_cr[3]>>PAGE_SHIFT; page = get_page_from_gfn(d, gfn, NULL, P2M_ALLOC); if ( !page ) { gdprintk(XENLOG_ERR, "Invalid CR3 value = %lx\n", v->arch.hvm_vcpu.guest_cr[3]); domain_crash(d); return X86EMUL_UNHANDLEABLE; } /* Now arch.guest_table points to machine physical. */ v->arch.guest_table = pagetable_from_page(page); HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx, mfn = %lx", v->arch.hvm_vcpu.guest_cr[3], page_to_mfn(page)); } } else if ( !(value & X86_CR0_PG) && (old_value & X86_CR0_PG) ) { if ( hvm_pcid_enabled(v) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to clear CR0.PG " "while CR4.PCIDE=1"); return X86EMUL_EXCEPTION; } /* When CR0.PG is cleared, LMA is cleared immediately. */ if ( hvm_long_mode_active(v) ) { v->arch.hvm_vcpu.guest_efer &= ~EFER_LMA; hvm_update_guest_efer(v); } if ( !paging_mode_hap(d) ) { put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_null(); } } if ( ((value ^ old_value) & X86_CR0_CD) && iommu_enabled && hvm_funcs.handle_cd && (!rangeset_is_empty(d->iomem_caps) || !rangeset_is_empty(d->arch.ioport_caps) || has_arch_pdevs(d)) ) hvm_funcs.handle_cd(v, value); hvm_update_cr(v, 0, value); if ( (value ^ old_value) & X86_CR0_PG ) { if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) ) paging_update_nestedmode(v); else paging_update_paging_modes(v); } return X86EMUL_OKAY; } int hvm_set_cr3(unsigned long value, bool_t may_defer) { struct vcpu *v = current; struct page_info *page; unsigned long old = v->arch.hvm_vcpu.guest_cr[3]; if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & monitor_ctrlreg_bitmask(VM_EVENT_X86_CR3)) ) { ASSERT(v->arch.vm_event); if ( hvm_monitor_crX(CR3, value, old) ) { /* The actual write will occur in hvm_do_resume(), if permitted. */ v->arch.vm_event->write_data.do_write.cr3 = 1; v->arch.vm_event->write_data.cr3 = value; return X86EMUL_OKAY; } } if ( hvm_paging_enabled(v) && !paging_mode_hap(v->domain) && (value != v->arch.hvm_vcpu.guest_cr[3]) ) { /* Shadow-mode CR3 change. Check PDBR and update refcounts. 
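         * A reference is taken on the new top-level page via
         * get_page_from_gfn() before the reference on the previous
         * v->arch.guest_table page is dropped; an unmappable PDBR crashes
         * the domain instead.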
*/ HVM_DBG_LOG(DBG_LEVEL_VMMU, "CR3 value = %lx", value); page = get_page_from_gfn(v->domain, value >> PAGE_SHIFT, NULL, P2M_ALLOC); if ( !page ) goto bad_cr3; put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_from_page(page); HVM_DBG_LOG(DBG_LEVEL_VMMU, "Update CR3 value = %lx", value); } v->arch.hvm_vcpu.guest_cr[3] = value; paging_update_cr3(v); return X86EMUL_OKAY; bad_cr3: gdprintk(XENLOG_ERR, "Invalid CR3\n"); domain_crash(v->domain); return X86EMUL_UNHANDLEABLE; } int hvm_set_cr4(unsigned long value, bool_t may_defer) { struct vcpu *v = current; unsigned long old_cr; if ( value & ~hvm_cr4_guest_valid_bits(v, 0) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to set reserved bit in CR4: %lx", value); return X86EMUL_EXCEPTION; } if ( !(value & X86_CR4_PAE) ) { if ( hvm_long_mode_active(v) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest cleared CR4.PAE while " "EFER.LMA is set"); return X86EMUL_EXCEPTION; } } old_cr = v->arch.hvm_vcpu.guest_cr[4]; if ( (value & X86_CR4_PCIDE) && !(old_cr & X86_CR4_PCIDE) && (!hvm_long_mode_active(v) || (v->arch.hvm_vcpu.guest_cr[3] & 0xfff)) ) { HVM_DBG_LOG(DBG_LEVEL_1, "Guest attempts to change CR4.PCIDE from " "0 to 1 while either EFER.LMA=0 or CR3[11:0]!=000H"); return X86EMUL_EXCEPTION; } if ( may_defer && unlikely(v->domain->arch.monitor.write_ctrlreg_enabled & monitor_ctrlreg_bitmask(VM_EVENT_X86_CR4)) ) { ASSERT(v->arch.vm_event); if ( hvm_monitor_crX(CR4, value, old_cr) ) { /* The actual write will occur in hvm_do_resume(), if permitted. */ v->arch.vm_event->write_data.do_write.cr4 = 1; v->arch.vm_event->write_data.cr4 = value; return X86EMUL_OKAY; } } hvm_update_cr(v, 4, value); /* * Modifying CR4.{PSE,PAE,PGE,SMEP}, or clearing CR4.PCIDE * invalidate all TLB entries. */ if ( ((old_cr ^ value) & (X86_CR4_PSE | X86_CR4_PGE | X86_CR4_PAE | X86_CR4_SMEP)) || (!(value & X86_CR4_PCIDE) && (old_cr & X86_CR4_PCIDE)) ) { if ( !nestedhvm_vmswitch_in_progress(v) && nestedhvm_vcpu_in_guestmode(v) ) paging_update_nestedmode(v); else paging_update_paging_modes(v); } /* * {RD,WR}PKRU are not gated on XCR0.PKRU and hence an oddly behaving * guest may enable the feature in CR4 without enabling it in XCR0. We * need to context switch / migrate PKRU nevertheless. */ if ( (value & X86_CR4_PKE) && !(v->arch.xcr0_accum & XSTATE_PKRU) ) { int rc = handle_xsetbv(XCR_XFEATURE_ENABLED_MASK, get_xcr0() | XSTATE_PKRU); if ( rc ) { HVM_DBG_LOG(DBG_LEVEL_1, "Failed to force XCR0.PKRU: %d", rc); return X86EMUL_EXCEPTION; } if ( handle_xsetbv(XCR_XFEATURE_ENABLED_MASK, get_xcr0() & ~XSTATE_PKRU) ) /* nothing, best effort only */; } return X86EMUL_OKAY; } bool_t hvm_virtual_to_linear_addr( enum x86_segment seg, const struct segment_register *reg, unsigned long offset, unsigned int bytes, enum hvm_access_type access_type, const struct segment_register *active_cs, unsigned long *linear_addr) { const struct vcpu *curr = current; unsigned long addr = offset, last_byte; bool_t okay = 0; /* * These checks are for a memory access through an active segment. * * It is expected that the access rights of reg are suitable for seg (and * that this is enforced at the point that seg is loaded). */ ASSERT(seg < x86_seg_none); if ( !(curr->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PE) || (guest_cpu_user_regs()->eflags & X86_EFLAGS_VM) ) { /* * REAL/VM86 MODE: Don't bother with segment access checks. * Certain of them are not done in native real mode anyway. 
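         *
         * Worked example: with a real-mode segment base of 0x12340
         * (selector 0x1234 << 4) and offset 0x10, the linear address is
         * simply 0x12340 + 0x10 = 0x12350, truncated to 32 bits; only a
         * wrap of the last byte is checked below, with no limit or
         * access-rights checks.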
*/ addr = (uint32_t)(addr + reg->base); last_byte = (uint32_t)addr + bytes - !!bytes; if ( last_byte < addr ) goto out; } else if ( hvm_long_mode_active(curr) && (is_x86_system_segment(seg) || active_cs->l) ) { /* * User segments are always treated as present. System segment may * not be, and also incur limit checks. */ if ( is_x86_system_segment(seg) && (!reg->p || (offset + bytes - !!bytes) > reg->limit) ) goto out; /* * LONG MODE: FS, GS and system segments: add segment base. All * addresses must be canonical. */ if ( seg >= x86_seg_fs ) addr += reg->base; last_byte = addr + bytes - !!bytes; if ( !is_canonical_address(addr) || last_byte < addr || !is_canonical_address(last_byte) ) goto out; } else { /* * PROTECTED/COMPATIBILITY MODE: Apply segment checks and add base. */ /* * Hardware truncates to 32 bits in compatibility mode. * It does not truncate to 16 bits in 16-bit address-size mode. */ addr = (uint32_t)(addr + reg->base); /* Segment not valid for use (cooked meaning of .p)? */ if ( !reg->p ) goto out; /* Read/write restrictions only exist for user segments. */ if ( reg->s ) { switch ( access_type ) { case hvm_access_read: if ( (reg->type & 0xa) == 0x8 ) goto out; /* execute-only code segment */ break; case hvm_access_write: if ( (reg->type & 0xa) != 0x2 ) goto out; /* not a writable data segment */ break; default: break; } } last_byte = (uint32_t)offset + bytes - !!bytes; /* Is this a grows-down data segment? Special limit check if so. */ if ( reg->s && (reg->type & 0xc) == 0x4 ) { /* Is upper limit 0xFFFF or 0xFFFFFFFF? */ if ( !reg->db ) last_byte = (uint16_t)last_byte; /* Check first byte and last byte against respective bounds. */ if ( (offset <= reg->limit) || (last_byte < offset) ) goto out; } else if ( (last_byte > reg->limit) || (last_byte < offset) ) goto out; /* last byte is beyond limit or wraps 0xFFFFFFFF */ } /* All checks ok. */ okay = 1; out: /* * Always return the correct linear address, even if a permission check * failed. The permissions failure is not relevant to some callers. */ *linear_addr = addr; return okay; } struct hvm_write_map { struct list_head list; struct page_info *page; }; /* On non-NULL return, we leave this function holding an additional * ref on the underlying mfn, if any */ static void *_hvm_map_guest_frame(unsigned long gfn, bool_t permanent, bool_t *writable) { void *map; p2m_type_t p2mt; struct page_info *page; struct domain *d = current->domain; page = get_page_from_gfn(d, gfn, &p2mt, writable ? 
P2M_UNSHARE : P2M_ALLOC); if ( (p2m_is_shared(p2mt) && writable) || !page ) { if ( page ) put_page(page); return NULL; } if ( p2m_is_paging(p2mt) ) { put_page(page); p2m_mem_paging_populate(d, gfn); return NULL; } if ( writable ) { if ( unlikely(p2m_is_discard_write(p2mt)) ) *writable = 0; else if ( !permanent ) paging_mark_dirty(d, _mfn(page_to_mfn(page))); } if ( !permanent ) return __map_domain_page(page); if ( writable && *writable ) { struct hvm_write_map *track = xmalloc(struct hvm_write_map); if ( !track ) { put_page(page); return NULL; } track->page = page; spin_lock(&d->arch.hvm_domain.write_map.lock); list_add_tail(&track->list, &d->arch.hvm_domain.write_map.list); spin_unlock(&d->arch.hvm_domain.write_map.lock); } map = __map_domain_page_global(page); if ( !map ) put_page(page); return map; } void *hvm_map_guest_frame_rw(unsigned long gfn, bool_t permanent, bool_t *writable) { *writable = 1; return _hvm_map_guest_frame(gfn, permanent, writable); } void *hvm_map_guest_frame_ro(unsigned long gfn, bool_t permanent) { return _hvm_map_guest_frame(gfn, permanent, NULL); } void hvm_unmap_guest_frame(void *p, bool_t permanent) { unsigned long mfn; struct page_info *page; if ( !p ) return; mfn = domain_page_map_to_mfn(p); page = mfn_to_page(mfn); if ( !permanent ) unmap_domain_page(p); else { struct domain *d = page_get_owner(page); struct hvm_write_map *track; unmap_domain_page_global(p); spin_lock(&d->arch.hvm_domain.write_map.lock); list_for_each_entry(track, &d->arch.hvm_domain.write_map.list, list) if ( track->page == page ) { paging_mark_dirty(d, _mfn(mfn)); list_del(&track->list); xfree(track); break; } spin_unlock(&d->arch.hvm_domain.write_map.lock); } put_page(page); } void hvm_mapped_guest_frames_mark_dirty(struct domain *d) { struct hvm_write_map *track; spin_lock(&d->arch.hvm_domain.write_map.lock); list_for_each_entry(track, &d->arch.hvm_domain.write_map.list, list) paging_mark_dirty(d, _mfn(page_to_mfn(track->page))); spin_unlock(&d->arch.hvm_domain.write_map.lock); } static void *hvm_map_entry(unsigned long va, bool_t *writable) { unsigned long gfn; uint32_t pfec; char *v; if ( ((va & ~PAGE_MASK) + 8) > PAGE_SIZE ) { gdprintk(XENLOG_ERR, "Descriptor table entry " "straddles page boundary\n"); goto fail; } /* * We're mapping on behalf of the segment-load logic, which might write * the accessed flags in the descriptors (in 32-bit mode), but we still * treat it as a kernel-mode read (i.e. no access checks). */ pfec = PFEC_page_present; gfn = paging_gva_to_gfn(current, va, &pfec); if ( pfec & (PFEC_page_paged | PFEC_page_shared) ) goto fail; v = hvm_map_guest_frame_rw(gfn, 0, writable); if ( v == NULL ) goto fail; return v + (va & ~PAGE_MASK); fail: domain_crash(current->domain); return NULL; } static void hvm_unmap_entry(void *p) { hvm_unmap_guest_frame(p, 0); } static int hvm_load_segment_selector( enum x86_segment seg, uint16_t sel, unsigned int cpl, unsigned int eflags) { struct segment_register desctab, segr; struct desc_struct *pdesc, desc; u8 dpl, rpl; bool_t writable; int fault_type = TRAP_invalid_tss; struct vcpu *v = current; if ( eflags & X86_EFLAGS_VM ) { segr.sel = sel; segr.base = (uint32_t)sel << 4; segr.limit = 0xffffu; segr.attr = 0xf3; hvm_set_segment_register(v, seg, &segr); return 0; } /* NULL selector? */ if ( (sel & 0xfffc) == 0 ) { if ( (seg == x86_seg_cs) || (seg == x86_seg_ss) ) goto fail; memset(&segr, 0, sizeof(segr)); segr.sel = sel; hvm_set_segment_register(v, seg, &segr); return 0; } /* LDT descriptor must be in the GDT. 
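A set TI bit (bit 2 of the selector) would mean looking the descriptor up in the LDT itself, which the architecture does not allow for LDT descriptors.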
*/ if ( (seg == x86_seg_ldtr) && (sel & 4) ) goto fail; hvm_get_segment_register( v, (sel & 4) ? x86_seg_ldtr : x86_seg_gdtr, &desctab); /* Segment not valid for use (cooked meaning of .p)? */ if ( !desctab.p ) goto fail; /* Check against descriptor table limit. */ if ( ((sel & 0xfff8) + 7) > desctab.limit ) goto fail; pdesc = hvm_map_entry(desctab.base + (sel & 0xfff8), &writable); if ( pdesc == NULL ) goto hvm_map_fail; do { desc = *pdesc; /* LDT descriptor is a system segment. All others are code/data. */ if ( (desc.b & (1u<<12)) == ((seg == x86_seg_ldtr) << 12) ) goto unmap_and_fail; dpl = (desc.b >> 13) & 3; rpl = sel & 3; switch ( seg ) { case x86_seg_cs: /* Code segment? */ if ( !(desc.b & _SEGMENT_CODE) ) goto unmap_and_fail; /* Non-conforming segment: check DPL against RPL. */ if ( !(desc.b & _SEGMENT_EC) && (dpl != rpl) ) goto unmap_and_fail; break; case x86_seg_ss: /* Writable data segment? */ if ( (desc.b & (_SEGMENT_CODE|_SEGMENT_WR)) != _SEGMENT_WR ) goto unmap_and_fail; if ( (dpl != cpl) || (dpl != rpl) ) goto unmap_and_fail; break; case x86_seg_ldtr: /* LDT system segment? */ if ( (desc.b & _SEGMENT_TYPE) != (2u<<8) ) goto unmap_and_fail; goto skip_accessed_flag; default: /* Readable code or data segment? */ if ( (desc.b & (_SEGMENT_CODE|_SEGMENT_WR)) == _SEGMENT_CODE ) goto unmap_and_fail; /* * Data or non-conforming code segment: * check DPL against RPL and CPL. */ if ( ((desc.b & (_SEGMENT_EC|_SEGMENT_CODE)) != (_SEGMENT_EC|_SEGMENT_CODE)) && ((dpl < cpl) || (dpl < rpl)) ) goto unmap_and_fail; break; } /* Segment present in memory? */ if ( !(desc.b & _SEGMENT_P) ) { fault_type = (seg != x86_seg_ss) ? TRAP_no_segment : TRAP_stack_error; goto unmap_and_fail; } } while ( !(desc.b & 0x100) && /* Ensure Accessed flag is set */ writable && /* except if we are to discard writes */ (cmpxchg(&pdesc->b, desc.b, desc.b | 0x100) != desc.b) ); /* Force the Accessed flag in our local copy. */ desc.b |= 0x100; skip_accessed_flag: hvm_unmap_entry(pdesc); segr.base = (((desc.b << 0) & 0xff000000u) | ((desc.b << 16) & 0x00ff0000u) | ((desc.a >> 16) & 0x0000ffffu)); segr.attr = (((desc.b >> 8) & 0x00ffu) | ((desc.b >> 12) & 0x0f00u)); segr.limit = (desc.b & 0x000f0000u) | (desc.a & 0x0000ffffu); if ( segr.g ) segr.limit = (segr.limit << 12) | 0xfffu; segr.sel = sel; hvm_set_segment_register(v, seg, &segr); return 0; unmap_and_fail: hvm_unmap_entry(pdesc); fail: hvm_inject_hw_exception(fault_type, sel & 0xfffc); hvm_map_fail: return 1; } struct tss32 { uint16_t back_link, :16; uint32_t esp0; uint16_t ss0, :16; uint32_t esp1; uint16_t ss1, :16; uint32_t esp2; uint16_t ss2, :16; uint32_t cr3, eip, eflags, eax, ecx, edx, ebx, esp, ebp, esi, edi; uint16_t es, :16, cs, :16, ss, :16, ds, :16, fs, :16, gs, :16, ldt, :16; uint16_t trace /* :1 */, iomap; }; void hvm_prepare_vm86_tss(struct vcpu *v, uint32_t base, uint32_t limit) { /* * If the provided area is large enough to cover at least the ISA port * range, keep the bitmaps outside the base structure. For rather small * areas (namely relevant for guests having been migrated from older * Xen versions), maximize interrupt vector and port coverage by pointing * the I/O bitmap at 0x20 (which puts the interrupt redirection bitmap * right at zero), accepting accesses to port 0x235 (represented by bit 5 * of byte 0x46) to trigger #GP (which will simply result in the access * being handled by the emulator via a slightly different path than it * would be anyway). 
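* In the small layout the 16-bit iomap value 0x20, written at structure
* offset 0x66, doubles as I/O bitmap byte 0x46, which is where that set
* bit comes from.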
Be sure to include one extra byte at the end of the * I/O bitmap (hence the missing "- 1" in the comparison is not an * off-by-one mistake), which we deliberately don't fill with all ones. */ uint16_t iomap = (limit >= sizeof(struct tss32) + (0x100 / 8) + (0x400 / 8) ? sizeof(struct tss32) : 0) + (0x100 / 8); ASSERT(limit >= sizeof(struct tss32) - 1); /* * Strictly speaking we'd have to use hvm_copy_to_guest_linear() below, * but since the guest is (supposed to be, unless it corrupts that setup * itself, which would harm only itself) running on an identmap, we can * use the less overhead variant below, which also allows passing a vCPU * argument. */ hvm_copy_to_guest_phys(base, NULL, limit + 1, v); hvm_copy_to_guest_phys(base + offsetof(struct tss32, iomap), &iomap, sizeof(iomap), v); } void hvm_task_switch( uint16_t tss_sel, enum hvm_task_switch_reason taskswitch_reason, int32_t errcode) { struct vcpu *v = current; struct cpu_user_regs *regs = guest_cpu_user_regs(); struct segment_register gdt, tr, prev_tr, segr; struct desc_struct *optss_desc = NULL, *nptss_desc = NULL, tss_desc; bool_t otd_writable, ntd_writable; unsigned int eflags, new_cpl; pagefault_info_t pfinfo; int exn_raised, rc; struct tss32 tss; hvm_get_segment_register(v, x86_seg_gdtr, &gdt); hvm_get_segment_register(v, x86_seg_tr, &prev_tr); if ( ((tss_sel & 0xfff8) + 7) > gdt.limit ) { hvm_inject_hw_exception((taskswitch_reason == TSW_iret) ? TRAP_invalid_tss : TRAP_gp_fault, tss_sel & 0xfff8); goto out; } optss_desc = hvm_map_entry(gdt.base + (prev_tr.sel & 0xfff8), &otd_writable); if ( optss_desc == NULL ) goto out; nptss_desc = hvm_map_entry(gdt.base + (tss_sel & 0xfff8), &ntd_writable); if ( nptss_desc == NULL ) goto out; tss_desc = *nptss_desc; tr.sel = tss_sel; tr.base = (((tss_desc.b << 0) & 0xff000000u) | ((tss_desc.b << 16) & 0x00ff0000u) | ((tss_desc.a >> 16) & 0x0000ffffu)); tr.attr = (((tss_desc.b >> 8) & 0x00ffu) | ((tss_desc.b >> 12) & 0x0f00u)); tr.limit = (tss_desc.b & 0x000f0000u) | (tss_desc.a & 0x0000ffffu); if ( tr.g ) tr.limit = (tr.limit << 12) | 0xfffu; if ( tr.type != ((taskswitch_reason == TSW_iret) ? 0xb : 0x9) ) { hvm_inject_hw_exception( (taskswitch_reason == TSW_iret) ? 
TRAP_invalid_tss : TRAP_gp_fault, tss_sel & 0xfff8); goto out; } if ( !tr.p ) { hvm_inject_hw_exception(TRAP_no_segment, tss_sel & 0xfff8); goto out; } if ( tr.limit < (sizeof(tss)-1) ) { hvm_inject_hw_exception(TRAP_invalid_tss, tss_sel & 0xfff8); goto out; } rc = hvm_copy_from_guest_linear( &tss, prev_tr.base, sizeof(tss), PFEC_page_present, &pfinfo); if ( rc == HVMTRANS_bad_linear_to_gfn ) hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); if ( rc != HVMTRANS_okay ) goto out; eflags = regs->eflags; if ( taskswitch_reason == TSW_iret ) eflags &= ~X86_EFLAGS_NT; tss.eip = regs->eip; tss.eflags = eflags; tss.eax = regs->eax; tss.ecx = regs->ecx; tss.edx = regs->edx; tss.ebx = regs->ebx; tss.esp = regs->esp; tss.ebp = regs->ebp; tss.esi = regs->esi; tss.edi = regs->edi; hvm_get_segment_register(v, x86_seg_es, &segr); tss.es = segr.sel; hvm_get_segment_register(v, x86_seg_cs, &segr); tss.cs = segr.sel; hvm_get_segment_register(v, x86_seg_ss, &segr); tss.ss = segr.sel; hvm_get_segment_register(v, x86_seg_ds, &segr); tss.ds = segr.sel; hvm_get_segment_register(v, x86_seg_fs, &segr); tss.fs = segr.sel; hvm_get_segment_register(v, x86_seg_gs, &segr); tss.gs = segr.sel; hvm_get_segment_register(v, x86_seg_ldtr, &segr); tss.ldt = segr.sel; rc = hvm_copy_to_guest_linear(prev_tr.base + offsetof(typeof(tss), eip), &tss.eip, offsetof(typeof(tss), trace) - offsetof(typeof(tss), eip), PFEC_page_present, &pfinfo); if ( rc == HVMTRANS_bad_linear_to_gfn ) hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); if ( rc != HVMTRANS_okay ) goto out; rc = hvm_copy_from_guest_linear( &tss, tr.base, sizeof(tss), PFEC_page_present, &pfinfo); if ( rc == HVMTRANS_bad_linear_to_gfn ) hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); /* * Note: The HVMTRANS_gfn_shared case could be optimised, if the callee * functions knew we want RO access. */ if ( rc != HVMTRANS_okay ) goto out; new_cpl = tss.eflags & X86_EFLAGS_VM ? 
3 : tss.cs & 3; if ( hvm_load_segment_selector(x86_seg_ldtr, tss.ldt, new_cpl, 0) ) goto out; rc = hvm_set_cr3(tss.cr3, 1); if ( rc == X86EMUL_EXCEPTION ) hvm_inject_hw_exception(TRAP_gp_fault, 0); if ( rc != X86EMUL_OKAY ) goto out; regs->rip = tss.eip; regs->rflags = tss.eflags | X86_EFLAGS_MBS; regs->rax = tss.eax; regs->rcx = tss.ecx; regs->rdx = tss.edx; regs->rbx = tss.ebx; regs->rsp = tss.esp; regs->rbp = tss.ebp; regs->rsi = tss.esi; regs->rdi = tss.edi; exn_raised = 0; if ( hvm_load_segment_selector(x86_seg_es, tss.es, new_cpl, tss.eflags) || hvm_load_segment_selector(x86_seg_cs, tss.cs, new_cpl, tss.eflags) || hvm_load_segment_selector(x86_seg_ss, tss.ss, new_cpl, tss.eflags) || hvm_load_segment_selector(x86_seg_ds, tss.ds, new_cpl, tss.eflags) || hvm_load_segment_selector(x86_seg_fs, tss.fs, new_cpl, tss.eflags) || hvm_load_segment_selector(x86_seg_gs, tss.gs, new_cpl, tss.eflags) ) exn_raised = 1; if ( taskswitch_reason == TSW_call_or_int ) { regs->eflags |= X86_EFLAGS_NT; tss.back_link = prev_tr.sel; rc = hvm_copy_to_guest_linear(tr.base + offsetof(typeof(tss), back_link), &tss.back_link, sizeof(tss.back_link), 0, &pfinfo); if ( rc == HVMTRANS_bad_linear_to_gfn ) { hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); exn_raised = 1; } else if ( rc != HVMTRANS_okay ) goto out; } tr.type = 0xb; /* busy 32-bit tss */ hvm_set_segment_register(v, x86_seg_tr, &tr); v->arch.hvm_vcpu.guest_cr[0] |= X86_CR0_TS; hvm_update_guest_cr(v, 0); if ( (taskswitch_reason == TSW_iret || taskswitch_reason == TSW_jmp) && otd_writable ) clear_bit(41, optss_desc); /* clear B flag of old task */ if ( taskswitch_reason != TSW_iret && ntd_writable ) set_bit(41, nptss_desc); /* set B flag of new task */ if ( errcode >= 0 ) { struct segment_register cs; unsigned long linear_addr; unsigned int opsz, sp; hvm_get_segment_register(v, x86_seg_cs, &cs); opsz = cs.db ? 4 : 2; hvm_get_segment_register(v, x86_seg_ss, &segr); if ( segr.db ) sp = regs->esp -= opsz; else sp = regs->sp -= opsz; if ( hvm_virtual_to_linear_addr(x86_seg_ss, &segr, sp, opsz, hvm_access_write, &cs, &linear_addr) ) { rc = hvm_copy_to_guest_linear(linear_addr, &errcode, opsz, 0, &pfinfo); if ( rc == HVMTRANS_bad_linear_to_gfn ) { hvm_inject_page_fault(pfinfo.ec, pfinfo.linear); exn_raised = 1; } else if ( rc != HVMTRANS_okay ) goto out; } } if ( (tss.trace & 1) && !exn_raised ) hvm_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); out: hvm_unmap_entry(optss_desc); hvm_unmap_entry(nptss_desc); } enum hvm_translation_result hvm_translate_get_page( struct vcpu *v, unsigned long addr, bool linear, uint32_t pfec, pagefault_info_t *pfinfo, struct page_info **page_p, gfn_t *gfn_p, p2m_type_t *p2mt_p) { struct page_info *page; p2m_type_t p2mt; gfn_t gfn; if ( linear ) { gfn = _gfn(paging_gva_to_gfn(v, addr, &pfec)); if ( gfn_eq(gfn, INVALID_GFN) ) { if ( pfec & PFEC_page_paged ) return HVMTRANS_gfn_paged_out; if ( pfec & PFEC_page_shared ) return HVMTRANS_gfn_shared; if ( pfinfo ) { pfinfo->linear = addr; pfinfo->ec = pfec & ~PFEC_implicit; } return HVMTRANS_bad_linear_to_gfn; } } else { gfn = gaddr_to_gfn(addr); ASSERT(!pfinfo); } /* * No need to do the P2M lookup for internally handled MMIO, benefiting * - 32-bit WinXP (& older Windows) on AMD CPUs for LAPIC accesses, * - newer Windows (like Server 2012) for HPET accesses. 
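* Such ranges are emulated entirely inside Xen, so the result of the
* translation would not be consumed anyway.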
*/ if ( v == current && !nestedhvm_vcpu_in_guestmode(v) && hvm_mmio_internal(gfn_to_gaddr(gfn)) ) return HVMTRANS_bad_gfn_to_mfn; page = get_page_from_gfn(v->domain, gfn_x(gfn), &p2mt, P2M_UNSHARE); if ( !page ) return HVMTRANS_bad_gfn_to_mfn; if ( p2m_is_paging(p2mt) ) { put_page(page); p2m_mem_paging_populate(v->domain, gfn_x(gfn)); return HVMTRANS_gfn_paged_out; } if ( p2m_is_shared(p2mt) ) { put_page(page); return HVMTRANS_gfn_shared; } if ( p2m_is_grant(p2mt) ) { put_page(page); return HVMTRANS_unhandleable; } *page_p = page; if ( gfn_p ) *gfn_p = gfn; if ( p2mt_p ) *p2mt_p = p2mt; return HVMTRANS_okay; } #define HVMCOPY_from_guest (0u<<0) #define HVMCOPY_to_guest (1u<<0) #define HVMCOPY_phys (0u<<2) #define HVMCOPY_linear (1u<<2) static enum hvm_translation_result __hvm_copy( void *buf, paddr_t addr, int size, struct vcpu *v, unsigned int flags, uint32_t pfec, pagefault_info_t *pfinfo) { gfn_t gfn; struct page_info *page; p2m_type_t p2mt; char *p; int count, todo = size; ASSERT(is_hvm_vcpu(v)); /* * XXX Disable for 4.1.0: PV-on-HVM drivers will do grant-table ops * such as query_size. Grant-table code currently does copy_to/from_guest * accesses under the big per-domain lock, which this test would disallow. * The test is not needed until we implement sleeping-on-waitqueue when * we access a paged-out frame, and that's post 4.1.0 now. */ #if 0 /* * If the required guest memory is paged out, this function may sleep. * Hence we bail immediately if called from atomic context. */ if ( in_atomic() ) return HVMTRANS_unhandleable; #endif while ( todo > 0 ) { enum hvm_translation_result res; paddr_t gpa = addr & ~PAGE_MASK; count = min_t(int, PAGE_SIZE - gpa, todo); res = hvm_translate_get_page(v, addr, flags & HVMCOPY_linear, pfec, pfinfo, &page, &gfn, &p2mt); if ( res != HVMTRANS_okay ) return res; p = (char *)__map_domain_page(page) + (addr & ~PAGE_MASK); if ( flags & HVMCOPY_to_guest ) { if ( p2m_is_discard_write(p2mt) ) { static unsigned long lastpage; if ( xchg(&lastpage, gfn_x(gfn)) != gfn_x(gfn) ) dprintk(XENLOG_G_DEBUG, "%pv attempted write to read-only gfn %#lx (mfn=%#lx)\n", v, gfn_x(gfn), page_to_mfn(page)); } else { if ( buf ) memcpy(p, buf, count); else memset(p, 0, count); paging_mark_dirty(v->domain, _mfn(page_to_mfn(page))); } } else { memcpy(buf, p, count); } unmap_domain_page(p); addr += count; if ( buf ) buf += count; todo -= count; put_page(page); } return HVMTRANS_okay; } enum hvm_translation_result hvm_copy_to_guest_phys( paddr_t paddr, void *buf, int size, struct vcpu *v) { return __hvm_copy(buf, paddr, size, v, HVMCOPY_to_guest | HVMCOPY_phys, 0, NULL); } enum hvm_translation_result hvm_copy_from_guest_phys( void *buf, paddr_t paddr, int size) { return __hvm_copy(buf, paddr, size, current, HVMCOPY_from_guest | HVMCOPY_phys, 0, NULL); } enum hvm_translation_result hvm_copy_to_guest_linear( unsigned long addr, void *buf, int size, uint32_t pfec, pagefault_info_t *pfinfo) { return __hvm_copy(buf, addr, size, current, HVMCOPY_to_guest | HVMCOPY_linear, PFEC_page_present | PFEC_write_access | pfec, pfinfo); } enum hvm_translation_result hvm_copy_from_guest_linear( void *buf, unsigned long addr, int size, uint32_t pfec, pagefault_info_t *pfinfo) { return __hvm_copy(buf, addr, size, current, HVMCOPY_from_guest | HVMCOPY_linear, PFEC_page_present | pfec, pfinfo); } enum hvm_translation_result hvm_fetch_from_guest_linear( void *buf, unsigned long addr, int size, uint32_t pfec, pagefault_info_t *pfinfo) { return __hvm_copy(buf, addr, size, current, HVMCOPY_from_guest | 
HVMCOPY_linear, PFEC_page_present | PFEC_insn_fetch | pfec, pfinfo); } unsigned long copy_to_user_hvm(void *to, const void *from, unsigned int len) { int rc; if ( current->hcall_compat && is_compat_arg_xlat_range(to, len) ) { memcpy(to, from, len); return 0; } rc = hvm_copy_to_guest_linear((unsigned long)to, (void *)from, len, 0, NULL); return rc ? len : 0; /* fake a copy_to_user() return code */ } unsigned long clear_user_hvm(void *to, unsigned int len) { int rc; if ( current->hcall_compat && is_compat_arg_xlat_range(to, len) ) { memset(to, 0x00, len); return 0; } rc = hvm_copy_to_guest_linear((unsigned long)to, NULL, len, 0, NULL); return rc ? len : 0; /* fake a copy_to_user() return code */ } unsigned long copy_from_user_hvm(void *to, const void *from, unsigned len) { int rc; if ( current->hcall_compat && is_compat_arg_xlat_range(from, len) ) { memcpy(to, from, len); return 0; } rc = hvm_copy_from_guest_linear(to, (unsigned long)from, len, 0, NULL); return rc ? len : 0; /* fake a copy_from_user() return code */ } bool hvm_check_cpuid_faulting(struct vcpu *v) { const struct msr_vcpu_policy *vp = v->arch.msr; if ( !vp->misc_features_enables.cpuid_faulting ) return false; return hvm_get_cpl(v) > 0; } static uint64_t _hvm_rdtsc_intercept(void) { struct vcpu *curr = current; #if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS) struct domain *currd = curr->domain; if ( currd->arch.vtsc ) switch ( hvm_guest_x86_mode(curr) ) { case 8: case 4: case 2: if ( unlikely(hvm_get_cpl(curr)) ) { case 1: currd->arch.vtsc_usercount++; break; } /* fall through */ case 0: currd->arch.vtsc_kerncount++; break; } #endif return hvm_get_guest_tsc(curr); } void hvm_rdtsc_intercept(struct cpu_user_regs *regs) { msr_split(regs, _hvm_rdtsc_intercept()); HVMTRACE_2D(RDTSC, regs->eax, regs->edx); } int hvm_msr_read_intercept(unsigned int msr, uint64_t *msr_content) { struct vcpu *v = current; struct domain *d = v->domain; uint64_t *var_range_base, *fixed_range_base; int ret; var_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.var_ranges; fixed_range_base = (uint64_t *)v->arch.hvm_vcpu.mtrr.fixed_ranges; if ( (ret = guest_rdmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE ) return ret; ret = X86EMUL_OKAY; switch ( msr ) { unsigned int index; case MSR_EFER: *msr_content = v->arch.hvm_vcpu.guest_efer; break; case MSR_IA32_TSC: *msr_content = _hvm_rdtsc_intercept(); break; case MSR_IA32_TSC_ADJUST: *msr_content = v->arch.hvm_vcpu.msr_tsc_adjust; break; case MSR_TSC_AUX: *msr_content = hvm_msr_tsc_aux(v); break; case MSR_IA32_APICBASE: *msr_content = vcpu_vlapic(v)->hw.apic_base_msr; break; case MSR_IA32_APICBASE_MSR ... 
MSR_IA32_APICBASE_MSR + 0x3ff: if ( hvm_x2apic_msr_read(v, msr, msr_content) ) goto gp_fault; break; case MSR_IA32_TSC_DEADLINE: *msr_content = vlapic_tdt_msr_get(vcpu_vlapic(v)); break; case MSR_IA32_CR_PAT: hvm_get_guest_pat(v, msr_content); break; case MSR_MTRRcap: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; *msr_content = v->arch.hvm_vcpu.mtrr.mtrr_cap; break; case MSR_MTRRdefType: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; *msr_content = v->arch.hvm_vcpu.mtrr.def_type | (v->arch.hvm_vcpu.mtrr.enabled << 10); break; case MSR_MTRRfix64K_00000: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; *msr_content = fixed_range_base[0]; break; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; index = msr - MSR_MTRRfix16K_80000; *msr_content = fixed_range_base[index + 1]; break; case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; index = msr - MSR_MTRRfix4K_C0000; *msr_content = fixed_range_base[index + 3]; break; case MSR_IA32_MTRR_PHYSBASE(0)...MSR_IA32_MTRR_PHYSMASK(MTRR_VCNT-1): if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; index = msr - MSR_IA32_MTRR_PHYSBASE(0); *msr_content = var_range_base[index]; break; case MSR_IA32_XSS: if ( !d->arch.cpuid->xstate.xsaves ) goto gp_fault; *msr_content = v->arch.hvm_vcpu.msr_xss; break; case MSR_IA32_BNDCFGS: if ( !d->arch.cpuid->feat.mpx || !hvm_get_guest_bndcfgs(v, msr_content) ) goto gp_fault; break; case MSR_K8_ENABLE_C1E: case MSR_AMD64_NB_CFG: /* * These AMD-only registers may be accessed if this HVM guest * has been migrated to an Intel host. This fixes a guest crash * in this case. */ *msr_content = 0; break; default: if ( (ret = vmce_rdmsr(msr, msr_content)) < 0 ) goto gp_fault; /* If ret == 0 then this is not an MCE MSR, see other MSRs. */ ret = ((ret == 0) ? hvm_funcs.msr_read_intercept(msr, msr_content) : X86EMUL_OKAY); break; } out: HVMTRACE_3D(MSR_READ, msr, (uint32_t)*msr_content, (uint32_t)(*msr_content >> 32)); return ret; gp_fault: ret = X86EMUL_EXCEPTION; *msr_content = -1ull; goto out; } int hvm_msr_write_intercept(unsigned int msr, uint64_t msr_content, bool_t may_defer) { struct vcpu *v = current; struct domain *d = v->domain; int ret; HVMTRACE_3D(MSR_WRITE, msr, (uint32_t)msr_content, (uint32_t)(msr_content >> 32)); if ( may_defer && unlikely(monitored_msr(v->domain, msr)) ) { ASSERT(v->arch.vm_event); /* The actual write will occur in hvm_do_resume() (if permitted). */ v->arch.vm_event->write_data.do_write.msr = 1; v->arch.vm_event->write_data.msr = msr; v->arch.vm_event->write_data.value = msr_content; hvm_monitor_msr(msr, msr_content); return X86EMUL_OKAY; } if ( (ret = guest_wrmsr(v, msr, msr_content)) != X86EMUL_UNHANDLEABLE ) return ret; ret = X86EMUL_OKAY; switch ( msr ) { unsigned int index; case MSR_EFER: if ( hvm_set_efer(msr_content) ) return X86EMUL_EXCEPTION; break; case MSR_IA32_TSC: hvm_set_guest_tsc_msr(v, msr_content); break; case MSR_IA32_TSC_ADJUST: hvm_set_guest_tsc_adjust(v, msr_content); break; case MSR_TSC_AUX: v->arch.hvm_vcpu.msr_tsc_aux = (uint32_t)msr_content; if ( cpu_has_rdtscp && (v->domain->arch.tsc_mode != TSC_MODE_PVRDTSCP) ) wrmsrl(MSR_TSC_AUX, (uint32_t)msr_content); break; case MSR_IA32_APICBASE: if ( !vlapic_msr_set(vcpu_vlapic(v), msr_content) ) goto gp_fault; break; case MSR_IA32_TSC_DEADLINE: vlapic_tdt_msr_set(vcpu_vlapic(v), msr_content); break; case MSR_IA32_APICBASE_MSR ... 
MSR_IA32_APICBASE_MSR + 0x3ff: if ( hvm_x2apic_msr_write(v, msr, msr_content) ) goto gp_fault; break; case MSR_IA32_CR_PAT: if ( !hvm_set_guest_pat(v, msr_content) ) goto gp_fault; break; case MSR_MTRRcap: goto gp_fault; case MSR_MTRRdefType: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; if ( !mtrr_def_type_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, msr_content) ) goto gp_fault; break; case MSR_MTRRfix64K_00000: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, 0, msr_content) ) goto gp_fault; break; case MSR_MTRRfix16K_80000: case MSR_MTRRfix16K_A0000: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; index = msr - MSR_MTRRfix16K_80000 + 1; if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, index, msr_content) ) goto gp_fault; break; case MSR_MTRRfix4K_C0000...MSR_MTRRfix4K_F8000: if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; index = msr - MSR_MTRRfix4K_C0000 + 3; if ( !mtrr_fix_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, index, msr_content) ) goto gp_fault; break; case MSR_IA32_MTRR_PHYSBASE(0)...MSR_IA32_MTRR_PHYSMASK(MTRR_VCNT-1): if ( !d->arch.cpuid->basic.mtrr ) goto gp_fault; if ( !mtrr_var_range_msr_set(v->domain, &v->arch.hvm_vcpu.mtrr, msr, msr_content) ) goto gp_fault; break; case MSR_IA32_XSS: /* No XSS features currently supported for guests. */ if ( !d->arch.cpuid->xstate.xsaves || msr_content != 0 ) goto gp_fault; v->arch.hvm_vcpu.msr_xss = msr_content; break; case MSR_IA32_BNDCFGS: if ( !d->arch.cpuid->feat.mpx || !hvm_set_guest_bndcfgs(v, msr_content) ) goto gp_fault; break; case MSR_AMD64_NB_CFG: /* ignore the write */ break; default: if ( (ret = vmce_wrmsr(msr, msr_content)) < 0 ) goto gp_fault; /* If ret == 0 then this is not an MCE MSR, see other MSRs. */ ret = ((ret == 0) ? hvm_funcs.msr_write_intercept(msr, msr_content) : X86EMUL_OKAY); break; } return ret; gp_fault: return X86EMUL_EXCEPTION; } static bool is_sysdesc_access(const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt) { unsigned int ext; int mode = x86_insn_modrm(state, NULL, &ext); switch ( ctxt->opcode ) { case X86EMUL_OPC(0x0f, 0x00): if ( !(ext & 4) ) /* SLDT / STR / LLDT / LTR */ return true; break; case X86EMUL_OPC(0x0f, 0x01): if ( mode != 3 && !(ext & 4) ) /* SGDT / SIDT / LGDT / LIDT */ return true; break; } return false; } int hvm_descriptor_access_intercept(uint64_t exit_info, uint64_t vmx_exit_qualification, unsigned int descriptor, bool is_write) { struct vcpu *curr = current; struct domain *currd = curr->domain; if ( currd->arch.monitor.descriptor_access_enabled ) { ASSERT(curr->arch.vm_event); hvm_monitor_descriptor_access(exit_info, vmx_exit_qualification, descriptor, is_write); } else if ( !hvm_emulate_one_insn(is_sysdesc_access, "sysdesc access") ) domain_crash(currd); return X86EMUL_OKAY; } static bool is_cross_vendor(const struct x86_emulate_state *state, const struct x86_emulate_ctxt *ctxt) { switch ( ctxt->opcode ) { case X86EMUL_OPC(0x0f, 0x05): /* syscall */ case X86EMUL_OPC(0x0f, 0x34): /* sysenter */ case X86EMUL_OPC(0x0f, 0x35): /* sysexit */ return true; } return false; } void hvm_ud_intercept(struct cpu_user_regs *regs) { struct vcpu *cur = current; bool should_emulate = cur->domain->arch.cpuid->x86_vendor != boot_cpu_data.x86_vendor; struct hvm_emulate_ctxt ctxt; hvm_emulate_init_once(&ctxt, opt_hvm_fep ? NULL : is_cross_vendor, regs); if ( opt_hvm_fep ) { const struct segment_register *cs = &ctxt.seg_reg[x86_seg_cs]; uint32_t walk = (ctxt.seg_reg[x86_seg_ss].dpl == 3) ? 
PFEC_user_mode : 0; unsigned long addr; char sig[5]; /* ud2; .ascii "xen" */ if ( hvm_virtual_to_linear_addr(x86_seg_cs, cs, regs->rip, sizeof(sig), hvm_access_insn_fetch, cs, &addr) && (hvm_fetch_from_guest_linear(sig, addr, sizeof(sig), walk, NULL) == HVMTRANS_okay) && (memcmp(sig, "\xf\xbxen", sizeof(sig)) == 0) ) { regs->rip += sizeof(sig); regs->eflags &= ~X86_EFLAGS_RF; /* Zero the upper 32 bits of %rip if not in 64bit mode. */ if ( !(hvm_long_mode_active(cur) && cs->l) ) regs->rip = regs->eip; add_taint(TAINT_HVM_FEP); should_emulate = true; } } if ( !should_emulate ) { hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC); return; } switch ( hvm_emulate_one(&ctxt) ) { case X86EMUL_UNHANDLEABLE: case X86EMUL_UNIMPLEMENTED: hvm_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC); break; case X86EMUL_EXCEPTION: hvm_inject_event(&ctxt.ctxt.event); /* fall through */ default: hvm_emulate_writeback(&ctxt); break; } } enum hvm_intblk hvm_interrupt_blocked(struct vcpu *v, struct hvm_intack intack) { unsigned long intr_shadow; ASSERT(v == current); if ( nestedhvm_enabled(v->domain) ) { enum hvm_intblk intr; intr = nhvm_interrupt_blocked(v); if ( intr != hvm_intblk_none ) return intr; } if ( (intack.source != hvm_intsrc_nmi) && !(guest_cpu_user_regs()->eflags & X86_EFLAGS_IF) ) return hvm_intblk_rflags_ie; intr_shadow = hvm_funcs.get_interrupt_shadow(v); if ( intr_shadow & (HVM_INTR_SHADOW_STI|HVM_INTR_SHADOW_MOV_SS) ) return hvm_intblk_shadow; if ( intack.source == hvm_intsrc_nmi ) return ((intr_shadow & HVM_INTR_SHADOW_NMI) ? hvm_intblk_nmi_iret : hvm_intblk_none); if ( intack.source == hvm_intsrc_lapic ) { uint32_t tpr = vlapic_get_reg(vcpu_vlapic(v), APIC_TASKPRI) & 0xF0; if ( (tpr >> 4) >= (intack.vector >> 4) ) return hvm_intblk_tpr; } return hvm_intblk_none; } static void hvm_latch_shinfo_size(struct domain *d) { /* * Called from operations which are among the very first executed by * PV drivers on initialisation or after save/restore. These are sensible * points at which to sample the execution mode of the guest and latch * 32- or 64-bit format for shared state. */ if ( current->domain == d ) { d->arch.has_32bit_shinfo = (hvm_guest_x86_mode(current) != 8); /* * Make sure that the timebase in the shared info structure is correct. * * If the bit-ness changed we should arguably try to convert the other * fields as well, but that's much more problematic (e.g. what do you * do if you're going from 64 bit to 32 bit and there's an event * channel pending which doesn't exist in the 32 bit version?). Just * setting the wallclock time seems to be sufficient for everything * we do, even if it is a bit of a hack. */ update_domain_wallclock_time(d); } } /* Initialise a hypercall transfer page for a VMX domain using paravirtualised drivers. 
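(Despite the wording this is vendor-neutral: the actual page contents come from hvm_funcs.init_hypercall_page, i.e. the VMX or SVM implementation.)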
*/ void hvm_hypercall_page_initialise(struct domain *d, void *hypercall_page) { hvm_latch_shinfo_size(d); hvm_funcs.init_hypercall_page(d, hypercall_page); } void hvm_vcpu_reset_state(struct vcpu *v, uint16_t cs, uint16_t ip) { struct domain *d = v->domain; struct segment_register reg; typeof(v->arch.xsave_area->fpu_sse) *fpu_ctxt = v->arch.fpu_ctxt; domain_lock(d); if ( v->is_initialised ) goto out; if ( !paging_mode_hap(d) ) { if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG ) put_page(pagetable_get_page(v->arch.guest_table)); v->arch.guest_table = pagetable_null(); } memset(fpu_ctxt, 0, sizeof(*fpu_ctxt)); fpu_ctxt->fcw = FCW_RESET; fpu_ctxt->mxcsr = MXCSR_DEFAULT; if ( v->arch.xsave_area ) { v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP; v->arch.xsave_area->xsave_hdr.xcomp_bv = 0; } v->arch.vgc_flags = VGCF_online; memset(&v->arch.user_regs, 0, sizeof(v->arch.user_regs)); v->arch.user_regs.rflags = X86_EFLAGS_MBS; v->arch.user_regs.rdx = 0x00000f00; v->arch.user_regs.rip = ip; memset(&v->arch.debugreg, 0, sizeof(v->arch.debugreg)); v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_ET; hvm_update_guest_cr(v, 0); v->arch.hvm_vcpu.guest_cr[2] = 0; hvm_update_guest_cr(v, 2); v->arch.hvm_vcpu.guest_cr[3] = 0; hvm_update_guest_cr(v, 3); v->arch.hvm_vcpu.guest_cr[4] = 0; hvm_update_guest_cr(v, 4); v->arch.hvm_vcpu.guest_efer = 0; hvm_update_guest_efer(v); reg.sel = cs; reg.base = (uint32_t)reg.sel << 4; reg.limit = 0xffff; reg.attr = 0x9b; hvm_set_segment_register(v, x86_seg_cs, &reg); reg.sel = reg.base = 0; reg.limit = 0xffff; reg.attr = 0x93; hvm_set_segment_register(v, x86_seg_ds, &reg); hvm_set_segment_register(v, x86_seg_es, &reg); hvm_set_segment_register(v, x86_seg_fs, &reg); hvm_set_segment_register(v, x86_seg_gs, &reg); hvm_set_segment_register(v, x86_seg_ss, &reg); reg.attr = 0x82; /* LDT */ hvm_set_segment_register(v, x86_seg_ldtr, &reg); reg.attr = 0x8b; /* 32-bit TSS (busy) */ hvm_set_segment_register(v, x86_seg_tr, &reg); reg.attr = 0; hvm_set_segment_register(v, x86_seg_gdtr, &reg); hvm_set_segment_register(v, x86_seg_idtr, &reg); if ( hvm_funcs.tsc_scaling.setup ) hvm_funcs.tsc_scaling.setup(v); /* Sync AP's TSC with BSP's. */ v->arch.hvm_vcpu.cache_tsc_offset = v->domain->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset; hvm_funcs.set_tsc_offset(v, v->arch.hvm_vcpu.cache_tsc_offset, d->arch.hvm_domain.sync_tsc); v->arch.hvm_vcpu.msr_tsc_adjust = 0; paging_update_paging_modes(v); v->arch.flags |= TF_kernel_mode; v->is_initialised = 1; clear_bit(_VPF_down, &v->pause_flags); out: domain_unlock(d); } static void hvm_s3_suspend(struct domain *d) { struct vcpu *v; domain_pause(d); domain_lock(d); if ( d->is_dying || (d->vcpu == NULL) || (d->vcpu[0] == NULL) || test_and_set_bool(d->arch.hvm_domain.is_s3_suspended) ) { domain_unlock(d); domain_unpause(d); return; } for_each_vcpu ( d, v ) { int rc; vlapic_reset(vcpu_vlapic(v)); rc = vcpu_reset(v); ASSERT(!rc); } vpic_reset(d); vioapic_reset(d); pit_reset(d); rtc_reset(d); pmtimer_reset(d); hpet_reset(d); hvm_vcpu_reset_state(d->vcpu[0], 0xf000, 0xfff0); domain_unlock(d); } static void hvm_s3_resume(struct domain *d) { if ( test_and_clear_bool(d->arch.hvm_domain.is_s3_suspended) ) { struct vcpu *v; for_each_vcpu( d, v ) hvm_set_guest_tsc(v, 0); domain_unpause(d); } } static int hvmop_flush_tlb_all(void) { struct domain *d = current->domain; struct vcpu *v; if ( !is_hvm_domain(d) ) return -EINVAL; /* Avoid deadlock if more than one vcpu tries this at the same time. */ if ( !spin_trylock(&d->hypercall_deadlock_mutex) ) return -ERESTART; /* Pause all other vcpus.
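vcpu_pause_nosync() only raises the pause counts; the loop further down then waits for each vCPU to actually deschedule before proceeding.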
*/ for_each_vcpu ( d, v ) if ( v != current ) vcpu_pause_nosync(v); /* Now that all VCPUs are signalled to deschedule, we wait... */ for_each_vcpu ( d, v ) if ( v != current ) while ( !vcpu_runnable(v) && v->is_running ) cpu_relax(); /* All other vcpus are paused, safe to unlock now. */ spin_unlock(&d->hypercall_deadlock_mutex); /* Flush paging-mode soft state (e.g., va->gfn cache; PAE PDPE cache). */ for_each_vcpu ( d, v ) paging_update_cr3(v); /* Flush all dirty TLBs. */ flush_tlb_mask(d->domain_dirty_cpumask); /* Done. */ for_each_vcpu ( d, v ) if ( v != current ) vcpu_unpause(v); return 0; } static int hvmop_set_evtchn_upcall_vector( XEN_GUEST_HANDLE_PARAM(xen_hvm_evtchn_upcall_vector_t) uop) { xen_hvm_evtchn_upcall_vector_t op; struct domain *d = current->domain; struct vcpu *v; if ( !is_hvm_domain(d) ) return -EINVAL; if ( copy_from_guest(&op, uop, 1) ) return -EFAULT; if ( op.vector < 0x10 ) return -EINVAL; if ( op.vcpu >= d->max_vcpus || (v = d->vcpu[op.vcpu]) == NULL ) return -ENOENT; printk(XENLOG_G_INFO "%pv: upcall vector %02x\n", v, op.vector); v->arch.hvm_vcpu.evtchn_upcall_vector = op.vector; hvm_assert_evtchn_irq(v); return 0; } static int hvm_allow_set_param(struct domain *d, const struct xen_hvm_param *a) { uint64_t value = d->arch.hvm_domain.params[a->index]; int rc; rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_set_param); if ( rc ) return rc; switch ( a->index ) { /* The following parameters can be set by the guest. */ case HVM_PARAM_CALLBACK_IRQ: case HVM_PARAM_VM86_TSS: case HVM_PARAM_VM86_TSS_SIZED: case HVM_PARAM_ACPI_IOPORTS_LOCATION: case HVM_PARAM_VM_GENERATION_ID_ADDR: case HVM_PARAM_STORE_EVTCHN: case HVM_PARAM_CONSOLE_EVTCHN: case HVM_PARAM_X87_FIP_WIDTH: break; /* * The following parameters must not be set by the guest * since the domain may need to be paused. */ case HVM_PARAM_IDENT_PT: case HVM_PARAM_DM_DOMAIN: case HVM_PARAM_ACPI_S_STATE: /* The remaining parameters should not be set by the guest. */ default: if ( d == current->domain ) rc = -EPERM; break; } if ( rc ) return rc; switch ( a->index ) { /* The following parameters should only be changed once. */ case HVM_PARAM_VIRIDIAN: case HVM_PARAM_IOREQ_SERVER_PFN: case HVM_PARAM_NR_IOREQ_SERVER_PAGES: case HVM_PARAM_ALTP2M: case HVM_PARAM_MCA_CAP: if ( value != 0 && a->value != value ) rc = -EEXIST; break; default: break; } return rc; } static int hvmop_set_param( XEN_GUEST_HANDLE_PARAM(xen_hvm_param_t) arg) { struct domain *curr_d = current->domain; struct xen_hvm_param a; struct domain *d; struct vcpu *v; int rc; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; if ( a.index >= HVM_NR_PARAMS ) return -EINVAL; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto out; rc = hvm_allow_set_param(d, &a); if ( rc ) goto out; switch ( a.index ) { case HVM_PARAM_CALLBACK_IRQ: hvm_set_callback_via(d, a.value); hvm_latch_shinfo_size(d); break; case HVM_PARAM_TIMER_MODE: if ( a.value > HVMPTM_one_missed_tick_pending ) rc = -EINVAL; break; case HVM_PARAM_VIRIDIAN: if ( (a.value & ~HVMPV_feature_mask) || !(a.value & HVMPV_base_freq) ) rc = -EINVAL; break; case HVM_PARAM_IDENT_PT: /* * Only actually required for VT-x lacking unrestricted_guest * capabilities. Short circuit the pause if possible. */ if ( !paging_mode_hap(d) || !cpu_has_vmx ) { d->arch.hvm_domain.params[a.index] = a.value; break; } /* * Update GUEST_CR3 in each VMCS to point at identity map. * All foreign updates to guest state must synchronise on * the domctl_lock. 
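* Hence the trylock below: on contention we return -ERESTART so that
* the hypercall is retried rather than blocking here.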
*/ rc = -ERESTART; if ( !domctl_lock_acquire() ) break; rc = 0; domain_pause(d); d->arch.hvm_domain.params[a.index] = a.value; for_each_vcpu ( d, v ) paging_update_cr3(v); domain_unpause(d); domctl_lock_release(); break; case HVM_PARAM_DM_DOMAIN: if ( a.value == DOMID_SELF ) a.value = curr_d->domain_id; rc = hvm_set_dm_domain(d, a.value); break; case HVM_PARAM_ACPI_S_STATE: rc = 0; if ( a.value == 3 ) hvm_s3_suspend(d); else if ( a.value == 0 ) hvm_s3_resume(d); else rc = -EINVAL; break; case HVM_PARAM_ACPI_IOPORTS_LOCATION: rc = pmtimer_change_ioport(d, a.value); break; case HVM_PARAM_MEMORY_EVENT_CR0: case HVM_PARAM_MEMORY_EVENT_CR3: case HVM_PARAM_MEMORY_EVENT_CR4: case HVM_PARAM_MEMORY_EVENT_INT3: case HVM_PARAM_MEMORY_EVENT_SINGLE_STEP: case HVM_PARAM_MEMORY_EVENT_MSR: /* Deprecated */ rc = -EOPNOTSUPP; break; case HVM_PARAM_NESTEDHVM: rc = xsm_hvm_param_nested(XSM_PRIV, d); if ( rc ) break; if ( a.value > 1 ) rc = -EINVAL; /* * Remove the check below once we have * shadow-on-shadow. */ if ( !paging_mode_hap(d) && a.value ) rc = -EINVAL; if ( a.value && d->arch.hvm_domain.params[HVM_PARAM_ALTP2M] ) rc = -EINVAL; /* Set up NHVM state for any vcpus that are already up. */ if ( a.value && !d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] ) for_each_vcpu(d, v) if ( rc == 0 ) rc = nestedhvm_vcpu_initialise(v); if ( !a.value || rc ) for_each_vcpu(d, v) nestedhvm_vcpu_destroy(v); break; case HVM_PARAM_ALTP2M: rc = xsm_hvm_param_altp2mhvm(XSM_PRIV, d); if ( rc ) break; if ( a.value > XEN_ALTP2M_limited ) rc = -EINVAL; if ( a.value && d->arch.hvm_domain.params[HVM_PARAM_NESTEDHVM] ) rc = -EINVAL; break; case HVM_PARAM_BUFIOREQ_EVTCHN: rc = -EINVAL; break; case HVM_PARAM_TRIPLE_FAULT_REASON: if ( a.value > SHUTDOWN_MAX ) rc = -EINVAL; break; case HVM_PARAM_IOREQ_SERVER_PFN: d->arch.hvm_domain.ioreq_gfn.base = a.value; break; case HVM_PARAM_NR_IOREQ_SERVER_PAGES: { unsigned int i; if ( a.value == 0 || a.value > sizeof(d->arch.hvm_domain.ioreq_gfn.mask) * 8 ) { rc = -EINVAL; break; } for ( i = 0; i < a.value; i++ ) set_bit(i, &d->arch.hvm_domain.ioreq_gfn.mask); break; } case HVM_PARAM_X87_FIP_WIDTH: if ( a.value != 0 && a.value != 4 && a.value != 8 ) { rc = -EINVAL; break; } d->arch.x87_fip_width = a.value; break; case HVM_PARAM_VM86_TSS: /* Hardware would silently truncate high bits. */ if ( a.value != (uint32_t)a.value ) { if ( d == curr_d ) domain_crash(d); rc = -EINVAL; } /* Old hvmloader binaries hardcode the size to 128 bytes. */ if ( a.value ) a.value |= (128ULL << 32) | VM86_TSS_UPDATED; a.index = HVM_PARAM_VM86_TSS_SIZED; break; case HVM_PARAM_VM86_TSS_SIZED: if ( (a.value >> 32) < sizeof(struct tss32) ) { if ( d == curr_d ) domain_crash(d); rc = -EINVAL; } /* * Cap at the theoretically useful maximum (base structure plus * 256 bits interrupt redirection bitmap + 64k bits I/O bitmap * plus one padding byte). 
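* That is sizeof(struct tss32) + 0x100 / 8 + 0x10000 / 8 + 1, i.e.
* sizeof(struct tss32) + 8225 bytes.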
*/ if ( (a.value >> 32) > sizeof(struct tss32) + (0x100 / 8) + (0x10000 / 8) + 1 ) a.value = (uint32_t)a.value | ((sizeof(struct tss32) + (0x100 / 8) + (0x10000 / 8) + 1) << 32); a.value |= VM86_TSS_UPDATED; break; case HVM_PARAM_MCA_CAP: rc = vmce_enable_mca_cap(d, a.value); break; } if ( rc != 0 ) goto out; d->arch.hvm_domain.params[a.index] = a.value; HVM_DBG_LOG(DBG_LEVEL_HCALL, "set param %u = %"PRIx64, a.index, a.value); out: rcu_unlock_domain(d); return rc; } static int hvm_allow_get_param(struct domain *d, const struct xen_hvm_param *a) { int rc; rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_get_param); if ( rc ) return rc; switch ( a->index ) { /* The following parameters can be read by the guest. */ case HVM_PARAM_CALLBACK_IRQ: case HVM_PARAM_VM86_TSS: case HVM_PARAM_VM86_TSS_SIZED: case HVM_PARAM_ACPI_IOPORTS_LOCATION: case HVM_PARAM_VM_GENERATION_ID_ADDR: case HVM_PARAM_STORE_PFN: case HVM_PARAM_STORE_EVTCHN: case HVM_PARAM_CONSOLE_PFN: case HVM_PARAM_CONSOLE_EVTCHN: case HVM_PARAM_ALTP2M: case HVM_PARAM_X87_FIP_WIDTH: break; /* * The following parameters must not be read by the guest * since the domain may need to be paused. */ case HVM_PARAM_IOREQ_PFN: case HVM_PARAM_BUFIOREQ_PFN: case HVM_PARAM_BUFIOREQ_EVTCHN: /* The remaining parameters should not be read by the guest. */ default: if ( d == current->domain ) rc = -EPERM; break; } return rc; } static int hvmop_get_param( XEN_GUEST_HANDLE_PARAM(xen_hvm_param_t) arg) { struct xen_hvm_param a; struct domain *d; int rc; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; if ( a.index >= HVM_NR_PARAMS ) return -EINVAL; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto out; rc = hvm_allow_get_param(d, &a); if ( rc ) goto out; switch ( a.index ) { case HVM_PARAM_ACPI_S_STATE: a.value = d->arch.hvm_domain.is_s3_suspended ? 3 : 0; break; case HVM_PARAM_VM86_TSS: a.value = (uint32_t)d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED]; break; case HVM_PARAM_VM86_TSS_SIZED: a.value = d->arch.hvm_domain.params[HVM_PARAM_VM86_TSS_SIZED] & ~VM86_TSS_UPDATED; break; case HVM_PARAM_X87_FIP_WIDTH: a.value = d->arch.x87_fip_width; break; case HVM_PARAM_IOREQ_PFN: case HVM_PARAM_BUFIOREQ_PFN: case HVM_PARAM_BUFIOREQ_EVTCHN: /* * It may be necessary to create a default ioreq server here, * because legacy versions of QEMU are not aware of the new API for * explicit ioreq server creation. However, if the domain is not * under construction then it will not be QEMU querying the * parameters and thus the query should not have that side-effect. */ if ( !d->creation_finished ) { domid_t domid = d->arch.hvm_domain.params[HVM_PARAM_DM_DOMAIN]; rc = hvm_create_ioreq_server(d, domid, true, HVM_IOREQSRV_BUFIOREQ_LEGACY, NULL); if ( rc != 0 && rc != -EEXIST ) goto out; } /*FALLTHRU*/ default: a.value = d->arch.hvm_domain.params[a.index]; break; } rc = __copy_to_guest(arg, &a, 1) ? 
-EFAULT : 0; HVM_DBG_LOG(DBG_LEVEL_HCALL, "get param %u = %"PRIx64, a.index, a.value); out: rcu_unlock_domain(d); return rc; } static int do_altp2m_op( XEN_GUEST_HANDLE_PARAM(void) arg) { struct xen_hvm_altp2m_op a; struct domain *d = NULL; int rc = 0; uint64_t mode; if ( !hvm_altp2m_supported() ) return -EOPNOTSUPP; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; if ( a.pad1 || a.pad2 || (a.version != HVMOP_ALTP2M_INTERFACE_VERSION) ) return -EINVAL; switch ( a.cmd ) { case HVMOP_altp2m_get_domain_state: case HVMOP_altp2m_set_domain_state: case HVMOP_altp2m_vcpu_enable_notify: case HVMOP_altp2m_create_p2m: case HVMOP_altp2m_destroy_p2m: case HVMOP_altp2m_switch_p2m: case HVMOP_altp2m_set_mem_access: case HVMOP_altp2m_change_gfn: break; default: return -EOPNOTSUPP; } d = ( a.cmd != HVMOP_altp2m_vcpu_enable_notify ) ? rcu_lock_domain_by_any_id(a.domain) : rcu_lock_current_domain(); if ( d == NULL ) return -ESRCH; if ( !is_hvm_domain(d) ) { rc = -EOPNOTSUPP; goto out; } if ( (a.cmd != HVMOP_altp2m_get_domain_state) && (a.cmd != HVMOP_altp2m_set_domain_state) && !d->arch.altp2m_active ) { rc = -EOPNOTSUPP; goto out; } mode = d->arch.hvm_domain.params[HVM_PARAM_ALTP2M]; if ( XEN_ALTP2M_disabled == mode ) { rc = -EINVAL; goto out; } if ( (rc = xsm_hvm_altp2mhvm_op(XSM_OTHER, d, mode, a.cmd)) ) goto out; switch ( a.cmd ) { case HVMOP_altp2m_get_domain_state: a.u.domain_state.state = altp2m_active(d); rc = __copy_to_guest(arg, &a, 1) ? -EFAULT : 0; break; case HVMOP_altp2m_set_domain_state: { struct vcpu *v; bool_t ostate; if ( nestedhvm_enabled(d) ) { rc = -EINVAL; break; } ostate = d->arch.altp2m_active; d->arch.altp2m_active = !!a.u.domain_state.state; /* If the alternate p2m state has changed, handle appropriately */ if ( d->arch.altp2m_active != ostate && (ostate || !(rc = p2m_init_altp2m_by_id(d, 0))) ) { for_each_vcpu( d, v ) { if ( !ostate ) altp2m_vcpu_initialise(v); else altp2m_vcpu_destroy(v); } if ( ostate ) p2m_flush_altp2m(d); } break; } case HVMOP_altp2m_vcpu_enable_notify: { struct vcpu *curr = current; p2m_type_t p2mt; if ( a.u.enable_notify.pad || a.domain != DOMID_SELF || a.u.enable_notify.vcpu_id != curr->vcpu_id ) { rc = -EINVAL; break; } if ( !gfn_eq(vcpu_altp2m(curr).veinfo_gfn, INVALID_GFN) || mfn_eq(get_gfn_query_unlocked(curr->domain, a.u.enable_notify.gfn, &p2mt), INVALID_MFN) ) { rc = -EINVAL; break; } vcpu_altp2m(curr).veinfo_gfn = _gfn(a.u.enable_notify.gfn); altp2m_vcpu_update_vmfunc_ve(curr); break; } case HVMOP_altp2m_create_p2m: if ( !(rc = p2m_init_next_altp2m(d, &a.u.view.view)) ) rc = __copy_to_guest(arg, &a, 1) ? 
-EFAULT : 0; break; case HVMOP_altp2m_destroy_p2m: rc = p2m_destroy_altp2m_by_id(d, a.u.view.view); break; case HVMOP_altp2m_switch_p2m: rc = p2m_switch_domain_altp2m_by_id(d, a.u.view.view); break; case HVMOP_altp2m_set_mem_access: if ( a.u.set_mem_access.pad ) rc = -EINVAL; else rc = p2m_set_mem_access(d, _gfn(a.u.set_mem_access.gfn), 1, 0, 0, a.u.set_mem_access.hvmmem_access, a.u.set_mem_access.view); break; case HVMOP_altp2m_change_gfn: if ( a.u.change_gfn.pad1 || a.u.change_gfn.pad2 ) rc = -EINVAL; else rc = p2m_change_altp2m_gfn(d, a.u.change_gfn.view, _gfn(a.u.change_gfn.old_gfn), _gfn(a.u.change_gfn.new_gfn)); break; default: ASSERT_UNREACHABLE(); } out: rcu_unlock_domain(d); return rc; } static int hvmop_get_mem_type( XEN_GUEST_HANDLE_PARAM(xen_hvm_get_mem_type_t) arg) { struct xen_hvm_get_mem_type a; struct domain *d; p2m_type_t t; int rc; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = xsm_hvm_param(XSM_TARGET, d, HVMOP_get_mem_type); if ( rc ) goto out; rc = -EINVAL; if ( !is_hvm_domain(d) ) goto out; /* * Use get_gfn query as we are interested in the current * type, not in allocating or unsharing. That'll happen * on access. */ get_gfn_query_unlocked(d, a.pfn, &t); if ( p2m_is_mmio(t) ) a.mem_type = HVMMEM_mmio_dm; else if ( t == p2m_ioreq_server ) a.mem_type = HVMMEM_ioreq_server; else if ( p2m_is_readonly(t) ) a.mem_type = HVMMEM_ram_ro; else if ( p2m_is_ram(t) ) a.mem_type = HVMMEM_ram_rw; else if ( p2m_is_pod(t) ) a.mem_type = HVMMEM_ram_rw; else if ( p2m_is_grant(t) ) a.mem_type = HVMMEM_ram_rw; else a.mem_type = HVMMEM_mmio_dm; rc = -EFAULT; if ( __copy_to_guest(arg, &a, 1) ) goto out; rc = 0; out: rcu_unlock_domain(d); return rc; } long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) arg) { long rc = 0; /* * NB: hvm_op can be part of a restarted hypercall; but at the * moment the only hypercalls which do continuations don't need to * store any iteration information (since they're just re-trying * the acquisition of a lock). */ switch ( op ) { case HVMOP_set_evtchn_upcall_vector: rc = hvmop_set_evtchn_upcall_vector( guest_handle_cast(arg, xen_hvm_evtchn_upcall_vector_t)); break; case HVMOP_set_param: rc = hvmop_set_param( guest_handle_cast(arg, xen_hvm_param_t)); break; case HVMOP_get_param: rc = hvmop_get_param( guest_handle_cast(arg, xen_hvm_param_t)); break; case HVMOP_flush_tlbs: rc = guest_handle_is_null(arg) ? 
hvmop_flush_tlb_all() : -EINVAL; break; case HVMOP_get_mem_type: rc = hvmop_get_mem_type( guest_handle_cast(arg, xen_hvm_get_mem_type_t)); break; case HVMOP_pagetable_dying: { struct xen_hvm_pagetable_dying a; struct domain *d; if ( copy_from_guest(&a, arg, 1) ) return -EFAULT; d = rcu_lock_domain_by_any_id(a.domid); if ( d == NULL ) return -ESRCH; rc = -EINVAL; if ( is_hvm_domain(d) && paging_mode_shadow(d) ) rc = xsm_hvm_param(XSM_TARGET, d, op); if ( !rc ) pagetable_dying(d, a.gpa); rcu_unlock_domain(d); break; } case HVMOP_get_time: { xen_hvm_get_time_t gxt; gxt.now = NOW(); if ( copy_to_guest(arg, &gxt, 1) ) rc = -EFAULT; break; } case HVMOP_xentrace: { xen_hvm_xentrace_t tr; if ( copy_from_guest(&tr, arg, 1 ) ) return -EFAULT; if ( tr.extra_bytes > sizeof(tr.extra) || (tr.event & ~((1u<<TRC_SUBCLS_SHIFT)-1)) ) return -EINVAL; /* Cycles will be taken at the vmexit and vmenter */ trace_var(tr.event | TRC_GUEST, 0 /*!cycles*/, tr.extra_bytes, tr.extra); break; } case HVMOP_guest_request_vm_event: if ( guest_handle_is_null(arg) ) monitor_guest_request(); else rc = -EINVAL; break; case HVMOP_altp2m: rc = do_altp2m_op(arg); break; default: { gdprintk(XENLOG_DEBUG, "Bad HVM op %ld.\n", op); rc = -ENOSYS; break; } } if ( rc == -ERESTART ) rc = hypercall_create_continuation(__HYPERVISOR_hvm_op, "lh", op, arg); return rc; } int hvm_debug_op(struct vcpu *v, int32_t op) { int rc; switch ( op ) { case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON: case XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_OFF: rc = -EOPNOTSUPP; if ( !cpu_has_monitor_trap_flag ) break; rc = 0; vcpu_pause(v); v->arch.hvm_vcpu.single_step = (op == XEN_DOMCTL_DEBUG_OP_SINGLE_STEP_ON); vcpu_unpause(v); /* guest will latch new state */ break; default: rc = -ENOSYS; break; } return rc; } void hvm_toggle_singlestep(struct vcpu *v) { ASSERT(atomic_read(&v->pause_count)); if ( !hvm_is_singlestep_supported() ) return; v->arch.hvm_vcpu.single_step = !v->arch.hvm_vcpu.single_step; } int hvm_set_mode(struct vcpu *v, int mode) { switch ( mode ) { case 4: v->arch.hvm_vcpu.guest_efer &= ~(EFER_LMA | EFER_LME); break; case 8: v->arch.hvm_vcpu.guest_efer |= (EFER_LMA | EFER_LME); break; default: return -EOPNOTSUPP; } hvm_update_guest_efer(v); if ( hvm_funcs.set_mode ) return hvm_funcs.set_mode(v, mode); return 0; } void hvm_domain_soft_reset(struct domain *d) { hvm_destroy_all_ioreq_servers(d); } /* * Segment caches in VMCB/VMCS are inconsistent about which bits are checked, * important, and preserved across vmentry/exit. Cook the values to make them * closer to what is architecturally expected from entries in the segment * cache. */ void hvm_get_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { hvm_funcs.get_segment_register(v, seg, reg); switch ( seg ) { case x86_seg_ss: /* SVM may retain %ss.DB when %ss is loaded with a NULL selector. */ if ( !reg->p ) reg->db = 0; break; case x86_seg_tr: /* * SVM doesn't track %tr.B. Architecturally, a loaded TSS segment will * always be busy. */ reg->type |= 0x2; /* * %cs and %tr are unconditionally present. SVM ignores these present * bits and will happily run without them set. */ case x86_seg_cs: reg->p = 1; break; case x86_seg_gdtr: case x86_seg_idtr: /* * Treat GDTR/IDTR as being present system segments. This avoids them * needing special casing for segmentation checks. */ reg->attr = 0x80; break; default: /* Avoid triggering -Werror=switch */ break; } if ( reg->p ) { /* * For segments which are present/usable, cook the system flag. SVM * ignores the S bit on all segments and will happily run with them in * any state. */ reg->s = is_x86_user_segment(seg); /* * SVM discards %cs.G on #VMEXIT. Other user segments do have .G * tracked, but Linux commit 80112c89ed87 "KVM: Synthesize G bit for * all segments." indicates that this isn't necessarily the case when * nested under ESXi. * * Unconditionally recalculate G. */ reg->g = !!(reg->limit >> 20); /* * SVM doesn't track the Accessed flag. It will always be set for * usable user segments loaded into the descriptor cache. */ if ( is_x86_user_segment(seg) ) reg->type |= 0x1; } } void hvm_set_segment_register(struct vcpu *v, enum x86_segment seg, struct segment_register *reg) { /* Set G to match the limit field. VT-x cares, while SVM doesn't.
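A (cooked) limit of 1MB or above can only be expressed with page granularity, so G is simply recomputed from the limit for present segments.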
*/ if ( reg->p ) reg->g = !!(reg->limit >> 20); switch ( seg ) { case x86_seg_cs: ASSERT(reg->p); /* Usable. */ ASSERT(reg->s); /* User segment. */ ASSERT(reg->type & 0x1); /* Accessed. */ ASSERT((reg->base >> 32) == 0); /* Upper bits clear. */ break; case x86_seg_ss: if ( reg->p ) { ASSERT(reg->s); /* User segment. */ ASSERT(!(reg->type & 0x8)); /* Data segment. */ ASSERT(reg->type & 0x2); /* Writeable. */ ASSERT(reg->type & 0x1); /* Accessed. */ ASSERT((reg->base >> 32) == 0); /* Upper bits clear. */ } break; case x86_seg_ds: case x86_seg_es: case x86_seg_fs: case x86_seg_gs: if ( reg->p ) { ASSERT(reg->s); /* User segment. */ if ( reg->type & 0x8 ) ASSERT(reg->type & 0x2); /* Readable. */ ASSERT(reg->type & 0x1); /* Accessed. */ if ( seg == x86_seg_fs || seg == x86_seg_gs ) ASSERT(is_canonical_address(reg->base)); else ASSERT((reg->base >> 32) == 0); /* Upper bits clear. */ } break; case x86_seg_tr: ASSERT(reg->p); /* Usable. */ ASSERT(!reg->s); /* System segment. */ ASSERT(!(reg->sel & 0x4)); /* !TI. */ if ( reg->type == SYS_DESC_tss_busy ) ASSERT(is_canonical_address(reg->base)); else if ( reg->type == SYS_DESC_tss16_busy ) ASSERT((reg->base >> 32) == 0); else ASSERT(!"%tr typecheck failure"); break; case x86_seg_ldtr: if ( reg->p ) { ASSERT(!reg->s); /* System segment. */ ASSERT(!(reg->sel & 0x4)); /* !TI. */ ASSERT(reg->type == SYS_DESC_ldt); ASSERT(is_canonical_address(reg->base)); } break; case x86_seg_gdtr: case x86_seg_idtr: ASSERT(is_canonical_address(reg->base)); ASSERT((reg->limit >> 16) == 0); /* Upper bits clear. */ break; default: ASSERT_UNREACHABLE(); return; } hvm_funcs.set_segment_register(v, seg, reg); } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */