/****************************************************************************** * arch/x86/pv/emul-priv-op.c * * Emulate privileged instructions for PV guests * * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or * (at your option) any later version. * * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the * GNU General Public License for more details. * * You should have received a copy of the GNU General Public License * along with this program; If not, see . */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "../x86_64/mmconfig.h" #include "emulate.h" #include "mm.h" /* Override macros from asm/page.h to make them work with mfn_t */ #undef mfn_to_page #define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn)) #undef page_to_mfn #define page_to_mfn(pg) _mfn(__page_to_mfn(pg)) /*********************** * I/O emulation support */ struct priv_op_ctxt { struct x86_emulate_ctxt ctxt; struct { unsigned long base, limit; } cs; char *io_emul_stub; unsigned int bpmatch; unsigned int tsc; #define TSC_BASE 1 #define TSC_AUX 2 }; /* I/O emulation support. Helper routines for, and type of, the stack stub. */ void host_to_guest_gpr_switch(struct cpu_user_regs *); unsigned long guest_to_host_gpr_switch(unsigned long); void (*pv_post_outb_hook)(unsigned int port, u8 value); typedef void io_emul_stub_t(struct cpu_user_regs *); static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode, unsigned int port, unsigned int bytes) { if ( !ctxt->io_emul_stub ) ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) + (this_cpu(stubs.addr) & ~PAGE_MASK) + STUB_BUF_SIZE / 2; /* movq $host_to_guest_gpr_switch,%rcx */ ctxt->io_emul_stub[0] = 0x48; ctxt->io_emul_stub[1] = 0xb9; *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch; /* callq *%rcx */ ctxt->io_emul_stub[10] = 0xff; ctxt->io_emul_stub[11] = 0xd1; /* data16 or nop */ ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66; /* */ ctxt->io_emul_stub[13] = opcode; /* imm8 or nop */ ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90; /* ret (jumps to guest_to_host_gpr_switch) */ ctxt->io_emul_stub[15] = 0xc3; BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16); if ( ioemul_handle_quirk ) ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs); /* Handy function-typed pointer to the stub. */ return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2); } /* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */ static bool iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs) { unsigned int cpl = guest_kernel_mode(v, regs) ? (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3; ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0); return IOPL(cpl) <= v->arch.pv_vcpu.iopl; } /* Has the guest requested sufficient permission for this I/O access? */ static bool guest_io_okay(unsigned int port, unsigned int bytes, struct vcpu *v, struct cpu_user_regs *regs) { /* If in user mode, switch to kernel mode just to read I/O bitmap. */ const bool user_mode = !(v->arch.flags & TF_kernel_mode); if ( iopl_ok(v, regs) ) return true; if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) ) { union { uint8_t bytes[2]; uint16_t mask; } x; /* * Grab permission bytes from guest space. Inaccessible bytes are * read as 0xff (no access allowed). */ if ( user_mode ) toggle_guest_pt(v); switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp, port>>3, 2) ) { default: x.bytes[0] = ~0; /* fallthrough */ case 1: x.bytes[1] = ~0; /* fallthrough */ case 0: break; } if ( user_mode ) toggle_guest_pt(v); if ( (x.mask & (((1 << bytes) - 1) << (port & 7))) == 0 ) return true; } return false; } /* Has the administrator granted sufficient permission for this I/O access? */ static bool admin_io_okay(unsigned int port, unsigned int bytes, const struct domain *d) { /* * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses. * We never permit direct access to that register. */ if ( (port == 0xcf8) && (bytes == 4) ) return false; /* We also never permit direct access to the RTC/CMOS registers. */ if ( ((port & ~1) == RTC_PORT(0)) ) return false; return ioports_access_permitted(d, port, port + bytes - 1); } static bool pci_cfg_ok(struct domain *currd, unsigned int start, unsigned int size, uint32_t *write) { uint32_t machine_bdf; if ( !is_hardware_domain(currd) ) return false; if ( !CF8_ENABLED(currd->arch.pci_cf8) ) return true; machine_bdf = CF8_BDF(currd->arch.pci_cf8); if ( write ) { const unsigned long *ro_map = pci_get_ro_map(0); if ( ro_map && test_bit(machine_bdf, ro_map) ) return false; } start |= CF8_ADDR_LO(currd->arch.pci_cf8); /* AMD extended configuration space access? */ if ( CF8_ADDR_HI(currd->arch.pci_cf8) && boot_cpu_data.x86_vendor == X86_VENDOR_AMD && boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 ) { uint64_t msr_val; if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) ) return false; if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) ) start |= CF8_ADDR_HI(currd->arch.pci_cf8); } return !write ? xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf, start, start + size - 1, 0) == 0 : pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0; } uint32_t guest_io_read(unsigned int port, unsigned int bytes, struct domain *currd) { uint32_t data = 0; unsigned int shift = 0; if ( admin_io_okay(port, bytes, currd) ) { switch ( bytes ) { case 1: return inb(port); case 2: return inw(port); case 4: return inl(port); } } while ( bytes != 0 ) { unsigned int size = 1; uint32_t sub_data = ~0; if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) { sub_data = pv_pit_handler(port, 0, 0); } else if ( port == RTC_PORT(0) ) { sub_data = currd->arch.cmos_idx; } else if ( (port == RTC_PORT(1)) && ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) ) { unsigned long flags; spin_lock_irqsave(&rtc_lock, flags); outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0)); sub_data = inb(RTC_PORT(1)); spin_unlock_irqrestore(&rtc_lock, flags); } else if ( (port == 0xcf8) && (bytes == 4) ) { size = 4; sub_data = currd->arch.pci_cf8; } else if ( (port & 0xfffc) == 0xcfc ) { size = min(bytes, 4 - (port & 3)); if ( size == 3 ) size = 2; if ( pci_cfg_ok(currd, port & 3, size, NULL) ) sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size); } if ( size == 4 ) return sub_data; data |= (sub_data & ((1u << (size * 8)) - 1)) << shift; shift += size * 8; port += size; bytes -= size; } return data; } static unsigned int check_guest_io_breakpoint(struct vcpu *v, unsigned int port, unsigned int len) { unsigned int width, i, match = 0; unsigned long start; if ( !(v->arch.debugreg[5]) || !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) ) return 0; for ( i = 0; i < 4; i++ ) { if ( !(v->arch.debugreg[5] & (3 << (i * DR_ENABLE_SIZE))) ) continue; start = v->arch.debugreg[i]; width = 0; switch ( (v->arch.debugreg[7] >> (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc ) { case DR_LEN_1: width = 1; break; case DR_LEN_2: width = 2; break; case DR_LEN_4: width = 4; break; case DR_LEN_8: width = 8; break; } if ( (start < (port + len)) && ((start + width) > port) ) match |= 1u << i; } return match; } static int read_io(unsigned int port, unsigned int bytes, unsigned long *val, struct x86_emulate_ctxt *ctxt) { struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); struct vcpu *curr = current; struct domain *currd = current->domain; /* INS must not come here. */ ASSERT((ctxt->opcode & ~9) == 0xe4); if ( !guest_io_okay(port, bytes, curr, ctxt->regs) ) return X86EMUL_UNHANDLEABLE; poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes); if ( admin_io_okay(port, bytes, currd) ) { io_emul_stub_t *io_emul = io_emul_stub_setup(poc, ctxt->opcode, port, bytes); mark_regs_dirty(ctxt->regs); io_emul(ctxt->regs); return X86EMUL_DONE; } *val = guest_io_read(port, bytes, currd); return X86EMUL_OKAY; } void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data, struct domain *currd) { if ( admin_io_okay(port, bytes, currd) ) { switch ( bytes ) { case 1: outb((uint8_t)data, port); if ( pv_post_outb_hook ) pv_post_outb_hook(port, (uint8_t)data); break; case 2: outw((uint16_t)data, port); break; case 4: outl(data, port); break; } return; } while ( bytes != 0 ) { unsigned int size = 1; if ( (port == 0x42) || (port == 0x43) || (port == 0x61) ) { pv_pit_handler(port, (uint8_t)data, 1); } else if ( port == RTC_PORT(0) ) { currd->arch.cmos_idx = data; } else if ( (port == RTC_PORT(1)) && ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) ) { unsigned long flags; if ( pv_rtc_handler ) pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data); spin_lock_irqsave(&rtc_lock, flags); outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0)); outb(data, RTC_PORT(1)); spin_unlock_irqrestore(&rtc_lock, flags); } else if ( (port == 0xcf8) && (bytes == 4) ) { size = 4; currd->arch.pci_cf8 = data; } else if ( (port & 0xfffc) == 0xcfc ) { size = min(bytes, 4 - (port & 3)); if ( size == 3 ) size = 2; if ( pci_cfg_ok(currd, port & 3, size, &data) ) pci_conf_write(currd->arch.pci_cf8, port & 3, size, data); } if ( size == 4 ) return; port += size; bytes -= size; data >>= size * 8; } } static int write_io(unsigned int port, unsigned int bytes, unsigned long val, struct x86_emulate_ctxt *ctxt) { struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); struct vcpu *curr = current; struct domain *currd = current->domain; /* OUTS must not come here. */ ASSERT((ctxt->opcode & ~9) == 0xe6); if ( !guest_io_okay(port, bytes, curr, ctxt->regs) ) return X86EMUL_UNHANDLEABLE; poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes); if ( admin_io_okay(port, bytes, currd) ) { io_emul_stub_t *io_emul = io_emul_stub_setup(poc, ctxt->opcode, port, bytes); mark_regs_dirty(ctxt->regs); io_emul(ctxt->regs); if ( (bytes == 1) && pv_post_outb_hook ) pv_post_outb_hook(port, val); return X86EMUL_DONE; } guest_io_write(port, bytes, val, currd); return X86EMUL_OKAY; } static int read_segment(enum x86_segment seg, struct segment_register *reg, struct x86_emulate_ctxt *ctxt) { /* Check if this is an attempt to access the I/O bitmap. */ if ( seg == x86_seg_tr ) { switch ( ctxt->opcode ) { case 0x6c ... 0x6f: /* ins / outs */ case 0xe4 ... 0xe7: /* in / out (immediate port) */ case 0xec ... 0xef: /* in / out (port in %dx) */ /* Defer the check to priv_op_{read,write}_io(). */ return X86EMUL_DONE; } } if ( ctxt->addr_size < 64 ) { unsigned long limit; unsigned int sel, ar; switch ( seg ) { case x86_seg_cs: sel = ctxt->regs->cs; break; case x86_seg_ds: sel = read_sreg(ds); break; case x86_seg_es: sel = read_sreg(es); break; case x86_seg_fs: sel = read_sreg(fs); break; case x86_seg_gs: sel = read_sreg(gs); break; case x86_seg_ss: sel = ctxt->regs->ss; break; default: return X86EMUL_UNHANDLEABLE; } if ( !pv_emul_read_descriptor(sel, current, ®->base, &limit, &ar, 0) ) return X86EMUL_UNHANDLEABLE; reg->limit = limit; reg->attr = ar >> 8; } else { switch ( seg ) { default: if ( !is_x86_user_segment(seg) ) return X86EMUL_UNHANDLEABLE; reg->base = 0; break; case x86_seg_fs: reg->base = rdfsbase(); break; case x86_seg_gs: reg->base = rdgsbase(); break; } reg->limit = ~0U; reg->attr = 0; reg->type = _SEGMENT_WR >> 8; if ( seg == x86_seg_cs ) { reg->type |= _SEGMENT_CODE >> 8; reg->l = 1; } else reg->db = 1; reg->s = 1; reg->dpl = 3; reg->p = 1; reg->g = 1; } /* * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero. * Also do this for consistency for non-conforming code segments. */ if ( (seg == x86_seg_ss || (seg == x86_seg_cs && !(reg->type & (_SEGMENT_EC >> 8)))) && guest_kernel_mode(current, ctxt->regs) ) reg->dpl = 0; return X86EMUL_OKAY; } static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset, unsigned int bytes, unsigned long limit, enum x86_segment seg, struct x86_emulate_ctxt *ctxt, unsigned long *addr) { int rc = X86EMUL_OKAY; *addr = base + offset; if ( ctxt->addr_size < 64 ) { if ( limit < bytes - 1 || offset > limit - bytes + 1 ) rc = X86EMUL_EXCEPTION; *addr = (uint32_t)*addr; } else if ( !__addr_ok(*addr) ) rc = X86EMUL_EXCEPTION; if ( unlikely(rc == X86EMUL_EXCEPTION) ) x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error, 0, ctxt); return rc; } static int rep_ins(uint16_t port, enum x86_segment seg, unsigned long offset, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt) { struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); struct vcpu *curr = current; struct domain *currd = current->domain; unsigned long goal = *reps; struct segment_register sreg; int rc; ASSERT(seg == x86_seg_es); *reps = 0; if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) ) return X86EMUL_UNHANDLEABLE; rc = read_segment(x86_seg_es, &sreg, ctxt); if ( rc != X86EMUL_OKAY ) return rc; if ( !sreg.p ) return X86EMUL_UNHANDLEABLE; if ( !sreg.s || (sreg.type & (_SEGMENT_CODE >> 8)) || !(sreg.type & (_SEGMENT_WR >> 8)) ) { x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); return X86EMUL_EXCEPTION; } poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep); while ( *reps < goal ) { unsigned int data = guest_io_read(port, bytes_per_rep, currd); unsigned long addr; rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit, x86_seg_es, ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 ) { x86_emul_pagefault(PFEC_write_access, addr + bytes_per_rep - rc, ctxt); return X86EMUL_EXCEPTION; } ++*reps; if ( poc->bpmatch || hypercall_preempt_check() ) break; /* x86_emulate() clips the repetition count to ensure we don't wrap. */ if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) ) offset -= bytes_per_rep; else offset += bytes_per_rep; } return X86EMUL_OKAY; } static int rep_outs(enum x86_segment seg, unsigned long offset, uint16_t port, unsigned int bytes_per_rep, unsigned long *reps, struct x86_emulate_ctxt *ctxt) { struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); struct vcpu *curr = current; struct domain *currd = current->domain; unsigned long goal = *reps; struct segment_register sreg; int rc; *reps = 0; if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) ) return X86EMUL_UNHANDLEABLE; rc = read_segment(seg, &sreg, ctxt); if ( rc != X86EMUL_OKAY ) return rc; if ( !sreg.p ) return X86EMUL_UNHANDLEABLE; if ( !sreg.s || ((sreg.type & (_SEGMENT_CODE >> 8)) && !(sreg.type & (_SEGMENT_WR >> 8))) ) { x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault : TRAP_stack_error, 0, ctxt); return X86EMUL_EXCEPTION; } poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep); while ( *reps < goal ) { unsigned int data = 0; unsigned long addr; rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep, sreg.limit, seg, ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 ) { x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt); return X86EMUL_EXCEPTION; } guest_io_write(port, bytes_per_rep, data, currd); ++*reps; if ( poc->bpmatch || hypercall_preempt_check() ) break; /* x86_emulate() clips the repetition count to ensure we don't wrap. */ if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) ) offset -= bytes_per_rep; else offset += bytes_per_rep; } return X86EMUL_OKAY; } static int read_cr(unsigned int reg, unsigned long *val, struct x86_emulate_ctxt *ctxt) { const struct vcpu *curr = current; switch ( reg ) { case 0: /* Read CR0 */ *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0]; return X86EMUL_OKAY; case 2: /* Read CR2 */ case 4: /* Read CR4 */ *val = curr->arch.pv_vcpu.ctrlreg[reg]; return X86EMUL_OKAY; case 3: /* Read CR3 */ { const struct domain *currd = curr->domain; mfn_t mfn; if ( !is_pv_32bit_domain(currd) ) { mfn = pagetable_get_mfn(curr->arch.guest_table); *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn))); } else { l4_pgentry_t *pl4e = map_domain_page(pagetable_get_mfn(curr->arch.guest_table)); mfn = l4e_get_mfn(*pl4e); unmap_domain_page(pl4e); *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn))); } /* PTs should not be shared */ BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow); return X86EMUL_OKAY; } } return X86EMUL_UNHANDLEABLE; } static int write_cr(unsigned int reg, unsigned long val, struct x86_emulate_ctxt *ctxt) { struct vcpu *curr = current; switch ( reg ) { case 0: /* Write CR0 */ if ( (val ^ read_cr0()) & ~X86_CR0_TS ) { gdprintk(XENLOG_WARNING, "Attempt to change unmodifiable CR0 flags\n"); break; } do_fpu_taskswitch(!!(val & X86_CR0_TS)); return X86EMUL_OKAY; case 2: /* Write CR2 */ curr->arch.pv_vcpu.ctrlreg[2] = val; arch_set_cr2(curr, val); return X86EMUL_OKAY; case 3: /* Write CR3 */ { struct domain *currd = curr->domain; unsigned long gfn; struct page_info *page; int rc; gfn = !is_pv_32bit_domain(currd) ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val); page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC); if ( !page ) break; rc = new_guest_cr3(page_to_mfn(page)); put_page(page); switch ( rc ) { case 0: return X86EMUL_OKAY; case -ERESTART: /* retry after preemption */ return X86EMUL_RETRY; } break; } case 4: /* Write CR4 */ curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val); write_cr4(pv_guest_cr4_to_real_cr4(curr)); ctxt_switch_levelling(curr); return X86EMUL_OKAY; } return X86EMUL_UNHANDLEABLE; } static int read_dr(unsigned int reg, unsigned long *val, struct x86_emulate_ctxt *ctxt) { unsigned long res = do_get_debugreg(reg); if ( IS_ERR_VALUE(res) ) return X86EMUL_UNHANDLEABLE; *val = res; return X86EMUL_OKAY; } static int write_dr(unsigned int reg, unsigned long val, struct x86_emulate_ctxt *ctxt) { return do_set_debugreg(reg, val) == 0 ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE; } static inline uint64_t guest_misc_enable(uint64_t val) { val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL | MSR_IA32_MISC_ENABLE_MONITOR_ENABLE); val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL | MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL | MSR_IA32_MISC_ENABLE_XTPR_DISABLE; return val; } static inline bool is_cpufreq_controller(const struct domain *d) { return ((cpufreq_controller == FREQCTL_dom0_kernel) && is_hardware_domain(d)); } static int read_msr(unsigned int reg, uint64_t *val, struct x86_emulate_ctxt *ctxt) { struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); const struct vcpu *curr = current; const struct domain *currd = curr->domain; bool vpmu_msr = false; int ret; if ( (ret = guest_rdmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE ) { if ( ret == X86EMUL_EXCEPTION ) x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); return ret; } switch ( reg ) { int rc; case MSR_FS_BASE: if ( is_pv_32bit_domain(currd) ) break; *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base; return X86EMUL_OKAY; case MSR_GS_BASE: if ( is_pv_32bit_domain(currd) ) break; *val = cpu_has_fsgsbase ? __rdgsbase() : curr->arch.pv_vcpu.gs_base_kernel; return X86EMUL_OKAY; case MSR_SHADOW_GS_BASE: if ( is_pv_32bit_domain(currd) ) break; *val = curr->arch.pv_vcpu.gs_base_user; return X86EMUL_OKAY; /* * In order to fully retain original behavior, defer calling * pv_soft_rdtsc() until after emulation. This may want/need to be * reconsidered. */ case MSR_IA32_TSC: poc->tsc |= TSC_BASE; goto normal; case MSR_TSC_AUX: poc->tsc |= TSC_AUX; if ( cpu_has_rdtscp ) goto normal; *val = 0; return X86EMUL_OKAY; case MSR_EFER: *val = read_efer(); if ( is_pv_32bit_domain(currd) ) *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE); return X86EMUL_OKAY; case MSR_K7_FID_VID_CTL: case MSR_K7_FID_VID_STATUS: case MSR_K8_PSTATE_LIMIT: case MSR_K8_PSTATE_CTRL: case MSR_K8_PSTATE_STATUS: case MSR_K8_PSTATE0: case MSR_K8_PSTATE1: case MSR_K8_PSTATE2: case MSR_K8_PSTATE3: case MSR_K8_PSTATE4: case MSR_K8_PSTATE5: case MSR_K8_PSTATE6: case MSR_K8_PSTATE7: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) break; if ( unlikely(is_cpufreq_controller(currd)) ) goto normal; *val = 0; return X86EMUL_OKAY; case MSR_IA32_UCODE_REV: BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL); if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) { if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) ) break; /* As documented in the SDM: Do a CPUID 1 here */ cpuid_eax(1); } goto normal; case MSR_IA32_MISC_ENABLE: if ( rdmsr_safe(reg, *val) ) break; *val = guest_misc_enable(*val); return X86EMUL_OKAY; case MSR_AMD64_DR0_ADDRESS_MASK: if ( !boot_cpu_has(X86_FEATURE_DBEXT) ) break; *val = curr->arch.pv_vcpu.dr_mask[0]; return X86EMUL_OKAY; case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK: if ( !boot_cpu_has(X86_FEATURE_DBEXT) ) break; *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1]; return X86EMUL_OKAY; case MSR_IA32_PERF_CAPABILITIES: /* No extra capabilities are supported. */ *val = 0; return X86EMUL_OKAY; case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7): case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3): case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2: case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL: if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) { vpmu_msr = true; /* fall through */ case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5: case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) { if ( vpmu_do_rdmsr(reg, val) ) break; return X86EMUL_OKAY; } } /* fall through */ default: if ( rdmsr_hypervisor_regs(reg, val) ) return X86EMUL_OKAY; rc = vmce_rdmsr(reg, val); if ( rc < 0 ) break; if ( rc ) return X86EMUL_OKAY; /* fall through */ normal: /* Everyone can read the MSR space. */ /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */ if ( rdmsr_safe(reg, *val) ) break; return X86EMUL_OKAY; } return X86EMUL_UNHANDLEABLE; } static int write_msr(unsigned int reg, uint64_t val, struct x86_emulate_ctxt *ctxt) { struct vcpu *curr = current; const struct domain *currd = curr->domain; bool vpmu_msr = false; int ret; if ( (ret = guest_wrmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE ) { if ( ret == X86EMUL_EXCEPTION ) x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt); return ret; } switch ( reg ) { uint64_t temp; int rc; case MSR_FS_BASE: if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) break; wrfsbase(val); curr->arch.pv_vcpu.fs_base = val; return X86EMUL_OKAY; case MSR_GS_BASE: if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) break; wrgsbase(val); curr->arch.pv_vcpu.gs_base_kernel = val; return X86EMUL_OKAY; case MSR_SHADOW_GS_BASE: if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) ) break; wrmsrl(MSR_SHADOW_GS_BASE, val); curr->arch.pv_vcpu.gs_base_user = val; return X86EMUL_OKAY; case MSR_K7_FID_VID_STATUS: case MSR_K7_FID_VID_CTL: case MSR_K8_PSTATE_LIMIT: case MSR_K8_PSTATE_CTRL: case MSR_K8_PSTATE_STATUS: case MSR_K8_PSTATE0: case MSR_K8_PSTATE1: case MSR_K8_PSTATE2: case MSR_K8_PSTATE3: case MSR_K8_PSTATE4: case MSR_K8_PSTATE5: case MSR_K8_PSTATE6: case MSR_K8_PSTATE7: case MSR_K8_HWCR: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ) break; if ( likely(!is_cpufreq_controller(currd)) || wrmsr_safe(reg, val) == 0 ) return X86EMUL_OKAY; break; case MSR_AMD64_NB_CFG: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) break; if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) return X86EMUL_OKAY; if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) || ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) ) goto invalid; if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 ) return X86EMUL_OKAY; break; case MSR_FAM10H_MMIO_CONF_BASE: if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD || boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 ) break; if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) return X86EMUL_OKAY; if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 ) break; if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ? temp != val : ((temp ^ val) & ~(FAM10H_MMIO_CONF_ENABLE | (FAM10H_MMIO_CONF_BUSRANGE_MASK << FAM10H_MMIO_CONF_BUSRANGE_SHIFT) | ((u64)FAM10H_MMIO_CONF_BASE_MASK << FAM10H_MMIO_CONF_BASE_SHIFT))) ) goto invalid; if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 ) return X86EMUL_OKAY; break; case MSR_IA32_UCODE_REV: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) break; if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ) return X86EMUL_OKAY; if ( rdmsr_safe(reg, temp) ) break; if ( val ) goto invalid; return X86EMUL_OKAY; case MSR_IA32_MISC_ENABLE: if ( rdmsr_safe(reg, temp) ) break; if ( val != guest_misc_enable(temp) ) goto invalid; return X86EMUL_OKAY; case MSR_IA32_MPERF: case MSR_IA32_APERF: if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) && (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) ) break; if ( likely(!is_cpufreq_controller(currd)) || wrmsr_safe(reg, val) == 0 ) return X86EMUL_OKAY; break; case MSR_IA32_PERF_CTL: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) break; if ( likely(!is_cpufreq_controller(currd)) || wrmsr_safe(reg, val) == 0 ) return X86EMUL_OKAY; break; case MSR_IA32_THERM_CONTROL: case MSR_IA32_ENERGY_PERF_BIAS: if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL ) break; if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) || wrmsr_safe(reg, val) == 0 ) return X86EMUL_OKAY; break; case MSR_AMD64_DR0_ADDRESS_MASK: if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) ) break; curr->arch.pv_vcpu.dr_mask[0] = val; if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK ) wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val); return X86EMUL_OKAY; case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK: if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) ) break; curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val; if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK ) wrmsrl(reg, val); return X86EMUL_OKAY; case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7): case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3): case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2: case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL: if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL ) { vpmu_msr = true; case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5: case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3: if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) ) { if ( (vpmu_mode & XENPMU_MODE_ALL) && !is_hardware_domain(currd) ) return X86EMUL_OKAY; if ( vpmu_do_wrmsr(reg, val, 0) ) break; return X86EMUL_OKAY; } } /* fall through */ default: if ( wrmsr_hypervisor_regs(reg, val) == 1 ) return X86EMUL_OKAY; rc = vmce_wrmsr(reg, val); if ( rc < 0 ) break; if ( rc ) return X86EMUL_OKAY; if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) ) invalid: gdprintk(XENLOG_WARNING, "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n", reg, temp, val); return X86EMUL_OKAY; } return X86EMUL_UNHANDLEABLE; } /* Name it differently to avoid clashing with wbinvd() */ static int _wbinvd(struct x86_emulate_ctxt *ctxt) { /* Ignore the instruction if unprivileged. */ if ( !cache_flush_permitted(current->domain) ) /* * Non-physdev domain attempted WBINVD; ignore for now since * newer linux uses this in some start-of-day timing loops. */ ; else wbinvd(); return X86EMUL_OKAY; } int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf, struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt) { guest_cpuid(current, leaf, subleaf, res); return X86EMUL_OKAY; } static int validate(const struct x86_emulate_state *state, struct x86_emulate_ctxt *ctxt) { switch ( ctxt->opcode ) { case 0x6c ... 0x6f: /* ins / outs */ case 0xe4 ... 0xe7: /* in / out (immediate port) */ case 0xec ... 0xef: /* in / out (port in %dx) */ case X86EMUL_OPC(0x0f, 0x06): /* clts */ case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */ case X86EMUL_OPC(0x0f, 0x20) ... X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */ case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */ case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */ case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */ case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */ return X86EMUL_OKAY; case 0xfa: case 0xfb: /* cli / sti */ if ( !iopl_ok(current, ctxt->regs) ) break; /* * This is just too dangerous to allow, in my opinion. Consider if the * caller then tries to reenable interrupts using POPF: we can't trap * that and we'll end up with hard-to-debug lockups. Fast & loose will * do for us. :-) vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa); */ return X86EMUL_DONE; case X86EMUL_OPC(0x0f, 0x01): { unsigned int modrm_rm, modrm_reg; if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 || (modrm_rm & 7) != 1 ) break; switch ( modrm_reg & 7 ) { case 2: /* xsetbv */ case 7: /* rdtscp */ return X86EMUL_OKAY; } break; } } return X86EMUL_UNHANDLEABLE; } static int insn_fetch(enum x86_segment seg, unsigned long offset, void *p_data, unsigned int bytes, struct x86_emulate_ctxt *ctxt) { const struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt); unsigned int rc; unsigned long addr = poc->cs.base + offset; ASSERT(seg == x86_seg_cs); /* We don't mean to emulate any branches. */ if ( !bytes ) return X86EMUL_UNHANDLEABLE; rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit, x86_seg_cs, ctxt, &addr); if ( rc != X86EMUL_OKAY ) return rc; if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 ) { /* * TODO: This should report PFEC_insn_fetch when goc->insn_fetch && * cpu_has_nx, but we'd then need a "fetch" variant of * __copy_from_user() respecting NX, SMEP, and protection keys. */ x86_emul_pagefault(0, addr + bytes - rc, ctxt); return X86EMUL_EXCEPTION; } return X86EMUL_OKAY; } static const struct x86_emulate_ops priv_op_ops = { .insn_fetch = insn_fetch, .read = x86emul_unhandleable_rw, .validate = validate, .read_io = read_io, .write_io = write_io, .rep_ins = rep_ins, .rep_outs = rep_outs, .read_segment = read_segment, .read_cr = read_cr, .write_cr = write_cr, .read_dr = read_dr, .write_dr = write_dr, .read_msr = read_msr, .write_msr = write_msr, .cpuid = pv_emul_cpuid, .wbinvd = _wbinvd, }; int pv_emulate_privileged_op(struct cpu_user_regs *regs) { struct vcpu *curr = current; struct domain *currd = curr->domain; struct priv_op_ctxt ctxt = { .ctxt.regs = regs, .ctxt.vendor = currd->arch.cpuid->x86_vendor, .ctxt.lma = !is_pv_32bit_domain(currd), }; int rc; unsigned int eflags, ar; if ( !pv_emul_read_descriptor(regs->cs, curr, &ctxt.cs.base, &ctxt.cs.limit, &ar, 1) || !(ar & _SEGMENT_S) || !(ar & _SEGMENT_P) || !(ar & _SEGMENT_CODE) ) return 0; /* Mirror virtualized state into EFLAGS. */ ASSERT(regs->eflags & X86_EFLAGS_IF); if ( vcpu_info(curr, evtchn_upcall_mask) ) regs->eflags &= ~X86_EFLAGS_IF; else regs->eflags |= X86_EFLAGS_IF; ASSERT(!(regs->eflags & X86_EFLAGS_IOPL)); regs->eflags |= curr->arch.pv_vcpu.iopl; eflags = regs->eflags; ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16; /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */ rc = x86_emulate(&ctxt.ctxt, &priv_op_ops); if ( ctxt.io_emul_stub ) unmap_domain_page(ctxt.io_emul_stub); /* * Un-mirror virtualized state from EFLAGS. * Nothing we allow to be emulated can change anything other than the * arithmetic bits, and the resume flag. */ ASSERT(!((regs->eflags ^ eflags) & ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK))); regs->eflags |= X86_EFLAGS_IF; regs->eflags &= ~X86_EFLAGS_IOPL; switch ( rc ) { case X86EMUL_OKAY: if ( ctxt.tsc & TSC_BASE ) { if ( ctxt.tsc & TSC_AUX ) pv_soft_rdtsc(curr, regs, 1); else if ( currd->arch.vtsc ) pv_soft_rdtsc(curr, regs, 0); else msr_split(regs, rdtsc()); } if ( ctxt.ctxt.retire.singlestep ) ctxt.bpmatch |= DR_STEP; if ( ctxt.bpmatch ) { curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE; if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) ) pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC); } /* fall through */ case X86EMUL_RETRY: return EXCRET_fault_fixed; case X86EMUL_EXCEPTION: pv_inject_event(&ctxt.ctxt.event); return EXCRET_fault_fixed; } return 0; } /* * Local variables: * mode: C * c-file-style: "BSD" * c-basic-offset: 4 * tab-width: 4 * indent-tabs-mode: nil * End: */