/******************************************************************************
 * arch/x86/pv/emul-priv-op.c
 *
 * Emulate privileged instructions for PV guests
 *
 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

#include <xen/errno.h>
#include <xen/event.h>
#include <xen/guest_access.h>
#include <xen/iocap.h>
#include <xen/spinlock.h>
#include <xen/trace.h>

#include <asm/apic.h>
#include <asm/debugreg.h>
#include <asm/hpet.h>
#include <asm/hypercall.h>
#include <asm/mc146818rtc.h>
#include <asm/p2m.h>
#include <asm/pv/traps.h>
#include <asm/shared.h>
#include <asm/traps.h>
#include <asm/x86_emulate.h>

#include <xsm/xsm.h>

#include "../x86_64/mmconfig.h"
#include "emulate.h"
#include "mm.h"

/* Override macros from asm/page.h to make them work with mfn_t */
#undef mfn_to_page
#define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
#undef page_to_mfn
#define page_to_mfn(pg) _mfn(__page_to_mfn(pg))

/***********************
 * I/O emulation support
 */

struct priv_op_ctxt {
    struct x86_emulate_ctxt ctxt;
    struct {
        unsigned long base, limit;
    } cs;
    char *io_emul_stub;
    unsigned int bpmatch;
    unsigned int tsc;
#define TSC_BASE 1
#define TSC_AUX 2
};

/* I/O emulation support. Helper routines for, and type of, the stack stub. */
void host_to_guest_gpr_switch(struct cpu_user_regs *);
unsigned long guest_to_host_gpr_switch(unsigned long);

void (*pv_post_outb_hook)(unsigned int port, u8 value);

typedef void io_emul_stub_t(struct cpu_user_regs *);

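/*
 * The stub is built on the per-CPU stub page and performs the port access
 * with the guest's GPR state in place:
 *   movabs $host_to_guest_gpr_switch, %rcx
 *   callq  *%rcx
 *   [data16] <in/out opcode> [imm8 port]
 *   ret                          ; returns via guest_to_host_gpr_switch
 */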
static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
                                          unsigned int port, unsigned int bytes)
{
    if ( !ctxt->io_emul_stub )
        ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
                             (this_cpu(stubs.addr) &
                              ~PAGE_MASK) +
                             STUB_BUF_SIZE / 2;

    /* movq $host_to_guest_gpr_switch,%rcx */
    ctxt->io_emul_stub[0] = 0x48;
    ctxt->io_emul_stub[1] = 0xb9;
    *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
    /* callq *%rcx */
    ctxt->io_emul_stub[10] = 0xff;
    ctxt->io_emul_stub[11] = 0xd1;
    /* data16 or nop */
    ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
    /* <io-access opcode> */
    ctxt->io_emul_stub[13] = opcode;
    /* imm8 or nop */
    ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
    /* ret (jumps to guest_to_host_gpr_switch) */
    ctxt->io_emul_stub[15] = 0xc3;
    BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);

    if ( ioemul_handle_quirk )
        ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);

    /* Handy function-typed pointer to the stub. */
    return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
}


/* Perform IOPL check between the vcpu's shadowed IOPL, and the assumed cpl. */
static bool iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
{
    unsigned int cpl = guest_kernel_mode(v, regs) ?
        (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;

    ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);

    return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
}

/* Has the guest requested sufficient permission for this I/O access? */
static bool guest_io_okay(unsigned int port, unsigned int bytes,
                          struct vcpu *v, struct cpu_user_regs *regs)
{
    /* If in user mode, switch to kernel mode just to read I/O bitmap. */
    const bool user_mode = !(v->arch.flags & TF_kernel_mode);

    if ( iopl_ok(v, regs) )
        return true;

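    /*
     * IOPL doesn't cover this access; consult the guest-provided I/O
     * bitmap.  Two bytes are fetched since an access of up to four ports
     * may straddle a byte boundary in the bitmap.
     */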
    if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
    {
        union { uint8_t bytes[2]; uint16_t mask; } x;

        /*
         * Grab permission bytes from guest space. Inaccessible bytes are
         * read as 0xff (no access allowed).
         */
        if ( user_mode )
            toggle_guest_pt(v);

        switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
                                          port>>3, 2) )
        {
        default: x.bytes[0] = ~0;
            /* fallthrough */
        case 1:  x.bytes[1] = ~0;
            /* fallthrough */
        case 0:  break;
        }

        if ( user_mode )
            toggle_guest_pt(v);

        if ( (x.mask & (((1 << bytes) - 1) << (port & 7))) == 0 )
            return true;
    }

    return false;
}

/* Has the administrator granted sufficient permission for this I/O access? */
static bool admin_io_okay(unsigned int port, unsigned int bytes,
                          const struct domain *d)
{
    /*
     * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
     * We never permit direct access to that register.
     */
    if ( (port == 0xcf8) && (bytes == 4) )
        return false;

    /* We also never permit direct access to the RTC/CMOS registers. */
    if ( ((port & ~1) == RTC_PORT(0)) )
        return false;

    return ioports_access_permitted(d, port, port + bytes - 1);
}

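/*
 * Validate a PCI config space access made via the 0xcf8/0xcfc mechanism:
 * only the hardware domain may perform one, writes to devices marked
 * read-only are refused, and AMD extended configuration space addressing
 * (CF8_ADDR_HI) is only honoured when enabled in MSR_AMD64_NB_CFG.
 */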
static bool pci_cfg_ok(struct domain *currd, unsigned int start,
                       unsigned int size, uint32_t *write)
{
    uint32_t machine_bdf;

    if ( !is_hardware_domain(currd) )
        return false;

    if ( !CF8_ENABLED(currd->arch.pci_cf8) )
        return true;

    machine_bdf = CF8_BDF(currd->arch.pci_cf8);
    if ( write )
    {
        const unsigned long *ro_map = pci_get_ro_map(0);

        if ( ro_map && test_bit(machine_bdf, ro_map) )
            return false;
    }
    start |= CF8_ADDR_LO(currd->arch.pci_cf8);
    /* AMD extended configuration space access? */
    if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
         boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
         boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
    {
        uint64_t msr_val;

        if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
            return false;
        if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
            start |= CF8_ADDR_HI(currd->arch.pci_cf8);
    }

    return !write ?
           xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
                                     start, start + size - 1, 0) == 0 :
           pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
}

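/*
 * Emulate a guest port read.  Ports the domain may access directly are
 * read from hardware; otherwise the PIT, RTC/CMOS and PCI config cycles
 * are virtualised and everything else reads as all ones.  Wide accesses
 * are assembled from individually emulated sub-accesses.
 */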
uint32_t guest_io_read(unsigned int port, unsigned int bytes,
                       struct domain *currd)
{
    uint32_t data = 0;
    unsigned int shift = 0;

    if ( admin_io_okay(port, bytes, currd) )
    {
        switch ( bytes )
        {
        case 1: return inb(port);
        case 2: return inw(port);
        case 4: return inl(port);
        }
    }

    while ( bytes != 0 )
    {
        unsigned int size = 1;
        uint32_t sub_data = ~0;

        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
        {
            sub_data = pv_pit_handler(port, 0, 0);
        }
        else if ( port == RTC_PORT(0) )
        {
            sub_data = currd->arch.cmos_idx;
        }
        else if ( (port == RTC_PORT(1)) &&
                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
        {
            unsigned long flags;

            spin_lock_irqsave(&rtc_lock, flags);
            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
            sub_data = inb(RTC_PORT(1));
            spin_unlock_irqrestore(&rtc_lock, flags);
        }
        else if ( (port == 0xcf8) && (bytes == 4) )
        {
            size = 4;
            sub_data = currd->arch.pci_cf8;
        }
        else if ( (port & 0xfffc) == 0xcfc )
        {
            size = min(bytes, 4 - (port & 3));
            if ( size == 3 )
                size = 2;
            if ( pci_cfg_ok(currd, port & 3, size, NULL) )
                sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
        }

        if ( size == 4 )
            return sub_data;

        data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
        shift += size * 8;
        port += size;
        bytes -= size;
    }

    return data;
}

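/*
 * Return a mask (in DR6 B0-B3 bit positions) of the guest's enabled I/O
 * breakpoints which overlap the accessed port range.  I/O breakpoints are
 * only active when CR4.DE is set.
 */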
static unsigned int check_guest_io_breakpoint(struct vcpu *v,
                                              unsigned int port,
                                              unsigned int len)
{
    unsigned int width, i, match = 0;
    unsigned long start;

    if ( !(v->arch.debugreg[5]) ||
         !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
        return 0;

    for ( i = 0; i < 4; i++ )
    {
        if ( !(v->arch.debugreg[5] &
               (3 << (i * DR_ENABLE_SIZE))) )
            continue;

        start = v->arch.debugreg[i];
        width = 0;

        switch ( (v->arch.debugreg[7] >>
                  (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
        {
        case DR_LEN_1: width = 1; break;
        case DR_LEN_2: width = 2; break;
        case DR_LEN_4: width = 4; break;
        case DR_LEN_8: width = 8; break;
        }

        if ( (start < (port + len)) && ((start + width) > port) )
            match |= 1u << i;
    }

    return match;
}

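/*
 * IN emulation (the INS forms are handled by rep_ins() instead).  When the
 * domain may access the port directly, the access is performed from a stub
 * with the guest's GPR state in place, and X86EMUL_DONE is returned since
 * the register image has already been updated.
 */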
static int read_io(unsigned int port, unsigned int bytes,
                   unsigned long *val, struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;

    /* INS must not come here. */
    ASSERT((ctxt->opcode & ~9) == 0xe4);

    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);

    if ( admin_io_okay(port, bytes, currd) )
    {
        io_emul_stub_t *io_emul =
            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);

        mark_regs_dirty(ctxt->regs);
        io_emul(ctxt->regs);
        return X86EMUL_DONE;
    }

    *val = guest_io_read(port, bytes, currd);

    return X86EMUL_OKAY;
}

void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
                    struct domain *currd)
{
    if ( admin_io_okay(port, bytes, currd) )
    {
        switch ( bytes )
        {
        case 1:
            outb((uint8_t)data, port);
            if ( pv_post_outb_hook )
                pv_post_outb_hook(port, (uint8_t)data);
            break;
        case 2:
            outw((uint16_t)data, port);
            break;
        case 4:
            outl(data, port);
            break;
        }
        return;
    }

    while ( bytes != 0 )
    {
        unsigned int size = 1;

        if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
        {
            pv_pit_handler(port, (uint8_t)data, 1);
        }
        else if ( port == RTC_PORT(0) )
        {
            currd->arch.cmos_idx = data;
        }
        else if ( (port == RTC_PORT(1)) &&
                  ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
        {
            unsigned long flags;

            if ( pv_rtc_handler )
                pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
            spin_lock_irqsave(&rtc_lock, flags);
            outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
            outb(data, RTC_PORT(1));
            spin_unlock_irqrestore(&rtc_lock, flags);
        }
        else if ( (port == 0xcf8) && (bytes == 4) )
        {
            size = 4;
            currd->arch.pci_cf8 = data;
        }
        else if ( (port & 0xfffc) == 0xcfc )
        {
            size = min(bytes, 4 - (port & 3));
            if ( size == 3 )
                size = 2;
            if ( pci_cfg_ok(currd, port & 3, size, &data) )
                pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
        }

        if ( size == 4 )
            return;

        port += size;
        bytes -= size;
        data >>= size * 8;
    }
}

static int write_io(unsigned int port, unsigned int bytes,
                    unsigned long val, struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;

    /* OUTS must not come here. */
    ASSERT((ctxt->opcode & ~9) == 0xe6);

    if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);

    if ( admin_io_okay(port, bytes, currd) )
    {
        io_emul_stub_t *io_emul =
            io_emul_stub_setup(poc, ctxt->opcode, port, bytes);

        mark_regs_dirty(ctxt->regs);
        io_emul(ctxt->regs);
        if ( (bytes == 1) && pv_post_outb_hook )
            pv_post_outb_hook(port, val);
        return X86EMUL_DONE;
    }

    guest_io_write(port, bytes, val, currd);

    return X86EMUL_OKAY;
}

static int read_segment(enum x86_segment seg,
                        struct segment_register *reg,
                        struct x86_emulate_ctxt *ctxt)
{
    /* Check if this is an attempt to access the I/O bitmap. */
    if ( seg == x86_seg_tr )
    {
        switch ( ctxt->opcode )
        {
        case 0x6c ... 0x6f: /* ins / outs */
        case 0xe4 ... 0xe7: /* in / out (immediate port) */
        case 0xec ... 0xef: /* in / out (port in %dx) */
            /* Defer the check to priv_op_{read,write}_io(). */
            return X86EMUL_DONE;
        }
    }

    if ( ctxt->addr_size < 64 )
    {
        unsigned long limit;
        unsigned int sel, ar;

        switch ( seg )
        {
        case x86_seg_cs: sel = ctxt->regs->cs; break;
        case x86_seg_ds: sel = read_sreg(ds); break;
        case x86_seg_es: sel = read_sreg(es); break;
        case x86_seg_fs: sel = read_sreg(fs); break;
        case x86_seg_gs: sel = read_sreg(gs); break;
        case x86_seg_ss: sel = ctxt->regs->ss; break;
        default: return X86EMUL_UNHANDLEABLE;
        }

        if ( !pv_emul_read_descriptor(sel, current, &reg->base,
                                      &limit, &ar, 0) )
            return X86EMUL_UNHANDLEABLE;

        reg->limit = limit;
        reg->attr = ar >> 8;
    }
    else
    {
        switch ( seg )
        {
        default:
            if ( !is_x86_user_segment(seg) )
                return X86EMUL_UNHANDLEABLE;
            reg->base = 0;
            break;
        case x86_seg_fs:
            reg->base = rdfsbase();
            break;
        case x86_seg_gs:
            reg->base = rdgsbase();
            break;
        }

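        /*
         * Long mode: segmentation is essentially flat, so fabricate a
         * present, DPL-3, read/write (or, for CS, executable) descriptor
         * covering the whole address space.
         */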
        reg->limit = ~0U;

        reg->attr = 0;
        reg->type = _SEGMENT_WR >> 8;
        if ( seg == x86_seg_cs )
        {
            reg->type |= _SEGMENT_CODE >> 8;
            reg->l = 1;
        }
        else
            reg->db = 1;
        reg->s   = 1;
        reg->dpl = 3;
        reg->p   = 1;
        reg->g   = 1;
    }

    /*
     * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
     * Also do this for consistency for non-conforming code segments.
     */
    if ( (seg == x86_seg_ss ||
          (seg == x86_seg_cs &&
           !(reg->type & (_SEGMENT_EC >> 8)))) &&
         guest_kernel_mode(current, ctxt->regs) )
        reg->dpl = 0;

    return X86EMUL_OKAY;
}

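/*
 * Translate a segment:offset pair into a linear address, raising #GP (or
 * #SS for stack segment accesses) if the access exceeds the segment limit
 * or, in long mode, falls outside the guest-accessible address range.
 */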
static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
                                  unsigned int bytes, unsigned long limit,
                                  enum x86_segment seg,
                                  struct x86_emulate_ctxt *ctxt,
                                  unsigned long *addr)
{
    int rc = X86EMUL_OKAY;

    *addr = base + offset;

    if ( ctxt->addr_size < 64 )
    {
        if ( limit < bytes - 1 || offset > limit - bytes + 1 )
            rc = X86EMUL_EXCEPTION;
        *addr = (uint32_t)*addr;
    }
    else if ( !__addr_ok(*addr) )
        rc = X86EMUL_EXCEPTION;

    if ( unlikely(rc == X86EMUL_EXCEPTION) )
        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
                                                : TRAP_stack_error,
                              0, ctxt);

    return rc;
}

static int rep_ins(uint16_t port,
                   enum x86_segment seg, unsigned long offset,
                   unsigned int bytes_per_rep, unsigned long *reps,
                   struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;
    unsigned long goal = *reps;
    struct segment_register sreg;
    int rc;

    ASSERT(seg == x86_seg_es);

    *reps = 0;

    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    rc = read_segment(x86_seg_es, &sreg, ctxt);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( !sreg.p )
        return X86EMUL_UNHANDLEABLE;
    if ( !sreg.s ||
         (sreg.type & (_SEGMENT_CODE >> 8)) ||
         !(sreg.type & (_SEGMENT_WR >> 8)) )
    {
        x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
        return X86EMUL_EXCEPTION;
    }

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);

    while ( *reps < goal )
    {
        unsigned int data = guest_io_read(port, bytes_per_rep, currd);
        unsigned long addr;

        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
                                    sreg.limit, x86_seg_es, ctxt, &addr);
        if ( rc != X86EMUL_OKAY )
            return rc;

        if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
        {
            x86_emul_pagefault(PFEC_write_access,
                               addr + bytes_per_rep - rc, ctxt);
            return X86EMUL_EXCEPTION;
        }

        ++*reps;

        if ( poc->bpmatch || hypercall_preempt_check() )
            break;

        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
            offset -= bytes_per_rep;
        else
            offset += bytes_per_rep;
    }

    return X86EMUL_OKAY;
}

static int rep_outs(enum x86_segment seg, unsigned long offset,
                    uint16_t port,
                    unsigned int bytes_per_rep, unsigned long *reps,
                    struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    struct vcpu *curr = current;
    struct domain *currd = current->domain;
    unsigned long goal = *reps;
    struct segment_register sreg;
    int rc;

    *reps = 0;

    if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
        return X86EMUL_UNHANDLEABLE;

    rc = read_segment(seg, &sreg, ctxt);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( !sreg.p )
        return X86EMUL_UNHANDLEABLE;
    if ( !sreg.s ||
         ((sreg.type & (_SEGMENT_CODE >> 8)) &&
          !(sreg.type & (_SEGMENT_WR >> 8))) )
    {
        x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
                                                : TRAP_stack_error,
                              0, ctxt);
        return X86EMUL_EXCEPTION;
    }

    poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);

    while ( *reps < goal )
    {
        unsigned int data = 0;
        unsigned long addr;

        rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
                                    sreg.limit, seg, ctxt, &addr);
        if ( rc != X86EMUL_OKAY )
            return rc;

        if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
        {
            x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
            return X86EMUL_EXCEPTION;
        }

        guest_io_write(port, bytes_per_rep, data, currd);

        ++*reps;

        if ( poc->bpmatch || hypercall_preempt_check() )
            break;

        /* x86_emulate() clips the repetition count to ensure we don't wrap. */
        if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
            offset -= bytes_per_rep;
        else
            offset += bytes_per_rep;
    }

    return X86EMUL_OKAY;
}

static int read_cr(unsigned int reg, unsigned long *val,
                   struct x86_emulate_ctxt *ctxt)
{
    const struct vcpu *curr = current;

    switch ( reg )
    {
    case 0: /* Read CR0 */
        *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
        return X86EMUL_OKAY;

    case 2: /* Read CR2 */
    case 4: /* Read CR4 */
        *val = curr->arch.pv_vcpu.ctrlreg[reg];
        return X86EMUL_OKAY;

    case 3: /* Read CR3 */
    {
        const struct domain *currd = curr->domain;
        mfn_t mfn;

        if ( !is_pv_32bit_domain(currd) )
        {
            mfn = pagetable_get_mfn(curr->arch.guest_table);
            *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn)));
        }
        else
        {
            l4_pgentry_t *pl4e =
                map_domain_page(pagetable_get_mfn(curr->arch.guest_table));

            mfn = l4e_get_mfn(*pl4e);
            unmap_domain_page(pl4e);
            *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn)));
        }
        /* PTs should not be shared */
        BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
        return X86EMUL_OKAY;
    }
    }

    return X86EMUL_UNHANDLEABLE;
}

static int write_cr(unsigned int reg, unsigned long val,
                    struct x86_emulate_ctxt *ctxt)
{
    struct vcpu *curr = current;

    switch ( reg )
    {
    case 0: /* Write CR0 */
        if ( (val ^ read_cr0()) & ~X86_CR0_TS )
        {
            gdprintk(XENLOG_WARNING,
                     "Attempt to change unmodifiable CR0 flags\n");
            break;
        }
        do_fpu_taskswitch(!!(val & X86_CR0_TS));
        return X86EMUL_OKAY;

    case 2: /* Write CR2 */
        curr->arch.pv_vcpu.ctrlreg[2] = val;
        arch_set_cr2(curr, val);
        return X86EMUL_OKAY;

    case 3: /* Write CR3 */
    {
        struct domain *currd = curr->domain;
        unsigned long gfn;
        struct page_info *page;
        int rc;

        gfn = !is_pv_32bit_domain(currd)
              ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
        page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
        if ( !page )
            break;
        rc = new_guest_cr3(page_to_mfn(page));
        put_page(page);

        switch ( rc )
        {
        case 0:
            return X86EMUL_OKAY;
        case -ERESTART: /* retry after preemption */
            return X86EMUL_RETRY;
        }
        break;
    }

    case 4: /* Write CR4 */
        curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
        write_cr4(pv_guest_cr4_to_real_cr4(curr));
        ctxt_switch_levelling(curr);
        return X86EMUL_OKAY;
    }

    return X86EMUL_UNHANDLEABLE;
}

static int read_dr(unsigned int reg, unsigned long *val,
                   struct x86_emulate_ctxt *ctxt)
{
    unsigned long res = do_get_debugreg(reg);

    if ( IS_ERR_VALUE(res) )
        return X86EMUL_UNHANDLEABLE;

    *val = res;

    return X86EMUL_OKAY;
}

static int write_dr(unsigned int reg, unsigned long val,
                    struct x86_emulate_ctxt *ctxt)
{
    return do_set_debugreg(reg, val) == 0
           ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
}

static inline uint64_t guest_misc_enable(uint64_t val)
{
    val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
             MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
    val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
           MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
           MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
    return val;
}

static inline bool is_cpufreq_controller(const struct domain *d)
{
    return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
            is_hardware_domain(d));
}

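/*
 * RDMSR emulation for everything the common guest_rdmsr() path left
 * unhandled.  Unknown MSRs end up at the "normal" label and are read with
 * rdmsr_safe(); a faulting read yields X86EMUL_UNHANDLEABLE.
 */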
static int read_msr(unsigned int reg, uint64_t *val,
                    struct x86_emulate_ctxt *ctxt)
{
    struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
    const struct vcpu *curr = current;
    const struct domain *currd = curr->domain;
    bool vpmu_msr = false;
    int ret;

    if ( (ret = guest_rdmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
    {
        if ( ret == X86EMUL_EXCEPTION )
            x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);

        return ret;
    }

    switch ( reg )
    {
        int rc;

    case MSR_FS_BASE:
        if ( is_pv_32bit_domain(currd) )
            break;
        *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
        return X86EMUL_OKAY;

    case MSR_GS_BASE:
        if ( is_pv_32bit_domain(currd) )
            break;
        *val = cpu_has_fsgsbase ? __rdgsbase()
                                : curr->arch.pv_vcpu.gs_base_kernel;
        return X86EMUL_OKAY;

    case MSR_SHADOW_GS_BASE:
        if ( is_pv_32bit_domain(currd) )
            break;
        *val = curr->arch.pv_vcpu.gs_base_user;
        return X86EMUL_OKAY;

    /*
     * In order to fully retain original behavior, defer calling
     * pv_soft_rdtsc() until after emulation. This may want/need to be
     * reconsidered.
     */
    case MSR_IA32_TSC:
        poc->tsc |= TSC_BASE;
        goto normal;

    case MSR_TSC_AUX:
        poc->tsc |= TSC_AUX;
        if ( cpu_has_rdtscp )
            goto normal;
        *val = 0;
        return X86EMUL_OKAY;

    case MSR_EFER:
        *val = read_efer();
        if ( is_pv_32bit_domain(currd) )
            *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
        return X86EMUL_OKAY;

    case MSR_K7_FID_VID_CTL:
    case MSR_K7_FID_VID_STATUS:
    case MSR_K8_PSTATE_LIMIT:
    case MSR_K8_PSTATE_CTRL:
    case MSR_K8_PSTATE_STATUS:
    case MSR_K8_PSTATE0:
    case MSR_K8_PSTATE1:
    case MSR_K8_PSTATE2:
    case MSR_K8_PSTATE3:
    case MSR_K8_PSTATE4:
    case MSR_K8_PSTATE5:
    case MSR_K8_PSTATE6:
    case MSR_K8_PSTATE7:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
            break;
        if ( unlikely(is_cpufreq_controller(currd)) )
            goto normal;
        *val = 0;
        return X86EMUL_OKAY;

    case MSR_IA32_UCODE_REV:
        BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        {
            if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
                break;
            /* As documented in the SDM: Do a CPUID 1 here */
            cpuid_eax(1);
        }
        goto normal;

    case MSR_IA32_MISC_ENABLE:
        if ( rdmsr_safe(reg, *val) )
            break;
        *val = guest_misc_enable(*val);
        return X86EMUL_OKAY;

    case MSR_AMD64_DR0_ADDRESS_MASK:
        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
            break;
        *val = curr->arch.pv_vcpu.dr_mask[0];
        return X86EMUL_OKAY;

    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
        if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
            break;
        *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
        return X86EMUL_OKAY;

    case MSR_IA32_PERF_CAPABILITIES:
        /* No extra capabilities are supported. */
        *val = 0;
        return X86EMUL_OKAY;

    case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
    case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
    case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
    case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        {
            vpmu_msr = true;
            /* fall through */
    case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
    case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
            {
                if ( vpmu_do_rdmsr(reg, val) )
                    break;
                return X86EMUL_OKAY;
            }
        }
        /* fall through */
    default:
        if ( rdmsr_hypervisor_regs(reg, val) )
            return X86EMUL_OKAY;

        rc = vmce_rdmsr(reg, val);
        if ( rc < 0 )
            break;
        if ( rc )
            return X86EMUL_OKAY;
        /* fall through */
    normal:
        /* Everyone can read the MSR space. */
        /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
        if ( rdmsr_safe(reg, *val) )
            break;
        return X86EMUL_OKAY;
    }

    return X86EMUL_UNHANDLEABLE;
}

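/*
 * WRMSR emulation for everything the common guest_wrmsr() path left
 * unhandled.  Writes the domain isn't permitted to perform are in many
 * cases silently discarded (returning X86EMUL_OKAY); see the gdprintk()
 * in the default case.
 */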
static int write_msr(unsigned int reg, uint64_t val,
                     struct x86_emulate_ctxt *ctxt)
{
    struct vcpu *curr = current;
    const struct domain *currd = curr->domain;
    bool vpmu_msr = false;
    int ret;

    if ( (ret = guest_wrmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
    {
        if ( ret == X86EMUL_EXCEPTION )
            x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);

        return ret;
    }

    switch ( reg )
    {
        uint64_t temp;
        int rc;

    case MSR_FS_BASE:
        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
            break;
        wrfsbase(val);
        curr->arch.pv_vcpu.fs_base = val;
        return X86EMUL_OKAY;

    case MSR_GS_BASE:
        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
            break;
        wrgsbase(val);
        curr->arch.pv_vcpu.gs_base_kernel = val;
        return X86EMUL_OKAY;

    case MSR_SHADOW_GS_BASE:
        if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
            break;
        wrmsrl(MSR_SHADOW_GS_BASE, val);
        curr->arch.pv_vcpu.gs_base_user = val;
        return X86EMUL_OKAY;

    case MSR_K7_FID_VID_STATUS:
    case MSR_K7_FID_VID_CTL:
    case MSR_K8_PSTATE_LIMIT:
    case MSR_K8_PSTATE_CTRL:
    case MSR_K8_PSTATE_STATUS:
    case MSR_K8_PSTATE0:
    case MSR_K8_PSTATE1:
    case MSR_K8_PSTATE2:
    case MSR_K8_PSTATE3:
    case MSR_K8_PSTATE4:
    case MSR_K8_PSTATE5:
    case MSR_K8_PSTATE6:
    case MSR_K8_PSTATE7:
    case MSR_K8_HWCR:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
            break;
        if ( likely(!is_cpufreq_controller(currd)) ||
             wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;
        break;

    case MSR_AMD64_NB_CFG:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
            break;
        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
            return X86EMUL_OKAY;
        if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
             ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
            goto invalid;
        if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
            return X86EMUL_OKAY;
        break;

    case MSR_FAM10H_MMIO_CONF_BASE:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
             boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
            break;
        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
            return X86EMUL_OKAY;
        if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
            break;
        if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
             temp != val :
             ((temp ^ val) &
              ~(FAM10H_MMIO_CONF_ENABLE |
                (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
                 FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
                ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
                 FAM10H_MMIO_CONF_BASE_SHIFT))) )
            goto invalid;
        if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
            return X86EMUL_OKAY;
        break;

    case MSR_IA32_UCODE_REV:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
            break;
        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
            return X86EMUL_OKAY;
        if ( rdmsr_safe(reg, temp) )
            break;
        if ( val )
            goto invalid;
        return X86EMUL_OKAY;

    case MSR_IA32_MISC_ENABLE:
        if ( rdmsr_safe(reg, temp) )
            break;
        if ( val != guest_misc_enable(temp) )
            goto invalid;
        return X86EMUL_OKAY;

    case MSR_IA32_MPERF:
    case MSR_IA32_APERF:
        if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
             (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
            break;
        if ( likely(!is_cpufreq_controller(currd)) ||
             wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;
        break;

    case MSR_IA32_PERF_CTL:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
            break;
        if ( likely(!is_cpufreq_controller(currd)) ||
             wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;
        break;

    case MSR_IA32_THERM_CONTROL:
    case MSR_IA32_ENERGY_PERF_BIAS:
        if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
            break;
        if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
             wrmsr_safe(reg, val) == 0 )
            return X86EMUL_OKAY;
        break;

    case MSR_AMD64_DR0_ADDRESS_MASK:
        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
            break;
        curr->arch.pv_vcpu.dr_mask[0] = val;
        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
            wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
        return X86EMUL_OKAY;

    case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
        if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
            break;
        curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
        if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
            wrmsrl(reg, val);
        return X86EMUL_OKAY;

    case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
    case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
    case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
    case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
        if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
        {
            vpmu_msr = true;
    case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
    case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
            if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
            {
                if ( (vpmu_mode & XENPMU_MODE_ALL) &&
                     !is_hardware_domain(currd) )
                    return X86EMUL_OKAY;

                if ( vpmu_do_wrmsr(reg, val, 0) )
                    break;
                return X86EMUL_OKAY;
            }
        }
        /* fall through */
    default:
        if ( wrmsr_hypervisor_regs(reg, val) == 1 )
            return X86EMUL_OKAY;

        rc = vmce_wrmsr(reg, val);
        if ( rc < 0 )
            break;
        if ( rc )
            return X86EMUL_OKAY;

        if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
    invalid:
            gdprintk(XENLOG_WARNING,
                     "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
                     reg, temp, val);
        return X86EMUL_OKAY;
    }

    return X86EMUL_UNHANDLEABLE;
}

/* Name it differently to avoid clashing with wbinvd() */
static int _wbinvd(struct x86_emulate_ctxt *ctxt)
{
    /* Ignore the instruction if unprivileged. */
    if ( !cache_flush_permitted(current->domain) )
        /*
         * Non-physdev domain attempted WBINVD; ignore for now since
         * newer linux uses this in some start-of-day timing loops.
         */
        ;
    else
        wbinvd();

    return X86EMUL_OKAY;
}

int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
                  struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
{
    guest_cpuid(current, leaf, subleaf, res);

    return X86EMUL_OKAY;
}

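/*
 * Whitelist of instructions the privileged-operation emulator is prepared
 * to handle; everything else is reported back as unhandleable.
 */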
static int validate(const struct x86_emulate_state *state,
                    struct x86_emulate_ctxt *ctxt)
{
    switch ( ctxt->opcode )
    {
    case 0x6c ... 0x6f: /* ins / outs */
    case 0xe4 ... 0xe7: /* in / out (immediate port) */
    case 0xec ... 0xef: /* in / out (port in %dx) */
    case X86EMUL_OPC(0x0f, 0x06): /* clts */
    case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
    case X86EMUL_OPC(0x0f, 0x20) ...
         X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
    case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
    case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
    case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
    case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
        return X86EMUL_OKAY;

    case 0xfa: case 0xfb: /* cli / sti */
        if ( !iopl_ok(current, ctxt->regs) )
            break;
        /*
         * This is just too dangerous to allow, in my opinion. Consider if the
         * caller then tries to reenable interrupts using POPF: we can't trap
         * that and we'll end up with hard-to-debug lockups. Fast & loose will
         * do for us. :-)
         vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
         */
        return X86EMUL_DONE;

    case X86EMUL_OPC(0x0f, 0x01):
    {
        unsigned int modrm_rm, modrm_reg;

        if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
             (modrm_rm & 7) != 1 )
            break;
        switch ( modrm_reg & 7 )
        {
        case 2: /* xsetbv */
        case 7: /* rdtscp */
            return X86EMUL_OKAY;
        }
        break;
    }
    }

    return X86EMUL_UNHANDLEABLE;
}

static int insn_fetch(enum x86_segment seg,
                      unsigned long offset,
                      void *p_data,
                      unsigned int bytes,
                      struct x86_emulate_ctxt *ctxt)
{
    const struct priv_op_ctxt *poc =
        container_of(ctxt, struct priv_op_ctxt, ctxt);
    unsigned int rc;
    unsigned long addr = poc->cs.base + offset;

    ASSERT(seg == x86_seg_cs);

    /* We don't mean to emulate any branches. */
    if ( !bytes )
        return X86EMUL_UNHANDLEABLE;

    rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
                                x86_seg_cs, ctxt, &addr);
    if ( rc != X86EMUL_OKAY )
        return rc;

    if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
    {
        /*
         * TODO: This should report PFEC_insn_fetch when goc->insn_fetch &&
         * cpu_has_nx, but we'd then need a "fetch" variant of
         * __copy_from_user() respecting NX, SMEP, and protection keys.
         */
        x86_emul_pagefault(0, addr + bytes - rc, ctxt);
        return X86EMUL_EXCEPTION;
    }

    return X86EMUL_OKAY;
}


static const struct x86_emulate_ops priv_op_ops = {
    .insn_fetch   = insn_fetch,
    .read         = x86emul_unhandleable_rw,
    .validate     = validate,
    .read_io      = read_io,
    .write_io     = write_io,
    .rep_ins      = rep_ins,
    .rep_outs     = rep_outs,
    .read_segment = read_segment,
    .read_cr      = read_cr,
    .write_cr     = write_cr,
    .read_dr      = read_dr,
    .write_dr     = write_dr,
    .read_msr     = read_msr,
    .write_msr    = write_msr,
    .cpuid        = pv_emul_cpuid,
    .wbinvd       = _wbinvd,
};

int pv_emulate_privileged_op(struct cpu_user_regs *regs)
{
    struct vcpu *curr = current;
    struct domain *currd = curr->domain;
    struct priv_op_ctxt ctxt = {
        .ctxt.regs = regs,
        .ctxt.vendor = currd->arch.cpuid->x86_vendor,
        .ctxt.lma = !is_pv_32bit_domain(currd),
    };
    int rc;
    unsigned int eflags, ar;

    if ( !pv_emul_read_descriptor(regs->cs, curr, &ctxt.cs.base,
                                  &ctxt.cs.limit, &ar, 1) ||
         !(ar & _SEGMENT_S) ||
         !(ar & _SEGMENT_P) ||
         !(ar & _SEGMENT_CODE) )
        return 0;

    /* Mirror virtualized state into EFLAGS. */
    ASSERT(regs->eflags & X86_EFLAGS_IF);
    if ( vcpu_info(curr, evtchn_upcall_mask) )
        regs->eflags &= ~X86_EFLAGS_IF;
    else
        regs->eflags |= X86_EFLAGS_IF;
    ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
    regs->eflags |= curr->arch.pv_vcpu.iopl;
    eflags = regs->eflags;

    ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
    /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
    rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);

    if ( ctxt.io_emul_stub )
        unmap_domain_page(ctxt.io_emul_stub);

    /*
     * Un-mirror virtualized state from EFLAGS.
     * Nothing we allow to be emulated can change anything other than the
     * arithmetic bits, and the resume flag.
     */
    ASSERT(!((regs->eflags ^ eflags) &
             ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
    regs->eflags |= X86_EFLAGS_IF;
    regs->eflags &= ~X86_EFLAGS_IOPL;

    switch ( rc )
    {
    case X86EMUL_OKAY:
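        /* Complete any TSC read which read_msr() deferred (cf. TSC_BASE/TSC_AUX). */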
        if ( ctxt.tsc & TSC_BASE )
        {
            if ( ctxt.tsc & TSC_AUX )
                pv_soft_rdtsc(curr, regs, 1);
            else if ( currd->arch.vtsc )
                pv_soft_rdtsc(curr, regs, 0);
            else
                msr_split(regs, rdtsc());
        }

        if ( ctxt.ctxt.retire.singlestep )
            ctxt.bpmatch |= DR_STEP;
        if ( ctxt.bpmatch )
        {
            curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
            if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
                pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
        }
        /* fall through */
    case X86EMUL_RETRY:
        return EXCRET_fault_fixed;

    case X86EMUL_EXCEPTION:
        pv_inject_event(&ctxt.ctxt.event);
        return EXCRET_fault_fixed;
    }

    return 0;
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */