1 /******************************************************************************
2  * arch/x86/pv/emul-priv-op.c
3  *
4  * Emulate privileged instructions for PV guests
5  *
6  * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
7  *
8  * This program is free software; you can redistribute it and/or modify
9  * it under the terms of the GNU General Public License as published by
10  * the Free Software Foundation; either version 2 of the License, or
11  * (at your option) any later version.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program; If not, see <http://www.gnu.org/licenses/>.
20  */
21 
22 #include <xen/errno.h>
23 #include <xen/event.h>
24 #include <xen/guest_access.h>
25 #include <xen/iocap.h>
26 #include <xen/spinlock.h>
27 #include <xen/trace.h>
28 
29 #include <asm/apic.h>
30 #include <asm/debugreg.h>
31 #include <asm/hpet.h>
32 #include <asm/hypercall.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/p2m.h>
35 #include <asm/pv/traps.h>
36 #include <asm/shared.h>
37 #include <asm/traps.h>
38 #include <asm/x86_emulate.h>
39 
40 #include <xsm/xsm.h>
41 
42 #include "../x86_64/mmconfig.h"
43 #include "emulate.h"
44 #include "mm.h"
45 
46 /* Override macros from asm/page.h to make them work with mfn_t */
47 #undef mfn_to_page
48 #define mfn_to_page(mfn) __mfn_to_page(mfn_x(mfn))
49 #undef page_to_mfn
50 #define page_to_mfn(pg) _mfn(__page_to_mfn(pg))
51 
52 /***********************
53  * I/O emulation support
54  */
55 
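/*
 * Per-invocation emulation state: the cached CS base/limit used for
 * linear address calculations, a mapping of this CPU's I/O stub (if one
 * was built), any matched I/O breakpoints, and flags noting that
 * RDTSC/RDTSCP handling has been deferred until after x86_emulate().
 */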
56 struct priv_op_ctxt {
57     struct x86_emulate_ctxt ctxt;
58     struct {
59         unsigned long base, limit;
60     } cs;
61     char *io_emul_stub;
62     unsigned int bpmatch;
63     unsigned int tsc;
64 #define TSC_BASE 1
65 #define TSC_AUX 2
66 };
67 
68 /* Helper routines for, and type of, the I/O emulation stub. */
69 void host_to_guest_gpr_switch(struct cpu_user_regs *);
70 unsigned long guest_to_host_gpr_switch(unsigned long);
71 
72 void (*pv_post_outb_hook)(unsigned int port, u8 value);
73 
74 typedef void io_emul_stub_t(struct cpu_user_regs *);
75 
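/*
 * Build a small stub on this CPU's stub page: load the address of
 * host_to_guest_gpr_switch into %rcx and call it to install the guest's
 * GPR state, execute the real IN/OUT instruction, then 'ret' into
 * guest_to_host_gpr_switch to restore Xen's registers.
 */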
76 static io_emul_stub_t *io_emul_stub_setup(struct priv_op_ctxt *ctxt, u8 opcode,
77                                           unsigned int port, unsigned int bytes)
78 {
79     if ( !ctxt->io_emul_stub )
80         ctxt->io_emul_stub = map_domain_page(_mfn(this_cpu(stubs.mfn))) +
81                                              (this_cpu(stubs.addr) &
82                                               ~PAGE_MASK) +
83                                              STUB_BUF_SIZE / 2;
84 
85     /* movq $host_to_guest_gpr_switch,%rcx */
86     ctxt->io_emul_stub[0] = 0x48;
87     ctxt->io_emul_stub[1] = 0xb9;
88     *(void **)&ctxt->io_emul_stub[2] = (void *)host_to_guest_gpr_switch;
89     /* callq *%rcx */
90     ctxt->io_emul_stub[10] = 0xff;
91     ctxt->io_emul_stub[11] = 0xd1;
92     /* data16 or nop */
93     ctxt->io_emul_stub[12] = (bytes != 2) ? 0x90 : 0x66;
94     /* <io-access opcode> */
95     ctxt->io_emul_stub[13] = opcode;
96     /* imm8 or nop */
97     ctxt->io_emul_stub[14] = !(opcode & 8) ? port : 0x90;
98     /* ret (jumps to guest_to_host_gpr_switch) */
99     ctxt->io_emul_stub[15] = 0xc3;
100     BUILD_BUG_ON(STUB_BUF_SIZE / 2 < 16);
101 
102     if ( ioemul_handle_quirk )
103         ioemul_handle_quirk(opcode, &ctxt->io_emul_stub[12], ctxt->ctxt.regs);
104 
105     /* Handy function-typed pointer to the stub. */
106     return (void *)(this_cpu(stubs.addr) + STUB_BUF_SIZE / 2);
107 }
108 
109 
110 /* Check the vcpu's shadowed IOPL against the CPL assumed from its current mode. */
111 static bool iopl_ok(const struct vcpu *v, const struct cpu_user_regs *regs)
112 {
113     unsigned int cpl = guest_kernel_mode(v, regs) ?
114         (VM_ASSIST(v->domain, architectural_iopl) ? 0 : 1) : 3;
115 
116     ASSERT((v->arch.pv_vcpu.iopl & ~X86_EFLAGS_IOPL) == 0);
117 
118     return IOPL(cpl) <= v->arch.pv_vcpu.iopl;
119 }
120 
121 /* Has the guest requested sufficient permission for this I/O access? */
122 static bool guest_io_okay(unsigned int port, unsigned int bytes,
123                           struct vcpu *v, struct cpu_user_regs *regs)
124 {
125     /* If in user mode, switch to kernel mode just to read I/O bitmap. */
126     const bool user_mode = !(v->arch.flags & TF_kernel_mode);
127 
128     if ( iopl_ok(v, regs) )
129         return true;
130 
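    /* Otherwise consult the guest-supplied I/O permission bitmap (TSS-style). */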
131     if ( v->arch.pv_vcpu.iobmp_limit > (port + bytes) )
132     {
133         union { uint8_t bytes[2]; uint16_t mask; } x;
134 
135         /*
136          * Grab permission bytes from guest space. Inaccessible bytes are
137          * read as 0xff (no access allowed).
138          */
139         if ( user_mode )
140             toggle_guest_pt(v);
141 
142         switch ( __copy_from_guest_offset(x.bytes, v->arch.pv_vcpu.iobmp,
143                                           port>>3, 2) )
144         {
145         default: x.bytes[0] = ~0;
146             /* fallthrough */
147         case 1:  x.bytes[1] = ~0;
148             /* fallthrough */
149         case 0:  break;
150         }
151 
152         if ( user_mode )
153             toggle_guest_pt(v);
154 
155         if ( (x.mask & (((1 << bytes) - 1) << (port & 7))) == 0 )
156             return true;
157     }
158 
159     return false;
160 }
161 
162 /* Has the administrator granted sufficient permission for this I/O access? */
163 static bool admin_io_okay(unsigned int port, unsigned int bytes,
164                           const struct domain *d)
165 {
166     /*
167      * Port 0xcf8 (CONFIG_ADDRESS) is only visible for DWORD accesses.
168      * We never permit direct access to that register.
169      */
170     if ( (port == 0xcf8) && (bytes == 4) )
171         return false;
172 
173     /* We also never permit direct access to the RTC/CMOS registers. */
174     if ( ((port & ~1) == RTC_PORT(0)) )
175         return false;
176 
177     return ioports_access_permitted(d, port, port + bytes - 1);
178 }
179 
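/*
 * May the hardware domain access the config space register currently
 * selected through port 0xcf8?  Read-only devices and XSM policy are
 * respected, and AMD extended config space addressing is honoured when
 * enabled in NB_CFG.
 */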
180 static bool pci_cfg_ok(struct domain *currd, unsigned int start,
181                        unsigned int size, uint32_t *write)
182 {
183     uint32_t machine_bdf;
184 
185     if ( !is_hardware_domain(currd) )
186         return false;
187 
188     if ( !CF8_ENABLED(currd->arch.pci_cf8) )
189         return true;
190 
191     machine_bdf = CF8_BDF(currd->arch.pci_cf8);
192     if ( write )
193     {
194         const unsigned long *ro_map = pci_get_ro_map(0);
195 
196         if ( ro_map && test_bit(machine_bdf, ro_map) )
197             return false;
198     }
199     start |= CF8_ADDR_LO(currd->arch.pci_cf8);
200     /* AMD extended configuration space access? */
201     if ( CF8_ADDR_HI(currd->arch.pci_cf8) &&
202          boot_cpu_data.x86_vendor == X86_VENDOR_AMD &&
203          boot_cpu_data.x86 >= 0x10 && boot_cpu_data.x86 <= 0x17 )
204     {
205         uint64_t msr_val;
206 
207         if ( rdmsr_safe(MSR_AMD64_NB_CFG, msr_val) )
208             return false;
209         if ( msr_val & (1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT) )
210             start |= CF8_ADDR_HI(currd->arch.pci_cf8);
211     }
212 
213     return !write ?
214            xsm_pci_config_permission(XSM_HOOK, currd, machine_bdf,
215                                      start, start + size - 1, 0) == 0 :
216            pci_conf_write_intercept(0, machine_bdf, start, size, write) >= 0;
217 }
218 
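/*
 * Emulate an I/O port read on behalf of the guest.  Directly permitted
 * ports are read with a real IN; otherwise the PIT, RTC/CMOS and PCI
 * config (0xcf8/0xcfc) ports are emulated, and everything else reads as
 * all ones.
 */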
219 uint32_t guest_io_read(unsigned int port, unsigned int bytes,
220                        struct domain *currd)
221 {
222     uint32_t data = 0;
223     unsigned int shift = 0;
224 
225     if ( admin_io_okay(port, bytes, currd) )
226     {
227         switch ( bytes )
228         {
229         case 1: return inb(port);
230         case 2: return inw(port);
231         case 4: return inl(port);
232         }
233     }
234 
235     while ( bytes != 0 )
236     {
237         unsigned int size = 1;
238         uint32_t sub_data = ~0;
239 
240         if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
241         {
242             sub_data = pv_pit_handler(port, 0, 0);
243         }
244         else if ( port == RTC_PORT(0) )
245         {
246             sub_data = currd->arch.cmos_idx;
247         }
248         else if ( (port == RTC_PORT(1)) &&
249                   ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
250         {
251             unsigned long flags;
252 
253             spin_lock_irqsave(&rtc_lock, flags);
254             outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
255             sub_data = inb(RTC_PORT(1));
256             spin_unlock_irqrestore(&rtc_lock, flags);
257         }
258         else if ( (port == 0xcf8) && (bytes == 4) )
259         {
260             size = 4;
261             sub_data = currd->arch.pci_cf8;
262         }
263         else if ( (port & 0xfffc) == 0xcfc )
264         {
265             size = min(bytes, 4 - (port & 3));
266             if ( size == 3 )
267                 size = 2;
268             if ( pci_cfg_ok(currd, port & 3, size, NULL) )
269                 sub_data = pci_conf_read(currd->arch.pci_cf8, port & 3, size);
270         }
271 
272         if ( size == 4 )
273             return sub_data;
274 
275         data |= (sub_data & ((1u << (size * 8)) - 1)) << shift;
276         shift += size * 8;
277         port += size;
278         bytes -= size;
279     }
280 
281     return data;
282 }
283 
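/*
 * Return a mask of debug registers whose enabled I/O breakpoints overlap
 * the given port range (requires CR4.DE to be set by the guest).
 */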
284 static unsigned int check_guest_io_breakpoint(struct vcpu *v,
285                                               unsigned int port,
286                                               unsigned int len)
287 {
288     unsigned int width, i, match = 0;
289     unsigned long start;
290 
291     if ( !(v->arch.debugreg[5]) ||
292          !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
293         return 0;
294 
295     for ( i = 0; i < 4; i++ )
296     {
297         if ( !(v->arch.debugreg[5] &
298                (3 << (i * DR_ENABLE_SIZE))) )
299             continue;
300 
301         start = v->arch.debugreg[i];
302         width = 0;
303 
304         switch ( (v->arch.debugreg[7] >>
305                   (DR_CONTROL_SHIFT + i * DR_CONTROL_SIZE)) & 0xc )
306         {
307         case DR_LEN_1: width = 1; break;
308         case DR_LEN_2: width = 2; break;
309         case DR_LEN_4: width = 4; break;
310         case DR_LEN_8: width = 8; break;
311         }
312 
313         if ( (start < (port + len)) && ((start + width) > port) )
314             match |= 1u << i;
315     }
316 
317     return match;
318 }
319 
320 static int read_io(unsigned int port, unsigned int bytes,
321                    unsigned long *val, struct x86_emulate_ctxt *ctxt)
322 {
323     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
324     struct vcpu *curr = current;
325     struct domain *currd = current->domain;
326 
327     /* INS must not come here. */
328     ASSERT((ctxt->opcode & ~9) == 0xe4);
329 
330     if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
331         return X86EMUL_UNHANDLEABLE;
332 
333     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
334 
335     if ( admin_io_okay(port, bytes, currd) )
336     {
337         io_emul_stub_t *io_emul =
338             io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
339 
340         mark_regs_dirty(ctxt->regs);
341         io_emul(ctxt->regs);
342         return X86EMUL_DONE;
343     }
344 
345     *val = guest_io_read(port, bytes, currd);
346 
347     return X86EMUL_OKAY;
348 }
349 
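/*
 * Emulate an I/O port write on behalf of the guest; the counterpart to
 * guest_io_read() above, with the same PIT/RTC/PCI config special cases.
 */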
350 void guest_io_write(unsigned int port, unsigned int bytes, uint32_t data,
351                     struct domain *currd)
352 {
353     if ( admin_io_okay(port, bytes, currd) )
354     {
355         switch ( bytes )
356         {
357         case 1:
358             outb((uint8_t)data, port);
359             if ( pv_post_outb_hook )
360                 pv_post_outb_hook(port, (uint8_t)data);
361             break;
362         case 2:
363             outw((uint16_t)data, port);
364             break;
365         case 4:
366             outl(data, port);
367             break;
368         }
369         return;
370     }
371 
372     while ( bytes != 0 )
373     {
374         unsigned int size = 1;
375 
376         if ( (port == 0x42) || (port == 0x43) || (port == 0x61) )
377         {
378             pv_pit_handler(port, (uint8_t)data, 1);
379         }
380         else if ( port == RTC_PORT(0) )
381         {
382             currd->arch.cmos_idx = data;
383         }
384         else if ( (port == RTC_PORT(1)) &&
385                   ioports_access_permitted(currd, RTC_PORT(0), RTC_PORT(1)) )
386         {
387             unsigned long flags;
388 
389             if ( pv_rtc_handler )
390                 pv_rtc_handler(currd->arch.cmos_idx & 0x7f, data);
391             spin_lock_irqsave(&rtc_lock, flags);
392             outb(currd->arch.cmos_idx & 0x7f, RTC_PORT(0));
393             outb(data, RTC_PORT(1));
394             spin_unlock_irqrestore(&rtc_lock, flags);
395         }
396         else if ( (port == 0xcf8) && (bytes == 4) )
397         {
398             size = 4;
399             currd->arch.pci_cf8 = data;
400         }
401         else if ( (port & 0xfffc) == 0xcfc )
402         {
403             size = min(bytes, 4 - (port & 3));
404             if ( size == 3 )
405                 size = 2;
406             if ( pci_cfg_ok(currd, port & 3, size, &data) )
407                 pci_conf_write(currd->arch.pci_cf8, port & 3, size, data);
408         }
409 
410         if ( size == 4 )
411             return;
412 
413         port += size;
414         bytes -= size;
415         data >>= size * 8;
416     }
417 }
418 
419 static int write_io(unsigned int port, unsigned int bytes,
420                     unsigned long val, struct x86_emulate_ctxt *ctxt)
421 {
422     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
423     struct vcpu *curr = current;
424     struct domain *currd = current->domain;
425 
426     /* OUTS must not come here. */
427     ASSERT((ctxt->opcode & ~9) == 0xe6);
428 
429     if ( !guest_io_okay(port, bytes, curr, ctxt->regs) )
430         return X86EMUL_UNHANDLEABLE;
431 
432     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes);
433 
434     if ( admin_io_okay(port, bytes, currd) )
435     {
436         io_emul_stub_t *io_emul =
437             io_emul_stub_setup(poc, ctxt->opcode, port, bytes);
438 
439         mark_regs_dirty(ctxt->regs);
440         io_emul(ctxt->regs);
441         if ( (bytes == 1) && pv_post_outb_hook )
442             pv_post_outb_hook(port, val);
443         return X86EMUL_DONE;
444     }
445 
446     guest_io_write(port, bytes, val, currd);
447 
448     return X86EMUL_OKAY;
449 }
450 
451 static int read_segment(enum x86_segment seg,
452                         struct segment_register *reg,
453                         struct x86_emulate_ctxt *ctxt)
454 {
455     /* Check if this is an attempt to access the I/O bitmap. */
456     if ( seg == x86_seg_tr )
457     {
458         switch ( ctxt->opcode )
459         {
460         case 0x6c ... 0x6f: /* ins / outs */
461         case 0xe4 ... 0xe7: /* in / out (immediate port) */
462         case 0xec ... 0xef: /* in / out (port in %dx) */
463             /* Defer the check to priv_op_{read,write}_io(). */
464             return X86EMUL_DONE;
465         }
466     }
467 
468     if ( ctxt->addr_size < 64 )
469     {
470         unsigned long limit;
471         unsigned int sel, ar;
472 
473         switch ( seg )
474         {
475         case x86_seg_cs: sel = ctxt->regs->cs; break;
476         case x86_seg_ds: sel = read_sreg(ds);  break;
477         case x86_seg_es: sel = read_sreg(es);  break;
478         case x86_seg_fs: sel = read_sreg(fs);  break;
479         case x86_seg_gs: sel = read_sreg(gs);  break;
480         case x86_seg_ss: sel = ctxt->regs->ss; break;
481         default: return X86EMUL_UNHANDLEABLE;
482         }
483 
484         if ( !pv_emul_read_descriptor(sel, current, &reg->base,
485                                       &limit, &ar, 0) )
486             return X86EMUL_UNHANDLEABLE;
487 
488         reg->limit = limit;
489         reg->attr = ar >> 8;
490     }
491     else
492     {
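        /* 64-bit mode: segmentation is flat; only FS and GS retain their bases. */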
493         switch ( seg )
494         {
495         default:
496             if ( !is_x86_user_segment(seg) )
497                 return X86EMUL_UNHANDLEABLE;
498             reg->base = 0;
499             break;
500         case x86_seg_fs:
501             reg->base = rdfsbase();
502             break;
503         case x86_seg_gs:
504             reg->base = rdgsbase();
505             break;
506         }
507 
508         reg->limit = ~0U;
509 
510         reg->attr = 0;
511         reg->type = _SEGMENT_WR >> 8;
512         if ( seg == x86_seg_cs )
513         {
514             reg->type |= _SEGMENT_CODE >> 8;
515             reg->l = 1;
516         }
517         else
518             reg->db = 1;
519         reg->s   = 1;
520         reg->dpl = 3;
521         reg->p   = 1;
522         reg->g   = 1;
523     }
524 
525     /*
526      * For x86_emulate.c's mode_ring0() to work, fake a DPL of zero.
527      * Also do this for non-conforming code segments, for consistency.
528      */
529     if ( (seg == x86_seg_ss ||
530           (seg == x86_seg_cs &&
531            !(reg->type & (_SEGMENT_EC >> 8)))) &&
532          guest_kernel_mode(current, ctxt->regs) )
533         reg->dpl = 0;
534 
535     return X86EMUL_OKAY;
536 }
537 
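/*
 * Convert a segment:offset pair to a linear address, applying the limit
 * check in compatibility modes and the canonical-address check in 64-bit
 * mode.  Raises #SS for stack-segment failures and #GP otherwise.
 */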
538 static int pv_emul_virt_to_linear(unsigned long base, unsigned long offset,
539                                   unsigned int bytes, unsigned long limit,
540                                   enum x86_segment seg,
541                                   struct x86_emulate_ctxt *ctxt,
542                                   unsigned long *addr)
543 {
544     int rc = X86EMUL_OKAY;
545 
546     *addr = base + offset;
547 
548     if ( ctxt->addr_size < 64 )
549     {
550         if ( limit < bytes - 1 || offset > limit - bytes + 1 )
551             rc = X86EMUL_EXCEPTION;
552         *addr = (uint32_t)*addr;
553     }
554     else if ( !__addr_ok(*addr) )
555         rc = X86EMUL_EXCEPTION;
556 
557     if ( unlikely(rc == X86EMUL_EXCEPTION) )
558         x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
559                                                 : TRAP_stack_error,
560                               0, ctxt);
561 
562     return rc;
563 }
564 
565 static int rep_ins(uint16_t port,
566                    enum x86_segment seg, unsigned long offset,
567                    unsigned int bytes_per_rep, unsigned long *reps,
568                    struct x86_emulate_ctxt *ctxt)
569 {
570     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
571     struct vcpu *curr = current;
572     struct domain *currd = current->domain;
573     unsigned long goal = *reps;
574     struct segment_register sreg;
575     int rc;
576 
577     ASSERT(seg == x86_seg_es);
578 
579     *reps = 0;
580 
581     if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
582         return X86EMUL_UNHANDLEABLE;
583 
584     rc = read_segment(x86_seg_es, &sreg, ctxt);
585     if ( rc != X86EMUL_OKAY )
586         return rc;
587 
588     if ( !sreg.p )
589         return X86EMUL_UNHANDLEABLE;
590     if ( !sreg.s ||
591          (sreg.type & (_SEGMENT_CODE >> 8)) ||
592          !(sreg.type & (_SEGMENT_WR >> 8)) )
593     {
594         x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
595         return X86EMUL_EXCEPTION;
596     }
597 
598     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
599 
600     while ( *reps < goal )
601     {
602         unsigned int data = guest_io_read(port, bytes_per_rep, currd);
603         unsigned long addr;
604 
605         rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
606                                     sreg.limit, x86_seg_es, ctxt, &addr);
607         if ( rc != X86EMUL_OKAY )
608             return rc;
609 
610         if ( (rc = __copy_to_user((void *)addr, &data, bytes_per_rep)) != 0 )
611         {
612             x86_emul_pagefault(PFEC_write_access,
613                                addr + bytes_per_rep - rc, ctxt);
614             return X86EMUL_EXCEPTION;
615         }
616 
617         ++*reps;
618 
619         if ( poc->bpmatch || hypercall_preempt_check() )
620             break;
621 
622         /* x86_emulate() clips the repetition count to ensure we don't wrap. */
623         if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
624             offset -= bytes_per_rep;
625         else
626             offset += bytes_per_rep;
627     }
628 
629     return X86EMUL_OKAY;
630 }
631 
632 static int rep_outs(enum x86_segment seg, unsigned long offset,
633                     uint16_t port,
634                     unsigned int bytes_per_rep, unsigned long *reps,
635                     struct x86_emulate_ctxt *ctxt)
636 {
637     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
638     struct vcpu *curr = current;
639     struct domain *currd = current->domain;
640     unsigned long goal = *reps;
641     struct segment_register sreg;
642     int rc;
643 
644     *reps = 0;
645 
646     if ( !guest_io_okay(port, bytes_per_rep, curr, ctxt->regs) )
647         return X86EMUL_UNHANDLEABLE;
648 
649     rc = read_segment(seg, &sreg, ctxt);
650     if ( rc != X86EMUL_OKAY )
651         return rc;
652 
653     if ( !sreg.p )
654         return X86EMUL_UNHANDLEABLE;
655     if ( !sreg.s ||
656          ((sreg.type & (_SEGMENT_CODE >> 8)) &&
657           !(sreg.type & (_SEGMENT_WR >> 8))) )
658     {
659         x86_emul_hw_exception(seg != x86_seg_ss ? TRAP_gp_fault
660                                                 : TRAP_stack_error,
661                               0, ctxt);
662         return X86EMUL_EXCEPTION;
663     }
664 
665     poc->bpmatch = check_guest_io_breakpoint(curr, port, bytes_per_rep);
666 
667     while ( *reps < goal )
668     {
669         unsigned int data = 0;
670         unsigned long addr;
671 
672         rc = pv_emul_virt_to_linear(sreg.base, offset, bytes_per_rep,
673                                     sreg.limit, seg, ctxt, &addr);
674         if ( rc != X86EMUL_OKAY )
675             return rc;
676 
677         if ( (rc = __copy_from_user(&data, (void *)addr, bytes_per_rep)) != 0 )
678         {
679             x86_emul_pagefault(0, addr + bytes_per_rep - rc, ctxt);
680             return X86EMUL_EXCEPTION;
681         }
682 
683         guest_io_write(port, bytes_per_rep, data, currd);
684 
685         ++*reps;
686 
687         if ( poc->bpmatch || hypercall_preempt_check() )
688             break;
689 
690         /* x86_emulate() clips the repetition count to ensure we don't wrap. */
691         if ( unlikely(ctxt->regs->eflags & X86_EFLAGS_DF) )
692             offset -= bytes_per_rep;
693         else
694             offset += bytes_per_rep;
695     }
696 
697     return X86EMUL_OKAY;
698 }
699 
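/*
 * CR reads: CR2 and CR4 come from the vcpu's shadowed registers, CR0
 * merges the shadow with the real register, and CR3 is translated back
 * from the active page-table MFN to the guest-visible frame number.
 */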
700 static int read_cr(unsigned int reg, unsigned long *val,
701                    struct x86_emulate_ctxt *ctxt)
702 {
703     const struct vcpu *curr = current;
704 
705     switch ( reg )
706     {
707     case 0: /* Read CR0 */
708         *val = (read_cr0() & ~X86_CR0_TS) | curr->arch.pv_vcpu.ctrlreg[0];
709         return X86EMUL_OKAY;
710 
711     case 2: /* Read CR2 */
712     case 4: /* Read CR4 */
713         *val = curr->arch.pv_vcpu.ctrlreg[reg];
714         return X86EMUL_OKAY;
715 
716     case 3: /* Read CR3 */
717     {
718         const struct domain *currd = curr->domain;
719         mfn_t mfn;
720 
721         if ( !is_pv_32bit_domain(currd) )
722         {
723             mfn = pagetable_get_mfn(curr->arch.guest_table);
724             *val = xen_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn)));
725         }
726         else
727         {
728             l4_pgentry_t *pl4e =
729                 map_domain_page(pagetable_get_mfn(curr->arch.guest_table));
730 
731             mfn = l4e_get_mfn(*pl4e);
732             unmap_domain_page(pl4e);
733             *val = compat_pfn_to_cr3(mfn_to_gmfn(currd, mfn_x(mfn)));
734         }
735         /* PTs should not be shared */
736         BUG_ON(page_get_owner(mfn_to_page(mfn)) == dom_cow);
737         return X86EMUL_OKAY;
738     }
739     }
740 
741     return X86EMUL_UNHANDLEABLE;
742 }
743 
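/*
 * CR writes: CR0 may only toggle TS, CR2 just updates the shadow, CR3
 * switches to a new guest top-level page table via new_guest_cr3(), and
 * CR4 is filtered through pv_guest_cr4_fixup() before being loaded.
 */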
744 static int write_cr(unsigned int reg, unsigned long val,
745                     struct x86_emulate_ctxt *ctxt)
746 {
747     struct vcpu *curr = current;
748 
749     switch ( reg )
750     {
751     case 0: /* Write CR0 */
752         if ( (val ^ read_cr0()) & ~X86_CR0_TS )
753         {
754             gdprintk(XENLOG_WARNING,
755                      "Attempt to change unmodifiable CR0 flags\n");
756             break;
757         }
758         do_fpu_taskswitch(!!(val & X86_CR0_TS));
759         return X86EMUL_OKAY;
760 
761     case 2: /* Write CR2 */
762         curr->arch.pv_vcpu.ctrlreg[2] = val;
763         arch_set_cr2(curr, val);
764         return X86EMUL_OKAY;
765 
766     case 3: /* Write CR3 */
767     {
768         struct domain *currd = curr->domain;
769         unsigned long gfn;
770         struct page_info *page;
771         int rc;
772 
773         gfn = !is_pv_32bit_domain(currd)
774               ? xen_cr3_to_pfn(val) : compat_cr3_to_pfn(val);
775         page = get_page_from_gfn(currd, gfn, NULL, P2M_ALLOC);
776         if ( !page )
777             break;
778         rc = new_guest_cr3(page_to_mfn(page));
779         put_page(page);
780 
781         switch ( rc )
782         {
783         case 0:
784             return X86EMUL_OKAY;
785         case -ERESTART: /* retry after preemption */
786             return X86EMUL_RETRY;
787         }
788         break;
789     }
790 
791     case 4: /* Write CR4 */
792         curr->arch.pv_vcpu.ctrlreg[4] = pv_guest_cr4_fixup(curr, val);
793         write_cr4(pv_guest_cr4_to_real_cr4(curr));
794         ctxt_switch_levelling(curr);
795         return X86EMUL_OKAY;
796     }
797 
798     return X86EMUL_UNHANDLEABLE;
799 }
800 
801 static int read_dr(unsigned int reg, unsigned long *val,
802                    struct x86_emulate_ctxt *ctxt)
803 {
804     unsigned long res = do_get_debugreg(reg);
805 
806     if ( IS_ERR_VALUE(res) )
807         return X86EMUL_UNHANDLEABLE;
808 
809     *val = res;
810 
811     return X86EMUL_OKAY;
812 }
813 
814 static int write_dr(unsigned int reg, unsigned long val,
815                     struct x86_emulate_ctxt *ctxt)
816 {
817     return do_set_debugreg(reg, val) == 0
818            ? X86EMUL_OKAY : X86EMUL_UNHANDLEABLE;
819 }
820 
821 static inline uint64_t guest_misc_enable(uint64_t val)
822 {
823     val &= ~(MSR_IA32_MISC_ENABLE_PERF_AVAIL |
824              MSR_IA32_MISC_ENABLE_MONITOR_ENABLE);
825     val |= MSR_IA32_MISC_ENABLE_BTS_UNAVAIL |
826            MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL |
827            MSR_IA32_MISC_ENABLE_XTPR_DISABLE;
828     return val;
829 }
830 
831 static inline bool is_cpufreq_controller(const struct domain *d)
832 {
833     return ((cpufreq_controller == FREQCTL_dom0_kernel) &&
834             is_hardware_domain(d));
835 }
836 
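/*
 * RDMSR emulation.  guest_rdmsr() gets first refusal; what remains is
 * legacy PV special-casing, with unrecognised MSRs falling through to a
 * (possibly faulting) direct hardware read.
 */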
837 static int read_msr(unsigned int reg, uint64_t *val,
838                     struct x86_emulate_ctxt *ctxt)
839 {
840     struct priv_op_ctxt *poc = container_of(ctxt, struct priv_op_ctxt, ctxt);
841     const struct vcpu *curr = current;
842     const struct domain *currd = curr->domain;
843     bool vpmu_msr = false;
844     int ret;
845 
846     if ( (ret = guest_rdmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
847     {
848         if ( ret == X86EMUL_EXCEPTION )
849             x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
850 
851         return ret;
852     }
853 
854     switch ( reg )
855     {
856         int rc;
857 
858     case MSR_FS_BASE:
859         if ( is_pv_32bit_domain(currd) )
860             break;
861         *val = cpu_has_fsgsbase ? __rdfsbase() : curr->arch.pv_vcpu.fs_base;
862         return X86EMUL_OKAY;
863 
864     case MSR_GS_BASE:
865         if ( is_pv_32bit_domain(currd) )
866             break;
867         *val = cpu_has_fsgsbase ? __rdgsbase()
868                                 : curr->arch.pv_vcpu.gs_base_kernel;
869         return X86EMUL_OKAY;
870 
871     case MSR_SHADOW_GS_BASE:
872         if ( is_pv_32bit_domain(currd) )
873             break;
874         *val = curr->arch.pv_vcpu.gs_base_user;
875         return X86EMUL_OKAY;
876 
877     /*
878      * In order to fully retain original behavior, defer calling
879      * pv_soft_rdtsc() until after emulation. This may want/need to be
880      * reconsidered.
881      */
882     case MSR_IA32_TSC:
883         poc->tsc |= TSC_BASE;
884         goto normal;
885 
886     case MSR_TSC_AUX:
887         poc->tsc |= TSC_AUX;
888         if ( cpu_has_rdtscp )
889             goto normal;
890         *val = 0;
891         return X86EMUL_OKAY;
892 
893     case MSR_EFER:
894         *val = read_efer();
895         if ( is_pv_32bit_domain(currd) )
896             *val &= ~(EFER_LME | EFER_LMA | EFER_LMSLE);
897         return X86EMUL_OKAY;
898 
899     case MSR_K7_FID_VID_CTL:
900     case MSR_K7_FID_VID_STATUS:
901     case MSR_K8_PSTATE_LIMIT:
902     case MSR_K8_PSTATE_CTRL:
903     case MSR_K8_PSTATE_STATUS:
904     case MSR_K8_PSTATE0:
905     case MSR_K8_PSTATE1:
906     case MSR_K8_PSTATE2:
907     case MSR_K8_PSTATE3:
908     case MSR_K8_PSTATE4:
909     case MSR_K8_PSTATE5:
910     case MSR_K8_PSTATE6:
911     case MSR_K8_PSTATE7:
912         if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
913             break;
914         if ( unlikely(is_cpufreq_controller(currd)) )
915             goto normal;
916         *val = 0;
917         return X86EMUL_OKAY;
918 
919     case MSR_IA32_UCODE_REV:
920         BUILD_BUG_ON(MSR_IA32_UCODE_REV != MSR_AMD_PATCHLEVEL);
921         if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
922         {
923             if ( wrmsr_safe(MSR_IA32_UCODE_REV, 0) )
924                 break;
925             /* As documented in the SDM: Do a CPUID 1 here */
926             cpuid_eax(1);
927         }
928         goto normal;
929 
930     case MSR_IA32_MISC_ENABLE:
931         if ( rdmsr_safe(reg, *val) )
932             break;
933         *val = guest_misc_enable(*val);
934         return X86EMUL_OKAY;
935 
936     case MSR_AMD64_DR0_ADDRESS_MASK:
937         if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
938             break;
939         *val = curr->arch.pv_vcpu.dr_mask[0];
940         return X86EMUL_OKAY;
941 
942     case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
943         if ( !boot_cpu_has(X86_FEATURE_DBEXT) )
944             break;
945         *val = curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1];
946         return X86EMUL_OKAY;
947 
948     case MSR_IA32_PERF_CAPABILITIES:
949         /* No extra capabilities are supported. */
950         *val = 0;
951         return X86EMUL_OKAY;
952 
953     case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
954     case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
955     case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
956     case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
957         if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
958         {
959             vpmu_msr = true;
960             /* fall through */
961     case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
962     case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
963             if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
964             {
965                 if ( vpmu_do_rdmsr(reg, val) )
966                     break;
967                 return X86EMUL_OKAY;
968             }
969         }
970         /* fall through */
971     default:
972         if ( rdmsr_hypervisor_regs(reg, val) )
973             return X86EMUL_OKAY;
974 
975         rc = vmce_rdmsr(reg, val);
976         if ( rc < 0 )
977             break;
978         if ( rc )
979             return X86EMUL_OKAY;
980         /* fall through */
981     normal:
982         /* Everyone can read the MSR space. */
983         /* gdprintk(XENLOG_WARNING, "Domain attempted RDMSR %08x\n", reg); */
984         if ( rdmsr_safe(reg, *val) )
985             break;
986         return X86EMUL_OKAY;
987     }
988 
989     return X86EMUL_UNHANDLEABLE;
990 }
991 
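/*
 * WRMSR emulation, mirroring read_msr() above: guest_wrmsr() is tried
 * first, then the PV special cases; writes which would change unexpected
 * hardware state are logged and otherwise dropped.
 */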
992 static int write_msr(unsigned int reg, uint64_t val,
993                      struct x86_emulate_ctxt *ctxt)
994 {
995     struct vcpu *curr = current;
996     const struct domain *currd = curr->domain;
997     bool vpmu_msr = false;
998     int ret;
999 
1000     if ( (ret = guest_wrmsr(curr, reg, val)) != X86EMUL_UNHANDLEABLE )
1001     {
1002         if ( ret == X86EMUL_EXCEPTION )
1003             x86_emul_hw_exception(TRAP_gp_fault, 0, ctxt);
1004 
1005         return ret;
1006     }
1007 
1008     switch ( reg )
1009     {
1010         uint64_t temp;
1011         int rc;
1012 
1013     case MSR_FS_BASE:
1014         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
1015             break;
1016         wrfsbase(val);
1017         curr->arch.pv_vcpu.fs_base = val;
1018         return X86EMUL_OKAY;
1019 
1020     case MSR_GS_BASE:
1021         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
1022             break;
1023         wrgsbase(val);
1024         curr->arch.pv_vcpu.gs_base_kernel = val;
1025         return X86EMUL_OKAY;
1026 
1027     case MSR_SHADOW_GS_BASE:
1028         if ( is_pv_32bit_domain(currd) || !is_canonical_address(val) )
1029             break;
1030         wrmsrl(MSR_SHADOW_GS_BASE, val);
1031         curr->arch.pv_vcpu.gs_base_user = val;
1032         return X86EMUL_OKAY;
1033 
1034     case MSR_K7_FID_VID_STATUS:
1035     case MSR_K7_FID_VID_CTL:
1036     case MSR_K8_PSTATE_LIMIT:
1037     case MSR_K8_PSTATE_CTRL:
1038     case MSR_K8_PSTATE_STATUS:
1039     case MSR_K8_PSTATE0:
1040     case MSR_K8_PSTATE1:
1041     case MSR_K8_PSTATE2:
1042     case MSR_K8_PSTATE3:
1043     case MSR_K8_PSTATE4:
1044     case MSR_K8_PSTATE5:
1045     case MSR_K8_PSTATE6:
1046     case MSR_K8_PSTATE7:
1047     case MSR_K8_HWCR:
1048         if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD )
1049             break;
1050         if ( likely(!is_cpufreq_controller(currd)) ||
1051              wrmsr_safe(reg, val) == 0 )
1052             return X86EMUL_OKAY;
1053         break;
1054 
1055     case MSR_AMD64_NB_CFG:
1056         if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
1057              boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
1058             break;
1059         if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
1060             return X86EMUL_OKAY;
1061         if ( (rdmsr_safe(MSR_AMD64_NB_CFG, temp) != 0) ||
1062              ((val ^ temp) & ~(1ULL << AMD64_NB_CFG_CF8_EXT_ENABLE_BIT)) )
1063             goto invalid;
1064         if ( wrmsr_safe(MSR_AMD64_NB_CFG, val) == 0 )
1065             return X86EMUL_OKAY;
1066         break;
1067 
1068     case MSR_FAM10H_MMIO_CONF_BASE:
1069         if ( boot_cpu_data.x86_vendor != X86_VENDOR_AMD ||
1070              boot_cpu_data.x86 < 0x10 || boot_cpu_data.x86 > 0x17 )
1071             break;
1072         if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
1073             return X86EMUL_OKAY;
1074         if ( rdmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, temp) != 0 )
1075             break;
1076         if ( (pci_probe & PCI_PROBE_MASK) == PCI_PROBE_MMCONF ?
1077              temp != val :
1078              ((temp ^ val) &
1079               ~(FAM10H_MMIO_CONF_ENABLE |
1080                 (FAM10H_MMIO_CONF_BUSRANGE_MASK <<
1081                  FAM10H_MMIO_CONF_BUSRANGE_SHIFT) |
1082                 ((u64)FAM10H_MMIO_CONF_BASE_MASK <<
1083                  FAM10H_MMIO_CONF_BASE_SHIFT))) )
1084             goto invalid;
1085         if ( wrmsr_safe(MSR_FAM10H_MMIO_CONF_BASE, val) == 0 )
1086             return X86EMUL_OKAY;
1087         break;
1088 
1089     case MSR_IA32_UCODE_REV:
1090         if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
1091             break;
1092         if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) )
1093             return X86EMUL_OKAY;
1094         if ( rdmsr_safe(reg, temp) )
1095             break;
1096         if ( val )
1097             goto invalid;
1098         return X86EMUL_OKAY;
1099 
1100     case MSR_IA32_MISC_ENABLE:
1101         if ( rdmsr_safe(reg, temp) )
1102             break;
1103         if ( val != guest_misc_enable(temp) )
1104             goto invalid;
1105         return X86EMUL_OKAY;
1106 
1107     case MSR_IA32_MPERF:
1108     case MSR_IA32_APERF:
1109         if ( (boot_cpu_data.x86_vendor != X86_VENDOR_INTEL) &&
1110              (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) )
1111             break;
1112         if ( likely(!is_cpufreq_controller(currd)) ||
1113              wrmsr_safe(reg, val) == 0 )
1114             return X86EMUL_OKAY;
1115         break;
1116 
1117     case MSR_IA32_PERF_CTL:
1118         if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
1119             break;
1120         if ( likely(!is_cpufreq_controller(currd)) ||
1121              wrmsr_safe(reg, val) == 0 )
1122             return X86EMUL_OKAY;
1123         break;
1124 
1125     case MSR_IA32_THERM_CONTROL:
1126     case MSR_IA32_ENERGY_PERF_BIAS:
1127         if ( boot_cpu_data.x86_vendor != X86_VENDOR_INTEL )
1128             break;
1129         if ( !is_hardware_domain(currd) || !is_pinned_vcpu(curr) ||
1130              wrmsr_safe(reg, val) == 0 )
1131             return X86EMUL_OKAY;
1132         break;
1133 
1134     case MSR_AMD64_DR0_ADDRESS_MASK:
1135         if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
1136             break;
1137         curr->arch.pv_vcpu.dr_mask[0] = val;
1138         if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
1139             wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, val);
1140         return X86EMUL_OKAY;
1141 
1142     case MSR_AMD64_DR1_ADDRESS_MASK ... MSR_AMD64_DR3_ADDRESS_MASK:
1143         if ( !boot_cpu_has(X86_FEATURE_DBEXT) || (val >> 32) )
1144             break;
1145         curr->arch.pv_vcpu.dr_mask[reg - MSR_AMD64_DR1_ADDRESS_MASK + 1] = val;
1146         if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
1147             wrmsrl(reg, val);
1148         return X86EMUL_OKAY;
1149 
1150     case MSR_P6_PERFCTR(0) ... MSR_P6_PERFCTR(7):
1151     case MSR_P6_EVNTSEL(0) ... MSR_P6_EVNTSEL(3):
1152     case MSR_CORE_PERF_FIXED_CTR0 ... MSR_CORE_PERF_FIXED_CTR2:
1153     case MSR_CORE_PERF_FIXED_CTR_CTRL ... MSR_CORE_PERF_GLOBAL_OVF_CTRL:
1154         if ( boot_cpu_data.x86_vendor == X86_VENDOR_INTEL )
1155         {
1156             vpmu_msr = true;
1157     case MSR_AMD_FAM15H_EVNTSEL0 ... MSR_AMD_FAM15H_PERFCTR5:
1158     case MSR_K7_EVNTSEL0 ... MSR_K7_PERFCTR3:
1159             if ( vpmu_msr || (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) )
1160             {
1161                 if ( (vpmu_mode & XENPMU_MODE_ALL) &&
1162                      !is_hardware_domain(currd) )
1163                     return X86EMUL_OKAY;
1164 
1165                 if ( vpmu_do_wrmsr(reg, val, 0) )
1166                     break;
1167                 return X86EMUL_OKAY;
1168             }
1169         }
1170         /* fall through */
1171     default:
1172         if ( wrmsr_hypervisor_regs(reg, val) == 1 )
1173             return X86EMUL_OKAY;
1174 
1175         rc = vmce_wrmsr(reg, val);
1176         if ( rc < 0 )
1177             break;
1178         if ( rc )
1179             return X86EMUL_OKAY;
1180 
1181         if ( (rdmsr_safe(reg, temp) != 0) || (val != temp) )
1182     invalid:
1183             gdprintk(XENLOG_WARNING,
1184                      "Domain attempted WRMSR %08x from 0x%016"PRIx64" to 0x%016"PRIx64"\n",
1185                      reg, temp, val);
1186         return X86EMUL_OKAY;
1187     }
1188 
1189     return X86EMUL_UNHANDLEABLE;
1190 }
1191 
1192 /* Name it differently to avoid clashing with wbinvd() */
1193 static int _wbinvd(struct x86_emulate_ctxt *ctxt)
1194 {
1195     /* Ignore the instruction if unprivileged. */
1196     if ( !cache_flush_permitted(current->domain) )
1197         /*
1198          * Non-physdev domain attempted WBINVD; ignore for now since
1199          * newer linux uses this in some start-of-day timing loops.
1200          */
1201         ;
1202     else
1203         wbinvd();
1204 
1205     return X86EMUL_OKAY;
1206 }
1207 
1208 int pv_emul_cpuid(uint32_t leaf, uint32_t subleaf,
1209                   struct cpuid_leaf *res, struct x86_emulate_ctxt *ctxt)
1210 {
1211     guest_cpuid(current, leaf, subleaf, res);
1212 
1213     return X86EMUL_OKAY;
1214 }
1215 
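/*
 * Restrict emulation to the privileged instructions we mean to handle;
 * anything else is rejected before any guest state is modified.
 */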
1216 static int validate(const struct x86_emulate_state *state,
1217                     struct x86_emulate_ctxt *ctxt)
1218 {
1219     switch ( ctxt->opcode )
1220     {
1221     case 0x6c ... 0x6f: /* ins / outs */
1222     case 0xe4 ... 0xe7: /* in / out (immediate port) */
1223     case 0xec ... 0xef: /* in / out (port in %dx) */
1224     case X86EMUL_OPC(0x0f, 0x06): /* clts */
1225     case X86EMUL_OPC(0x0f, 0x09): /* wbinvd */
1226     case X86EMUL_OPC(0x0f, 0x20) ...
1227          X86EMUL_OPC(0x0f, 0x23): /* mov to/from cr/dr */
1228     case X86EMUL_OPC(0x0f, 0x30): /* wrmsr */
1229     case X86EMUL_OPC(0x0f, 0x31): /* rdtsc */
1230     case X86EMUL_OPC(0x0f, 0x32): /* rdmsr */
1231     case X86EMUL_OPC(0x0f, 0xa2): /* cpuid */
1232         return X86EMUL_OKAY;
1233 
1234     case 0xfa: case 0xfb: /* cli / sti */
1235         if ( !iopl_ok(current, ctxt->regs) )
1236             break;
1237         /*
1238          * This is just too dangerous to allow, in my opinion. Consider if the
1239          * caller then tries to reenable interrupts using POPF: we can't trap
1240          * that and we'll end up with hard-to-debug lockups. Fast & loose will
1241          * do for us. :-)
1242         vcpu_info(current, evtchn_upcall_mask) = (ctxt->opcode == 0xfa);
1243          */
1244         return X86EMUL_DONE;
1245 
1246     case X86EMUL_OPC(0x0f, 0x01):
1247     {
1248         unsigned int modrm_rm, modrm_reg;
1249 
1250         if ( x86_insn_modrm(state, &modrm_rm, &modrm_reg) != 3 ||
1251              (modrm_rm & 7) != 1 )
1252             break;
1253         switch ( modrm_reg & 7 )
1254         {
1255         case 2: /* xsetbv */
1256         case 7: /* rdtscp */
1257             return X86EMUL_OKAY;
1258         }
1259         break;
1260     }
1261     }
1262 
1263     return X86EMUL_UNHANDLEABLE;
1264 }
1265 
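/*
 * Fetch instruction bytes from the guest using the CS base/limit cached
 * in the emulation context.
 */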
1266 static int insn_fetch(enum x86_segment seg,
1267                       unsigned long offset,
1268                       void *p_data,
1269                       unsigned int bytes,
1270                       struct x86_emulate_ctxt *ctxt)
1271 {
1272     const struct priv_op_ctxt *poc =
1273         container_of(ctxt, struct priv_op_ctxt, ctxt);
1274     unsigned int rc;
1275     unsigned long addr = poc->cs.base + offset;
1276 
1277     ASSERT(seg == x86_seg_cs);
1278 
1279     /* We don't mean to emulate any branches. */
1280     if ( !bytes )
1281         return X86EMUL_UNHANDLEABLE;
1282 
1283     rc = pv_emul_virt_to_linear(poc->cs.base, offset, bytes, poc->cs.limit,
1284                                 x86_seg_cs, ctxt, &addr);
1285     if ( rc != X86EMUL_OKAY )
1286         return rc;
1287 
1288     if ( (rc = __copy_from_user(p_data, (void *)addr, bytes)) != 0 )
1289     {
1290         /*
1291          * TODO: This should report PFEC_insn_fetch when cpu_has_nx, but
1292          * we'd then need a "fetch" variant of
1293          * __copy_from_user() respecting NX, SMEP, and protection keys.
1294          */
1295         x86_emul_pagefault(0, addr + bytes - rc, ctxt);
1296         return X86EMUL_EXCEPTION;
1297     }
1298 
1299     return X86EMUL_OKAY;
1300 }
1301 
1302 
1303 static const struct x86_emulate_ops priv_op_ops = {
1304     .insn_fetch          = insn_fetch,
1305     .read                = x86emul_unhandleable_rw,
1306     .validate            = validate,
1307     .read_io             = read_io,
1308     .write_io            = write_io,
1309     .rep_ins             = rep_ins,
1310     .rep_outs            = rep_outs,
1311     .read_segment        = read_segment,
1312     .read_cr             = read_cr,
1313     .write_cr            = write_cr,
1314     .read_dr             = read_dr,
1315     .write_dr            = write_dr,
1316     .read_msr            = read_msr,
1317     .write_msr           = write_msr,
1318     .cpuid               = pv_emul_cpuid,
1319     .wbinvd              = _wbinvd,
1320 };
1321 
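/*
 * Top-level entry point: validate the faulting context's CS descriptor,
 * mirror the virtualised interrupt flag and IOPL into regs->eflags for
 * the emulator's benefit, run x86_emulate() with the hooks above, and
 * fold any deferred RDTSC handling, breakpoint matches and exceptions
 * back into guest state.
 */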
1322 int pv_emulate_privileged_op(struct cpu_user_regs *regs)
1323 {
1324     struct vcpu *curr = current;
1325     struct domain *currd = curr->domain;
1326     struct priv_op_ctxt ctxt = {
1327         .ctxt.regs = regs,
1328         .ctxt.vendor = currd->arch.cpuid->x86_vendor,
1329         .ctxt.lma = !is_pv_32bit_domain(currd),
1330     };
1331     int rc;
1332     unsigned int eflags, ar;
1333 
1334     if ( !pv_emul_read_descriptor(regs->cs, curr, &ctxt.cs.base,
1335                                   &ctxt.cs.limit, &ar, 1) ||
1336          !(ar & _SEGMENT_S) ||
1337          !(ar & _SEGMENT_P) ||
1338          !(ar & _SEGMENT_CODE) )
1339         return 0;
1340 
1341     /* Mirror virtualized state into EFLAGS. */
1342     ASSERT(regs->eflags & X86_EFLAGS_IF);
1343     if ( vcpu_info(curr, evtchn_upcall_mask) )
1344         regs->eflags &= ~X86_EFLAGS_IF;
1345     else
1346         regs->eflags |= X86_EFLAGS_IF;
1347     ASSERT(!(regs->eflags & X86_EFLAGS_IOPL));
1348     regs->eflags |= curr->arch.pv_vcpu.iopl;
1349     eflags = regs->eflags;
1350 
1351     ctxt.ctxt.addr_size = ar & _SEGMENT_L ? 64 : ar & _SEGMENT_DB ? 32 : 16;
1352     /* Leave zero in ctxt.ctxt.sp_size, as it's not needed. */
1353     rc = x86_emulate(&ctxt.ctxt, &priv_op_ops);
1354 
1355     if ( ctxt.io_emul_stub )
1356         unmap_domain_page(ctxt.io_emul_stub);
1357 
1358     /*
1359      * Un-mirror virtualized state from EFLAGS.
1360      * Nothing we allow to be emulated can change anything other than the
1361      * arithmetic bits, and the resume flag.
1362      */
1363     ASSERT(!((regs->eflags ^ eflags) &
1364              ~(X86_EFLAGS_RF | X86_EFLAGS_ARITH_MASK)));
1365     regs->eflags |= X86_EFLAGS_IF;
1366     regs->eflags &= ~X86_EFLAGS_IOPL;
1367 
1368     switch ( rc )
1369     {
1370     case X86EMUL_OKAY:
1371         if ( ctxt.tsc & TSC_BASE )
1372         {
1373             if ( ctxt.tsc & TSC_AUX )
1374                 pv_soft_rdtsc(curr, regs, 1);
1375             else if ( currd->arch.vtsc )
1376                 pv_soft_rdtsc(curr, regs, 0);
1377             else
1378                 msr_split(regs, rdtsc());
1379         }
1380 
1381         if ( ctxt.ctxt.retire.singlestep )
1382             ctxt.bpmatch |= DR_STEP;
1383         if ( ctxt.bpmatch )
1384         {
1385             curr->arch.debugreg[6] |= ctxt.bpmatch | DR_STATUS_RESERVED_ONE;
1386             if ( !(curr->arch.pv_vcpu.trap_bounce.flags & TBF_EXCEPTION) )
1387                 pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
1388         }
1389         /* fall through */
1390     case X86EMUL_RETRY:
1391         return EXCRET_fault_fixed;
1392 
1393     case X86EMUL_EXCEPTION:
1394         pv_inject_event(&ctxt.ctxt.event);
1395         return EXCRET_fault_fixed;
1396     }
1397 
1398     return 0;
1399 }
1400 
1401 /*
1402  * Local variables:
1403  * mode: C
1404  * c-file-style: "BSD"
1405  * c-basic-offset: 4
1406  * tab-width: 4
1407  * indent-tabs-mode: nil
1408  * End:
1409  */
1410