1 /******************************************************************************
2  * arch/x86/traps.c
3  *
4  * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 /*
21  *  Copyright (C) 1991, 1992  Linus Torvalds
22  *
23  *  Pentium III FXSR, SSE support
24  * Gareth Hughes <gareth@valinux.com>, May 2000
25  */
26 
27 #include <xen/init.h>
28 #include <xen/sched.h>
29 #include <xen/lib.h>
30 #include <xen/err.h>
31 #include <xen/errno.h>
32 #include <xen/mm.h>
33 #include <xen/console.h>
34 #include <xen/shutdown.h>
35 #include <xen/guest_access.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/version.h>
47 #include <xen/kexec.h>
48 #include <xen/trace.h>
49 #include <xen/paging.h>
50 #include <xen/virtual_region.h>
51 #include <xen/watchdog.h>
52 #include <xen/livepatch.h>
53 #include <asm/system.h>
54 #include <asm/io.h>
55 #include <asm/atomic.h>
56 #include <xen/bitops.h>
57 #include <asm/desc.h>
58 #include <asm/debugreg.h>
59 #include <asm/smp.h>
60 #include <asm/flushtlb.h>
61 #include <asm/uaccess.h>
62 #include <asm/i387.h>
63 #include <asm/xstate.h>
64 #include <asm/debugger.h>
65 #include <asm/msr.h>
66 #include <asm/nmi.h>
67 #include <asm/shared.h>
68 #include <asm/x86_emulate.h>
69 #include <asm/traps.h>
70 #include <asm/hvm/vpt.h>
71 #include <asm/hypercall.h>
72 #include <asm/mce.h>
73 #include <asm/apic.h>
74 #include <asm/mc146818rtc.h>
75 #include <asm/hpet.h>
76 #include <asm/vpmu.h>
77 #include <public/arch-x86/cpuid.h>
78 #include <asm/cpuid.h>
79 #include <xsm/xsm.h>
80 #include <asm/pv/traps.h>
81 #include <asm/pv/mm.h>
82 
83 /*
84  * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
85  *  fatal:  Xen prints a diagnostic message and then hangs.
86  *  dom0:   The NMI is virtualised to DOM0.
87  *  ignore: The NMI error is cleared and ignored.
88  */
89 #ifdef NDEBUG
90 static char __read_mostly opt_nmi[10] = "dom0";
91 #else
92 static char __read_mostly opt_nmi[10] = "fatal";
93 #endif
94 string_param("nmi", opt_nmi);
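/*
 * Illustrative usage only (hypothetical command line): the NMI policy is
 * chosen on the Xen boot line, e.g.
 *
 *     xen.gz ... nmi=dom0
 *
 * The switch statements further down key off the first character, so any
 * value not starting with 'd' or 'i' behaves like "fatal".
 */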
95 
96 DEFINE_PER_CPU(u64, efer);
97 static DEFINE_PER_CPU(unsigned long, last_extable_addr);
98 
99 DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
100 
101 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
102 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
103 
104 /* Master table, used by CPU0. */
105 idt_entry_t idt_table[IDT_ENTRIES];
106 
107 /* Pointer to the IDT of every CPU. */
108 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
109 
110 void (*ioemul_handle_quirk)(
111     u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
112 
113 static int debug_stack_lines = 20;
114 integer_param("debug_stack_lines", debug_stack_lines);
115 
116 static bool opt_ler;
117 boolean_param("ler", opt_ler);
118 
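/*
 * Helpers for the stack dumps below: stack_words_per_line bounds how many
 * words are printed per line of output, and ESP_BEFORE_EXCEPTION() recovers
 * the stack pointer at the point the exception was taken (the x86-64
 * exception frame always contains %rsp, so regs->rsp can be used directly).
 */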
119 #define stack_words_per_line 4
120 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
121 
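/*
 * Dump the 8 bytes preceding %rip and the 16 bytes starting at %rip.
 * Example output (addresses and bytes below are made up):
 *
 *   Xen code around <ffff82d080123456> (do_trap):
 *    48 89 e5 41 54 53 48 83 <ec> 08 0f 0b 90 90 48 8b 05 44 33 22 11 90 90 c3
 *
 * Bytes which could not be read are printed as "--".
 */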
122 static void show_code(const struct cpu_user_regs *regs)
123 {
124     unsigned char insns_before[8] = {}, insns_after[16] = {};
125     unsigned int i, tmp, missing_before, missing_after;
126 
127     if ( guest_mode(regs) )
128         return;
129 
130     stac();
131 
132     /*
133      * Copy forward from regs->rip.  In the case of a fault, %ecx contains the
134      * number of bytes remaining to copy.
135      */
136     asm volatile ("1: rep movsb; 2:"
137                   _ASM_EXTABLE(1b, 2b)
138                   : "=&c" (missing_after),
139                     "=&D" (tmp), "=&S" (tmp)
140                   : "0" (ARRAY_SIZE(insns_after)),
141                     "1" (insns_after),
142                     "2" (regs->rip));
143 
144     /*
145      * Copy backwards from regs->rip - 1.  In the case of a fault, %ecx
146      * contains the number of bytes remaining to copy.
147      */
148     asm volatile ("std;"
149                   "1: rep movsb;"
150                   "2: cld;"
151                   _ASM_EXTABLE(1b, 2b)
152                   : "=&c" (missing_before),
153                     "=&D" (tmp), "=&S" (tmp)
154                   : "0" (ARRAY_SIZE(insns_before)),
155                     "1" (insns_before + ARRAY_SIZE(insns_before) - 1),
156                     "2" (regs->rip - 1));
157     clac();
158 
159     printk("Xen code around <%p> (%ps)%s:\n",
160            _p(regs->rip), _p(regs->rip),
161            (missing_before || missing_after) ? " [fault on access]" : "");
162 
163     /* Print bytes from insns_before[]. */
164     for ( i = 0; i < ARRAY_SIZE(insns_before); ++i )
165     {
166         if ( i < missing_before )
167             printk(" --");
168         else
169             printk(" %02x", insns_before[i]);
170     }
171 
172     /* Print the byte under %rip. */
173     if ( missing_after != ARRAY_SIZE(insns_after) )
174         printk(" <%02x>", insns_after[0]);
175     else
176         printk(" <-->");
177 
178     /* Print bytes from insns_after[]. */
179     for ( i = 1; i < ARRAY_SIZE(insns_after); ++i )
180     {
181         if ( i < (ARRAY_SIZE(insns_after) - missing_after) )
182             printk(" %02x", insns_after[i]);
183         else
184             printk(" --");
185     }
186 
187     printk("\n");
188 }
189 
190 static void compat_show_guest_stack(struct vcpu *v,
191                                     const struct cpu_user_regs *regs,
192                                     int debug_stack_lines)
193 {
194     unsigned int i, *stack, addr, mask = STACK_SIZE;
195 
196     stack = (unsigned int *)(unsigned long)regs->esp;
197     printk("Guest stack trace from esp=%08lx:\n ", (unsigned long)stack);
198 
199     if ( !__compat_access_ok(v->domain, stack, sizeof(*stack)) )
200     {
201         printk("Guest-inaccessible memory.\n");
202         return;
203     }
204 
205     if ( v != current )
206     {
207         struct vcpu *vcpu;
208         unsigned long mfn;
209 
210         ASSERT(guest_kernel_mode(v, regs));
211         mfn = read_cr3() >> PAGE_SHIFT;
212         for_each_vcpu( v->domain, vcpu )
213             if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn )
214                 break;
215         if ( !vcpu )
216         {
217             stack = do_page_walk(v, (unsigned long)stack);
218             if ( (unsigned long)stack < PAGE_SIZE )
219             {
220                 printk("Inaccessible guest memory.\n");
221                 return;
222             }
223             mask = PAGE_SIZE;
224         }
225     }
226 
227     for ( i = 0; i < debug_stack_lines * 8; i++ )
228     {
229         if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
230             break;
231         if ( __get_user(addr, stack) )
232         {
233             if ( i != 0 )
234                 printk("\n    ");
235             printk("Fault while accessing guest memory.");
236             i = 1;
237             break;
238         }
239         if ( (i != 0) && ((i % 8) == 0) )
240             printk("\n ");
241         printk(" %08x", addr);
242         stack++;
243     }
244     if ( mask == PAGE_SIZE )
245     {
246         BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE);
247         unmap_domain_page(stack);
248     }
249     if ( i == 0 )
250         printk("Stack empty.");
251     printk("\n");
252 }
253 
254 static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
255 {
256     int i;
257     unsigned long *stack, addr;
258     unsigned long mask = STACK_SIZE;
259 
260     /* Avoid HVM as we don't know what the stack looks like. */
261     if ( is_hvm_vcpu(v) )
262         return;
263 
264     if ( is_pv_32bit_vcpu(v) )
265     {
266         compat_show_guest_stack(v, regs, debug_stack_lines);
267         return;
268     }
269 
270     stack = (unsigned long *)regs->rsp;
271     printk("Guest stack trace from "__OP"sp=%p:\n  ", stack);
272 
273     if ( !access_ok(stack, sizeof(*stack)) )
274     {
275         printk("Guest-inaccessible memory.\n");
276         return;
277     }
278 
279     if ( v != current )
280     {
281         struct vcpu *vcpu;
282 
283         ASSERT(guest_kernel_mode(v, regs));
284         vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
285         if ( !vcpu )
286         {
287             stack = do_page_walk(v, (unsigned long)stack);
288             if ( (unsigned long)stack < PAGE_SIZE )
289             {
290                 printk("Inaccessible guest memory.\n");
291                 return;
292             }
293             mask = PAGE_SIZE;
294         }
295     }
296 
297     for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
298     {
299         if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
300             break;
301         if ( __get_user(addr, stack) )
302         {
303             if ( i != 0 )
304                 printk("\n    ");
305             printk("Fault while accessing guest memory.");
306             i = 1;
307             break;
308         }
309         if ( (i != 0) && ((i % stack_words_per_line) == 0) )
310             printk("\n  ");
311         printk(" %p", _p(addr));
312         stack++;
313     }
314     if ( mask == PAGE_SIZE )
315     {
316         BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE);
317         unmap_domain_page(stack);
318     }
319     if ( i == 0 )
320         printk("Stack empty.");
321     printk("\n");
322 }
323 
324 /*
325  * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
326  *
327  * Stack pages 0, 1 and 2:
328  *   These are all 1-page IST stacks.  Each of these stacks has an exception
329  *   frame and saved register state at the top.  The interesting bound for a
330  *   trace is the word adjacent to this, while the bound for a dump is the
331  *   very top, including the exception frame.
332  *
333  * Stack pages 3, 4 and 5:
334  *   None of these are particularly interesting.  With MEMORY_GUARD, page 5 is
335  *   explicitly not present, so attempting to dump or trace it is
336  *   counterproductive.  Without MEMORY_GUARD, it is possible for a call chain
337  *   to use the entire primary stack and wander into page 5.  In this case,
338  *   consider these pages an extension of the primary stack to aid debugging
339  *   hopefully rare situations where the primary stack has effectively been
340  *   overflown.
341  *
342  * Stack pages 6 and 7:
343  *   These form the primary stack, and have a cpu_info at the top.  For a
344  *   trace, the interesting bound is adjacent to the cpu_info, while for a
345  *   dump, the entire cpu_info is interesting.
346  *
347  * For the cases where the stack should not be inspected, pretend that the
348  * passed stack pointer is already out of reasonable bounds.
349  */
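/*
 * Rough sketch of the layout described above (page 7 is at the highest
 * address; the eight pages together form one per-CPU stack of STACK_SIZE):
 *
 *   page 7  struct cpu_info at the very top, primary stack below it
 *   page 6  remainder of the primary stack
 *   page 5  guard page with MEMORY_GUARD, otherwise overflow spill area
 *   page 4  overflow spill area
 *   page 3  overflow spill area
 *   page 2  1-page IST stack (exception frame at the top)
 *   page 1  1-page IST stack (exception frame at the top)
 *   page 0  1-page IST stack (exception frame at the top)
 */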
350 unsigned long get_stack_trace_bottom(unsigned long sp)
351 {
352     switch ( get_stack_page(sp) )
353     {
354     case 0 ... 2:
355         return ROUNDUP(sp, PAGE_SIZE) -
356             offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
357 
358 #ifndef MEMORY_GUARD
359     case 3 ... 5:
360 #endif
361     case 6 ... 7:
362         return ROUNDUP(sp, STACK_SIZE) -
363             sizeof(struct cpu_info) - sizeof(unsigned long);
364 
365     default:
366         return sp - sizeof(unsigned long);
367     }
368 }
369 
370 unsigned long get_stack_dump_bottom(unsigned long sp)
371 {
372     switch ( get_stack_page(sp) )
373     {
374     case 0 ... 2:
375         return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
376 
377 #ifndef MEMORY_GUARD
378     case 3 ... 5:
379 #endif
380     case 6 ... 7:
381         return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
382 
383     default:
384         return sp - sizeof(unsigned long);
385     }
386 }
387 
388 #if !defined(CONFIG_FRAME_POINTER)
389 
390 /*
391  * Stack trace from pointers found in stack, unaided by frame pointers.  For
392  * caller convenience, this has the same prototype as its alternative, and
393  * simply ignores the base pointer parameter.
394  */
395 static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp)
396 {
397     unsigned long *stack = (unsigned long *)sp, addr;
398     unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp);
399 
400     while ( stack <= bottom )
401     {
402         addr = *stack++;
403         if ( is_active_kernel_text(addr) )
404             printk("   [<%p>] %pS\n", _p(addr), _p(addr));
405     }
406 }
407 
408 #else
409 
410 /* Stack trace from frames in the stack, using frame pointers */
411 static void _show_trace(unsigned long sp, unsigned long bp)
412 {
413     unsigned long *frame, next, addr;
414 
415     /* Bounds for range of valid frame pointer. */
416     unsigned long low = sp, high = get_stack_trace_bottom(sp);
417 
418     /* The initial frame pointer. */
419     next = bp;
420 
421     for ( ; ; )
422     {
423         /* Valid frame pointer? */
424         if ( (next < low) || (next >= high) )
425         {
426             /*
427              * Exception stack frames have a different layout, denoted by an
428              * inverted frame pointer.
429              */
430             next = ~next;
431             if ( (next < low) || (next >= high) )
432                 break;
433             frame = (unsigned long *)next;
434             next  = frame[0];
435             addr  = frame[(offsetof(struct cpu_user_regs, rip) -
436                            offsetof(struct cpu_user_regs, rbp))
437                          / BYTES_PER_LONG];
438         }
439         else
440         {
441             /* Ordinary stack frame. */
442             frame = (unsigned long *)next;
443             next  = frame[0];
444             addr  = frame[1];
445         }
446 
447         printk("   [<%p>] %pS\n", _p(addr), _p(addr));
448 
449         low = (unsigned long)&frame[2];
450     }
451 }
452 
453 #endif
454 
455 static void show_trace(const struct cpu_user_regs *regs)
456 {
457     unsigned long *sp = ESP_BEFORE_EXCEPTION(regs);
458 
459     printk("Xen call trace:\n");
460 
461     /*
462      * If RIP looks sensible, or the top of the stack doesn't, print RIP at
463      * the top of the stack trace.
464      */
465     if ( is_active_kernel_text(regs->rip) ||
466          !is_active_kernel_text(*sp) )
467         printk("   [<%p>] %pS\n", _p(regs->rip), _p(regs->rip));
468     /*
469      * Else RIP looks bad but the top of the stack looks good.  Perhaps we
470      * followed a wild function pointer? Let's assume the top of the stack is a
471      * return address; print it and skip past so _show_trace() doesn't print
472      * it again.
473      */
474     else
475     {
476         printk("   [<%p>] %pS\n", _p(*sp), _p(*sp));
477         sp++;
478     }
479 
480     _show_trace((unsigned long)sp, regs->rbp);
481 
482     printk("\n");
483 }
484 
485 void show_stack(const struct cpu_user_regs *regs)
486 {
487     unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr;
488     int i;
489 
490     if ( guest_mode(regs) )
491         return show_guest_stack(current, regs);
492 
493     printk("Xen stack trace from "__OP"sp=%p:\n  ", stack);
494 
495     stack_bottom = _p(get_stack_dump_bottom(regs->rsp));
496 
497     for ( i = 0; i < (debug_stack_lines*stack_words_per_line) &&
498               (stack <= stack_bottom); i++ )
499     {
500         if ( (i != 0) && ((i % stack_words_per_line) == 0) )
501             printk("\n  ");
502         addr = *stack++;
503         printk(" %p", _p(addr));
504     }
505     if ( i == 0 )
506         printk("Stack empty.");
507     printk("\n");
508 
509     show_trace(regs);
510 }
511 
512 void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs)
513 {
514     unsigned long esp = regs->rsp;
515     unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1);
516 #ifdef MEMORY_GUARD
517     unsigned long esp_top, esp_bottom;
518 #endif
519 
520     if ( _p(curr_stack_base) != stack_base[cpu] )
521         printk("Current stack base %p differs from expected %p\n",
522                _p(curr_stack_base), stack_base[cpu]);
523 
524 #ifdef MEMORY_GUARD
525     esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
526     esp_top    = esp_bottom - PRIMARY_STACK_SIZE;
527 
528     printk("Valid stack range: %p-%p, sp=%p, tss.rsp0=%p\n",
529            (void *)esp_top, (void *)esp_bottom, (void *)esp,
530            (void *)per_cpu(init_tss, cpu).rsp0);
531 
532     /*
533      * Trigger overflow trace if %esp is anywhere within the guard page, or
534      * with fewer than 512 bytes remaining on the primary stack.
535      */
536     if ( (esp > (esp_top + 512)) ||
537          (esp < (esp_top - PAGE_SIZE)) )
538     {
539         printk("No stack overflow detected. Skipping stack trace.\n");
540         return;
541     }
542 
543     if ( esp < esp_top )
544         esp = esp_top;
545 
546     printk("Xen stack overflow (dumping trace %p-%p):\n",
547            (void *)esp, (void *)esp_bottom);
548 
549     _show_trace(esp, regs->rbp);
550 
551     printk("\n");
552 #endif
553 }
554 
555 void show_execution_state(const struct cpu_user_regs *regs)
556 {
557     /* Prevent interleaving of output. */
558     unsigned long flags = console_lock_recursive_irqsave();
559 
560     show_registers(regs);
561     show_code(regs);
562     show_stack(regs);
563 
564     console_unlock_recursive_irqrestore(flags);
565 }
566 
567 void vcpu_show_execution_state(struct vcpu *v)
568 {
569     unsigned long flags;
570 
571     printk("*** Dumping Dom%d vcpu#%d state: ***\n",
572            v->domain->domain_id, v->vcpu_id);
573 
574     if ( v == current )
575     {
576         show_execution_state(guest_cpu_user_regs());
577         return;
578     }
579 
580     vcpu_pause(v); /* acceptably dangerous */
581 
582     /* Prevent interleaving of output. */
583     flags = console_lock_recursive_irqsave();
584 
585     vcpu_show_registers(v);
586     if ( guest_kernel_mode(v, &v->arch.user_regs) )
587         show_guest_stack(v, &v->arch.user_regs);
588 
589     console_unlock_recursive_irqrestore(flags);
590 
591     vcpu_unpause(v);
592 }
593 
594 static cpumask_t show_state_mask;
595 static bool opt_show_all;
596 boolean_param("async-show-all", opt_show_all);
597 
598 static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
599 {
600     if ( !cpumask_test_cpu(cpu, &show_state_mask) )
601         return 0;
602 
603     if ( opt_show_all )
604         show_execution_state(regs);
605     else
606         printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs,
607                regs->rip, guest_mode(regs) ? _p(regs->rip) : NULL);
608     cpumask_clear_cpu(cpu, &show_state_mask);
609 
610     return 1;
611 }
612 
613 const char *trapstr(unsigned int trapnr)
614 {
615     static const char * const strings[] = {
616         "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
617         "invalid opcode", "device not available", "double fault",
618         "coprocessor segment", "invalid tss", "segment not found",
619         "stack error", "general protection fault", "page fault",
620         "spurious interrupt", "coprocessor error", "alignment check",
621         "machine check", "simd error", "virtualisation exception"
622     };
623 
624     return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
625 }
626 
627 /*
628  * This is called for faults at very unexpected times (e.g., when interrupts
629  * are disabled). In such situations we can't do much that is safe. We try to
630  * print out some tracing and then we just spin.
631  */
632 void fatal_trap(const struct cpu_user_regs *regs, bool show_remote)
633 {
634     static DEFINE_PER_CPU(char, depth);
635     unsigned int trapnr = regs->entry_vector;
636 
637     /* Set AC to reduce chance of further SMAP faults */
638     stac();
639 
640     /*
641      * In some cases, we can end up in a vicious cycle of fatal_trap()s
642      * within fatal_trap()s. We give the problem a couple of iterations to
643      * bottom out, and then we just panic.
644      */
645     if ( ++this_cpu(depth) < 3 )
646     {
647         watchdog_disable();
648         console_start_sync();
649 
650         show_execution_state(regs);
651 
652         if ( trapnr == TRAP_page_fault )
653         {
654             unsigned long cr2 = read_cr2();
655             printk("Faulting linear address: %p\n", _p(cr2));
656             show_page_walk(cr2);
657         }
658 
659         if ( show_remote )
660         {
661             unsigned int msecs, pending;
662 
663             cpumask_andnot(&show_state_mask, &cpu_online_map,
664                            cpumask_of(smp_processor_id()));
665             set_nmi_callback(nmi_show_execution_state);
666             /* Ensure new callback is set before sending out the NMI. */
667             smp_wmb();
668             smp_send_nmi_allbutself();
669 
670             /* Wait at most 10ms for some other CPU to respond. */
671             msecs = 10;
672             pending = cpumask_weight(&show_state_mask);
673             while ( pending && msecs-- )
674             {
675                 unsigned int left;
676 
677                 mdelay(1);
678                 left = cpumask_weight(&show_state_mask);
679                 if ( left < pending )
680                 {
681                     pending = left;
682                     msecs = 10;
683                 }
684             }
685         }
686     }
687 
688     panic("FATAL TRAP: vector = %d (%s)\n"
689           "[error_code=%04x] %s",
690           trapnr, trapstr(trapnr), regs->error_code,
691           (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
692 }
693 
694 void do_reserved_trap(struct cpu_user_regs *regs)
695 {
696     unsigned int trapnr = regs->entry_vector;
697 
698     if ( debugger_trap_fatal(trapnr, regs) )
699         return;
700 
701     show_execution_state(regs);
702     panic("FATAL RESERVED TRAP %#x: %s", trapnr, trapstr(trapnr));
703 }
704 
705 void do_trap(struct cpu_user_regs *regs)
706 {
707     struct vcpu *curr = current;
708     unsigned int trapnr = regs->entry_vector;
709     unsigned long fixup;
710 
711     if ( regs->error_code & X86_XEC_EXT )
712         goto hardware_trap;
713 
714     if ( debugger_trap_entry(trapnr, regs) )
715         return;
716 
717     ASSERT(trapnr < 32);
718 
719     if ( guest_mode(regs) )
720     {
721         pv_inject_hw_exception(trapnr,
722                                (TRAP_HAVE_EC & (1u << trapnr))
723                                ? regs->error_code : X86_EVENT_NO_EC);
724         return;
725     }
726 
727     if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
728          system_state >= SYS_STATE_active && is_hvm_vcpu(curr) &&
729          curr->arch.hvm_vcpu.fpu_exception_callback )
730     {
731         curr->arch.hvm_vcpu.fpu_exception_callback(
732             curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
733         return;
734     }
735 
736     if ( likely((fixup = search_exception_table(regs)) != 0) )
737     {
738         dprintk(XENLOG_ERR, "Trap %u: %p [%ps] -> %p\n",
739                 trapnr, _p(regs->rip), _p(regs->rip), _p(fixup));
740         this_cpu(last_extable_addr) = regs->rip;
741         regs->rip = fixup;
742         return;
743     }
744 
745  hardware_trap:
746     if ( debugger_trap_fatal(trapnr, regs) )
747         return;
748 
749     show_execution_state(regs);
750     panic("FATAL TRAP: vector = %d (%s)\n"
751           "[error_code=%04x]",
752           trapnr, trapstr(trapnr), regs->error_code);
753 }
754 
755 /* Returns 0 if not handled, and non-0 for success. */
756 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val)
757 {
758     struct domain *d = current->domain;
759     /* Optionally shift out of the way of Viridian architectural MSRs. */
760     uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
761 
762     switch ( idx - base )
763     {
764     case 0: /* Write hypercall page MSR.  Read as zero. */
765     {
766         *val = 0;
767         return 1;
768     }
769     }
770 
771     return 0;
772 }
773 
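/*
 * Guest-side sketch of the hypercall page protocol handled below (not code
 * from this file; names are illustrative): the guest reads the MSR base from
 * CPUID leaf <base>+2 and then writes the frame number of the page it wants
 * turned into a hypercall page, e.g.
 *
 *     wrmsr(msr_base + 0, (uint64_t)hypercall_page_gfn << PAGE_SHIFT);
 *
 * The low 12 bits of the value select a page index; only index 0 is
 * accepted here.
 */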
774 /* Returns 1 if handled, 0 if not and -Exx for error. */
775 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
776 {
777     struct domain *d = current->domain;
778     /* Optionally shift out of the way of Viridian architectural MSRs. */
779     uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
780 
781     switch ( idx - base )
782     {
783     case 0: /* Write hypercall page */
784     {
785         void *hypercall_page;
786         unsigned long gmfn = val >> PAGE_SHIFT;
787         unsigned int page_index = val & (PAGE_SIZE - 1);
788         struct page_info *page;
789         p2m_type_t t;
790 
791         if ( page_index > 0 )
792         {
793             gdprintk(XENLOG_WARNING,
794                      "wrmsr hypercall page index %#x unsupported\n",
795                      page_index);
796             return 0;
797         }
798 
799         page = get_page_from_gfn(d, gmfn, &t, P2M_ALLOC);
800 
801         if ( !page || !get_page_type(page, PGT_writable_page) )
802         {
803             if ( page )
804                 put_page(page);
805 
806             if ( p2m_is_paging(t) )
807             {
808                 p2m_mem_paging_populate(d, gmfn);
809                 return -ERESTART;
810             }
811 
812             gdprintk(XENLOG_WARNING,
813                      "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
814                      gmfn, page ? page_to_mfn(page) : -1UL, base);
815             return 0;
816         }
817 
818         hypercall_page = __map_domain_page(page);
819         hypercall_page_initialise(d, hypercall_page);
820         unmap_domain_page(hypercall_page);
821 
822         put_page_and_type(page);
823         return 1;
824     }
825     }
826 
827     return 0;
828 }
829 
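/*
 * Summary of the leaves served below, relative to the detected base
 * (0x40000000, or 0x40000100 for Viridian-enabled guests):
 *   0 - signature and largest supported leaf
 *   1 - Xen version (major.minor)
 *   2 - number of hypercall pages and the hypercall MSR base
 *   3 - time information (subleaves 0-2)
 *   4 - HVM-specific flags plus vcpu and domain ids
 *   5 - PV-specific parameters
 */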
830 void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
831                              uint32_t subleaf, struct cpuid_leaf *res)
832 {
833     const struct domain *d = v->domain;
834     const struct cpuid_policy *p = d->arch.cpuid;
835     uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
836     uint32_t idx  = leaf - base;
837     unsigned int limit = is_viridian_domain(d) ? p->hv2_limit : p->hv_limit;
838 
839     if ( limit == 0 )
840         /* Default number of leaves */
841         limit = XEN_CPUID_MAX_NUM_LEAVES;
842     else
843         /* Clamp toolstack value between 2 and MAX_NUM_LEAVES. */
844         limit = min(max(limit, 2u), XEN_CPUID_MAX_NUM_LEAVES + 0u);
845 
846     if ( idx > limit )
847         return;
848 
849     switch ( idx )
850     {
851     case 0:
852         res->a = base + limit; /* Largest leaf */
853         res->b = XEN_CPUID_SIGNATURE_EBX;
854         res->c = XEN_CPUID_SIGNATURE_ECX;
855         res->d = XEN_CPUID_SIGNATURE_EDX;
856         break;
857 
858     case 1:
859         res->a = (xen_major_version() << 16) | xen_minor_version();
860         break;
861 
862     case 2:
863         res->a = 1;            /* Number of hypercall-transfer pages */
864                                /* MSR base address */
865         res->b = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
866         if ( is_pv_domain(d) ) /* Features */
867             res->c |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
868         break;
869 
870     case 3: /* Time leaf. */
871         switch ( subleaf )
872         {
873         case 0: /* features */
874             res->a = ((d->arch.vtsc << 0) |
875                       (!!host_tsc_is_safe() << 1) |
876                       (!!boot_cpu_has(X86_FEATURE_RDTSCP) << 2));
877             res->b = d->arch.tsc_mode;
878             res->c = d->arch.tsc_khz;
879             res->d = d->arch.incarnation;
880             break;
881 
882         case 1: /* scale and offset */
883         {
884             uint64_t offset;
885 
886             if ( !d->arch.vtsc )
887                 offset = d->arch.vtsc_offset;
888             else
889                 /* offset already applied to value returned by virtual rdtscp */
890                 offset = 0;
891             res->a = offset;
892             res->b = offset >> 32;
893             res->c = d->arch.vtsc_to_ns.mul_frac;
894             res->d = (s8)d->arch.vtsc_to_ns.shift;
895             break;
896         }
897 
898         case 2: /* physical cpu_khz */
899             res->a = cpu_khz;
900             break;
901         }
902         break;
903 
904     case 4: /* HVM hypervisor leaf. */
905         if ( !is_hvm_domain(d) || subleaf != 0 )
906             break;
907 
908         if ( cpu_has_vmx_apic_reg_virt )
909             res->a |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
910 
911         /*
912          * We want to claim that x2APIC is virtualized if APIC MSR accesses
913          * are not intercepted. When all three of these are true both rdmsr
914          * and wrmsr in the guest will run without VMEXITs (see
915          * vmx_vlapic_msr_changed()).
916          */
917         if ( cpu_has_vmx_virtualize_x2apic_mode &&
918              cpu_has_vmx_apic_reg_virt &&
919              cpu_has_vmx_virtual_intr_delivery )
920             res->a |= XEN_HVM_CPUID_X2APIC_VIRT;
921 
922         /*
923          * Indicate that memory mapped from other domains (either grants or
924          * foreign pages) has valid IOMMU entries.
925          */
926         res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
927 
928         /* Indicate presence of vcpu id and set it in ebx */
929         res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
930         res->b = v->vcpu_id;
931 
932         /* Indicate presence of domain id and set it in ecx */
933         res->a |= XEN_HVM_CPUID_DOMID_PRESENT;
934         res->c = d->domain_id;
935 
936         break;
937 
938     case 5: /* PV-specific parameters */
939         if ( is_hvm_domain(d) || subleaf != 0 )
940             break;
941 
942         res->b = flsl(get_upper_mfn_bound()) + PAGE_SHIFT;
943         break;
944 
945     default:
946         ASSERT_UNREACHABLE();
947     }
948 }
949 
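/*
 * BUG(), WARN(), ASSERT() and run_in_exception_handler() all emit a ud2
 * instruction (opcode 0f 0b, matched below) together with an out-of-line
 * struct bug_frame describing the site.  do_invalid_op() looks the faulting
 * %rip up in the per-region bug tables to distinguish these deliberate traps
 * from a genuine invalid opcode.
 */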
950 void do_invalid_op(struct cpu_user_regs *regs)
951 {
952     const struct bug_frame *bug = NULL;
953     u8 bug_insn[2];
954     const char *prefix = "", *filename, *predicate, *eip = (char *)regs->rip;
955     unsigned long fixup;
956     int id = -1, lineno;
957     const struct virtual_region *region;
958 
959     if ( debugger_trap_entry(TRAP_invalid_op, regs) )
960         return;
961 
962     if ( likely(guest_mode(regs)) )
963     {
964         if ( pv_emulate_invalid_op(regs) )
965             pv_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
966         return;
967     }
968 
969     if ( !is_active_kernel_text(regs->rip) ||
970          __copy_from_user(bug_insn, eip, sizeof(bug_insn)) ||
971          memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) )
972         goto die;
973 
974     region = find_text_region(regs->rip);
975     if ( region )
976     {
977         for ( id = 0; id < BUGFRAME_NR; id++ )
978         {
979             const struct bug_frame *b;
980             unsigned int i;
981 
982             for ( i = 0, b = region->frame[id].bugs;
983                   i < region->frame[id].n_bugs; b++, i++ )
984             {
985                 if ( bug_loc(b) == eip )
986                 {
987                     bug = b;
988                     goto found;
989                 }
990             }
991         }
992     }
993 
994  found:
995     if ( !bug )
996         goto die;
997     eip += sizeof(bug_insn);
998     if ( id == BUGFRAME_run_fn )
999     {
1000         void (*fn)(struct cpu_user_regs *) = bug_ptr(bug);
1001 
1002         fn(regs);
1003         regs->rip = (unsigned long)eip;
1004         return;
1005     }
1006 
1007     /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
1008     filename = bug_ptr(bug);
1009     if ( !is_kernel(filename) && !is_patch(filename) )
1010         goto die;
1011     fixup = strlen(filename);
1012     if ( fixup > 50 )
1013     {
1014         filename += fixup - 47;
1015         prefix = "...";
1016     }
1017     lineno = bug_line(bug);
1018 
1019     switch ( id )
1020     {
1021     case BUGFRAME_warn:
1022         printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno);
1023         show_execution_state(regs);
1024         regs->rip = (unsigned long)eip;
1025         return;
1026 
1027     case BUGFRAME_bug:
1028         printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno);
1029 
1030         if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1031             return;
1032 
1033         show_execution_state(regs);
1034         panic("Xen BUG at %s%s:%d", prefix, filename, lineno);
1035 
1036     case BUGFRAME_assert:
1037         /* ASSERT: decode the predicate string pointer. */
1038         predicate = bug_msg(bug);
1039         if ( !is_kernel(predicate) && !is_patch(predicate) )
1040             predicate = "<unknown>";
1041 
1042         printk("Assertion '%s' failed at %s%s:%d\n",
1043                predicate, prefix, filename, lineno);
1044 
1045         if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1046             return;
1047 
1048         show_execution_state(regs);
1049         panic("Assertion '%s' failed at %s%s:%d",
1050               predicate, prefix, filename, lineno);
1051     }
1052 
1053  die:
1054     if ( (fixup = search_exception_table(regs)) != 0 )
1055     {
1056         this_cpu(last_extable_addr) = regs->rip;
1057         regs->rip = fixup;
1058         return;
1059     }
1060 
1061     if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1062         return;
1063 
1064     show_execution_state(regs);
1065     panic("FATAL TRAP: vector = %d (invalid opcode)", TRAP_invalid_op);
1066 }
1067 
1068 void do_int3(struct cpu_user_regs *regs)
1069 {
1070     if ( debugger_trap_entry(TRAP_int3, regs) )
1071         return;
1072 
1073     if ( !guest_mode(regs) )
1074     {
1075         unsigned long fixup;
1076 
1077         if ( (fixup = search_exception_table(regs)) != 0 )
1078         {
1079             this_cpu(last_extable_addr) = regs->rip;
1080             dprintk(XENLOG_DEBUG, "Trap %u: %p [%ps] -> %p\n",
1081                     TRAP_int3, _p(regs->rip), _p(regs->rip), _p(fixup));
1082             regs->rip = fixup;
1083             return;
1084         }
1085 
1086         if ( !debugger_trap_fatal(TRAP_int3, regs) )
1087             printk(XENLOG_DEBUG "Hit embedded breakpoint at %p [%ps]\n",
1088                    _p(regs->rip), _p(regs->rip));
1089 
1090         return;
1091     }
1092 
1093     pv_inject_hw_exception(TRAP_int3, X86_EVENT_NO_EC);
1094 }
1095 
1096 static void reserved_bit_page_fault(unsigned long addr,
1097                                     struct cpu_user_regs *regs)
1098 {
1099     printk("%pv: reserved bit in page table (ec=%04X)\n",
1100            current, regs->error_code);
1101     show_page_walk(addr);
1102     show_execution_state(regs);
1103 }
1104 
1105 static int handle_gdt_ldt_mapping_fault(unsigned long offset,
1106                                         struct cpu_user_regs *regs)
1107 {
1108     struct vcpu *curr = current;
1109     /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1110     unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1111     unsigned int vcpu_area   = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1112 
1113     /*
1114      * If the fault is in another vcpu's area, it cannot be due to
1115      * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and
1116      * indeed we have to since pv_map_ldt_shadow_page() works correctly only on
1117      * accesses to a vcpu's own area.
1118      */
1119     if ( vcpu_area != curr->vcpu_id )
1120         return 0;
1121 
1122     /* Byte offset within the gdt/ldt sub-area. */
1123     offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1124 
1125     if ( likely(is_ldt_area) )
1126     {
1127         /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1128         if ( likely(pv_map_ldt_shadow_page(offset)) )
1129         {
1130             if ( guest_mode(regs) )
1131                 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1132                                     regs->rip, offset);
1133         }
1134         else
1135         {
1136             /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1137             if ( !guest_mode(regs) )
1138                 return 0;
1139 
1140             /* Access would have become non-canonical? Pass #GP[sel] back. */
1141             if ( unlikely(!is_canonical_address(
1142                               curr->arch.pv_vcpu.ldt_base + offset)) )
1143             {
1144                 uint16_t ec = (offset & ~(X86_XEC_EXT | X86_XEC_IDT)) | X86_XEC_TI;
1145 
1146                 pv_inject_hw_exception(TRAP_gp_fault, ec);
1147             }
1148             else
1149                 /* else pass the #PF back, with adjusted %cr2. */
1150                 pv_inject_page_fault(regs->error_code,
1151                                      curr->arch.pv_vcpu.ldt_base + offset);
1152         }
1153     }
1154     else
1155     {
1156         /* GDT fault: handle the fault as #GP(selector). */
1157         regs->error_code = offset & ~(X86_XEC_EXT | X86_XEC_IDT | X86_XEC_TI);
1158         (void)do_general_protection(regs);
1159     }
1160 
1161     return EXCRET_fault_fixed;
1162 }
1163 
1164 #define IN_HYPERVISOR_RANGE(va) \
1165     (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1166 
1167 enum pf_type {
1168     real_fault,
1169     smep_fault,
1170     smap_fault,
1171     spurious_fault
1172 };
1173 
1174 static enum pf_type __page_fault_type(unsigned long addr,
1175                                       const struct cpu_user_regs *regs)
1176 {
1177     unsigned long mfn, cr3 = read_cr3();
1178     l4_pgentry_t l4e, *l4t;
1179     l3_pgentry_t l3e, *l3t;
1180     l2_pgentry_t l2e, *l2t;
1181     l1_pgentry_t l1e, *l1t;
1182     unsigned int required_flags, disallowed_flags, page_user;
1183     unsigned int error_code = regs->error_code;
1184 
1185     /*
1186      * We do not take spurious page faults in IRQ handlers as we do not
1187      * modify page tables in IRQ context. We therefore bail here because
1188      * map_domain_page() is not IRQ-safe.
1189      */
1190     if ( in_irq() )
1191         return real_fault;
1192 
1193     /* Reserved bit violations are never spurious faults. */
1194     if ( error_code & PFEC_reserved_bit )
1195         return real_fault;
1196 
1197     required_flags  = _PAGE_PRESENT;
1198     if ( error_code & PFEC_write_access )
1199         required_flags |= _PAGE_RW;
1200     if ( error_code & PFEC_user_mode )
1201         required_flags |= _PAGE_USER;
1202 
1203     disallowed_flags = 0;
1204     if ( error_code & PFEC_insn_fetch )
1205         disallowed_flags |= _PAGE_NX_BIT;
1206 
1207     page_user = _PAGE_USER;
1208 
1209     mfn = cr3 >> PAGE_SHIFT;
1210 
1211     l4t = map_domain_page(_mfn(mfn));
1212     l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1213     mfn = l4e_get_pfn(l4e);
1214     unmap_domain_page(l4t);
1215     if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1216          (l4e_get_flags(l4e) & disallowed_flags) )
1217         return real_fault;
1218     page_user &= l4e_get_flags(l4e);
1219 
1220     l3t  = map_domain_page(_mfn(mfn));
1221     l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1222     mfn = l3e_get_pfn(l3e);
1223     unmap_domain_page(l3t);
1224     if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1225          (l3e_get_flags(l3e) & disallowed_flags) )
1226         return real_fault;
1227     page_user &= l3e_get_flags(l3e);
1228     if ( l3e_get_flags(l3e) & _PAGE_PSE )
1229         goto leaf;
1230 
1231     l2t = map_domain_page(_mfn(mfn));
1232     l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1233     mfn = l2e_get_pfn(l2e);
1234     unmap_domain_page(l2t);
1235     if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1236          (l2e_get_flags(l2e) & disallowed_flags) )
1237         return real_fault;
1238     page_user &= l2e_get_flags(l2e);
1239     if ( l2e_get_flags(l2e) & _PAGE_PSE )
1240         goto leaf;
1241 
1242     l1t = map_domain_page(_mfn(mfn));
1243     l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1244     mfn = l1e_get_pfn(l1e);
1245     unmap_domain_page(l1t);
1246     if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1247          (l1e_get_flags(l1e) & disallowed_flags) )
1248         return real_fault;
1249     page_user &= l1e_get_flags(l1e);
1250 
1251 leaf:
1252     if ( page_user )
1253     {
1254         unsigned long cr4 = read_cr4();
1255         /*
1256          * Supervisor Mode Execution Prevention (SMEP):
1257          * Disallow supervisor execution from user-accessible mappings
1258          */
1259         if ( (cr4 & X86_CR4_SMEP) &&
1260              ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
1261             return smep_fault;
1262 
1263         /*
1264          * Supervisor Mode Access Prevention (SMAP):
1265          * Disallow supervisor access to user-accessible mappings
1266          * A fault is considered as an SMAP violation if the following
1267          * conditions are true:
1268          *   - X86_CR4_SMAP is set in CR4
1269          *   - A user page is being accessed
1270          *   - CPL=3 or X86_EFLAGS_AC is clear
1271          *   - Page fault in kernel mode
1272          */
1273         if ( (cr4 & X86_CR4_SMAP) && !(error_code & PFEC_user_mode) &&
1274              (((regs->cs & 3) == 3) || !(regs->eflags & X86_EFLAGS_AC)) )
1275             return smap_fault;
1276     }
1277 
1278     return spurious_fault;
1279 }
1280 
1281 static enum pf_type spurious_page_fault(unsigned long addr,
1282                                         const struct cpu_user_regs *regs)
1283 {
1284     unsigned long flags;
1285     enum pf_type pf_type;
1286 
1287     /*
1288      * Disabling interrupts prevents TLB flushing, and hence prevents
1289      * page tables from becoming invalid under our feet during the walk.
1290      */
1291     local_irq_save(flags);
1292     pf_type = __page_fault_type(addr, regs);
1293     local_irq_restore(flags);
1294 
1295     return pf_type;
1296 }
1297 
1298 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1299 {
1300     struct vcpu   *v = current;
1301     struct domain *d = v->domain;
1302 
1303     /* No fixups in interrupt context or when interrupts are disabled. */
1304     if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1305         return 0;
1306 
1307     if ( !(regs->error_code & PFEC_page_present) &&
1308           (pagefault_by_memadd(addr, regs)) )
1309         return handle_memadd_fault(addr, regs);
1310 
1311     if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1312     {
1313         if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) &&
1314              (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1315             return handle_gdt_ldt_mapping_fault(
1316                 addr - GDT_LDT_VIRT_START, regs);
1317         return 0;
1318     }
1319 
1320     if ( guest_kernel_mode(v, regs) &&
1321          !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
1322          (regs->error_code & PFEC_write_access) )
1323     {
1324         bool ptwr, mmio_ro;
1325 
1326         ptwr = VM_ASSIST(d, writable_pagetables) &&
1327                /* Do not check if access-protection fault since the page may
1328                   legitimately not be present in shadow page tables */
1329                (paging_mode_enabled(d) ||
1330                 (regs->error_code & PFEC_page_present));
1331 
1332         mmio_ro = is_hardware_domain(d) &&
1333                   (regs->error_code & PFEC_page_present);
1334 
1335         if ( (ptwr || mmio_ro) && pv_ro_page_fault(addr, regs) )
1336             return EXCRET_fault_fixed;
1337     }
1338 
1339     /*
1340      * For non-external shadowed guests, we fix up both their own pagefaults
1341      * and Xen's, since they share the pagetables.  This includes hypervisor
1342      * faults, e.g. from copy_to_user().
1343      */
1344     if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1345     {
1346         int ret = paging_fault(addr, regs);
1347 
1348         if ( ret == EXCRET_fault_fixed )
1349             trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->rip, addr);
1350         return ret;
1351     }
1352 
1353     return 0;
1354 }
1355 
1356 /*
1357  * #PF error code:
1358  *  Bit 0: Protection violation (=1) ; Page not present (=0)
1359  *  Bit 1: Write access
1360  *  Bit 2: User mode (=1) ; Supervisor mode (=0)
1361  *  Bit 3: Reserved bit violation
1362  *  Bit 4: Instruction fetch
1363  */
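/*
 * Worked example: error_code 0x0003 means a supervisor-mode write hit a
 * protection violation (bits 0 and 1 set, all others clear).
 */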
1364 void do_page_fault(struct cpu_user_regs *regs)
1365 {
1366     unsigned long addr, fixup;
1367     unsigned int error_code;
1368     enum pf_type pf_type;
1369 
1370     addr = read_cr2();
1371 
1372     /* fixup_page_fault() might change regs->error_code, so cache it here. */
1373     error_code = regs->error_code;
1374 
1375     if ( debugger_trap_entry(TRAP_page_fault, regs) )
1376         return;
1377 
1378     perfc_incr(page_faults);
1379 
1380     if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1381         return;
1382 
1383     if ( unlikely(!guest_mode(regs)) )
1384     {
1385         pf_type = spurious_page_fault(addr, regs);
1386         if ( (pf_type == smep_fault) || (pf_type == smap_fault) )
1387         {
1388             console_start_sync();
1389             printk("Xen SM%cP violation\n",
1390                    (pf_type == smep_fault) ? 'E' : 'A');
1391             fatal_trap(regs, 0);
1392         }
1393 
1394         if ( pf_type != real_fault )
1395             return;
1396 
1397         if ( likely((fixup = search_exception_table(regs)) != 0) )
1398         {
1399             perfc_incr(copy_user_faults);
1400             if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1401                 reserved_bit_page_fault(addr, regs);
1402             this_cpu(last_extable_addr) = regs->rip;
1403             regs->rip = fixup;
1404             return;
1405         }
1406 
1407         if ( debugger_trap_fatal(TRAP_page_fault, regs) )
1408             return;
1409 
1410         show_execution_state(regs);
1411         show_page_walk(addr);
1412         panic("FATAL PAGE FAULT\n"
1413               "[error_code=%04x]\n"
1414               "Faulting linear address: %p",
1415               error_code, _p(addr));
1416     }
1417 
1418     if ( unlikely(current->domain->arch.suppress_spurious_page_faults) )
1419     {
1420         pf_type = spurious_page_fault(addr, regs);
1421         if ( (pf_type == smep_fault) || (pf_type == smap_fault))
1422         {
1423             printk(XENLOG_G_ERR "%pv fatal SM%cP violation\n",
1424                    current, (pf_type == smep_fault) ? 'E' : 'A');
1425 
1426             domain_crash(current->domain);
1427         }
1428         if ( pf_type != real_fault )
1429             return;
1430     }
1431 
1432     if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1433         reserved_bit_page_fault(addr, regs);
1434 
1435     pv_inject_page_fault(regs->error_code, addr);
1436 }
1437 
1438 /*
1439  * Early #PF handler to print CR2, error code, and stack.
1440  *
1441  * We also deal with spurious faults here, even though they should never happen
1442  * during early boot (an issue was seen once, but was most likely a hardware
1443  * problem).
1444  */
1445 void __init do_early_page_fault(struct cpu_user_regs *regs)
1446 {
1447     static unsigned int __initdata stuck;
1448     static unsigned long __initdata prev_eip, prev_cr2;
1449     unsigned long cr2 = read_cr2();
1450 
1451     BUG_ON(smp_processor_id() != 0);
1452 
1453     if ( (regs->rip != prev_eip) || (cr2 != prev_cr2) )
1454     {
1455         prev_eip = regs->rip;
1456         prev_cr2 = cr2;
1457         stuck    = 0;
1458         return;
1459     }
1460 
1461     if ( stuck++ == 1000 )
1462     {
1463         console_start_sync();
1464         printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1465                regs->cs, _p(regs->rip), _p(cr2), regs->error_code);
1466         fatal_trap(regs, 0);
1467     }
1468 }
1469 
1470 void do_general_protection(struct cpu_user_regs *regs)
1471 {
1472     struct vcpu *v = current;
1473     unsigned long fixup;
1474 
1475     if ( debugger_trap_entry(TRAP_gp_fault, regs) )
1476         return;
1477 
1478     if ( regs->error_code & X86_XEC_EXT )
1479         goto hardware_gp;
1480 
1481     if ( !guest_mode(regs) )
1482         goto gp_in_kernel;
1483 
1484     /*
1485      * Cunning trick to allow arbitrary "INT n" handling.
1486      *
1487      * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1488      * instruction from trapping to the appropriate vector, when that might not
1489      * be expected by Xen or the guest OS. For example, that entry might be for
1490      * a fault handler (unlike traps, faults don't increment EIP), or might
1491      * expect an error code on the stack (which a software trap never
1492      * provides), or might be a hardware interrupt handler that doesn't like
1493      * being called spuriously.
1494      *
1495      * Instead, a GPF occurs with the faulting IDT vector in the error code.
1496      * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1497      * clear (which was already checked above) to indicate that it's a software
1498      * fault, not a hardware one.
1499      *
1500      * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1501      * okay because they can only be triggered by an explicit DPL-checked
1502      * instruction. The DPL specified by the guest OS for these vectors is NOT
1503      * CHECKED!!
1504      */
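    /*
     * Worked example: "int $0x80" through a DPL-0 IDT entry arrives here
     * with error_code = (0x80 << 3) | X86_XEC_IDT = 0x402, and the target
     * vector is recovered below as error_code >> 3.
     */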
1505     if ( regs->error_code & X86_XEC_IDT )
1506     {
1507         /* This fault must be due to <INT n> instruction. */
1508         const struct trap_info *ti;
1509         unsigned char vector = regs->error_code >> 3;
1510         ti = &v->arch.pv_vcpu.trap_ctxt[vector];
1511         if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1512         {
1513             regs->rip += 2;
1514             pv_inject_sw_interrupt(vector);
1515             return;
1516         }
1517     }
1518     else if ( is_pv_32bit_vcpu(v) && regs->error_code )
1519     {
1520         pv_emulate_gate_op(regs);
1521         return;
1522     }
1523 
1524     /* Emulate some simple privileged and I/O instructions. */
1525     if ( (regs->error_code == 0) &&
1526          pv_emulate_privileged_op(regs) )
1527     {
1528         trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
1529         return;
1530     }
1531 
1532     /* Pass on GPF as is. */
1533     pv_inject_hw_exception(TRAP_gp_fault, regs->error_code);
1534     return;
1535 
1536  gp_in_kernel:
1537 
1538     if ( likely((fixup = search_exception_table(regs)) != 0) )
1539     {
1540         dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n",
1541                 regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup));
1542         this_cpu(last_extable_addr) = regs->rip;
1543         regs->rip = fixup;
1544         return;
1545     }
1546 
1547  hardware_gp:
1548     if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
1549         return;
1550 
1551     show_execution_state(regs);
1552     panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code);
1553 }
1554 
1555 static void pci_serr_softirq(void)
1556 {
1557     printk("\n\nNMI - PCI system error (SERR)\n");
1558     outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
1559 }
1560 
1561 void async_exception_cleanup(struct vcpu *curr)
1562 {
1563     int trap;
1564 
1565     if ( !curr->async_exception_mask )
1566         return;
1567 
1568     /* Restore affinity.  */
1569     if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) &&
1570          !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) )
1571     {
1572         vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp);
1573         cpumask_clear(curr->cpu_hard_affinity_tmp);
1574     }
1575 
1576     if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
1577         trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
1578     else
1579         for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
1580             if ( (curr->async_exception_mask ^
1581                   curr->async_exception_state(trap).old_mask) == (1 << trap) )
1582                 break;
1583     if ( unlikely(trap > VCPU_TRAP_LAST) )
1584     {
1585         ASSERT_UNREACHABLE();
1586         return;
1587     }
1588 
1589     /* Restore previous asynchronous exception mask. */
1590     curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
1591 }
1592 
1593 static void nmi_hwdom_report(unsigned int reason_idx)
1594 {
1595     struct domain *d = hardware_domain;
1596 
1597     if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
1598         return;
1599 
1600     set_bit(reason_idx, nmi_reason(d));
1601 
1602     pv_raise_interrupt(d->vcpu[0], TRAP_nmi);
1603 }
1604 
1605 static void pci_serr_error(const struct cpu_user_regs *regs)
1606 {
1607     outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */
1608 
1609     switch ( opt_nmi[0] )
1610     {
1611     case 'd': /* 'dom0' */
1612         nmi_hwdom_report(_XEN_NMIREASON_pci_serr);
1613         /* fallthrough */
1614     case 'i': /* 'ignore' */
1615         /* Would like to print a diagnostic here but can't call printk()
1616            from NMI context -- raise a softirq instead. */
1617         raise_softirq(PCI_SERR_SOFTIRQ);
1618         break;
1619     default:  /* 'fatal' */
1620         console_force_unlock();
1621         printk("\n\nNMI - PCI system error (SERR)\n");
1622         fatal_trap(regs, 0);
1623     }
1624 }
1625 
1626 static void io_check_error(const struct cpu_user_regs *regs)
1627 {
1628     switch ( opt_nmi[0] )
1629     {
1630     case 'd': /* 'dom0' */
1631         nmi_hwdom_report(_XEN_NMIREASON_io_error);
             /* fallthrough */
1632     case 'i': /* 'ignore' */
1633         break;
1634     default:  /* 'fatal' */
1635         console_force_unlock();
1636         printk("\n\nNMI - I/O ERROR\n");
1637         fatal_trap(regs, 0);
1638     }
1639 
1640     outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1641     mdelay(1);
1642     outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1643 }
1644 
1645 static void unknown_nmi_error(const struct cpu_user_regs *regs,
1646                               unsigned char reason)
1647 {
1648     switch ( opt_nmi[0] )
1649     {
1650     case 'd': /* 'dom0' */
1651         nmi_hwdom_report(_XEN_NMIREASON_unknown);
             /* fallthrough */
1652     case 'i': /* 'ignore' */
1653         break;
1654     default:  /* 'fatal' */
1655         console_force_unlock();
1656         printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1657         printk("Do you have a strange power saving mode enabled?\n");
1658         fatal_trap(regs, 0);
1659     }
1660 }
1661 
1662 static int dummy_nmi_callback(const struct cpu_user_regs *regs, int cpu)
1663 {
1664     return 0;
1665 }
1666 
1667 static nmi_callback_t *nmi_callback = dummy_nmi_callback;
1668 
1669 void do_nmi(const struct cpu_user_regs *regs)
1670 {
1671     unsigned int cpu = smp_processor_id();
1672     unsigned char reason;
1673     bool handle_unknown = false;
1674 
1675     ++nmi_count(cpu);
1676 
1677     if ( nmi_callback(regs, cpu) )
1678         return;
1679 
1680     if ( (nmi_watchdog == NMI_NONE) ||
1681          (!nmi_watchdog_tick(regs) && watchdog_force) )
1682         handle_unknown = true;
1683 
1684     /* Only the BSP gets external NMIs from the system. */
1685     if ( cpu == 0 )
1686     {
1687         reason = inb(0x61); /* System Control Port B: NMI status bits. */
1688         if ( reason & 0x80 )
1689             pci_serr_error(regs);
1690         if ( reason & 0x40 )
1691             io_check_error(regs);
1692         if ( !(reason & 0xc0) && handle_unknown )
1693             unknown_nmi_error(regs, reason);
1694     }
1695 }
1696 
1697 nmi_callback_t *set_nmi_callback(nmi_callback_t *callback)
1698 {
1699     nmi_callback_t *old_nmi_callback = nmi_callback;
1700 
1701     nmi_callback = callback;
1702 
1703     return old_nmi_callback;
1704 }
1705 
1706 void unset_nmi_callback(void)
1707 {
1708     nmi_callback = dummy_nmi_callback;
1709 }
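
     /*
      * Illustrative sketch only (kept out of the build with "#if 0"): how a
      * subsystem might use the set_nmi_callback()/unset_nmi_callback() pair
      * above.  The callback and helper names are hypothetical; a real
      * callback returns nonzero only when it has fully handled the NMI, so
      * that do_nmi() skips the watchdog and platform-error processing.
      */
     #if 0
     static int example_nmi_callback(const struct cpu_user_regs *regs, int cpu)
     {
         if ( !example_counter_overflowed(cpu) )   /* hypothetical helper */
             return 0;                             /* not ours: default handling */

         example_collect_sample(regs, cpu);        /* hypothetical helper */
         return 1;                                 /* handled: skip default path */
     }

     static void example_profiling_session(void)
     {
         nmi_callback_t *old = set_nmi_callback(example_nmi_callback);

         /* ... run with the callback installed ... */

         set_nmi_callback(old);   /* or unset_nmi_callback() to restore default */
     }
     #endif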
1710 
1711 void do_device_not_available(struct cpu_user_regs *regs)
1712 {
1713     struct vcpu *curr = current;
1714 
1715     BUG_ON(!guest_mode(regs));
1716 
1717     vcpu_restore_fpu_lazy(curr);
1718 
1719     if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS )
1720     {
1721         pv_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1722         curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
1723     }
1724     else
1725         TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
1726 
1727     return;
1728 }
1729 
1730 u64 read_efer(void)
1731 {
1732     return this_cpu(efer);
1733 }
1734 
1735 void write_efer(u64 val)
1736 {
1737     this_cpu(efer) = val;
1738     wrmsrl(MSR_EFER, val);
1739 }
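
     /*
      * Illustrative sketch only (kept out of the build with "#if 0"): EFER
      * updates are expected to go through read_efer()/write_efer() above so
      * that the per-CPU "efer" cache and the real MSR stay in sync.  EFER_NX
      * is the architectural No-Execute enable bit; the helper name is made up.
      */
     #if 0
     static void example_enable_nx(void)
     {
         u64 efer = read_efer();

         if ( !(efer & EFER_NX) )
             write_efer(efer | EFER_NX);   /* updates cache and MSR together */
     }
     #endif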
1740 
1741 static void ler_enable(void)
1742 {
1743     u64 debugctl;
1744 
1745     if ( !this_cpu(ler_msr) )
1746         return;
1747 
1748     rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1749     wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
1750 }
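
     /*
      * Note: on the CPUs handled by percpu_traps_init() below, hardware is
      * expected to clear IA32_DEBUGCTLMSR.LBR when a debug exception is
      * delivered (freezing the last-branch records), which is why do_debug()
      * re-runs ler_enable() on every #DB rather than relying on a one-off
      * boot-time setting.
      */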
1751 
1752 void do_debug(struct cpu_user_regs *regs)
1753 {
1754     struct vcpu *v = current;
1755 
1756     if ( debugger_trap_entry(TRAP_debug, regs) )
1757         return;
1758 
1759     if ( !guest_mode(regs) )
1760     {
1761         if ( regs->eflags & X86_EFLAGS_TF )
1762         {
1763             /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
1764             if ( (regs->rip >= (unsigned long)sysenter_entry) &&
1765                  (regs->rip <= (unsigned long)sysenter_eflags_saved) )
1766             {
1767                 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
1768                     regs->eflags &= ~X86_EFLAGS_TF;
1769                 goto out;
1770             }
1771             if ( !debugger_trap_fatal(TRAP_debug, regs) )
1772             {
1773                 WARN();
1774                 regs->eflags &= ~X86_EFLAGS_TF;
1775             }
1776         }
1777         else
1778         {
1779             /*
1780              * We ignore watchpoints when they trigger within Xen. This may
1781              * happen when a buffer is passed to us which previously had a
1782              * watchpoint set on it. No need to bump EIP; the only faulting
1783              * trap is an instruction breakpoint, which can't happen to us.
1784              */
1785             WARN_ON(!search_exception_table(regs));
1786         }
1787         goto out;
1788     }
1789 
1790     /* Save debug status register where guest OS can peek at it */
1791     v->arch.debugreg[6] = read_debugreg(6);
1792 
1793     ler_enable();
1794     pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
1795     return;
1796 
1797  out:
1798     ler_enable();
1799     return;
1800 }
1801 
1802 static void __init noinline __set_intr_gate(unsigned int n,
1803                                             uint32_t dpl, void *addr)
1804 {
1805     _set_gate(&idt_table[n], SYS_DESC_irq_gate, dpl, addr);
1806 }
1807 
1808 static void __init set_swint_gate(unsigned int n, void *addr)
1809 {
1810     __set_intr_gate(n, 3, addr);
1811 }
1812 
1813 static void __init set_intr_gate(unsigned int n, void *addr)
1814 {
1815     __set_intr_gate(n, 0, addr);
1816 }
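
     /*
      * Note on the DPL choice above: a gate's DPL is only checked for
      * software "int $n" style invocations, not for hardware-raised
      * exceptions or external interrupts.  DPL 0 therefore stops guests from
      * reaching a vector directly with "int", while DPL 3 (set_swint_gate)
      * is needed for #BP and #OF, which guests legitimately raise via int3
      * (and, in compat mode, into).
      */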
1817 
1818 void load_TR(void)
1819 {
1820     struct tss_struct *tss = &this_cpu(init_tss);
1821     struct desc_ptr old_gdt, tss_gdt = {
1822         .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
1823         .limit = LAST_RESERVED_GDT_BYTE
1824     };
1825 
1826     _set_tssldt_desc(
1827         this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
1828         (unsigned long)tss,
1829         offsetof(struct tss_struct, __cacheline_filler) - 1,
1830         SYS_DESC_tss_avail);
1831     _set_tssldt_desc(
1832         this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
1833         (unsigned long)tss,
1834         offsetof(struct tss_struct, __cacheline_filler) - 1,
1835         SYS_DESC_tss_busy);
1836 
1837     /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
1838     asm volatile (
1839         "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
1840         : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
1841 }
1842 
1843 void percpu_traps_init(void)
1844 {
1845     subarch_percpu_traps_init();
1846 
1847     if ( !opt_ler )
1848         return;
1849 
1850     switch ( boot_cpu_data.x86_vendor )
1851     {
1852     case X86_VENDOR_INTEL:
1853         switch ( boot_cpu_data.x86 )
1854         {
1855         case 6:
1856             this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
1857             break;
1858         case 15:
1859             this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
1860             break;
1861         }
1862         break;
1863     case X86_VENDOR_AMD:
1864         switch ( boot_cpu_data.x86 )
1865         {
1866         case 6:
1867         case 0xf ... 0x17:
1868             this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
1869             break;
1870         }
1871         break;
1872     }
1873 
1874     ler_enable();
1875 }
1876 
1877 void __init init_idt_traps(void)
1878 {
1879     /*
1880      * Note that interrupt gates are always used, rather than trap gates. We
1881      * must have interrupts disabled until DS/ES/FS/GS are saved because the
1882      * first activation must have the "bad" value(s) for these registers and
1883      * we may lose them if another activation is installed before they are
1884      * saved. The page-fault handler also needs interrupts disabled until %cr2
1885      * has been read and saved on the stack.
1886      */
1887     set_intr_gate(TRAP_divide_error, &divide_error);
1888     set_intr_gate(TRAP_debug, &debug);
1889     set_intr_gate(TRAP_nmi, &nmi);
1890     set_swint_gate(TRAP_int3, &int3);         /* usable from all privileges */
1891     set_swint_gate(TRAP_overflow, &overflow); /* usable from all privileges */
1892     set_intr_gate(TRAP_bounds, &bounds);
1893     set_intr_gate(TRAP_invalid_op, &invalid_op);
1894     set_intr_gate(TRAP_no_device, &device_not_available);
1895     set_intr_gate(TRAP_double_fault, &double_fault);
1896     set_intr_gate(TRAP_invalid_tss, &invalid_TSS);
1897     set_intr_gate(TRAP_no_segment, &segment_not_present);
1898     set_intr_gate(TRAP_stack_error, &stack_segment);
1899     set_intr_gate(TRAP_gp_fault, &general_protection);
1900     set_intr_gate(TRAP_page_fault, &early_page_fault);
1901     set_intr_gate(TRAP_copro_error, &coprocessor_error);
1902     set_intr_gate(TRAP_alignment_check, &alignment_check);
1903     set_intr_gate(TRAP_machine_check, &machine_check);
1904     set_intr_gate(TRAP_simd_error, &simd_coprocessor_error);
1905 
1906     /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */
1907     set_ist(&idt_table[TRAP_double_fault],  IST_DF);
1908     set_ist(&idt_table[TRAP_nmi],           IST_NMI);
1909     set_ist(&idt_table[TRAP_machine_check], IST_MCE);
1910 
1911     /* CPU0 uses the master IDT. */
1912     idt_tables[0] = idt_table;
1913 
1914     this_cpu(gdt_table) = boot_cpu_gdt_table;
1915     this_cpu(compat_gdt_table) = boot_cpu_compat_gdt_table;
1916 }
1917 
1918 extern void (*const autogen_entrypoints[NR_VECTORS])(void);
1919 void __init trap_init(void)
1920 {
1921     unsigned int vector;
1922 
1923     /* Replace early pagefault with real pagefault handler. */
1924     set_intr_gate(TRAP_page_fault, &page_fault);
1925 
1926     pv_trap_init();
1927 
1928     for ( vector = 0; vector < NR_VECTORS; ++vector )
1929     {
1930         if ( autogen_entrypoints[vector] )
1931         {
1932             /* Found autogen entry: check we won't clobber an existing trap. */
1933             ASSERT(idt_table[vector].b == 0);
1934             set_intr_gate(vector, autogen_entrypoints[vector]);
1935         }
1936         else
1937         {
1938             /* No entry point: confirm we have an existing trap in place. */
1939             ASSERT(idt_table[vector].b != 0);
1940         }
1941     }
1942 
1943     percpu_traps_init();
1944 
1945     cpu_init();
1946 
1947     open_softirq(PCI_SERR_SOFTIRQ, pci_serr_softirq);
1948 }
1949 
1950 void activate_debugregs(const struct vcpu *curr)
1951 {
1952     ASSERT(curr == current);
1953 
1954     write_debugreg(0, curr->arch.debugreg[0]);
1955     write_debugreg(1, curr->arch.debugreg[1]);
1956     write_debugreg(2, curr->arch.debugreg[2]);
1957     write_debugreg(3, curr->arch.debugreg[3]);
1958     write_debugreg(6, curr->arch.debugreg[6]);
1959 
1960     /*
1961      * Avoid writing a value that is about to be replaced anyway when we
1962      * are called from set_debugreg() below.  Any future callers will need
1963      * to take this into account.
1964      */
1965     if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
1966         write_debugreg(7, curr->arch.debugreg[7]);
1967 
1968     if ( boot_cpu_has(X86_FEATURE_DBEXT) )
1969     {
1970         wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[0]);
1971         wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[1]);
1972         wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[2]);
1973         wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[3]);
1974     }
1975 }
1976 
1977 long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
1978 {
1979     int i;
1980     struct vcpu *curr = current;
1981 
1982     switch ( reg )
1983     {
1984     case 0:
1985         if ( !access_ok(value, sizeof(long)) )
1986             return -EPERM;
1987         if ( v == curr )
1988             write_debugreg(0, value);
1989         break;
1990     case 1:
1991         if ( !access_ok(value, sizeof(long)) )
1992             return -EPERM;
1993         if ( v == curr )
1994             write_debugreg(1, value);
1995         break;
1996     case 2:
1997         if ( !access_ok(value, sizeof(long)) )
1998             return -EPERM;
1999         if ( v == curr )
2000             write_debugreg(2, value);
2001         break;
2002     case 3:
2003         if ( !access_ok(value, sizeof(long)) )
2004             return -EPERM;
2005         if ( v == curr )
2006             write_debugreg(3, value);
2007         break;
2008     case 6:
2009         /*
2010          * DR6: Bits 4-11,16-31 reserved (set to 1).
2011          *      Bit 12 reserved (set to 0).
2012          */
2013         value &= ~DR_STATUS_RESERVED_ZERO; /* reserved bits => 0 */
2014         value |=  DR_STATUS_RESERVED_ONE;  /* reserved bits => 1 */
2015         if ( v == curr )
2016             write_debugreg(6, value);
2017         break;
2018     case 7:
2019         /*
2020          * DR7: Bit 10 reserved (set to 1).
2021          *      Bits 11-12,14-15 reserved (set to 0).
2022          */
2023         value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
2024         value |=  DR_CONTROL_RESERVED_ONE;  /* reserved bits => 1 */
2025         /*
2026          * Privileged bits:
2027          *      GD (bit 13): must be 0.
2028          */
2029         if ( value & DR_GENERAL_DETECT )
2030             return -EPERM;
2031         /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
2032         if ( value & DR7_ACTIVE_MASK )
2033         {
2034             unsigned int io_enable = 0;
2035 
2036             for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
2037             {
2038                 if ( ((value >> i) & 3) == DR_IO )
2039                 {
2040                     if ( !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
2041                         return -EPERM;
2042                     io_enable |= value & (3 << ((i - 16) >> 1));
2043                 }
2044             }
2045 
2046             /* Guest DR5 is a handy stash for I/O intercept information. */
2047             v->arch.debugreg[5] = io_enable;
2048             value &= ~io_enable;
2049 
2050             /*
2051              * If DR7 was previously clear then we need to load all other
2052              * debug registers at this point as they were not restored during
2053              * context switch.
2054              */
2055             if ( (v == curr) &&
2056                  !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
2057             {
2058                 activate_debugregs(v);
2059                 break;
2060             }
2061         }
2062         if ( v == curr )
2063             write_debugreg(7, value);
2064         break;
2065     default:
2066         return -EINVAL;
2067     }
2068 
2069     v->arch.debugreg[reg] = value;
2070     return 0;
2071 }
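
     /*
      * Illustrative sketch only (kept out of the build with "#if 0"): how the
      * DR_IO path above would be exercised.  The encoding follows the
      * architectural DR7 layout (the R/W0 and LEN0 fields for breakpoint 0
      * start at bit DR_CONTROL_SHIFT); the helper name and the choice of
      * breakpoint 0 are arbitrary.  set_debugreg() refuses the DR_IO type
      * with -EPERM unless the guest has CR4.DE set.
      */
     #if 0
     static long example_io_watchpoint(struct vcpu *v, unsigned int port)
     {
         unsigned long dr7 = 1;                             /* L0: local enable  */
         long rc;

         dr7 |= (unsigned long)DR_IO << DR_CONTROL_SHIFT;   /* R/W0 = I/O access */
         dr7 |= 3UL << (DR_CONTROL_SHIFT + 2);              /* LEN0 = 4 bytes    */

         rc = set_debugreg(v, 0, port);                     /* DR0 holds the port */
         if ( rc == 0 )
             rc = set_debugreg(v, 7, dr7);

         return rc;
     }
     #endif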
2072 
2073 void asm_domain_crash_synchronous(unsigned long addr)
2074 {
2075     /*
2076      * We need to clear the AC bit here because in entry.S AC is set
2077      * by ASM_STAC to temporarily allow accesses to user pages, which
2078      * SMAP would otherwise prevent.
2079      *
2080      * On some of the code paths that reach this function clac() is not
2081      * strictly needed, but doing it here rather than at every call site
2082      * of asm_domain_crash_synchronous() reduces code duplication, and it
2083      * is harmless in any case.
2084      */
2085     clac();
2086 
2087     if ( addr == 0 )
2088         addr = this_cpu(last_extable_addr);
2089 
2090     printk("domain_crash_sync called from entry.S: fault at %p %pS\n",
2091            _p(addr), _p(addr));
2092 
2093     __domain_crash_synchronous();
2094 }
2095 
2096 /*
2097  * Local variables:
2098  * mode: C
2099  * c-file-style: "BSD"
2100  * c-basic-offset: 4
2101  * tab-width: 4
2102  * indent-tabs-mode: nil
2103  * End:
2104  */
2105