1 /******************************************************************************
2 * arch/x86/traps.c
3 *
4 * Modifications to Linux original are copyright (c) 2002-2004, K A Fraser
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 /*
21 * Copyright (C) 1991, 1992 Linus Torvalds
22 *
23 * Pentium III FXSR, SSE support
24 * Gareth Hughes <gareth@valinux.com>, May 2000
25 */
26
27 #include <xen/init.h>
28 #include <xen/sched.h>
29 #include <xen/lib.h>
30 #include <xen/err.h>
31 #include <xen/errno.h>
32 #include <xen/mm.h>
33 #include <xen/console.h>
34 #include <xen/shutdown.h>
35 #include <xen/guest_access.h>
36 #include <asm/regs.h>
37 #include <xen/delay.h>
38 #include <xen/event.h>
39 #include <xen/spinlock.h>
40 #include <xen/irq.h>
41 #include <xen/perfc.h>
42 #include <xen/softirq.h>
43 #include <xen/domain_page.h>
44 #include <xen/symbols.h>
45 #include <xen/iocap.h>
46 #include <xen/version.h>
47 #include <xen/kexec.h>
48 #include <xen/trace.h>
49 #include <xen/paging.h>
50 #include <xen/virtual_region.h>
51 #include <xen/watchdog.h>
52 #include <xen/livepatch.h>
53 #include <asm/system.h>
54 #include <asm/io.h>
55 #include <asm/atomic.h>
56 #include <xen/bitops.h>
57 #include <asm/desc.h>
58 #include <asm/debugreg.h>
59 #include <asm/smp.h>
60 #include <asm/flushtlb.h>
61 #include <asm/uaccess.h>
62 #include <asm/i387.h>
63 #include <asm/xstate.h>
64 #include <asm/debugger.h>
65 #include <asm/msr.h>
66 #include <asm/nmi.h>
67 #include <asm/shared.h>
68 #include <asm/x86_emulate.h>
69 #include <asm/traps.h>
70 #include <asm/hvm/vpt.h>
71 #include <asm/hypercall.h>
72 #include <asm/mce.h>
73 #include <asm/apic.h>
74 #include <asm/mc146818rtc.h>
75 #include <asm/hpet.h>
76 #include <asm/vpmu.h>
77 #include <public/arch-x86/cpuid.h>
78 #include <asm/cpuid.h>
79 #include <xsm/xsm.h>
80 #include <asm/pv/traps.h>
81 #include <asm/pv/mm.h>
82
83 /*
84 * opt_nmi: one of 'ignore', 'dom0', or 'fatal'.
85  *  fatal:  Xen prints a diagnostic message and then hangs.
86 * dom0: The NMI is virtualised to DOM0.
87 * ignore: The NMI error is cleared and ignored.
88 */
89 #ifdef NDEBUG
90 static char __read_mostly opt_nmi[10] = "dom0";
91 #else
92 static char __read_mostly opt_nmi[10] = "fatal";
93 #endif
94 string_param("nmi", opt_nmi);
95
96 DEFINE_PER_CPU(u64, efer);
97 static DEFINE_PER_CPU(unsigned long, last_extable_addr);
98
99 DEFINE_PER_CPU_READ_MOSTLY(u32, ler_msr);
100
101 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, gdt_table);
102 DEFINE_PER_CPU_READ_MOSTLY(struct desc_struct *, compat_gdt_table);
103
104 /* Master table, used by CPU0. */
105 idt_entry_t idt_table[IDT_ENTRIES];
106
107 /* Pointer to the IDT of every CPU. */
108 idt_entry_t *idt_tables[NR_CPUS] __read_mostly;
109
110 void (*ioemul_handle_quirk)(
111 u8 opcode, char *io_emul_stub, struct cpu_user_regs *regs);
112
113 static int debug_stack_lines = 20;
114 integer_param("debug_stack_lines", debug_stack_lines);
115
116 static bool opt_ler;
117 boolean_param("ler", opt_ler);
118
119 #define stack_words_per_line 4
120 #define ESP_BEFORE_EXCEPTION(regs) ((unsigned long *)regs->rsp)
121
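/*
 * Dump the code bytes around regs->rip for a fault taken in Xen context.
 * Both copies below are exception-table protected, so a fault part-way
 * through the copy is tolerated and reported as "[fault on access]".
 */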
122 static void show_code(const struct cpu_user_regs *regs)
123 {
124 unsigned char insns_before[8] = {}, insns_after[16] = {};
125 unsigned int i, tmp, missing_before, missing_after;
126
127 if ( guest_mode(regs) )
128 return;
129
130 stac();
131
132 /*
133 * Copy forward from regs->rip. In the case of a fault, %ecx contains the
134 * number of bytes remaining to copy.
135 */
136 asm volatile ("1: rep movsb; 2:"
137 _ASM_EXTABLE(1b, 2b)
138 : "=&c" (missing_after),
139 "=&D" (tmp), "=&S" (tmp)
140 : "0" (ARRAY_SIZE(insns_after)),
141 "1" (insns_after),
142 "2" (regs->rip));
143
144 /*
145 * Copy backwards from regs->rip - 1. In the case of a fault, %ecx
146 * contains the number of bytes remaining to copy.
147 */
148 asm volatile ("std;"
149 "1: rep movsb;"
150 "2: cld;"
151 _ASM_EXTABLE(1b, 2b)
152 : "=&c" (missing_before),
153 "=&D" (tmp), "=&S" (tmp)
154 : "0" (ARRAY_SIZE(insns_before)),
155 "1" (insns_before + ARRAY_SIZE(insns_before) - 1),
156 "2" (regs->rip - 1));
157 clac();
158
159 printk("Xen code around <%p> (%ps)%s:\n",
160 _p(regs->rip), _p(regs->rip),
161 (missing_before || missing_after) ? " [fault on access]" : "");
162
163 /* Print bytes from insns_before[]. */
164 for ( i = 0; i < ARRAY_SIZE(insns_before); ++i )
165 {
166 if ( i < missing_before )
167 printk(" --");
168 else
169 printk(" %02x", insns_before[i]);
170 }
171
172 /* Print the byte under %rip. */
173 if ( missing_after != ARRAY_SIZE(insns_after) )
174 printk(" <%02x>", insns_after[0]);
175 else
176 printk(" <-->");
177
178 /* Print bytes from insns_after[]. */
179 for ( i = 1; i < ARRAY_SIZE(insns_after); ++i )
180 {
181 if ( i < (ARRAY_SIZE(insns_after) - missing_after) )
182 printk(" %02x", insns_after[i]);
183 else
184 printk(" --");
185 }
186
187 printk("\n");
188 }
189
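/*
 * Dump the guest stack of a 32-bit PV vCPU. If v is not the currently
 * running vCPU and its page tables are not the ones loaded, the stack page
 * is mapped via do_page_walk() and only that single page is dumped.
 */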
190 static void compat_show_guest_stack(struct vcpu *v,
191 const struct cpu_user_regs *regs,
192 int debug_stack_lines)
193 {
194 unsigned int i, *stack, addr, mask = STACK_SIZE;
195
196 stack = (unsigned int *)(unsigned long)regs->esp;
197 printk("Guest stack trace from esp=%08lx:\n ", (unsigned long)stack);
198
199 if ( !__compat_access_ok(v->domain, stack, sizeof(*stack)) )
200 {
201 printk("Guest-inaccessible memory.\n");
202 return;
203 }
204
205 if ( v != current )
206 {
207 struct vcpu *vcpu;
208 unsigned long mfn;
209
210 ASSERT(guest_kernel_mode(v, regs));
211 mfn = read_cr3() >> PAGE_SHIFT;
212 for_each_vcpu( v->domain, vcpu )
213 if ( pagetable_get_pfn(vcpu->arch.guest_table) == mfn )
214 break;
215 if ( !vcpu )
216 {
217 stack = do_page_walk(v, (unsigned long)stack);
218 if ( (unsigned long)stack < PAGE_SIZE )
219 {
220 printk("Inaccessible guest memory.\n");
221 return;
222 }
223 mask = PAGE_SIZE;
224 }
225 }
226
227 for ( i = 0; i < debug_stack_lines * 8; i++ )
228 {
229 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
230 break;
231 if ( __get_user(addr, stack) )
232 {
233 if ( i != 0 )
234 printk("\n ");
235 printk("Fault while accessing guest memory.");
236 i = 1;
237 break;
238 }
239 if ( (i != 0) && ((i % 8) == 0) )
240 printk("\n ");
241 printk(" %08x", addr);
242 stack++;
243 }
244 if ( mask == PAGE_SIZE )
245 {
246 BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE);
247 unmap_domain_page(stack);
248 }
249 if ( i == 0 )
250 printk("Stack empty.");
251 printk("\n");
252 }
253
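/* Dump a PV guest's stack; HVM vCPUs are skipped as their stack layout is unknown. */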
254 static void show_guest_stack(struct vcpu *v, const struct cpu_user_regs *regs)
255 {
256 int i;
257 unsigned long *stack, addr;
258 unsigned long mask = STACK_SIZE;
259
260 /* Avoid HVM as we don't know what the stack looks like. */
261 if ( is_hvm_vcpu(v) )
262 return;
263
264 if ( is_pv_32bit_vcpu(v) )
265 {
266 compat_show_guest_stack(v, regs, debug_stack_lines);
267 return;
268 }
269
270 stack = (unsigned long *)regs->rsp;
271 printk("Guest stack trace from "__OP"sp=%p:\n ", stack);
272
273 if ( !access_ok(stack, sizeof(*stack)) )
274 {
275 printk("Guest-inaccessible memory.\n");
276 return;
277 }
278
279 if ( v != current )
280 {
281 struct vcpu *vcpu;
282
283 ASSERT(guest_kernel_mode(v, regs));
284 vcpu = maddr_get_owner(read_cr3()) == v->domain ? v : NULL;
285 if ( !vcpu )
286 {
287 stack = do_page_walk(v, (unsigned long)stack);
288 if ( (unsigned long)stack < PAGE_SIZE )
289 {
290 printk("Inaccessible guest memory.\n");
291 return;
292 }
293 mask = PAGE_SIZE;
294 }
295 }
296
297 for ( i = 0; i < (debug_stack_lines*stack_words_per_line); i++ )
298 {
299 if ( (((long)stack - 1) ^ ((long)(stack + 1) - 1)) & mask )
300 break;
301 if ( __get_user(addr, stack) )
302 {
303 if ( i != 0 )
304 printk("\n ");
305 printk("Fault while accessing guest memory.");
306 i = 1;
307 break;
308 }
309 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
310 printk("\n ");
311 printk(" %p", _p(addr));
312 stack++;
313 }
314 if ( mask == PAGE_SIZE )
315 {
316 BUILD_BUG_ON(PAGE_SIZE == STACK_SIZE);
317 unmap_domain_page(stack);
318 }
319 if ( i == 0 )
320 printk("Stack empty.");
321 printk("\n");
322 }
323
324 /*
325 * Notes for get_stack_trace_bottom() and get_stack_dump_bottom()
326 *
327 * Stack pages 0, 1 and 2:
328 * These are all 1-page IST stacks. Each of these stacks have an exception
329 * frame and saved register state at the top. The interesting bound for a
330 * trace is the word adjacent to this, while the bound for a dump is the
331 * very top, including the exception frame.
332 *
333 * Stack pages 3, 4 and 5:
334 * None of these are particularly interesting. With MEMORY_GUARD, page 5 is
335 * explicitly not present, so attempting to dump or trace it is
336 * counterproductive. Without MEMORY_GUARD, it is possible for a call chain
337 * to use the entire primary stack and wander into page 5. In this case,
338 * consider these pages an extension of the primary stack to aid debugging
339  *    hopefully rare situations where the primary stack has effectively been
340  *    overflowed.
341 *
342 * Stack pages 6 and 7:
343 * These form the primary stack, and have a cpu_info at the top. For a
344 * trace, the interesting bound is adjacent to the cpu_info, while for a
345 * dump, the entire cpu_info is interesting.
346 *
347 * For the cases where the stack should not be inspected, pretend that the
348 * passed stack pointer is already out of reasonable bounds.
349 */
350 unsigned long get_stack_trace_bottom(unsigned long sp)
351 {
352 switch ( get_stack_page(sp) )
353 {
354 case 0 ... 2:
355 return ROUNDUP(sp, PAGE_SIZE) -
356 offsetof(struct cpu_user_regs, es) - sizeof(unsigned long);
357
358 #ifndef MEMORY_GUARD
359 case 3 ... 5:
360 #endif
361 case 6 ... 7:
362 return ROUNDUP(sp, STACK_SIZE) -
363 sizeof(struct cpu_info) - sizeof(unsigned long);
364
365 default:
366 return sp - sizeof(unsigned long);
367 }
368 }
369
370 unsigned long get_stack_dump_bottom(unsigned long sp)
371 {
372 switch ( get_stack_page(sp) )
373 {
374 case 0 ... 2:
375 return ROUNDUP(sp, PAGE_SIZE) - sizeof(unsigned long);
376
377 #ifndef MEMORY_GUARD
378 case 3 ... 5:
379 #endif
380 case 6 ... 7:
381 return ROUNDUP(sp, STACK_SIZE) - sizeof(unsigned long);
382
383 default:
384 return sp - sizeof(unsigned long);
385 }
386 }
387
388 #if !defined(CONFIG_FRAME_POINTER)
389
390 /*
391 * Stack trace from pointers found in stack, unaided by frame pointers. For
392 * caller convenience, this has the same prototype as its alternative, and
393 * simply ignores the base pointer parameter.
394 */
395 static void _show_trace(unsigned long sp, unsigned long __maybe_unused bp)
396 {
397 unsigned long *stack = (unsigned long *)sp, addr;
398 unsigned long *bottom = (unsigned long *)get_stack_trace_bottom(sp);
399
400 while ( stack <= bottom )
401 {
402 addr = *stack++;
403 if ( is_active_kernel_text(addr) )
404 printk(" [<%p>] %pS\n", _p(addr), _p(addr));
405 }
406 }
407
408 #else
409
410 /* Stack trace from frames in the stack, using frame pointers */
411 static void _show_trace(unsigned long sp, unsigned long bp)
412 {
413 unsigned long *frame, next, addr;
414
415 /* Bounds for range of valid frame pointer. */
416 unsigned long low = sp, high = get_stack_trace_bottom(sp);
417
418 /* The initial frame pointer. */
419 next = bp;
420
421 for ( ; ; )
422 {
423 /* Valid frame pointer? */
424 if ( (next < low) || (next >= high) )
425 {
426 /*
427 * Exception stack frames have a different layout, denoted by an
428 * inverted frame pointer.
429 */
430 next = ~next;
431 if ( (next < low) || (next >= high) )
432 break;
433 frame = (unsigned long *)next;
434 next = frame[0];
435 addr = frame[(offsetof(struct cpu_user_regs, rip) -
436 offsetof(struct cpu_user_regs, rbp))
437 / BYTES_PER_LONG];
438 }
439 else
440 {
441 /* Ordinary stack frame. */
442 frame = (unsigned long *)next;
443 next = frame[0];
444 addr = frame[1];
445 }
446
447 printk(" [<%p>] %pS\n", _p(addr), _p(addr));
448
449 low = (unsigned long)&frame[2];
450 }
451 }
452
453 #endif
454
455 static void show_trace(const struct cpu_user_regs *regs)
456 {
457 unsigned long *sp = ESP_BEFORE_EXCEPTION(regs);
458
459 printk("Xen call trace:\n");
460
461 /*
462 * If RIP looks sensible, or the top of the stack doesn't, print RIP at
463 * the top of the stack trace.
464 */
465 if ( is_active_kernel_text(regs->rip) ||
466 !is_active_kernel_text(*sp) )
467 printk(" [<%p>] %pS\n", _p(regs->rip), _p(regs->rip));
468 /*
469 * Else RIP looks bad but the top of the stack looks good. Perhaps we
470 * followed a wild function pointer? Lets assume the top of the stack is a
471 * return address; print it and skip past so _show_trace() doesn't print
472 * it again.
473 */
474 else
475 {
476 printk(" [<%p>] %pS\n", _p(*sp), _p(*sp));
477 sp++;
478 }
479
480 _show_trace((unsigned long)sp, regs->rbp);
481
482 printk("\n");
483 }
484
485 void show_stack(const struct cpu_user_regs *regs)
486 {
487 unsigned long *stack = ESP_BEFORE_EXCEPTION(regs), *stack_bottom, addr;
488 int i;
489
490 if ( guest_mode(regs) )
491 return show_guest_stack(current, regs);
492
493 printk("Xen stack trace from "__OP"sp=%p:\n ", stack);
494
495 stack_bottom = _p(get_stack_dump_bottom(regs->rsp));
496
497 for ( i = 0; i < (debug_stack_lines*stack_words_per_line) &&
498 (stack <= stack_bottom); i++ )
499 {
500 if ( (i != 0) && ((i % stack_words_per_line) == 0) )
501 printk("\n ");
502 addr = *stack++;
503 printk(" %p", _p(addr));
504 }
505 if ( i == 0 )
506 printk("Stack empty.");
507 printk("\n");
508
509 show_trace(regs);
510 }
511
512 void show_stack_overflow(unsigned int cpu, const struct cpu_user_regs *regs)
513 {
514 unsigned long esp = regs->rsp;
515 unsigned long curr_stack_base = esp & ~(STACK_SIZE - 1);
516 #ifdef MEMORY_GUARD
517 unsigned long esp_top, esp_bottom;
518 #endif
519
520 if ( _p(curr_stack_base) != stack_base[cpu] )
521 printk("Current stack base %p differs from expected %p\n",
522 _p(curr_stack_base), stack_base[cpu]);
523
524 #ifdef MEMORY_GUARD
525 esp_bottom = (esp | (STACK_SIZE - 1)) + 1;
526 esp_top = esp_bottom - PRIMARY_STACK_SIZE;
527
528 printk("Valid stack range: %p-%p, sp=%p, tss.rsp0=%p\n",
529 (void *)esp_top, (void *)esp_bottom, (void *)esp,
530 (void *)per_cpu(init_tss, cpu).rsp0);
531
532 /*
533 * Trigger overflow trace if %esp is anywhere within the guard page, or
534 * with fewer than 512 bytes remaining on the primary stack.
535 */
536 if ( (esp > (esp_top + 512)) ||
537 (esp < (esp_top - PAGE_SIZE)) )
538 {
539 printk("No stack overflow detected. Skipping stack trace.\n");
540 return;
541 }
542
543 if ( esp < esp_top )
544 esp = esp_top;
545
546 printk("Xen stack overflow (dumping trace %p-%p):\n",
547 (void *)esp, (void *)esp_bottom);
548
549 _show_trace(esp, regs->rbp);
550
551 printk("\n");
552 #endif
553 }
554
555 void show_execution_state(const struct cpu_user_regs *regs)
556 {
557 /* Prevent interleaving of output. */
558 unsigned long flags = console_lock_recursive_irqsave();
559
560 show_registers(regs);
561 show_code(regs);
562 show_stack(regs);
563
564 console_unlock_recursive_irqrestore(flags);
565 }
566
567 void vcpu_show_execution_state(struct vcpu *v)
568 {
569 unsigned long flags;
570
571 printk("*** Dumping Dom%d vcpu#%d state: ***\n",
572 v->domain->domain_id, v->vcpu_id);
573
574 if ( v == current )
575 {
576 show_execution_state(guest_cpu_user_regs());
577 return;
578 }
579
580 vcpu_pause(v); /* acceptably dangerous */
581
582 /* Prevent interleaving of output. */
583 flags = console_lock_recursive_irqsave();
584
585 vcpu_show_registers(v);
586 if ( guest_kernel_mode(v, &v->arch.user_regs) )
587 show_guest_stack(v, &v->arch.user_regs);
588
589 console_unlock_recursive_irqrestore(flags);
590
591 vcpu_unpause(v);
592 }
593
594 static cpumask_t show_state_mask;
595 static bool opt_show_all;
596 boolean_param("async-show-all", opt_show_all);
597
598 static int nmi_show_execution_state(const struct cpu_user_regs *regs, int cpu)
599 {
600 if ( !cpumask_test_cpu(cpu, &show_state_mask) )
601 return 0;
602
603 if ( opt_show_all )
604 show_execution_state(regs);
605 else
606 printk(XENLOG_ERR "CPU%d @ %04x:%08lx (%pS)\n", cpu, regs->cs,
607 regs->rip, guest_mode(regs) ? _p(regs->rip) : NULL);
608 cpumask_clear_cpu(cpu, &show_state_mask);
609
610 return 1;
611 }
612
613 const char *trapstr(unsigned int trapnr)
614 {
615 static const char * const strings[] = {
616 "divide error", "debug", "nmi", "bkpt", "overflow", "bounds",
617 "invalid opcode", "device not available", "double fault",
618 "coprocessor segment", "invalid tss", "segment not found",
619 "stack error", "general protection fault", "page fault",
620 "spurious interrupt", "coprocessor error", "alignment check",
621 "machine check", "simd error", "virtualisation exception"
622 };
623
624 return trapnr < ARRAY_SIZE(strings) ? strings[trapnr] : "???";
625 }
626
627 /*
628 * This is called for faults at very unexpected times (e.g., when interrupts
629 * are disabled). In such situations we can't do much that is safe. We try to
630 * print out some tracing and then we just spin.
631 */
632 void fatal_trap(const struct cpu_user_regs *regs, bool show_remote)
633 {
634 static DEFINE_PER_CPU(char, depth);
635 unsigned int trapnr = regs->entry_vector;
636
637 /* Set AC to reduce chance of further SMAP faults */
638 stac();
639
640 /*
641 * In some cases, we can end up in a vicious cycle of fatal_trap()s
642 * within fatal_trap()s. We give the problem a couple of iterations to
643 * bottom out, and then we just panic.
644 */
645 if ( ++this_cpu(depth) < 3 )
646 {
647 watchdog_disable();
648 console_start_sync();
649
650 show_execution_state(regs);
651
652 if ( trapnr == TRAP_page_fault )
653 {
654 unsigned long cr2 = read_cr2();
655 printk("Faulting linear address: %p\n", _p(cr2));
656 show_page_walk(cr2);
657 }
658
659 if ( show_remote )
660 {
661 unsigned int msecs, pending;
662
663 cpumask_andnot(&show_state_mask, &cpu_online_map,
664 cpumask_of(smp_processor_id()));
665 set_nmi_callback(nmi_show_execution_state);
666 /* Ensure new callback is set before sending out the NMI. */
667 smp_wmb();
668 smp_send_nmi_allbutself();
669
670 /* Wait at most 10ms for some other CPU to respond. */
671 msecs = 10;
672 pending = cpumask_weight(&show_state_mask);
673 while ( pending && msecs-- )
674 {
675 unsigned int left;
676
677 mdelay(1);
678 left = cpumask_weight(&show_state_mask);
679 if ( left < pending )
680 {
681 pending = left;
682 msecs = 10;
683 }
684 }
685 }
686 }
687
688 panic("FATAL TRAP: vector = %d (%s)\n"
689 "[error_code=%04x] %s",
690 trapnr, trapstr(trapnr), regs->error_code,
691 (regs->eflags & X86_EFLAGS_IF) ? "" : ", IN INTERRUPT CONTEXT");
692 }
693
694 void do_reserved_trap(struct cpu_user_regs *regs)
695 {
696 unsigned int trapnr = regs->entry_vector;
697
698 if ( debugger_trap_fatal(trapnr, regs) )
699 return;
700
701 show_execution_state(regs);
702 panic("FATAL RESERVED TRAP %#x: %s", trapnr, trapstr(trapnr));
703 }
704
705 void do_trap(struct cpu_user_regs *regs)
706 {
707 struct vcpu *curr = current;
708 unsigned int trapnr = regs->entry_vector;
709 unsigned long fixup;
710
711 if ( regs->error_code & X86_XEC_EXT )
712 goto hardware_trap;
713
714 if ( debugger_trap_entry(trapnr, regs) )
715 return;
716
717 ASSERT(trapnr < 32);
718
719 if ( guest_mode(regs) )
720 {
721 pv_inject_hw_exception(trapnr,
722 (TRAP_HAVE_EC & (1u << trapnr))
723 ? regs->error_code : X86_EVENT_NO_EC);
724 return;
725 }
726
727 if ( ((trapnr == TRAP_copro_error) || (trapnr == TRAP_simd_error)) &&
728 system_state >= SYS_STATE_active && is_hvm_vcpu(curr) &&
729 curr->arch.hvm_vcpu.fpu_exception_callback )
730 {
731 curr->arch.hvm_vcpu.fpu_exception_callback(
732 curr->arch.hvm_vcpu.fpu_exception_callback_arg, regs);
733 return;
734 }
735
736 if ( likely((fixup = search_exception_table(regs)) != 0) )
737 {
738 dprintk(XENLOG_ERR, "Trap %u: %p [%ps] -> %p\n",
739 trapnr, _p(regs->rip), _p(regs->rip), _p(fixup));
740 this_cpu(last_extable_addr) = regs->rip;
741 regs->rip = fixup;
742 return;
743 }
744
745 hardware_trap:
746 if ( debugger_trap_fatal(trapnr, regs) )
747 return;
748
749 show_execution_state(regs);
750 panic("FATAL TRAP: vector = %d (%s)\n"
751 "[error_code=%04x]",
752 trapnr, trapstr(trapnr), regs->error_code);
753 }
754
755 /* Returns 0 if not handled, and non-0 for success. */
756 int rdmsr_hypervisor_regs(uint32_t idx, uint64_t *val)
757 {
758 struct domain *d = current->domain;
759 /* Optionally shift out of the way of Viridian architectural MSRs. */
760 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
761
762 switch ( idx - base )
763 {
764 case 0: /* Write hypercall page MSR. Read as zero. */
765 {
766 *val = 0;
767 return 1;
768 }
769 }
770
771 return 0;
772 }
773
774 /* Returns 1 if handled, 0 if not and -Exx for error. */
775 int wrmsr_hypervisor_regs(uint32_t idx, uint64_t val)
776 {
777 struct domain *d = current->domain;
778 /* Optionally shift out of the way of Viridian architectural MSRs. */
779 uint32_t base = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
780
781 switch ( idx - base )
782 {
783 case 0: /* Write hypercall page */
784 {
785 void *hypercall_page;
786 unsigned long gmfn = val >> PAGE_SHIFT;
787 unsigned int page_index = val & (PAGE_SIZE - 1);
788 struct page_info *page;
789 p2m_type_t t;
790
791 if ( page_index > 0 )
792 {
793 gdprintk(XENLOG_WARNING,
794 "wrmsr hypercall page index %#x unsupported\n",
795 page_index);
796 return 0;
797 }
798
799 page = get_page_from_gfn(d, gmfn, &t, P2M_ALLOC);
800
801 if ( !page || !get_page_type(page, PGT_writable_page) )
802 {
803 if ( page )
804 put_page(page);
805
806 if ( p2m_is_paging(t) )
807 {
808 p2m_mem_paging_populate(d, gmfn);
809 return -ERESTART;
810 }
811
812 gdprintk(XENLOG_WARNING,
813 "Bad GMFN %lx (MFN %lx) to MSR %08x\n",
814 gmfn, page ? page_to_mfn(page) : -1UL, base);
815 return 0;
816 }
817
818 hypercall_page = __map_domain_page(page);
819 hypercall_page_initialise(d, hypercall_page);
820 unmap_domain_page(hypercall_page);
821
822 put_page_and_type(page);
823 return 1;
824 }
825 }
826
827 return 0;
828 }
829
830 void cpuid_hypervisor_leaves(const struct vcpu *v, uint32_t leaf,
831 uint32_t subleaf, struct cpuid_leaf *res)
832 {
833 const struct domain *d = v->domain;
834 const struct cpuid_policy *p = d->arch.cpuid;
835 uint32_t base = is_viridian_domain(d) ? 0x40000100 : 0x40000000;
836 uint32_t idx = leaf - base;
837 unsigned int limit = is_viridian_domain(d) ? p->hv2_limit : p->hv_limit;
838
839 if ( limit == 0 )
840 /* Default number of leaves */
841 limit = XEN_CPUID_MAX_NUM_LEAVES;
842 else
843 /* Clamp toolstack value between 2 and MAX_NUM_LEAVES. */
844 limit = min(max(limit, 2u), XEN_CPUID_MAX_NUM_LEAVES + 0u);
845
846 if ( idx > limit )
847 return;
848
849 switch ( idx )
850 {
851 case 0:
852 res->a = base + limit; /* Largest leaf */
853 res->b = XEN_CPUID_SIGNATURE_EBX;
854 res->c = XEN_CPUID_SIGNATURE_ECX;
855 res->d = XEN_CPUID_SIGNATURE_EDX;
856 break;
857
858 case 1:
859 res->a = (xen_major_version() << 16) | xen_minor_version();
860 break;
861
862 case 2:
863 res->a = 1; /* Number of hypercall-transfer pages */
864 /* MSR base address */
865 res->b = is_viridian_domain(d) ? 0x40000200 : 0x40000000;
866 if ( is_pv_domain(d) ) /* Features */
867 res->c |= XEN_CPUID_FEAT1_MMU_PT_UPDATE_PRESERVE_AD;
868 break;
869
870 case 3: /* Time leaf. */
871 switch ( subleaf )
872 {
873 case 0: /* features */
874 res->a = ((d->arch.vtsc << 0) |
875 (!!host_tsc_is_safe() << 1) |
876 (!!boot_cpu_has(X86_FEATURE_RDTSCP) << 2));
877 res->b = d->arch.tsc_mode;
878 res->c = d->arch.tsc_khz;
879 res->d = d->arch.incarnation;
880 break;
881
882 case 1: /* scale and offset */
883 {
884 uint64_t offset;
885
886 if ( !d->arch.vtsc )
887 offset = d->arch.vtsc_offset;
888 else
889 /* offset already applied to value returned by virtual rdtscp */
890 offset = 0;
891 res->a = offset;
892 res->b = offset >> 32;
893 res->c = d->arch.vtsc_to_ns.mul_frac;
894 res->d = (s8)d->arch.vtsc_to_ns.shift;
895 break;
896 }
897
898 case 2: /* physical cpu_khz */
899 res->a = cpu_khz;
900 break;
901 }
902 break;
903
904 case 4: /* HVM hypervisor leaf. */
905 if ( !is_hvm_domain(d) || subleaf != 0 )
906 break;
907
908 if ( cpu_has_vmx_apic_reg_virt )
909 res->a |= XEN_HVM_CPUID_APIC_ACCESS_VIRT;
910
911 /*
912 * We want to claim that x2APIC is virtualized if APIC MSR accesses
913 * are not intercepted. When all three of these are true both rdmsr
914 * and wrmsr in the guest will run without VMEXITs (see
915 * vmx_vlapic_msr_changed()).
916 */
917 if ( cpu_has_vmx_virtualize_x2apic_mode &&
918 cpu_has_vmx_apic_reg_virt &&
919 cpu_has_vmx_virtual_intr_delivery )
920 res->a |= XEN_HVM_CPUID_X2APIC_VIRT;
921
922 /*
923 * Indicate that memory mapped from other domains (either grants or
924 * foreign pages) has valid IOMMU entries.
925 */
926 res->a |= XEN_HVM_CPUID_IOMMU_MAPPINGS;
927
928 /* Indicate presence of vcpu id and set it in ebx */
929 res->a |= XEN_HVM_CPUID_VCPU_ID_PRESENT;
930 res->b = v->vcpu_id;
931
932 /* Indicate presence of domain id and set it in ecx */
933 res->a |= XEN_HVM_CPUID_DOMID_PRESENT;
934 res->c = d->domain_id;
935
936 break;
937
938 case 5: /* PV-specific parameters */
939 if ( is_hvm_domain(d) || subleaf != 0 )
940 break;
941
942 res->b = flsl(get_upper_mfn_bound()) + PAGE_SHIFT;
943 break;
944
945 default:
946 ASSERT_UNREACHABLE();
947 }
948 }
949
950 void do_invalid_op(struct cpu_user_regs *regs)
951 {
952 const struct bug_frame *bug = NULL;
953 u8 bug_insn[2];
954 const char *prefix = "", *filename, *predicate, *eip = (char *)regs->rip;
955 unsigned long fixup;
956 int id = -1, lineno;
957 const struct virtual_region *region;
958
959 if ( debugger_trap_entry(TRAP_invalid_op, regs) )
960 return;
961
962 if ( likely(guest_mode(regs)) )
963 {
964 if ( pv_emulate_invalid_op(regs) )
965 pv_inject_hw_exception(TRAP_invalid_op, X86_EVENT_NO_EC);
966 return;
967 }
968
969 if ( !is_active_kernel_text(regs->rip) ||
970 __copy_from_user(bug_insn, eip, sizeof(bug_insn)) ||
971 memcmp(bug_insn, "\xf\xb", sizeof(bug_insn)) )
972 goto die;
973
974 region = find_text_region(regs->rip);
975 if ( region )
976 {
977 for ( id = 0; id < BUGFRAME_NR; id++ )
978 {
979 const struct bug_frame *b;
980 unsigned int i;
981
982 for ( i = 0, b = region->frame[id].bugs;
983 i < region->frame[id].n_bugs; b++, i++ )
984 {
985 if ( bug_loc(b) == eip )
986 {
987 bug = b;
988 goto found;
989 }
990 }
991 }
992 }
993
994 found:
995 if ( !bug )
996 goto die;
997 eip += sizeof(bug_insn);
998 if ( id == BUGFRAME_run_fn )
999 {
1000 void (*fn)(struct cpu_user_regs *) = bug_ptr(bug);
1001
1002 fn(regs);
1003 regs->rip = (unsigned long)eip;
1004 return;
1005 }
1006
1007 /* WARN, BUG or ASSERT: decode the filename pointer and line number. */
1008 filename = bug_ptr(bug);
1009 if ( !is_kernel(filename) && !is_patch(filename) )
1010 goto die;
1011 fixup = strlen(filename);
1012 if ( fixup > 50 )
1013 {
1014 filename += fixup - 47;
1015 prefix = "...";
1016 }
1017 lineno = bug_line(bug);
1018
1019 switch ( id )
1020 {
1021 case BUGFRAME_warn:
1022 printk("Xen WARN at %s%s:%d\n", prefix, filename, lineno);
1023 show_execution_state(regs);
1024 regs->rip = (unsigned long)eip;
1025 return;
1026
1027 case BUGFRAME_bug:
1028 printk("Xen BUG at %s%s:%d\n", prefix, filename, lineno);
1029
1030 if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1031 return;
1032
1033 show_execution_state(regs);
1034 panic("Xen BUG at %s%s:%d", prefix, filename, lineno);
1035
1036 case BUGFRAME_assert:
1037 /* ASSERT: decode the predicate string pointer. */
1038 predicate = bug_msg(bug);
1039 if ( !is_kernel(predicate) && !is_patch(predicate) )
1040 predicate = "<unknown>";
1041
1042 printk("Assertion '%s' failed at %s%s:%d\n",
1043 predicate, prefix, filename, lineno);
1044
1045 if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1046 return;
1047
1048 show_execution_state(regs);
1049 panic("Assertion '%s' failed at %s%s:%d",
1050 predicate, prefix, filename, lineno);
1051 }
1052
1053 die:
1054 if ( (fixup = search_exception_table(regs)) != 0 )
1055 {
1056 this_cpu(last_extable_addr) = regs->rip;
1057 regs->rip = fixup;
1058 return;
1059 }
1060
1061 if ( debugger_trap_fatal(TRAP_invalid_op, regs) )
1062 return;
1063
1064 show_execution_state(regs);
1065 panic("FATAL TRAP: vector = %d (invalid opcode)", TRAP_invalid_op);
1066 }
1067
1068 void do_int3(struct cpu_user_regs *regs)
1069 {
1070 if ( debugger_trap_entry(TRAP_int3, regs) )
1071 return;
1072
1073 if ( !guest_mode(regs) )
1074 {
1075 unsigned long fixup;
1076
1077 if ( (fixup = search_exception_table(regs)) != 0 )
1078 {
1079 this_cpu(last_extable_addr) = regs->rip;
1080 dprintk(XENLOG_DEBUG, "Trap %u: %p [%ps] -> %p\n",
1081 TRAP_int3, _p(regs->rip), _p(regs->rip), _p(fixup));
1082 regs->rip = fixup;
1083 return;
1084 }
1085
1086 if ( !debugger_trap_fatal(TRAP_int3, regs) )
1087 printk(XENLOG_DEBUG "Hit embedded breakpoint at %p [%ps]\n",
1088 _p(regs->rip), _p(regs->rip));
1089
1090 return;
1091 }
1092
1093 pv_inject_hw_exception(TRAP_int3, X86_EVENT_NO_EC);
1094 }
1095
1096 static void reserved_bit_page_fault(unsigned long addr,
1097 struct cpu_user_regs *regs)
1098 {
1099 printk("%pv: reserved bit in page table (ec=%04X)\n",
1100 current, regs->error_code);
1101 show_page_walk(addr);
1102 show_execution_state(regs);
1103 }
1104
1105 static int handle_gdt_ldt_mapping_fault(unsigned long offset,
1106 struct cpu_user_regs *regs)
1107 {
1108 struct vcpu *curr = current;
1109 /* Which vcpu's area did we fault in, and is it in the ldt sub-area? */
1110 unsigned int is_ldt_area = (offset >> (GDT_LDT_VCPU_VA_SHIFT-1)) & 1;
1111 unsigned int vcpu_area = (offset >> GDT_LDT_VCPU_VA_SHIFT);
1112
1113 /*
1114 * If the fault is in another vcpu's area, it cannot be due to
1115 * a GDT/LDT descriptor load. Thus we can reasonably exit immediately, and
1116 * indeed we have to since pv_map_ldt_shadow_page() works correctly only on
1117 * accesses to a vcpu's own area.
1118 */
1119 if ( vcpu_area != curr->vcpu_id )
1120 return 0;
1121
1122 /* Byte offset within the gdt/ldt sub-area. */
1123 offset &= (1UL << (GDT_LDT_VCPU_VA_SHIFT-1)) - 1UL;
1124
1125 if ( likely(is_ldt_area) )
1126 {
1127 /* LDT fault: Copy a mapping from the guest's LDT, if it is valid. */
1128 if ( likely(pv_map_ldt_shadow_page(offset)) )
1129 {
1130 if ( guest_mode(regs) )
1131 trace_trap_two_addr(TRC_PV_GDT_LDT_MAPPING_FAULT,
1132 regs->rip, offset);
1133 }
1134 else
1135 {
1136 /* In hypervisor mode? Leave it to the #PF handler to fix up. */
1137 if ( !guest_mode(regs) )
1138 return 0;
1139
1140 /* Access would have become non-canonical? Pass #GP[sel] back. */
1141 if ( unlikely(!is_canonical_address(
1142 curr->arch.pv_vcpu.ldt_base + offset)) )
1143 {
1144 uint16_t ec = (offset & ~(X86_XEC_EXT | X86_XEC_IDT)) | X86_XEC_TI;
1145
1146 pv_inject_hw_exception(TRAP_gp_fault, ec);
1147 }
1148 else
1149 /* else pass the #PF back, with adjusted %cr2. */
1150 pv_inject_page_fault(regs->error_code,
1151 curr->arch.pv_vcpu.ldt_base + offset);
1152 }
1153 }
1154 else
1155 {
1156 /* GDT fault: handle the fault as #GP(selector). */
1157 regs->error_code = offset & ~(X86_XEC_EXT | X86_XEC_IDT | X86_XEC_TI);
1158 (void)do_general_protection(regs);
1159 }
1160
1161 return EXCRET_fault_fixed;
1162 }
1163
1164 #define IN_HYPERVISOR_RANGE(va) \
1165 (((va) >= HYPERVISOR_VIRT_START) && ((va) < HYPERVISOR_VIRT_END))
1166
1167 enum pf_type {
1168 real_fault,
1169 smep_fault,
1170 smap_fault,
1171 spurious_fault
1172 };
1173
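/*
 * Walk the current page tables for addr and classify a fault taken in Xen
 * context: a real fault, an SMEP/SMAP violation, or a spurious fault (the
 * walk shows the access should have succeeded, e.g. after a lazy TLB flush).
 */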
1174 static enum pf_type __page_fault_type(unsigned long addr,
1175 const struct cpu_user_regs *regs)
1176 {
1177 unsigned long mfn, cr3 = read_cr3();
1178 l4_pgentry_t l4e, *l4t;
1179 l3_pgentry_t l3e, *l3t;
1180 l2_pgentry_t l2e, *l2t;
1181 l1_pgentry_t l1e, *l1t;
1182 unsigned int required_flags, disallowed_flags, page_user;
1183 unsigned int error_code = regs->error_code;
1184
1185 /*
1186 * We do not take spurious page faults in IRQ handlers as we do not
1187 * modify page tables in IRQ context. We therefore bail here because
1188 * map_domain_page() is not IRQ-safe.
1189 */
1190 if ( in_irq() )
1191 return real_fault;
1192
1193 /* Reserved bit violations are never spurious faults. */
1194 if ( error_code & PFEC_reserved_bit )
1195 return real_fault;
1196
1197 required_flags = _PAGE_PRESENT;
1198 if ( error_code & PFEC_write_access )
1199 required_flags |= _PAGE_RW;
1200 if ( error_code & PFEC_user_mode )
1201 required_flags |= _PAGE_USER;
1202
1203 disallowed_flags = 0;
1204 if ( error_code & PFEC_insn_fetch )
1205 disallowed_flags |= _PAGE_NX_BIT;
1206
1207 page_user = _PAGE_USER;
1208
1209 mfn = cr3 >> PAGE_SHIFT;
1210
1211 l4t = map_domain_page(_mfn(mfn));
1212 l4e = l4e_read_atomic(&l4t[l4_table_offset(addr)]);
1213 mfn = l4e_get_pfn(l4e);
1214 unmap_domain_page(l4t);
1215 if ( ((l4e_get_flags(l4e) & required_flags) != required_flags) ||
1216 (l4e_get_flags(l4e) & disallowed_flags) )
1217 return real_fault;
1218 page_user &= l4e_get_flags(l4e);
1219
1220 l3t = map_domain_page(_mfn(mfn));
1221 l3e = l3e_read_atomic(&l3t[l3_table_offset(addr)]);
1222 mfn = l3e_get_pfn(l3e);
1223 unmap_domain_page(l3t);
1224 if ( ((l3e_get_flags(l3e) & required_flags) != required_flags) ||
1225 (l3e_get_flags(l3e) & disallowed_flags) )
1226 return real_fault;
1227 page_user &= l3e_get_flags(l3e);
1228 if ( l3e_get_flags(l3e) & _PAGE_PSE )
1229 goto leaf;
1230
1231 l2t = map_domain_page(_mfn(mfn));
1232 l2e = l2e_read_atomic(&l2t[l2_table_offset(addr)]);
1233 mfn = l2e_get_pfn(l2e);
1234 unmap_domain_page(l2t);
1235 if ( ((l2e_get_flags(l2e) & required_flags) != required_flags) ||
1236 (l2e_get_flags(l2e) & disallowed_flags) )
1237 return real_fault;
1238 page_user &= l2e_get_flags(l2e);
1239 if ( l2e_get_flags(l2e) & _PAGE_PSE )
1240 goto leaf;
1241
1242 l1t = map_domain_page(_mfn(mfn));
1243 l1e = l1e_read_atomic(&l1t[l1_table_offset(addr)]);
1244 mfn = l1e_get_pfn(l1e);
1245 unmap_domain_page(l1t);
1246 if ( ((l1e_get_flags(l1e) & required_flags) != required_flags) ||
1247 (l1e_get_flags(l1e) & disallowed_flags) )
1248 return real_fault;
1249 page_user &= l1e_get_flags(l1e);
1250
1251 leaf:
1252 if ( page_user )
1253 {
1254 unsigned long cr4 = read_cr4();
1255 /*
1256 * Supervisor Mode Execution Prevention (SMEP):
1257 * Disallow supervisor execution from user-accessible mappings
1258 */
1259 if ( (cr4 & X86_CR4_SMEP) &&
1260 ((error_code & (PFEC_insn_fetch|PFEC_user_mode)) == PFEC_insn_fetch) )
1261 return smep_fault;
1262
1263 /*
1264 * Supervisor Mode Access Prevention (SMAP):
1265          * Disallow supervisor access to user-accessible mappings
1266 * A fault is considered as an SMAP violation if the following
1267 * conditions are true:
1268 * - X86_CR4_SMAP is set in CR4
1269 * - A user page is being accessed
1270 * - CPL=3 or X86_EFLAGS_AC is clear
1271 * - Page fault in kernel mode
1272 */
1273 if ( (cr4 & X86_CR4_SMAP) && !(error_code & PFEC_user_mode) &&
1274 (((regs->cs & 3) == 3) || !(regs->eflags & X86_EFLAGS_AC)) )
1275 return smap_fault;
1276 }
1277
1278 return spurious_fault;
1279 }
1280
1281 static enum pf_type spurious_page_fault(unsigned long addr,
1282 const struct cpu_user_regs *regs)
1283 {
1284 unsigned long flags;
1285 enum pf_type pf_type;
1286
1287 /*
1288 * Disabling interrupts prevents TLB flushing, and hence prevents
1289 * page tables from becoming invalid under our feet during the walk.
1290 */
1291 local_irq_save(flags);
1292 pf_type = __page_fault_type(addr, regs);
1293 local_irq_restore(flags);
1294
1295 return pf_type;
1296 }
1297
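/*
 * Attempt to fix up a page fault without injecting it into the guest:
 * GDT/LDT shadow mappings, writable-pagetable and mmio-ro emulation, and
 * shadow-paging faults are handled here. Returns 0 if the fault was not
 * handled.
 */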
1298 static int fixup_page_fault(unsigned long addr, struct cpu_user_regs *regs)
1299 {
1300 struct vcpu *v = current;
1301 struct domain *d = v->domain;
1302
1303 /* No fixups in interrupt context or when interrupts are disabled. */
1304 if ( in_irq() || !(regs->eflags & X86_EFLAGS_IF) )
1305 return 0;
1306
1307 if ( !(regs->error_code & PFEC_page_present) &&
1308 (pagefault_by_memadd(addr, regs)) )
1309 return handle_memadd_fault(addr, regs);
1310
1311 if ( unlikely(IN_HYPERVISOR_RANGE(addr)) )
1312 {
1313 if ( !(regs->error_code & (PFEC_user_mode | PFEC_reserved_bit)) &&
1314 (addr >= GDT_LDT_VIRT_START) && (addr < GDT_LDT_VIRT_END) )
1315 return handle_gdt_ldt_mapping_fault(
1316 addr - GDT_LDT_VIRT_START, regs);
1317 return 0;
1318 }
1319
1320 if ( guest_kernel_mode(v, regs) &&
1321 !(regs->error_code & (PFEC_reserved_bit | PFEC_insn_fetch)) &&
1322 (regs->error_code & PFEC_write_access) )
1323 {
1324 bool ptwr, mmio_ro;
1325
1326 ptwr = VM_ASSIST(d, writable_pagetables) &&
1327                /* Do not check for an access-protection fault, since the page
1328                   may legitimately not be present in the shadow page tables. */
1329 (paging_mode_enabled(d) ||
1330 (regs->error_code & PFEC_page_present));
1331
1332 mmio_ro = is_hardware_domain(d) &&
1333 (regs->error_code & PFEC_page_present);
1334
1335 if ( (ptwr || mmio_ro) && pv_ro_page_fault(addr, regs) )
1336 return EXCRET_fault_fixed;
1337 }
1338
1339 /*
1340 * For non-external shadowed guests, we fix up both their own pagefaults
1341 * and Xen's, since they share the pagetables. This includes hypervisor
1342 * faults, e.g. from copy_to_user().
1343 */
1344 if ( paging_mode_enabled(d) && !paging_mode_external(d) )
1345 {
1346 int ret = paging_fault(addr, regs);
1347
1348 if ( ret == EXCRET_fault_fixed )
1349 trace_trap_two_addr(TRC_PV_PAGING_FIXUP, regs->rip, addr);
1350 return ret;
1351 }
1352
1353 return 0;
1354 }
1355
1356 /*
1357 * #PF error code:
1358 * Bit 0: Protection violation (=1) ; Page not present (=0)
1359 * Bit 1: Write access
1360 * Bit 2: User mode (=1) ; Supervisor mode (=0)
1361 * Bit 3: Reserved bit violation
1362 * Bit 4: Instruction fetch
1363 */
1364 void do_page_fault(struct cpu_user_regs *regs)
1365 {
1366 unsigned long addr, fixup;
1367 unsigned int error_code;
1368 enum pf_type pf_type;
1369
1370 addr = read_cr2();
1371
1372 /* fixup_page_fault() might change regs->error_code, so cache it here. */
1373 error_code = regs->error_code;
1374
1375 if ( debugger_trap_entry(TRAP_page_fault, regs) )
1376 return;
1377
1378 perfc_incr(page_faults);
1379
1380 if ( unlikely(fixup_page_fault(addr, regs) != 0) )
1381 return;
1382
1383 if ( unlikely(!guest_mode(regs)) )
1384 {
1385 pf_type = spurious_page_fault(addr, regs);
1386 if ( (pf_type == smep_fault) || (pf_type == smap_fault) )
1387 {
1388 console_start_sync();
1389 printk("Xen SM%cP violation\n",
1390 (pf_type == smep_fault) ? 'E' : 'A');
1391 fatal_trap(regs, 0);
1392 }
1393
1394 if ( pf_type != real_fault )
1395 return;
1396
1397 if ( likely((fixup = search_exception_table(regs)) != 0) )
1398 {
1399 perfc_incr(copy_user_faults);
1400 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1401 reserved_bit_page_fault(addr, regs);
1402 this_cpu(last_extable_addr) = regs->rip;
1403 regs->rip = fixup;
1404 return;
1405 }
1406
1407 if ( debugger_trap_fatal(TRAP_page_fault, regs) )
1408 return;
1409
1410 show_execution_state(regs);
1411 show_page_walk(addr);
1412 panic("FATAL PAGE FAULT\n"
1413 "[error_code=%04x]\n"
1414 "Faulting linear address: %p",
1415 error_code, _p(addr));
1416 }
1417
1418 if ( unlikely(current->domain->arch.suppress_spurious_page_faults) )
1419 {
1420 pf_type = spurious_page_fault(addr, regs);
1421 if ( (pf_type == smep_fault) || (pf_type == smap_fault))
1422 {
1423 printk(XENLOG_G_ERR "%pv fatal SM%cP violation\n",
1424 current, (pf_type == smep_fault) ? 'E' : 'A');
1425
1426 domain_crash(current->domain);
1427 }
1428 if ( pf_type != real_fault )
1429 return;
1430 }
1431
1432 if ( unlikely(regs->error_code & PFEC_reserved_bit) )
1433 reserved_bit_page_fault(addr, regs);
1434
1435 pv_inject_page_fault(regs->error_code, addr);
1436 }
1437
1438 /*
1439 * Early #PF handler to print CR2, error code, and stack.
1440 *
1441 * We also deal with spurious faults here, even though they should never happen
1442 * during early boot (an issue was seen once, but was most likely a hardware
1443 * problem).
1444 */
1445 void __init do_early_page_fault(struct cpu_user_regs *regs)
1446 {
1447 static unsigned int __initdata stuck;
1448 static unsigned long __initdata prev_eip, prev_cr2;
1449 unsigned long cr2 = read_cr2();
1450
1451 BUG_ON(smp_processor_id() != 0);
1452
1453 if ( (regs->rip != prev_eip) || (cr2 != prev_cr2) )
1454 {
1455 prev_eip = regs->rip;
1456 prev_cr2 = cr2;
1457 stuck = 0;
1458 return;
1459 }
1460
1461 if ( stuck++ == 1000 )
1462 {
1463 console_start_sync();
1464 printk("Early fatal page fault at %04x:%p (cr2=%p, ec=%04x)\n",
1465 regs->cs, _p(regs->rip), _p(cr2), regs->error_code);
1466 fatal_trap(regs, 0);
1467 }
1468 }
1469
1470 void do_general_protection(struct cpu_user_regs *regs)
1471 {
1472 struct vcpu *v = current;
1473 unsigned long fixup;
1474
1475 if ( debugger_trap_entry(TRAP_gp_fault, regs) )
1476 return;
1477
1478 if ( regs->error_code & X86_XEC_EXT )
1479 goto hardware_gp;
1480
1481 if ( !guest_mode(regs) )
1482 goto gp_in_kernel;
1483
1484 /*
1485 * Cunning trick to allow arbitrary "INT n" handling.
1486 *
1487 * We set DPL == 0 on all vectors in the IDT. This prevents any INT <n>
1488 * instruction from trapping to the appropriate vector, when that might not
1489 * be expected by Xen or the guest OS. For example, that entry might be for
1490 * a fault handler (unlike traps, faults don't increment EIP), or might
1491 * expect an error code on the stack (which a software trap never
1492 * provides), or might be a hardware interrupt handler that doesn't like
1493 * being called spuriously.
1494 *
1495 * Instead, a GPF occurs with the faulting IDT vector in the error code.
1496 * Bit 1 is set to indicate that an IDT entry caused the fault. Bit 0 is
1497  * clear (which was already checked above) to indicate that it's a software
1498 * fault, not a hardware one.
1499 *
1500 * NOTE: Vectors 3 and 4 are dealt with from their own handler. This is
1501 * okay because they can only be triggered by an explicit DPL-checked
1502 * instruction. The DPL specified by the guest OS for these vectors is NOT
1503 * CHECKED!!
1504 */
1505 if ( regs->error_code & X86_XEC_IDT )
1506 {
1507 /* This fault must be due to <INT n> instruction. */
1508 const struct trap_info *ti;
1509 unsigned char vector = regs->error_code >> 3;
1510 ti = &v->arch.pv_vcpu.trap_ctxt[vector];
1511 if ( permit_softint(TI_GET_DPL(ti), v, regs) )
1512 {
1513 regs->rip += 2;
1514 pv_inject_sw_interrupt(vector);
1515 return;
1516 }
1517 }
1518 else if ( is_pv_32bit_vcpu(v) && regs->error_code )
1519 {
1520 pv_emulate_gate_op(regs);
1521 return;
1522 }
1523
1524 /* Emulate some simple privileged and I/O instructions. */
1525 if ( (regs->error_code == 0) &&
1526 pv_emulate_privileged_op(regs) )
1527 {
1528 trace_trap_one_addr(TRC_PV_EMULATE_PRIVOP, regs->rip);
1529 return;
1530 }
1531
1532 /* Pass on GPF as is. */
1533 pv_inject_hw_exception(TRAP_gp_fault, regs->error_code);
1534 return;
1535
1536 gp_in_kernel:
1537
1538 if ( likely((fixup = search_exception_table(regs)) != 0) )
1539 {
1540 dprintk(XENLOG_INFO, "GPF (%04x): %p [%ps] -> %p\n",
1541 regs->error_code, _p(regs->rip), _p(regs->rip), _p(fixup));
1542 this_cpu(last_extable_addr) = regs->rip;
1543 regs->rip = fixup;
1544 return;
1545 }
1546
1547 hardware_gp:
1548 if ( debugger_trap_fatal(TRAP_gp_fault, regs) )
1549 return;
1550
1551 show_execution_state(regs);
1552 panic("GENERAL PROTECTION FAULT\n[error_code=%04x]", regs->error_code);
1553 }
1554
1555 static void pci_serr_softirq(void)
1556 {
1557 printk("\n\nNMI - PCI system error (SERR)\n");
1558 outb(inb(0x61) & 0x0b, 0x61); /* re-enable the PCI SERR error line. */
1559 }
1560
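/*
 * Undo the bookkeeping done when an asynchronous exception (NMI/MCE) was
 * delivered: restore the vCPU's temporary hard affinity and the previous
 * asynchronous exception mask.
 */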
1561 void async_exception_cleanup(struct vcpu *curr)
1562 {
1563 int trap;
1564
1565 if ( !curr->async_exception_mask )
1566 return;
1567
1568 /* Restore affinity. */
1569 if ( !cpumask_empty(curr->cpu_hard_affinity_tmp) &&
1570 !cpumask_equal(curr->cpu_hard_affinity_tmp, curr->cpu_hard_affinity) )
1571 {
1572 vcpu_set_hard_affinity(curr, curr->cpu_hard_affinity_tmp);
1573 cpumask_clear(curr->cpu_hard_affinity_tmp);
1574 }
1575
1576 if ( !(curr->async_exception_mask & (curr->async_exception_mask - 1)) )
1577 trap = __scanbit(curr->async_exception_mask, VCPU_TRAP_NONE);
1578 else
1579 for ( trap = VCPU_TRAP_NONE + 1; trap <= VCPU_TRAP_LAST; ++trap )
1580 if ( (curr->async_exception_mask ^
1581 curr->async_exception_state(trap).old_mask) == (1 << trap) )
1582 break;
1583 if ( unlikely(trap > VCPU_TRAP_LAST) )
1584 {
1585 ASSERT_UNREACHABLE();
1586 return;
1587 }
1588
1589 /* Restore previous asynchronous exception mask. */
1590 curr->async_exception_mask = curr->async_exception_state(trap).old_mask;
1591 }
1592
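/* Record an NMI reason bit for the hardware domain and raise an NMI on its vcpu0. */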
1593 static void nmi_hwdom_report(unsigned int reason_idx)
1594 {
1595 struct domain *d = hardware_domain;
1596
1597 if ( !d || !d->vcpu || !d->vcpu[0] || !is_pv_domain(d) /* PVH fixme */ )
1598 return;
1599
1600 set_bit(reason_idx, nmi_reason(d));
1601
1602 pv_raise_interrupt(d->vcpu[0], TRAP_nmi);
1603 }
1604
1605 static void pci_serr_error(const struct cpu_user_regs *regs)
1606 {
1607 outb((inb(0x61) & 0x0f) | 0x04, 0x61); /* clear-and-disable the PCI SERR error line. */
1608
1609 switch ( opt_nmi[0] )
1610 {
1611 case 'd': /* 'dom0' */
1612 nmi_hwdom_report(_XEN_NMIREASON_pci_serr);
1613 /* fallthrough */
1614 case 'i': /* 'ignore' */
1615 /* Would like to print a diagnostic here but can't call printk()
1616 from NMI context -- raise a softirq instead. */
1617 raise_softirq(PCI_SERR_SOFTIRQ);
1618 break;
1619 default: /* 'fatal' */
1620 console_force_unlock();
1621 printk("\n\nNMI - PCI system error (SERR)\n");
1622 fatal_trap(regs, 0);
1623 }
1624 }
1625
1626 static void io_check_error(const struct cpu_user_regs *regs)
1627 {
1628 switch ( opt_nmi[0] )
1629 {
1630 case 'd': /* 'dom0' */
1631 nmi_hwdom_report(_XEN_NMIREASON_io_error);
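        /* fallthrough */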
1632 case 'i': /* 'ignore' */
1633 break;
1634 default: /* 'fatal' */
1635 console_force_unlock();
1636 printk("\n\nNMI - I/O ERROR\n");
1637 fatal_trap(regs, 0);
1638 }
1639
1640 outb((inb(0x61) & 0x0f) | 0x08, 0x61); /* clear-and-disable IOCK */
1641 mdelay(1);
1642 outb((inb(0x61) & 0x07) | 0x00, 0x61); /* enable IOCK */
1643 }
1644
1645 static void unknown_nmi_error(const struct cpu_user_regs *regs,
1646 unsigned char reason)
1647 {
1648 switch ( opt_nmi[0] )
1649 {
1650 case 'd': /* 'dom0' */
1651 nmi_hwdom_report(_XEN_NMIREASON_unknown);
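        /* fallthrough */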
1652 case 'i': /* 'ignore' */
1653 break;
1654 default: /* 'fatal' */
1655 console_force_unlock();
1656 printk("Uhhuh. NMI received for unknown reason %02x.\n", reason);
1657 printk("Do you have a strange power saving mode enabled?\n");
1658 fatal_trap(regs, 0);
1659 }
1660 }
1661
1662 static int dummy_nmi_callback(const struct cpu_user_regs *regs, int cpu)
1663 {
1664 return 0;
1665 }
1666
1667 static nmi_callback_t *nmi_callback = dummy_nmi_callback;
1668
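/*
 * Top-level NMI handler: give the registered callback first refusal, feed
 * the watchdog, and (on the BSP only) decode external NMI reasons from
 * port 0x61.
 */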
1669 void do_nmi(const struct cpu_user_regs *regs)
1670 {
1671 unsigned int cpu = smp_processor_id();
1672 unsigned char reason;
1673 bool handle_unknown = false;
1674
1675 ++nmi_count(cpu);
1676
1677 if ( nmi_callback(regs, cpu) )
1678 return;
1679
1680 if ( (nmi_watchdog == NMI_NONE) ||
1681 (!nmi_watchdog_tick(regs) && watchdog_force) )
1682 handle_unknown = true;
1683
1684 /* Only the BSP gets external NMIs from the system. */
1685 if ( cpu == 0 )
1686 {
1687 reason = inb(0x61);
1688 if ( reason & 0x80 )
1689 pci_serr_error(regs);
1690 if ( reason & 0x40 )
1691 io_check_error(regs);
1692 if ( !(reason & 0xc0) && handle_unknown )
1693 unknown_nmi_error(regs, reason);
1694 }
1695 }
1696
1697 nmi_callback_t *set_nmi_callback(nmi_callback_t *callback)
1698 {
1699 nmi_callback_t *old_nmi_callback = nmi_callback;
1700
1701 nmi_callback = callback;
1702
1703 return old_nmi_callback;
1704 }
1705
1706 void unset_nmi_callback(void)
1707 {
1708 nmi_callback = dummy_nmi_callback;
1709 }
1710
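/* #NM handler: lazily restore FPU state, forwarding the fault to a PV guest that has CR0.TS set. */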
1711 void do_device_not_available(struct cpu_user_regs *regs)
1712 {
1713 struct vcpu *curr = current;
1714
1715 BUG_ON(!guest_mode(regs));
1716
1717 vcpu_restore_fpu_lazy(curr);
1718
1719 if ( curr->arch.pv_vcpu.ctrlreg[0] & X86_CR0_TS )
1720 {
1721 pv_inject_hw_exception(TRAP_no_device, X86_EVENT_NO_EC);
1722 curr->arch.pv_vcpu.ctrlreg[0] &= ~X86_CR0_TS;
1723 }
1724 else
1725 TRACE_0D(TRC_PV_MATH_STATE_RESTORE);
1726
1727 return;
1728 }
1729
1730 u64 read_efer(void)
1731 {
1732 return this_cpu(efer);
1733 }
1734
1735 void write_efer(u64 val)
1736 {
1737 this_cpu(efer) = val;
1738 wrmsrl(MSR_EFER, val);
1739 }
1740
1741 static void ler_enable(void)
1742 {
1743 u64 debugctl;
1744
1745 if ( !this_cpu(ler_msr) )
1746 return;
1747
1748 rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl);
1749 wrmsrl(MSR_IA32_DEBUGCTLMSR, debugctl | IA32_DEBUGCTLMSR_LBR);
1750 }
1751
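/*
 * #DB handler. Within Xen, the only expected sources are single-stepping
 * through the SYSENTER entry path and stale watchpoints on buffers handed
 * to us; anything else triggers a warning. Guest debug exceptions are
 * re-injected after %dr6 has been saved for the guest to inspect.
 */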
1752 void do_debug(struct cpu_user_regs *regs)
1753 {
1754 struct vcpu *v = current;
1755
1756 if ( debugger_trap_entry(TRAP_debug, regs) )
1757 return;
1758
1759 if ( !guest_mode(regs) )
1760 {
1761 if ( regs->eflags & X86_EFLAGS_TF )
1762 {
1763 /* In SYSENTER entry path we can't zap TF until EFLAGS is saved. */
1764 if ( (regs->rip >= (unsigned long)sysenter_entry) &&
1765 (regs->rip <= (unsigned long)sysenter_eflags_saved) )
1766 {
1767 if ( regs->rip == (unsigned long)sysenter_eflags_saved )
1768 regs->eflags &= ~X86_EFLAGS_TF;
1769 goto out;
1770 }
1771 if ( !debugger_trap_fatal(TRAP_debug, regs) )
1772 {
1773 WARN();
1774 regs->eflags &= ~X86_EFLAGS_TF;
1775 }
1776 }
1777 else
1778 {
1779 /*
1780 * We ignore watchpoints when they trigger within Xen. This may
1781 * happen when a buffer is passed to us which previously had a
1782 * watchpoint set on it. No need to bump EIP; the only faulting
1783 * trap is an instruction breakpoint, which can't happen to us.
1784 */
1785 WARN_ON(!search_exception_table(regs));
1786 }
1787 goto out;
1788 }
1789
1790 /* Save debug status register where guest OS can peek at it */
1791 v->arch.debugreg[6] = read_debugreg(6);
1792
1793 ler_enable();
1794 pv_inject_hw_exception(TRAP_debug, X86_EVENT_NO_EC);
1795 return;
1796
1797 out:
1798 ler_enable();
1799 return;
1800 }
1801
1802 static void __init noinline __set_intr_gate(unsigned int n,
1803 uint32_t dpl, void *addr)
1804 {
1805 _set_gate(&idt_table[n], SYS_DESC_irq_gate, dpl, addr);
1806 }
1807
1808 static void __init set_swint_gate(unsigned int n, void *addr)
1809 {
1810 __set_intr_gate(n, 3, addr);
1811 }
1812
1813 static void __init set_intr_gate(unsigned int n, void *addr)
1814 {
1815 __set_intr_gate(n, 0, addr);
1816 }
1817
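/*
 * Load this CPU's TSS into TR. LTR is executed against the non-compat GDT,
 * whose TSS descriptor has the busy bit clear (see the comment below).
 */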
1818 void load_TR(void)
1819 {
1820 struct tss_struct *tss = &this_cpu(init_tss);
1821 struct desc_ptr old_gdt, tss_gdt = {
1822 .base = (long)(this_cpu(gdt_table) - FIRST_RESERVED_GDT_ENTRY),
1823 .limit = LAST_RESERVED_GDT_BYTE
1824 };
1825
1826 _set_tssldt_desc(
1827 this_cpu(gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
1828 (unsigned long)tss,
1829 offsetof(struct tss_struct, __cacheline_filler) - 1,
1830 SYS_DESC_tss_avail);
1831 _set_tssldt_desc(
1832 this_cpu(compat_gdt_table) + TSS_ENTRY - FIRST_RESERVED_GDT_ENTRY,
1833 (unsigned long)tss,
1834 offsetof(struct tss_struct, __cacheline_filler) - 1,
1835 SYS_DESC_tss_busy);
1836
1837 /* Switch to non-compat GDT (which has B bit clear) to execute LTR. */
1838 asm volatile (
1839 "sgdt %0; lgdt %2; ltr %w1; lgdt %0"
1840 : "=m" (old_gdt) : "rm" (TSS_ENTRY << 3), "m" (tss_gdt) : "memory" );
1841 }
1842
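/*
 * Per-CPU trap setup: subarch initialisation plus, when "ler" is given on
 * the command line, selection of the vendor-specific last-exception-record
 * MSR.
 */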
1843 void percpu_traps_init(void)
1844 {
1845 subarch_percpu_traps_init();
1846
1847 if ( !opt_ler )
1848 return;
1849
1850 switch ( boot_cpu_data.x86_vendor )
1851 {
1852 case X86_VENDOR_INTEL:
1853 switch ( boot_cpu_data.x86 )
1854 {
1855 case 6:
1856 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
1857 break;
1858 case 15:
1859 this_cpu(ler_msr) = MSR_P4_LER_FROM_LIP;
1860 break;
1861 }
1862 break;
1863 case X86_VENDOR_AMD:
1864 switch ( boot_cpu_data.x86 )
1865 {
1866 case 6:
1867 case 0xf ... 0x17:
1868 this_cpu(ler_msr) = MSR_IA32_LASTINTFROMIP;
1869 break;
1870 }
1871 break;
1872 }
1873
1874 ler_enable();
1875 }
1876
1877 void __init init_idt_traps(void)
1878 {
1879 /*
1880 * Note that interrupt gates are always used, rather than trap gates. We
1881 * must have interrupts disabled until DS/ES/FS/GS are saved because the
1882 * first activation must have the "bad" value(s) for these registers and
1883 * we may lose them if another activation is installed before they are
1884 * saved. The page-fault handler also needs interrupts disabled until %cr2
1885 * has been read and saved on the stack.
1886 */
1887 set_intr_gate(TRAP_divide_error,÷_error);
1888 set_intr_gate(TRAP_debug,&debug);
1889 set_intr_gate(TRAP_nmi,&nmi);
1890 set_swint_gate(TRAP_int3,&int3); /* usable from all privileges */
1891 set_swint_gate(TRAP_overflow,&overflow); /* usable from all privileges */
1892 set_intr_gate(TRAP_bounds,&bounds);
1893 set_intr_gate(TRAP_invalid_op,&invalid_op);
1894 set_intr_gate(TRAP_no_device,&device_not_available);
1895 set_intr_gate(TRAP_double_fault,&double_fault);
1896 set_intr_gate(TRAP_invalid_tss,&invalid_TSS);
1897 set_intr_gate(TRAP_no_segment,&segment_not_present);
1898 set_intr_gate(TRAP_stack_error,&stack_segment);
1899 set_intr_gate(TRAP_gp_fault,&general_protection);
1900 set_intr_gate(TRAP_page_fault,&early_page_fault);
1901 set_intr_gate(TRAP_copro_error,&coprocessor_error);
1902 set_intr_gate(TRAP_alignment_check,&alignment_check);
1903 set_intr_gate(TRAP_machine_check,&machine_check);
1904 set_intr_gate(TRAP_simd_error,&simd_coprocessor_error);
1905
1906 /* Specify dedicated interrupt stacks for NMI, #DF, and #MC. */
1907 set_ist(&idt_table[TRAP_double_fault], IST_DF);
1908 set_ist(&idt_table[TRAP_nmi], IST_NMI);
1909 set_ist(&idt_table[TRAP_machine_check], IST_MCE);
1910
1911 /* CPU0 uses the master IDT. */
1912 idt_tables[0] = idt_table;
1913
1914 this_cpu(gdt_table) = boot_cpu_gdt_table;
1915 this_cpu(compat_gdt_table) = boot_cpu_compat_gdt_table;
1916 }
1917
1918 extern void (*const autogen_entrypoints[NR_VECTORS])(void);
1919 void __init trap_init(void)
1920 {
1921 unsigned int vector;
1922
1923 /* Replace early pagefault with real pagefault handler. */
1924 set_intr_gate(TRAP_page_fault, &page_fault);
1925
1926 pv_trap_init();
1927
1928 for ( vector = 0; vector < NR_VECTORS; ++vector )
1929 {
1930 if ( autogen_entrypoints[vector] )
1931 {
1932 /* Found autogen entry: check we won't clobber an existing trap. */
1933 ASSERT(idt_table[vector].b == 0);
1934 set_intr_gate(vector, autogen_entrypoints[vector]);
1935 }
1936 else
1937 {
1938 /* No entry point: confirm we have an existing trap in place. */
1939 ASSERT(idt_table[vector].b != 0);
1940 }
1941 }
1942
1943 percpu_traps_init();
1944
1945 cpu_init();
1946
1947 open_softirq(PCI_SERR_SOFTIRQ, pci_serr_softirq);
1948 }
1949
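/*
 * Load the vCPU's saved debug register state into hardware. %dr7 is only
 * written when it has breakpoints enabled, and the AMD address-mask MSRs
 * only when the DBEXT feature is present.
 */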
1950 void activate_debugregs(const struct vcpu *curr)
1951 {
1952 ASSERT(curr == current);
1953
1954 write_debugreg(0, curr->arch.debugreg[0]);
1955 write_debugreg(1, curr->arch.debugreg[1]);
1956 write_debugreg(2, curr->arch.debugreg[2]);
1957 write_debugreg(3, curr->arch.debugreg[3]);
1958 write_debugreg(6, curr->arch.debugreg[6]);
1959
1960 /*
1961      * Avoid writing a value that is immediately going to be replaced when
1962      * called from set_debugreg() below. Eventual future callers will need
1963 * to take this into account.
1964 */
1965 if ( curr->arch.debugreg[7] & DR7_ACTIVE_MASK )
1966 write_debugreg(7, curr->arch.debugreg[7]);
1967
1968 if ( boot_cpu_has(X86_FEATURE_DBEXT) )
1969 {
1970 wrmsrl(MSR_AMD64_DR0_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[0]);
1971 wrmsrl(MSR_AMD64_DR1_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[1]);
1972 wrmsrl(MSR_AMD64_DR2_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[2]);
1973 wrmsrl(MSR_AMD64_DR3_ADDRESS_MASK, curr->arch.pv_vcpu.dr_mask[3]);
1974 }
1975 }
1976
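/*
 * Validate and install a guest-supplied debug register value, mirroring it
 * into hardware only when v is the currently running vCPU. Reserved bits
 * are forced to their architectural values, and I/O breakpoints require
 * CR4.DE to be set.
 */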
1977 long set_debugreg(struct vcpu *v, unsigned int reg, unsigned long value)
1978 {
1979 int i;
1980 struct vcpu *curr = current;
1981
1982 switch ( reg )
1983 {
1984 case 0:
1985 if ( !access_ok(value, sizeof(long)) )
1986 return -EPERM;
1987 if ( v == curr )
1988 write_debugreg(0, value);
1989 break;
1990 case 1:
1991 if ( !access_ok(value, sizeof(long)) )
1992 return -EPERM;
1993 if ( v == curr )
1994 write_debugreg(1, value);
1995 break;
1996 case 2:
1997 if ( !access_ok(value, sizeof(long)) )
1998 return -EPERM;
1999 if ( v == curr )
2000 write_debugreg(2, value);
2001 break;
2002 case 3:
2003 if ( !access_ok(value, sizeof(long)) )
2004 return -EPERM;
2005 if ( v == curr )
2006 write_debugreg(3, value);
2007 break;
2008 case 6:
2009 /*
2010 * DR6: Bits 4-11,16-31 reserved (set to 1).
2011 * Bit 12 reserved (set to 0).
2012 */
2013 value &= ~DR_STATUS_RESERVED_ZERO; /* reserved bits => 0 */
2014 value |= DR_STATUS_RESERVED_ONE; /* reserved bits => 1 */
2015 if ( v == curr )
2016 write_debugreg(6, value);
2017 break;
2018 case 7:
2019 /*
2020 * DR7: Bit 10 reserved (set to 1).
2021 * Bits 11-12,14-15 reserved (set to 0).
2022 */
2023 value &= ~DR_CONTROL_RESERVED_ZERO; /* reserved bits => 0 */
2024 value |= DR_CONTROL_RESERVED_ONE; /* reserved bits => 1 */
2025 /*
2026 * Privileged bits:
2027 * GD (bit 13): must be 0.
2028 */
2029 if ( value & DR_GENERAL_DETECT )
2030 return -EPERM;
2031 /* DR7.{G,L}E = 0 => debugging disabled for this domain. */
2032 if ( value & DR7_ACTIVE_MASK )
2033 {
2034 unsigned int io_enable = 0;
2035
2036 for ( i = DR_CONTROL_SHIFT; i < 32; i += DR_CONTROL_SIZE )
2037 {
2038 if ( ((value >> i) & 3) == DR_IO )
2039 {
2040 if ( !(v->arch.pv_vcpu.ctrlreg[4] & X86_CR4_DE) )
2041 return -EPERM;
2042 io_enable |= value & (3 << ((i - 16) >> 1));
2043 }
2044 }
2045
2046 /* Guest DR5 is a handy stash for I/O intercept information. */
2047 v->arch.debugreg[5] = io_enable;
2048 value &= ~io_enable;
2049
2050 /*
2051 * If DR7 was previously clear then we need to load all other
2052 * debug registers at this point as they were not restored during
2053 * context switch.
2054 */
2055 if ( (v == curr) &&
2056 !(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
2057 {
2058 activate_debugregs(v);
2059 break;
2060 }
2061 }
2062 if ( v == curr )
2063 write_debugreg(7, value);
2064 break;
2065 default:
2066 return -EINVAL;
2067 }
2068
2069 v->arch.debugreg[reg] = value;
2070 return 0;
2071 }
2072
2073 void asm_domain_crash_synchronous(unsigned long addr)
2074 {
2075 /*
2076      * We need to clear the AC bit here because in entry.S AC is set
2077      * by ASM_STAC to temporarily allow accesses to user pages,
2078      * which SMAP prevents by default.
2079 *
2080      * For some of the code paths that call this function, clac()
2081      * is not strictly needed, but doing it here instead of at each
2082      * call site of asm_domain_crash_synchronous() reduces code
2083      * redundancy, and it is harmless as well.
2084 */
2085 clac();
2086
2087 if ( addr == 0 )
2088 addr = this_cpu(last_extable_addr);
2089
2090 printk("domain_crash_sync called from entry.S: fault at %p %pS\n",
2091 _p(addr), _p(addr));
2092
2093 __domain_crash_synchronous();
2094 }
2095
2096 /*
2097 * Local variables:
2098 * mode: C
2099 * c-file-style: "BSD"
2100 * c-basic-offset: 4
2101 * tab-width: 4
2102 * indent-tabs-mode: nil
2103 * End:
2104 */
2105