1 /******************************************************************************
2  * crash.c
3  *
4  * Based heavily on arch/i386/kernel/crash.c from Linux 2.6.16
5  *
6  * Xen port written by:
7  * - Simon 'Horms' Horman <horms@verge.net.au>
8  * - Magnus Damm <magnus@valinux.co.jp>
9  */
10 
11 #include <asm/atomic.h>
12 #include <asm/elf.h>
13 #include <asm/percpu.h>
14 #include <xen/types.h>
15 #include <xen/irq.h>
16 #include <asm/nmi.h>
17 #include <xen/string.h>
18 #include <xen/elf.h>
19 #include <xen/elfcore.h>
20 #include <xen/smp.h>
21 #include <xen/delay.h>
22 #include <xen/perfc.h>
23 #include <xen/kexec.h>
24 #include <xen/sched.h>
25 #include <xen/keyhandler.h>
26 #include <public/xen.h>
27 #include <asm/shared.h>
28 #include <asm/hvm/support.h>
29 #include <asm/apic.h>
30 #include <asm/io_apic.h>
31 #include <xen/iommu.h>
32 #include <asm/hpet.h>
33 
34 static cpumask_t waiting_to_crash;
35 static unsigned int crashing_cpu;
36 static DEFINE_PER_CPU_READ_MOSTLY(bool, crash_save_done);
37 
38 /* This becomes the NMI handler for non-crashing CPUs, when Xen is crashing. */
/*
 * NMI handler installed on all non-crashing CPUs once Xen has begun
 * crashing (see nmi_shootdown_cpus()).  Saves this CPU's crash state,
 * parks the CPU, and never returns.
 *
 * @regs is the interrupted context; it is unused here because the crash
 * note is captured from the CPU itself by kexec_crash_save_cpu().
 */
static void noreturn do_nmi_crash(const struct cpu_user_regs *regs)
{
    unsigned int cpu = smp_processor_id();

    /* Set EFLAGS.AC: we never return, so there is no need to restore it. */
    stac();

    /* nmi_shootdown_cpus() should ensure that this assertion is correct. */
    ASSERT(cpu != crashing_cpu);

    /* Save crash information and shut down CPU.  Attempt only once. */
    if ( !this_cpu(crash_save_done) )
    {
        /* Disable the interrupt stack table for the MCE handler.  This
         * prevents race conditions between clearing MCIP and receiving a
         * new MCE, during which the exception frame would be clobbered
         * and the MCE handler fall into an infinite loop.  We are soon
         * going to disable the NMI watchdog, so the loop would not be
         * caught.
         *
         * We do not need to change the NMI IST, as the nmi_crash
         * handler is immune to corrupt exception frames, by virtue of
         * being designed never to return.
         *
         * This update is safe from a security point of view, as this
         * pcpu is never going to try to sysret back to a PV vcpu.
         */
        set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);

        kexec_crash_save_cpu();
        __stop_this_cpu();

        this_cpu(crash_save_done) = true;
        /* Signal to the crashing CPU that this CPU is now parked. */
        cpumask_clear_cpu(cpu, &waiting_to_crash);
    }

    /* Poor man's self_nmi().  __stop_this_cpu() has reverted the LAPIC
     * back to its boot state, so we are unable to rely on the regular
     * apic_* functions, due to 'x2apic_enabled' being possibly wrong.
     * (The likely scenario is that we have reverted from x2apic mode to
     * xapic, at which point #GPFs will occur if we use the apic_*
     * functions)
     *
     * The ICR and APIC ID of the LAPIC are still valid even during
     * software disable (Intel SDM Vol 3, 10.4.7.2).  As a result, we
     * can deliberately queue up another NMI at the LAPIC which will not
     * be delivered as the hardware NMI latch is currently in effect.
     * This means that if NMIs become unlatched (e.g. following a
     * non-fatal MCE), the LAPIC will force us back here rather than
     * wandering back into regular Xen code.
     */
    switch ( current_local_apic_mode() )
    {
        u32 apic_id;

    case APIC_MODE_X2APIC:
        /* In x2apic mode, self-NMI via a single wrmsr to the ICR, with
         * the destination (our own APIC ID) in the high 32 bits. */
        apic_id = apic_rdmsr(APIC_ID);

        apic_wrmsr(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL
                   | ((u64)apic_id << 32));
        break;

    case APIC_MODE_XAPIC:
        /* In xapic mode, the MMIO ICR is split across two registers;
         * wait for any previous IPI to finish before writing it. */
        apic_id = GET_xAPIC_ID(apic_mem_read(APIC_ID));

        while ( apic_mem_read(APIC_ICR) & APIC_ICR_BUSY )
            cpu_relax();

        apic_mem_write(APIC_ICR2, apic_id << 24);
        apic_mem_write(APIC_ICR, APIC_DM_NMI | APIC_DEST_PHYSICAL);
        break;

    default:
        /* APIC disabled entirely - nothing to queue; just park below. */
        break;
    }

    /* Park forever.  If NMIs unlatch, the queued self-NMI above brings
     * execution straight back into this handler. */
    for ( ; ; )
        halt();
}
117 
/*
 * Bring every other online CPU to a halt by redirecting the NMI handler
 * to do_nmi_crash() and broadcasting an NMI, then quiesce this CPU's
 * local interrupt hardware (LAPIC/IO-APIC/HPET/IOMMU) ready for the
 * crash kernel.  Called with the system in an arbitrary (crashed) state,
 * so it avoids locks and waits at most one second for stragglers.
 */
static void nmi_shootdown_cpus(void)
{
    unsigned long msecs;
    unsigned int cpu = smp_processor_id();

    /* Stop the watchdog generating NMIs that would race the shootdown. */
    disable_lapic_nmi_watchdog();
    local_irq_disable();

    crashing_cpu = cpu;
    /* Discard any nested-irq accounting; we are not returning to it. */
    local_irq_count(crashing_cpu) = 0;

    /* Every online CPU except ourselves must acknowledge the shootdown. */
    cpumask_andnot(&waiting_to_crash, &cpu_online_map, cpumask_of(cpu));

    /*
     * Disable IST for MCEs to avoid stack corruption race conditions, and
     * change the NMI handler to a nop to avoid deviation from this codepath.
     */
    _set_gate_lower(&idt_tables[cpu][TRAP_nmi],
                    SYS_DESC_irq_gate, 0, &trap_nop);
    set_ist(&idt_tables[cpu][TRAP_machine_check], IST_NONE);

    /*
     * Ideally would be:
     *   exception_table[TRAP_nmi] = &do_nmi_crash;
     *
     * but the exception_table is read only.  Access it via its directmap
     * mappings.
     */
    write_atomic((unsigned long *)__va(__pa(&exception_table[TRAP_nmi])),
                 (unsigned long)&do_nmi_crash);

    /* Ensure the new callback function is set before sending out the NMI. */
    wmb();

    smp_send_nmi_allbutself();

    msecs = 1000; /* Wait at most a second for the other cpus to stop */
    while ( !cpumask_empty(&waiting_to_crash) && msecs )
    {
        mdelay(1);
        msecs--;
    }

    /* Leave a hint of how well we did trying to shoot down the other cpus */
    if ( cpumask_empty(&waiting_to_crash) )
        printk("Shot down all CPUs\n");
    else
    {
        cpulist_scnprintf(keyhandler_scratch, sizeof keyhandler_scratch,
                          &waiting_to_crash);
        printk("Failed to shoot down CPUs {%s}\n", keyhandler_scratch);
    }

    /* Crash shutdown any IOMMU functionality as the crashdump kernel is not
     * happy when booting if interrupt/dma remapping is still enabled */
    iommu_crash_shutdown();

    __stop_this_cpu();

    /* This is a bit of a hack due to the problems with the x2apic_enabled
     * variable, but we can't do any better without a significant refactoring
     * of the APIC code */
    x2apic_enabled = (current_local_apic_mode() == APIC_MODE_X2APIC);

    /* Quiesce the remaining interrupt sources for the crash kernel. */
    disable_IO_APIC();
    hpet_disable();
}
185 
machine_crash_shutdown(void)186 void machine_crash_shutdown(void)
187 {
188     crash_xen_info_t *info;
189 
190     nmi_shootdown_cpus();
191 
192     /* Reset CPUID masking and faulting to the host's default. */
193     ctxt_switch_levelling(NULL);
194 
195     info = kexec_crash_save_info();
196     info->xen_phys_start = xen_phys_start;
197     info->dom0_pfn_to_mfn_frame_list_list =
198         arch_get_pfn_to_mfn_frame_list_list(hardware_domain);
199 }
200 
201 /*
202  * Local variables:
203  * mode: C
204  * c-file-style: "BSD"
205  * c-basic-offset: 4
206  * tab-width: 4
207  * indent-tabs-mode: nil
208  * End:
209  */
210