/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson : PM converted to driver model. Disable/enable API.
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/delay.h>
#include <xen/time.h>
#include <xen/sched.h>
#include <xen/console.h>
#include <xen/smp.h>
#include <xen/keyhandler.h>
#include <xen/cpu.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/msr.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
#include <asm/debugger.h>
#include <asm/div64.h>
#include <asm/apic.h>

unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
static unsigned int nmi_perfctr_msr;	/* the MSR to reset in NMI handler */
static unsigned int nmi_p4_cccr_val;
static DEFINE_PER_CPU(struct timer, nmi_timer);
static DEFINE_PER_CPU(unsigned int, nmi_timer_ticks);

/* opt_watchdog: If true, run a watchdog NMI on each processor. */
bool __initdata opt_watchdog;

/* watchdog_force: If true, process unknown NMIs when running the watchdog. */
bool watchdog_force;

static int __init parse_watchdog(const char *s)
{
    if ( !*s )
    {
        opt_watchdog = true;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_watchdog = false;
        return 0;
    case 1:
        opt_watchdog = true;
        return 0;
    }

    if ( !strcmp(s, "force") )
        watchdog_force = opt_watchdog = true;
    else
        return -EINVAL;

    return 0;
}
custom_param("watchdog", parse_watchdog);

/* opt_watchdog_timeout: Number of seconds to wait before panic. */
static unsigned int opt_watchdog_timeout = 5;

static int parse_watchdog_timeout(const char *s)
{
    const char *q;

    opt_watchdog_timeout = simple_strtoull(s, &q, 0);
    opt_watchdog = !!opt_watchdog_timeout;

    return *q ? -EINVAL : 0;
}
custom_param("watchdog_timeout", parse_watchdog_timeout);
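
/*
 * Illustrative usage, following the parsers above: booting with
 * "watchdog" or "watchdog=1" enables the watchdog, "watchdog=force"
 * additionally treats unknown NMIs as watchdog ticks, and e.g.
 * "watchdog_timeout=10" panics after roughly ten stuck seconds
 * ("watchdog_timeout=0" disables the watchdog entirely).
 */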

/*
 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
 * - it may be reserved by some other driver, or not
 * - when not reserved by some other driver, it may be used for
 *   the NMI watchdog, or not
 *
 * This is maintained separately from nmi_active because the NMI
 * watchdog may also be driven from the I/O APIC timer.
 */
static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
static unsigned int lapic_nmi_owner;
#define LAPIC_NMI_WATCHDOG	(1<<0)
#define LAPIC_NMI_RESERVED	(1<<1)

/* nmi_active:
 * +1: the lapic NMI watchdog is active, but can be disabled
 *  0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 * -1: the lapic NMI watchdog is disabled, but can be enabled
 */
int nmi_active;

#define K7_EVNTSEL_ENABLE	(1 << 22)
#define K7_EVNTSEL_INT		(1 << 20)
#define K7_EVNTSEL_OS		(1 << 17)
#define K7_EVNTSEL_USR		(1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING	0x76
#define K7_NMI_EVENT		K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
#define K7_EVENT_WIDTH          32

#define P6_EVNTSEL0_ENABLE	(1 << 22)
#define P6_EVNTSEL_INT		(1 << 20)
#define P6_EVNTSEL_OS		(1 << 17)
#define P6_EVNTSEL_USR		(1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED	 0x79
#define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c
#define P6_EVENT_WIDTH          32

#define P4_ESCR_EVENT_SELECT(N)	((N)<<25)
#define P4_CCCR_OVF_PMI0	(1<<26)
#define P4_CCCR_OVF_PMI1	(1<<27)
#define P4_CCCR_OVF		(1<<31)
#define P4_CCCR_THRESHOLD(N)	((N)<<20)
#define P4_CCCR_COMPLEMENT	(1<<19)
#define P4_CCCR_COMPARE		(1<<18)
#define P4_CCCR_REQUIRED	(3<<16)
#define P4_CCCR_ESCR_SELECT(N)	((N)<<13)
#define P4_CCCR_ENABLE		(1<<12)
/*
 * Set up IQ_PERFCTR0 to behave like a clock, by having IQ_CCCR0 filter
 * CRU_ESCR0 (with any non-null event selector) through a complemented
 * max threshold. [IA32-Vol3, Section 14.9.9]
 */
#define P4_NMI_CRU_ESCR0	P4_ESCR_EVENT_SELECT(0x3F)
#define P4_NMI_IQ_CCCR0	\
    (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
     P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)
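/*
 * Why this behaves like a clock (a sketch): with COMPARE set and the
 * maximum threshold (15) COMPLEMENTed, the "count <= threshold" filter
 * matches on every cycle, so IQ_PERFCTR0 increments once per core clock
 * and overflows a fixed number of cycles after being loaded with a
 * negative count.
 */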

static void __init wait_for_nmis(void *p)
{
    unsigned int cpu = smp_processor_id();
    unsigned int start_count = nmi_count(cpu);
    unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz;
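    /*
     * 10 * 1000 * cpu_khz is ten seconds' worth of TSC ticks (cpu_khz
     * is kHz, i.e. ticks per millisecond); dividing by nmi_hz yields
     * the TSC delta spanning ten expected watchdog periods.
     */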
    unsigned long s, e;

    s = rdtsc();
    do {
        cpu_relax();
        if ( nmi_count(cpu) >= start_count + 2 )
            break;
        e = rdtsc();
    } while ( e - s < ticks );
}

void __init check_nmi_watchdog(void)
{
    static unsigned int __initdata prev_nmi_count[NR_CPUS];
    int cpu;
    bool ok = true;

    if ( nmi_watchdog == NMI_NONE )
        return;

    printk("Testing NMI watchdog on all CPUs:");

    for_each_online_cpu ( cpu )
        prev_nmi_count[cpu] = nmi_count(cpu);

    /*
     * Wait at most 10 ticks for 2 watchdog NMIs on each CPU.
     * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog
     * uses only runs while the core's not halted.
     */
    on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1);

    for_each_online_cpu ( cpu )
    {
        if ( nmi_count(cpu) - prev_nmi_count[cpu] < 2 )
        {
            printk(" %d", cpu);
            ok = false;
        }
    }

    printk(" %s\n", ok ? "ok" : "stuck");

    /*
     * Now that we know it works we can reduce NMI frequency to
     * something more reasonable; makes a difference in some configs.
     * There's a limit to how slow we can go because writing the perfctr
     * MSRs only sets the low 32 bits, with the top 8 bits sign-extended
     * from those, so it's not possible to set up a delay larger than
     * 2^31 cycles and smaller than (2^40 - 2^31) cycles.
     * (Intel SDM, section 18.22.2)
     */
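    /*
     * Concretely (illustrative numbers): on a 2 GHz part cpu_khz is
     * 2000000, so cpu_khz >> 20 gives nmi_hz of about 1, and the
     * programmed perfctr period becomes cpu_khz * 1000 / nmi_hz,
     * roughly 2^30 cycles, comfortably below the 2^31 limit above.
     */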
    if ( nmi_watchdog == NMI_LOCAL_APIC )
        nmi_hz = max(1ul, cpu_khz >> 20);

    return;
}

static void nmi_timer_fn(void *unused)
{
    this_cpu(nmi_timer_ticks)++;
    set_timer(&this_cpu(nmi_timer), NOW() + MILLISECS(1000));
}

void disable_lapic_nmi_watchdog(void)
{
    if (nmi_active <= 0)
        return;
    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_AMD:
        wrmsr(MSR_K7_EVNTSEL0, 0, 0);
        break;
    case X86_VENDOR_INTEL:
        switch (boot_cpu_data.x86) {
        case 6:
            wrmsr(MSR_P6_EVNTSEL(0), 0, 0);
            break;
        case 15:
            wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
            wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
            break;
        }
        break;
    }
    nmi_active = -1;
    /* tell do_nmi() and others that we're not active any more */
    nmi_watchdog = NMI_NONE;
}

static void enable_lapic_nmi_watchdog(void)
{
    if (nmi_active < 0) {
        nmi_watchdog = NMI_LOCAL_APIC;
        setup_apic_nmi_watchdog();
    }
}

int reserve_lapic_nmi(void)
{
    unsigned int old_owner;

    spin_lock(&lapic_nmi_owner_lock);
    old_owner = lapic_nmi_owner;
    lapic_nmi_owner |= LAPIC_NMI_RESERVED;
    spin_unlock(&lapic_nmi_owner_lock);
    if (old_owner & LAPIC_NMI_RESERVED)
        return -EBUSY;
    if (old_owner & LAPIC_NMI_WATCHDOG)
        disable_lapic_nmi_watchdog();
    return 0;
}

void release_lapic_nmi(void)
{
    unsigned int new_owner;

    spin_lock(&lapic_nmi_owner_lock);
    new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
    lapic_nmi_owner = new_owner;
    spin_unlock(&lapic_nmi_owner_lock);
    if (new_owner & LAPIC_NMI_WATCHDOG)
        enable_lapic_nmi_watchdog();
}

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

static void clear_msr_range(unsigned int base, unsigned int n)
{
    unsigned int i;

    for (i = 0; i < n; i++)
        wrmsr(base+i, 0, 0);
}

static inline void write_watchdog_counter(const char *descr)
{
    u64 count = (u64)cpu_khz * 1000;

    do_div(count, nmi_hz);
    if ( descr )
        Dprintk("setting %s to -%#"PRIx64"\n", descr, count);
    wrmsrl(nmi_perfctr_msr, 0 - count);
}
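
/*
 * Worked example for write_watchdog_counter(), with illustrative
 * numbers: at cpu_khz = 2000000 (2 GHz) and nmi_hz = 1, count = 2e9,
 * so the perfctr is loaded with -2e9 and counts up, overflowing (and
 * raising the watchdog NMI) about one second later.
 */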

static void setup_k7_watchdog(void)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_K7_PERFCTR0;

    clear_msr_range(MSR_K7_EVNTSEL0, 4);
    clear_msr_range(MSR_K7_PERFCTR0, 4);

    evntsel = K7_EVNTSEL_INT
        | K7_EVNTSEL_OS
        | K7_EVNTSEL_USR
        | K7_NMI_EVENT;

    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
    write_watchdog_counter("K7_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= K7_EVNTSEL_ENABLE;
    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}

static void setup_p6_watchdog(unsigned counter)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_P6_PERFCTR(0);

    clear_msr_range(MSR_P6_EVNTSEL(0), 2);
    clear_msr_range(MSR_P6_PERFCTR(0), 2);

    evntsel = P6_EVNTSEL_INT
        | P6_EVNTSEL_OS
        | P6_EVNTSEL_USR
        | counter;

    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
    write_watchdog_counter("P6_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= P6_EVNTSEL0_ENABLE;
    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
}

static int setup_p4_watchdog(void)
{
    uint64_t misc_enable;

    rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL))
        return 0;

    nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0;
    nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
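    /*
     * With two hyperthreads sharing the IQ counter, the overflow PMI
     * is also routed to the second logical processor so that both
     * threads receive watchdog ticks.
     */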
    if ( boot_cpu_data.x86_num_siblings == 2 )
        nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;

    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
        clear_msr_range(0x3F1, 2);
    /* MSR 0x3F0 seems to have a default value of 0xFC00, but current
       docs don't fully define it, so leave it alone for now. */
    if (boot_cpu_data.x86_model >= 0x3) {
        /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
        clear_msr_range(0x3A0, 26);
        clear_msr_range(0x3BC, 3);
    } else {
        clear_msr_range(0x3A0, 31);
    }
    clear_msr_range(0x3C0, 6);
    clear_msr_range(0x3C8, 6);
    clear_msr_range(0x3E0, 2);
    clear_msr_range(MSR_P4_BPU_CCCR0, 18);
    clear_msr_range(MSR_P4_BPU_PERFCTR0, 18);

    wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0);
    wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE);
    write_watchdog_counter("P4_IQ_COUNTER0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
    return 1;
}

void setup_apic_nmi_watchdog(void)
{
    if ( nmi_watchdog == NMI_NONE )
        return;

    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_AMD:
        switch (boot_cpu_data.x86) {
        case 6:
        case 0xf ... 0x17:
            setup_k7_watchdog();
            break;
        default:
            return;
        }
        break;
    case X86_VENDOR_INTEL:
        switch (boot_cpu_data.x86) {
        case 6:
            setup_p6_watchdog((boot_cpu_data.x86_model < 14)
                              ? P6_EVENT_CPU_CLOCKS_NOT_HALTED
                              : CORE_EVENT_CPU_CLOCKS_NOT_HALTED);
            break;
        case 15:
            if (!setup_p4_watchdog())
                return;
            break;
        default:
            return;
        }
        break;
    default:
        return;
    }

    lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
    nmi_active = 1;
}

static int cpu_nmi_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        init_timer(&per_cpu(nmi_timer, cpu), nmi_timer_fn, NULL, cpu);
        set_timer(&per_cpu(nmi_timer, cpu), NOW());
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        kill_timer(&per_cpu(nmi_timer, cpu));
        break;
    default:
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nmi_nfb = {
    .notifier_call = cpu_nmi_callback
};

static DEFINE_PER_CPU(unsigned int, last_irq_sums);
static DEFINE_PER_CPU(unsigned int, alert_counter);

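/*
 * The disable count starts at 1, i.e. the watchdog stays dormant until
 * watchdog_setup() performs the matching watchdog_enable() once the
 * timer infrastructure is up.
 */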
static atomic_t watchdog_disable_count = ATOMIC_INIT(1);

void watchdog_disable(void)
{
    atomic_inc(&watchdog_disable_count);
}

void watchdog_enable(void)
{
    atomic_dec(&watchdog_disable_count);
}

bool watchdog_enabled(void)
{
    return !atomic_read(&watchdog_disable_count);
}

int __init watchdog_setup(void)
{
    unsigned int cpu;

    /*
     * Activate periodic heartbeats. We cannot do this earlier during
     * setup because the timer infrastructure is not available.
     */
    for_each_online_cpu ( cpu )
        cpu_nmi_callback(&cpu_nmi_nfb, CPU_UP_PREPARE, (void *)(long)cpu);
    register_cpu_notifier(&cpu_nmi_nfb);

    watchdog_enable();
    return 0;
}

/* Returns false if this was not a watchdog NMI, true otherwise */
bool nmi_watchdog_tick(const struct cpu_user_regs *regs)
{
    bool watchdog_tick = true;
    unsigned int sum = this_cpu(nmi_timer_ticks);

    if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() )
    {
        /*
         * Ayiee, looks like this CPU is stuck ... wait for the timeout
         * before doing the oops ...
         */
        this_cpu(alert_counter)++;
        if ( this_cpu(alert_counter) == opt_watchdog_timeout * nmi_hz )
        {
            console_force_unlock();
            printk("Watchdog timer detects that CPU%d is stuck!\n",
                   smp_processor_id());
            fatal_trap(regs, 1);
        }
    }
    else
    {
        this_cpu(last_irq_sums) = sum;
        this_cpu(alert_counter) = 0;
    }

    if ( nmi_perfctr_msr )
    {
        uint64_t msr_content;

        /* Work out if this is a watchdog tick by checking for overflow. */
        if ( nmi_perfctr_msr == MSR_P4_IQ_PERFCTR0 )
        {
            rdmsrl(MSR_P4_IQ_CCCR0, msr_content);
            if ( !(msr_content & P4_CCCR_OVF) )
                watchdog_tick = false;

            /*
             * P4 quirks:
             * - An overflown perfctr will assert its interrupt
             *   until the OVF flag in its CCCR is cleared.
             * - LVTPC is masked on interrupt and must be
             *   unmasked by the LVTPC handler.
             */
            wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_P6_PERFCTR(0) )
        {
            rdmsrl(MSR_P6_PERFCTR(0), msr_content);
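            /*
             * The counter was loaded with a negative value, so bits 32
             * and up start out set by sign-extension; if bit 32 is
             * still set the counter has not yet wrapped, meaning this
             * NMI was not a watchdog tick.
             */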
            if ( msr_content & (1ULL << P6_EVENT_WIDTH) )
                watchdog_tick = false;

            /*
             * Only P6 based Pentium M need to re-unmask the apic vector but
             * it doesn't hurt other P6 variants.
             */
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_K7_PERFCTR0 )
        {
            rdmsrl(MSR_K7_PERFCTR0, msr_content);
            if ( msr_content & (1ULL << K7_EVENT_WIDTH) )
                watchdog_tick = false;
        }
        write_watchdog_counter(NULL);
    }

    return watchdog_tick;
}

/*
 * For some reason the destination shorthand for self is not valid
 * when used with the NMI delivery mode. This is documented in Tables
 * 8-3 and 8-4 in IA32 Reference Manual Volume 3. We send the IPI to
 * our own APIC ID explicitly, which is valid.
 */
void self_nmi(void)
{
    unsigned long flags;
    u32 id = get_apic_id();
    local_irq_save(flags);
    apic_wait_icr_idle();
    apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, id);
    local_irq_restore(flags);
}

static void do_nmi_trigger(unsigned char key)
{
    printk("Triggering NMI on APIC ID %x\n", get_apic_id());
    self_nmi();
}

static void do_nmi_stats(unsigned char key)
{
    int i;
    struct domain *d;
    struct vcpu *v;

    printk("CPU\tNMI\n");
    for_each_online_cpu ( i )
        printk("%3d\t%3d\n", i, nmi_count(i));

    if ( ((d = hardware_domain) == NULL) || (d->vcpu == NULL) ||
         ((v = d->vcpu[0]) == NULL) )
        return;

    i = v->async_exception_mask & (1 << VCPU_TRAP_NMI);
    if ( v->nmi_pending || i )
591         printk("dom0 vpu0: NMI %s%s\n",
               v->nmi_pending ? "pending " : "",
               i ? "masked " : "");
    else
        printk("dom0 vcpu0: NMI neither pending nor masked\n");
}

static __init int register_nmi_trigger(void)
{
    register_keyhandler('N', do_nmi_trigger, "trigger an NMI", 0);
    register_keyhandler('n', do_nmi_stats, "NMI statistics", 1);
    return 0;
}
__initcall(register_nmi_trigger);