/*
 *  linux/arch/i386/nmi.c
 *
 *  NMI watchdog support on APIC systems
 *
 *  Started by Ingo Molnar <mingo@redhat.com>
 *
 *  Fixes:
 *  Mikael Pettersson : AMD K7 support for local APIC NMI watchdog.
 *  Mikael Pettersson : Power Management for local APIC NMI watchdog.
 *  Mikael Pettersson : Pentium 4 support for local APIC NMI watchdog.
 *  Pavel Machek and
 *  Mikael Pettersson : PM converted to driver model. Disable/enable API.
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/mm.h>
#include <xen/irq.h>
#include <xen/delay.h>
#include <xen/time.h>
#include <xen/sched.h>
#include <xen/console.h>
#include <xen/smp.h>
#include <xen/keyhandler.h>
#include <xen/cpu.h>
#include <asm/current.h>
#include <asm/mc146818rtc.h>
#include <asm/msr.h>
#include <asm/mpspec.h>
#include <asm/nmi.h>
#include <asm/debugger.h>
#include <asm/div64.h>
#include <asm/apic.h>

unsigned int nmi_watchdog = NMI_NONE;
static unsigned int nmi_hz = HZ;
static unsigned int nmi_perfctr_msr;   /* the MSR to reset in NMI handler */
static unsigned int nmi_p4_cccr_val;
static DEFINE_PER_CPU(struct timer, nmi_timer);
static DEFINE_PER_CPU(unsigned int, nmi_timer_ticks);

/* opt_watchdog: If true, run a watchdog NMI on each processor. */
bool __initdata opt_watchdog;

/* watchdog_force: If true, process unknown NMIs when running the watchdog. */
bool watchdog_force;

static int __init parse_watchdog(const char *s)
{
    if ( !*s )
    {
        opt_watchdog = true;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_watchdog = false;
        return 0;
    case 1:
        opt_watchdog = true;
        return 0;
    }

    if ( !strcmp(s, "force") )
        watchdog_force = opt_watchdog = true;
    else
        return -EINVAL;

    return 0;
}
custom_param("watchdog", parse_watchdog);

/* opt_watchdog_timeout: Number of seconds to wait before panic. */
static unsigned int opt_watchdog_timeout = 5;

static int parse_watchdog_timeout(const char *s)
{
    const char *q;

    opt_watchdog_timeout = simple_strtoull(s, &q, 0);
    opt_watchdog = !!opt_watchdog_timeout;

    return *q ? -EINVAL : 0;
}
custom_param("watchdog_timeout", parse_watchdog_timeout);

/*
 * lapic_nmi_owner tracks the ownership of the lapic NMI hardware:
 * - it may be reserved by some other driver, or not
 * - when not reserved by some other driver, it may be used for
 *   the NMI watchdog, or not
 *
 * This is maintained separately from nmi_active because the NMI
 * watchdog may also be driven from the I/O APIC timer.
 */
static DEFINE_SPINLOCK(lapic_nmi_owner_lock);
static unsigned int lapic_nmi_owner;
#define LAPIC_NMI_WATCHDOG (1<<0)
#define LAPIC_NMI_RESERVED (1<<1)

/* nmi_active:
 * +1: the lapic NMI watchdog is active, but can be disabled
 *  0: the lapic NMI watchdog has not been set up, and cannot
 *     be enabled
 * -1: the lapic NMI watchdog is disabled, but can be enabled
 */
int nmi_active;

#define K7_EVNTSEL_ENABLE  (1 << 22)
#define K7_EVNTSEL_INT     (1 << 20)
#define K7_EVNTSEL_OS      (1 << 17)
#define K7_EVNTSEL_USR     (1 << 16)
#define K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING 0x76
#define K7_NMI_EVENT       K7_EVENT_CYCLES_PROCESSOR_IS_RUNNING
#define K7_EVENT_WIDTH     32

#define P6_EVNTSEL0_ENABLE (1 << 22)
#define P6_EVNTSEL_INT     (1 << 20)
#define P6_EVNTSEL_OS      (1 << 17)
#define P6_EVNTSEL_USR     (1 << 16)
#define P6_EVENT_CPU_CLOCKS_NOT_HALTED   0x79
#define CORE_EVENT_CPU_CLOCKS_NOT_HALTED 0x3c
#define P6_EVENT_WIDTH     32

#define P4_ESCR_EVENT_SELECT(N) ((N)<<25)
#define P4_CCCR_OVF_PMI0   (1<<26)
#define P4_CCCR_OVF_PMI1   (1<<27)
#define P4_CCCR_OVF        (1<<31)
#define P4_CCCR_THRESHOLD(N) ((N)<<20)
#define P4_CCCR_COMPLEMENT (1<<19)
#define P4_CCCR_COMPARE    (1<<18)
#define P4_CCCR_REQUIRED   (3<<16)
#define P4_CCCR_ESCR_SELECT(N) ((N)<<13)
#define P4_CCCR_ENABLE     (1<<12)
/*
 * Set up IQ_PERFCTR0 to behave like a clock, by having IQ_CCCR0 filter
 * CRU_ESCR0 (with any non-null event selector) through a complemented
 * max threshold. [IA32-Vol3, Section 14.9.9]
 */
#define P4_NMI_CRU_ESCR0   P4_ESCR_EVENT_SELECT(0x3F)
#define P4_NMI_IQ_CCCR0 \
    (P4_CCCR_OVF_PMI0|P4_CCCR_THRESHOLD(15)|P4_CCCR_COMPLEMENT| \
     P4_CCCR_COMPARE|P4_CCCR_REQUIRED|P4_CCCR_ESCR_SELECT(4)|P4_CCCR_ENABLE)

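/*
 * Busy-wait on this CPU until at least two watchdog NMIs have been
 * observed, or roughly ten NMI periods' worth of TSC ticks have elapsed.
 */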
static void __init wait_for_nmis(void *p)
{
    unsigned int cpu = smp_processor_id();
    unsigned int start_count = nmi_count(cpu);
    unsigned long ticks = 10 * 1000 * cpu_khz / nmi_hz;
    unsigned long s, e;

    s = rdtsc();
    do {
        cpu_relax();
        if ( nmi_count(cpu) >= start_count + 2 )
            break;
        e = rdtsc();
    } while ( e - s < ticks );
}

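/*
 * Boot-time check that every online CPU is receiving watchdog NMIs.
 * Once the watchdog is known to work, drop the NMI rate to a less
 * intrusive value for normal operation.
 */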
void __init check_nmi_watchdog(void)
{
    static unsigned int __initdata prev_nmi_count[NR_CPUS];
    int cpu;
    bool ok = true;

    if ( nmi_watchdog == NMI_NONE )
        return;

    printk("Testing NMI watchdog on all CPUs:");

    for_each_online_cpu ( cpu )
        prev_nmi_count[cpu] = nmi_count(cpu);

    /*
     * Wait at most 10 ticks for 2 watchdog NMIs on each CPU.
     * Busy-wait on all CPUs: the LAPIC counter that the NMI watchdog
     * uses only runs while the core is not halted.
     */
    on_selected_cpus(&cpu_online_map, wait_for_nmis, NULL, 1);

    for_each_online_cpu ( cpu )
    {
        if ( nmi_count(cpu) - prev_nmi_count[cpu] < 2 )
        {
            printk(" %d", cpu);
            ok = false;
        }
    }

    printk(" %s\n", ok ? "ok" : "stuck");

    /*
     * Now that we know it works we can reduce NMI frequency to
     * something more reasonable; makes a difference in some configs.
     * There's a limit to how slow we can go because writing the perfctr
     * MSRs only sets the low 32 bits, with the top 8 bits sign-extended
     * from those, so it's not possible to set up a delay larger than
     * 2^31 cycles and smaller than (2^40 - 2^31) cycles.
     * (Intel SDM, section 18.22.2)
     */
    if ( nmi_watchdog == NMI_LOCAL_APIC )
        nmi_hz = max(1ul, cpu_khz >> 20);

    return;
}

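/*
 * Per-CPU heartbeat: advance the tick count that nmi_watchdog_tick()
 * samples, then re-arm the timer to fire again one second from now.
 */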
static void nmi_timer_fn(void *unused)
{
    this_cpu(nmi_timer_ticks)++;
    set_timer(&this_cpu(nmi_timer), NOW() + MILLISECS(1000));
}

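/*
 * Stop the performance counter driving the local APIC NMI watchdog and
 * mark the watchdog as disabled (nmi_active = -1, i.e. re-enableable).
 */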
void disable_lapic_nmi_watchdog(void)
{
    if (nmi_active <= 0)
        return;
    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_AMD:
        wrmsr(MSR_K7_EVNTSEL0, 0, 0);
        break;
    case X86_VENDOR_INTEL:
        switch (boot_cpu_data.x86) {
        case 6:
            wrmsr(MSR_P6_EVNTSEL(0), 0, 0);
            break;
        case 15:
            wrmsr(MSR_P4_IQ_CCCR0, 0, 0);
            wrmsr(MSR_P4_CRU_ESCR0, 0, 0);
            break;
        }
        break;
    }
    nmi_active = -1;
    /* tell do_nmi() and others that we're not active any more */
    nmi_watchdog = NMI_NONE;
}

static void enable_lapic_nmi_watchdog(void)
{
    if (nmi_active < 0) {
        nmi_watchdog = NMI_LOCAL_APIC;
        setup_apic_nmi_watchdog();
    }
}

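/*
 * Reserve the local APIC NMI hardware for a use other than the watchdog,
 * disabling the watchdog if it currently owns the hardware.  Returns
 * -EBUSY if the hardware is already reserved.
 */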
int reserve_lapic_nmi(void)
{
    unsigned int old_owner;

    spin_lock(&lapic_nmi_owner_lock);
    old_owner = lapic_nmi_owner;
    lapic_nmi_owner |= LAPIC_NMI_RESERVED;
    spin_unlock(&lapic_nmi_owner_lock);
    if (old_owner & LAPIC_NMI_RESERVED)
        return -EBUSY;
    if (old_owner & LAPIC_NMI_WATCHDOG)
        disable_lapic_nmi_watchdog();
    return 0;
}

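/*
 * Drop a reservation taken with reserve_lapic_nmi(), re-enabling the
 * watchdog if it owned the hardware beforehand.
 */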
void release_lapic_nmi(void)
{
    unsigned int new_owner;

    spin_lock(&lapic_nmi_owner_lock);
    new_owner = lapic_nmi_owner & ~LAPIC_NMI_RESERVED;
    lapic_nmi_owner = new_owner;
    spin_unlock(&lapic_nmi_owner_lock);
    if (new_owner & LAPIC_NMI_WATCHDOG)
        enable_lapic_nmi_watchdog();
}

/*
 * Activate the NMI watchdog via the local APIC.
 * Original code written by Keith Owens.
 */

static void clear_msr_range(unsigned int base, unsigned int n)
{
    unsigned int i;

    for (i = 0; i < n; i++)
        wrmsr(base+i, 0, 0);
}

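/*
 * Program the watchdog perfctr so that it overflows, and hence raises an
 * NMI, after cpu_khz * 1000 / nmi_hz events -- roughly one watchdog
 * period when the counted event is unhalted core cycles.
 */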
static inline void write_watchdog_counter(const char *descr)
{
    u64 count = (u64)cpu_khz * 1000;

    do_div(count, nmi_hz);
    if ( descr )
        Dprintk("setting %s to -%#"PRIx64"\n", descr, count);
    wrmsrl(nmi_perfctr_msr, 0 - count);
}

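/*
 * Configure PERFCTR0 on AMD K7-style PMUs to count cycles while the
 * processor is running, delivering an NMI via the local APIC on overflow.
 */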
static void setup_k7_watchdog(void)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_K7_PERFCTR0;

    clear_msr_range(MSR_K7_EVNTSEL0, 4);
    clear_msr_range(MSR_K7_PERFCTR0, 4);

    evntsel = K7_EVNTSEL_INT
        | K7_EVNTSEL_OS
        | K7_EVNTSEL_USR
        | K7_NMI_EVENT;

    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
    write_watchdog_counter("K7_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= K7_EVNTSEL_ENABLE;
    wrmsr(MSR_K7_EVNTSEL0, evntsel, 0);
}

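/*
 * Configure PERFCTR0 on Intel P6/Core PMUs to count the given unhalted
 * clock event, delivering an NMI via the local APIC on overflow.
 */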
static void setup_p6_watchdog(unsigned counter)
{
    unsigned int evntsel;

    nmi_perfctr_msr = MSR_P6_PERFCTR(0);

    clear_msr_range(MSR_P6_EVNTSEL(0), 2);
    clear_msr_range(MSR_P6_PERFCTR(0), 2);

    evntsel = P6_EVNTSEL_INT
        | P6_EVNTSEL_OS
        | P6_EVNTSEL_USR
        | counter;

    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
    write_watchdog_counter("P6_PERFCTR0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    evntsel |= P6_EVNTSEL0_ENABLE;
    wrmsr(MSR_P6_EVNTSEL(0), evntsel, 0);
}

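/*
 * Configure IQ_PERFCTR0 on Pentium 4 to behave like a cycle counter (see
 * the P4_NMI_* definitions above), raising an NMI on overflow.  Returns 0
 * if the performance counters are unavailable, 1 on success.
 */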
static int setup_p4_watchdog(void)
{
    uint64_t misc_enable;

    rdmsrl(MSR_IA32_MISC_ENABLE, misc_enable);
    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PERF_AVAIL))
        return 0;

    nmi_perfctr_msr = MSR_P4_IQ_PERFCTR0;
    nmi_p4_cccr_val = P4_NMI_IQ_CCCR0;
    if ( boot_cpu_data.x86_num_siblings == 2 )
        nmi_p4_cccr_val |= P4_CCCR_OVF_PMI1;

    if (!(misc_enable & MSR_IA32_MISC_ENABLE_PEBS_UNAVAIL))
        clear_msr_range(0x3F1, 2);
    /*
     * MSR 0x3F0 seems to have a default value of 0xFC00, but current
     * docs don't fully define it, so leave it alone for now.
     */
    if (boot_cpu_data.x86_model >= 0x3) {
        /* MSR_P4_IQ_ESCR0/1 (0x3ba/0x3bb) removed */
        clear_msr_range(0x3A0, 26);
        clear_msr_range(0x3BC, 3);
    } else {
        clear_msr_range(0x3A0, 31);
    }
    clear_msr_range(0x3C0, 6);
    clear_msr_range(0x3C8, 6);
    clear_msr_range(0x3E0, 2);
    clear_msr_range(MSR_P4_BPU_CCCR0, 18);
    clear_msr_range(MSR_P4_BPU_PERFCTR0, 18);

    wrmsrl(MSR_P4_CRU_ESCR0, P4_NMI_CRU_ESCR0);
    wrmsrl(MSR_P4_IQ_CCCR0, P4_NMI_IQ_CCCR0 & ~P4_CCCR_ENABLE);
    write_watchdog_counter("P4_IQ_COUNTER0");
    apic_write(APIC_LVTPC, APIC_DM_NMI);
    wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
    return 1;
}

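/*
 * Dispatch to the vendor- and family-specific setup routine and, if one
 * matches, take ownership of the LAPIC NMI and mark the watchdog active.
 */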
void setup_apic_nmi_watchdog(void)
{
    if ( nmi_watchdog == NMI_NONE )
        return;

    switch (boot_cpu_data.x86_vendor) {
    case X86_VENDOR_AMD:
        switch (boot_cpu_data.x86) {
        case 6:
        case 0xf ... 0x17:
            setup_k7_watchdog();
            break;
        default:
            return;
        }
        break;
    case X86_VENDOR_INTEL:
        switch (boot_cpu_data.x86) {
        case 6:
            setup_p6_watchdog((boot_cpu_data.x86_model < 14)
                              ? P6_EVENT_CPU_CLOCKS_NOT_HALTED
                              : CORE_EVENT_CPU_CLOCKS_NOT_HALTED);
            break;
        case 15:
            if (!setup_p4_watchdog())
                return;
            break;
        default:
            return;
        }
        break;
    default:
        return;
    }

    lapic_nmi_owner = LAPIC_NMI_WATCHDOG;
    nmi_active = 1;
}

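/*
 * CPU hotplug notifier: start the per-CPU heartbeat timer when a CPU is
 * being brought up, and kill it if bring-up is cancelled or the CPU dies.
 */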
static int cpu_nmi_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;

    switch ( action )
    {
    case CPU_UP_PREPARE:
        init_timer(&per_cpu(nmi_timer, cpu), nmi_timer_fn, NULL, cpu);
        set_timer(&per_cpu(nmi_timer, cpu), NOW());
        break;
    case CPU_UP_CANCELED:
    case CPU_DEAD:
        kill_timer(&per_cpu(nmi_timer, cpu));
        break;
    default:
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nmi_nfb = {
    .notifier_call = cpu_nmi_callback
};

static DEFINE_PER_CPU(unsigned int, last_irq_sums);
static DEFINE_PER_CPU(unsigned int, alert_counter);

static atomic_t watchdog_disable_count = ATOMIC_INIT(1);

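/*
 * Nested disable/enable of the watchdog action.  The count starts at 1,
 * so the watchdog only bites once watchdog_setup() has performed the
 * initial watchdog_enable() and no disables are outstanding.
 */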
void watchdog_disable(void)
{
    atomic_inc(&watchdog_disable_count);
}

void watchdog_enable(void)
{
    atomic_dec(&watchdog_disable_count);
}

bool watchdog_enabled(void)
{
    return !atomic_read(&watchdog_disable_count);
}

int __init watchdog_setup(void)
{
    unsigned int cpu;

    /*
     * Activate periodic heartbeats. We cannot do this earlier during
     * setup because the timer infrastructure is not available.
     */
    for_each_online_cpu ( cpu )
        cpu_nmi_callback(&cpu_nmi_nfb, CPU_UP_PREPARE, (void *)(long)cpu);
    register_cpu_notifier(&cpu_nmi_nfb);

    watchdog_enable();
    return 0;
}

/* Returns false if this was not a watchdog NMI, true otherwise. */
bool nmi_watchdog_tick(const struct cpu_user_regs *regs)
{
    bool watchdog_tick = true;
    unsigned int sum = this_cpu(nmi_timer_ticks);

    if ( (this_cpu(last_irq_sums) == sum) && watchdog_enabled() )
    {
        /*
         * Ayiee, looks like this CPU is stuck ... wait for the timeout
         * before doing the oops ...
         */
        this_cpu(alert_counter)++;
        if ( this_cpu(alert_counter) == opt_watchdog_timeout * nmi_hz )
        {
            console_force_unlock();
            printk("Watchdog timer detects that CPU%d is stuck!\n",
                   smp_processor_id());
            fatal_trap(regs, 1);
        }
    }
    else
    {
        this_cpu(last_irq_sums) = sum;
        this_cpu(alert_counter) = 0;
    }

    if ( nmi_perfctr_msr )
    {
        uint64_t msr_content;

        /* Work out if this is a watchdog tick by checking for overflow. */
        if ( nmi_perfctr_msr == MSR_P4_IQ_PERFCTR0 )
        {
            rdmsrl(MSR_P4_IQ_CCCR0, msr_content);
            if ( !(msr_content & P4_CCCR_OVF) )
                watchdog_tick = false;

            /*
             * P4 quirks:
             * - An overflown perfctr will assert its interrupt
             *   until the OVF flag in its CCCR is cleared.
             * - LVTPC is masked on interrupt and must be
             *   unmasked by the LVTPC handler.
             */
            wrmsrl(MSR_P4_IQ_CCCR0, nmi_p4_cccr_val);
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_P6_PERFCTR(0) )
        {
            rdmsrl(MSR_P6_PERFCTR(0), msr_content);
            if ( msr_content & (1ULL << P6_EVENT_WIDTH) )
                watchdog_tick = false;

            /*
             * Only the P6-based Pentium M needs to re-unmask the APIC
             * vector, but doing so doesn't hurt other P6 variants.
             */
            apic_write(APIC_LVTPC, APIC_DM_NMI);
        }
        else if ( nmi_perfctr_msr == MSR_K7_PERFCTR0 )
        {
            rdmsrl(MSR_K7_PERFCTR0, msr_content);
            if ( msr_content & (1ULL << K7_EVENT_WIDTH) )
                watchdog_tick = false;
        }
        write_watchdog_counter(NULL);
    }

    return watchdog_tick;
}

/*
 * For some reason the destination shorthand for self is not valid
 * when used with the NMI delivery mode. This is documented in Tables
 * 8-3 and 8-4 in IA32 Reference Manual Volume 3. We send the IPI to
 * our own APIC ID explicitly, which is valid.
 */
void self_nmi(void)
{
    unsigned long flags;
    u32 id = get_apic_id();

    local_irq_save(flags);
    apic_wait_icr_idle();
    apic_icr_write(APIC_DM_NMI | APIC_DEST_PHYSICAL, id);
    local_irq_restore(flags);
}

static void do_nmi_trigger(unsigned char key)
{
    printk("Triggering NMI on APIC ID %x\n", get_apic_id());
    self_nmi();
}

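/*
 * 'n' debug key handler: dump the per-CPU NMI counts and the NMI
 * pending/masked state of dom0's vcpu0.
 */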
static void do_nmi_stats(unsigned char key)
{
    int i;
    struct domain *d;
    struct vcpu *v;

    printk("CPU\tNMI\n");
    for_each_online_cpu ( i )
        printk("%3d\t%3d\n", i, nmi_count(i));

    if ( ((d = hardware_domain) == NULL) || (d->vcpu == NULL) ||
         ((v = d->vcpu[0]) == NULL) )
        return;

    i = v->async_exception_mask & (1 << VCPU_TRAP_NMI);
    if ( v->nmi_pending || i )
        printk("dom0 vcpu0: NMI %s%s\n",
               v->nmi_pending ? "pending " : "",
               i ? "masked " : "");
    else
        printk("dom0 vcpu0: NMI neither pending nor masked\n");
}

static __init int register_nmi_trigger(void)
{
    register_keyhandler('N', do_nmi_trigger, "trigger an NMI", 0);
    register_keyhandler('n', do_nmi_stats, "NMI statistics", 1);
    return 0;
}
__initcall(register_nmi_trigger);