/*
 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, version 2 of the
 * License.
 */

/*
 * mctelem.c - x86 Machine Check Telemetry Transport
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/kernel.h>
#include <xen/smp.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/cpumask.h>
#include <xen/event.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>

#include "mce.h"

struct mctelem_ent {
	struct mctelem_ent *mcte_next;	/* next in chronological order */
	struct mctelem_ent *mcte_prev;	/* previous in chronological order */
	uint32_t mcte_flags;		/* See MCTE_F_* below */
	uint32_t mcte_refcnt;		/* Reference count */
	void *mcte_data;		/* corresponding data payload */
};

#define MCTE_F_CLASS_URGENT		0x0001U	/* in use - urgent errors */
#define MCTE_F_CLASS_NONURGENT		0x0002U	/* in use - nonurgent errors */
#define MCTE_F_STATE_FREE		0x0010U	/* on a freelist */
#define MCTE_F_STATE_UNCOMMITTED	0x0020U	/* reserved; on no list */
#define MCTE_F_STATE_COMMITTED		0x0040U	/* on a committed list */
#define MCTE_F_STATE_PROCESSING		0x0080U	/* on a processing list */

#define MCTE_F_MASK_CLASS	(MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
#define MCTE_F_MASK_STATE	(MCTE_F_STATE_FREE | \
				 MCTE_F_STATE_UNCOMMITTED | \
				 MCTE_F_STATE_COMMITTED | \
				 MCTE_F_STATE_PROCESSING)

#define MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
#define MCTE_SET_CLASS(tep, new) do { \
	(tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
	(tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)

#define MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
#define MCTE_TRANSITION_STATE(tep, old, new) do { \
	BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
	(tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
	(tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
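
/*
 * Legal state transitions of a telemetry entry, as enforced by the
 * MCTE_TRANSITION_STATE() calls throughout this file:
 *
 *	FREE        --(mctelem_reserve)-----------> UNCOMMITTED
 *	UNCOMMITTED --(mctelem_dismiss)-----------> FREE
 *	UNCOMMITTED --(mctelem_commit)------------> COMMITTED
 *	COMMITTED   --(mctelem_append_processing)-> PROCESSING
 *	PROCESSING  --(mctelem_processing_release,
 *	               final reference dropped)---> FREE
 *
 * Entries queued with mctelem_defer() stay UNCOMMITTED while they sit
 * on a per-CPU pending list; mctelem_process_deferred() later commits
 * or dismisses them.
 */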

#define MC_URGENT_NENT		10
#define MC_NONURGENT_NENT	20

#define MC_NENT (MC_URGENT_NENT + MC_NONURGENT_NENT)

#define MC_NCLASSES (MC_NONURGENT + 1)

#define COOKIE2MCTE(c)		((struct mctelem_ent *)(c))
#define MCTE2COOKIE(tep)	((mctelem_cookie_t)(tep))

static struct mc_telem_ctl {
	/* Linked lists that thread the array members together.
	 *
	 * The free list is a bitmap in which a set bit means the
	 * corresponding element is free.  Because the number of elements
	 * is small, this makes allocation with atomic bit operations easy.
	 *
	 * The committed list grows at the head and we do not maintain a
	 * tail pointer; insertions are performed atomically.  The head
	 * thus has the most-recently committed telemetry, i.e. the
	 * list is in reverse chronological order.  The committed list
	 * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
	 * When we move telemetry from the committed list to the processing
	 * list we atomically unlink the committed list and keep a pointer
	 * to the head of that list; we then traverse the list following
	 * mcte_prev and fill in mcte_next to doubly-link the list, and then
	 * append the tail of the list onto the processing list.  If we panic
	 * during this manipulation of the committed list we still have
	 * the pointer to its head so we can recover all entries during
	 * the panic flow (albeit in reverse chronological order).
	 *
	 * The processing list is updated in a controlled context, and
	 * we can lock it for updates.  The head of the processing list
	 * always has the oldest telemetry, and we append (as above)
	 * at the tail of the processing list. */
	DECLARE_BITMAP(mctc_free, MC_NENT);
	struct mctelem_ent *mctc_committed[MC_NCLASSES];
	struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
	struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
	/*
	 * Telemetry array
	 */
	struct mctelem_ent *mctc_elems;
} mctctl;

struct mc_telem_cpu_ctl {
	/*
	 * Per-CPU processing lists, used for deferred (softirq)
	 * processing of telemetry.
	 *
	 * The two pending lists @lmce_pending and @pending grow at
	 * the head in reverse chronological order.
	 *
	 * @pending and @lmce_pending on the same CPU are mutually
	 * exclusive, i.e. deferred MCE on a CPU are either all in
	 * @lmce_pending or all in @pending.  In the former case, all
	 * deferred MCE are LMCE.  In the latter case, both LMCE and
	 * non-local MCE can be in @pending, and @pending contains at
	 * least one non-local MCE if it's not empty.
	 *
	 * Changes to @pending and @lmce_pending should be performed
	 * via mctelem_process_deferred() and mctelem_defer(), in order
	 * to guarantee the above mutual exclusivity.
	 */
	struct mctelem_ent *pending, *lmce_pending;
	struct mctelem_ent *processing;
};
static DEFINE_PER_CPU(struct mc_telem_cpu_ctl, mctctl);

/* Lock protecting all processing lists */
static DEFINE_SPINLOCK(processing_lock);

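/*
 * Atomically push @new at the head of the list *headp, publishing the
 * previous head through *linkp so that the new entry links to the rest
 * of the list.  Called with @new == NULL it atomically unlinks the
 * whole list into *linkp.  The cmpxchgptr() loop simply retries until
 * this CPU wins any race with concurrent updaters, including updates
 * made from MC# context, so no lock is needed here.
 */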
static void mctelem_xchg_head(struct mctelem_ent **headp,
			      struct mctelem_ent **linkp,
			      struct mctelem_ent *new)
{
	for (;;) {
		struct mctelem_ent *old;

		*linkp = old = *headp;
		if (cmpxchgptr(headp, old, new) == old)
			break;
	}
}

/**
 * Append a telemetry of deferred MCE to a per-cpu pending list,
 * either @pending or @lmce_pending, according to the rules below:
 * - if @pending is not empty, then the new telemetry will be
 *   appended to @pending;
 * - if @pending is empty and the new telemetry is for a deferred
 *   LMCE, then the new telemetry will be appended to @lmce_pending;
 * - if @pending is empty and the new telemetry is for a deferred
 *   non-local MCE, all existing telemetries in @lmce_pending will be
 *   moved to @pending and then the new telemetry will be appended to
 *   @pending.
 *
 * This function must be called with the MCIP bit set, so that it does
 * not need to worry about MC# re-occurring in this function.
 *
 * As a result, this function can preserve the mutual exclusivity
 * between @pending and @lmce_pending (see their comments in struct
 * mc_telem_cpu_ctl).
 *
 * Parameters:
 *  @cookie: telemetry of the deferred MCE
 *  @lmce:   indicate whether the telemetry is for LMCE
 */
void mctelem_defer(mctelem_cookie_t cookie, bool lmce)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
	struct mc_telem_cpu_ctl *mctctl = &this_cpu(mctctl);

	ASSERT(mctctl->pending == NULL || mctctl->lmce_pending == NULL);

	if (mctctl->pending)
		mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
	else if (lmce)
		mctelem_xchg_head(&mctctl->lmce_pending, &tep->mcte_next, tep);
	else {
		/*
		 * LMCE is supported on Skylake-server and later CPUs, on
		 * which mce_broadcast is always true.  Therefore, non-empty
		 * mctctl->lmce_pending in this branch implies a broadcasting
		 * MC# is being handled, every CPU is in the exception
		 * context, and no one is consuming mctctl->pending at this
		 * moment.  As a result, the following two exchanges together
		 * can be treated as atomic.
		 */
		if (mctctl->lmce_pending)
			mctelem_xchg_head(&mctctl->lmce_pending,
					  &mctctl->pending, NULL);
		mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
	}
}
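
/*
 * Illustrative producer-side sketch, not taken from this file: an MC#
 * handler such as mcheck_cmn_handler() (referenced in the comments
 * below) reserves an entry, fills its payload and then defers it for
 * softirq-time processing, roughly as follows ("lmce" stands for
 * whatever local-MCE indication the handler has computed):
 *
 *	mctelem_cookie_t cookie = mctelem_reserve(MC_URGENT);
 *	if (cookie != NULL) {
 *		... fill in mctelem_dataptr(cookie) ...
 *		mctelem_defer(cookie, lmce);
 *	}
 */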

/**
 * Move telemetries of deferred MCE from the per-cpu pending list on
 * this or another CPU to the per-cpu processing list on this CPU, and
 * then process all deferred MCE on the processing list.
 *
 * This function can be called with the MCIP bit set (e.g. from the MC#
 * handler) or cleared (from the MCE softirq handler).  In the latter
 * case, MC# may re-occur in this function.
 *
 * Parameters:
 *  @cpu:  indicate the CPU where the pending list is
 *  @fn:   the function to handle the deferred MCE
 *  @lmce: indicate which pending list on @cpu is handled
 */
void mctelem_process_deferred(unsigned int cpu,
			      int (*fn)(mctelem_cookie_t),
			      bool lmce)
{
	struct mctelem_ent *tep;
	struct mctelem_ent *head, *prev;
	struct mc_telem_cpu_ctl *mctctl = &per_cpu(mctctl, cpu);
	int ret;

	/*
	 * First, unhook the list of telemetry structures, and
	 * hook it up to the processing list head for this CPU.
	 *
	 * If @lmce is true and a non-local MC# occurs before the
	 * following atomic exchange, @lmce will not hold after
	 * resumption, because all telemetries in @lmce_pending on
	 * @cpu are moved to @pending on @cpu in mcheck_cmn_handler().
	 * In such a case, no telemetries will be handled in this
	 * function after resumption.  Another round of MCE softirq,
	 * raised by that mcheck_cmn_handler(), will handle those moved
	 * telemetries in @pending on @cpu.
	 *
	 * Any MC# occurring after the following atomic exchange will be
	 * handled by another round of MCE softirq.
	 */
	mctelem_xchg_head(lmce ? &mctctl->lmce_pending : &mctctl->pending,
			  &this_cpu(mctctl.processing), NULL);

	head = this_cpu(mctctl.processing);

	/*
	 * Then, fix up the list to include prev pointers, to make
	 * things a little easier, as the list must be traversed in
	 * chronological order, which is the reverse of the order in
	 * which the entries are linked.
	 */
	for (tep = head, prev = NULL; tep != NULL; tep = tep->mcte_next) {
		tep->mcte_prev = prev;
		prev = tep;
	}

	/*
	 * Now walk the list of telemetry structures, handling each
	 * one of them.  Unhooking the structure here does not need to
	 * be atomic, as this list is only accessed from a softirq
	 * context; the MCE handler does not touch it.
	 */
	for (tep = prev; tep != NULL; tep = prev) {
		prev = tep->mcte_prev;
		tep->mcte_next = tep->mcte_prev = NULL;

		ret = fn(MCTE2COOKIE(tep));
		if (prev != NULL)
			prev->mcte_next = NULL;
		tep->mcte_prev = tep->mcte_next = NULL;
		if (ret != 0)
			mctelem_commit(MCTE2COOKIE(tep));
		else
			mctelem_dismiss(MCTE2COOKIE(tep));
	}
}
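
/*
 * Illustrative sketch of a deferred-processing callback (not part of
 * this file; mce_dhandler() is a hypothetical name, and the real MCE
 * softirq handler lives elsewhere):
 *
 *	static int mce_dhandler(mctelem_cookie_t cookie)
 *	{
 *		// Inspect mctelem_dataptr(cookie) here.  Return nonzero
 *		// to have the telemetry committed for later logging, or
 *		// zero to have it dismissed back to the free bitmap.
 *		return 1;
 *	}
 *
 *	mctelem_process_deferred(smp_processor_id(), mce_dhandler, false);
 */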

bool mctelem_has_deferred(unsigned int cpu)
{
	return per_cpu(mctctl.pending, cpu) != NULL;
}

bool mctelem_has_deferred_lmce(unsigned int cpu)
{
	return per_cpu(mctctl.lmce_pending, cpu) != NULL;
}

/* Free an entry to its native free list; the entry must not be linked on
 * any list.
 */
static void mctelem_free(struct mctelem_ent *tep)
{
	BUG_ON(tep->mcte_refcnt != 0);
	BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);

	tep->mcte_prev = NULL;
	tep->mcte_next = NULL;

	/* set free in array */
	set_bit(tep - mctctl.mctc_elems, mctctl.mctc_free);
}

/* Increment the reference count of an entry that is not linked on to
 * any list and which only the caller has a pointer to.
 */
static void mctelem_hold(struct mctelem_ent *tep)
{
	tep->mcte_refcnt++;
}

/* Increment the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_hold(struct mctelem_ent *tep)
{
	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep != mctctl.mctc_processing_head[which]);
	tep->mcte_refcnt++;
}

/* Decrement the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_release(struct mctelem_ent *tep)
{
	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep != mctctl.mctc_processing_head[which]);
	if (--tep->mcte_refcnt == 0) {
		MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
		mctctl.mctc_processing_head[which] = tep->mcte_next;
		mctelem_free(tep);
	}
}

void __init mctelem_init(unsigned int datasz)
{
	char *datarr;
	unsigned int i;

	BUILD_BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);

	datasz = (datasz & ~0xf) + 0x10;	/* 16 byte roundup */

	if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
					       MC_NENT)) == NULL ||
	    (datarr = xmalloc_bytes(MC_NENT * datasz)) == NULL) {
		xfree(mctctl.mctc_elems);
		printk("Allocations for MCA telemetry failed\n");
		return;
	}

	for (i = 0; i < MC_NENT; i++) {
		struct mctelem_ent *tep;

		tep = mctctl.mctc_elems + i;
		tep->mcte_flags = MCTE_F_STATE_FREE;
		tep->mcte_refcnt = 0;
		tep->mcte_data = datarr + i * datasz;

		__set_bit(i, mctctl.mctc_free);
		tep->mcte_next = NULL;
		tep->mcte_prev = NULL;
	}
}

/* incremented non-atomically when reserve fails */
static int mctelem_drop_count;

/* Reserve a telemetry entry, or return NULL if none available.
 * If we return an entry then the caller must subsequently call exactly one of
 * mctelem_dismiss or mctelem_commit for that entry.
 */
mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
{
	unsigned bit;
	unsigned start_bit = (which == MC_URGENT) ? 0 : MC_URGENT_NENT;

	for (;;) {
		bit = find_next_bit(mctctl.mctc_free, MC_NENT, start_bit);

		if (bit >= MC_NENT) {
			mctelem_drop_count++;
			return (NULL);
		}

		/* try to allocate, atomically clear free bit */
		if (test_and_clear_bit(bit, mctctl.mctc_free)) {
			/* return element we got */
			struct mctelem_ent *tep = mctctl.mctc_elems + bit;

			mctelem_hold(tep);
			MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
			tep->mcte_next = NULL;
			tep->mcte_prev = NULL;
			if (which == MC_URGENT)
				MCTE_SET_CLASS(tep, URGENT);
			else
				MCTE_SET_CLASS(tep, NONURGENT);
			return MCTE2COOKIE(tep);
		}
	}
}

void *mctelem_dataptr(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	return tep->mcte_data;
}
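
/*
 * Typical producer flow, given here only as an illustrative sketch
 * (the payload layout and the choice of class are up to the callers,
 * which live outside this file):
 *
 *	mctelem_cookie_t cookie = mctelem_reserve(MC_URGENT);
 *	if (cookie != NULL) {
 *		void *payload = mctelem_dataptr(cookie);
 *		... fill in up to the datasz bytes established by
 *		    mctelem_init() at payload ...
 *		mctelem_commit(cookie);	// or mctelem_dismiss(cookie)
 *	}
 */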

/* Release a previously reserved entry back to the freelist without
 * submitting it for logging.  The entry must not be linked on to any
 * list - that's how mctelem_reserve handed it out.
 */
void mctelem_dismiss(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	tep->mcte_refcnt--;
	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
	mctelem_free(tep);
}

/* Commit an entry with completed telemetry for logging.  The caller must
 * not reference the entry after this call.  Note that we add entries
 * at the head of the committed list, so that list has its entries in
 * reverse chronological order.
 */
void mctelem_commit(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
	mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);

	mctelem_xchg_head(&mctctl.mctc_committed[target], &tep->mcte_prev, tep);
}

/* Move telemetry from the committed list to the processing list,
 * reversing the list into chronological order.  The processing list has
 * been locked by the caller, and may be non-empty.  We append the
 * reversed committed list on to the tail of the processing list.
 * The committed list may grow even while we run, so use atomic
 * operations to swap NULL into the committed list head.
 *
 * Note that "chronological order" means the order in which producers
 * won additions to the processing list, which may not reflect the
 * strict chronological order of the associated events if events are
 * closely spaced in time and contend for the processing list at once.
 */

static struct mctelem_ent *dangling[MC_NCLASSES];

static void mctelem_append_processing(mctelem_class_t which)
{
	mctelem_class_t target = which == MC_URGENT ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
	struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
	struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
	struct mctelem_ent *tep, *ltep;

	/* Check for an empty list; no race since we hold the processing lock */
	if (*commlp == NULL)
		return;

	/* Atomically unlink the committed list, and keep a pointer to
	 * the list we unlink in a well-known location so it can be
	 * picked up in panic code should we panic between this unlink
	 * and the append to the processing list. */
	mctelem_xchg_head(commlp, &dangling[target], NULL);

	if (dangling[target] == NULL)
		return;

	/* Traverse the list following the previous pointers (reverse
	 * chronological order).  For each entry fill in the next pointer
	 * and transition the element state. */
	for (tep = dangling[target], ltep = NULL; tep != NULL;
	    tep = tep->mcte_prev) {
		MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
		tep->mcte_next = ltep;
		ltep = tep;
	}

	/* ltep points to the head of a chronologically ordered linked
	 * list of telemetry entries ending at the most recent entry
	 * dangling[target] if mcte_next is followed; tack this on to
	 * the processing list.
	 */
	if (*proclhp == NULL) {
		*proclhp = ltep;
		*procltp = dangling[target];
	} else {
		(*procltp)->mcte_next = ltep;
		ltep->mcte_prev = *procltp;
		*procltp = dangling[target];
	}
	wmb();
	dangling[target] = NULL;
	wmb();
}

mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
{
	mctelem_class_t target = (which == MC_URGENT) ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent *tep;

	spin_lock(&processing_lock);
	mctelem_append_processing(target);
	if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
		spin_unlock(&processing_lock);
		return NULL;
	}

	mctelem_processing_hold(tep);
	wmb();
	spin_unlock(&processing_lock);
	return MCTE2COOKIE(tep);
}

void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	spin_lock(&processing_lock);
	mctelem_processing_release(tep);
	wmb();
	spin_unlock(&processing_lock);
}

void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
{
	mctelem_class_t target = (which == MC_URGENT) ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	if (tep == NULL)
		return;

	spin_lock(&processing_lock);
	if (tep == mctctl.mctc_processing_head[target])
		mctelem_processing_release(tep);
	wmb();
	spin_unlock(&processing_lock);
}
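
/*
 * Illustrative consumer flow, a sketch only (the real consumers live
 * outside this file and may split these steps across a fetch and a
 * later acknowledgement; error handling is omitted):
 *
 *	mctelem_cookie_t cookie = mctelem_consume_oldest_begin(MC_NONURGENT);
 *	while (cookie != NULL) {
 *		... read the payload via mctelem_dataptr(cookie) ...
 *		mctelem_consume_oldest_end(cookie);
 *		mctelem_ack(MC_NONURGENT, cookie);	// final release; frees the entry
 *		cookie = mctelem_consume_oldest_begin(MC_NONURGENT);
 *	}
 */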

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: t
 * tab-width: 8
 * End:
 */