/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation, version 2 of the
 * License.
 */

/*
 * mctelem.c - x86 Machine Check Telemetry Transport
 */

#include <xen/init.h>
#include <xen/types.h>
#include <xen/kernel.h>
#include <xen/smp.h>
#include <xen/errno.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/cpumask.h>
#include <xen/event.h>

#include <asm/processor.h>
#include <asm/system.h>
#include <asm/msr.h>

#include "mce.h"

struct mctelem_ent {
	struct mctelem_ent *mcte_next;	/* next in chronological order */
	struct mctelem_ent *mcte_prev;	/* previous in chronological order */
	uint32_t mcte_flags;		/* See MCTE_F_* below */
	uint32_t mcte_refcnt;		/* Reference count */
	void *mcte_data;		/* corresponding data payload */
};
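
/*
 * Entries live in a single statically sized array (mctctl.mctc_elems
 * below); each entry's mcte_data points into a parallel payload array
 * carved out in mctelem_init().
 */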

#define	MCTE_F_CLASS_URGENT		0x0001U /* in use - urgent errors */
#define	MCTE_F_CLASS_NONURGENT		0x0002U /* in use - nonurgent errors */
#define	MCTE_F_STATE_FREE		0x0010U	/* on a freelist */
#define	MCTE_F_STATE_UNCOMMITTED	0x0020U	/* reserved; on no list */
#define	MCTE_F_STATE_COMMITTED		0x0040U	/* on a committed list */
#define	MCTE_F_STATE_PROCESSING		0x0080U	/* on a processing list */

#define	MCTE_F_MASK_CLASS	(MCTE_F_CLASS_URGENT | MCTE_F_CLASS_NONURGENT)
#define	MCTE_F_MASK_STATE	(MCTE_F_STATE_FREE | \
				MCTE_F_STATE_UNCOMMITTED | \
				MCTE_F_STATE_COMMITTED | \
				MCTE_F_STATE_PROCESSING)

#define	MCTE_CLASS(tep) ((tep)->mcte_flags & MCTE_F_MASK_CLASS)
#define	MCTE_SET_CLASS(tep, new) do { \
    (tep)->mcte_flags &= ~MCTE_F_MASK_CLASS; \
    (tep)->mcte_flags |= MCTE_F_CLASS_##new; } while (0)

#define	MCTE_STATE(tep) ((tep)->mcte_flags & MCTE_F_MASK_STATE)
#define	MCTE_TRANSITION_STATE(tep, old, new) do { \
    BUG_ON(MCTE_STATE(tep) != (MCTE_F_STATE_##old)); \
    (tep)->mcte_flags &= ~MCTE_F_MASK_STATE; \
    (tep)->mcte_flags |= (MCTE_F_STATE_##new); } while (0)
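
/*
 * Entry lifecycle, as enforced by MCTE_TRANSITION_STATE() in the
 * functions below:
 *
 *	FREE --(mctelem_reserve)--> UNCOMMITTED
 *	UNCOMMITTED --(mctelem_dismiss)--> FREE
 *	UNCOMMITTED --(mctelem_commit)--> COMMITTED
 *	COMMITTED --(mctelem_append_processing)--> PROCESSING
 *	PROCESSING --(mctelem_processing_release)--> FREE
 */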

#define	MC_URGENT_NENT		10
#define	MC_NONURGENT_NENT	20

#define MC_NENT (MC_URGENT_NENT + MC_NONURGENT_NENT)

#define	MC_NCLASSES		(MC_NONURGENT + 1)
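
/*
 * A mctelem_cookie_t is the opaque handle that callers outside this
 * file hold on a telemetry entry; it is simply the entry pointer in
 * disguise.
 */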

#define	COOKIE2MCTE(c)		((struct mctelem_ent *)(c))
#define	MCTE2COOKIE(tep)	((mctelem_cookie_t)(tep))

static struct mc_telem_ctl {
	/* Linked lists that thread the array members together.
	 *
	 * The free list is a bitmap, with a set bit meaning the
	 * corresponding entry is free.  Since the number of entries
	 * is small, this representation makes atomic allocation easy.
	 *
	 * The committed list grows at the head and we do not maintain a
	 * tail pointer; insertions are performed atomically.  The head
	 * thus has the most-recently committed telemetry, i.e. the
	 * list is in reverse chronological order.  The committed list
	 * is singly-linked via mcte_prev pointers, and mcte_next is NULL.
	 * When we move telemetry from the committed list to the processing
	 * list we atomically unlink the committed list and keep a pointer
	 * to the head of that list; we then traverse the list following
	 * mcte_prev and fill in mcte_next to doubly-link the list, and then
	 * append the tail of the list onto the processing list.  If we panic
	 * during this manipulation of the committed list we still have
	 * the pointer to its head so we can recover all entries during
	 * the panic flow (albeit in reverse chronological order).
	 *
	 * The processing list is updated in a controlled context, and
	 * we can lock it for updates.  The head of the processing list
	 * always has the oldest telemetry, and we append (as above)
	 * at the tail of the processing list. */
	DECLARE_BITMAP(mctc_free, MC_NENT);
	struct mctelem_ent *mctc_committed[MC_NCLASSES];
	struct mctelem_ent *mctc_processing_head[MC_NCLASSES];
	struct mctelem_ent *mctc_processing_tail[MC_NCLASSES];
	/*
	 * Telemetry array
	 */
	struct mctelem_ent *mctc_elems;
} mctctl;
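
/*
 * Sketch of the committed -> processing handoff performed by
 * mctelem_append_processing() below:
 *
 *	committed list (newest first, linked via mcte_prev):
 *		head -> e3 -> e2 -> e1
 *	after the atomic unlink and mcte_next fixup:
 *		e1 <-> e2 <-> e3	(mcte_next runs oldest to newest)
 *	e1..e3 are then appended at the processing list tail, so the
 *	processing list stays in chronological order.
 */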

struct mc_telem_cpu_ctl {
	/*
	 * Per-CPU processing lists, used for deferred (softirq)
	 * processing of telemetry.
	 *
	 * The two pending lists @lmce_pending and @pending grow at
	 * the head, in reverse chronological order.
	 *
	 * @pending and @lmce_pending on the same CPU are mutually
	 * exclusive, i.e. deferred MCE on a CPU are either all in
	 * @lmce_pending or all in @pending. In the former case, all
	 * deferred MCE are LMCE. In the latter case, both LMCE and
	 * non-local MCE can be in @pending, and @pending contains at
	 * least one non-local MCE if it's not empty.
	 *
	 * Changes to @pending and @lmce_pending should be performed
	 * via mctelem_process_deferred() and mctelem_defer(), in order
	 * to guarantee the above mutual exclusivity.
	 */
	struct mctelem_ent *pending, *lmce_pending;
	struct mctelem_ent *processing;
};
static DEFINE_PER_CPU(struct mc_telem_cpu_ctl, mctctl);

/* Lock protecting all processing lists */
static DEFINE_SPINLOCK(processing_lock);
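
/*
 * Lock-free push: atomically replace *headp with @new, publishing the
 * old head through *linkp.  This is the only primitive used to update
 * the committed and pending lists, which must be safe to modify from
 * MC# context where taking a lock is not an option.
 */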
static void mctelem_xchg_head(struct mctelem_ent **headp,
				struct mctelem_ent **linkp,
				struct mctelem_ent *new)
{
	for (;;) {
		struct mctelem_ent *old;

		*linkp = old = *headp;
		if (cmpxchgptr(headp, old, new) == old)
			break;
	}
}

/**
 * Append the telemetry of a deferred MCE to a per-cpu pending list,
 * either @pending or @lmce_pending, according to the rules below:
 *  - if @pending is not empty, then the new telemetry will be
 *    appended to @pending;
 *  - if @pending is empty and the new telemetry is for a deferred
 *    LMCE, then the new telemetry will be appended to @lmce_pending;
 *  - if @pending is empty and the new telemetry is for a deferred
 *    non-local MCE, all existing telemetries in @lmce_pending will be
 *    moved to @pending and then the new telemetry will be appended to
 *    @pending.
 *
 * This function must be called with the MCIP bit set, so that it does
 * not need to worry about MC# re-occurring in this function.
 *
 * As a result, this function can preserve the mutual exclusivity
 * between @pending and @lmce_pending (see their comments in struct
 * mc_telem_cpu_ctl).
 *
 * Parameters:
 *  @cookie: telemetry of the deferred MCE
 *  @lmce:   indicate whether the telemetry is for LMCE
 */
void mctelem_defer(mctelem_cookie_t cookie, bool lmce)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
	struct mc_telem_cpu_ctl *mctctl = &this_cpu(mctctl);

	ASSERT(mctctl->pending == NULL || mctctl->lmce_pending == NULL);

	if (mctctl->pending)
		mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
	else if (lmce)
		mctelem_xchg_head(&mctctl->lmce_pending, &tep->mcte_next, tep);
	else {
		/*
		 * LMCE is supported on Skylake-server and later CPUs, on
		 * which mce_broadcast is always true. Therefore, non-empty
		 * mctctl->lmce_pending in this branch implies a broadcasting
		 * MC# is being handled, every CPU is in the exception
		 * context, and no one is consuming mctctl->pending at this
		 * moment. As a result, the following two exchanges together
		 * can be treated as atomic.
		 */
		if (mctctl->lmce_pending)
			mctelem_xchg_head(&mctctl->lmce_pending,
					  &mctctl->pending, NULL);
		mctelem_xchg_head(&mctctl->pending, &tep->mcte_next, tep);
	}
}
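
/*
 * Example of the rules above: with both lists empty, deferring two
 * LMCE telemetries places both on @lmce_pending; a subsequent
 * non-local deferral first migrates them to @pending and then
 * prepends the new entry, so @lmce_pending only ever holds entries
 * while every deferred error on this CPU is an LMCE.
 */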

/**
 * Move telemetries of deferred MCE from the per-cpu pending list on
 * this or another CPU to the per-cpu processing list on this CPU, and
 * then process all deferred MCE on the processing list.
 *
 * This function can be called with the MCIP bit set (e.g. from the MC#
 * handler) or cleared (from the MCE softirq handler). In the latter
 * case, MC# may re-occur in this function.
 *
 * Parameters:
 *  @cpu:  indicate the CPU where the pending list is
 *  @fn:   the function to handle the deferred MCE; a nonzero return
 *         value commits the telemetry for logging, zero dismisses it
 *  @lmce: indicate which pending list on @cpu is handled
 */
void mctelem_process_deferred(unsigned int cpu,
			      int (*fn)(mctelem_cookie_t),
			      bool lmce)
{
	struct mctelem_ent *tep;
	struct mctelem_ent *head, *prev;
	struct mc_telem_cpu_ctl *mctctl = &per_cpu(mctctl, cpu);
	int ret;

	/*
	 * First, unhook the list of telemetry structures, and
	 * hook it up to the processing list head for this CPU.
	 *
	 * If @lmce is true and a non-local MC# occurs before the
	 * following atomic exchange, @lmce will not hold after
	 * resumption, because all telemetries in @lmce_pending on
	 * @cpu are moved to @pending on @cpu in mcheck_cmn_handler().
	 * In such a case, no telemetries will be handled in this
	 * function after resumption. Another round of MCE softirq,
	 * which was raised by mcheck_cmn_handler() above, will handle
	 * those moved telemetries in @pending on @cpu.
	 *
	 * Any MC# occurring after the following atomic exchange will be
	 * handled by another round of MCE softirq.
	 */
	mctelem_xchg_head(lmce ? &mctctl->lmce_pending : &mctctl->pending,
			  &this_cpu(mctctl.processing), NULL);

	head = this_cpu(mctctl.processing);

	/*
	 * Then, fix up the list to include prev pointers, to make
	 * things a little easier, as the list must be traversed in
	 * chronological order, which is backward from the order they
	 * are in.
	 */
	for (tep = head, prev = NULL; tep != NULL; tep = tep->mcte_next) {
		tep->mcte_prev = prev;
		prev = tep;
	}

	/*
	 * Now walk the list of telemetry structures, handling each
	 * one of them. Unhooking the structure here does not need to
	 * be atomic, as this list is only accessed from a softirq
	 * context; the MCE handler does not touch it.
	 */
	for (tep = prev; tep != NULL; tep = prev) {
		prev = tep->mcte_prev;
		tep->mcte_next = tep->mcte_prev = NULL;

		ret = fn(MCTE2COOKIE(tep));
		if (prev != NULL)
			prev->mcte_next = NULL;
		tep->mcte_prev = tep->mcte_next = NULL;
		if (ret != 0)
			mctelem_commit(MCTE2COOKIE(tep));
		else
			mctelem_dismiss(MCTE2COOKIE(tep));
	}
}

bool mctelem_has_deferred(unsigned int cpu)
{
	return per_cpu(mctctl.pending, cpu) != NULL;
}

bool mctelem_has_deferred_lmce(unsigned int cpu)
{
	return per_cpu(mctctl.lmce_pending, cpu) != NULL;
}

/* Free an entry to its native free list; the entry must not be linked on
 * any list.
 */
static void mctelem_free(struct mctelem_ent *tep)
{
	BUG_ON(tep->mcte_refcnt != 0);
	BUG_ON(MCTE_STATE(tep) != MCTE_F_STATE_FREE);

	tep->mcte_prev = NULL;
	tep->mcte_next = NULL;

	/* set free in array */
	set_bit(tep - mctctl.mctc_elems, mctctl.mctc_free);
}

/* Increment the reference count of an entry that is not linked on to
 * any list and which only the caller has a pointer to.
 */
static void mctelem_hold(struct mctelem_ent *tep)
{
	tep->mcte_refcnt++;
}

/* Increment the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_hold(struct mctelem_ent *tep)
{
	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep != mctctl.mctc_processing_head[which]);
	tep->mcte_refcnt++;
}

/* Decrement the reference count on an entry that is linked at the head of
 * a processing list.  The caller is responsible for locking the list.
 */
static void mctelem_processing_release(struct mctelem_ent *tep)
{
	int which = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep != mctctl.mctc_processing_head[which]);
	if (--tep->mcte_refcnt == 0) {
		MCTE_TRANSITION_STATE(tep, PROCESSING, FREE);
		mctctl.mctc_processing_head[which] = tep->mcte_next;
		mctelem_free(tep);
	}
}

void __init mctelem_init(unsigned int datasz)
{
	char *datarr;
	unsigned int i;

	BUILD_BUG_ON(MC_URGENT != 0 || MC_NONURGENT != 1 || MC_NCLASSES != 2);

	datasz = (datasz & ~0xf) + 0x10;	/* 16 byte roundup (pads even when already aligned) */

	if ((mctctl.mctc_elems = xmalloc_array(struct mctelem_ent,
	    MC_NENT)) == NULL ||
	    (datarr = xmalloc_bytes(MC_NENT * datasz)) == NULL) {
		xfree(mctctl.mctc_elems);
		printk("Allocations for MCA telemetry failed\n");
		return;
	}

	for (i = 0; i < MC_NENT; i++) {
		struct mctelem_ent *tep;

		tep = mctctl.mctc_elems + i;
		tep->mcte_flags = MCTE_F_STATE_FREE;
		tep->mcte_refcnt = 0;
		tep->mcte_data = datarr + i * datasz;

		__set_bit(i, mctctl.mctc_free);
		tep->mcte_next = NULL;
		tep->mcte_prev = NULL;
	}
}
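
/*
 * The MCE initialisation code is expected to call this once at boot
 * with the size of the per-event payload, e.g. (illustratively)
 * mctelem_init(sizeof(struct mc_info)).
 */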

/* incremented non-atomically when reserve fails */
static int mctelem_drop_count;

/* Reserve a telemetry entry, or return NULL if none available.
 * If we return an entry then the caller must subsequently call exactly one of
 * mctelem_dismiss or mctelem_commit for that entry.
 */
mctelem_cookie_t mctelem_reserve(mctelem_class_t which)
{
	unsigned bit;
	unsigned start_bit = (which == MC_URGENT) ? 0 : MC_URGENT_NENT;

	for (;;) {
		bit = find_next_bit(mctctl.mctc_free, MC_NENT, start_bit);

		if (bit >= MC_NENT) {
			mctelem_drop_count++;
			return NULL;
		}

		/* try to allocate, atomically clear free bit */
		if (test_and_clear_bit(bit, mctctl.mctc_free)) {
			/* return element we got */
			struct mctelem_ent *tep = mctctl.mctc_elems + bit;

			mctelem_hold(tep);
			MCTE_TRANSITION_STATE(tep, FREE, UNCOMMITTED);
			tep->mcte_next = NULL;
			tep->mcte_prev = NULL;
			if (which == MC_URGENT)
				MCTE_SET_CLASS(tep, URGENT);
			else
				MCTE_SET_CLASS(tep, NONURGENT);
			return MCTE2COOKIE(tep);
		}
	}
}
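
/*
 * Typical producer sequence (an illustrative sketch; the actual
 * callers live in the MCE handling code):
 *
 *	mctelem_cookie_t c = mctelem_reserve(MC_URGENT);
 *	if (c != NULL) {
 *		... fill in the payload at mctelem_dataptr(c) ...
 *		mctelem_commit(c);	(or mctelem_dismiss(c) on error)
 *	}
 *
 * Note that an urgent reservation searches the free bitmap from bit 0
 * and so may fall back to a nonurgent slot, while a nonurgent
 * reservation only searches from MC_URGENT_NENT upward.
 */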

void *mctelem_dataptr(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	return tep->mcte_data;
}

/* Release a previously reserved entry back to the freelist without
 * submitting it for logging.  The entry must not be linked on to any
 * list - that's how mctelem_reserve handed it out.
 */
void mctelem_dismiss(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	tep->mcte_refcnt--;
	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, FREE);
	mctelem_free(tep);
}

/* Commit an entry with completed telemetry for logging.  The caller must
 * not reference the entry after this call.  Note that we add entries
 * at the head of the committed list, so that list has entries in
 * reverse chronological order.
 */
void mctelem_commit(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);
	mctelem_class_t target = MCTE_CLASS(tep) == MCTE_F_CLASS_URGENT ?
	    MC_URGENT : MC_NONURGENT;

	BUG_ON(tep->mcte_next != NULL || tep->mcte_prev != NULL);
	MCTE_TRANSITION_STATE(tep, UNCOMMITTED, COMMITTED);

	mctelem_xchg_head(&mctctl.mctc_committed[target], &tep->mcte_prev, tep);
}

/* Move telemetry from the committed list to the processing list,
 * reversing the list into chronological order.  The processing list has
 * been locked by the caller, and may be non-empty.  We append the
 * reversed committed list on to the tail of the processing list.
 * The committed list may grow even while we run, so use an atomic
 * exchange to swap NULL into the committed list head.
 *
 * Note that "chronological order" means the order in which producers
 * won additions to the processing list, which may not reflect the
 * strict chronological order of the associated events if events are
 * closely spaced in time and contend for the processing list at once.
 */

static struct mctelem_ent *dangling[MC_NCLASSES];

static void mctelem_append_processing(mctelem_class_t which)
{
	mctelem_class_t target = which == MC_URGENT ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent **commlp = &mctctl.mctc_committed[target];
	struct mctelem_ent **proclhp = &mctctl.mctc_processing_head[target];
	struct mctelem_ent **procltp = &mctctl.mctc_processing_tail[target];
	struct mctelem_ent *tep, *ltep;

	/* Check for an empty list; no race since we hold the processing lock */
	if (*commlp == NULL)
		return;

	/* Atomically unlink the committed list, and keep a pointer to
	 * the list we unlink in a well-known location so it can be
	 * picked up in panic code should we panic between this unlink
	 * and the append to the processing list. */
	mctelem_xchg_head(commlp, &dangling[target], NULL);

	if (dangling[target] == NULL)
		return;

	/* Traverse the list following the previous pointers (reverse
	 * chronological order).  For each entry fill in the next pointer
	 * and transition the element state. */
	for (tep = dangling[target], ltep = NULL; tep != NULL;
	    tep = tep->mcte_prev) {
		MCTE_TRANSITION_STATE(tep, COMMITTED, PROCESSING);
		tep->mcte_next = ltep;
		ltep = tep;
	}

	/* ltep points to the head of a chronologically ordered linked
	 * list of telemetry entries ending at the most recent entry
	 * dangling[target] if mcte_next is followed; tack this on to
	 * the processing list.
	 */
	if (*proclhp == NULL) {
		*proclhp = ltep;
		*procltp = dangling[target];
	} else {
		(*procltp)->mcte_next = ltep;
		ltep->mcte_prev = *procltp;
		*procltp = dangling[target];
	}
	wmb();
	dangling[target] = NULL;
	wmb();
}

mctelem_cookie_t mctelem_consume_oldest_begin(mctelem_class_t which)
{
	mctelem_class_t target = (which == MC_URGENT) ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent *tep;

	spin_lock(&processing_lock);
	mctelem_append_processing(target);
	if ((tep = mctctl.mctc_processing_head[target]) == NULL) {
		spin_unlock(&processing_lock);
		return NULL;
	}

	mctelem_processing_hold(tep);
	wmb();
	spin_unlock(&processing_lock);
	return MCTE2COOKIE(tep);
}

void mctelem_consume_oldest_end(mctelem_cookie_t cookie)
{
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	spin_lock(&processing_lock);
	mctelem_processing_release(tep);
	wmb();
	spin_unlock(&processing_lock);
}

void mctelem_ack(mctelem_class_t which, mctelem_cookie_t cookie)
{
	mctelem_class_t target = (which == MC_URGENT) ?
	    MC_URGENT : MC_NONURGENT;
	struct mctelem_ent *tep = COOKIE2MCTE(cookie);

	if (tep == NULL)
		return;

	spin_lock(&processing_lock);
	if (tep == mctctl.mctc_processing_head[target])
		mctelem_processing_release(tep);
	wmb();
	spin_unlock(&processing_lock);
}
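
/*
 * A plausible consumer sequence (an illustrative sketch; the real
 * consumer is the fetch/ack path in the MCE code):
 *
 *	mctelem_cookie_t c = mctelem_consume_oldest_begin(MC_URGENT);
 *	if (c != NULL) {
 *		... copy out the data at mctelem_dataptr(c) ...
 *		mctelem_consume_oldest_end(c);
 *	}
 *
 * mctelem_consume_oldest_end() drops the reference taken by _begin();
 * mctelem_ack() additionally lets a consumer retire the entry
 * currently at the processing head once it has been logged.
 */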

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * indent-tabs-mode: t
 * tab-width: 8
 * End:
 */