1 // SPDX-License-Identifier: GPL-2.0-only
2 #define pr_fmt(fmt) "SMP alternatives: " fmt
3 
4 #include <linux/module.h>
5 #include <linux/sched.h>
6 #include <linux/perf_event.h>
7 #include <linux/mutex.h>
8 #include <linux/list.h>
9 #include <linux/stringify.h>
10 #include <linux/highmem.h>
11 #include <linux/mm.h>
12 #include <linux/vmalloc.h>
13 #include <linux/memory.h>
14 #include <linux/stop_machine.h>
15 #include <linux/slab.h>
16 #include <linux/kdebug.h>
17 #include <linux/kprobes.h>
18 #include <linux/mmu_context.h>
19 #include <linux/bsearch.h>
20 #include <linux/sync_core.h>
21 #include <asm/text-patching.h>
22 #include <asm/alternative.h>
23 #include <asm/sections.h>
24 #include <asm/mce.h>
25 #include <asm/nmi.h>
26 #include <asm/cacheflush.h>
27 #include <asm/tlbflush.h>
28 #include <asm/insn.h>
29 #include <asm/io.h>
30 #include <asm/fixmap.h>
31 #include <asm/paravirt.h>
32 #include <asm/asm-prototypes.h>
33 
/*
 * Flag exported to GPL modules. Its writer is outside this part of the
 * file -- presumably set once the alternatives have been applied; TODO confirm.
 */
int __read_mostly alternatives_patched;

EXPORT_SYMBOL_GPL(alternatives_patched);

/* Size in bytes of the on-stack buffer apply_alternatives() builds patches in. */
#define MAX_PATCH_LEN (255-1)
39 
/* Non-zero enables the DPRINTK()/DUMP_BYTES() debug output defined below. */
static int __initdata_or_module debug_alternative;

/*
 * Handler for the "debug-alternative" kernel command-line option: switches
 * the debug output on. @str is not looked at. Always returns 1.
 */
static int __init debug_alt(char *str)
{
	debug_alternative = 1;
	return 1;
}
__setup("debug-alternative", debug_alt);
48 
/*
 * Set by the "noreplace-smp" command-line option. The code that reads this
 * flag is outside this part of the file.
 */
static int noreplace_smp;

/*
 * Handler for the "noreplace-smp" kernel command-line option: records that
 * the option was given. @str is not looked at. Always returns 1.
 */
static int __init setup_noreplace_smp(char *str)
{
	noreplace_smp = 1;
	return 1;
}
__setup("noreplace-smp", setup_noreplace_smp);
57 
/*
 * Debug print, active only when debug_alternative is set. The format is
 * wrapped in pr_fmt() and a "\n" is appended here, so callers must not
 * terminate @fmt with a newline themselves.
 */
#define DPRINTK(fmt, args...)						\
do {									\
	if (debug_alternative)						\
		printk(KERN_DEBUG pr_fmt(fmt) "\n", ##args);		\
} while (0)
63 
/*
 * Debug hex dump, active only when debug_alternative is set.
 *
 * Prints the pr_fmt()-wrapped @fmt (with @args) followed by @len bytes of
 * @buf as space-separated two-digit hex values and a final newline. Nothing
 * is printed when @len is zero: the "break" leaves the enclosing
 * do { } while (0).
 *
 * @buf is parenthesized before indexing so that callers can pass a cast
 * expression such as (u8 *)addr without wrapping it themselves.
 * @len is evaluated more than once; pass an expression without side effects.
 */
#define DUMP_BYTES(buf, len, fmt, args...)				\
do {									\
	if (unlikely(debug_alternative)) {				\
		int j;							\
									\
		if (!(len))						\
			break;						\
									\
		printk(KERN_DEBUG pr_fmt(fmt), ##args);			\
		for (j = 0; j < (len) - 1; j++)				\
			printk(KERN_CONT "%02hhx ", (buf)[j]);		\
		printk(KERN_CONT "%02hhx\n", (buf)[j]);			\
	}								\
} while (0)
78 
/*
 * Back-to-back NOP encodings, shortest first. The offsets used by x86_nops[]
 * below imply that BYTES_NOPn expands to exactly n bytes.
 */
static const unsigned char x86nops[] =
{
	BYTES_NOP1,
	BYTES_NOP2,
	BYTES_NOP3,
	BYTES_NOP4,
	BYTES_NOP5,
	BYTES_NOP6,
	BYTES_NOP7,
	BYTES_NOP8,
};
90 
/*
 * x86_nops[n] points at the n-byte NOP inside x86nops[]: the n-byte entry
 * starts after all shorter ones, i.e. at offset 1 + 2 + ... + (n - 1).
 * Index 0 is NULL (there is no zero-length NOP).
 */
const unsigned char * const x86_nops[ASM_NOP_MAX+1] =
{
	NULL,
	x86nops,
	x86nops + 1,
	x86nops + 1 + 2,
	x86nops + 1 + 2 + 3,
	x86nops + 1 + 2 + 3 + 4,
	x86nops + 1 + 2 + 3 + 4 + 5,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6,
	x86nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
};
103 
104 /* Use this to add nops to a buffer, then text_poke the whole buffer. */
add_nops(void * insns,unsigned int len)105 static void __init_or_module add_nops(void *insns, unsigned int len)
106 {
107 	while (len > 0) {
108 		unsigned int noplen = len;
109 		if (noplen > ASM_NOP_MAX)
110 			noplen = ASM_NOP_MAX;
111 		memcpy(insns, x86_nops[noplen], noplen);
112 		insns += noplen;
113 		len -= noplen;
114 	}
115 }
116 
/*
 * Start/end markers of the patch-site tables walked by the apply_*() and
 * alternatives_smp_*() functions in this file. They are defined outside this
 * file -- presumably by the linker script; TODO confirm.
 */
extern s32 __retpoline_sites[], __retpoline_sites_end[];
extern s32 __return_sites[], __return_sites_end[];
extern s32 __cfi_sites[], __cfi_sites_end[];
extern s32 __ibt_endbr_seal[], __ibt_endbr_seal_end[];
extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
/* Writes @len bytes from @opcode to @addr; defined outside this view. */
void text_poke_early(void *addr, const void *opcode, size_t len);
124 
125 /*
126  * Are we looking at a near JMP with a 1 or 4-byte displacement.
127  */
is_jmp(const u8 opcode)128 static inline bool is_jmp(const u8 opcode)
129 {
130 	return opcode == 0xeb || opcode == 0xe9;
131 }
132 
133 static void __init_or_module
recompute_jump(struct alt_instr * a,u8 * orig_insn,u8 * repl_insn,u8 * insn_buff)134 recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insn_buff)
135 {
136 	u8 *next_rip, *tgt_rip;
137 	s32 n_dspl, o_dspl;
138 	int repl_len;
139 
140 	if (a->replacementlen != 5)
141 		return;
142 
143 	o_dspl = *(s32 *)(insn_buff + 1);
144 
145 	/* next_rip of the replacement JMP */
146 	next_rip = repl_insn + a->replacementlen;
147 	/* target rip of the replacement JMP */
148 	tgt_rip  = next_rip + o_dspl;
149 	n_dspl = tgt_rip - orig_insn;
150 
151 	DPRINTK("target RIP: %px, new_displ: 0x%x", tgt_rip, n_dspl);
152 
153 	if (tgt_rip - orig_insn >= 0) {
154 		if (n_dspl - 2 <= 127)
155 			goto two_byte_jmp;
156 		else
157 			goto five_byte_jmp;
158 	/* negative offset */
159 	} else {
160 		if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
161 			goto two_byte_jmp;
162 		else
163 			goto five_byte_jmp;
164 	}
165 
166 two_byte_jmp:
167 	n_dspl -= 2;
168 
169 	insn_buff[0] = 0xeb;
170 	insn_buff[1] = (s8)n_dspl;
171 	add_nops(insn_buff + 2, 3);
172 
173 	repl_len = 2;
174 	goto done;
175 
176 five_byte_jmp:
177 	n_dspl -= 5;
178 
179 	insn_buff[0] = 0xe9;
180 	*(s32 *)&insn_buff[1] = n_dspl;
181 
182 	repl_len = 5;
183 
184 done:
185 
186 	DPRINTK("final displ: 0x%08x, JMP 0x%lx",
187 		n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
188 }
189 
190 /*
191  * optimize_nops_range() - Optimize a sequence of single byte NOPs (0x90)
192  *
193  * @instr: instruction byte stream
194  * @instrlen: length of the above
195  * @off: offset within @instr where the first NOP has been detected
196  *
197  * Return: number of NOPs found (and replaced).
198  */
optimize_nops_range(u8 * instr,u8 instrlen,int off)199 static __always_inline int optimize_nops_range(u8 *instr, u8 instrlen, int off)
200 {
201 	unsigned long flags;
202 	int i = off, nnops;
203 
204 	while (i < instrlen) {
205 		if (instr[i] != 0x90)
206 			break;
207 
208 		i++;
209 	}
210 
211 	nnops = i - off;
212 
213 	if (nnops <= 1)
214 		return nnops;
215 
216 	local_irq_save(flags);
217 	add_nops(instr + off, nnops);
218 	local_irq_restore(flags);
219 
220 	DUMP_BYTES(instr, instrlen, "%px: [%d:%d) optimized NOPs: ", instr, off, i);
221 
222 	return nnops;
223 }
224 
225 /*
226  * "noinline" to cause control flow change and thus invalidate I$ and
227  * cause refetch after modification.
228  */
optimize_nops(u8 * instr,size_t len)229 static void __init_or_module noinline optimize_nops(u8 *instr, size_t len)
230 {
231 	struct insn insn;
232 	int i = 0;
233 
234 	/*
235 	 * Jump over the non-NOP insns and optimize single-byte NOPs into bigger
236 	 * ones.
237 	 */
238 	for (;;) {
239 		if (insn_decode_kernel(&insn, &instr[i]))
240 			return;
241 
242 		/*
243 		 * See if this and any potentially following NOPs can be
244 		 * optimized.
245 		 */
246 		if (insn.length == 1 && insn.opcode.bytes[0] == 0x90)
247 			i += optimize_nops_range(instr, len, i);
248 		else
249 			i += insn.length;
250 
251 		if (i >= len)
252 			return;
253 	}
254 }
255 
256 /*
257  * Replace instructions with better alternatives for this CPU type. This runs
258  * before SMP is initialized to avoid SMP problems with self modifying code.
259  * This implies that asymmetric systems where APs have less capabilities than
260  * the boot processor are not handled. Tough. Make sure you disable such
261  * features by hand.
262  *
263  * Marked "noinline" to cause control flow change and thus insn cache
264  * to refetch changed I$ lines.
265  */
apply_alternatives(struct alt_instr * start,struct alt_instr * end)266 void __init_or_module noinline apply_alternatives(struct alt_instr *start,
267 						  struct alt_instr *end)
268 {
269 	struct alt_instr *a;
270 	u8 *instr, *replacement;
271 	u8 insn_buff[MAX_PATCH_LEN];
272 
273 	DPRINTK("alt table %px, -> %px", start, end);
274 	/*
275 	 * The scan order should be from start to end. A later scanned
276 	 * alternative code can overwrite previously scanned alternative code.
277 	 * Some kernel functions (e.g. memcpy, memset, etc) use this order to
278 	 * patch code.
279 	 *
280 	 * So be careful if you want to change the scan order to any other
281 	 * order.
282 	 */
283 	for (a = start; a < end; a++) {
284 		int insn_buff_sz = 0;
285 
286 		instr = (u8 *)&a->instr_offset + a->instr_offset;
287 		replacement = (u8 *)&a->repl_offset + a->repl_offset;
288 		BUG_ON(a->instrlen > sizeof(insn_buff));
289 		BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
290 
291 		/*
292 		 * Patch if either:
293 		 * - feature is present
294 		 * - feature not present but ALT_FLAG_NOT is set to mean,
295 		 *   patch if feature is *NOT* present.
296 		 */
297 		if (!boot_cpu_has(a->cpuid) == !(a->flags & ALT_FLAG_NOT))
298 			goto next;
299 
300 		DPRINTK("feat: %s%d*32+%d, old: (%pS (%px) len: %d), repl: (%px, len: %d)",
301 			(a->flags & ALT_FLAG_NOT) ? "!" : "",
302 			a->cpuid >> 5,
303 			a->cpuid & 0x1f,
304 			instr, instr, a->instrlen,
305 			replacement, a->replacementlen);
306 
307 		DUMP_BYTES(instr, a->instrlen, "%px:   old_insn: ", instr);
308 		DUMP_BYTES(replacement, a->replacementlen, "%px:   rpl_insn: ", replacement);
309 
310 		memcpy(insn_buff, replacement, a->replacementlen);
311 		insn_buff_sz = a->replacementlen;
312 
313 		/*
314 		 * 0xe8 is a relative jump; fix the offset.
315 		 *
316 		 * Instruction length is checked before the opcode to avoid
317 		 * accessing uninitialized bytes for zero-length replacements.
318 		 */
319 		if (a->replacementlen == 5 && *insn_buff == 0xe8) {
320 			*(s32 *)(insn_buff + 1) += replacement - instr;
321 			DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
322 				*(s32 *)(insn_buff + 1),
323 				(unsigned long)instr + *(s32 *)(insn_buff + 1) + 5);
324 		}
325 
326 		if (a->replacementlen && is_jmp(replacement[0]))
327 			recompute_jump(a, instr, replacement, insn_buff);
328 
329 		for (; insn_buff_sz < a->instrlen; insn_buff_sz++)
330 			insn_buff[insn_buff_sz] = 0x90;
331 
332 		DUMP_BYTES(insn_buff, insn_buff_sz, "%px: final_insn: ", instr);
333 
334 		text_poke_early(instr, insn_buff, insn_buff_sz);
335 
336 next:
337 		optimize_nops(instr, a->instrlen);
338 	}
339 }
340 
is_jcc32(struct insn * insn)341 static inline bool is_jcc32(struct insn *insn)
342 {
343 	/* Jcc.d32 second opcode byte is in the range: 0x80-0x8f */
344 	return insn->opcode.bytes[0] == 0x0f && (insn->opcode.bytes[1] & 0xf0) == 0x80;
345 }
346 
347 #if defined(CONFIG_RETPOLINE) && defined(CONFIG_OBJTOOL)
348 
349 /*
350  * CALL/JMP *%\reg
351  */
emit_indirect(int op,int reg,u8 * bytes)352 static int emit_indirect(int op, int reg, u8 *bytes)
353 {
354 	int i = 0;
355 	u8 modrm;
356 
357 	switch (op) {
358 	case CALL_INSN_OPCODE:
359 		modrm = 0x10; /* Reg = 2; CALL r/m */
360 		break;
361 
362 	case JMP32_INSN_OPCODE:
363 		modrm = 0x20; /* Reg = 4; JMP r/m */
364 		break;
365 
366 	default:
367 		WARN_ON_ONCE(1);
368 		return -1;
369 	}
370 
371 	if (reg >= 8) {
372 		bytes[i++] = 0x41; /* REX.B prefix */
373 		reg -= 8;
374 	}
375 
376 	modrm |= 0xc0; /* Mod = 3 */
377 	modrm += reg;
378 
379 	bytes[i++] = 0xff; /* opcode */
380 	bytes[i++] = modrm;
381 
382 	return i;
383 }
384 
/*
 * Re-target a retpoline thunk CALL/JMP/Jcc at @addr to the matching entry of
 * __x86_indirect_call_thunk_array[] (CALL) or
 * __x86_indirect_jump_thunk_array[] (JMP and Jcc).
 *
 * @addr:  address of the instruction being rewritten
 * @insn:  its decoded form
 * @reg:   index into the thunk arrays
 * @bytes: output buffer
 *
 * Return: number of bytes written, or -1 (with a warning) when the opcode is
 * not CALL, JMP.d32 or Jcc.d32. A one-time warning fires if the result does
 * not have the same length as the original instruction.
 */
static int emit_call_track_retpoline(void *addr, struct insn *insn, int reg, u8 *bytes)
{
	const bool jcc = is_jcc32(insn);
	u8 op = insn->opcode.bytes[0];
	int i = 0;

	if (jcc) {
		/*
		 * Clang does 'weird' Jcc __x86_indirect_thunk_r11 conditional
		 * tail-calls: keep the escape byte and continue with the
		 * second opcode byte.
		 */
		bytes[i++] = op;
		op = insn->opcode.bytes[1];
	} else if (insn->length == 6) {
		bytes[i++] = 0x2e; /* CS-prefix */
	}

	if (jcc || op == JMP32_INSN_OPCODE) {
		__text_gen_insn(bytes+i, op, addr+i,
				__x86_indirect_jump_thunk_array[reg],
				JMP32_INSN_SIZE);
		i += JMP32_INSN_SIZE;
	} else if (op == CALL_INSN_OPCODE) {
		__text_gen_insn(bytes+i, op, addr+i,
				__x86_indirect_call_thunk_array[reg],
				CALL_INSN_SIZE);
		i += CALL_INSN_SIZE;
	} else {
		WARN(1, "%pS %px %*ph\n", addr, addr, 6, addr);
		return -1;
	}

	WARN_ON_ONCE(i != insn->length);

	return i;
}
428 
429 /*
430  * Rewrite the compiler generated retpoline thunk calls.
431  *
432  * For spectre_v2=off (!X86_FEATURE_RETPOLINE), rewrite them into immediate
433  * indirect instructions, avoiding the extra indirection.
434  *
435  * For example, convert:
436  *
437  *   CALL __x86_indirect_thunk_\reg
438  *
439  * into:
440  *
441  *   CALL *%\reg
442  *
443  * It also tries to inline spectre_v2=retpoline,lfence when size permits.
444  */
patch_retpoline(void * addr,struct insn * insn,u8 * bytes)445 static int patch_retpoline(void *addr, struct insn *insn, u8 *bytes)
446 {
447 	retpoline_thunk_t *target;
448 	int reg, ret, i = 0;
449 	u8 op, cc;
450 
451 	target = addr + insn->length + insn->immediate.value;
452 	reg = target - __x86_indirect_thunk_array;
453 
454 	if (WARN_ON_ONCE(reg & ~0xf))
455 		return -1;
456 
457 	/* If anyone ever does: CALL/JMP *%rsp, we're in deep trouble. */
458 	BUG_ON(reg == 4);
459 
460 	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE) &&
461 	    !cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
462 		if (cpu_feature_enabled(X86_FEATURE_CALL_DEPTH))
463 			return emit_call_track_retpoline(addr, insn, reg, bytes);
464 
465 		return -1;
466 	}
467 
468 	op = insn->opcode.bytes[0];
469 
470 	/*
471 	 * Convert:
472 	 *
473 	 *   Jcc.d32 __x86_indirect_thunk_\reg
474 	 *
475 	 * into:
476 	 *
477 	 *   Jncc.d8 1f
478 	 *   [ LFENCE ]
479 	 *   JMP *%\reg
480 	 *   [ NOP ]
481 	 * 1:
482 	 */
483 	if (is_jcc32(insn)) {
484 		cc = insn->opcode.bytes[1] & 0xf;
485 		cc ^= 1; /* invert condition */
486 
487 		bytes[i++] = 0x70 + cc;        /* Jcc.d8 */
488 		bytes[i++] = insn->length - 2; /* sizeof(Jcc.d8) == 2 */
489 
490 		/* Continue as if: JMP.d32 __x86_indirect_thunk_\reg */
491 		op = JMP32_INSN_OPCODE;
492 	}
493 
494 	/*
495 	 * For RETPOLINE_LFENCE: prepend the indirect CALL/JMP with an LFENCE.
496 	 */
497 	if (cpu_feature_enabled(X86_FEATURE_RETPOLINE_LFENCE)) {
498 		bytes[i++] = 0x0f;
499 		bytes[i++] = 0xae;
500 		bytes[i++] = 0xe8; /* LFENCE */
501 	}
502 
503 	ret = emit_indirect(op, reg, bytes + i);
504 	if (ret < 0)
505 		return ret;
506 	i += ret;
507 
508 	/*
509 	 * The compiler is supposed to EMIT an INT3 after every unconditional
510 	 * JMP instruction due to AMD BTC. However, if the compiler is too old
511 	 * or SLS isn't enabled, we still need an INT3 after indirect JMPs
512 	 * even on Intel.
513 	 */
514 	if (op == JMP32_INSN_OPCODE && i < insn->length)
515 		bytes[i++] = INT3_INSN_OPCODE;
516 
517 	for (; i < insn->length;)
518 		bytes[i++] = BYTES_NOP1;
519 
520 	return i;
521 }
522 
523 /*
524  * Generated by 'objtool --retpoline'.
525  */
apply_retpolines(s32 * start,s32 * end)526 void __init_or_module noinline apply_retpolines(s32 *start, s32 *end)
527 {
528 	s32 *s;
529 
530 	for (s = start; s < end; s++) {
531 		void *addr = (void *)s + *s;
532 		struct insn insn;
533 		int len, ret;
534 		u8 bytes[16];
535 		u8 op1, op2;
536 
537 		ret = insn_decode_kernel(&insn, addr);
538 		if (WARN_ON_ONCE(ret < 0))
539 			continue;
540 
541 		op1 = insn.opcode.bytes[0];
542 		op2 = insn.opcode.bytes[1];
543 
544 		switch (op1) {
545 		case CALL_INSN_OPCODE:
546 		case JMP32_INSN_OPCODE:
547 			break;
548 
549 		case 0x0f: /* escape */
550 			if (op2 >= 0x80 && op2 <= 0x8f)
551 				break;
552 			fallthrough;
553 		default:
554 			WARN_ON_ONCE(1);
555 			continue;
556 		}
557 
558 		DPRINTK("retpoline at: %pS (%px) len: %d to: %pS",
559 			addr, addr, insn.length,
560 			addr + insn.length + insn.immediate.value);
561 
562 		len = patch_retpoline(addr, &insn, bytes);
563 		if (len == insn.length) {
564 			optimize_nops(bytes, len);
565 			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
566 			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
567 			text_poke_early(addr, bytes, len);
568 		}
569 	}
570 }
571 
572 #ifdef CONFIG_RETHUNK
573 
#ifdef CONFIG_CALL_THUNKS
/*
 * Return thunk that patch_return() redirects return sites to. Starts out as
 * __x86_return_thunk; patch_return() leaves sites alone while that is still
 * the case.
 */
void (*x86_return_thunk)(void) __ro_after_init = &__x86_return_thunk;
#endif
577 
578 /*
579  * Rewrite the compiler generated return thunk tail-calls.
580  *
581  * For example, convert:
582  *
583  *   JMP __x86_return_thunk
584  *
585  * into:
586  *
587  *   RET
588  */
patch_return(void * addr,struct insn * insn,u8 * bytes)589 static int patch_return(void *addr, struct insn *insn, u8 *bytes)
590 {
591 	int i = 0;
592 
593 	if (cpu_feature_enabled(X86_FEATURE_RETHUNK)) {
594 		if (x86_return_thunk == __x86_return_thunk)
595 			return -1;
596 
597 		i = JMP32_INSN_SIZE;
598 		__text_gen_insn(bytes, JMP32_INSN_OPCODE, addr, x86_return_thunk, i);
599 	} else {
600 		bytes[i++] = RET_INSN_OPCODE;
601 	}
602 
603 	for (; i < insn->length;)
604 		bytes[i++] = INT3_INSN_OPCODE;
605 	return i;
606 }
607 
apply_returns(s32 * start,s32 * end)608 void __init_or_module noinline apply_returns(s32 *start, s32 *end)
609 {
610 	s32 *s;
611 
612 	for (s = start; s < end; s++) {
613 		void *dest = NULL, *addr = (void *)s + *s;
614 		struct insn insn;
615 		int len, ret;
616 		u8 bytes[16];
617 		u8 op;
618 
619 		ret = insn_decode_kernel(&insn, addr);
620 		if (WARN_ON_ONCE(ret < 0))
621 			continue;
622 
623 		op = insn.opcode.bytes[0];
624 		if (op == JMP32_INSN_OPCODE)
625 			dest = addr + insn.length + insn.immediate.value;
626 
627 		if (__static_call_fixup(addr, op, dest) ||
628 		    WARN_ONCE(dest != &__x86_return_thunk,
629 			      "missing return thunk: %pS-%pS: %*ph",
630 			      addr, dest, 5, addr))
631 			continue;
632 
633 		DPRINTK("return thunk at: %pS (%px) len: %d to: %pS",
634 			addr, addr, insn.length,
635 			addr + insn.length + insn.immediate.value);
636 
637 		len = patch_return(addr, &insn, bytes);
638 		if (len == insn.length) {
639 			DUMP_BYTES(((u8*)addr),  len, "%px: orig: ", addr);
640 			DUMP_BYTES(((u8*)bytes), len, "%px: repl: ", addr);
641 			text_poke_early(addr, bytes, len);
642 		}
643 	}
644 }
645 #else
/* !CONFIG_RETHUNK: there are no return thunk sites to patch. */
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
647 #endif /* CONFIG_RETHUNK */
648 
649 #else /* !CONFIG_RETPOLINE || !CONFIG_OBJTOOL */
650 
/* Without CONFIG_RETPOLINE and CONFIG_OBJTOOL both patchers are no-ops. */
void __init_or_module noinline apply_retpolines(s32 *start, s32 *end) { }
void __init_or_module noinline apply_returns(s32 *start, s32 *end) { }
653 
654 #endif /* CONFIG_RETPOLINE && CONFIG_OBJTOOL */
655 
656 #ifdef CONFIG_X86_KERNEL_IBT
657 
/*
 * Overwrite the 4-byte ENDBR instruction at @addr with the value returned by
 * gen_endbr_poison().
 *
 * @addr: location expected to hold an ENDBR
 * @warn: when true, warn once if @addr does not hold an ENDBR
 *
 * Nothing is written when @addr cannot be read (one-time warning) or does
 * not hold an ENDBR.
 */
static void poison_endbr(void *addr, bool warn)
{
	u32 poison = gen_endbr_poison();
	u32 endbr;

	if (WARN_ON_ONCE(get_kernel_nofault(endbr, addr)))
		return;

	if (!is_endbr(endbr)) {
		WARN_ON_ONCE(warn);
		return;
	}

	DPRINTK("ENDBR at: %pS (%px)", addr, addr);

	/*
	 * When we have IBT, the lack of ENDBR will trigger #CP
	 */
	DUMP_BYTES(((u8*)addr), 4, "%px: orig: ", addr);
	DUMP_BYTES(((u8*)&poison), 4, "%px: repl: ", addr);
	text_poke_early(addr, &poison, 4);
}
679 
680 /*
681  * Generated by: objtool --ibt
682  */
apply_ibt_endbr(s32 * start,s32 * end)683 void __init_or_module noinline apply_ibt_endbr(s32 *start, s32 *end)
684 {
685 	s32 *s;
686 
687 	for (s = start; s < end; s++) {
688 		void *addr = (void *)s + *s;
689 
690 		poison_endbr(addr, true);
691 		if (IS_ENABLED(CONFIG_FINEIBT))
692 			poison_endbr(addr - 16, false);
693 	}
694 }
695 
696 #else
697 
/* !CONFIG_X86_KERNEL_IBT: nothing to seal. */
void __init_or_module apply_ibt_endbr(s32 *start, s32 *end) { }
699 
700 #endif /* CONFIG_X86_KERNEL_IBT */
701 
702 #ifdef CONFIG_FINEIBT
703 
/* CFI scheme selection; see cfi_parse_cmdline() and __apply_fineibt(). */
enum cfi_mode {
	CFI_DEFAULT,	/* "auto": resolved to kCFI or FineIBT in __apply_fineibt() */
	CFI_OFF,	/* "off": callers stay disabled, nothing is re-enabled */
	CFI_KCFI,	/* "kcfi": re-enable the kCFI caller checks */
	CFI_FINEIBT,	/* "fineibt": rewrite preambles and callers to FineIBT */
};

static enum cfi_mode cfi_mode __ro_after_init = CFI_DEFAULT;
/* Re-hash the CFI hashes with cfi_seed; cleared by "off" and "norand". */
static bool cfi_rand __ro_after_init = true;
/* XOR-ed into every hash by cfi_rehash(); set in __apply_fineibt(). */
static u32  cfi_seed __ro_after_init;
714 
715 /*
716  * Re-hash the CFI hash with a boot-time seed while making sure the result is
717  * not a valid ENDBR instruction.
718  */
cfi_rehash(u32 hash)719 static u32 cfi_rehash(u32 hash)
720 {
721 	hash ^= cfi_seed;
722 	while (unlikely(is_endbr(hash) || is_endbr(-hash))) {
723 		bool lsb = hash & 1;
724 		hash >>= 1;
725 		if (lsb)
726 			hash ^= 0x80200003;
727 	}
728 	return hash;
729 }
730 
cfi_parse_cmdline(char * str)731 static __init int cfi_parse_cmdline(char *str)
732 {
733 	if (!str)
734 		return -EINVAL;
735 
736 	while (str) {
737 		char *next = strchr(str, ',');
738 		if (next) {
739 			*next = 0;
740 			next++;
741 		}
742 
743 		if (!strcmp(str, "auto")) {
744 			cfi_mode = CFI_DEFAULT;
745 		} else if (!strcmp(str, "off")) {
746 			cfi_mode = CFI_OFF;
747 			cfi_rand = false;
748 		} else if (!strcmp(str, "kcfi")) {
749 			cfi_mode = CFI_KCFI;
750 		} else if (!strcmp(str, "fineibt")) {
751 			cfi_mode = CFI_FINEIBT;
752 		} else if (!strcmp(str, "norand")) {
753 			cfi_rand = false;
754 		} else {
755 			pr_err("Ignoring unknown cfi option (%s).", str);
756 		}
757 
758 		str = next;
759 	}
760 
761 	return 0;
762 }
763 early_param("cfi", cfi_parse_cmdline);
764 
765 /*
766  * kCFI						FineIBT
767  *
768  * __cfi_\func:					__cfi_\func:
769  *	movl   $0x12345678,%eax		// 5	     endbr64			// 4
770  *	nop					     subl   $0x12345678,%r10d   // 7
771  *	nop					     jz     1f			// 2
772  *	nop					     ud2			// 2
773  *	nop					1:   nop			// 1
774  *	nop
775  *	nop
776  *	nop
777  *	nop
778  *	nop
779  *	nop
780  *	nop
781  *
782  *
783  * caller:					caller:
784  *	movl	$(-0x12345678),%r10d	 // 6	     movl   $0x12345678,%r10d	// 6
785  *	addl	$-15(%r11),%r10d	 // 4	     sub    $16,%r11		// 4
786  *	je	1f			 // 2	     nop4			// 4
787  *	ud2				 // 2
788  * 1:	call	__x86_indirect_thunk_r11 // 5	     call   *%r11; nop2;	// 5
789  *
790  */
791 
/*
 * Template of the FineIBT function preamble, assembled into .rodata and
 * copied over each __cfi_ stub by cfi_rewrite_preamble(). 0x12345678 is a
 * placeholder for the hash immediate, which is patched afterwards.
 */
asm(	".pushsection .rodata			\n"
	"fineibt_preamble_start:		\n"
	"	endbr64				\n"
	"	subl	$0x12345678, %r10d	\n"
	"	je	fineibt_preamble_end	\n"
	"	ud2				\n"
	"	nop				\n"
	"fineibt_preamble_end:			\n"
	".popsection\n"
);

extern u8 fineibt_preamble_start[];
extern u8 fineibt_preamble_end[];

/* Length of the template in bytes; __apply_fineibt() insists on 16. */
#define fineibt_preamble_size (fineibt_preamble_end - fineibt_preamble_start)
/* Byte offset of the 32-bit hash placeholder within the template. */
#define fineibt_preamble_hash 7
808 
/*
 * Template of the FineIBT caller sequence, assembled into .rodata and copied
 * over the bytes preceding each indirect call site by cfi_rewrite_callers().
 * 0x12345678 is a placeholder for the hash immediate, which is patched
 * afterwards.
 */
asm(	".pushsection .rodata			\n"
	"fineibt_caller_start:			\n"
	"	movl	$0x12345678, %r10d	\n"
	"	sub	$16, %r11		\n"
	ASM_NOP4
	"fineibt_caller_end:			\n"
	".popsection				\n"
);

extern u8 fineibt_caller_start[];
extern u8 fineibt_caller_end[];

/* Length of the template in bytes. */
#define fineibt_caller_size (fineibt_caller_end - fineibt_caller_start)
/* Byte offset of the 32-bit hash placeholder within the template. */
#define fineibt_caller_hash 2

/* Displacement of a 2-byte JMP.d8 that skips the rest of the caller sequence. */
#define fineibt_caller_jmp (fineibt_caller_size - 2)
825 
decode_preamble_hash(void * addr)826 static u32 decode_preamble_hash(void *addr)
827 {
828 	u8 *p = addr;
829 
830 	/* b8 78 56 34 12          mov    $0x12345678,%eax */
831 	if (p[0] == 0xb8)
832 		return *(u32 *)(addr + 1);
833 
834 	return 0; /* invalid hash value */
835 }
836 
decode_caller_hash(void * addr)837 static u32 decode_caller_hash(void *addr)
838 {
839 	u8 *p = addr;
840 
841 	/* 41 ba 78 56 34 12       mov    $0x12345678,%r10d */
842 	if (p[0] == 0x41 && p[1] == 0xba)
843 		return -*(u32 *)(addr + 2);
844 
845 	/* e8 0c 78 56 34 12	   jmp.d8  +12 */
846 	if (p[0] == JMP8_INSN_OPCODE && p[1] == fineibt_caller_jmp)
847 		return -*(u32 *)(addr + 2);
848 
849 	return 0; /* invalid hash value */
850 }
851 
852 /* .retpoline_sites */
cfi_disable_callers(s32 * start,s32 * end)853 static int cfi_disable_callers(s32 *start, s32 *end)
854 {
855 	/*
856 	 * Disable kCFI by patching in a JMP.d8, this leaves the hash immediate
857 	 * in tact for later usage. Also see decode_caller_hash() and
858 	 * cfi_rewrite_callers().
859 	 */
860 	const u8 jmp[] = { JMP8_INSN_OPCODE, fineibt_caller_jmp };
861 	s32 *s;
862 
863 	for (s = start; s < end; s++) {
864 		void *addr = (void *)s + *s;
865 		u32 hash;
866 
867 		addr -= fineibt_caller_size;
868 		hash = decode_caller_hash(addr);
869 		if (!hash) /* nocfi callers */
870 			continue;
871 
872 		text_poke_early(addr, jmp, 2);
873 	}
874 
875 	return 0;
876 }
877 
/*
 * Re-enable kCFI at every call site in [@start, @end): undo what
 * cfi_disable_callers() did by restoring the first two bytes (41 ba) of the
 * "mov $hash,%r10d" in the caller sequence.
 *
 * Sites without a decodable hash (nocfi callers) are left untouched.
 *
 * Return: always 0.
 */
static int cfi_enable_callers(s32 *start, s32 *end)
{
	const u8 mov[] = { 0x41, 0xba };
	s32 *s;

	for (s = start; s < end; s++) {
		/* The caller sequence sits right in front of the call site. */
		void *addr = (void *)s + *s - fineibt_caller_size;

		if (decode_caller_hash(addr))
			text_poke_early(addr, mov, 2);
	}

	return 0;
}
900 
901 /* .cfi_sites */
cfi_rand_preamble(s32 * start,s32 * end)902 static int cfi_rand_preamble(s32 *start, s32 *end)
903 {
904 	s32 *s;
905 
906 	for (s = start; s < end; s++) {
907 		void *addr = (void *)s + *s;
908 		u32 hash;
909 
910 		hash = decode_preamble_hash(addr);
911 		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
912 			 addr, addr, 5, addr))
913 			return -EINVAL;
914 
915 		hash = cfi_rehash(hash);
916 		text_poke_early(addr + 1, &hash, 4);
917 	}
918 
919 	return 0;
920 }
921 
/*
 * Turn every kCFI preamble listed in [@start, @end) into a FineIBT preamble:
 * copy the fineibt_preamble template over it, then patch the preamble's own
 * hash over the template's placeholder.
 *
 * Return: 0 on success, -EINVAL (with a warning) at the first preamble that
 * carries no hash; preambles before it have already been rewritten.
 */
static int cfi_rewrite_preamble(s32 *start, s32 *end)
{
	s32 *site;

	for (site = start; site < end; site++) {
		void *addr = (void *)site + *site;
		void *hash_slot = addr + fineibt_preamble_hash;
		u32 hash = decode_preamble_hash(addr);

		if (WARN(!hash, "no CFI hash found at: %pS %px %*ph\n",
			 addr, addr, 5, addr))
			return -EINVAL;

		text_poke_early(addr, fineibt_preamble_start, fineibt_preamble_size);
		/* Sanity check: the placeholder must be where we expect it. */
		WARN_ON(*(u32 *)hash_slot != 0x12345678);
		text_poke_early(hash_slot, &hash, 4);
	}

	return 0;
}
942 
943 /* .retpoline_sites */
cfi_rand_callers(s32 * start,s32 * end)944 static int cfi_rand_callers(s32 *start, s32 *end)
945 {
946 	s32 *s;
947 
948 	for (s = start; s < end; s++) {
949 		void *addr = (void *)s + *s;
950 		u32 hash;
951 
952 		addr -= fineibt_caller_size;
953 		hash = decode_caller_hash(addr);
954 		if (hash) {
955 			hash = -cfi_rehash(hash);
956 			text_poke_early(addr + 2, &hash, 4);
957 		}
958 	}
959 
960 	return 0;
961 }
962 
/*
 * Turn the caller sequence of every call site in [@start, @end) into the
 * FineIBT form: copy the fineibt_caller template over it, then patch the
 * site's hash over the template's placeholder. Sites without a decodable
 * hash are left untouched. The indirect call itself is left to
 * apply_retpolines().
 *
 * Return: always 0.
 */
static int cfi_rewrite_callers(s32 *start, s32 *end)
{
	s32 *site;

	for (site = start; site < end; site++) {
		void *addr = (void *)site + *site - fineibt_caller_size;
		u32 hash = decode_caller_hash(addr);

		if (!hash)
			continue;

		text_poke_early(addr, fineibt_caller_start, fineibt_caller_size);
		/* Sanity check: the placeholder must be where we expect it. */
		WARN_ON(*(u32 *)(addr + fineibt_caller_hash) != 0x12345678);
		text_poke_early(addr + fineibt_caller_hash, &hash, 4);
	}

	return 0;
}
983 
/*
 * Select and install the CFI scheme for one set of patch-site tables.
 *
 * @start_retpoline, @end_retpoline: call sites (.retpoline_sites)
 * @start_cfi, @end_cfi:             function preambles (.cfi_sites)
 * @builtin: when true, pick a fresh random cfi_seed and announce the chosen
 *           scheme; when false, the existing seed is reused silently
 *
 * Steps:
 *  1. resolve CFI_DEFAULT to FineIBT when kernel IBT is available and
 *     enabled, else to kCFI;
 *  2. disable CFI at all callers;
 *  3. if randomization is on, re-hash preambles and callers;
 *  4. re-enable kCFI, rewrite everything to FineIBT, or leave CFI off,
 *     according to cfi_mode.
 *
 * Any failure after step 2 leaves the affected code without CFI and is
 * reported with pr_err().
 */
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
			    s32 *start_cfi, s32 *end_cfi, bool builtin)
{
	int ret;

	if (WARN_ONCE(fineibt_preamble_size != 16,
		      "FineIBT preamble wrong size: %ld", fineibt_preamble_size))
		return;

	if (cfi_mode == CFI_DEFAULT) {
		if (HAS_KERNEL_IBT && cpu_feature_enabled(X86_FEATURE_IBT))
			cfi_mode = CFI_FINEIBT;
		else
			cfi_mode = CFI_KCFI;
	}

	/*
	 * Rewrite the callers to not use the __cfi_ stubs, such that we might
	 * rewrite them. This disables all CFI. If this succeeds but any of the
	 * later stages fails, we're without CFI.
	 */
	ret = cfi_disable_callers(start_retpoline, end_retpoline);
	if (ret)
		goto err;

	if (cfi_rand) {
		if (builtin)
			cfi_seed = get_random_u32();

		ret = cfi_rand_preamble(start_cfi, end_cfi);
		if (!ret)
			ret = cfi_rand_callers(start_retpoline, end_retpoline);
		if (ret)
			goto err;
	}

	switch (cfi_mode) {
	case CFI_OFF:
		if (builtin)
			pr_info("Disabling CFI\n");
		return;

	case CFI_KCFI:
		if (cfi_enable_callers(start_retpoline, end_retpoline))
			goto err;

		if (builtin)
			pr_info("Using kCFI\n");
		return;

	case CFI_FINEIBT:
		if (cfi_rewrite_preamble(start_cfi, end_cfi))
			goto err;

		if (cfi_rewrite_callers(start_retpoline, end_retpoline))
			goto err;

		if (builtin)
			pr_info("Using FineIBT CFI\n");
		return;

	default:
		/* Unexpected mode: fall out to the error report. */
		break;
	}

err:
	pr_err("Something went horribly wrong trying to rewrite the CFI implementation.\n");
}
1056 
1057 #else
1058 
/* !CONFIG_FINEIBT: there is no CFI scheme to rewrite. */
static void __apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
			    s32 *start_cfi, s32 *end_cfi, bool builtin)
{
}
1063 
1064 #endif
1065 
/*
 * Apply the CFI scheme to the given call-site (.retpoline_sites) and
 * preamble (.cfi_sites) tables with builtin == false, i.e. without picking a
 * new seed and without announcing the scheme.
 */
void apply_fineibt(s32 *start_retpoline, s32 *end_retpoline,
		   s32 *start_cfi, s32 *end_cfi)
{
	/* Plain call: a void function must not "return" an expression. */
	__apply_fineibt(start_retpoline, end_retpoline,
			start_cfi, end_cfi,
			/* .builtin = */ false);
}
1073 
1074 #ifdef CONFIG_SMP
/*
 * Switch the lock-prefix sites listed in [@start, @end) to SMP operation:
 * every site inside [@text, @text_end) that currently holds a DS segment
 * override prefix (0x3e) gets a lock prefix (0xf0) instead.
 *
 * Each s32 entry is the offset from the entry's own address to the prefix
 * byte; zero entries are skipped.
 */
static void alternatives_smp_lock(const s32 *start, const s32 *end,
				  u8 *text, u8 *text_end)
{
	u8 lock_prefix = 0xf0;
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		/* The range check comes first so *ptr is only read inside the text. */
		if (*poff && ptr >= text && ptr < text_end && *ptr == 0x3e)
			text_poke(ptr, &lock_prefix, 1);
	}
}
1090 
/*
 * Switch the lock-prefix sites listed in [@start, @end) to UP operation:
 * every site inside [@text, @text_end) that currently holds a lock prefix
 * (0xf0) gets a DS segment override prefix (0x3e) instead.
 *
 * Each s32 entry is the offset from the entry's own address to the prefix
 * byte; zero entries are skipped.
 */
static void alternatives_smp_unlock(const s32 *start, const s32 *end,
				    u8 *text, u8 *text_end)
{
	u8 ds_prefix = 0x3E;
	const s32 *poff;

	for (poff = start; poff < end; poff++) {
		u8 *ptr = (u8 *)poff + *poff;

		/* The range check comes first so *ptr is only read inside the text. */
		if (*poff && ptr >= text && ptr < text_end && *ptr == 0xf0)
			text_poke(ptr, &ds_prefix, 1);
	}
}
1106 
/*
 * Bookkeeping for one piece of code whose lock prefixes were patched out, so
 * that alternatives_enable_smp() can put them back later.
 */
struct smp_alt_module {
	/* owner; used by alternatives_smp_module_del() to find the entry */
	struct module	*mod;
	/* only used in the debug print of alternatives_smp_module_add() */
	char		*name;

	/* ptrs to lock prefixes */
	const s32	*locks;
	const s32	*locks_end;

	/* .text segment, needed to avoid patching init code ;) */
	u8		*text;
	u8		*text_end;

	/* link in smp_alt_modules */
	struct list_head next;
};
static LIST_HEAD(smp_alt_modules);
/* True while the lock prefixes are patched out (UP code in use). */
static bool uniproc_patched = false;	/* protected by text_mutex */
1124 
alternatives_smp_module_add(struct module * mod,char * name,void * locks,void * locks_end,void * text,void * text_end)1125 void __init_or_module alternatives_smp_module_add(struct module *mod,
1126 						  char *name,
1127 						  void *locks, void *locks_end,
1128 						  void *text,  void *text_end)
1129 {
1130 	struct smp_alt_module *smp;
1131 
1132 	mutex_lock(&text_mutex);
1133 	if (!uniproc_patched)
1134 		goto unlock;
1135 
1136 	if (num_possible_cpus() == 1)
1137 		/* Don't bother remembering, we'll never have to undo it. */
1138 		goto smp_unlock;
1139 
1140 	smp = kzalloc(sizeof(*smp), GFP_KERNEL);
1141 	if (NULL == smp)
1142 		/* we'll run the (safe but slow) SMP code then ... */
1143 		goto unlock;
1144 
1145 	smp->mod	= mod;
1146 	smp->name	= name;
1147 	smp->locks	= locks;
1148 	smp->locks_end	= locks_end;
1149 	smp->text	= text;
1150 	smp->text_end	= text_end;
1151 	DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
1152 		smp->locks, smp->locks_end,
1153 		smp->text, smp->text_end, smp->name);
1154 
1155 	list_add_tail(&smp->next, &smp_alt_modules);
1156 smp_unlock:
1157 	alternatives_smp_unlock(locks, locks_end, text, text_end);
1158 unlock:
1159 	mutex_unlock(&text_mutex);
1160 }
1161 
alternatives_smp_module_del(struct module * mod)1162 void __init_or_module alternatives_smp_module_del(struct module *mod)
1163 {
1164 	struct smp_alt_module *item;
1165 
1166 	mutex_lock(&text_mutex);
1167 	list_for_each_entry(item, &smp_alt_modules, next) {
1168 		if (mod != item->mod)
1169 			continue;
1170 		list_del(&item->next);
1171 		kfree(item);
1172 		break;
1173 	}
1174 	mutex_unlock(&text_mutex);
1175 }
1176 
alternatives_enable_smp(void)1177 void alternatives_enable_smp(void)
1178 {
1179 	struct smp_alt_module *mod;
1180 
1181 	/* Why bother if there are no other CPUs? */
1182 	BUG_ON(num_possible_cpus() == 1);
1183 
1184 	mutex_lock(&text_mutex);
1185 
1186 	if (uniproc_patched) {
1187 		pr_info("switching to SMP code\n");
1188 		BUG_ON(num_online_cpus() != 1);
1189 		clear_cpu_cap(&boot_cpu_data, X86_FEATURE_UP);
1190 		clear_cpu_cap(&cpu_data(0), X86_FEATURE_UP);
1191 		list_for_each_entry(mod, &smp_alt_modules, next)
1192 			alternatives_smp_lock(mod->locks, mod->locks_end,
1193 					      mod->text, mod->text_end);
1194 		uniproc_patched = false;
1195 	}
1196 	mutex_unlock(&text_mutex);
1197 }
1198 
1199 /*
1200  * Return 1 if the address range is reserved for SMP-alternatives.
1201  * Must hold text_mutex.
1202  */
alternatives_text_reserved(void * start,void * end)1203 int alternatives_text_reserved(void *start, void *end)
1204 {
1205 	struct smp_alt_module *mod;
1206 	const s32 *poff;
1207 	u8 *text_start = start;
1208 	u8 *text_end = end;
1209 
1210 	lockdep_assert_held(&text_mutex);
1211 
1212 	list_for_each_entry(mod, &smp_alt_modules, next) {
1213 		if (mod->text > text_end || mod->text_end < text_start)
1214 			continue;
1215 		for (poff = mod->locks; poff < mod->locks_end; poff++) {
1216 			const u8 *ptr = (const u8 *)poff + *poff;
1217 
1218 			if (text_start <= ptr && text_end > ptr)
1219 				return 1;
1220 		}
1221 	}
1222 
1223 	return 0;
1224 }
1225 #endif /* CONFIG_SMP */
1226 
1227 #ifdef CONFIG_PARAVIRT
apply_paravirt(struct paravirt_patch_site * start,struct paravirt_patch_site * end)1228 void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
1229 				     struct paravirt_patch_site *end)
1230 {
1231 	struct paravirt_patch_site *p;
1232 	char insn_buff[MAX_PATCH_LEN];
1233 
1234 	for (p = start; p < end; p++) {
1235 		unsigned int used;
1236 
1237 		BUG_ON(p->len > MAX_PATCH_LEN);
1238 		/* prep the buffer with the original instructions */
1239 		memcpy(insn_buff, p->instr, p->len);
1240 		used = paravirt_patch(p->type, insn_buff, (unsigned long)p->instr, p->len);
1241 
1242 		BUG_ON(used > p->len);
1243 
1244 		/* Pad the rest with nops */
1245 		add_nops(insn_buff + used, p->len - used);
1246 		text_poke_early(p->instr, insn_buff, p->len);
1247 	}
1248 }
/*
 * NOTE(review): neither symbol is referenced in the part of this file that
 * is visible here (alternative_instructions() passes __parainstructions and
 * __parainstructions_end to apply_paravirt()) -- confirm whether this
 * declaration is still needed.
 */
extern struct paravirt_patch_site __start_parainstructions[],
	__stop_parainstructions[];
1251 #endif	/* CONFIG_PARAVIRT */
1252 
1253 /*
1254  * Self-test for the INT3 based CALL emulation code.
1255  *
1256  * This exercises int3_emulate_call() to make sure INT3 pt_regs are set up
1257  * properly and that there is a stack gap between the INT3 frame and the
1258  * previous context. Without this gap doing a virtual PUSH on the interrupted
1259  * stack would corrupt the INT3 IRET frame.
1260  *
1261  * See entry_{32,64}.S for more details.
1262  */
1263 
1264 /*
1265  * We define the int3_magic() function in assembly to control the calling
1266  * convention such that we can 'call' it from assembly.
1267  */
1268 
/*
 * int3_magic(ptr): store 1 into *ptr and return. The body below is emitted
 * into the .init.text section and takes its pointer from the first-argument
 * register (_ASM_ARG1).
 */
extern void int3_magic(unsigned int *ptr); /* defined in asm */

asm (
"	.pushsection	.init.text, \"ax\", @progbits\n"
"	.type		int3_magic, @function\n"
"int3_magic:\n"
	ANNOTATE_NOENDBR
"	movl	$1, (%" _ASM_ARG1 ")\n"
	ASM_RET
"	.size		int3_magic, .-int3_magic\n"
"	.popsection\n"
);
1281 
1282 extern void int3_selftest_ip(void); /* defined in asm below */
1283 
1284 static int __init
int3_exception_notify(struct notifier_block * self,unsigned long val,void * data)1285 int3_exception_notify(struct notifier_block *self, unsigned long val, void *data)
1286 {
1287 	unsigned long selftest = (unsigned long)&int3_selftest_ip;
1288 	struct die_args *args = data;
1289 	struct pt_regs *regs = args->regs;
1290 
1291 	OPTIMIZER_HIDE_VAR(selftest);
1292 
1293 	if (!regs || user_mode(regs))
1294 		return NOTIFY_DONE;
1295 
1296 	if (val != DIE_INT3)
1297 		return NOTIFY_DONE;
1298 
1299 	if (regs->ip - INT3_INSN_SIZE != selftest)
1300 		return NOTIFY_DONE;
1301 
1302 	int3_emulate_call(regs, (unsigned long)&int3_magic);
1303 	return NOTIFY_STOP;
1304 }
1305 
/*
 * Boot-time check of the INT3 based CALL emulation.
 *
 * Registers int3_exception_notify() as a die notifier, executes an INT3 at
 * the int3_selftest_ip label with &val passed in a register (constraint
 * chosen by __ASM_SEL_RAW(a, D)), and expects val to read 1 afterwards, i.e.
 * that the notifier emulated a call to int3_magic(). A failed registration
 * or an unchanged val is a BUG. The notifier is unregistered before
 * returning.
 */
/* Must be noinline to ensure uniqueness of int3_selftest_ip. */
static noinline void __init int3_selftest(void)
{
	static __initdata struct notifier_block int3_exception_nb = {
		.notifier_call	= int3_exception_notify,
		.priority	= INT_MAX-1, /* last */
	};
	unsigned int val = 0;

	BUG_ON(register_die_notifier(&int3_exception_nb));

	/*
	 * Basically: int3_magic(&val); but really complicated :-)
	 *
	 * INT3 padded with NOP to CALL_INSN_SIZE. The int3_exception_nb
	 * notifier above will emulate CALL for us.
	 */
	asm volatile ("int3_selftest_ip:\n\t"
		      ANNOTATE_NOENDBR
		      "    int3; nop; nop; nop; nop\n\t"
		      : ASM_CALL_CONSTRAINT
		      : __ASM_SEL_RAW(a, D) (&val)
		      : "memory");

	BUG_ON(val != 1);

	unregister_die_notifier(&int3_exception_nb);
}
1334 
/*
 * Boot-time entry point that runs all built-in code patching passes.
 *
 * After the INT3 self-test, NMIs are stopped and the passes run in this
 * order: paravirt_set_cap(), apply_paravirt(), __apply_fineibt(),
 * apply_retpolines(), apply_returns(), apply_alternatives(),
 * callthunks_patch_builtin_calls(), apply_ibt_endbr() and, with CONFIG_SMP,
 * the optional switch to UP code. NMIs are restarted and
 * alternatives_patched is set to 1 at the end.
 */
void __init alternative_instructions(void)
{
	int3_selftest();

	/*
	 * The patching is not fully atomic, so try to avoid local
	 * interruptions that might execute the to be patched code.
	 * Other CPUs are not running.
	 */
	stop_nmi();

	/*
	 * Don't stop machine check exceptions while patching.
	 * MCEs only happen when something got corrupted and in this
	 * case we must do something about the corruption.
	 * Ignoring it is worse than an unlikely patching race.
	 * Also machine checks tend to be broadcast and if one CPU
	 * goes into machine check the others follow quickly, so we don't
	 * expect a machine check to cause undue problems during code
	 * patching.
	 */

	/*
	 * Paravirt patching and alternative patching can be combined to
	 * replace a function call with a short direct code sequence (e.g.
	 * by setting a constant return value instead of doing that in an
	 * external function).
	 * In order to make this work the following sequence is required:
	 * 1. set (artificial) features depending on used paravirt
	 *    functions which can later influence alternative patching
	 * 2. apply paravirt patching (generally replacing an indirect
	 *    function call with a direct one)
	 * 3. apply alternative patching (e.g. replacing a direct function
	 *    call with a custom code sequence)
	 * Doing paravirt patching after alternative patching would clobber
	 * the optimization of the custom code with a function call again.
	 */
	paravirt_set_cap();

	/*
	 * First patch paravirt functions, such that we overwrite the indirect
	 * call with the direct call.
	 */
	apply_paravirt(__parainstructions, __parainstructions_end);

	__apply_fineibt(__retpoline_sites, __retpoline_sites_end,
			__cfi_sites, __cfi_sites_end, true);

	/*
	 * Rewrite the retpolines, must be done before alternatives since
	 * those can rewrite the retpoline thunks.
	 */
	apply_retpolines(__retpoline_sites, __retpoline_sites_end);
	apply_returns(__return_sites, __return_sites_end);

	/*
	 * Then patch alternatives, such that those paravirt calls that are in
	 * alternatives can be overwritten by their immediate fragments.
	 */
	apply_alternatives(__alt_instructions, __alt_instructions_end);

	/*
	 * Now all calls are established. Apply the call thunks if
	 * required.
	 */
	callthunks_patch_builtin_calls();

	apply_ibt_endbr(__ibt_endbr_seal, __ibt_endbr_seal_end);

#ifdef CONFIG_SMP
	/* Patch to UP if other cpus not imminent. */
	if (!noreplace_smp && (num_present_cpus() == 1 || setup_max_cpus <= 1)) {
		uniproc_patched = true;
		alternatives_smp_module_add(NULL, "core kernel",
					    __smp_locks, __smp_locks_end,
					    _text, _etext);
	}

	/*
	 * The lock table is only needed while a later switch back to SMP
	 * code is still possible; otherwise release its pages.
	 */
	if (!uniproc_patched || num_possible_cpus() == 1) {
		free_init_pages("SMP alternatives",
				(unsigned long)__smp_locks,
				(unsigned long)__smp_locks_end);
	}
#endif

	restart_nmi();
	alternatives_patched = 1;
}
1423 
1424 /**
1425  * text_poke_early - Update instructions on a live kernel at boot time
1426  * @addr: address to modify
1427  * @opcode: source of the copy
1428  * @len: length to copy
1429  *
1430  * When you use this code to patch more than one byte of an instruction
1431  * you need to make sure that other CPUs cannot execute this code in parallel.
1432  * Also no thread must be currently preempted in the middle of these
1433  * instructions. And on the local CPU you need to be protected against NMI or
1434  * MCE handlers seeing an inconsistent instruction while you patch.
1435  */
text_poke_early(void * addr,const void * opcode,size_t len)1436 void __init_or_module text_poke_early(void *addr, const void *opcode,
1437 				      size_t len)
1438 {
1439 	unsigned long flags;
1440 
1441 	if (boot_cpu_has(X86_FEATURE_NX) &&
1442 	    is_module_text_address((unsigned long)addr)) {
1443 		/*
1444 		 * Modules text is marked initially as non-executable, so the
1445 		 * code cannot be running and speculative code-fetches are
1446 		 * prevented. Just change the code.
1447 		 */
1448 		memcpy(addr, opcode, len);
1449 	} else {
1450 		local_irq_save(flags);
1451 		memcpy(addr, opcode, len);
1452 		local_irq_restore(flags);
1453 		sync_core();
1454 
1455 		/*
1456 		 * Could also do a CLFLUSH here to speed up CPU recovery; but
1457 		 * that causes hangs on some VIA CPUs.
1458 		 */
1459 	}
1460 }
1461 
/*
 * State saved by use_temporary_mm() and consumed by unuse_temporary_mm():
 * the mm that was loaded on this CPU before the temporary one.
 */
typedef struct {
	struct mm_struct *mm;
} temp_mm_state_t;
1465 
1466 /*
1467  * Using a temporary mm allows to set temporary mappings that are not accessible
1468  * by other CPUs. Such mappings are needed to perform sensitive memory writes
1469  * that override the kernel memory protections (e.g., W^X), without exposing the
1470  * temporary page-table mappings that are required for these write operations to
1471  * other CPUs. Using a temporary mm also allows to avoid TLB shootdowns when the
1472  * mapping is torn down.
1473  *
1474  * Context: The temporary mm needs to be used exclusively by a single core. To
1475  *          harden security IRQs must be disabled while the temporary mm is
1476  *          loaded, thereby preventing interrupt handler bugs from overriding
1477  *          the kernel memory protection.
1478  */
use_temporary_mm(struct mm_struct * mm)1479 static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm)
1480 {
1481 	temp_mm_state_t temp_state;
1482 
1483 	lockdep_assert_irqs_disabled();
1484 
1485 	/*
1486 	 * Make sure not to be in TLB lazy mode, as otherwise we'll end up
1487 	 * with a stale address space WITHOUT being in lazy mode after
1488 	 * restoring the previous mm.
1489 	 */
1490 	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
1491 		leave_mm(smp_processor_id());
1492 
1493 	temp_state.mm = this_cpu_read(cpu_tlbstate.loaded_mm);
1494 	switch_mm_irqs_off(NULL, mm, current);
1495 
1496 	/*
1497 	 * If breakpoints are enabled, disable them while the temporary mm is
1498 	 * used. Userspace might set up watchpoints on addresses that are used
1499 	 * in the temporary mm, which would lead to wrong signals being sent or
1500 	 * crashes.
1501 	 *
1502 	 * Note that breakpoints are not disabled selectively, which also causes
1503 	 * kernel breakpoints (e.g., perf's) to be disabled. This might be
1504 	 * undesirable, but still seems reasonable as the code that runs in the
1505 	 * temporary mm should be short.
1506 	 */
1507 	if (hw_breakpoint_active())
1508 		hw_breakpoint_disable();
1509 
1510 	return temp_state;
1511 }
1512 
/*
 * Counterpart of use_temporary_mm(): switch back to the mm recorded in
 * @prev_state and, if hardware breakpoints are active, restore them.
 * IRQs must be disabled (asserted through lockdep).
 */
static inline void unuse_temporary_mm(temp_mm_state_t prev_state)
{
	lockdep_assert_irqs_disabled();
	switch_mm_irqs_off(NULL, prev_state.mm, current);

	/*
	 * Restore the breakpoints if they were disabled before the temporary mm
	 * was loaded.
	 */
	if (hw_breakpoint_active())
		hw_breakpoint_restore();
}
1525 
/*
 * The temporary mm and the address inside it at which __text_poke() maps the
 * page(s) it is about to modify.
 */
__ro_after_init struct mm_struct *poking_mm;
__ro_after_init unsigned long poking_addr;
1528 
/*
 * Copy routine with the text_poke_f signature: copies @len bytes from @src
 * to @dst with memcpy().
 */
static void text_poke_memcpy(void *dst, const void *src, size_t len)
{
	memcpy(dst, src, len);
}
1533 
/*
 * Fill routine with the same signature as text_poke_memcpy(): @src points at
 * an int holding the fill value, which memset() writes over @len bytes at
 * @dst.
 */
static void text_poke_memset(void *dst, const void *src, size_t len)
{
	const int *fill = src;

	memset(dst, *fill, len);
}
1540 
/* Signature of the write routine that __text_poke() runs on the temporary mapping. */
typedef void text_poke_f(void *dst, const void *src, size_t len);
1542 
/*
 * Write @len bytes at text address @addr by running @func on a temporary
 * writable mapping of the backing page(s), set up at poking_addr inside
 * poking_mm.
 *
 * @func: write routine (text_poke_memcpy() or text_poke_memset() in this file)
 * @addr: destination address; the range may touch at most two pages
 * @src:  source argument handed through to @func
 * @len:  number of bytes to write
 *
 * IRQs are disabled locally from before the PTEs are installed until after
 * the write has been verified. When @func is text_poke_memcpy() the result
 * is compared against @src and a mismatch is a BUG. Failure to look up the
 * backing pages is a BUG as well.
 *
 * Returns @addr.
 */
static void *__text_poke(text_poke_f func, void *addr, const void *src, size_t len)
{
	bool cross_page_boundary = offset_in_page(addr) + len > PAGE_SIZE;
	struct page *pages[2] = {NULL};
	temp_mm_state_t prev;
	unsigned long flags;
	pte_t pte, *ptep;
	spinlock_t *ptl;
	pgprot_t pgprot;

	/*
	 * While boot memory allocator is running we cannot use struct pages as
	 * they are not yet initialized. There is no way to recover.
	 */
	BUG_ON(!after_bootmem);

	if (!core_kernel_text((unsigned long)addr)) {
		pages[0] = vmalloc_to_page(addr);
		if (cross_page_boundary)
			pages[1] = vmalloc_to_page(addr + PAGE_SIZE);
	} else {
		pages[0] = virt_to_page(addr);
		WARN_ON(!PageReserved(pages[0]));
		if (cross_page_boundary)
			pages[1] = virt_to_page(addr + PAGE_SIZE);
	}
	/*
	 * If something went wrong, crash and burn since recovery paths are not
	 * implemented.
	 */
	BUG_ON(!pages[0] || (cross_page_boundary && !pages[1]));

	/*
	 * Map the page without the global bit, as TLB flushing is done with
	 * flush_tlb_mm_range(), which is intended for non-global PTEs.
	 */
	pgprot = __pgprot(pgprot_val(PAGE_KERNEL) & ~_PAGE_GLOBAL);

	/*
	 * The lock is not really needed, but this allows to avoid open-coding.
	 */
	ptep = get_locked_pte(poking_mm, poking_addr, &ptl);

	/*
	 * This must not fail; preallocated in poking_init().
	 */
	VM_BUG_ON(!ptep);

	local_irq_save(flags);

	pte = mk_pte(pages[0], pgprot);
	set_pte_at(poking_mm, poking_addr, ptep, pte);

	if (cross_page_boundary) {
		pte = mk_pte(pages[1], pgprot);
		set_pte_at(poking_mm, poking_addr + PAGE_SIZE, ptep + 1, pte);
	}

	/*
	 * Loading the temporary mm behaves as a compiler barrier, which
	 * guarantees that the PTE will be set at the time memcpy() is done.
	 */
	prev = use_temporary_mm(poking_mm);

	/* The write goes through the alias, at the same offset within the page. */
	kasan_disable_current();
	func((u8 *)poking_addr + offset_in_page(addr), src, len);
	kasan_enable_current();

	/*
	 * Ensure that the PTE is only cleared after the instructions of memcpy
	 * were issued by using a compiler barrier.
	 */
	barrier();

	pte_clear(poking_mm, poking_addr, ptep);
	if (cross_page_boundary)
		pte_clear(poking_mm, poking_addr + PAGE_SIZE, ptep + 1);

	/*
	 * Loading the previous page-table hierarchy requires a serializing
	 * instruction that already allows the core to see the updated version.
	 * Xen-PV is assumed to serialize execution in a similar manner.
	 */
	unuse_temporary_mm(prev);

	/*
	 * Flushing the TLB might involve IPIs, which would require enabled
	 * IRQs, but not if the mm is not used, as it is in this point.
	 */
	flush_tlb_mm_range(poking_mm, poking_addr, poking_addr +
			   (cross_page_boundary ? 2 : 1) * PAGE_SIZE,
			   PAGE_SHIFT, false);

	if (func == text_poke_memcpy) {
		/*
		 * If the text does not match what we just wrote then something is
		 * fundamentally screwy; there's nothing we can really do about that.
		 */
		BUG_ON(memcmp(addr, src, len));
	}

	local_irq_restore(flags);
	pte_unmap_unlock(ptep, ptl);
	return addr;
}
1648 
/**
 * text_poke - Update instructions on a live kernel
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Note that the caller must ensure that if the modified code is part of a
 * module, the module would not be removed during poking. This can be achieved
 * by registering a module notifier, and ordering module removal and patching
 * through a mutex.
 *
 * Context: the caller must hold text_mutex (asserted through lockdep).
 * Return: @addr.
 */
void *text_poke(void *addr, const void *opcode, size_t len)
{
	lockdep_assert_held(&text_mutex);

	return __text_poke(text_poke_memcpy, addr, opcode, len);
}
1671 
/**
 * text_poke_kgdb - Update instructions on a live kernel by kgdb
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * Only atomic text poke/set should be allowed when not doing early patching.
 * It means the size must be writable atomically and the address must be aligned
 * in a way that permits an atomic write. It also makes sure we fit on a single
 * page.
 *
 * Same operation as text_poke(), minus the text_mutex assertion.
 *
 * Context: should only be used by kgdb, which ensures no other core is running,
 *	    despite the fact it does not hold the text_mutex.
 * Return: @addr.
 */
void *text_poke_kgdb(void *addr, const void *opcode, size_t len)
{
	return __text_poke(text_poke_memcpy, addr, opcode, len);
}
1690 
/*
 * Copy @len bytes from @opcode to @addr through __text_poke(), one chunk at
 * a time. A chunk never extends past the end of the page that follows the
 * one it starts in.
 *
 * Unless @core_ok is set, a destination inside core kernel text is refused:
 * a one-time warning is raised and NULL is returned. Otherwise @addr is
 * returned.
 *
 * No locking is done here; text_poke_copy() calls this with text_mutex held.
 */
void *text_poke_copy_locked(void *addr, const void *opcode, size_t len,
			    bool core_ok)
{
	unsigned long base = (unsigned long)addr;
	size_t done, chunk;

	if (WARN_ON_ONCE(!core_ok && core_kernel_text(base)))
		return NULL;

	for (done = 0; done < len; done += chunk) {
		unsigned long dst = base + done;

		/* Up to the end of the next page, or whatever is left. */
		chunk = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(dst),
			      len - done);

		__text_poke(text_poke_memcpy, (void *)dst, opcode + done, chunk);
	}

	return addr;
}
1711 
1712 /**
1713  * text_poke_copy - Copy instructions into (an unused part of) RX memory
1714  * @addr: address to modify
1715  * @opcode: source of the copy
1716  * @len: length to copy, could be more than 2x PAGE_SIZE
1717  *
1718  * Not safe against concurrent execution; useful for JITs to dump
1719  * new code blocks into unused regions of RX memory. Can be used in
1720  * conjunction with synchronize_rcu_tasks() to wait for existing
1721  * execution to quiesce after having made sure no existing functions
1722  * pointers are live.
1723  */
text_poke_copy(void * addr,const void * opcode,size_t len)1724 void *text_poke_copy(void *addr, const void *opcode, size_t len)
1725 {
1726 	mutex_lock(&text_mutex);
1727 	addr = text_poke_copy_locked(addr, opcode, len, false);
1728 	mutex_unlock(&text_mutex);
1729 	return addr;
1730 }
1731 
1732 /**
1733  * text_poke_set - memset into (an unused part of) RX memory
1734  * @addr: address to modify
1735  * @c: the byte to fill the area with
1736  * @len: length to copy, could be more than 2x PAGE_SIZE
1737  *
1738  * This is useful to overwrite unused regions of RX memory with illegal
1739  * instructions.
1740  */
text_poke_set(void * addr,int c,size_t len)1741 void *text_poke_set(void *addr, int c, size_t len)
1742 {
1743 	unsigned long start = (unsigned long)addr;
1744 	size_t patched = 0;
1745 
1746 	if (WARN_ON_ONCE(core_kernel_text(start)))
1747 		return NULL;
1748 
1749 	mutex_lock(&text_mutex);
1750 	while (patched < len) {
1751 		unsigned long ptr = start + patched;
1752 		size_t s;
1753 
1754 		s = min_t(size_t, PAGE_SIZE * 2 - offset_in_page(ptr), len - patched);
1755 
1756 		__text_poke(text_poke_memset, (void *)ptr, (void *)&c, s);
1757 		patched += s;
1758 	}
1759 	mutex_unlock(&text_mutex);
1760 	return addr;
1761 }
1762 
/* Cross-call callback for text_poke_sync(): runs sync_core(); @info is unused. */
static void do_sync_core(void *info)
{
	sync_core();
}
1767 
/*
 * Run sync_core() on every CPU through on_each_cpu(). The final argument
 * (1) is on_each_cpu()'s wait flag, so the call returns only once all CPUs
 * have run the callback.
 */
void text_poke_sync(void)
{
	on_each_cpu(do_sync_core, NULL, 1);
}
1772 
/*
 * One site queued for breakpoint-based patching.
 *
 * NOTE: crazy scheme to allow patching Jcc.d32 but not increase the size of
 * this thing. When len == 6 everything is prefixed with 0x0f and we map
 * opcode to Jcc.d8, using len to distinguish.
 */
struct text_poke_loc {
	/* addr := _stext + rel_addr */
	s32 rel_addr;
	/* immediate of the decoded instruction; used as displacement when emulating CALL/JMP/Jcc */
	s32 disp;
	/* number of bytes patched at the site */
	u8 len;
	/* opcode byte that selects the emulation in poke_int3_handler() */
	u8 opcode;
	/* replacement bytes; for len == 6 the leading 0x0f is not stored */
	const u8 text[POKE_MAX_OPCODE_SIZE];
	/* see text_poke_bp_batch() */
	u8 old;
};
1788 
/*
 * Description of the batch that text_poke_bp_batch() is currently working
 * on, as seen by poke_int3_handler().
 */
struct bp_patching_desc {
	/* the sites of the batch */
	struct text_poke_loc *vec;
	/* number of entries in @vec */
	int nr_entries;
	/* non-zero while a batch is published; the handler holds a reference while it uses @vec */
	atomic_t refs;
};

/* The single descriptor instance shared by patcher and INT3 handler. */
static struct bp_patching_desc bp_desc;
1796 
1797 static __always_inline
try_get_desc(void)1798 struct bp_patching_desc *try_get_desc(void)
1799 {
1800 	struct bp_patching_desc *desc = &bp_desc;
1801 
1802 	if (!arch_atomic_inc_not_zero(&desc->refs))
1803 		return NULL;
1804 
1805 	return desc;
1806 }
1807 
put_desc(void)1808 static __always_inline void put_desc(void)
1809 {
1810 	struct bp_patching_desc *desc = &bp_desc;
1811 
1812 	smp_mb__before_atomic();
1813 	arch_atomic_dec(&desc->refs);
1814 }
1815 
/* Turn the _stext-relative offset stored in @tp back into the site's address. */
static __always_inline void *text_poke_addr(struct text_poke_loc *tp)
{
	return _stext + tp->rel_addr;
}
1820 
patch_cmp(const void * key,const void * elt)1821 static __always_inline int patch_cmp(const void *key, const void *elt)
1822 {
1823 	struct text_poke_loc *tp = (struct text_poke_loc *) elt;
1824 
1825 	if (key < text_poke_addr(tp))
1826 		return -1;
1827 	if (key > text_poke_addr(tp))
1828 		return 1;
1829 	return 0;
1830 }
1831 
/*
 * INT3 hook for sites that text_poke_bp_batch() is in the middle of patching.
 *
 * When the breakpoint that trapped sits at the start of a site in the batch
 * published through bp_desc, the instruction recorded for that site is
 * emulated on @regs and 1 is returned. 0 is returned when the trap came from
 * user mode, when no batch is published, when the address is not part of the
 * batch, or when the site is itself being patched to an INT3 (which is left
 * to whoever poked it).
 */
noinstr int poke_int3_handler(struct pt_regs *regs)
{
	struct bp_patching_desc *desc;
	struct text_poke_loc *tp;
	int ret = 0;
	void *ip;

	if (user_mode(regs))
		return 0;

	/*
	 * Having observed our INT3 instruction, we now must observe
	 * bp_desc with non-zero refcount:
	 *
	 *	bp_desc.refs = 1		INT3
	 *	WMB				RMB
	 *	write INT3			if (bp_desc.refs != 0)
	 */
	smp_rmb();

	desc = try_get_desc();
	if (!desc)
		return 0;

	/*
	 * Discount the INT3. See text_poke_bp_batch().
	 */
	ip = (void *) regs->ip - INT3_INSN_SIZE;

	/*
	 * Skip the binary search if there is a single member in the vector.
	 */
	if (unlikely(desc->nr_entries > 1)) {
		tp = __inline_bsearch(ip, desc->vec, desc->nr_entries,
				      sizeof(struct text_poke_loc),
				      patch_cmp);
		if (!tp)
			goto out_put;
	} else {
		tp = desc->vec;
		if (text_poke_addr(tp) != ip)
			goto out_put;
	}

	/* From here on ip is the address right behind the patched range. */
	ip += tp->len;

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
		/*
		 * Someone poked an explicit INT3, they'll want to handle it,
		 * do not consume.
		 */
		goto out_put;

	case RET_INSN_OPCODE:
		int3_emulate_ret(regs);
		break;

	case CALL_INSN_OPCODE:
		int3_emulate_call(regs, (long)ip + tp->disp);
		break;

	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		int3_emulate_jmp(regs, (long)ip + tp->disp);
		break;

	case 0x70 ... 0x7f: /* Jcc */
		int3_emulate_jcc(regs, tp->opcode & 0xf, (long)ip, tp->disp);
		break;

	default:
		BUG();
	}

	ret = 1;

out_put:
	put_desc();
	return ret;
}
1913 
/*
 * Queue filled by text_poke_queue() and drained by text_poke_flush(): it
 * holds as many entries as fit in one page, tp_vec_nr of which are in use.
 */
#define TP_VEC_MAX (PAGE_SIZE / sizeof(struct text_poke_loc))
static struct text_poke_loc tp_vec[TP_VEC_MAX];
static int tp_vec_nr;
1917 
/**
 * text_poke_bp_batch() -- update instructions on live kernel on SMP
 * @tp:			vector of instructions to patch
 * @nr_entries:		number of entries in the vector
 *
 * Modify multi-byte instruction by using int3 breakpoint on SMP.
 * We completely avoid stop_machine() here, and achieve the
 * synchronization using int3 breakpoint.
 *
 * The way it is done:
 *	- For each entry in the vector:
 *		- add a int3 trap to the address that will be patched
 *	- sync cores
 *	- For each entry in the vector:
 *		- update all but the first byte of the patched range
 *	- sync cores
 *	- For each entry in the vector:
 *		- replace the first byte (int3) by the first byte of
 *		  replacing opcode
 *	- sync cores
 *
 * With more than one entry, poke_int3_handler() binary-searches @tp by
 * address, so the vector has to be sorted by address.
 *
 * Context: the caller must hold text_mutex (asserted through lockdep).
 */
static void text_poke_bp_batch(struct text_poke_loc *tp, unsigned int nr_entries)
{
	unsigned char int3 = INT3_INSN_OPCODE;
	unsigned int i;
	int do_sync;

	lockdep_assert_held(&text_mutex);

	bp_desc.vec = tp;
	bp_desc.nr_entries = nr_entries;

	/*
	 * Corresponds to the implicit memory barrier in try_get_desc() to
	 * ensure reading a non-zero refcount provides up to date bp_desc data.
	 */
	atomic_set_release(&bp_desc.refs, 1);

	/*
	 * Corresponding read barrier in int3 notifier for making sure the
	 * nr_entries and handler are correctly ordered wrt. patching.
	 */
	smp_wmb();

	/*
	 * First step: add a int3 trap to the address that will be patched.
	 */
	for (i = 0; i < nr_entries; i++) {
		/* Remember the byte the INT3 replaces; it is reported to perf below. */
		tp[i].old = *(u8 *)text_poke_addr(&tp[i]);
		text_poke(text_poke_addr(&tp[i]), &int3, INT3_INSN_SIZE);
	}

	text_poke_sync();

	/*
	 * Second step: update all but the first byte of the patched range.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		u8 old[POKE_MAX_OPCODE_SIZE+1] = { tp[i].old, };
		u8 _new[POKE_MAX_OPCODE_SIZE+1];
		const u8 *new = tp[i].text;
		int len = tp[i].len;

		if (len - INT3_INSN_SIZE > 0) {
			memcpy(old + INT3_INSN_SIZE,
			       text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
			       len - INT3_INSN_SIZE);

			/* len == 6: the stored text lacks the 0x0f prefix, rebuild the full bytes. */
			if (len == 6) {
				_new[0] = 0x0f;
				memcpy(_new + 1, new, 5);
				new = _new;
			}

			text_poke(text_poke_addr(&tp[i]) + INT3_INSN_SIZE,
				  new + INT3_INSN_SIZE,
				  len - INT3_INSN_SIZE);

			do_sync++;
		}

		/*
		 * Emit a perf event to record the text poke, primarily to
		 * support Intel PT decoding which must walk the executable code
		 * to reconstruct the trace. The flow up to here is:
		 *   - write INT3 byte
		 *   - IPI-SYNC
		 *   - write instruction tail
		 * At this point the actual control flow will be through the
		 * INT3 and handler and not hit the old or new instruction.
		 * Intel PT outputs FUP/TIP packets for the INT3, so the flow
		 * can still be decoded. Subsequently:
		 *   - emit RECORD_TEXT_POKE with the new instruction
		 *   - IPI-SYNC
		 *   - write first byte
		 *   - IPI-SYNC
		 * So before the text poke event timestamp, the decoder will see
		 * either the old instruction flow or FUP/TIP of INT3. After the
		 * text poke event timestamp, the decoder will see either the
		 * new instruction flow or FUP/TIP of INT3. Thus decoders can
		 * use the timestamp as the point at which to modify the
		 * executable code.
		 * The old instruction is recorded so that the event can be
		 * processed forwards or backwards.
		 */
		perf_event_text_poke(text_poke_addr(&tp[i]), old, len, new, len);
	}

	if (do_sync) {
		/*
		 * According to Intel, this core syncing is very likely
		 * not necessary and we'd be safe even without it. But
		 * better safe than sorry (plus there's not only Intel).
		 */
		text_poke_sync();
	}

	/*
	 * Third step: replace the first byte (int3) by the first byte of
	 * replacing opcode.
	 */
	for (do_sync = 0, i = 0; i < nr_entries; i++) {
		u8 byte = tp[i].text[0];

		if (tp[i].len == 6)
			byte = 0x0f;

		/* The site is meant to stay an INT3; nothing to write back. */
		if (byte == INT3_INSN_OPCODE)
			continue;

		text_poke(text_poke_addr(&tp[i]), &byte, INT3_INSN_SIZE);
		do_sync++;
	}

	if (do_sync)
		text_poke_sync();

	/*
	 * Remove and wait for refs to be zero.
	 */
	if (!atomic_dec_and_test(&bp_desc.refs))
		atomic_cond_read_acquire(&bp_desc.refs, !VAL);
}
2061 
/*
 * Fill in @tp for patching @len bytes at @addr with the bytes at @opcode.
 *
 * @emulate is the instruction that poke_int3_handler() emulates while the
 * site holds the temporary INT3; if it is NULL, @opcode is used for that as
 * well. Accepted instructions are INT3, RET, CALL, JMP8/JMP32, Jcc and the
 * 2- and 5-byte NOPs from x86_nops[] (recorded as a JMP with displacement
 * 0). Anything else, or a decode failure, is a BUG.
 */
static void text_poke_loc_init(struct text_poke_loc *tp, void *addr,
			       const void *opcode, size_t len, const void *emulate)
{
	struct insn insn;
	int ret, i = 0;

	/* len == 6: the first byte of @opcode is skipped and not stored in tp->text. */
	if (len == 6)
		i = 1;
	memcpy((void *)tp->text, opcode+i, len-i);
	if (!emulate)
		emulate = opcode;

	ret = insn_decode_kernel(&insn, emulate);
	BUG_ON(ret < 0);

	tp->rel_addr = addr - (void *)_stext;
	tp->len = len;
	tp->opcode = insn.opcode.bytes[0];

	if (is_jcc32(&insn)) {
		/*
		 * Map Jcc.d32 onto Jcc.d8 and use len to distinguish.
		 */
		tp->opcode = insn.opcode.bytes[1] - 0x10;
	}

	switch (tp->opcode) {
	case RET_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
		/*
		 * Control flow instructions without implied execution of the
		 * next instruction can be padded with INT3.
		 */
		for (i = insn.length; i < len; i++)
			BUG_ON(tp->text[i] != INT3_INSN_OPCODE);
		break;

	default:
		BUG_ON(len != insn.length);
	}

	switch (tp->opcode) {
	case INT3_INSN_OPCODE:
	case RET_INSN_OPCODE:
		break;

	case CALL_INSN_OPCODE:
	case JMP32_INSN_OPCODE:
	case JMP8_INSN_OPCODE:
	case 0x70 ... 0x7f: /* Jcc */
		tp->disp = insn.immediate.value;
		break;

	default: /* assume NOP */
		switch (len) {
		case 2: /* NOP2 -- emulate as JMP8+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tp->opcode = JMP8_INSN_OPCODE;
			tp->disp = 0;
			break;

		case 5: /* NOP5 -- emulate as JMP32+0 */
			BUG_ON(memcmp(emulate, x86_nops[len], len));
			tp->opcode = JMP32_INSN_OPCODE;
			tp->disp = 0;
			break;

		default: /* unknown instruction */
			BUG();
		}
		break;
	}
}
2136 
2137 /*
2138  * We hard rely on the tp_vec being ordered; ensure this is so by flushing
2139  * early if needed.
2140  */
tp_order_fail(void * addr)2141 static bool tp_order_fail(void *addr)
2142 {
2143 	struct text_poke_loc *tp;
2144 
2145 	if (!tp_vec_nr)
2146 		return false;
2147 
2148 	if (!addr) /* force */
2149 		return true;
2150 
2151 	tp = &tp_vec[tp_vec_nr - 1];
2152 	if ((unsigned long)text_poke_addr(tp) > (unsigned long)addr)
2153 		return true;
2154 
2155 	return false;
2156 }
2157 
text_poke_flush(void * addr)2158 static void text_poke_flush(void *addr)
2159 {
2160 	if (tp_vec_nr == TP_VEC_MAX || tp_order_fail(addr)) {
2161 		text_poke_bp_batch(tp_vec, tp_vec_nr);
2162 		tp_vec_nr = 0;
2163 	}
2164 }
2165 
/*
 * Flush whatever text_poke_queue() has accumulated. Passing NULL makes
 * tp_order_fail() report a forced flush, so any non-empty queue is handed to
 * text_poke_bp_batch(); an empty queue is left alone.
 */
void text_poke_finish(void)
{
	text_poke_flush(NULL);
}
2170 
text_poke_queue(void * addr,const void * opcode,size_t len,const void * emulate)2171 void __ref text_poke_queue(void *addr, const void *opcode, size_t len, const void *emulate)
2172 {
2173 	struct text_poke_loc *tp;
2174 
2175 	text_poke_flush(addr);
2176 
2177 	tp = &tp_vec[tp_vec_nr++];
2178 	text_poke_loc_init(tp, addr, opcode, len, emulate);
2179 }
2180 
/**
 * text_poke_bp() -- update instructions on live kernel on SMP
 * @addr:	address to patch
 * @opcode:	opcode of new instruction
 * @len:	length to copy
 * @emulate:	instruction to be emulated; if NULL, @opcode is emulated
 *
 * Update a single instruction using a one-entry vector that lives on the
 * stack, avoiding dynamically allocated memory. This function should be used
 * when it is not possible to allocate memory.
 */
void __ref text_poke_bp(void *addr, const void *opcode, size_t len, const void *emulate)
{
	struct text_poke_loc tp;

	/* Describe the single patch site, then run it as a batch of one. */
	text_poke_loc_init(&tp, addr, opcode, len, emulate);
	text_poke_bp_batch(&tp, 1);
}
2199