/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * alternative.c
 */

#include <xen/delay.h>
#include <xen/types.h>
#include <asm/apic.h>
#include <asm/endbr.h>
#include <asm/processor.h>
#include <asm/alternative.h>
#include <xen/init.h>
#include <asm/setup.h>
#include <asm/stubs.h>
#include <asm/system.h>
#include <asm/traps.h>
#include <asm/nmi.h>
#include <asm/nops.h>
#include <xen/livepatch.h>

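/*
 * Maximum size of a single patch site (origin instruction bytes plus
 * padding).  Bounds the on-stack buffer used by _apply_alternatives().
 */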
#define MAX_PATCH_LEN (255-1)

#ifdef K8_NOP1
static const unsigned char k8nops[] init_or_livepatch_const = {
    K8_NOP1,
    K8_NOP2,
    K8_NOP3,
    K8_NOP4,
    K8_NOP5,
    K8_NOP6,
    K8_NOP7,
    K8_NOP8,
    K8_NOP9,
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    k8nops,
    k8nops + 1,
    k8nops + 1 + 2,
    k8nops + 1 + 2 + 3,
    k8nops + 1 + 2 + 3 + 4,
    k8nops + 1 + 2 + 3 + 4 + 5,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] init_or_livepatch_const = {
    P6_NOP1,
    P6_NOP2,
    P6_NOP3,
    P6_NOP4,
    P6_NOP5,
    P6_NOP6,
    P6_NOP7,
    P6_NOP8,
    P6_NOP9,
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    p6nops,
    p6nops + 1,
    p6nops + 1 + 2,
    p6nops + 1 + 2 + 3,
    p6nops + 1 + 2 + 3 + 4,
    p6nops + 1 + 2 + 3 + 4 + 5,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

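/*
 * The NOP family which add_nops() copies from.  Defaults to the P6
 * (SDM-recommended) nops and is refined by arch_init_ideal_nops() based on
 * the boot CPU.
 */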
static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;

#ifdef HAVE_AS_NOPS_DIRECTIVE

/* Nops in .init.rodata to compare against the runtime ideal nops. */
asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
      ".popsection\n\t");
extern char toolchain_nops[ASM_NOP_MAX];
static bool init_or_livepatch_read_mostly toolchain_nops_are_ideal;

#else
# define toolchain_nops_are_ideal false
#endif

static void __init arch_init_ideal_nops(void)
{
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        /*
         * Due to a decoder implementation quirk, some specific Intel CPUs
         * actually perform better with the "k8_nops" than with the SDM-
         * recommended NOPs.
         */
        if ( boot_cpu_data.x86 != 6 )
            break;

        switch ( boot_cpu_data.x86_model )
        {
        case 0x0f ... 0x1b:
        case 0x1d ... 0x25:
        case 0x28 ... 0x2f:
            ideal_nops = k8_nops;
            break;
        }
        break;

    case X86_VENDOR_AMD:
        if ( boot_cpu_data.x86 <= 0xf )
            ideal_nops = k8_nops;
        break;
    }

#ifdef HAVE_AS_NOPS_DIRECTIVE
    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
        toolchain_nops_are_ideal = true;
#endif
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
void init_or_livepatch add_nops(void *insns, unsigned int len)
{
    while ( len > 0 )
    {
        unsigned int noplen = len;
        if ( noplen > ASM_NOP_MAX )
            noplen = ASM_NOP_MAX;
        memcpy(insns, ideal_nops[noplen], noplen);
        insns += noplen;
        len -= noplen;
    }
}

void nocall __x86_return_thunk(void);

/*
 * Place a return at @ptr. @ptr must be in the writable alias of a stub.
 *
 * When CONFIG_RETURN_THUNK is active, this may be a JMP __x86_return_thunk
 * instead, depending on the safety of @ptr with respect to Indirect Target
 * Selection.
 *
 * Returns the next position to write into the stub.
 */
void *place_ret(void *ptr)
{
    unsigned long addr = (unsigned long)ptr;
    uint8_t *p = ptr;

    /*
     * When Return Thunks are used, if a RET would be unsafe at this location
     * with respect to Indirect Target Selection (i.e. if addr is in the first
     * half of a cacheline), insert a JMP __x86_return_thunk instead.
     *
     * The displacement needs to be relative to the executable alias of the
     * stub, not to @ptr which is the writeable alias.
     */
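    /* Bit 5 clear => first 32-byte half of a 64-byte cache line. */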
    if ( IS_ENABLED(CONFIG_RETURN_THUNK) && !(addr & 0x20) )
    {
        long stub_va = (this_cpu(stubs.addr) & PAGE_MASK) + (addr & ~PAGE_MASK);
        long disp = (long)__x86_return_thunk - (stub_va + 5);

        BUG_ON((int32_t)disp != disp);

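        /* JMP rel32 (0xe9): disp was computed from the end of this 5-byte insn. */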
        *p++ = 0xe9;
        *(int32_t *)p = disp;
        p += 4;
    }
    else
    {
        *p++ = 0xc3;
    }

    return p;
}

/*
 * text_poke - Update instructions on a live kernel or non-executed code.
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction
 * you need to make sure that other CPUs cannot execute it in parallel, and
 * that no thread is preempted in the middle of these instructions. On the
 * local CPU you also need to be protected against NMI or MCE handlers seeing
 * an inconsistent instruction while you patch.
 *
 * You should run this with interrupts disabled or on code that is not
 * executing.
 *
 * While the SDM continues to suggest using "noinline" would be sufficient, it
 * may not be, e.g. due to errata. Therefore issue a serializing insn
 * afterwards, unless this is for live-patching, where we modify code before
 * it goes live. Pick a serializing insn which is unlikely to be intercepted
 * by a hypervisor, in case we run virtualized ourselves.
 */
static void init_or_livepatch
text_poke(void *addr, const void *opcode, size_t len)
{
    memcpy(addr, opcode, len);
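    /*
     * Serialize the modification: a MOV to %cr2 is architecturally
     * serializing and, per the comment above, unlikely to be intercepted by
     * a hypervisor.  Skipped once the system is active, i.e. for
     * live-patching, where the code isn't live yet.
     */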
    if ( system_state < SYS_STATE_active )
        asm volatile ( "mov %0, %%cr2" :: "r" (0L) : "memory" );
}

extern void *const __initdata_cf_clobber_start[];
extern void *const __initdata_cf_clobber_end[];

/*
 * In CET-IBT enabled builds, clobber endbr64 instructions after altcall has
 * finished optimising all indirect branches to direct ones.
 */
static void __init seal_endbr64(void)
{
    void *const *val;
    unsigned int clobbered = 0;

    if ( !cpu_has_xen_ibt )
        return;

    /*
     * This is some minor structure (ab)use. We walk the entire contents
     * of .init.{ro,}data.cf_clobber as if it were an array of pointers.
     *
     * If the pointer points into .text, and at an endbr64 instruction,
     * nop out the endbr64. This causes the pointer to no longer be a
     * legal indirect branch target under CET-IBT. This is a
     * defence-in-depth measure, to reduce the options available to an
     * adversary who has managed to hijack a function pointer.
     */
    for ( val = __initdata_cf_clobber_start;
          val < __initdata_cf_clobber_end;
          val++ )
    {
        void *ptr = *val;

        if ( !is_kernel_text(ptr) || !is_endbr64(ptr) )
            continue;

        place_endbr64_poison(ptr);
        clobbered++;
    }

    printk("altcall: Optimised away %u endbr64 instructions\n", clobbered);
}

/*
 * Replace instructions with better alternatives for this CPU type.
 * This runs before SMP is initialized to avoid SMP problems with
 * self-modifying code. This implies that asymmetric systems where
 * APs have fewer capabilities than the boot processor are not handled.
 * Tough. Make sure you disable such features by hand.
 */
static int init_or_livepatch _apply_alternatives(struct alt_instr *start,
                                                 struct alt_instr *end)
{
    struct alt_instr *a, *base;

    printk(KERN_INFO "alt table %p -> %p\n", start, end);

    /*
     * The scan order should be from start to end: an alternative scanned
     * later may overwrite code already patched by an earlier one, and some
     * constructs (e.g. ALTERNATIVE_2()) rely on this order of patching.
     *
     * So be careful if you want to change the scan order to any other
     * order.
     */
    for ( a = base = start; a < end; a++ )
    {
        uint8_t *orig = ALT_ORIG_PTR(a);
        uint8_t *repl = ALT_REPL_PTR(a);
        uint8_t buf[MAX_PATCH_LEN];
        unsigned int total_len = a->orig_len + a->pad_len;
        unsigned int feat = a->cpuid & ~ALT_FLAG_NOT;
        bool inv = a->cpuid & ALT_FLAG_NOT, replace;

        if ( a->repl_len > total_len )
        {
            printk(XENLOG_ERR
                   "Alt for %ps, replacement size %#x larger than origin %#x\n",
                   ALT_ORIG_PTR(a), a->repl_len, total_len);
            return -ENOSPC;
        }

        if ( total_len > sizeof(buf) )
        {
            printk(XENLOG_ERR
                   "Alt for %ps, origin size %#x bigger than buffer %#zx\n",
                   ALT_ORIG_PTR(a), total_len, sizeof(buf));
            return -ENOSPC;
        }

        if ( feat >= NCAPINTS * 32 )
        {
            printk(XENLOG_ERR
                   "Alt for %ps, feature %#x outside of featureset range %#x\n",
                   ALT_ORIG_PTR(a), feat, NCAPINTS * 32);
            return -ERANGE;
        }

        /*
         * Detect sequences of alt_instr's patching the same origin site, and
         * keep base pointing at the first alt_instr entry. This is so we can
         * refer to a single ->priv field for some of our patching decisions,
         * in particular the NOP optimization. We deliberately use the alt_instr
         * itself rather than a local variable in case we end up making multiple
         * passes.
         *
         * ->priv being nonzero means that the origin site has already been
         * modified, and we shouldn't try to optimise the nops again.
         */
        if ( ALT_ORIG_PTR(base) != orig )
            base = a;

        /* Skip patch sites already handled during the first pass. */
        if ( a->priv )
            continue;

        /*
         * Should a replacement be performed? Most replacements have positive
         * polarity, but we support negative polarity too.
         */
        replace = boot_cpu_has(feat) ^ inv;

        /* If there is no replacement to make, see about optimising the nops. */
        if ( !replace )
        {
            /* Origin site already touched? Don't nop anything. */
            if ( base->priv )
                continue;

            a->priv = 1;

            /* Nothing useful to do? */
            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
                continue;

            add_nops(buf, a->pad_len);
            text_poke(orig + a->orig_len, buf, a->pad_len);
            continue;
        }

        memcpy(buf, repl, a->repl_len);

        /* 0xe8/0xe9 are relative branches; fix the offset. */
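        /*
         * The mask catches both CALL rel32 (0xe8) and JMP rel32 (0xe9).  The
         * rel32 was encoded for the replacement site, so rebase it by
         * (repl - orig) now that the instruction will execute at the origin.
         */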
        if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
            *(int32_t *)(buf + 1) += repl - orig;
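        /*
         * A replacement ending in a JMP rel32 whose target resolves to
         * __x86_return_thunk also needs that rel32 rebasing to the origin
         * site.
         */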
        else if ( IS_ENABLED(CONFIG_RETURN_THUNK) &&
                  a->repl_len > 5 && buf[a->repl_len - 5] == 0xe9 &&
                  ((long)repl + a->repl_len +
                   *(int32_t *)(buf + a->repl_len - 4) ==
                   (long)__x86_return_thunk) )
            *(int32_t *)(buf + a->repl_len - 4) += repl - orig;

        a->priv = 1;

        add_nops(buf + a->repl_len, total_len - a->repl_len);
        text_poke(orig, buf, total_len);
    }

    return 0;
}

/*
 * At build time, alternative calls are emitted as:
 *   ff 15 xx xx xx xx => call *disp32(%rip)
 *
 * During boot, we devirtualise by editing to:
 *   2e e8 xx xx xx xx => cs call disp32
 *
 * or, if the function pointer is still NULL, poison to:
 *   0f 0b 0f 0b 0f 0b => ud2a (x3)
 */
static int init_or_livepatch apply_alt_calls(
    const struct alt_call *start, const struct alt_call *end)
{
    const struct alt_call *a;

    for ( a = start; a < end; a++ )
    {
        const uint8_t *dest;
        uint8_t buf[6], *orig = ALT_CALL_PTR(a);
        long disp;

        /* It's likely that this won't change, but check just to be safe. */
        BUILD_BUG_ON(ALT_CALL_LEN(a) != 6);

        if ( orig[0] != 0xff || orig[1] != 0x15 )
        {
            printk(XENLOG_ERR
                   "Altcall for %ps [%6ph] not CALL *RIPREL\n",
                   orig, orig);
            return -EINVAL;
        }

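        /*
         * Decode the indirect CALL's RIP-relative memory operand: the rel32
         * at orig+2 is relative to the end of the 6-byte instruction, so the
         * function pointer itself lives at orig + 6 + disp.
         */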
        disp = *(int32_t *)(orig + 2);
        dest = *(const void **)(orig + 6 + disp);

        if ( dest )
        {
            /*
             * When building for CET-IBT, all function pointer targets
             * should have an endbr64 instruction.
             *
             * If this is not the case, leave a warning because
             * something is probably wrong with the build. A CET-IBT
             * enabled system might have exploded already.
             *
             * Otherwise, skip the endbr64 instruction. This is a
             * marginal perf improvement which saves on instruction
             * decode bandwidth.
             */
            if ( IS_ENABLED(CONFIG_XEN_IBT) )
            {
                if ( is_endbr64(dest) )
                    dest += ENDBR64_LEN;
                else
                    printk(XENLOG_WARNING
                           "Altcall %ps dest %ps has no endbr64\n",
                           orig, dest);
            }

            disp = dest - (orig + 6);
            ASSERT(disp == (int32_t)disp);

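            /*
             * Emit "cs call disp32": the CS override prefix (0x2e) pads the
             * 5-byte direct CALL out to the full 6-byte patch site described
             * above.
             */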
            buf[0] = 0x2e;
            buf[1] = 0xe8;
            *(int32_t *)(buf + 2) = disp;
        }
        else
        {
            /*
             * The function pointer is still NULL. Seal the whole call, as
             * it's not used.
             */
            buf[0] = 0x0f;
            buf[1] = 0x0b;
            buf[2] = 0x0f;
            buf[3] = 0x0b;
            buf[4] = 0x0f;
            buf[5] = 0x0b;
        }

        text_poke(orig, buf, sizeof(buf));
    }

    return 0;
}

#ifdef CONFIG_LIVEPATCH
int apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
    return _apply_alternatives(start, end);
}

int livepatch_apply_alt_calls(const struct alt_call *start,
                              const struct alt_call *end)
{
    return apply_alt_calls(start, end);
}
#endif

#define ALT_INSNS (1U << 0)
#define ALT_CALLS (1U << 1)
static unsigned int __initdata alt_todo;
static unsigned int __initdata alt_done;

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern struct alt_call __alt_call_sites_start[], __alt_call_sites_end[];

/*
 * At boot time, we patch alternatives in NMI context. This means that the
 * active NMI-shadow will defer any further NMIs, removing the slim race
 * condition where an NMI hits while we are midway through patching some
 * instructions in the NMI path.
 */
static int __init cf_check nmi_apply_alternatives(
    const struct cpu_user_regs *regs, int cpu)
{
    /*
     * More than one NMI may occur between the two set_nmi_callback() calls
     * in _alternative_instructions().  We only need to apply alternatives
     * once.
     */
    if ( !(alt_done & alt_todo) )
    {
        int rc;

        /*
         * Relax perms on .text to be RWX, so we can modify them.
         *
         * This relaxes perms globally, but we run ahead of bringing APs
         * online, so only have our own TLB to worry about.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RWX);
        flush_local(FLUSH_TLB_GLOBAL);

        if ( alt_todo & ALT_INSNS )
        {
            rc = _apply_alternatives(__alt_instructions,
                                     __alt_instructions_end);
            if ( rc )
                panic("Unable to apply alternatives: %d\n", rc);
        }

        if ( alt_todo & ALT_CALLS )
        {
            rc = apply_alt_calls(__alt_call_sites_start, __alt_call_sites_end);
            if ( rc )
                panic("Unable to apply alternative calls: %d\n", rc);

            seal_endbr64();
        }

        /*
         * Reinstate perms on .text to be RX. This also cleans out the dirty
         * bits, which matters when CET Shstk is active.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RX);
        flush_local(FLUSH_TLB_GLOBAL);

        alt_done |= alt_todo;
    }

    return 1;
}

/*
 * This routine is called with local interrupts disabled and is used during
 * bootup.
 */
static void __init _alternative_instructions(unsigned int what)
{
    unsigned int i;
    nmi_callback_t *saved_nmi_callback;

    /*
     * Don't stop machine check exceptions while patching.
     * MCEs only happen when something got corrupted and in this
     * case we must do something about the corruption.
     * Ignoring it is worse than an unlikely patching race.
     * Also, machine checks tend to be broadcast, and if one CPU
     * goes into machine check the others follow quickly, so we don't
     * expect a machine check to cause undue problems during code
     * patching.
     */
    ASSERT(!local_irq_is_enabled());

    /* Set what operation to perform /before/ setting the callback. */
    alt_todo = what;
    barrier();

    /*
     * As soon as the callback is set up, the next NMI will trigger patching,
     * even one which arrives ahead of our explicit self-NMI below.
     */
    saved_nmi_callback = set_nmi_callback(nmi_apply_alternatives);

    /* Send ourselves an NMI to trigger the callback. */
    self_nmi();

    /*
     * In practice, the self_nmi() above appears to act synchronously.
     * However, synchronous behaviour is not architecturally guaranteed. To
     * cover the (hopefully never) async case, poll alt_done for up to one
     * second.
     */
    for ( i = 0; !(ACCESS_ONCE(alt_done) & alt_todo) && i < 1000; ++i )
        mdelay(1);

    if ( !(ACCESS_ONCE(alt_done) & alt_todo) )
        panic("Timed out waiting for alternatives self-NMI to hit\n");

    set_nmi_callback(saved_nmi_callback);
}

void __init alternative_instructions(void)
{
    arch_init_ideal_nops();
    _alternative_instructions(ALT_INSNS);
}

void __init boot_apply_alt_calls(void)
{
    local_irq_disable();
    _alternative_instructions(ALT_CALLS);
    local_irq_enable();
}