/* SPDX-License-Identifier: GPL-2.0-or-later */
/******************************************************************************
 * alternative.c
 */

#include <xen/delay.h>
#include <xen/types.h>
#include <asm/apic.h>
#include <asm/endbr.h>
#include <asm/processor.h>
#include <asm/alternative.h>
#include <xen/init.h>
#include <asm/setup.h>
#include <asm/stubs.h>
#include <asm/system.h>
#include <asm/traps.h>
#include <asm/nmi.h>
#include <asm/nops.h>
#include <xen/livepatch.h>

#define MAX_PATCH_LEN (255-1)

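/*
 * k8nops[] and p6nops[] below are the 1- to 9-byte NOP encodings concatenated
 * back to back; the corresponding k8_nops[]/p6_nops[] tables point entry i at
 * the start of the i-byte form, i.e. at offset 1 + 2 + ... + (i - 1).
 */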
#ifdef K8_NOP1
static const unsigned char k8nops[] init_or_livepatch_const = {
    K8_NOP1,
    K8_NOP2,
    K8_NOP3,
    K8_NOP4,
    K8_NOP5,
    K8_NOP6,
    K8_NOP7,
    K8_NOP8,
    K8_NOP9,
};
static const unsigned char * const k8_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    k8nops,
    k8nops + 1,
    k8nops + 1 + 2,
    k8nops + 1 + 2 + 3,
    k8nops + 1 + 2 + 3 + 4,
    k8nops + 1 + 2 + 3 + 4 + 5,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    k8nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

#ifdef P6_NOP1
static const unsigned char p6nops[] init_or_livepatch_const = {
    P6_NOP1,
    P6_NOP2,
    P6_NOP3,
    P6_NOP4,
    P6_NOP5,
    P6_NOP6,
    P6_NOP7,
    P6_NOP8,
    P6_NOP9,
};
static const unsigned char * const p6_nops[ASM_NOP_MAX+1] init_or_livepatch_constrel = {
    NULL,
    p6nops,
    p6nops + 1,
    p6nops + 1 + 2,
    p6nops + 1 + 2 + 3,
    p6nops + 1 + 2 + 3 + 4,
    p6nops + 1 + 2 + 3 + 4 + 5,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7,
    p6nops + 1 + 2 + 3 + 4 + 5 + 6 + 7 + 8,
};
#endif

static const unsigned char * const *ideal_nops init_or_livepatch_data = p6_nops;

#ifdef HAVE_AS_NOPS_DIRECTIVE

/* Nops in .init.rodata to compare against the runtime ideal nops. */
asm ( ".pushsection .init.rodata, \"a\", @progbits\n\t"
      "toolchain_nops: .nops " __stringify(ASM_NOP_MAX) "\n\t"
      ".popsection\n\t");
extern char toolchain_nops[ASM_NOP_MAX];
static bool init_or_livepatch_read_mostly toolchain_nops_are_ideal;

#else
# define toolchain_nops_are_ideal false
#endif

static void __init arch_init_ideal_nops(void)
{
    switch ( boot_cpu_data.x86_vendor )
    {
    case X86_VENDOR_INTEL:
        /*
         * Due to a decoder implementation quirk, some specific Intel CPUs
         * actually perform better with the "k8_nops" than with the SDM-
         * recommended NOPs.
         */
        if ( boot_cpu_data.x86 != 6 )
            break;

        switch ( boot_cpu_data.x86_model )
        {
        case 0x0f ... 0x1b:
        case 0x1d ... 0x25:
        case 0x28 ... 0x2f:
            ideal_nops = k8_nops;
            break;
        }
        break;

    case X86_VENDOR_AMD:
        if ( boot_cpu_data.x86 <= 0xf )
            ideal_nops = k8_nops;
        break;
    }

#ifdef HAVE_AS_NOPS_DIRECTIVE
    if ( memcmp(ideal_nops[ASM_NOP_MAX], toolchain_nops, ASM_NOP_MAX) == 0 )
        toolchain_nops_are_ideal = true;
#endif
}

/* Use this to add nops to a buffer, then text_poke the whole buffer. */
void init_or_livepatch add_nops(void *insns, unsigned int len)
{
    while ( len > 0 )
    {
        unsigned int noplen = len;
        if ( noplen > ASM_NOP_MAX )
            noplen = ASM_NOP_MAX;
        memcpy(insns, ideal_nops[noplen], noplen);
        insns += noplen;
        len -= noplen;
    }
}

void nocall __x86_return_thunk(void);

/*
 * Place a return at @ptr.  @ptr must be in the writable alias of a stub.
 *
 * When CONFIG_RETURN_THUNK is active, this may be a JMP __x86_return_thunk
 * instead, depending on the safety of @ptr with respect to Indirect Target
 * Selection.
 *
 * Returns the next position to write into the stub.
 */
void *place_ret(void *ptr)
{
    unsigned long addr = (unsigned long)ptr;
    uint8_t *p = ptr;

    /*
     * When Return Thunks are used, if a RET would be unsafe at this location
     * with respect to Indirect Target Selection (i.e. if addr is in the first
     * half of a cacheline), insert a JMP __x86_return_thunk instead.
     *
     * The displacement needs to be relative to the executable alias of the
     * stub, not to @ptr which is the writeable alias.
     */
    if ( IS_ENABLED(CONFIG_RETURN_THUNK) && !(addr & 0x20) )
    {
        long stub_va = (this_cpu(stubs.addr) & PAGE_MASK) + (addr & ~PAGE_MASK);
        long disp = (long)__x86_return_thunk - (stub_va + 5);

        BUG_ON((int32_t)disp != disp);

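        /* jmp rel32 (0xe9 disp32) to __x86_return_thunk. */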
        *p++ = 0xe9;
        *(int32_t *)p = disp;
        p += 4;
    }
    else
    {
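        /* Plain ret (0xc3). */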
        *p++ = 0xc3;
    }

    return p;
}

/*
 * text_poke - Update instructions on a live kernel or non-executed code.
 * @addr: address to modify
 * @opcode: source of the copy
 * @len: length to copy
 *
 * When you use this code to patch more than one byte of an instruction,
 * you need to make sure that other CPUs cannot execute the patched code in
 * parallel.  No thread may be preempted in the middle of these instructions,
 * and on the local CPU you need to be protected against NMI or MCE handlers
 * seeing an inconsistent instruction while you patch.
 *
 * You should run this with interrupts disabled or on code that is not
 * executing.
 *
 * While the SDM continues to suggest using "noinline" would be sufficient, it
 * may not be, e.g. due to errata.  Issue a serializing insn afterwards,
 * unless this is for live-patching, where we modify code before it goes
 * live; pick one which is unlikely to be intercepted by a hypervisor, in
 * case we run virtualized ourselves.
 */
static void init_or_livepatch
text_poke(void *addr, const void *opcode, size_t len)
{
    memcpy(addr, opcode, len);
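    /*
     * Live-patching (system_state >= SYS_STATE_active) modifies code before
     * it goes live, so the serializing write is only needed during boot.
     */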
    if ( system_state < SYS_STATE_active )
        asm volatile ( "mov %0, %%cr2" :: "r" (0L) : "memory" );
}

extern void *const __initdata_cf_clobber_start[];
extern void *const __initdata_cf_clobber_end[];

/*
 * In CET-IBT enabled builds, clobber endbr64 instructions after altcall has
 * finished optimising all indirect branches to direct ones.
 */
static void __init seal_endbr64(void)
{
    void *const *val;
    unsigned int clobbered = 0;

    if ( !cpu_has_xen_ibt )
        return;

    /*
     * This is some minor structure (ab)use.  We walk the entire contents
     * of .init.{ro,}data.cf_clobber as if it were an array of pointers.
     *
     * If the pointer points into .text, and at an endbr64 instruction,
     * nop out the endbr64.  This causes the pointer to no longer be a
     * legal indirect branch target under CET-IBT.  This is a
     * defence-in-depth measure, to reduce the options available to an
     * adversary who has managed to hijack a function pointer.
     */
    for ( val = __initdata_cf_clobber_start;
          val < __initdata_cf_clobber_end;
          val++ )
    {
        void *ptr = *val;

        if ( !is_kernel_text(ptr) || !is_endbr64(ptr) )
            continue;

        place_endbr64_poison(ptr);
        clobbered++;
    }

    printk("altcall: Optimised away %u endbr64 instructions\n", clobbered);
}

/*
 * Replace instructions with better alternatives for this CPU type.
 * This runs before SMP is initialized to avoid SMP problems with
 * self-modifying code.  This implies that asymmetric systems where
 * APs have fewer capabilities than the boot processor are not handled.
 * Tough.  Make sure you disable such features by hand.
 */
static int init_or_livepatch _apply_alternatives(struct alt_instr *start,
                                                 struct alt_instr *end)
{
    struct alt_instr *a, *base;

    printk(KERN_INFO "alt table %p -> %p\n", start, end);

    /*
     * The scan order should be from start to end.  An alternative scanned
     * later may overwrite one scanned earlier.  Some code (e.g.
     * ALTERNATIVE_2()) relies on this order of patching.
     *
     * So be careful if you want to change the scan order to any other
     * order.
     */
    for ( a = base = start; a < end; a++ )
    {
        uint8_t *orig = ALT_ORIG_PTR(a);
        uint8_t *repl = ALT_REPL_PTR(a);
        uint8_t buf[MAX_PATCH_LEN];
        unsigned int total_len = a->orig_len + a->pad_len;
        unsigned int feat = a->cpuid & ~ALT_FLAG_NOT;
        bool inv = a->cpuid & ALT_FLAG_NOT, replace;

        if ( a->repl_len > total_len )
        {
            printk(XENLOG_ERR
                   "Alt for %ps, replacement size %#x larger than origin %#x\n",
                   ALT_ORIG_PTR(a), a->repl_len, total_len);
            return -ENOSPC;
        }

        if ( total_len > sizeof(buf) )
        {
            printk(XENLOG_ERR
                   "Alt for %ps, origin size %#x bigger than buffer %#zx\n",
                   ALT_ORIG_PTR(a), total_len, sizeof(buf));
            return -ENOSPC;
        }

        if ( feat >= NCAPINTS * 32 )
        {
            printk(XENLOG_ERR
                   "Alt for %ps, feature %#x outside of featureset range %#x\n",
                   ALT_ORIG_PTR(a), feat, NCAPINTS * 32);
            return -ERANGE;
        }

        /*
         * Detect sequences of alt_instr's patching the same origin site, and
         * keep base pointing at the first alt_instr entry.  This is so we can
         * refer to a single ->priv field for some of our patching decisions,
         * in particular the NOP optimisation.  We deliberately use the
         * alt_instr itself rather than a local variable in case we end up
         * making multiple passes.
         *
         * ->priv being nonzero means that the origin site has already been
         * modified, and we shouldn't try to optimise the nops again.
         */
        if ( ALT_ORIG_PTR(base) != orig )
            base = a;

        /* Skip patch sites already handled during the first pass. */
        if ( a->priv )
            continue;

        /*
         * Should a replacement be performed?  Most replacements have positive
         * polarity, but we support negative polarity too.
         */
        replace = boot_cpu_has(feat) ^ inv;

        /* If there is no replacement to make, see about optimising the nops. */
        if ( !replace )
        {
            /* Origin site already touched?  Don't nop anything. */
            if ( base->priv )
                continue;

            a->priv = 1;

            /* Nothing useful to do? */
            if ( toolchain_nops_are_ideal || a->pad_len <= 1 )
                continue;

            add_nops(buf, a->pad_len);
            text_poke(orig + a->orig_len, buf, a->pad_len);
            continue;
        }

        memcpy(buf, repl, a->repl_len);

        /* 0xe8/0xe9 are relative branches; fix the offset. */
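        /*
         * (*buf & 0xfe) == 0xe8 matches both 0xe8 (call rel32) and 0xe9
         * (jmp rel32).  Moving a rel32 instruction from @repl to @orig keeps
         * the absolute target unchanged, so the displacement grows by
         * (repl - orig).  The second case below handles a replacement ending
         * in a tail jmp to __x86_return_thunk.
         */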
        if ( a->repl_len >= 5 && (*buf & 0xfe) == 0xe8 )
            *(int32_t *)(buf + 1) += repl - orig;
        else if ( IS_ENABLED(CONFIG_RETURN_THUNK) &&
                  a->repl_len > 5 && buf[a->repl_len - 5] == 0xe9 &&
                  ((long)repl + a->repl_len +
                   *(int32_t *)(buf + a->repl_len - 4) ==
                   (long)__x86_return_thunk) )
            *(int32_t *)(buf + a->repl_len - 4) += repl - orig;

        a->priv = 1;

        add_nops(buf + a->repl_len, total_len - a->repl_len);
        text_poke(orig, buf, total_len);
    }

    return 0;
}

/*
 * At build time, alternative calls are emitted as:
 *   ff 15 xx xx xx xx  =>  call *disp32(%rip)
 *
 * During boot, we devirtualise by editing to:
 *   2e e8 xx xx xx xx  =>  cs call disp32
 *
 * or, if the function pointer is still NULL, poison to:
 *   0f 0b 0f 0b 0f 0b  =>  ud2a (x3)
 */
static int init_or_livepatch apply_alt_calls(
    const struct alt_call *start, const struct alt_call *end)
{
    const struct alt_call *a;

    for ( a = start; a < end; a++ )
    {
        const uint8_t *dest;
        uint8_t buf[6], *orig = ALT_CALL_PTR(a);
        long disp;

        /* It's likely that this won't change, but check just to be safe. */
        BUILD_BUG_ON(ALT_CALL_LEN(a) != 6);

        if ( orig[0] != 0xff || orig[1] != 0x15 )
        {
            printk(XENLOG_ERR
                   "Altcall for %ps [%6ph] not CALL *RIPREL\n",
                   orig, orig);
            return -EINVAL;
        }

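        /*
         * The CALL's RIP-relative operand addresses a function pointer
         * variable; read that pointer to find the eventual destination.
         */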
        disp = *(int32_t *)(orig + 2);
        dest = *(const void **)(orig + 6 + disp);

        if ( dest )
        {
            /*
             * When building for CET-IBT, all function pointer targets
             * should have an endbr64 instruction.
             *
             * If this is not the case, leave a warning because
             * something is probably wrong with the build.  A CET-IBT
             * enabled system might have exploded already.
             *
             * Otherwise, skip the endbr64 instruction.  This is a
             * marginal perf improvement which saves on instruction
             * decode bandwidth.
             */
            if ( IS_ENABLED(CONFIG_XEN_IBT) )
            {
                if ( is_endbr64(dest) )
                    dest += ENDBR64_LEN;
                else
                    printk(XENLOG_WARNING
                           "Altcall %ps dest %ps has no endbr64\n",
                           orig, dest);
            }

            disp = dest - (orig + 6);
            ASSERT(disp == (int32_t)disp);

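            /* Emit "cs call disp32" (2e e8), as described above. */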
            buf[0] = 0x2e;
            buf[1] = 0xe8;
            *(int32_t *)(buf + 2) = disp;
        }
        else
        {
            /*
             * The function pointer is still NULL.  Seal the whole call, as
             * it's not used.
             */
            buf[0] = 0x0f;
            buf[1] = 0x0b;
            buf[2] = 0x0f;
            buf[3] = 0x0b;
            buf[4] = 0x0f;
            buf[5] = 0x0b;
        }

        text_poke(orig, buf, sizeof(buf));
    }

    return 0;
}

#ifdef CONFIG_LIVEPATCH
int apply_alternatives(struct alt_instr *start, struct alt_instr *end)
{
    return _apply_alternatives(start, end);
}

int livepatch_apply_alt_calls(const struct alt_call *start,
                              const struct alt_call *end)
{
    return apply_alt_calls(start, end);
}
#endif

#define ALT_INSNS (1U << 0)
#define ALT_CALLS (1U << 1)
static unsigned int __initdata alt_todo;
static unsigned int __initdata alt_done;

extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern struct alt_call __alt_call_sites_start[], __alt_call_sites_end[];

/*
 * At boot time, we patch alternatives in NMI context.  This means that the
 * active NMI-shadow will defer any further NMIs, removing the slim race
 * condition where an NMI hits while we are midway through patching some
 * instructions in the NMI path.
 */
static int __init cf_check nmi_apply_alternatives(
    const struct cpu_user_regs *regs, int cpu)
{
    /*
     * More than one NMI may occur between the two set_nmi_callback() calls
     * below.  We only need to apply alternatives once.
     */
    if ( !(alt_done & alt_todo) )
    {
        int rc;

        /*
         * Relax perms on .text to be RWX, so we can modify them.
         *
         * This relaxes perms globally, but we run ahead of bringing APs
         * online, so only have our own TLB to worry about.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RWX);
        flush_local(FLUSH_TLB_GLOBAL);

        if ( alt_todo & ALT_INSNS )
        {
            rc = _apply_alternatives(__alt_instructions,
                                     __alt_instructions_end);
            if ( rc )
                panic("Unable to apply alternatives: %d\n", rc);
        }

        if ( alt_todo & ALT_CALLS )
        {
            rc = apply_alt_calls(__alt_call_sites_start, __alt_call_sites_end);
            if ( rc )
                panic("Unable to apply alternative calls: %d\n", rc);

            seal_endbr64();
        }

        /*
         * Reinstate perms on .text to be RX.  This also cleans out the dirty
         * bits, which matters when CET Shstk is active.
         */
        modify_xen_mappings_lite(XEN_VIRT_START + MB(2),
                                 (unsigned long)&__2M_text_end,
                                 PAGE_HYPERVISOR_RX);
        flush_local(FLUSH_TLB_GLOBAL);

        alt_done |= alt_todo;
    }

    return 1;
}

/*
 * This routine is called with local interrupts disabled and is used during
 * bootup.
 */
static void __init _alternative_instructions(unsigned int what)
{
    unsigned int i;
    nmi_callback_t *saved_nmi_callback;

    /*
     * Don't stop machine check exceptions while patching.
     * MCEs only happen when something got corrupted, and in that case
     * we must do something about the corruption.
     * Ignoring it is worse than an unlikely patching race.
     * Also, machine checks tend to be broadcast, and if one CPU
     * goes into machine check the others follow quickly, so we don't
     * expect a machine check to cause undue problems during code
     * patching.
     */
    ASSERT(!local_irq_is_enabled());

    /* Set what operation to perform /before/ setting the callback. */
    alt_todo = what;
    barrier();

    /*
     * As soon as the callback is set up, the next NMI will trigger patching,
     * even an NMI ahead of our explicit self-NMI.
     */
    saved_nmi_callback = set_nmi_callback(nmi_apply_alternatives);

    /* Send ourselves an NMI to trigger the callback. */
    self_nmi();

    /*
     * In practice, the self_nmi() above appears to act synchronously.
     * However, synchronous behaviour is not architecturally guaranteed.  To
     * cover the (hopefully never) async case, poll alt_done for up to one
     * second.
     */
    for ( i = 0; !(ACCESS_ONCE(alt_done) & alt_todo) && i < 1000; ++i )
        mdelay(1);

    if ( !(ACCESS_ONCE(alt_done) & alt_todo) )
        panic("Timed out waiting for alternatives self-NMI to hit\n");

    set_nmi_callback(saved_nmi_callback);
}

void __init alternative_instructions(void)
{
    arch_init_ideal_nops();
    _alternative_instructions(ALT_INSNS);
}

void __init boot_apply_alt_calls(void)
{
    local_irq_disable();
    _alternative_instructions(ALT_CALLS);
    local_irq_enable();
}