/******************************************************************************
 * kexec.c - Architecture independent kexec code for Xen
 *
 * Xen port written by:
 * - Simon 'Horms' Horman <horms@verge.net.au>
 * - Magnus Damm <magnus@valinux.co.jp>
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/acpi.h>
#include <xen/ctype.h>
#include <xen/elfcore.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/param.h>
#include <xen/watchdog.h>
#include <xen/sched.h>
#include <xen/types.h>
#include <xen/hypercall.h>
#include <xen/kexec.h>
#include <xen/keyhandler.h>
#include <public/kexec.h>
#include <xen/cpumask.h>
#include <asm/atomic.h>
#include <xen/spinlock.h>
#include <xen/version.h>
#include <xen/console.h>
#include <xen/kimage.h>
#include <public/elfnote.h>
#include <xsm/xsm.h>
#include <xen/cpu.h>
#ifdef CONFIG_COMPAT
#include <compat/kexec.h>
#endif

bool __read_mostly kexecing;

/* Memory regions to store the per cpu register state etc. on a crash. */
typedef struct { Elf_Note * start; size_t size; } crash_note_range_t;
static crash_note_range_t * crash_notes;

/* Lock to prevent race conditions when allocating the crash note buffers.
 * It also serves to protect calls to alloc_from_crash_heap when allocating
 * crash note buffers in lower memory. */
static DEFINE_SPINLOCK(crash_notes_lock);

static Elf_Note *xen_crash_note;

static cpumask_t crash_saved_cpus;

static struct kexec_image *kexec_image[KEXEC_IMAGE_NR];

#define KEXEC_FLAG_DEFAULT_POS   (KEXEC_IMAGE_NR + 0)
#define KEXEC_FLAG_CRASH_POS     (KEXEC_IMAGE_NR + 1)
#define KEXEC_FLAG_IN_PROGRESS   (KEXEC_IMAGE_NR + 2)
#define KEXEC_FLAG_IN_HYPERCALL  (KEXEC_IMAGE_NR + 3)

static unsigned long kexec_flags = 0; /* the lowest bits are for KEXEC_IMAGE... */
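
/*
 * Illustrative sketch of the bit layout (not normative; the authoritative
 * KEXEC_IMAGE_* values live in public/kexec.h): the low KEXEC_IMAGE_NR bits
 * of kexec_flags track which image slots currently hold a loaded image, and
 * the flags defined above sit immediately after them:
 *
 *   KEXEC_FLAG_DEFAULT_POS  - which of the two default slots is current
 *   KEXEC_FLAG_CRASH_POS    - which of the two crash slots is current
 *   KEXEC_FLAG_IN_PROGRESS  - a kexec is being executed
 *   KEXEC_FLAG_IN_HYPERCALL - a kexec hypercall is in flight
 */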

static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
static size_t vmcoreinfo_size = 0;

xen_kexec_reserve_t kexec_crash_area;
paddr_t __initdata kexec_crash_area_limit = ~(paddr_t)0;
static struct {
    u64 start, end;
    unsigned long size;
} ranges[16] __initdata;

/* Low crashinfo mode.  Start as INVALID so several codepaths can set up
 * defaults without needing to know the state of the others. */
enum low_crashinfo low_crashinfo_mode = LOW_CRASHINFO_INVALID;

/* This value is only considered if low_crashinfo_mode is set to MIN or ALL, so
 * setting a default here is safe. Default to 4GB.  This is because the current
 * KEXEC_CMD_get_range compat hypercall truncates 64bit pointers to 32 bits. The
 * typical use case for crashinfo_maxaddr will be for 64bit Xen with 32bit dom0
 * and 32bit crash kernel. */
static paddr_t __initdata crashinfo_maxaddr = 4ULL << 30;

/* = log base 2 of crashinfo_maxaddr after checking for sanity. Default to
 * larger than the entire physical address space. */
unsigned int __initdata crashinfo_maxaddr_bits = 64;

/* Pointers to keep track of the crash heap region. */
static void *crash_heap_current = NULL, *crash_heap_end = NULL;

/*
 * Parse command lines in the format
 *
 *   crashkernel=<ramsize-range>:<size>[,...][{@,<,below=}<address>]
 *
 * with <ramsize-range> being of form
 *
 *   <start>-[<end>]
 *
 * as well as the legacy ones in the format
 *
 *   crashkernel=<size>[{@,<}<address>]
 *   crashkernel=<size>,below=<address>
 *
 * < and below are synonymous, the latter being useful for grub2 systems
 * which would otherwise require escaping of the < option.
 */
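
/*
 * Illustrative examples only (sizes and addresses are made up, not
 * recommendations) of command lines accepted by the grammar above:
 *
 *   crashkernel=256M                - always reserve 256MiB
 *   crashkernel=256M@0x40000000     - 256MiB at a fixed start address
 *   crashkernel=1G-4G:64M,4G-:256M  - 64MiB if 1-4GiB of RAM, 256MiB above
 *   crashkernel=256M,below=4G       - 256MiB placed below 4GiB
 */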
static int __init cf_check parse_crashkernel(const char *str)
{
    const char *cur;
    int rc = 0;

    if ( strchr(str, ':') )
    {
        unsigned int idx = 0;

        do {
            if ( idx >= ARRAY_SIZE(ranges) )
            {
                printk(XENLOG_WARNING "crashkernel: too many ranges\n");
                cur = NULL;
                str = strpbrk(str, "@,<");
                rc = -EINVAL;
                break;
            }

            ranges[idx].start = parse_size_and_unit(cur = str + !!idx, &str);
            if ( cur == str )
                break;

            if ( *str != '-' )
            {
                printk(XENLOG_WARNING "crashkernel: '-' expected\n");
                rc = -EINVAL;
                break;
            }

            if ( *++str != ':' )
            {
                ranges[idx].end = parse_size_and_unit(cur = str, &str);
                if ( cur == str )
                    break;
                if ( ranges[idx].end <= ranges[idx].start )
                {
                    printk(XENLOG_WARNING "crashkernel: end <= start\n");
                    rc = -EINVAL;
                    break;
                }
            }
            else
                ranges[idx].end = -1;

            if ( *str != ':' )
            {
                printk(XENLOG_WARNING "crashkernel: ':' expected\n");
                rc = -EINVAL;
                break;
            }

            ranges[idx].size = parse_size_and_unit(cur = str + 1, &str);
            if ( cur == str )
                break;

            ++idx;
        } while ( *str == ',' );
        if ( idx < ARRAY_SIZE(ranges) )
            ranges[idx].size = 0;
    }
    else
        kexec_crash_area.size = parse_size_and_unit(cur = str, &str);
    if ( cur != str )
    {
        if ( *str == '@' )
            kexec_crash_area.start = parse_size_and_unit(cur = str + 1, &str);
        else if ( *str == '<' )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 1, &str);
        else if ( !strncmp(str, ",below=", 7) )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 7, &str);
        else if ( *str )
        {
            printk(XENLOG_WARNING "crashkernel: '%s' ignored\n", str);
            rc = -EINVAL;
        }
    }
    if ( cur && cur == str )
    {
        printk(XENLOG_WARNING "crashkernel: memory value expected\n");
        rc = -EINVAL;
    }

    return rc;
}
custom_param("crashkernel", parse_crashkernel);

/* Parse command lines in the format:
 *
 *   low_crashinfo=[none,min,all]
 *
 * - none disables the low allocation of crash info.
 * - min will allocate enough low information for the crash kernel to be able
 *       to extract the hypervisor and dom0 message ring buffers.
 * - all will allocate additional structures such as domain and vcpu structs
 *       low so the crash kernel can perform an extended analysis of state.
 */
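
/*
 * For illustration (a hypothetical command line):
 *
 *   low_crashinfo=min crashinfo_maxaddr=4G
 *
 * keeps the crash note allocations below 4GiB so a 32bit crash kernel can
 * reach them; a bare "low_crashinfo" with no value also selects min.
 */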
static int __init cf_check parse_low_crashinfo(const char *str)
{
    if ( !strlen(str) )
        /* default to min if user just specifies "low_crashinfo" */
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "none") )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;
    else if ( !strcmp(str, "min") )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "all") )
        low_crashinfo_mode = LOW_CRASHINFO_ALL;
    else
    {
        printk("Unknown low_crashinfo parameter '%s'.  Defaulting to min.\n", str);
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
        return -EINVAL;
    }

    return 0;
}
custom_param("low_crashinfo", parse_low_crashinfo);

/* Parse command lines in the format:
 *
 *   crashinfo_maxaddr=<addr>
 *
 * <addr> will be rounded down to the nearest power of two.  Defaults to 4G.
 */
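
/*
 * A sketch of the rounding behaviour, as implemented in
 * kexec_early_calculations() below: the value is reduced to a bit width
 * with fls64(), so a hypothetical "crashinfo_maxaddr=6G" yields
 * crashinfo_maxaddr_bits == 32, i.e. an effective limit of 4GiB.
 */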
static int __init cf_check parse_crashinfo_maxaddr(const char *str)
{
    u64 addr;
    const char *q;

    /* if low_crashinfo_mode is unset, default to min. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;

    if ( (addr = parse_size_and_unit(str, &q)) )
        crashinfo_maxaddr = addr;
    else
    {
        printk("Unable to parse crashinfo_maxaddr. Defaulting to %"PRIpaddr"\n",
               crashinfo_maxaddr);
        return -EINVAL;
    }

    return *q ? -EINVAL : 0;
}
custom_param("crashinfo_maxaddr", parse_crashinfo_maxaddr);

void __init set_kexec_crash_area_size(u64 system_ram)
{
    unsigned int idx;

    for ( idx = 0; idx < ARRAY_SIZE(ranges) && !kexec_crash_area.size; ++idx )
    {
        if ( !ranges[idx].size )
            break;

        if ( ranges[idx].size >= system_ram )
        {
            printk(XENLOG_WARNING "crashkernel: invalid size\n");
            continue;
        }

        if ( ranges[idx].start <= system_ram && ranges[idx].end > system_ram )
            kexec_crash_area.size = ranges[idx].size;
    }
}

/*
 * Only allow one cpu to continue on the crash path, forcing others to spin.
 * Racing on the crash path from here will end in misery.  If we reenter,
 * something has gone very wrong and retrying will (almost certainly) be
 * futile.  Return up to our nested panic() to try and reboot.
 *
 * This is noinline to make it obvious in stack traces which cpus have lost
 * the race (as opposed to being somewhere in kexec_common_shutdown())
 */
static int noinline one_cpu_only(void)
{
    static unsigned int crashing_cpu = -1;
    unsigned int cpu = smp_processor_id();

    if ( cmpxchg(&crashing_cpu, -1, cpu) != -1 )
    {
        /* Not the first entry into one_cpu_only(). */
        if ( crashing_cpu == cpu )
        {
            printk("Reentered the crash path.  Something is very broken\n");
            return -EBUSY;
        }

        /*
         * Another cpu has beaten us to this point.  Wait here patiently for
         * it to kill us.
         */
        for ( ; ; )
            halt();
    }

    set_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags);
    printk("Executing kexec image on cpu%u\n", cpu);

    return 0;
}

/* Save the registers in the per-cpu crash note buffer. */
void kexec_crash_save_cpu(void)
{
    int cpu = smp_processor_id();
    Elf_Note *note;
    ELF_Prstatus *prstatus;
    crash_xen_core_t *xencore;

    BUG_ON( ! crash_notes );

    if ( cpumask_test_and_set_cpu(cpu, &crash_saved_cpus) )
        return;

    note = crash_notes[cpu].start;

    prstatus = (ELF_Prstatus *)ELFNOTE_DESC(note);

    note = ELFNOTE_NEXT(note);
    xencore = (crash_xen_core_t *)ELFNOTE_DESC(note);

    elf_core_save_regs(&prstatus->pr_reg, xencore);
}

/* Set up the single Xen-specific-info crash note. */
crash_xen_info_t *kexec_crash_save_info(void)
{
    int cpu = smp_processor_id();
    crash_xen_info_t info;
    crash_xen_info_t *out = (crash_xen_info_t *)ELFNOTE_DESC(xen_crash_note);

    BUG_ON(!cpumask_test_and_set_cpu(cpu, &crash_saved_cpus));

    memset(&info, 0, sizeof(info));
    info.xen_major_version = xen_major_version();
    info.xen_minor_version = xen_minor_version();
    info.xen_extra_version = __pa(xen_extra_version());
    info.xen_changeset = __pa(xen_changeset());
    info.xen_compiler = __pa(xen_compiler());
    info.xen_compile_date = __pa(xen_compile_date());
    info.xen_compile_time = __pa(xen_compile_time());
    info.tainted = tainted;

    /* Copy from guaranteed-aligned local copy to possibly-unaligned dest. */
    memcpy(out, &info, sizeof(info));

    return out;
}

static int kexec_common_shutdown(void)
{
    int ret;

    ret = one_cpu_only();
    if ( ret )
        return ret;

    watchdog_disable();
    console_start_sync();
    spin_debug_disable();
    acpi_dmar_reinstate();

    return 0;
}

void kexec_crash(enum crash_reason reason)
{
    int pos;

    keyhandler_crash_action(reason);

    pos = (test_bit(KEXEC_FLAG_CRASH_POS, &kexec_flags) != 0);
    if ( !test_bit(KEXEC_IMAGE_CRASH_BASE + pos, &kexec_flags) )
        return;

    kexecing = true;

    if ( kexec_common_shutdown() != 0 )
        return;

    kexec_crash_save_cpu();
    machine_crash_shutdown();
    machine_kexec(kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]);

    BUG();
}

static long cf_check kexec_reboot(void *_image)
{
    struct kexec_image *image = _image;

    kexecing = true;

    kexec_common_shutdown();
    machine_reboot_kexec(image);

    BUG();
    return 0;
}

static void cf_check do_crashdump_trigger(unsigned char key)
{
    printk("'%c' pressed -> triggering crashdump\n", key);
    kexec_crash(CRASHREASON_DEBUGKEY);
    printk(" * no crash kernel loaded!\n");
}

static void setup_note(Elf_Note *n, const char *name, int type, int descsz)
{
    int l = strlen(name) + 1;
    strlcpy(ELFNOTE_NAME(n), name, l);
    n->namesz = l;
    n->descsz = descsz;
    n->type = type;
}

static size_t sizeof_note(const char *name, int descsz)
{
    return (sizeof(Elf_Note) +
            ELFNOTE_ALIGN(strlen(name)+1) +
            ELFNOTE_ALIGN(descsz));
}
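
/*
 * A sketch of the note layout sizeof_note() accounts for (the standard
 * ELF note format, assuming ELFNOTE_ALIGN() rounds up to the usual 4-byte
 * boundary):
 *
 *   +-----------------------------------+
 *   | Elf_Note { namesz, descsz, type } |
 *   | name (namesz bytes, then padding) |
 *   | desc (descsz bytes, then padding) |
 *   +-----------------------------------+
 *
 * e.g. sizeof_note("CORE", n) = sizeof(Elf_Note) + 8 + ELFNOTE_ALIGN(n),
 * since "CORE" plus its NUL is 5 bytes, padded to 8.
 */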

static size_t sizeof_cpu_notes(const unsigned long cpu)
{
    /* All CPUs present a PRSTATUS and crash_xen_core note. */
    size_t bytes =
        sizeof_note("CORE", sizeof(ELF_Prstatus)) +
        sizeof_note("Xen", sizeof(crash_xen_core_t));

    /* CPU0 also presents the crash_xen_info note. */
    if ( ! cpu )
        bytes += sizeof_note("Xen", sizeof(crash_xen_info_t));

    return bytes;
}

/* Allocate the requested number of bytes from the previously allocated
 * crash heap if the user has requested that crash notes be allocated
 * in lower memory.  There is currently no case where the crash notes
 * should be free()'d. */
static void *alloc_from_crash_heap(const size_t bytes)
{
    void *ret;
    if ( crash_heap_current + bytes > crash_heap_end )
        return NULL;
    ret = crash_heap_current;
    crash_heap_current += bytes;
    return ret;
}
/* Allocate a crash note buffer for a newly onlined cpu. */
static int kexec_init_cpu_notes(const unsigned long cpu)
{
    Elf_Note *note = NULL;
    int ret = 0;
    int nr_bytes = 0;

    BUG_ON( cpu >= nr_cpu_ids || ! crash_notes );

    /* If already allocated, nothing to do. */
    if ( crash_notes[cpu].start )
        return ret;

    nr_bytes = sizeof_cpu_notes(cpu);

    /* If we don't care about the position of allocation, malloc. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_NONE )
        note = xzalloc_bytes(nr_bytes);

    /* Protect the write into crash_notes[] with a spinlock, as this function
     * is on a hotplug path and a hypercall path. */
    spin_lock(&crash_notes_lock);

    /* If we are racing with another CPU and it has beaten us, give up
     * gracefully. */
    if ( crash_notes[cpu].start )
    {
        spin_unlock(&crash_notes_lock);
        /* Always return ok, because whether we successfully allocated or not,
         * another CPU has successfully allocated. */
        xfree(note);
    }
    else
    {
        /* If we care about memory position, alloc from the crash heap,
         * also protected by the crash_notes_lock. */
        if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
            note = alloc_from_crash_heap(nr_bytes);

        crash_notes[cpu].start = note;
        crash_notes[cpu].size = nr_bytes;
        spin_unlock(&crash_notes_lock);

        /* If the allocation failed, and another CPU did not beat us, give
         * up with ENOMEM. */
        if ( ! note )
            ret = -ENOMEM;
        /* else all is good so let's set up the notes. */
        else
        {
            /* Set up CORE note. */
            setup_note(note, "CORE", NT_PRSTATUS, sizeof(ELF_Prstatus));
            note = ELFNOTE_NEXT(note);

            /* Set up Xen CORE note. */
            setup_note(note, "Xen", XEN_ELFNOTE_CRASH_REGS,
                       sizeof(crash_xen_core_t));

            if ( ! cpu )
            {
                /* Set up Xen Crash Info note. */
                xen_crash_note = note = ELFNOTE_NEXT(note);
                setup_note(note, "Xen", XEN_ELFNOTE_CRASH_INFO,
                           sizeof(crash_xen_info_t));
            }
        }
    }

    return ret;
}

static int cf_check cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned long cpu = (unsigned long)hcpu;

    /* Only hook on CPU_UP_PREPARE because once a crash_note has been reported
     * to dom0, it must keep it around in case of a crash, as the crash kernel
     * will be hard coded to the original physical address reported. */
    switch ( action )
    {
    case CPU_UP_PREPARE:
        /* Ignore the return value.  At boot time, -ENOMEM will cause all
         * manner of problems elsewhere very soon, and if it is during runtime,
         * then failing to allocate crash notes is not a good enough reason to
         * fail the CPU_UP_PREPARE. */
        kexec_init_cpu_notes(cpu);
        break;
    default:
        break;
    }
    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

void __init kexec_early_calculations(void)
{
    /* If low_crashinfo_mode is still INVALID, neither "low_crashinfo" nor
     * "crashinfo_maxaddr" have been specified on the command line, so
     * explicitly set to NONE. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
        crashinfo_maxaddr_bits = fls64(crashinfo_maxaddr) - 1;
}

static int __init cf_check kexec_init(void)
{
    void *cpu = (void *)(unsigned long)smp_processor_id();

    /* If no crash area, no need to allocate space for notes. */
    if ( !kexec_crash_area.size )
        return 0;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
    {
        size_t crash_heap_size;

        /* This calculation is safe even if the machine is booted in
         * uniprocessor mode. */
        crash_heap_size = sizeof_cpu_notes(0) +
            sizeof_cpu_notes(1) * (nr_cpu_ids - 1);
        crash_heap_size = PAGE_ALIGN(crash_heap_size);

        crash_heap_current = alloc_xenheap_pages(
            get_order_from_bytes(crash_heap_size),
            MEMF_bits(crashinfo_maxaddr_bits) );

        if ( ! crash_heap_current )
            return -ENOMEM;

        memset(crash_heap_current, 0, crash_heap_size);

        crash_heap_end = crash_heap_current + crash_heap_size;
    }

    /* crash_notes may be allocated anywhere Xen can reach in memory.
       Only the individual CPU crash notes themselves must be allocated
       in lower memory if requested. */
    crash_notes = xzalloc_array(crash_note_range_t, nr_cpu_ids);
    if ( ! crash_notes )
        return -ENOMEM;

    register_keyhandler('C', do_crashdump_trigger, "trigger a crashdump", 0);

    cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
    register_cpu_notifier(&cpu_nfb);
    return 0;
}
/* This is a presmp_initcall, as opposed to a regular __initcall, so that
 * the cpu hotplug handler is set up before APs are brought up. */
presmp_initcall(kexec_init);

static int kexec_get_reserve(xen_kexec_range_t *range)
{
    if ( kexec_crash_area.size > 0 && kexec_crash_area.start > 0 )
    {
        range->start = kexec_crash_area.start;
        range->size = kexec_crash_area.size;
    }
    else
        range->start = range->size = 0;
    return 0;
}

static int kexec_get_cpu(xen_kexec_range_t *range)
{
    int nr = range->nr;

    if ( nr < 0 || nr >= nr_cpu_ids )
        return -ERANGE;

    if ( ! crash_notes )
        return -EINVAL;

    /* Try once again to allocate room for the crash notes.  It is just possible
     * that more space has become available since we last tried.  If space has
     * already been allocated, kexec_init_cpu_notes() will return early with 0.
     */
    kexec_init_cpu_notes(nr);

    /* In the case of still not having enough memory to allocate buffer room,
     * returning a range of 0,0 is still valid. */
    if ( crash_notes[nr].start )
    {
        range->start = __pa(crash_notes[nr].start);
        range->size = crash_notes[nr].size;
    }
    else
        range->start = range->size = 0;

    return 0;
}

static int kexec_get_vmcoreinfo(xen_kexec_range_t *range)
{
    range->start = __pa((unsigned long)vmcoreinfo_data);
    range->size = VMCOREINFO_BYTES;
    return 0;
}

static int kexec_get_range_internal(xen_kexec_range_t *range)
{
    int ret = -EINVAL;

    switch ( range->range )
    {
    case KEXEC_RANGE_MA_CRASH:
        ret = kexec_get_reserve(range);
        break;
    case KEXEC_RANGE_MA_CPU:
        ret = kexec_get_cpu(range);
        break;
    case KEXEC_RANGE_MA_VMCOREINFO:
        ret = kexec_get_vmcoreinfo(range);
        break;
    default:
        ret = machine_kexec_get(range);
        break;
    }

    return ret;
}

static int kexec_get_range(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_range_t range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&range, uarg, 1)) )
        return -EFAULT;

    ret = kexec_get_range_internal(&range);

    if ( ret == 0 && unlikely(__copy_to_guest(uarg, &range, 1)) )
        ret = -EFAULT;

    return ret;
}

static int kexec_get_range_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    xen_kexec_range_t range;
    compat_kexec_range_t compat_range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&compat_range, uarg, 1)) )
        return -EFAULT;

    XLAT_kexec_range(&range, &compat_range);

    ret = kexec_get_range_internal(&range);

    /* Don't silently truncate physical addresses or sizes. */
    if ( (range.start | range.size) & ~(unsigned long)(~0u) )
        return -ERANGE;

    if ( ret == 0 )
    {
        XLAT_kexec_range(&compat_range, &range);
        if ( unlikely(__copy_to_guest(uarg, &compat_range, 1)) )
             ret = -EFAULT;
    }

    return ret;
#else /* CONFIG_COMPAT */
    return 0;
#endif /* CONFIG_COMPAT */
}

static int kexec_load_get_bits(int type, int *base, int *bit)
{
    switch ( type )
    {
    case KEXEC_TYPE_DEFAULT:
        *base = KEXEC_IMAGE_DEFAULT_BASE;
        *bit = KEXEC_FLAG_DEFAULT_POS;
        break;
    case KEXEC_TYPE_CRASH:
        *base = KEXEC_IMAGE_CRASH_BASE;
        *bit = KEXEC_FLAG_CRASH_POS;
        break;
    default:
        return -1;
    }
    return 0;
}

void vmcoreinfo_append_str(const char *fmt, ...)
{
    va_list args;
    char buf[0x50];
    int r;
    size_t note_size = sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1);

    if (vmcoreinfo_size + note_size + sizeof(buf) > VMCOREINFO_BYTES)
        return;

    va_start(args, fmt);
    r = vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);

    memcpy(&vmcoreinfo_data[note_size + vmcoreinfo_size], buf, r);

    vmcoreinfo_size += r;
}
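
/*
 * For illustration: the VMCOREINFO_* macros used below all funnel through
 * vmcoreinfo_append_str(), building a flat text blob of KEY=value lines
 * along the lines of (values hypothetical):
 *
 *   PAGESIZE=4096
 *   SYMBOL(max_page)=ffff82d080123456
 *
 * which crash analysis tools parse to locate Xen structures in the dump.
 */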

static void crash_save_vmcoreinfo(void)
{
    size_t data_size;

    if (vmcoreinfo_size > 0)    /* already saved */
        return;

    data_size = VMCOREINFO_BYTES - (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1));
    setup_note((Elf_Note *)vmcoreinfo_data, VMCOREINFO_NOTE_NAME, 0, data_size);

    VMCOREINFO_PAGESIZE(PAGE_SIZE);

    VMCOREINFO_SYMBOL(domain_list);
#ifndef frame_table
    VMCOREINFO_SYMBOL(frame_table);
#else
    {
        static const void *const _frame_table = frame_table;
        VMCOREINFO_SYMBOL_ALIAS(frame_table, _frame_table);
    }
#endif
    VMCOREINFO_SYMBOL(max_page);

    VMCOREINFO_STRUCT_SIZE(page_info);
    VMCOREINFO_STRUCT_SIZE(domain);

    VMCOREINFO_OFFSET(page_info, count_info);
    VMCOREINFO_OFFSET_SUB(page_info, v.inuse, _domain);
    VMCOREINFO_OFFSET(domain, domain_id);
    VMCOREINFO_OFFSET(domain, next_in_list);

#ifdef ARCH_CRASH_SAVE_VMCOREINFO
    arch_crash_save_vmcoreinfo();
#endif
}

static void kexec_unload_image(struct kexec_image *image)
{
    if ( !image )
        return;

    machine_kexec_unload(image);
    kimage_free(image);
}

static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_exec_t exec;
    struct kexec_image *image;
    int base, bit, pos, ret = -EINVAL;

    if ( unlikely(copy_from_guest(&exec, uarg, 1)) )
        return -EFAULT;

    if ( kexec_load_get_bits(exec.type, &base, &bit) )
        return -EINVAL;

    pos = (test_bit(bit, &kexec_flags) != 0);

    /* Only allow kexec/kdump into loaded images */
    if ( !test_bit(base + pos, &kexec_flags) )
        return -ENOENT;

    switch ( exec.type )
    {
    case KEXEC_TYPE_DEFAULT:
        image = kexec_image[base + pos];
        ret = continue_hypercall_on_cpu(0, kexec_reboot, image);
        break;
    case KEXEC_TYPE_CRASH:
        kexec_crash(CRASHREASON_KEXECCMD); /* Does not return */
        break;
    }

    return -EINVAL; /* never reached */
}

static int kexec_swap_images(int type, struct kexec_image *new,
                             struct kexec_image **old)
{
    int base, bit, pos;
    int new_slot, old_slot;

    *old = NULL;

    if ( test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
        return -EBUSY;

    if ( kexec_load_get_bits(type, &base, &bit) )
        return -EINVAL;

    ASSERT(test_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags));

    pos = (test_bit(bit, &kexec_flags) != 0);
    old_slot = base + pos;
    new_slot = base + !pos;

    kexec_image[new_slot] = new;
    if ( new )
        set_bit(new_slot, &kexec_flags);
    change_bit(bit, &kexec_flags);

    clear_bit(old_slot, &kexec_flags);
    *old = kexec_image[old_slot];

    return 0;
}

static int kexec_load_slot(struct kexec_image *kimage)
{
    struct kexec_image *old_kimage;
    int ret = -ENOMEM;

    ret = machine_kexec_load(kimage);
    if ( ret < 0 )
        return ret;

    crash_save_vmcoreinfo();

    ret = kexec_swap_images(kimage->type, kimage, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static uint16_t kexec_load_v1_arch(void)
{
#ifdef CONFIG_X86
    return is_pv_32bit_domain(hardware_domain) ? EM_386 : EM_X86_64;
#else
    return EM_NONE;
#endif
}

static int kexec_segments_add_segment(unsigned int *nr_segments,
                                      xen_kexec_segment_t *segments,
                                      mfn_t mfn)
{
    paddr_t maddr = mfn_to_maddr(mfn);
    unsigned int n = *nr_segments;

    /* Need a new segment? */
    if ( n == 0
         || segments[n-1].dest_maddr + segments[n-1].dest_size != maddr )
    {
        n++;
        if ( n > KEXEC_SEGMENT_MAX )
            return -EINVAL;
        *nr_segments = n;

        set_xen_guest_handle(segments[n-1].buf.h, NULL);
        segments[n-1].buf_size = 0;
        segments[n-1].dest_maddr = maddr;
        segments[n-1].dest_size = 0;
    }

    return 0;
}

static int kexec_segments_from_ind_page(mfn_t mfn,
                                        unsigned int *nr_segments,
                                        xen_kexec_segment_t *segments,
                                        bool compat)
{
    void *page;
    kimage_entry_t *entry;
    int ret = 0;

    page = map_domain_page(mfn);

    /*
     * Walk the indirection page list, adding destination pages to the
     * segments.
     */
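    /*
     * A sketch of the entry kinds handled below (each kimage_entry_t
     * pairs an indicator with an mfn; see kimage_entry_ind() and
     * kimage_entry_mfn()):
     *
     *   IND_DESTINATION - start a new destination range at this mfn
     *   IND_INDIRECTION - continue the walk on a new indirection page
     *   IND_SOURCE      - one source page; extends the current segment
     *   IND_DONE        - end of the list
     */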
    for ( entry = page; ; )
    {
        unsigned long ind;

        ind = kimage_entry_ind(entry, compat);
        mfn = kimage_entry_mfn(entry, compat);

        switch ( ind )
        {
        case IND_DESTINATION:
            ret = kexec_segments_add_segment(nr_segments, segments, mfn);
            if ( ret < 0 )
                goto done;
            break;
        case IND_INDIRECTION:
            unmap_domain_page(page);
            entry = page = map_domain_page(mfn);
            continue;
        case IND_DONE:
            goto done;
        case IND_SOURCE:
            if ( *nr_segments == 0 )
            {
                ret = -EINVAL;
                goto done;
            }
            segments[*nr_segments-1].dest_size += PAGE_SIZE;
            break;
        default:
            ret = -EINVAL;
            goto done;
        }
        entry = kimage_entry_next(entry, compat);
    }
 done:
    unmap_domain_page(page);
    return ret;
}

static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat)
{
    struct kexec_image *kimage = NULL;
    xen_kexec_segment_t *segments;
    uint16_t arch;
    unsigned int nr_segments = 0;
    mfn_t ind_mfn = maddr_to_mfn(load->image.indirection_page);
    int ret;

    arch = kexec_load_v1_arch();
    if ( arch == EM_NONE )
        return -ENOSYS;

    segments = xmalloc_array(xen_kexec_segment_t, KEXEC_SEGMENT_MAX);
    if ( segments == NULL )
        return -ENOMEM;

    /*
     * Work out the image segments (destination only) from the
     * indirection pages.
     *
     * This is needed so we don't allocate pages that will overlap
     * with the destination when building the new set of indirection
     * pages below.
     */
    ret = kexec_segments_from_ind_page(ind_mfn, &nr_segments, segments, compat);
    if ( ret < 0 )
        goto error;

    ret = kimage_alloc(&kimage, load->type, arch, load->image.start_address,
                       nr_segments, segments);
    if ( ret < 0 )
        goto error;

    /*
     * Build a new set of indirection pages in the native format.
     *
     * This walks the guest provided indirection pages a second time.
     * The guest could have altered them, invalidating the segment
     * information constructed above.  At worst this leaves the
     * resulting image unrelocatable.
     */
    ret = kimage_build_ind(kimage, ind_mfn, compat);
    if ( ret < 0 )
        goto error;

    if ( arch == EM_386 || arch == EM_X86_64 )
    {
        /*
         * Ensure 0 - 1 MiB is mapped and accessible by the image.
         *
         * This allows access to VGA memory and the region that purgatory
         * copies in the crash case.
         */
        unsigned long addr;

        for ( addr = 0; addr < MB(1); addr += PAGE_SIZE )
        {
            ret = machine_kexec_add_page(kimage, addr, addr);
            if ( ret < 0 )
                goto error;
        }
    }

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

 error:
    if ( !kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

static int kexec_load_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&load, uarg, 1)) )
        return -EFAULT;

    return kexec_do_load_v1(&load, 0);
}

static int kexec_load_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&compat_load, uarg, 1)) )
        return -EFAULT;

    /* This is a bit dodgy: load.image is inside load, but XLAT_kexec_load
     * (which is automatically generated) doesn't translate load.image
     * correctly.  Just copy load->type, the only other member, manually
     * instead.
     *
     * XLAT_kexec_load(&load, &compat_load);
     */
    load.type = compat_load.type;
    XLAT_kexec_image(&load.image, &compat_load.image);

    return kexec_do_load_v1(&load, 1);
#else
    return 0;
#endif
}

static int kexec_load(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_t load;
    xen_kexec_segment_t *segments;
    struct kexec_image *kimage = NULL;
    int ret;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    if ( load.nr_segments >= KEXEC_SEGMENT_MAX )
        return -EINVAL;

    segments = xmalloc_array(xen_kexec_segment_t, load.nr_segments);
    if ( segments == NULL )
        return -ENOMEM;

    if ( copy_from_guest(segments, load.segments.h, load.nr_segments) )
    {
        ret = -EFAULT;
        goto error;
    }

    ret = kimage_alloc(&kimage, load.type, load.arch, load.entry_maddr,
                       load.nr_segments, segments);
    if ( ret < 0 )
        goto error;

    ret = kimage_load_segments(kimage);
    if ( ret < 0 )
        goto error;

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

 error:
    if ( ! kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

static int kexec_do_unload(xen_kexec_unload_t *unload)
{
    struct kexec_image *old_kimage;
    int ret;

    ret = kexec_swap_images(unload->type, NULL, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static int kexec_unload_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    unload.type = load.type;
    return kexec_do_unload(&unload);
}

static int kexec_unload_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&compat_load, uarg, 1) )
        return -EFAULT;

    unload.type = compat_load.type;
    return kexec_do_unload(&unload);
#else
    return 0;
#endif
}

static int kexec_unload(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_unload_t unload;

    if ( unlikely(copy_from_guest(&unload, uarg, 1)) )
        return -EFAULT;

    return kexec_do_unload(&unload);
}

static int kexec_status(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_status_t status;
    int base, bit;

    if ( unlikely(copy_from_guest(&status, uarg, 1)) )
        return -EFAULT;

    /* No need to check KEXEC_FLAG_IN_PROGRESS. */

    if ( kexec_load_get_bits(status.type, &base, &bit) )
        return -EINVAL;

    return !!test_bit(bit, &kexec_flags);
}

static int do_kexec_op_internal(unsigned long op,
                                XEN_GUEST_HANDLE_PARAM(void) uarg,
                                bool compat)
{
    int ret = -EINVAL;

    ret = xsm_kexec(XSM_PRIV);
    if ( ret )
        return ret;

    if ( test_and_set_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags) )
        return hypercall_create_continuation(__HYPERVISOR_kexec_op, "lh", op, uarg);

    switch ( op )
    {
    case KEXEC_CMD_kexec_get_range:
        if ( compat )
            ret = kexec_get_range_compat(uarg);
        else
            ret = kexec_get_range(uarg);
        break;
    case KEXEC_CMD_kexec_load_v1:
        if ( compat )
            ret = kexec_load_v1_compat(uarg);
        else
            ret = kexec_load_v1(uarg);
        break;
    case KEXEC_CMD_kexec_unload_v1:
        if ( compat )
            ret = kexec_unload_v1_compat(uarg);
        else
            ret = kexec_unload_v1(uarg);
        break;
    case KEXEC_CMD_kexec:
        ret = kexec_exec(uarg);
        break;
    case KEXEC_CMD_kexec_load:
        ret = kexec_load(uarg);
        break;
    case KEXEC_CMD_kexec_unload:
        ret = kexec_unload(uarg);
        break;
    case KEXEC_CMD_kexec_status:
        ret = kexec_status(uarg);
        break;
    }

    clear_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags);

    return ret;
}

long do_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, false);
}

#ifdef CONFIG_COMPAT
int compat_kexec_op(unsigned int op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, true);
}
#endif

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */