/******************************************************************************
 * kexec.c - Architecture-independent kexec code for Xen
 *
 * Xen port written by:
 * - Simon 'Horms' Horman <horms@verge.net.au>
 * - Magnus Damm <magnus@valinux.co.jp>
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/acpi.h>
#include <xen/ctype.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/watchdog.h>
#include <xen/sched.h>
#include <xen/types.h>
#include <xen/hypercall.h>
#include <xen/kexec.h>
#include <xen/keyhandler.h>
#include <public/kexec.h>
#include <xen/cpumask.h>
#include <asm/atomic.h>
#include <xen/spinlock.h>
#include <xen/version.h>
#include <xen/console.h>
#include <xen/kimage.h>
#include <public/elfnote.h>
#include <xsm/xsm.h>
#include <xen/cpu.h>
#ifdef CONFIG_COMPAT
#include <compat/kexec.h>
#endif

bool_t kexecing = FALSE;

/* Memory regions to store the per cpu register state etc. on a crash. */
typedef struct { Elf_Note *start; size_t size; } crash_note_range_t;
static crash_note_range_t *crash_notes;

/* Lock to prevent race conditions when allocating the crash note buffers.
 * It also serves to protect calls to alloc_from_crash_heap when allocating
 * crash note buffers in lower memory. */
static DEFINE_SPINLOCK(crash_notes_lock);

static Elf_Note *xen_crash_note;

static cpumask_t crash_saved_cpus;

static struct kexec_image *kexec_image[KEXEC_IMAGE_NR];

#define KEXEC_FLAG_DEFAULT_POS   (KEXEC_IMAGE_NR + 0)
#define KEXEC_FLAG_CRASH_POS     (KEXEC_IMAGE_NR + 1)
#define KEXEC_FLAG_IN_PROGRESS   (KEXEC_IMAGE_NR + 2)
#define KEXEC_FLAG_IN_HYPERCALL  (KEXEC_IMAGE_NR + 3)

static unsigned long kexec_flags = 0; /* the lowest bits are for KEXEC_IMAGE... */
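
/*
 * Illustrative summary of the flag layout above, derived from the
 * definitions rather than any additional state: bits 0 .. KEXEC_IMAGE_NR-1
 * mark which kexec_image[] slots currently hold a loaded image, the *_POS
 * bits select the active slot for each image type, and IN_PROGRESS /
 * IN_HYPERCALL serialise the crash path and the hypercall respectively.
 * For example, assuming KEXEC_IMAGE_NR == 4:
 *
 *   bits 0-3: kexec_image[] slot contains a loaded image
 *   bits 4-5: active slot selector for the default/crash image types
 *   bit    6: a kexec is in progress
 *   bit    7: a kexec hypercall is in flight
 */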

static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
static size_t vmcoreinfo_size = 0;

xen_kexec_reserve_t kexec_crash_area;
paddr_t __initdata kexec_crash_area_limit = ~(paddr_t)0;
static struct {
    u64 start, end;
    unsigned long size;
} ranges[16] __initdata;

/* Low crashinfo mode.  Start as INVALID so several codepaths can set up
 * defaults without needing to know the state of the others. */
enum low_crashinfo low_crashinfo_mode = LOW_CRASHINFO_INVALID;

/* This value is only considered if low_crashinfo_mode is set to MIN or ALL,
 * so setting a default here is safe. Default to 4GB.  This is because the
 * current KEXEC_CMD_get_range compat hypercall truncates 64-bit pointers to
 * 32 bits. The typical use case for crashinfo_maxaddr is 64-bit Xen with a
 * 32-bit dom0 and a 32-bit crash kernel. */
static paddr_t __initdata crashinfo_maxaddr = 4ULL << 30;

/* = log base 2 of crashinfo_maxaddr after checking for sanity. Default to
 * larger than the entire physical address space. */
unsigned int __initdata crashinfo_maxaddr_bits = 64;

/* Pointers to keep track of the crash heap region. */
static void *crash_heap_current = NULL, *crash_heap_end = NULL;

/*
 * Parse command lines in the format
 *
 *   crashkernel=<ramsize-range>:<size>[,...][{@,<,below=}<address>]
 *
 * with <ramsize-range> being of form
 *
 *   <start>-[<end>]
 *
 * as well as the legacy ones in the format
 *
 *   crashkernel=<size>[{@,<}<address>]
 *   crashkernel=<size>,below=<address>
 *
 * < and below are synonymous, the latter being useful for grub2 systems
 * which would otherwise require escaping of the < option
 */
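/*
 * Illustrative examples of the grammar above (values invented for this
 * comment, not taken from any particular system):
 *
 *   crashkernel=512M-2G:64M,2G-:128M  - 64M if RAM is 512M-2G, 128M beyond
 *   crashkernel=256M@16M              - 256M placed at physical address 16M
 *   crashkernel=256M,below=4G         - 256M allocated below 4G
 */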
static int __init parse_crashkernel(const char *str)
{
    const char *cur;
    int rc = 0;

    if ( strchr(str, ':' ) )
    {
        unsigned int idx = 0;

        do {
            if ( idx >= ARRAY_SIZE(ranges) )
            {
                printk(XENLOG_WARNING "crashkernel: too many ranges\n");
                cur = NULL;
                str = strpbrk(str, "@,<");
                rc = -EINVAL;
                break;
            }

            ranges[idx].start = parse_size_and_unit(cur = str + !!idx, &str);
            if ( cur == str )
                break;

            if ( *str != '-' )
            {
                printk(XENLOG_WARNING "crashkernel: '-' expected\n");
                rc = -EINVAL;
                break;
            }

            if ( *++str != ':' )
            {
                ranges[idx].end = parse_size_and_unit(cur = str, &str);
                if ( cur == str )
                    break;
                if ( ranges[idx].end <= ranges[idx].start )
                {
                    printk(XENLOG_WARNING "crashkernel: end <= start\n");
                    rc = -EINVAL;
                    break;
                }
            }
            else
                ranges[idx].end = -1;

            if ( *str != ':' )
            {
                printk(XENLOG_WARNING "crashkernel: ':' expected\n");
                rc = -EINVAL;
                break;
            }

            ranges[idx].size = parse_size_and_unit(cur = str + 1, &str);
            if ( cur == str )
                break;

            ++idx;
        } while ( *str == ',' );
        if ( idx < ARRAY_SIZE(ranges) )
            ranges[idx].size = 0;
    }
    else
        kexec_crash_area.size = parse_size_and_unit(cur = str, &str);
    if ( cur != str )
    {
        if ( *str == '@' )
            kexec_crash_area.start = parse_size_and_unit(cur = str + 1, &str);
        else if ( *str == '<' )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 1, &str);
        else if ( !strncmp(str, ",below=", 7) )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 7, &str);
        else
        {
            printk(XENLOG_WARNING "crashkernel: '%s' ignored\n", str);
            rc = -EINVAL;
        }
    }
    if ( cur && cur == str )
    {
        printk(XENLOG_WARNING "crashkernel: memory value expected\n");
        rc = -EINVAL;
    }

    return rc;
}
custom_param("crashkernel", parse_crashkernel);

/* Parse command lines in the format:
 *
 *   low_crashinfo=[none,min,all]
 *
 * - none disables the low allocation of crash info.
 * - min will allocate enough low information for the crash kernel to be able
 *       to extract the hypervisor and dom0 message ring buffers.
 * - all will allocate additional structures such as domain and vcpu structs
 *       low so the crash kernel can perform an extended analysis of state.
 */
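/*
 * For example (illustrative): booting with a bare "low_crashinfo" behaves
 * like "low_crashinfo=min", while "low_crashinfo=all" additionally keeps
 * domain and vcpu structures reachable by a 32-bit crash kernel.
 */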
static int __init parse_low_crashinfo(const char *str)
{

    if ( !strlen(str) )
        /* default to min if user just specifies "low_crashinfo" */
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "none" ) )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;
    else if ( !strcmp(str, "min" ) )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "all" ) )
        low_crashinfo_mode = LOW_CRASHINFO_ALL;
    else
    {
        printk("Unknown low_crashinfo parameter '%s'.  Defaulting to min.\n", str);
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
        return -EINVAL;
    }

    return 0;
}
custom_param("low_crashinfo", parse_low_crashinfo);

/* Parse command lines in the format:
 *
 *   crashinfo_maxaddr=<addr>
 *
 * <addr> will be rounded down to the nearest power of two.  Defaults to 4G.
 */
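/*
 * Example (illustrative): "crashinfo_maxaddr=2G" caps crash note
 * allocations at the 2GiB boundary, which combined with low_crashinfo=min
 * keeps the per-cpu note buffers reachable by a 32-bit crash kernel.
 */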
static int __init parse_crashinfo_maxaddr(const char *str)
{
    u64 addr;
    const char *q;

    /* if low_crashinfo_mode is unset, default to min. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;

    if ( (addr = parse_size_and_unit(str, &q)) )
        crashinfo_maxaddr = addr;
    else
    {
        printk("Unable to parse crashinfo_maxaddr. Defaulting to %"PRIpaddr"\n",
               crashinfo_maxaddr);
        return -EINVAL;
    }

    return *q ? -EINVAL : 0;
}
custom_param("crashinfo_maxaddr", parse_crashinfo_maxaddr);

void __init set_kexec_crash_area_size(u64 system_ram)
{
    unsigned int idx;

    for ( idx = 0; idx < ARRAY_SIZE(ranges) && !kexec_crash_area.size; ++idx )
    {
        if ( !ranges[idx].size )
            break;

        if ( ranges[idx].size >= system_ram )
        {
            printk(XENLOG_WARNING "crashkernel: invalid size\n");
            continue;
        }

        if ( ranges[idx].start <= system_ram && ranges[idx].end > system_ram )
            kexec_crash_area.size = ranges[idx].size;
    }
}

/*
 * Only allow one cpu to continue on the crash path, forcing others to spin.
 * Racing on the crash path from here will end in misery.  If we reenter,
 * something has gone very wrong and retrying will (almost certainly) be
 * futile.  Return up to our nested panic() to try and reboot.
 *
 * This is noinline to make it obvious in stack traces which cpus have lost
 * the race (as opposed to being somewhere in kexec_common_shutdown())
 */
static int noinline one_cpu_only(void)
{
    static unsigned int crashing_cpu = -1;
    unsigned int cpu = smp_processor_id();

    if ( cmpxchg(&crashing_cpu, -1, cpu) != -1 )
    {
        /* Not the first entry into one_cpu_only(). */
        if ( crashing_cpu == cpu )
        {
            printk("Reentered the crash path.  Something is very broken\n");
            return -EBUSY;
        }

        /*
         * Another cpu has beaten us to this point.  Wait here patiently for
         * it to kill us.
         */
        for ( ; ; )
            halt();
    }

    set_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags);
    printk("Executing kexec image on cpu%u\n", cpu);

    return 0;
}

/* Save the registers in the per-cpu crash note buffer. */
void kexec_crash_save_cpu(void)
{
    int cpu = smp_processor_id();
    Elf_Note *note;
    ELF_Prstatus *prstatus;
    crash_xen_core_t *xencore;

    BUG_ON( ! crash_notes );

    if ( cpumask_test_and_set_cpu(cpu, &crash_saved_cpus) )
        return;

    note = crash_notes[cpu].start;

    prstatus = (ELF_Prstatus *)ELFNOTE_DESC(note);

    note = ELFNOTE_NEXT(note);
    xencore = (crash_xen_core_t *)ELFNOTE_DESC(note);

    elf_core_save_regs(&prstatus->pr_reg, xencore);
}

/* Set up the single Xen-specific-info crash note. */
crash_xen_info_t *kexec_crash_save_info(void)
{
    int cpu = smp_processor_id();
    crash_xen_info_t info;
    crash_xen_info_t *out = (crash_xen_info_t *)ELFNOTE_DESC(xen_crash_note);

    BUG_ON(!cpumask_test_and_set_cpu(cpu, &crash_saved_cpus));

    memset(&info, 0, sizeof(info));
    info.xen_major_version = xen_major_version();
    info.xen_minor_version = xen_minor_version();
    info.xen_extra_version = __pa(xen_extra_version());
    info.xen_changeset = __pa(xen_changeset());
    info.xen_compiler = __pa(xen_compiler());
    info.xen_compile_date = __pa(xen_compile_date());
    info.xen_compile_time = __pa(xen_compile_time());
    info.tainted = tainted;

    /* Copy from guaranteed-aligned local copy to possibly-unaligned dest. */
    memcpy(out, &info, sizeof(info));

    return out;
}

static int kexec_common_shutdown(void)
{
    int ret;

    ret = one_cpu_only();
    if ( ret )
        return ret;

    watchdog_disable();
    console_start_sync();
    spin_debug_disable();
    acpi_dmar_reinstate();

    return 0;
}

void kexec_crash(void)
{
    int pos;

    pos = (test_bit(KEXEC_FLAG_CRASH_POS, &kexec_flags) != 0);
    if ( !test_bit(KEXEC_IMAGE_CRASH_BASE + pos, &kexec_flags) )
        return;

    kexecing = TRUE;

    if ( kexec_common_shutdown() != 0 )
        return;

    kexec_crash_save_cpu();
    machine_crash_shutdown();
    machine_kexec(kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]);

    BUG();
}

static long kexec_reboot(void *_image)
{
    struct kexec_image *image = _image;

    kexecing = TRUE;

    kexec_common_shutdown();
    machine_reboot_kexec(image);

    BUG();
    return 0;
}

static void do_crashdump_trigger(unsigned char key)
{
    printk("'%c' pressed -> triggering crashdump\n", key);
    kexec_crash();
    printk(" * no crash kernel loaded!\n");
}

static void setup_note(Elf_Note *n, const char *name, int type, int descsz)
{
    int l = strlen(name) + 1;
    strlcpy(ELFNOTE_NAME(n), name, l);
    n->namesz = l;
    n->descsz = descsz;
    n->type = type;
}

static size_t sizeof_note(const char *name, int descsz)
{
    return (sizeof(Elf_Note) +
            ELFNOTE_ALIGN(strlen(name)+1) +
            ELFNOTE_ALIGN(descsz));
}
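
/*
 * Worked example (illustrative, assuming the usual 4-byte ELF note
 * alignment and a 12-byte Elf_Note header): for the name "CORE",
 * strlen("CORE") + 1 == 5 rounds up to 8, so a note with an 8-byte
 * descriptor would occupy 12 + 8 + 8 = 28 bytes.
 */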

static size_t sizeof_cpu_notes(const unsigned long cpu)
{
    /* All CPUs present a PRSTATUS and crash_xen_core note. */
    size_t bytes =
        sizeof_note("CORE", sizeof(ELF_Prstatus)) +
        sizeof_note("Xen", sizeof(crash_xen_core_t));

    /* CPU0 also presents the crash_xen_info note. */
    if ( ! cpu )
        bytes = bytes +
            sizeof_note("Xen", sizeof(crash_xen_info_t));

    return bytes;
}

/* Allocate a number of bytes from the previously allocated crash heap if
 * the user has requested that crash notes be allocated in lower memory.
 * There is currently no case where the crash notes should be free()'d. */
static void *alloc_from_crash_heap(const size_t bytes)
{
    void *ret;
    if ( crash_heap_current + bytes > crash_heap_end )
        return NULL;
    ret = crash_heap_current;
    crash_heap_current += bytes;
    return ret;
}

/* Allocate a crash note buffer for a newly onlined cpu. */
static int kexec_init_cpu_notes(const unsigned long cpu)
{
    Elf_Note *note = NULL;
    int ret = 0;
    int nr_bytes = 0;

    BUG_ON( cpu >= nr_cpu_ids || ! crash_notes );

    /* If already allocated, nothing to do. */
    if ( crash_notes[cpu].start )
        return ret;

    nr_bytes = sizeof_cpu_notes(cpu);

    /* If we don't care about the position of allocation, malloc. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_NONE )
        note = xzalloc_bytes(nr_bytes);

    /* Protect the write into crash_notes[] with a spinlock, as this function
     * is on a hotplug path and a hypercall path. */
    spin_lock(&crash_notes_lock);

    /* If we are racing with another CPU and it has beaten us, give up
     * gracefully. */
    if ( crash_notes[cpu].start )
    {
        spin_unlock(&crash_notes_lock);
        /* Always return ok, because whether we successfully allocated or not,
         * another CPU has successfully allocated. */
        xfree(note);
    }
    else
    {
        /* If we care about memory position, alloc from the crash heap,
         * also protected by the crash_notes_lock. */
        if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
            note = alloc_from_crash_heap(nr_bytes);

        crash_notes[cpu].start = note;
        crash_notes[cpu].size = nr_bytes;
        spin_unlock(&crash_notes_lock);

        /* If the allocation failed, and another CPU did not beat us, give
         * up with ENOMEM. */
        if ( ! note )
            ret = -ENOMEM;
        /* else all is good so let's set up the notes. */
        else
        {
            /* Set up CORE note. */
            setup_note(note, "CORE", NT_PRSTATUS, sizeof(ELF_Prstatus));
            note = ELFNOTE_NEXT(note);

            /* Set up Xen CORE note. */
            setup_note(note, "Xen", XEN_ELFNOTE_CRASH_REGS,
                       sizeof(crash_xen_core_t));

            if ( ! cpu )
            {
                /* Set up Xen Crash Info note. */
                xen_crash_note = note = ELFNOTE_NEXT(note);
                setup_note(note, "Xen", XEN_ELFNOTE_CRASH_INFO,
                           sizeof(crash_xen_info_t));
            }
        }
    }

    return ret;
}

static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned long cpu = (unsigned long)hcpu;

    /* Only hook on CPU_UP_PREPARE because once a crash_note has been reported
     * to dom0, it must keep it around in case of a crash, as the crash kernel
     * will be hard coded to the original physical address reported. */
    switch ( action )
    {
    case CPU_UP_PREPARE:
        /* Ignore the return value.  If this is boot time, -ENOMEM will cause
         * all manner of problems elsewhere very soon, and if it is during
         * runtime, then failing to allocate crash notes is not a good enough
         * reason to fail the CPU_UP_PREPARE. */
        kexec_init_cpu_notes(cpu);
        break;
    default:
        break;
    }
    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

void __init kexec_early_calculations(void)
{
    /* If low_crashinfo_mode is still INVALID, neither "low_crashinfo" nor
     * "crashinfo_maxaddr" have been specified on the command line, so
     * explicitly set to NONE. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
        crashinfo_maxaddr_bits = fls64(crashinfo_maxaddr) - 1;
}

static int __init kexec_init(void)
{
    void *cpu = (void *)(unsigned long)smp_processor_id();

    /* If no crash area, no need to allocate space for notes. */
    if ( !kexec_crash_area.size )
        return 0;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
    {
        size_t crash_heap_size;

        /* This calculation is safe even if the machine is booted in
         * uniprocessor mode. */
        crash_heap_size = sizeof_cpu_notes(0) +
            sizeof_cpu_notes(1) * (nr_cpu_ids - 1);
        crash_heap_size = PAGE_ALIGN(crash_heap_size);

        crash_heap_current = alloc_xenheap_pages(
            get_order_from_bytes(crash_heap_size),
            MEMF_bits(crashinfo_maxaddr_bits) );

        if ( ! crash_heap_current )
            return -ENOMEM;

        memset(crash_heap_current, 0, crash_heap_size);

        crash_heap_end = crash_heap_current + crash_heap_size;
    }

    /* crash_notes may be allocated anywhere Xen can reach in memory.
       Only the individual CPU crash notes themselves must be allocated
       in lower memory if requested. */
    crash_notes = xzalloc_array(crash_note_range_t, nr_cpu_ids);
    if ( ! crash_notes )
        return -ENOMEM;

    register_keyhandler('C', do_crashdump_trigger, "trigger a crashdump", 0);

    cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
    register_cpu_notifier(&cpu_nfb);
    return 0;
}
/* The reason for this to be a presmp_initcall as opposed to a regular
 * __initcall is to allow the setup of the cpu hotplug handler before APs are
 * brought up. */
presmp_initcall(kexec_init);

static int kexec_get_reserve(xen_kexec_range_t *range)
{
    if ( kexec_crash_area.size > 0 && kexec_crash_area.start > 0 )
    {
        range->start = kexec_crash_area.start;
        range->size = kexec_crash_area.size;
    }
    else
        range->start = range->size = 0;
    return 0;
}

static int kexec_get_cpu(xen_kexec_range_t *range)
{
    int nr = range->nr;

    if ( nr < 0 || nr >= nr_cpu_ids )
        return -ERANGE;

    if ( ! crash_notes )
        return -EINVAL;

    /* Try once again to allocate room for the crash notes.  It is just
     * possible that more space has become available since we last tried.
     * If space has already been allocated, kexec_init_cpu_notes() will
     * return early with 0. */
    kexec_init_cpu_notes(nr);

    /* In the case of still not having enough memory to allocate buffer room,
     * returning a range of 0,0 is still valid. */
    if ( crash_notes[nr].start )
    {
        range->start = __pa(crash_notes[nr].start);
        range->size = crash_notes[nr].size;
    }
    else
        range->start = range->size = 0;

    return 0;
}

static int kexec_get_vmcoreinfo(xen_kexec_range_t *range)
{
    range->start = __pa((unsigned long)vmcoreinfo_data);
    range->size = VMCOREINFO_BYTES;
    return 0;
}

static int kexec_get_range_internal(xen_kexec_range_t *range)
{
    int ret = -EINVAL;

    switch ( range->range )
    {
    case KEXEC_RANGE_MA_CRASH:
        ret = kexec_get_reserve(range);
        break;
    case KEXEC_RANGE_MA_CPU:
        ret = kexec_get_cpu(range);
        break;
    case KEXEC_RANGE_MA_VMCOREINFO:
        ret = kexec_get_vmcoreinfo(range);
        break;
    default:
        ret = machine_kexec_get(range);
        break;
    }

    return ret;
}

static int kexec_get_range(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_range_t range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&range, uarg, 1)) )
        return -EFAULT;

    ret = kexec_get_range_internal(&range);

    if ( ret == 0 && unlikely(__copy_to_guest(uarg, &range, 1)) )
        ret = -EFAULT;

    return ret;
}

static int kexec_get_range_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    xen_kexec_range_t range;
    compat_kexec_range_t compat_range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&compat_range, uarg, 1)) )
        return -EFAULT;

    XLAT_kexec_range(&range, &compat_range);

    ret = kexec_get_range_internal(&range);

    /* Don't silently truncate physical addresses or sizes. */
    if ( (range.start | range.size) & ~(unsigned long)(~0u) )
        return -ERANGE;

    if ( ret == 0 )
    {
        XLAT_kexec_range(&compat_range, &range);
        if ( unlikely(__copy_to_guest(uarg, &compat_range, 1)) )
            ret = -EFAULT;
    }

    return ret;
#else /* CONFIG_COMPAT */
    return 0;
#endif /* CONFIG_COMPAT */
}

static int kexec_load_get_bits(int type, int *base, int *bit)
{
    switch ( type )
    {
    case KEXEC_TYPE_DEFAULT:
        *base = KEXEC_IMAGE_DEFAULT_BASE;
        *bit = KEXEC_FLAG_DEFAULT_POS;
        break;
    case KEXEC_TYPE_CRASH:
        *base = KEXEC_IMAGE_CRASH_BASE;
        *bit = KEXEC_FLAG_CRASH_POS;
        break;
    default:
        return -1;
    }
    return 0;
}

void vmcoreinfo_append_str(const char *fmt, ...)
{
    va_list args;
    char buf[0x50];
    int r;
    size_t note_size = sizeof(Elf_Note) +
        ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1);

    if ( vmcoreinfo_size + note_size + sizeof(buf) > VMCOREINFO_BYTES )
        return;

    va_start(args, fmt);
    r = vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);

    memcpy(&vmcoreinfo_data[note_size + vmcoreinfo_size], buf, r);

    vmcoreinfo_size += r;
}

static void crash_save_vmcoreinfo(void)
{
    size_t data_size;

    if ( vmcoreinfo_size > 0 )    /* already saved */
        return;

    data_size = VMCOREINFO_BYTES -
        (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1));
    setup_note((Elf_Note *)vmcoreinfo_data, VMCOREINFO_NOTE_NAME, 0, data_size);

    VMCOREINFO_PAGESIZE(PAGE_SIZE);

    VMCOREINFO_SYMBOL(domain_list);
#ifndef frame_table
    VMCOREINFO_SYMBOL(frame_table);
#else
    {
        static const void *const _frame_table = frame_table;
        VMCOREINFO_SYMBOL_ALIAS(frame_table, _frame_table);
    }
#endif
    VMCOREINFO_SYMBOL(max_page);

    VMCOREINFO_STRUCT_SIZE(page_info);
    VMCOREINFO_STRUCT_SIZE(domain);

    VMCOREINFO_OFFSET(page_info, count_info);
    VMCOREINFO_OFFSET_SUB(page_info, v.inuse, _domain);
    VMCOREINFO_OFFSET(domain, domain_id);
    VMCOREINFO_OFFSET(domain, next_in_list);

#ifdef ARCH_CRASH_SAVE_VMCOREINFO
    arch_crash_save_vmcoreinfo();
#endif
}

static void kexec_unload_image(struct kexec_image *image)
{
    if ( !image )
        return;

    machine_kexec_unload(image);
    kimage_free(image);
}

static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_exec_t exec;
    struct kexec_image *image;
    int base, bit, pos, ret = -EINVAL;

    if ( unlikely(copy_from_guest(&exec, uarg, 1)) )
        return -EFAULT;

    if ( kexec_load_get_bits(exec.type, &base, &bit) )
        return -EINVAL;

    pos = (test_bit(bit, &kexec_flags) != 0);

    /* Only allow kexec/kdump into loaded images */
    if ( !test_bit(base + pos, &kexec_flags) )
        return -ENOENT;

    switch ( exec.type )
    {
    case KEXEC_TYPE_DEFAULT:
        image = kexec_image[base + pos];
        ret = continue_hypercall_on_cpu(0, kexec_reboot, image);
        break;
    case KEXEC_TYPE_CRASH:
        kexec_crash(); /* Does not return */
        break;
    }

    return -EINVAL; /* never reached */
}

static int kexec_swap_images(int type, struct kexec_image *new,
                             struct kexec_image **old)
{
    int base, bit, pos;
    int new_slot, old_slot;

    *old = NULL;

    if ( test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
        return -EBUSY;

    if ( kexec_load_get_bits(type, &base, &bit) )
        return -EINVAL;

    ASSERT(test_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags));

    pos = (test_bit(bit, &kexec_flags) != 0);
    old_slot = base + pos;
    new_slot = base + !pos;

    kexec_image[new_slot] = new;
    if ( new )
        set_bit(new_slot, &kexec_flags);
    change_bit(bit, &kexec_flags);

    clear_bit(old_slot, &kexec_flags);
    *old = kexec_image[old_slot];

    return 0;
}
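
/*
 * Design note: each image type owns two slots, so a new image can be
 * installed and the position bit flipped before the old image is torn
 * down.  A reader of kexec_flags therefore always finds either the old
 * or the new image fully in place, never a half-written slot.
 */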

static int kexec_load_slot(struct kexec_image *kimage)
{
    struct kexec_image *old_kimage;
    int ret = -ENOMEM;

    ret = machine_kexec_load(kimage);
    if ( ret < 0 )
        return ret;

    crash_save_vmcoreinfo();

    ret = kexec_swap_images(kimage->type, kimage, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static uint16_t kexec_load_v1_arch(void)
{
#ifdef CONFIG_X86
    return is_pv_32bit_domain(hardware_domain) ? EM_386 : EM_X86_64;
#else
    return EM_NONE;
#endif
}

static int kexec_segments_add_segment(unsigned int *nr_segments,
                                      xen_kexec_segment_t *segments,
                                      mfn_t mfn)
{
    paddr_t maddr = mfn_to_maddr(mfn);
    unsigned int n = *nr_segments;

    /* Need a new segment? */
    if ( n == 0
         || segments[n-1].dest_maddr + segments[n-1].dest_size != maddr )
    {
        n++;
        if ( n > KEXEC_SEGMENT_MAX )
            return -EINVAL;
        *nr_segments = n;

        set_xen_guest_handle(segments[n-1].buf.h, NULL);
        segments[n-1].buf_size = 0;
        segments[n-1].dest_maddr = maddr;
        segments[n-1].dest_size = 0;
    }

    return 0;
}

static int kexec_segments_from_ind_page(mfn_t mfn,
                                        unsigned int *nr_segments,
                                        xen_kexec_segment_t *segments,
                                        bool_t compat)
{
    void *page;
    kimage_entry_t *entry;
    int ret = 0;

    page = map_domain_page(mfn);

    /*
     * Walk the indirection page list, adding destination pages to the
     * segments.
     */
    for ( entry = page; ; )
    {
        unsigned long ind;

        ind = kimage_entry_ind(entry, compat);
        mfn = kimage_entry_mfn(entry, compat);

        switch ( ind )
        {
        case IND_DESTINATION:
            ret = kexec_segments_add_segment(nr_segments, segments, mfn);
            if ( ret < 0 )
                goto done;
            break;
        case IND_INDIRECTION:
            unmap_domain_page(page);
            entry = page = map_domain_page(mfn);
            continue;
        case IND_DONE:
            goto done;
        case IND_SOURCE:
            if ( *nr_segments == 0 )
            {
                ret = -EINVAL;
                goto done;
            }
            segments[*nr_segments-1].dest_size += PAGE_SIZE;
            break;
        default:
            ret = -EINVAL;
            goto done;
        }
        entry = kimage_entry_next(entry, compat);
    }
done:
    unmap_domain_page(page);
    return ret;
}

static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat)
{
    struct kexec_image *kimage = NULL;
    xen_kexec_segment_t *segments;
    uint16_t arch;
    unsigned int nr_segments = 0;
    mfn_t ind_mfn = maddr_to_mfn(load->image.indirection_page);
    int ret;

    arch = kexec_load_v1_arch();
    if ( arch == EM_NONE )
        return -ENOSYS;

    segments = xmalloc_array(xen_kexec_segment_t, KEXEC_SEGMENT_MAX);
    if ( segments == NULL )
        return -ENOMEM;

    /*
     * Work out the image segments (destination only) from the
     * indirection pages.
     *
     * This is needed so we don't allocate pages that will overlap
     * with the destination when building the new set of indirection
     * pages below.
     */
    ret = kexec_segments_from_ind_page(ind_mfn, &nr_segments, segments, compat);
    if ( ret < 0 )
        goto error;

    ret = kimage_alloc(&kimage, load->type, arch, load->image.start_address,
                       nr_segments, segments);
    if ( ret < 0 )
        goto error;

    /*
     * Build a new set of indirection pages in the native format.
     *
     * This walks the guest-provided indirection pages a second time.
     * The guest could have altered them, invalidating the segment
     * information constructed above.  At worst this leaves the
     * resulting image potentially unrelocatable.
     */
    ret = kimage_build_ind(kimage, ind_mfn, compat);
    if ( ret < 0 )
        goto error;

    if ( arch == EM_386 || arch == EM_X86_64 )
    {
        /*
         * Ensure 0 - 1 MiB is mapped and accessible by the image.
         *
         * This allows access to VGA memory and the region purgatory copies
         * in the crash case.
         */
        unsigned long addr;

        for ( addr = 0; addr < MB(1); addr += PAGE_SIZE )
        {
            ret = machine_kexec_add_page(kimage, addr, addr);
            if ( ret < 0 )
                goto error;
        }
    }

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

error:
    if ( !kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

static int kexec_load_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&load, uarg, 1)) )
        return -EFAULT;

    return kexec_do_load_v1(&load, 0);
}

static int kexec_load_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&compat_load, uarg, 1)) )
        return -EFAULT;

    /* This is a bit dodgy: load.image is inside load, but XLAT_kexec_load
     * (which is automatically generated) doesn't translate load.image
     * correctly.  Just copy load->type, the only other member, manually
     * instead.
     *
     * XLAT_kexec_load(&load, &compat_load);
     */
    load.type = compat_load.type;
    XLAT_kexec_image(&load.image, &compat_load.image);

    return kexec_do_load_v1(&load, 1);
#else
    return 0;
#endif
}

static int kexec_load(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_t load;
    xen_kexec_segment_t *segments;
    struct kexec_image *kimage = NULL;
    int ret;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    if ( load.nr_segments >= KEXEC_SEGMENT_MAX )
        return -EINVAL;

    segments = xmalloc_array(xen_kexec_segment_t, load.nr_segments);
    if ( segments == NULL )
        return -ENOMEM;

    if ( copy_from_guest(segments, load.segments.h, load.nr_segments) )
    {
        ret = -EFAULT;
        goto error;
    }

    ret = kimage_alloc(&kimage, load.type, load.arch, load.entry_maddr,
                       load.nr_segments, segments);
    if ( ret < 0 )
        goto error;

    ret = kimage_load_segments(kimage);
    if ( ret < 0 )
        goto error;

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

error:
    if ( ! kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

static int kexec_do_unload(xen_kexec_unload_t *unload)
{
    struct kexec_image *old_kimage;
    int ret;

    ret = kexec_swap_images(unload->type, NULL, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static int kexec_unload_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    unload.type = load.type;
    return kexec_do_unload(&unload);
}

static int kexec_unload_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&compat_load, uarg, 1) )
        return -EFAULT;

    unload.type = compat_load.type;
    return kexec_do_unload(&unload);
#else
    return 0;
#endif
}

static int kexec_unload(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_unload_t unload;

    if ( unlikely(copy_from_guest(&unload, uarg, 1)) )
        return -EFAULT;

    return kexec_do_unload(&unload);
}

static int kexec_status(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_status_t status;
    int base, bit;

    if ( unlikely(copy_from_guest(&status, uarg, 1)) )
        return -EFAULT;

    /* No need to check KEXEC_FLAG_IN_PROGRESS. */

    if ( kexec_load_get_bits(status.type, &base, &bit) )
        return -EINVAL;

    return !!test_bit(bit, &kexec_flags);
}

static int do_kexec_op_internal(unsigned long op,
                                XEN_GUEST_HANDLE_PARAM(void) uarg,
                                bool_t compat)
{
    int ret = -EINVAL;

    ret = xsm_kexec(XSM_PRIV);
    if ( ret )
        return ret;

    if ( test_and_set_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags) )
        return hypercall_create_continuation(__HYPERVISOR_kexec_op, "lh", op, uarg);

    switch ( op )
    {
    case KEXEC_CMD_kexec_get_range:
        if ( compat )
            ret = kexec_get_range_compat(uarg);
        else
            ret = kexec_get_range(uarg);
        break;
    case KEXEC_CMD_kexec_load_v1:
        if ( compat )
            ret = kexec_load_v1_compat(uarg);
        else
            ret = kexec_load_v1(uarg);
        break;
    case KEXEC_CMD_kexec_unload_v1:
        if ( compat )
            ret = kexec_unload_v1_compat(uarg);
        else
            ret = kexec_unload_v1(uarg);
        break;
    case KEXEC_CMD_kexec:
        ret = kexec_exec(uarg);
        break;
    case KEXEC_CMD_kexec_load:
        ret = kexec_load(uarg);
        break;
    case KEXEC_CMD_kexec_unload:
        ret = kexec_unload(uarg);
        break;
    case KEXEC_CMD_kexec_status:
        ret = kexec_status(uarg);
        break;
    }

    clear_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags);

    return ret;
}

long do_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, 0);
}

#ifdef CONFIG_COMPAT
int compat_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, 1);
}
#endif

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */