/******************************************************************************
 * kexec.c - Architecture independent kexec code for Xen
 *
 * Xen port written by:
 * - Simon 'Horms' Horman <horms@verge.net.au>
 * - Magnus Damm <magnus@valinux.co.jp>
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/acpi.h>
#include <xen/ctype.h>
#include <xen/elfcore.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/param.h>
#include <xen/watchdog.h>
#include <xen/sched.h>
#include <xen/types.h>
#include <xen/hypercall.h>
#include <xen/kexec.h>
#include <xen/keyhandler.h>
#include <public/kexec.h>
#include <xen/cpumask.h>
#include <asm/atomic.h>
#include <xen/spinlock.h>
#include <xen/version.h>
#include <xen/console.h>
#include <xen/kimage.h>
#include <public/elfnote.h>
#include <xsm/xsm.h>
#include <xen/cpu.h>
#ifdef CONFIG_COMPAT
#include <compat/kexec.h>
#endif

bool __read_mostly kexecing;

/* Memory regions to store the per cpu register state etc. on a crash. */
typedef struct { Elf_Note * start; size_t size; } crash_note_range_t;
static crash_note_range_t * crash_notes;

/* Lock to prevent race conditions when allocating the crash note buffers.
 * It also serves to protect calls to alloc_from_crash_heap when allocating
 * crash note buffers in lower memory. */
static DEFINE_SPINLOCK(crash_notes_lock);

static Elf_Note *xen_crash_note;

static cpumask_t crash_saved_cpus;

static struct kexec_image *kexec_image[KEXEC_IMAGE_NR];

#define KEXEC_FLAG_DEFAULT_POS   (KEXEC_IMAGE_NR + 0)
#define KEXEC_FLAG_CRASH_POS     (KEXEC_IMAGE_NR + 1)
#define KEXEC_FLAG_IN_PROGRESS   (KEXEC_IMAGE_NR + 2)
#define KEXEC_FLAG_IN_HYPERCALL  (KEXEC_IMAGE_NR + 3)

static unsigned long kexec_flags = 0; /* the lowest bits are for KEXEC_IMAGE... */
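
/*
 * A rough sketch of how these bits are used (illustrative only, assuming the
 * usual KEXEC_IMAGE_DEFAULT_BASE = 0, KEXEC_IMAGE_CRASH_BASE = 2 and
 * KEXEC_IMAGE_NR = 4 layout from the kexec headers):
 *
 *   bits 0-1: "a default image is loaded in slot 0/1"
 *   bits 2-3: "a crash image is loaded in slot 2/3"
 *   bit 4 (KEXEC_FLAG_DEFAULT_POS):  which default slot is currently active
 *   bit 5 (KEXEC_FLAG_CRASH_POS):    which crash slot is currently active
 *   bit 6 (KEXEC_FLAG_IN_PROGRESS):  a kexec is being executed
 *   bit 7 (KEXEC_FLAG_IN_HYPERCALL): serialises the kexec hypercall path
 */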

static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
static size_t vmcoreinfo_size = 0;

xen_kexec_reserve_t kexec_crash_area;
paddr_t __initdata kexec_crash_area_limit = ~(paddr_t)0;
static struct {
    u64 start, end;
    unsigned long size;
} ranges[16] __initdata;

/* Low crashinfo mode.  Start as INVALID so several codepaths can set up
 * defaults without needing to know the state of the others. */
enum low_crashinfo low_crashinfo_mode = LOW_CRASHINFO_INVALID;

/* This value is only considered if low_crash_mode is set to MIN or ALL, so
 * setting a default here is safe.  Default to 4GB.  This is because the
 * current KEXEC_CMD_get_range compat hypercall truncates 64-bit pointers to
 * 32 bits.  The typical use case for crashinfo_maxaddr will be 64-bit Xen
 * with a 32-bit dom0 and a 32-bit crash kernel. */
static paddr_t __initdata crashinfo_maxaddr = 4ULL << 30;

/* = log base 2 of crashinfo_maxaddr after checking for sanity.  Default to
 * larger than the entire physical address space. */
unsigned int __initdata crashinfo_maxaddr_bits = 64;

/* Pointers to keep track of the crash heap region. */
static void *crash_heap_current = NULL, *crash_heap_end = NULL;

/*
 * Parse command lines in the format
 *
 *   crashkernel=<ramsize-range>:<size>[,...][{@,<,below=}<address>]
 *
 * with <ramsize-range> being of form
 *
 *   <start>-[<end>]
 *
 * as well as the legacy ones in the format
 *
 *   crashkernel=<size>[{@,<}<address>]
 *   crashkernel=<size>,below=<address>
 *
 * < and below are synonymous, the latter being useful for grub2 systems
 * which would otherwise require escaping of the < option
 */
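
/*
 * Illustrative examples (the values are arbitrary, not defaults or
 * recommendations):
 *
 *   crashkernel=512M-2G:64M,2G-:128M
 *       reserve 64M when the machine has between 512M and 2G of RAM, and
 *       128M when it has 2G or more.
 *
 *   crashkernel=256M@64M
 *       reserve a fixed 256M region starting at the 64M boundary.
 *
 *   crashkernel=256M,below=4G
 *       reserve 256M anywhere below 4G (equivalent to "crashkernel=256M<4G").
 */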
static int __init cf_check parse_crashkernel(const char *str)
{
    const char *cur;
    int rc = 0;

    if ( strchr(str, ':' ) )
    {
        unsigned int idx = 0;

        do {
            if ( idx >= ARRAY_SIZE(ranges) )
            {
                printk(XENLOG_WARNING "crashkernel: too many ranges\n");
                cur = NULL;
                str = strpbrk(str, "@,<");
                rc = -EINVAL;
                break;
            }

            ranges[idx].start = parse_size_and_unit(cur = str + !!idx, &str);
            if ( cur == str )
                break;

            if ( *str != '-' )
            {
                printk(XENLOG_WARNING "crashkernel: '-' expected\n");
                rc = -EINVAL;
                break;
            }

            if ( *++str != ':' )
            {
                ranges[idx].end = parse_size_and_unit(cur = str, &str);
                if ( cur == str )
                    break;
                if ( ranges[idx].end <= ranges[idx].start )
                {
                    printk(XENLOG_WARNING "crashkernel: end <= start\n");
                    rc = -EINVAL;
                    break;
                }
            }
            else
                ranges[idx].end = -1;

            if ( *str != ':' )
            {
                printk(XENLOG_WARNING "crashkernel: ':' expected\n");
                rc = -EINVAL;
                break;
            }

            ranges[idx].size = parse_size_and_unit(cur = str + 1, &str);
            if ( cur == str )
                break;

            ++idx;
        } while ( *str == ',' );
        if ( idx < ARRAY_SIZE(ranges) )
            ranges[idx].size = 0;
    }
    else
        kexec_crash_area.size = parse_size_and_unit(cur = str, &str);
    if ( cur != str )
    {
        if ( *str == '@' )
            kexec_crash_area.start = parse_size_and_unit(cur = str + 1, &str);
        else if ( *str == '<' )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 1, &str);
        else if ( !strncmp(str, ",below=", 7) )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 7, &str);
        else if ( *str )
        {
            printk(XENLOG_WARNING "crashkernel: '%s' ignored\n", str);
            rc = -EINVAL;
        }
    }
    if ( cur && cur == str )
    {
        printk(XENLOG_WARNING "crashkernel: memory value expected\n");
        rc = -EINVAL;
    }

    return rc;
}
custom_param("crashkernel", parse_crashkernel);

/* Parse command lines in the format:
 *
 *   low_crashinfo=[none,min,all]
 *
 * - none disables the low allocation of crash info.
 * - min will allocate enough low information for the crash kernel to be able
 *   to extract the hypervisor and dom0 message ring buffers.
 * - all will allocate additional structures such as domain and vcpu structs
 *   low so the crash kernel can perform an extended analysis of state.
 */
static int __init cf_check parse_low_crashinfo(const char *str)
{

    if ( !strlen(str) )
        /* default to min if user just specifies "low_crashinfo" */
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "none" ) )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;
    else if ( !strcmp(str, "min" ) )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "all" ) )
        low_crashinfo_mode = LOW_CRASHINFO_ALL;
    else
    {
        printk("Unknown low_crashinfo parameter '%s'. Defaulting to min.\n", str);
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
        return -EINVAL;
    }

    return 0;
}
custom_param("low_crashinfo", parse_low_crashinfo);

/* Parse command lines in the format:
 *
 *   crashinfo_maxaddr=<addr>
 *
 * <addr> will be rounded down to the nearest power of two.  Defaults to 4G
 * (matching crashinfo_maxaddr above).
 */
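
/*
 * For example, a setup wanting all crash-note data placed low might boot
 * with something like (illustrative values):
 *
 *   low_crashinfo=all crashinfo_maxaddr=1G
 *
 * which asks for the crash notes, the crash heap and related structures to
 * be allocated below 1G (rounded down to a power of two via
 * crashinfo_maxaddr_bits in kexec_early_calculations()).
 */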
static int __init cf_check parse_crashinfo_maxaddr(const char *str)
{
    u64 addr;
    const char *q;

    /* if low_crashinfo_mode is unset, default to min. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;

    if ( (addr = parse_size_and_unit(str, &q)) )
        crashinfo_maxaddr = addr;
    else
    {
        printk("Unable to parse crashinfo_maxaddr. Defaulting to %"PRIpaddr"\n",
               crashinfo_maxaddr);
        return -EINVAL;
    }

    return *q ? -EINVAL : 0;
}
custom_param("crashinfo_maxaddr", parse_crashinfo_maxaddr);

void __init set_kexec_crash_area_size(u64 system_ram)
{
    unsigned int idx;

    for ( idx = 0; idx < ARRAY_SIZE(ranges) && !kexec_crash_area.size; ++idx )
    {
        if ( !ranges[idx].size )
            break;

        if ( ranges[idx].size >= system_ram )
        {
            printk(XENLOG_WARNING "crashkernel: invalid size\n");
            continue;
        }

        if ( ranges[idx].start <= system_ram && ranges[idx].end > system_ram )
            kexec_crash_area.size = ranges[idx].size;
    }
}

/*
 * Only allow one cpu to continue on the crash path, forcing others to spin.
 * Racing on the crash path from here will end in misery.  If we reenter,
 * something has gone very wrong and retrying will (almost certainly) be
 * futile.  Return up to our nested panic() to try and reboot.
 *
 * This is noinline to make it obvious in stack traces which cpus have lost
 * the race (as opposed to being somewhere in kexec_common_shutdown())
 */
static int noinline one_cpu_only(void)
{
    static unsigned int crashing_cpu = -1;
    unsigned int cpu = smp_processor_id();

    if ( cmpxchg(&crashing_cpu, -1, cpu) != -1 )
    {
        /* Not the first entry into one_cpu_only(). */
        if ( crashing_cpu == cpu )
        {
            printk("Reentered the crash path.  Something is very broken\n");
            return -EBUSY;
        }

        /*
         * Another cpu has beaten us to this point.  Wait here patiently for
         * it to kill us.
         */
        for ( ; ; )
            halt();
    }

    set_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags);
    printk("Executing kexec image on cpu%u\n", cpu);

    return 0;
}

/* Save the registers in the per-cpu crash note buffer. */
void kexec_crash_save_cpu(void)
{
    int cpu = smp_processor_id();
    Elf_Note *note;
    ELF_Prstatus *prstatus;
    crash_xen_core_t *xencore;

    BUG_ON ( ! crash_notes );

    if ( cpumask_test_and_set_cpu(cpu, &crash_saved_cpus) )
        return;

    note = crash_notes[cpu].start;

    prstatus = (ELF_Prstatus *)ELFNOTE_DESC(note);

    note = ELFNOTE_NEXT(note);
    xencore = (crash_xen_core_t *)ELFNOTE_DESC(note);

    elf_core_save_regs(&prstatus->pr_reg, xencore);
}

/* Set up the single Xen-specific-info crash note. */
crash_xen_info_t *kexec_crash_save_info(void)
{
    int cpu = smp_processor_id();
    crash_xen_info_t info;
    crash_xen_info_t *out = (crash_xen_info_t *)ELFNOTE_DESC(xen_crash_note);

    BUG_ON(!cpumask_test_and_set_cpu(cpu, &crash_saved_cpus));

    memset(&info, 0, sizeof(info));
    info.xen_major_version = xen_major_version();
    info.xen_minor_version = xen_minor_version();
    info.xen_extra_version = __pa(xen_extra_version());
    info.xen_changeset = __pa(xen_changeset());
    info.xen_compiler = __pa(xen_compiler());
    info.xen_compile_date = __pa(xen_compile_date());
    info.xen_compile_time = __pa(xen_compile_time());
    info.tainted = tainted;

    /* Copy from guaranteed-aligned local copy to possibly-unaligned dest. */
    memcpy(out, &info, sizeof(info));

    return out;
}

static int kexec_common_shutdown(void)
{
    int ret;

    ret = one_cpu_only();
    if ( ret )
        return ret;

    watchdog_disable();
    console_start_sync();
    spin_debug_disable();
    acpi_dmar_reinstate();

    return 0;
}

void kexec_crash(enum crash_reason reason)
{
    int pos;

    keyhandler_crash_action(reason);

    pos = (test_bit(KEXEC_FLAG_CRASH_POS, &kexec_flags) != 0);
    if ( !test_bit(KEXEC_IMAGE_CRASH_BASE + pos, &kexec_flags) )
        return;

    kexecing = true;

    if ( kexec_common_shutdown() != 0 )
        return;

    kexec_crash_save_cpu();
    machine_crash_shutdown();
    machine_kexec(kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]);

    BUG();
}

static long cf_check kexec_reboot(void *_image)
{
    struct kexec_image *image = _image;

    kexecing = true;

    kexec_common_shutdown();
    machine_reboot_kexec(image);

    BUG();
    return 0;
}

static void cf_check do_crashdump_trigger(unsigned char key)
{
    printk("'%c' pressed -> triggering crashdump\n", key);
    kexec_crash(CRASHREASON_DEBUGKEY);
    printk(" * no crash kernel loaded!\n");
}

static void setup_note(Elf_Note *n, const char *name, int type, int descsz)
{
    int l = strlen(name) + 1;
    strlcpy(ELFNOTE_NAME(n), name, l);
    n->namesz = l;
    n->descsz = descsz;
    n->type = type;
}

static size_t sizeof_note(const char *name, int descsz)
{
    return (sizeof(Elf_Note) +
            ELFNOTE_ALIGN(strlen(name)+1) +
            ELFNOTE_ALIGN(descsz));
}

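/*
 * Roughly, each CPU's crash note buffer ends up laid out as a sequence of
 * ELF notes (the real sizes come from sizeof_note() above):
 *
 *   [ "CORE" / NT_PRSTATUS             : ELF_Prstatus     ]
 *   [ "Xen"  / XEN_ELFNOTE_CRASH_REGS  : crash_xen_core_t ]
 *   [ "Xen"  / XEN_ELFNOTE_CRASH_INFO  : crash_xen_info_t ]  <- CPU0 only
 *
 * kexec_init_cpu_notes() below writes the note headers; the register state
 * and the Xen info are filled in at crash time by kexec_crash_save_cpu()
 * and kexec_crash_save_info().
 */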
static size_t sizeof_cpu_notes(const unsigned long cpu)
{
    /* All CPUs present a PRSTATUS and crash_xen_core note. */
    size_t bytes =
        + sizeof_note("CORE", sizeof(ELF_Prstatus)) +
        + sizeof_note("Xen", sizeof(crash_xen_core_t));

    /* CPU0 also presents the crash_xen_info note. */
    if ( ! cpu )
        bytes = bytes +
            sizeof_note("Xen", sizeof(crash_xen_info_t));

    return bytes;
}

/* Allocate size_t bytes of space from the previously allocated
 * crash heap if the user has requested that crash notes be allocated
 * in lower memory.  There is currently no case where the crash notes
 * should be free()'d. */
static void * alloc_from_crash_heap(const size_t bytes)
{
    void * ret;
    if ( crash_heap_current + bytes > crash_heap_end )
        return NULL;
    ret = (void*)crash_heap_current;
    crash_heap_current += bytes;
    return ret;
}

/* Allocate a crash note buffer for a newly onlined cpu. */
static int kexec_init_cpu_notes(const unsigned long cpu)
{
    Elf_Note * note = NULL;
    int ret = 0;
    int nr_bytes = 0;

    BUG_ON( cpu >= nr_cpu_ids || ! crash_notes );

    /* If already allocated, nothing to do. */
    if ( crash_notes[cpu].start )
        return ret;

    nr_bytes = sizeof_cpu_notes(cpu);

    /* If we don't care about the position of allocation, malloc. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_NONE )
        note = xzalloc_bytes(nr_bytes);

    /* Protect the write into crash_notes[] with a spinlock, as this function
     * is on a hotplug path and a hypercall path. */
    spin_lock(&crash_notes_lock);

    /* If we are racing with another CPU and it has beaten us, give up
     * gracefully. */
    if ( crash_notes[cpu].start )
    {
        spin_unlock(&crash_notes_lock);
        /* Always return ok, because whether we successfully allocated or not,
         * another CPU has successfully allocated. */
        xfree(note);
    }
    else
    {
        /* If we care about memory position, alloc from the crash heap,
         * also protected by the crash_notes_lock. */
        if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
            note = alloc_from_crash_heap(nr_bytes);

        crash_notes[cpu].start = note;
        crash_notes[cpu].size = nr_bytes;
        spin_unlock(&crash_notes_lock);

        /* If the allocation failed, and another CPU did not beat us, give
         * up with ENOMEM. */
        if ( ! note )
            ret = -ENOMEM;
        /* else all is good so lets set up the notes. */
        else
        {
            /* Set up CORE note. */
            setup_note(note, "CORE", NT_PRSTATUS, sizeof(ELF_Prstatus));
            note = ELFNOTE_NEXT(note);

            /* Set up Xen CORE note. */
            setup_note(note, "Xen", XEN_ELFNOTE_CRASH_REGS,
                       sizeof(crash_xen_core_t));

            if ( ! cpu )
            {
                /* Set up Xen Crash Info note. */
                xen_crash_note = note = ELFNOTE_NEXT(note);
                setup_note(note, "Xen", XEN_ELFNOTE_CRASH_INFO,
                           sizeof(crash_xen_info_t));
            }
        }
    }

    return ret;
}

static int cf_check cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned long cpu = (unsigned long)hcpu;

    /* Only hook on CPU_UP_PREPARE because once a crash_note has been reported
     * to dom0, it must keep it around in case of a crash, as the crash kernel
     * will be hard coded to the original physical address reported. */
    switch ( action )
    {
    case CPU_UP_PREPARE:
        /* Ignore return value.  If this is boot time, -ENOMEM will cause all
         * manner of problems elsewhere very soon, and if it is during runtime,
         * then failing to allocate crash notes is not a good enough reason to
         * fail the CPU_UP_PREPARE */
        kexec_init_cpu_notes(cpu);
        break;
    default:
        break;
    }
    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

void __init kexec_early_calculations(void)
{
    /* If low_crashinfo_mode is still INVALID, neither "low_crashinfo" nor
     * "crashinfo_maxaddr" have been specified on the command line, so
     * explicitly set to NONE. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
        crashinfo_maxaddr_bits = fls64(crashinfo_maxaddr) - 1;
}

static int __init cf_check kexec_init(void)
{
    void *cpu = (void *)(unsigned long)smp_processor_id();

    /* If no crash area, no need to allocate space for notes. */
    if ( !kexec_crash_area.size )
        return 0;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
    {
        size_t crash_heap_size;

        /* This calculation is safe even if the machine is booted in
         * uniprocessor mode. */
        crash_heap_size = sizeof_cpu_notes(0) +
            sizeof_cpu_notes(1) * (nr_cpu_ids - 1);
        crash_heap_size = PAGE_ALIGN(crash_heap_size);

        crash_heap_current = alloc_xenheap_pages(
            get_order_from_bytes(crash_heap_size),
            MEMF_bits(crashinfo_maxaddr_bits) );

        if ( ! crash_heap_current )
            return -ENOMEM;

        memset(crash_heap_current, 0, crash_heap_size);

        crash_heap_end = crash_heap_current + crash_heap_size;
    }

    /* crash_notes may be allocated anywhere Xen can reach in memory.
       Only the individual CPU crash notes themselves must be allocated
       in lower memory if requested. */
    crash_notes = xzalloc_array(crash_note_range_t, nr_cpu_ids);
    if ( ! crash_notes )
        return -ENOMEM;

    register_keyhandler('C', do_crashdump_trigger, "trigger a crashdump", 0);

    cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
    register_cpu_notifier(&cpu_nfb);
    return 0;
}
/* The reason for this to be a presmp_initcall as opposed to a regular
 * __initcall is to allow the setup of the cpu hotplug handler before APs are
 * brought up. */
presmp_initcall(kexec_init);

static int kexec_get_reserve(xen_kexec_range_t *range)
{
    if ( kexec_crash_area.size > 0 && kexec_crash_area.start > 0 )
    {
        range->start = kexec_crash_area.start;
        range->size = kexec_crash_area.size;
    }
    else
        range->start = range->size = 0;
    return 0;
}

static int kexec_get_cpu(xen_kexec_range_t *range)
{
    int nr = range->nr;

    if ( nr < 0 || nr >= nr_cpu_ids )
        return -ERANGE;

    if ( ! crash_notes )
        return -EINVAL;

    /* Try once again to allocate room for the crash notes.  It is just
     * possible that more space has become available since we last tried.
     * If space has already been allocated, kexec_init_cpu_notes() will
     * return early with 0. */
    kexec_init_cpu_notes(nr);

    /* In the case of still not having enough memory to allocate buffer room,
     * returning a range of 0,0 is still valid. */
    if ( crash_notes[nr].start )
    {
        range->start = __pa(crash_notes[nr].start);
        range->size = crash_notes[nr].size;
    }
    else
        range->start = range->size = 0;

    return 0;
}

static int kexec_get_vmcoreinfo(xen_kexec_range_t *range)
{
    range->start = __pa((unsigned long)vmcoreinfo_data);
    range->size = VMCOREINFO_BYTES;
    return 0;
}

static int kexec_get_range_internal(xen_kexec_range_t *range)
{
    int ret = -EINVAL;

    switch ( range->range )
    {
    case KEXEC_RANGE_MA_CRASH:
        ret = kexec_get_reserve(range);
        break;
    case KEXEC_RANGE_MA_CPU:
        ret = kexec_get_cpu(range);
        break;
    case KEXEC_RANGE_MA_VMCOREINFO:
        ret = kexec_get_vmcoreinfo(range);
        break;
    default:
        ret = machine_kexec_get(range);
        break;
    }

    return ret;
}

static int kexec_get_range(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_range_t range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&range, uarg, 1)) )
        return -EFAULT;

    ret = kexec_get_range_internal(&range);

    if ( ret == 0 && unlikely(__copy_to_guest(uarg, &range, 1)) )
        ret = -EFAULT;

    return ret;
}

static int kexec_get_range_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    xen_kexec_range_t range;
    compat_kexec_range_t compat_range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&compat_range, uarg, 1)) )
        return -EFAULT;

    XLAT_kexec_range(&range, &compat_range);

    ret = kexec_get_range_internal(&range);

    /* Don't silently truncate physical addresses or sizes. */
    if ( (range.start | range.size) & ~(unsigned long)(~0u) )
        return -ERANGE;

    if ( ret == 0 )
    {
        XLAT_kexec_range(&compat_range, &range);
        if ( unlikely(__copy_to_guest(uarg, &compat_range, 1)) )
            ret = -EFAULT;
    }

    return ret;
#else /* CONFIG_COMPAT */
    return 0;
#endif /* CONFIG_COMPAT */
}

static int kexec_load_get_bits(int type, int *base, int *bit)
{
    switch ( type )
    {
    case KEXEC_TYPE_DEFAULT:
        *base = KEXEC_IMAGE_DEFAULT_BASE;
        *bit = KEXEC_FLAG_DEFAULT_POS;
        break;
    case KEXEC_TYPE_CRASH:
        *base = KEXEC_IMAGE_CRASH_BASE;
        *bit = KEXEC_FLAG_CRASH_POS;
        break;
    default:
        return -1;
    }
    return 0;
}

void vmcoreinfo_append_str(const char *fmt, ...)
{
    va_list args;
    char buf[0x50];
    int r;
    size_t note_size = sizeof(Elf_Note) +
        ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1);

    if ( vmcoreinfo_size + note_size + sizeof(buf) > VMCOREINFO_BYTES )
        return;

    va_start(args, fmt);
    r = vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);

    memcpy(&vmcoreinfo_data[note_size + vmcoreinfo_size], buf, r);

    vmcoreinfo_size += r;
}

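/*
 * The VMCOREINFO_*() helpers used below boil down to vmcoreinfo_append_str()
 * calls.  As a rough sketch (the exact strings are defined by the macros in
 * the kexec headers, not here), the note body ends up containing lines such
 * as:
 *
 *   PAGESIZE=4096
 *   SYMBOL(max_page)=<virtual address of max_page>
 *   SIZE(domain)=<sizeof(struct domain)>
 *   OFFSET(domain.domain_id)=<offsetof(struct domain, domain_id)>
 *
 * which dump analysis tools (e.g. crash / makedumpfile) parse to locate
 * hypervisor symbols and structure layouts.
 */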
static void crash_save_vmcoreinfo(void)
{
    size_t data_size;

    if ( vmcoreinfo_size > 0 )    /* already saved */
        return;

    data_size = VMCOREINFO_BYTES -
        (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1));
    setup_note((Elf_Note *)vmcoreinfo_data, VMCOREINFO_NOTE_NAME, 0, data_size);

    VMCOREINFO_PAGESIZE(PAGE_SIZE);

    VMCOREINFO_SYMBOL(domain_list);
#ifndef frame_table
    VMCOREINFO_SYMBOL(frame_table);
#else
    {
        static const void *const _frame_table = frame_table;
        VMCOREINFO_SYMBOL_ALIAS(frame_table, _frame_table);
    }
#endif
    VMCOREINFO_SYMBOL(max_page);

    VMCOREINFO_STRUCT_SIZE(page_info);
    VMCOREINFO_STRUCT_SIZE(domain);

    VMCOREINFO_OFFSET(page_info, count_info);
    VMCOREINFO_OFFSET_SUB(page_info, v.inuse, _domain);
    VMCOREINFO_OFFSET(domain, domain_id);
    VMCOREINFO_OFFSET(domain, next_in_list);

#ifdef ARCH_CRASH_SAVE_VMCOREINFO
    arch_crash_save_vmcoreinfo();
#endif
}

static void kexec_unload_image(struct kexec_image *image)
{
    if ( !image )
        return;

    machine_kexec_unload(image);
    kimage_free(image);
}

static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_exec_t exec;
    struct kexec_image *image;
    int base, bit, pos, ret = -EINVAL;

    if ( unlikely(copy_from_guest(&exec, uarg, 1)) )
        return -EFAULT;

    if ( kexec_load_get_bits(exec.type, &base, &bit) )
        return -EINVAL;

    pos = (test_bit(bit, &kexec_flags) != 0);

    /* Only allow kexec/kdump into loaded images */
    if ( !test_bit(base + pos, &kexec_flags) )
        return -ENOENT;

    switch ( exec.type )
    {
    case KEXEC_TYPE_DEFAULT:
        image = kexec_image[base + pos];
        ret = continue_hypercall_on_cpu(0, kexec_reboot, image);
        break;
    case KEXEC_TYPE_CRASH:
        kexec_crash(CRASHREASON_KEXECCMD); /* Does not return */
        break;
    }

    return -EINVAL; /* never reached */
}

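/*
 * Each image type has two slots (base + 0 and base + 1) with a POS flag
 * selecting the active one.  This appears intended so that a new image can
 * be swapped in without the crash path ever seeing a half-updated slot:
 * the new image goes into the inactive slot, the POS bit is flipped, and
 * only then is the old slot's "loaded" bit cleared and the old image
 * returned to the caller for unloading.
 */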
static int kexec_swap_images(int type, struct kexec_image *new,
                             struct kexec_image **old)
{
    int base, bit, pos;
    int new_slot, old_slot;

    *old = NULL;

    if ( test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
        return -EBUSY;

    if ( kexec_load_get_bits(type, &base, &bit) )
        return -EINVAL;

    ASSERT(test_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags));

    pos = (test_bit(bit, &kexec_flags) != 0);
    old_slot = base + pos;
    new_slot = base + !pos;

    kexec_image[new_slot] = new;
    if ( new )
        set_bit(new_slot, &kexec_flags);
    change_bit(bit, &kexec_flags);

    clear_bit(old_slot, &kexec_flags);
    *old = kexec_image[old_slot];

    return 0;
}

static int kexec_load_slot(struct kexec_image *kimage)
{
    struct kexec_image *old_kimage;
    int ret = -ENOMEM;

    ret = machine_kexec_load(kimage);
    if ( ret < 0 )
        return ret;

    crash_save_vmcoreinfo();

    ret = kexec_swap_images(kimage->type, kimage, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static uint16_t kexec_load_v1_arch(void)
{
#ifdef CONFIG_X86
    return is_pv_32bit_domain(hardware_domain) ? EM_386 : EM_X86_64;
#else
    return EM_NONE;
#endif
}

static int kexec_segments_add_segment(unsigned int *nr_segments,
                                      xen_kexec_segment_t *segments,
                                      mfn_t mfn)
{
    paddr_t maddr = mfn_to_maddr(mfn);
    unsigned int n = *nr_segments;

    /* Need a new segment? */
    if ( n == 0
         || segments[n-1].dest_maddr + segments[n-1].dest_size != maddr )
    {
        n++;
        if ( n > KEXEC_SEGMENT_MAX )
            return -EINVAL;
        *nr_segments = n;

        set_xen_guest_handle(segments[n-1].buf.h, NULL);
        segments[n-1].buf_size = 0;
        segments[n-1].dest_maddr = maddr;
        segments[n-1].dest_size = 0;
    }

    return 0;
}

static int kexec_segments_from_ind_page(mfn_t mfn,
                                        unsigned int *nr_segments,
                                        xen_kexec_segment_t *segments,
                                        bool compat)
{
    void *page;
    kimage_entry_t *entry;
    int ret = 0;

    page = map_domain_page(mfn);

    /*
     * Walk the indirection page list, adding destination pages to the
     * segments.
     */
    for ( entry = page; ; )
    {
        unsigned long ind;

        ind = kimage_entry_ind(entry, compat);
        mfn = kimage_entry_mfn(entry, compat);

        switch ( ind )
        {
        case IND_DESTINATION:
            ret = kexec_segments_add_segment(nr_segments, segments, mfn);
            if ( ret < 0 )
                goto done;
            break;
        case IND_INDIRECTION:
            unmap_domain_page(page);
            entry = page = map_domain_page(mfn);
            continue;
        case IND_DONE:
            goto done;
        case IND_SOURCE:
            if ( *nr_segments == 0 )
            {
                ret = -EINVAL;
                goto done;
            }
            segments[*nr_segments-1].dest_size += PAGE_SIZE;
            break;
        default:
            ret = -EINVAL;
            goto done;
        }
        entry = kimage_entry_next(entry, compat);
    }
 done:
    unmap_domain_page(page);
    return ret;
}

static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat)
{
    struct kexec_image *kimage = NULL;
    xen_kexec_segment_t *segments;
    uint16_t arch;
    unsigned int nr_segments = 0;
    mfn_t ind_mfn = maddr_to_mfn(load->image.indirection_page);
    int ret;

    arch = kexec_load_v1_arch();
    if ( arch == EM_NONE )
        return -ENOSYS;

    segments = xmalloc_array(xen_kexec_segment_t, KEXEC_SEGMENT_MAX);
    if ( segments == NULL )
        return -ENOMEM;

    /*
     * Work out the image segments (destination only) from the
     * indirection pages.
     *
     * This is needed so we don't allocate pages that will overlap
     * with the destination when building the new set of indirection
     * pages below.
     */
    ret = kexec_segments_from_ind_page(ind_mfn, &nr_segments, segments, compat);
    if ( ret < 0 )
        goto error;

    ret = kimage_alloc(&kimage, load->type, arch, load->image.start_address,
                       nr_segments, segments);
    if ( ret < 0 )
        goto error;

    /*
     * Build a new set of indirection pages in the native format.
     *
     * This walks the guest provided indirection pages a second time.
     * The guest could have altered them, invalidating the segment
     * information constructed above.  This will only result in the
     * resulting image being potentially unrelocatable.
     */
    ret = kimage_build_ind(kimage, ind_mfn, compat);
    if ( ret < 0 )
        goto error;

    if ( arch == EM_386 || arch == EM_X86_64 )
    {
        /*
         * Ensure 0 - 1 MiB is mapped and accessible by the image.
         *
         * This allows access to VGA memory and the region that purgatory
         * copies in the crash case.
         */
        unsigned long addr;

        for ( addr = 0; addr < MB(1); addr += PAGE_SIZE )
        {
            ret = machine_kexec_add_page(kimage, addr, addr);
            if ( ret < 0 )
                goto error;
        }
    }

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

 error:
    if ( !kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

static int kexec_load_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&load, uarg, 1)) )
        return -EFAULT;

    return kexec_do_load_v1(&load, 0);
}

static int kexec_load_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&compat_load, uarg, 1)) )
        return -EFAULT;

    /* This is a bit dodgy: load.image is inside load,
     * but XLAT_kexec_load (which is automatically generated)
     * doesn't translate load.image (correctly).
     * Just copy load->type, the only other member, manually instead.
     *
     * XLAT_kexec_load(&load, &compat_load);
     */
    load.type = compat_load.type;
    XLAT_kexec_image(&load.image, &compat_load.image);

    return kexec_do_load_v1(&load, 1);
#else
    return 0;
#endif
}

static int kexec_load(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_t load;
    xen_kexec_segment_t *segments;
    struct kexec_image *kimage = NULL;
    int ret;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    if ( load.nr_segments >= KEXEC_SEGMENT_MAX )
        return -EINVAL;

    segments = xmalloc_array(xen_kexec_segment_t, load.nr_segments);
    if ( segments == NULL )
        return -ENOMEM;

    if ( copy_from_guest(segments, load.segments.h, load.nr_segments) )
    {
        ret = -EFAULT;
        goto error;
    }

    ret = kimage_alloc(&kimage, load.type, load.arch, load.entry_maddr,
                       load.nr_segments, segments);
    if ( ret < 0 )
        goto error;

    ret = kimage_load_segments(kimage);
    if ( ret < 0 )
        goto error;

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

 error:
    if ( ! kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

static int kexec_do_unload(xen_kexec_unload_t *unload)
{
    struct kexec_image *old_kimage;
    int ret;

    ret = kexec_swap_images(unload->type, NULL, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static int kexec_unload_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    unload.type = load.type;
    return kexec_do_unload(&unload);
}

static int kexec_unload_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&compat_load, uarg, 1) )
        return -EFAULT;

    unload.type = compat_load.type;
    return kexec_do_unload(&unload);
#else
    return 0;
#endif
}

static int kexec_unload(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_unload_t unload;

    if ( unlikely(copy_from_guest(&unload, uarg, 1)) )
        return -EFAULT;

    return kexec_do_unload(&unload);
}

static int kexec_status(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_status_t status;
    int base, bit;

    if ( unlikely(copy_from_guest(&status, uarg, 1)) )
        return -EFAULT;

    /* No need to check KEXEC_FLAG_IN_PROGRESS. */

    if ( kexec_load_get_bits(status.type, &base, &bit) )
        return -EINVAL;

    return !!test_bit(bit, &kexec_flags);
}

static int do_kexec_op_internal(unsigned long op,
                                XEN_GUEST_HANDLE_PARAM(void) uarg,
                                bool compat)
{
    int ret = -EINVAL;

    ret = xsm_kexec(XSM_PRIV);
    if ( ret )
        return ret;

    if ( test_and_set_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags) )
        return hypercall_create_continuation(__HYPERVISOR_kexec_op, "lh", op, uarg);

    switch ( op )
    {
    case KEXEC_CMD_kexec_get_range:
        if ( compat )
            ret = kexec_get_range_compat(uarg);
        else
            ret = kexec_get_range(uarg);
        break;
    case KEXEC_CMD_kexec_load_v1:
        if ( compat )
            ret = kexec_load_v1_compat(uarg);
        else
            ret = kexec_load_v1(uarg);
        break;
    case KEXEC_CMD_kexec_unload_v1:
        if ( compat )
            ret = kexec_unload_v1_compat(uarg);
        else
            ret = kexec_unload_v1(uarg);
        break;
    case KEXEC_CMD_kexec:
        ret = kexec_exec(uarg);
        break;
    case KEXEC_CMD_kexec_load:
        ret = kexec_load(uarg);
        break;
    case KEXEC_CMD_kexec_unload:
        ret = kexec_unload(uarg);
        break;
    case KEXEC_CMD_kexec_status:
        ret = kexec_status(uarg);
        break;
    }

    clear_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags);

    return ret;
}

long do_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, 0);
}

#ifdef CONFIG_COMPAT
int compat_kexec_op(unsigned int op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, 1);
}
#endif

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */
