/******************************************************************************
 * kexec.c - Architecture independent kexec code for Xen
 *
 * Xen port written by:
 * - Simon 'Horms' Horman <horms@verge.net.au>
 * - Magnus Damm <magnus@valinux.co.jp>
 */

#include <xen/init.h>
#include <xen/lib.h>
#include <xen/acpi.h>
#include <xen/ctype.h>
#include <xen/errno.h>
#include <xen/guest_access.h>
#include <xen/watchdog.h>
#include <xen/sched.h>
#include <xen/types.h>
#include <xen/hypercall.h>
#include <xen/kexec.h>
#include <xen/keyhandler.h>
#include <public/kexec.h>
#include <xen/cpumask.h>
#include <asm/atomic.h>
#include <xen/spinlock.h>
#include <xen/version.h>
#include <xen/console.h>
#include <xen/kexec.h>
#include <xen/kimage.h>
#include <public/elfnote.h>
#include <xsm/xsm.h>
#include <xen/cpu.h>
#ifdef CONFIG_COMPAT
#include <compat/kexec.h>
#endif

bool_t kexecing = FALSE;

/* Memory regions to store the per cpu register state etc. on a crash. */
typedef struct { Elf_Note * start; size_t size; } crash_note_range_t;
static crash_note_range_t * crash_notes;

/* Lock to prevent race conditions when allocating the crash note buffers.
 * It also serves to protect calls to alloc_from_crash_heap when allocating
 * crash note buffers in lower memory. */
static DEFINE_SPINLOCK(crash_notes_lock);

static Elf_Note *xen_crash_note;

static cpumask_t crash_saved_cpus;

static struct kexec_image *kexec_image[KEXEC_IMAGE_NR];

#define KEXEC_FLAG_DEFAULT_POS   (KEXEC_IMAGE_NR + 0)
#define KEXEC_FLAG_CRASH_POS     (KEXEC_IMAGE_NR + 1)
#define KEXEC_FLAG_IN_PROGRESS   (KEXEC_IMAGE_NR + 2)
#define KEXEC_FLAG_IN_HYPERCALL  (KEXEC_IMAGE_NR + 3)

static unsigned long kexec_flags = 0; /* the lowest bits are for KEXEC_IMAGE... */

static unsigned char vmcoreinfo_data[VMCOREINFO_BYTES];
static size_t vmcoreinfo_size = 0;

xen_kexec_reserve_t kexec_crash_area;
paddr_t __initdata kexec_crash_area_limit = ~(paddr_t)0;
static struct {
    u64 start, end;
    unsigned long size;
} ranges[16] __initdata;

/* Low crashinfo mode. Start as INVALID so several codepaths can set up
 * defaults without needing to know the state of the others. */
enum low_crashinfo low_crashinfo_mode = LOW_CRASHINFO_INVALID;

/* This value is only considered if low_crashinfo_mode is set to MIN or ALL,
 * so setting a default here is safe. Default to 4GB. This is because the
 * current KEXEC_CMD_get_range compat hypercall truncates 64bit pointers to
 * 32 bits. The typical usecase for crashinfo_maxaddr will be for 64bit Xen
 * with 32bit dom0 and 32bit crash kernel. */
static paddr_t __initdata crashinfo_maxaddr = 4ULL << 30;

/* = log base 2 of crashinfo_maxaddr after checking for sanity. Default to
 * larger than the entire physical address space. */
unsigned int __initdata crashinfo_maxaddr_bits = 64;

/* Pointers to keep track of the crash heap region. */
static void *crash_heap_current = NULL, *crash_heap_end = NULL;

/*
 * Parse command lines in the format
 *
 *   crashkernel=<ramsize-range>:<size>[,...][{@,<,below=}<address>]
 *
 * with <ramsize-range> being of form
 *
 *   <start>-[<end>]
 *
 * as well as the legacy ones in the format
 *
 *   crashkernel=<size>[{@,<}<address>]
 *   crashkernel=<size>,below=address
 *
 * < and below are synonymous, the latter being useful for grub2 systems
 * which would otherwise require escaping of the < option
 */
static int __init parse_crashkernel(const char *str)
{
    const char *cur;
    int rc = 0;

    if ( strchr(str, ':' ) )
    {
        unsigned int idx = 0;

        do {
            if ( idx >= ARRAY_SIZE(ranges) )
            {
                printk(XENLOG_WARNING "crashkernel: too many ranges\n");
                cur = NULL;
                str = strpbrk(str, "@,<");
                rc = -EINVAL;
                break;
            }

            ranges[idx].start = parse_size_and_unit(cur = str + !!idx, &str);
            if ( cur == str )
                break;

            if ( *str != '-' )
            {
                printk(XENLOG_WARNING "crashkernel: '-' expected\n");
                rc = -EINVAL;
                break;
            }

            if ( *++str != ':' )
            {
                ranges[idx].end = parse_size_and_unit(cur = str, &str);
                if ( cur == str )
                    break;
                if ( ranges[idx].end <= ranges[idx].start )
                {
                    printk(XENLOG_WARNING "crashkernel: end <= start\n");
                    rc = -EINVAL;
                    break;
                }
            }
            else
                ranges[idx].end = -1;

            if ( *str != ':' )
            {
                printk(XENLOG_WARNING "crashkernel: ':' expected\n");
                rc = -EINVAL;
                break;
            }

            ranges[idx].size = parse_size_and_unit(cur = str + 1, &str);
            if ( cur == str )
                break;

            ++idx;
        } while ( *str == ',' );
        if ( idx < ARRAY_SIZE(ranges) )
            ranges[idx].size = 0;
    }
    else
        kexec_crash_area.size = parse_size_and_unit(cur = str, &str);
    if ( cur != str )
    {
        if ( *str == '@' )
            kexec_crash_area.start = parse_size_and_unit(cur = str + 1, &str);
        else if ( *str == '<' )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 1, &str);
        else if ( !strncmp(str, ",below=", 7) )
            kexec_crash_area_limit = parse_size_and_unit(cur = str + 7, &str);
        else
        {
            printk(XENLOG_WARNING "crashkernel: '%s' ignored\n", str);
            rc = -EINVAL;
        }
    }
    if ( cur && cur == str )
    {
        printk(XENLOG_WARNING "crashkernel: memory value expected\n");
        rc = -EINVAL;
    }

    return rc;
}
custom_param("crashkernel", parse_crashkernel);

/* Parse command lines in the format:
 *
 *   low_crashinfo=[none,min,all]
 *
 * - none disables the low allocation of crash info.
 * - min will allocate enough low information for the crash kernel to be able
 *   to extract the hypervisor and dom0 message ring buffers.
 * - all will allocate additional structures such as domain and vcpu structs
 *   low so the crash kernel can perform an extended analysis of state.
 */
static int __init parse_low_crashinfo(const char *str)
{

    if ( !strlen(str) )
        /* default to min if user just specifies "low_crashinfo" */
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "none" ) )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;
    else if ( !strcmp(str, "min" ) )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
    else if ( !strcmp(str, "all" ) )
        low_crashinfo_mode = LOW_CRASHINFO_ALL;
    else
    {
        printk("Unknown low_crashinfo parameter '%s'. Defaulting to min.\n", str);
        low_crashinfo_mode = LOW_CRASHINFO_MIN;
        return -EINVAL;
    }

    return 0;
}
custom_param("low_crashinfo", parse_low_crashinfo);

/* Parse command lines in the format:
 *
 *   crashinfo_maxaddr=<addr>
 *
 * <addr> will be rounded down to the nearest power of two. Defaults to 4G.
 */
static int __init parse_crashinfo_maxaddr(const char *str)
{
    u64 addr;
    const char *q;

    /* if low_crashinfo_mode is unset, default to min. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_MIN;

    if ( (addr = parse_size_and_unit(str, &q)) )
        crashinfo_maxaddr = addr;
    else
    {
        printk("Unable to parse crashinfo_maxaddr. Defaulting to %"PRIpaddr"\n",
               crashinfo_maxaddr);
        return -EINVAL;
    }

    return *q ? -EINVAL : 0;
}
custom_param("crashinfo_maxaddr", parse_crashinfo_maxaddr);

void __init set_kexec_crash_area_size(u64 system_ram)
{
    unsigned int idx;

    for ( idx = 0; idx < ARRAY_SIZE(ranges) && !kexec_crash_area.size; ++idx )
    {
        if ( !ranges[idx].size )
            break;

        if ( ranges[idx].size >= system_ram )
        {
            printk(XENLOG_WARNING "crashkernel: invalid size\n");
            continue;
        }

        if ( ranges[idx].start <= system_ram && ranges[idx].end > system_ram )
            kexec_crash_area.size = ranges[idx].size;
    }
}

/*
 * Only allow one cpu to continue on the crash path, forcing others to spin.
 * Racing on the crash path from here will end in misery. If we reenter,
 * something has gone very wrong and retrying will (almost certainly) be
 * futile. Return up to our nested panic() to try and reboot.
 *
 * This is noinline to make it obvious in stack traces which cpus have lost
 * the race (as opposed to being somewhere in kexec_common_shutdown())
 */
static int noinline one_cpu_only(void)
{
    static unsigned int crashing_cpu = -1;
    unsigned int cpu = smp_processor_id();

    if ( cmpxchg(&crashing_cpu, -1, cpu) != -1 )
    {
        /* Not the first entry into one_cpu_only(). */
        if ( crashing_cpu == cpu )
        {
            printk("Reentered the crash path. Something is very broken\n");
            return -EBUSY;
        }

        /*
         * Another cpu has beaten us to this point. Wait here patiently for
         * it to kill us.
         */
        for ( ; ; )
            halt();
    }

    set_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags);
    printk("Executing kexec image on cpu%u\n", cpu);

    return 0;
}

/* Save the registers in the per-cpu crash note buffer. */
void kexec_crash_save_cpu(void)
{
    int cpu = smp_processor_id();
    Elf_Note *note;
    ELF_Prstatus *prstatus;
    crash_xen_core_t *xencore;

    BUG_ON ( ! crash_notes );

    if ( cpumask_test_and_set_cpu(cpu, &crash_saved_cpus) )
        return;

    note = crash_notes[cpu].start;

    prstatus = (ELF_Prstatus *)ELFNOTE_DESC(note);

    note = ELFNOTE_NEXT(note);
    xencore = (crash_xen_core_t *)ELFNOTE_DESC(note);

    elf_core_save_regs(&prstatus->pr_reg, xencore);
}

/* Set up the single Xen-specific-info crash note. */
crash_xen_info_t *kexec_crash_save_info(void)
{
    int cpu = smp_processor_id();
    crash_xen_info_t info;
    crash_xen_info_t *out = (crash_xen_info_t *)ELFNOTE_DESC(xen_crash_note);

    BUG_ON(!cpumask_test_and_set_cpu(cpu, &crash_saved_cpus));

    memset(&info, 0, sizeof(info));
    info.xen_major_version = xen_major_version();
    info.xen_minor_version = xen_minor_version();
    info.xen_extra_version = __pa(xen_extra_version());
    info.xen_changeset = __pa(xen_changeset());
    info.xen_compiler = __pa(xen_compiler());
    info.xen_compile_date = __pa(xen_compile_date());
    info.xen_compile_time = __pa(xen_compile_time());
    info.tainted = tainted;

    /* Copy from guaranteed-aligned local copy to possibly-unaligned dest. */
    memcpy(out, &info, sizeof(info));

    return out;
}

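/* Quiesce the system before jumping into a new image: ensure only one cpu
 * continues, then disable the watchdog, switch the console to synchronous
 * mode, turn off spinlock debugging and reinstate the DMAR ACPI table for
 * the next kernel. Shared by the crash and reboot paths. */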
static int kexec_common_shutdown(void)
{
    int ret;

    ret = one_cpu_only();
    if ( ret )
        return ret;

    watchdog_disable();
    console_start_sync();
    spin_debug_disable();
    acpi_dmar_reinstate();

    return 0;
}

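/* Crash entry point: save this cpu's registers and jump into the loaded
 * crash image, if one is present. Does not return on success. */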
void kexec_crash(void)
{
    int pos;

    pos = (test_bit(KEXEC_FLAG_CRASH_POS, &kexec_flags) != 0);
    if ( !test_bit(KEXEC_IMAGE_CRASH_BASE + pos, &kexec_flags) )
        return;

    kexecing = TRUE;

    if ( kexec_common_shutdown() != 0 )
        return;

    kexec_crash_save_cpu();
    machine_crash_shutdown();
    machine_kexec(kexec_image[KEXEC_IMAGE_CRASH_BASE + pos]);

    BUG();
}

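/* Continuation body for KEXEC_CMD_kexec on a default image: runs on cpu 0
 * via continue_hypercall_on_cpu() and never returns. */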
static long kexec_reboot(void *_image)
{
    struct kexec_image *image = _image;

    kexecing = TRUE;

    kexec_common_shutdown();
    machine_reboot_kexec(image);

    BUG();
    return 0;
}

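/* 'C' debug key handler: trigger a crashdump via kexec_crash(). */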
static void do_crashdump_trigger(unsigned char key)
{
    printk("'%c' pressed -> triggering crashdump\n", key);
    kexec_crash();
    printk(" * no crash kernel loaded!\n");
}

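/* Fill in an Elf_Note header and name; the descriptor data itself is
 * written later, e.g. by kexec_crash_save_cpu(). */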
static void setup_note(Elf_Note *n, const char *name, int type, int descsz)
{
    int l = strlen(name) + 1;
    strlcpy(ELFNOTE_NAME(n), name, l);
    n->namesz = l;
    n->descsz = descsz;
    n->type = type;
}

static size_t sizeof_note(const char *name, int descsz)
{
    return (sizeof(Elf_Note) +
            ELFNOTE_ALIGN(strlen(name)+1) +
            ELFNOTE_ALIGN(descsz));
}

static size_t sizeof_cpu_notes(const unsigned long cpu)
{
    /* All CPUs present a PRSTATUS and crash_xen_core note. */
    size_t bytes =
        + sizeof_note("CORE", sizeof(ELF_Prstatus)) +
        + sizeof_note("Xen", sizeof(crash_xen_core_t));

    /* CPU0 also presents the crash_xen_info note. */
    if ( ! cpu )
        bytes = bytes +
            sizeof_note("Xen", sizeof(crash_xen_info_t));

    return bytes;
}

/* Allocate size_t bytes of space from the previously allocated
 * crash heap if the user has requested that crash notes be allocated
 * in lower memory. There is currently no case where the crash notes
 * should be free()'d. */
static void * alloc_from_crash_heap(const size_t bytes)
{
    void * ret;
    if ( crash_heap_current + bytes > crash_heap_end )
        return NULL;
    ret = (void*)crash_heap_current;
    crash_heap_current += bytes;
    return ret;
}

/* Allocate a crash note buffer for a newly onlined cpu. */
static int kexec_init_cpu_notes(const unsigned long cpu)
{
    Elf_Note * note = NULL;
    int ret = 0;
    int nr_bytes = 0;

    BUG_ON( cpu >= nr_cpu_ids || ! crash_notes );

    /* If already allocated, nothing to do. */
    if ( crash_notes[cpu].start )
        return ret;

    nr_bytes = sizeof_cpu_notes(cpu);

    /* If we don't care about the position of allocation, malloc. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_NONE )
        note = xzalloc_bytes(nr_bytes);

    /* Protect the write into crash_notes[] with a spinlock, as this function
     * is on a hotplug path and a hypercall path. */
    spin_lock(&crash_notes_lock);

    /* If we are racing with another CPU and it has beaten us, give up
     * gracefully. */
    if ( crash_notes[cpu].start )
    {
        spin_unlock(&crash_notes_lock);
        /* Always return ok, because whether we successfully allocated or not,
         * another CPU has successfully allocated. */
        xfree(note);
    }
    else
    {
        /* If we care about memory position, alloc from the crash heap,
         * also protected by the crash_notes_lock. */
        if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
            note = alloc_from_crash_heap(nr_bytes);

        crash_notes[cpu].start = note;
        crash_notes[cpu].size = nr_bytes;
        spin_unlock(&crash_notes_lock);

        /* If the allocation failed, and another CPU did not beat us, give
         * up with ENOMEM. */
        if ( ! note )
            ret = -ENOMEM;
        /* else all is good so let's set up the notes. */
        else
        {
            /* Set up CORE note. */
            setup_note(note, "CORE", NT_PRSTATUS, sizeof(ELF_Prstatus));
            note = ELFNOTE_NEXT(note);

            /* Set up Xen CORE note. */
            setup_note(note, "Xen", XEN_ELFNOTE_CRASH_REGS,
                       sizeof(crash_xen_core_t));

            if ( ! cpu )
            {
                /* Set up Xen Crash Info note. */
                xen_crash_note = note = ELFNOTE_NEXT(note);
                setup_note(note, "Xen", XEN_ELFNOTE_CRASH_INFO,
                           sizeof(crash_xen_info_t));
            }
        }
    }

    return ret;
}

static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned long cpu = (unsigned long)hcpu;

    /* Only hook on CPU_UP_PREPARE because once a crash_note has been reported
     * to dom0, it must keep it around in case of a crash, as the crash kernel
     * will be hard coded to the original physical address reported. */
    switch ( action )
    {
    case CPU_UP_PREPARE:
        /* Ignore return value. If this is boot time, -ENOMEM will cause all
         * manner of problems elsewhere very soon, and if it is during runtime,
         * then failing to allocate crash notes is not a good enough reason to
         * fail the CPU_UP_PREPARE */
        kexec_init_cpu_notes(cpu);
        break;
    default:
        break;
    }
    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

void __init kexec_early_calculations(void)
{
    /* If low_crashinfo_mode is still INVALID, neither "low_crashinfo" nor
     * "crashinfo_maxaddr" have been specified on the command line, so
     * explicitly set to NONE. */
    if ( low_crashinfo_mode == LOW_CRASHINFO_INVALID )
        low_crashinfo_mode = LOW_CRASHINFO_NONE;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
        crashinfo_maxaddr_bits = fls64(crashinfo_maxaddr) - 1;
}

static int __init kexec_init(void)
{
    void *cpu = (void *)(unsigned long)smp_processor_id();

    /* If no crash area, no need to allocate space for notes. */
    if ( !kexec_crash_area.size )
        return 0;

    if ( low_crashinfo_mode > LOW_CRASHINFO_NONE )
    {
        size_t crash_heap_size;

        /* This calculation is safe even if the machine is booted in
         * uniprocessor mode. */
        crash_heap_size = sizeof_cpu_notes(0) +
                          sizeof_cpu_notes(1) * (nr_cpu_ids - 1);
        crash_heap_size = PAGE_ALIGN(crash_heap_size);

        crash_heap_current = alloc_xenheap_pages(
            get_order_from_bytes(crash_heap_size),
            MEMF_bits(crashinfo_maxaddr_bits) );

        if ( ! crash_heap_current )
            return -ENOMEM;

        memset(crash_heap_current, 0, crash_heap_size);

        crash_heap_end = crash_heap_current + crash_heap_size;
    }

    /* crash_notes may be allocated anywhere Xen can reach in memory.
       Only the individual CPU crash notes themselves must be allocated
       in lower memory if requested. */
    crash_notes = xzalloc_array(crash_note_range_t, nr_cpu_ids);
    if ( ! crash_notes )
        return -ENOMEM;

    register_keyhandler('C', do_crashdump_trigger, "trigger a crashdump", 0);

    cpu_callback(&cpu_nfb, CPU_UP_PREPARE, cpu);
    register_cpu_notifier(&cpu_nfb);
    return 0;
}
/* The reason for this to be a presmp_initcall as opposed to a regular
 * __initcall is to allow the setup of the cpu hotplug handler before APs are
 * brought up. */
presmp_initcall(kexec_init);

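/* KEXEC_RANGE_MA_CRASH: report the machine address and size of the crash
 * kernel reservation. */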
static int kexec_get_reserve(xen_kexec_range_t *range)
{
    if ( kexec_crash_area.size > 0 && kexec_crash_area.start > 0 )
    {
        range->start = kexec_crash_area.start;
        range->size = kexec_crash_area.size;
    }
    else
        range->start = range->size = 0;
    return 0;
}

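/* KEXEC_RANGE_MA_CPU: report the machine address and size of a cpu's crash
 * note buffer, retrying the allocation if it has not yet succeeded. */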
static int kexec_get_cpu(xen_kexec_range_t *range)
{
    int nr = range->nr;

    if ( nr < 0 || nr >= nr_cpu_ids )
        return -ERANGE;

    if ( ! crash_notes )
        return -EINVAL;

    /* Try once again to allocate room for the crash notes. It is just possible
     * that more space has become available since we last tried. If space has
     * already been allocated, kexec_init_cpu_notes() will return early with 0.
     */
    kexec_init_cpu_notes(nr);

    /* In the case of still not having enough memory to allocate buffer room,
     * returning a range of 0,0 is still valid. */
    if ( crash_notes[nr].start )
    {
        range->start = __pa(crash_notes[nr].start);
        range->size = crash_notes[nr].size;
    }
    else
        range->start = range->size = 0;

    return 0;
}

static int kexec_get_vmcoreinfo(xen_kexec_range_t *range)
{
    range->start = __pa((unsigned long)vmcoreinfo_data);
    range->size = VMCOREINFO_BYTES;
    return 0;
}

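/* Dispatch a KEXEC_CMD_kexec_get_range request to the appropriate handler,
 * falling back to the architecture specific ranges. */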
static int kexec_get_range_internal(xen_kexec_range_t *range)
{
    int ret = -EINVAL;

    switch ( range->range )
    {
    case KEXEC_RANGE_MA_CRASH:
        ret = kexec_get_reserve(range);
        break;
    case KEXEC_RANGE_MA_CPU:
        ret = kexec_get_cpu(range);
        break;
    case KEXEC_RANGE_MA_VMCOREINFO:
        ret = kexec_get_vmcoreinfo(range);
        break;
    default:
        ret = machine_kexec_get(range);
        break;
    }

    return ret;
}

static int kexec_get_range(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_range_t range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&range, uarg, 1)) )
        return -EFAULT;

    ret = kexec_get_range_internal(&range);

    if ( ret == 0 && unlikely(__copy_to_guest(uarg, &range, 1)) )
        ret = -EFAULT;

    return ret;
}

static int kexec_get_range_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    xen_kexec_range_t range;
    compat_kexec_range_t compat_range;
    int ret = -EINVAL;

    if ( unlikely(copy_from_guest(&compat_range, uarg, 1)) )
        return -EFAULT;

    XLAT_kexec_range(&range, &compat_range);

    ret = kexec_get_range_internal(&range);

    /* Don't silently truncate physical addresses or sizes. */
    if ( (range.start | range.size) & ~(unsigned long)(~0u) )
        return -ERANGE;

    if ( ret == 0 )
    {
        XLAT_kexec_range(&compat_range, &range);
        if ( unlikely(__copy_to_guest(uarg, &compat_range, 1)) )
            ret = -EFAULT;
    }

    return ret;
#else /* CONFIG_COMPAT */
    return 0;
#endif /* CONFIG_COMPAT */
}

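/* Translate a KEXEC_TYPE_* value into the image slot base and the
 * kexec_flags bit which selects the currently active slot of that type. */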
static int kexec_load_get_bits(int type, int *base, int *bit)
{
    switch ( type )
    {
    case KEXEC_TYPE_DEFAULT:
        *base = KEXEC_IMAGE_DEFAULT_BASE;
        *bit = KEXEC_FLAG_DEFAULT_POS;
        break;
    case KEXEC_TYPE_CRASH:
        *base = KEXEC_IMAGE_CRASH_BASE;
        *bit = KEXEC_FLAG_CRASH_POS;
        break;
    default:
        return -1;
    }
    return 0;
}

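/* Append a formatted string to the VMCOREINFO note data, silently dropping
 * it if the buffer would overflow. */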
void vmcoreinfo_append_str(const char *fmt, ...)
{
    va_list args;
    char buf[0x50];
    int r;
    size_t note_size = sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1);

    if (vmcoreinfo_size + note_size + sizeof(buf) > VMCOREINFO_BYTES)
        return;

    va_start(args, fmt);
    r = vsnprintf(buf, sizeof(buf), fmt, args);
    va_end(args);

    memcpy(&vmcoreinfo_data[note_size + vmcoreinfo_size], buf, r);

    vmcoreinfo_size += r;
}

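/* Populate the VMCOREINFO note with the symbols, structure sizes and field
 * offsets a crash kernel needs to interpret Xen's memory image. Only done
 * once, on the first image load. */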
static void crash_save_vmcoreinfo(void)
{
    size_t data_size;

    if (vmcoreinfo_size > 0)    /* already saved */
        return;

    data_size = VMCOREINFO_BYTES - (sizeof(Elf_Note) + ELFNOTE_ALIGN(strlen(VMCOREINFO_NOTE_NAME) + 1));
    setup_note((Elf_Note *)vmcoreinfo_data, VMCOREINFO_NOTE_NAME, 0, data_size);

    VMCOREINFO_PAGESIZE(PAGE_SIZE);

    VMCOREINFO_SYMBOL(domain_list);
#ifndef frame_table
    VMCOREINFO_SYMBOL(frame_table);
#else
    {
        static const void *const _frame_table = frame_table;
        VMCOREINFO_SYMBOL_ALIAS(frame_table, _frame_table);
    }
#endif
    VMCOREINFO_SYMBOL(max_page);

    VMCOREINFO_STRUCT_SIZE(page_info);
    VMCOREINFO_STRUCT_SIZE(domain);

    VMCOREINFO_OFFSET(page_info, count_info);
    VMCOREINFO_OFFSET_SUB(page_info, v.inuse, _domain);
    VMCOREINFO_OFFSET(domain, domain_id);
    VMCOREINFO_OFFSET(domain, next_in_list);

#ifdef ARCH_CRASH_SAVE_VMCOREINFO
    arch_crash_save_vmcoreinfo();
#endif
}

static void kexec_unload_image(struct kexec_image *image)
{
    if ( !image )
        return;

    machine_kexec_unload(image);
    kimage_free(image);
}

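/* KEXEC_CMD_kexec: jump into a previously loaded image, either immediately
 * (crash type) or via a continuation on cpu 0 (default type). */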
static int kexec_exec(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_exec_t exec;
    struct kexec_image *image;
    int base, bit, pos, ret = -EINVAL;

    if ( unlikely(copy_from_guest(&exec, uarg, 1)) )
        return -EFAULT;

    if ( kexec_load_get_bits(exec.type, &base, &bit) )
        return -EINVAL;

    pos = (test_bit(bit, &kexec_flags) != 0);

    /* Only allow kexec/kdump into loaded images */
    if ( !test_bit(base + pos, &kexec_flags) )
        return -ENOENT;

    switch (exec.type)
    {
    case KEXEC_TYPE_DEFAULT:
        image = kexec_image[base + pos];
        ret = continue_hypercall_on_cpu(0, kexec_reboot, image);
        break;
    case KEXEC_TYPE_CRASH:
        kexec_crash(); /* Does not return */
        break;
    }

    return -EINVAL; /* never reached */
}

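/* Install 'new' (which may be NULL, i.e. unload) as the active image of the
 * given type, returning the image it replaces in 'old'. */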
static int kexec_swap_images(int type, struct kexec_image *new,
                             struct kexec_image **old)
{
    int base, bit, pos;
    int new_slot, old_slot;

    *old = NULL;

    if ( test_bit(KEXEC_FLAG_IN_PROGRESS, &kexec_flags) )
        return -EBUSY;

    if ( kexec_load_get_bits(type, &base, &bit) )
        return -EINVAL;

    ASSERT(test_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags));

    pos = (test_bit(bit, &kexec_flags) != 0);
    old_slot = base + pos;
    new_slot = base + !pos;

    kexec_image[new_slot] = new;
    if ( new )
        set_bit(new_slot, &kexec_flags);
    change_bit(bit, &kexec_flags);

    clear_bit(old_slot, &kexec_flags);
    *old = kexec_image[old_slot];

    return 0;
}

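/* Load an image into the machine (arch specific), record the vmcoreinfo
 * note and make the image the active one of its type, unloading the image
 * it replaces. */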
static int kexec_load_slot(struct kexec_image *kimage)
{
    struct kexec_image *old_kimage;
    int ret = -ENOMEM;

    ret = machine_kexec_load(kimage);
    if ( ret < 0 )
        return ret;

    crash_save_vmcoreinfo();

    ret = kexec_swap_images(kimage->type, kimage, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static uint16_t kexec_load_v1_arch(void)
{
#ifdef CONFIG_X86
    return is_pv_32bit_domain(hardware_domain) ? EM_386 : EM_X86_64;
#else
    return EM_NONE;
#endif
}

static int kexec_segments_add_segment(unsigned int *nr_segments,
                                      xen_kexec_segment_t *segments,
                                      mfn_t mfn)
{
    paddr_t maddr = mfn_to_maddr(mfn);
    unsigned int n = *nr_segments;

    /* Need a new segment? */
    if ( n == 0
         || segments[n-1].dest_maddr + segments[n-1].dest_size != maddr )
    {
        n++;
        if ( n > KEXEC_SEGMENT_MAX )
            return -EINVAL;
        *nr_segments = n;

        set_xen_guest_handle(segments[n-1].buf.h, NULL);
        segments[n-1].buf_size = 0;
        segments[n-1].dest_maddr = maddr;
        segments[n-1].dest_size = 0;
    }

    return 0;
}

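/* Build the (destination only) segment list for a v1 load by walking the
 * guest supplied indirection pages. */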
static int kexec_segments_from_ind_page(mfn_t mfn,
                                        unsigned int *nr_segments,
                                        xen_kexec_segment_t *segments,
                                        bool_t compat)
{
    void *page;
    kimage_entry_t *entry;
    int ret = 0;

    page = map_domain_page(mfn);

    /*
     * Walk the indirection page list, adding destination pages to the
     * segments.
     */
    for ( entry = page; ; )
    {
        unsigned long ind;

        ind = kimage_entry_ind(entry, compat);
        mfn = kimage_entry_mfn(entry, compat);

        switch ( ind )
        {
        case IND_DESTINATION:
            ret = kexec_segments_add_segment(nr_segments, segments, mfn);
            if ( ret < 0 )
                goto done;
            break;
        case IND_INDIRECTION:
            unmap_domain_page(page);
            entry = page = map_domain_page(mfn);
            continue;
        case IND_DONE:
            goto done;
        case IND_SOURCE:
            if ( *nr_segments == 0 )
            {
                ret = -EINVAL;
                goto done;
            }
            segments[*nr_segments-1].dest_size += PAGE_SIZE;
            break;
        default:
            ret = -EINVAL;
            goto done;
        }
        entry = kimage_entry_next(entry, compat);
    }
 done:
    unmap_domain_page(page);
    return ret;
}

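/* Handle the (native or compat) v1 load interface: reconstruct segment
 * information from the guest's indirection pages and convert it into a
 * kexec_image. */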
static int kexec_do_load_v1(xen_kexec_load_v1_t *load, int compat)
{
    struct kexec_image *kimage = NULL;
    xen_kexec_segment_t *segments;
    uint16_t arch;
    unsigned int nr_segments = 0;
    mfn_t ind_mfn = maddr_to_mfn(load->image.indirection_page);
    int ret;

    arch = kexec_load_v1_arch();
    if ( arch == EM_NONE )
        return -ENOSYS;

    segments = xmalloc_array(xen_kexec_segment_t, KEXEC_SEGMENT_MAX);
    if ( segments == NULL )
        return -ENOMEM;

    /*
     * Work out the image segments (destination only) from the
     * indirection pages.
     *
     * This is needed so we don't allocate pages that will overlap
     * with the destination when building the new set of indirection
     * pages below.
     */
    ret = kexec_segments_from_ind_page(ind_mfn, &nr_segments, segments, compat);
    if ( ret < 0 )
        goto error;

    ret = kimage_alloc(&kimage, load->type, arch, load->image.start_address,
                       nr_segments, segments);
    if ( ret < 0 )
        goto error;

    /*
     * Build a new set of indirection pages in the native format.
     *
     * This walks the guest provided indirection pages a second time.
     * The guest could have altered them, invalidating the segment
     * information constructed above. This will only result in the
     * image being potentially unrelocatable.
     */
    ret = kimage_build_ind(kimage, ind_mfn, compat);
    if ( ret < 0 )
        goto error;

    if ( arch == EM_386 || arch == EM_X86_64 )
    {
        /*
         * Ensure 0 - 1 MiB is mapped and accessible by the image.
         *
         * This allows access to VGA memory and the region purgatory copies
         * in the crash case.
         */
        unsigned long addr;

        for ( addr = 0; addr < MB(1); addr += PAGE_SIZE )
        {
            ret = machine_kexec_add_page(kimage, addr, addr);
            if ( ret < 0 )
                goto error;
        }
    }

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

 error:
    if ( !kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

static int kexec_load_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&load, uarg, 1)) )
        return -EFAULT;

    return kexec_do_load_v1(&load, 0);
}

static int kexec_load_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_load_v1_t load;

    if ( unlikely(copy_from_guest(&compat_load, uarg, 1)) )
        return -EFAULT;

    /* This is a bit dodgy: load.image is inside load,
     * but XLAT_kexec_load (which is automatically generated)
     * doesn't translate load.image (correctly).
     * Just copy load->type, the only other member, manually instead.
     *
     * XLAT_kexec_load(&load, &compat_load);
     */
    load.type = compat_load.type;
    XLAT_kexec_image(&load.image, &compat_load.image);

    return kexec_do_load_v1(&load, 1);
#else
    return 0;
#endif
}

static int kexec_load(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_t load;
    xen_kexec_segment_t *segments;
    struct kexec_image *kimage = NULL;
    int ret;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    if ( load.nr_segments >= KEXEC_SEGMENT_MAX )
        return -EINVAL;

    segments = xmalloc_array(xen_kexec_segment_t, load.nr_segments);
    if ( segments == NULL )
        return -ENOMEM;

    if ( copy_from_guest(segments, load.segments.h, load.nr_segments) )
    {
        ret = -EFAULT;
        goto error;
    }

    ret = kimage_alloc(&kimage, load.type, load.arch, load.entry_maddr,
                       load.nr_segments, segments);
    if ( ret < 0 )
        goto error;

    ret = kimage_load_segments(kimage);
    if ( ret < 0 )
        goto error;

    ret = kexec_load_slot(kimage);
    if ( ret < 0 )
        goto error;

    return 0;

 error:
    if ( ! kimage )
        xfree(segments);
    kimage_free(kimage);
    return ret;
}

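/* Unload and free the currently active image of the requested type, if
 * any. */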
static int kexec_do_unload(xen_kexec_unload_t *unload)
{
    struct kexec_image *old_kimage;
    int ret;

    ret = kexec_swap_images(unload->type, NULL, &old_kimage);
    if ( ret < 0 )
        return ret;

    kexec_unload_image(old_kimage);

    return 0;
}

static int kexec_unload_v1(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_load_v1_t load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&load, uarg, 1) )
        return -EFAULT;

    unload.type = load.type;
    return kexec_do_unload(&unload);
}

static int kexec_unload_v1_compat(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
#ifdef CONFIG_COMPAT
    compat_kexec_load_v1_t compat_load;
    xen_kexec_unload_t unload;

    if ( copy_from_guest(&compat_load, uarg, 1) )
        return -EFAULT;

    unload.type = compat_load.type;
    return kexec_do_unload(&unload);
#else
    return 0;
#endif
}

static int kexec_unload(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_unload_t unload;

    if ( unlikely(copy_from_guest(&unload, uarg, 1)) )
        return -EFAULT;

    return kexec_do_unload(&unload);
}

static int kexec_status(XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    xen_kexec_status_t status;
    int base, bit;

    if ( unlikely(copy_from_guest(&status, uarg, 1)) )
        return -EFAULT;

    /* No need to check KEXEC_FLAG_IN_PROGRESS. */

    if ( kexec_load_get_bits(status.type, &base, &bit) )
        return -EINVAL;

    return !!test_bit(bit, &kexec_flags);
}

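/* Common entry point for the kexec_op hypercall (native and compat). Only
 * one kexec hypercall may be in progress at a time; later callers are asked
 * to retry via a hypercall continuation. */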
static int do_kexec_op_internal(unsigned long op,
                                XEN_GUEST_HANDLE_PARAM(void) uarg,
                                bool_t compat)
{
    int ret = -EINVAL;

    ret = xsm_kexec(XSM_PRIV);
    if ( ret )
        return ret;

    if ( test_and_set_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags) )
        return hypercall_create_continuation(__HYPERVISOR_kexec_op, "lh", op, uarg);

    switch ( op )
    {
    case KEXEC_CMD_kexec_get_range:
        if ( compat )
            ret = kexec_get_range_compat(uarg);
        else
            ret = kexec_get_range(uarg);
        break;
    case KEXEC_CMD_kexec_load_v1:
        if ( compat )
            ret = kexec_load_v1_compat(uarg);
        else
            ret = kexec_load_v1(uarg);
        break;
    case KEXEC_CMD_kexec_unload_v1:
        if ( compat )
            ret = kexec_unload_v1_compat(uarg);
        else
            ret = kexec_unload_v1(uarg);
        break;
    case KEXEC_CMD_kexec:
        ret = kexec_exec(uarg);
        break;
    case KEXEC_CMD_kexec_load:
        ret = kexec_load(uarg);
        break;
    case KEXEC_CMD_kexec_unload:
        ret = kexec_unload(uarg);
        break;
    case KEXEC_CMD_kexec_status:
        ret = kexec_status(uarg);
        break;
    }

    clear_bit(KEXEC_FLAG_IN_HYPERCALL, &kexec_flags);

    return ret;
}

long do_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, 0);
}

#ifdef CONFIG_COMPAT
int compat_kexec_op(unsigned long op, XEN_GUEST_HANDLE_PARAM(void) uarg)
{
    return do_kexec_op_internal(op, uarg, 1);
}
#endif

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */