#include <xen/init.h>
#include <xen/lib.h>
#include <xen/err.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/domain.h>
#include <xen/serial.h>
#include <xen/softirq.h>
#include <xen/acpi.h>
#include <xen/efi.h>
#include <xen/console.h>
#include <xen/trace.h>
#include <xen/multiboot.h>
#include <xen/domain_page.h>
#include <xen/version.h>
#include <xen/gdbstub.h>
#include <xen/percpu.h>
#include <xen/hypercall.h>
#include <xen/keyhandler.h>
#include <xen/numa.h>
#include <xen/rcupdate.h>
#include <xen/vga.h>
#include <xen/dmi.h>
#include <xen/pfn.h>
#include <xen/nodemask.h>
#include <xen/tmem_xen.h>
#include <xen/virtual_region.h>
#include <xen/watchdog.h>
#include <public/version.h>
#include <compat/platform.h>
#include <compat/xen.h>
#include <xen/bitops.h>
#include <asm/smp.h>
#include <asm/processor.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/msi.h>
#include <asm/desc.h>
#include <asm/paging.h>
#include <asm/e820.h>
#include <xen/kexec.h>
#include <asm/edd.h>
#include <xsm/xsm.h>
#include <asm/tboot.h>
#include <asm/bzimage.h> /* for bzimage_headroom */
#include <asm/mach-generic/mach_apic.h> /* for generic_apic_probe */
#include <asm/setup.h>
#include <xen/cpu.h>
#include <asm/nmi.h>
#include <asm/alternative.h>
#include <asm/mc146818rtc.h>
#include <asm/cpuid.h>
#include <asm/guest.h>

/* opt_nosmp: If true, secondary processors are ignored. */
static bool __initdata opt_nosmp;
boolean_param("nosmp", opt_nosmp);

/* maxcpus: maximum number of CPUs to activate. */
static unsigned int __initdata max_cpus;
integer_param("maxcpus", max_cpus);

unsigned long __read_mostly cr4_pv32_mask;

/* **** Linux config option: propagated to domain0. */
67 /* "acpi=off": Sisables both ACPI table parsing and interpreter. */
68 /* "acpi=force": Override the disable blacklist. */
69 /* "acpi=ht": Limit ACPI just to boot-time to enable HT. */
70 /* "acpi=noirq": Disables ACPI interrupt routing. */
71 static int parse_acpi_param(const char *s);
72 custom_param("acpi", parse_acpi_param);
73
74 /* **** Linux config option: propagated to domain0. */
75 /* noapic: Disable IOAPIC setup. */
76 boolean_param("noapic", skip_ioapic_setup);
77
78 /* **** Linux config option: propagated to domain0. */
79 /* xen_cpuidle: xen control cstate. */
80 s8 __read_mostly xen_cpuidle = -1;
81 boolean_param("cpuidle", xen_cpuidle);
82
83 #ifndef NDEBUG
84 unsigned long __initdata highmem_start;
85 size_param("highmem-start", highmem_start);
86 #endif
87
88 cpumask_t __read_mostly cpu_present_map;
89
90 unsigned long __read_mostly xen_phys_start;
91
92 unsigned long __read_mostly xen_virt_end;
93
94 DEFINE_PER_CPU(struct tss_struct, init_tss);
95
96 char __section(".bss.stack_aligned") __aligned(STACK_SIZE)
97 cpu0_stack[STACK_SIZE];
98
99 struct cpuinfo_x86 __read_mostly boot_cpu_data = { 0, 0, 0, 0, -1 };
100
101 unsigned long __read_mostly mmu_cr4_features = XEN_MINIMAL_CR4;
102
103 /* smep: Enable/disable Supervisor Mode Execution Protection (default on). */
104 #define SMEP_HVM_ONLY (-1)
105 static s8 __initdata opt_smep = 1;
106
107 /*
108 * Initial domain place holder. Needs to be global so it can be created in
109 * __start_xen and unpaused in init_done.
110 */
111 static struct domain *__initdata dom0;
112
parse_smep_param(const char * s)113 static int __init parse_smep_param(const char *s)
114 {
115 if ( !*s )
116 {
117 opt_smep = 1;
118 return 0;
119 }
120
121 switch ( parse_bool(s, NULL) )
122 {
123 case 0:
124 opt_smep = 0;
125 return 0;
126 case 1:
127 opt_smep = 1;
128 return 0;
129 }
130
131 if ( !strcmp(s, "hvm") )
132 opt_smep = SMEP_HVM_ONLY;
133 else
134 return -EINVAL;
135
136 return 0;
137 }
138 custom_param("smep", parse_smep_param);
139
140 /* smap: Enable/disable Supervisor Mode Access Prevention (default on). */
141 #define SMAP_HVM_ONLY (-1)
142 static s8 __initdata opt_smap = 1;
143
parse_smap_param(const char * s)144 static int __init parse_smap_param(const char *s)
145 {
146 if ( !*s )
147 {
148 opt_smap = 1;
149 return 0;
150 }
151
152 switch ( parse_bool(s, NULL) )
153 {
154 case 0:
155 opt_smap = 0;
156 return 0;
157 case 1:
158 opt_smap = 1;
159 return 0;
160 }
161
162 if ( !strcmp(s, "hvm") )
163 opt_smap = SMAP_HVM_ONLY;
164 else
165 return -EINVAL;
166
167 return 0;
168 }
169 custom_param("smap", parse_smap_param);
170
171 bool __read_mostly acpi_disabled;
172 bool __initdata acpi_force;
173 static char __initdata acpi_param[10] = "";
174
parse_acpi_param(const char * s)175 static int __init parse_acpi_param(const char *s)
176 {
177 /* Save the parameter so it can be propagated to domain0. */
178 safe_strcpy(acpi_param, s);
179
180 /* Interpret the parameter for use within Xen. */
181 if ( !parse_bool(s, NULL) )
182 {
183 disable_acpi();
184 }
185 else if ( !strcmp(s, "force") )
186 {
187 acpi_force = true;
188 acpi_ht = 1;
189 acpi_disabled = false;
190 }
191 else if ( !strcmp(s, "ht") )
192 {
193 if ( !acpi_force )
194 disable_acpi();
195 acpi_ht = 1;
196 }
197 else if ( !strcmp(s, "noirq") )
198 {
199 acpi_noirq_set();
200 }
201 else
202 return -EINVAL;
203
204 return 0;
205 }
206
207 static const module_t *__initdata initial_images;
208 static unsigned int __initdata nr_initial_images;
209
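/* Count how many pages of the boot modules fall within the given NUMA node. */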
unsigned long __init initial_images_nrpages(nodeid_t node)
{
    unsigned long node_start = node_start_pfn(node);
    unsigned long node_end = node_end_pfn(node);
    unsigned long nr;
    unsigned int i;

    for ( nr = i = 0; i < nr_initial_images; ++i )
    {
        unsigned long start = initial_images[i].mod_start;
        unsigned long end = start + PFN_UP(initial_images[i].mod_end);

        if ( end > node_start && node_end > start )
            nr += min(node_end, end) - max(node_start, start);
    }

    return nr;
}

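/* Free the boot modules' pages back to the domain heap once no longer needed. */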
void __init discard_initial_images(void)
{
    unsigned int i;

    for ( i = 0; i < nr_initial_images; ++i )
    {
        uint64_t start = (uint64_t)initial_images[i].mod_start << PAGE_SHIFT;

        init_domheap_pages(start,
                           start + PAGE_ALIGN(initial_images[i].mod_end));
    }

    nr_initial_images = 0;
    initial_images = NULL;
}

extern char __init_begin[], __init_end[], __bss_start[], __bss_end[];

static void __init init_idle_domain(void)
{
    scheduler_init();
    set_current(idle_vcpu[0]);
    this_cpu(curr_vcpu) = current;
}

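/*
 * Look up a CPU's NUMA node from the SRAT-derived apicid_to_node[] table,
 * falling back to node 0 when no usable SRAT information exists.
 */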
void srat_detect_node(int cpu)
{
    nodeid_t node;
    u32 apicid = x86_cpu_to_apicid[cpu];

    node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
    if ( node == NUMA_NO_NODE )
        node = 0;

    node_set_online(node);
    numa_set_node(cpu, node);

    if ( opt_cpu_info && acpi_numa > 0 )
        printk("CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
}

/*
 * Sort CPUs by <node,package,core,thread> tuple. Fortunately this hierarchy is
 * reflected in the structure of modern APIC identifiers, so we sort based on
 * those. This is slightly complicated by the fact that the BSP must remain
 * CPU 0. Hence we do a variation on longest-prefix matching to do the best we
 * can while keeping CPU 0 static.
 */
static void __init normalise_cpu_order(void)
{
    unsigned int i, j, min_cpu;
    uint32_t apicid, diff, min_diff;

    for_each_present_cpu ( i )
    {
        apicid = x86_cpu_to_apicid[i];
        min_diff = min_cpu = ~0u;

        /*
         * Find remaining CPU with longest-prefix match on APIC ID.
         * Among identical longest-prefix matches, pick the smallest APIC ID.
         */
        for ( j = cpumask_next(i, &cpu_present_map);
              j < nr_cpu_ids;
              j = cpumask_next(j, &cpu_present_map) )
        {
            diff = x86_cpu_to_apicid[j] ^ apicid;
            while ( diff & (diff-1) )
                diff &= diff-1;
            if ( (diff < min_diff) ||
                 ((diff == min_diff) &&
                  (x86_cpu_to_apicid[j] < x86_cpu_to_apicid[min_cpu])) )
            {
                min_diff = diff;
                min_cpu = j;
            }
        }

        /* If no match then there must be no CPUs remaining to consider. */
        if ( min_cpu >= nr_cpu_ids )
        {
            BUG_ON(cpumask_next(i, &cpu_present_map) < nr_cpu_ids);
            break;
        }

        /* Switch the best-matching CPU with the next CPU in logical order. */
        j = cpumask_next(i, &cpu_present_map);
        apicid = x86_cpu_to_apicid[min_cpu];
        x86_cpu_to_apicid[min_cpu] = x86_cpu_to_apicid[j];
        x86_cpu_to_apicid[j] = apicid;
    }
}

#define BOOTSTRAP_MAP_BASE  (16UL << 20)
#define BOOTSTRAP_MAP_LIMIT (1UL << L3_PAGETABLE_SHIFT)

/*
 * Ensure a given physical memory range is present in the bootstrap mappings.
 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
 */
static void *__init bootstrap_map(const module_t *mod)
{
    static unsigned long __initdata map_cur = BOOTSTRAP_MAP_BASE;
    uint64_t start, end, mask = (1L << L2_PAGETABLE_SHIFT) - 1;
    void *ret;

    if ( system_state != SYS_STATE_early_boot )
        return mod ? mfn_to_virt(mod->mod_start) : NULL;

    if ( !mod )
    {
        destroy_xen_mappings(BOOTSTRAP_MAP_BASE, BOOTSTRAP_MAP_LIMIT);
        map_cur = BOOTSTRAP_MAP_BASE;
        return NULL;
    }

    start = (uint64_t)mod->mod_start << PAGE_SHIFT;
    end = start + mod->mod_end;
    if ( start >= end )
        return NULL;

    ret = (void *)(map_cur + (unsigned long)(start & mask));
    start &= ~mask;
    end = (end + mask) & ~mask;
    if ( end - start > BOOTSTRAP_MAP_LIMIT - map_cur )
        return NULL;

    map_pages_to_xen(map_cur, start >> PAGE_SHIFT,
                     (end - start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
    map_cur += end - start;
    return ret;
}

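/*
 * Copy [src, src+size) to dst via temporary bootstrap mappings, chunking so
 * that source and destination windows both fit within the bootstrap area.
 * With 'keep' set, the destination mapping is retained, and its virtual
 * address is returned if the copy completed in a single chunk.
 */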
static void *__init move_memory(
    uint64_t dst, uint64_t src, unsigned int size, bool keep)
{
    unsigned int blksz = BOOTSTRAP_MAP_LIMIT - BOOTSTRAP_MAP_BASE;
    unsigned int mask = (1L << L2_PAGETABLE_SHIFT) - 1;

    if ( src + size > BOOTSTRAP_MAP_BASE )
        blksz >>= 1;

    while ( size )
    {
        module_t mod;
        unsigned int soffs = src & mask;
        unsigned int doffs = dst & mask;
        unsigned int sz;
        void *d, *s;

        mod.mod_start = (src - soffs) >> PAGE_SHIFT;
        mod.mod_end = soffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        sz = mod.mod_end - soffs;
        s = bootstrap_map(&mod);

        mod.mod_start = (dst - doffs) >> PAGE_SHIFT;
        mod.mod_end = doffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        if ( sz > mod.mod_end - doffs )
            sz = mod.mod_end - doffs;
        d = bootstrap_map(&mod);

        memmove(d + doffs, s + soffs, sz);

        dst += sz;
        src += sz;
        size -= sz;

        if ( keep )
            return size ? NULL : d + doffs;

        bootstrap_map(NULL);
    }

    return NULL;
}

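/*
 * Find a block of at least 'size' bytes within [s,e) that does not overlap
 * any of the passed modules (module 'this_mod' excepted), recursing to try
 * the ranges on either side of each overlapping module. Returns the end
 * address of a suitable block, or 0 if none exists.
 */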
static uint64_t __init consider_modules(
    uint64_t s, uint64_t e, uint32_t size, const module_t *mod,
    unsigned int nr_mods, unsigned int this_mod)
{
    unsigned int i;

    if ( s > e || e - s < size )
        return 0;

    for ( i = 0; i < nr_mods; ++i )
    {
        uint64_t start = (uint64_t)mod[i].mod_start << PAGE_SHIFT;
        uint64_t end = start + PAGE_ALIGN(mod[i].mod_end);

        if ( i == this_mod )
            continue;

        if ( s < end && start < e )
        {
            end = consider_modules(end, e, size, mod + i + 1,
                                   nr_mods - i - 1, this_mod - i - 1);
            if ( end )
                return end;

            return consider_modules(s, start, size, mod + i + 1,
                                    nr_mods - i - 1, this_mod - i - 1);
        }
    }

    return e;
}

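/*
 * Clamp max_pdx so that the direct map, the frame table, the M2P table,
 * and (where applicable) the page list link format can each cover every
 * page, then derive max_page accordingly.
 */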
static void __init setup_max_pdx(unsigned long top_page)
{
    max_pdx = pfn_to_pdx(top_page - 1) + 1;

    if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) )
        max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT;

    if ( max_pdx > FRAMETABLE_NR )
        max_pdx = FRAMETABLE_NR;

    if ( max_pdx > MPT_VIRT_SIZE / sizeof(unsigned long) )
        max_pdx = MPT_VIRT_SIZE / sizeof(unsigned long);

#ifdef PAGE_LIST_NULL
    if ( max_pdx >= PAGE_LIST_NULL )
        max_pdx = PAGE_LIST_NULL - 1;
#endif

    max_page = pdx_to_pfn(max_pdx - 1) + 1;
}

/* A temporary copy of the e820 map that we can mess with during bootstrap. */
static struct e820map __initdata boot_e820;

#ifdef CONFIG_VIDEO
struct boot_video_info {
    u8  orig_x;             /* 0x00 */
    u8  orig_y;             /* 0x01 */
    u8  orig_video_mode;    /* 0x02 */
    u8  orig_video_cols;    /* 0x03 */
    u8  orig_video_lines;   /* 0x04 */
    u8  orig_video_isVGA;   /* 0x05 */
    u16 orig_video_points;  /* 0x06 */

    /* VESA graphic mode -- linear frame buffer */
    u32 capabilities;       /* 0x08 */
    u16 lfb_linelength;     /* 0x0c */
    u16 lfb_width;          /* 0x0e */
    u16 lfb_height;         /* 0x10 */
    u16 lfb_depth;          /* 0x12 */
    u32 lfb_base;           /* 0x14 */
    u32 lfb_size;           /* 0x18 */
    u8  red_size;           /* 0x1c */
    u8  red_pos;            /* 0x1d */
    u8  green_size;         /* 0x1e */
    u8  green_pos;          /* 0x1f */
    u8  blue_size;          /* 0x20 */
    u8  blue_pos;           /* 0x21 */
    u8  rsvd_size;          /* 0x22 */
    u8  rsvd_pos;           /* 0x23 */
    u16 vesapm_seg;         /* 0x24 */
    u16 vesapm_off;         /* 0x26 */
    u16 vesa_attrib;        /* 0x28 */
};
extern struct boot_video_info boot_vid_info;
#endif

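/* Translate the boot trampoline's video data (boot_vid_info) into vga_console_info. */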
static void __init parse_video_info(void)
{
#ifdef CONFIG_VIDEO
    struct boot_video_info *bvi = &bootsym(boot_vid_info);

    /* vga_console_info is filled directly on EFI platform. */
    if ( efi_enabled(EFI_BOOT) )
        return;

    if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
    {
        vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
        vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
        vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
        vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
        vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
        vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
    }
    else if ( bvi->orig_video_isVGA == 0x23 )
    {
        vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
        vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
        vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
        vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
        vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
        vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
        vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
        vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
        vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
        vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
        vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
        vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
        vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
        vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
        vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
        vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
        vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
    }
#endif
}

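/*
 * Carve the crash kernel region out of the given e820 map. Only done once;
 * a failed reservation disables kexec by zeroing kexec_crash_area.
 */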
static void __init kexec_reserve_area(struct e820map *e820)
{
#ifdef CONFIG_KEXEC
    unsigned long kdump_start = kexec_crash_area.start;
    unsigned long kdump_size = kexec_crash_area.size;
    static bool __initdata is_reserved = false;

    kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;

    if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
        return;

    is_reserved = true;

    if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
    {
        printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at %#lx)"
               "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
        kexec_crash_area.start = kexec_crash_area.size = 0;
    }
    else
    {
        printk("Kdump: %luMB (%lukB) at %#lx\n",
               kdump_size >> 20, kdump_size >> 10, kdump_start);
    }
#endif
}

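/*
 * True iff every linker-provided 2M section boundary is superpage aligned
 * (zero L1 offset), i.e. the Xen image can be mapped using 2M mappings.
 */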
static inline bool using_2M_mapping(void)
{
    return !l1_table_offset((unsigned long)__2M_text_end) &&
           !l1_table_offset((unsigned long)__2M_rodata_start) &&
           !l1_table_offset((unsigned long)__2M_rodata_end) &&
           !l1_table_offset((unsigned long)__2M_init_start) &&
           !l1_table_offset((unsigned long)__2M_init_end) &&
           !l1_table_offset((unsigned long)__2M_rwdata_start) &&
           !l1_table_offset((unsigned long)__2M_rwdata_end);
}

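/*
 * Final boot-time actions, run from the relocated BSP stack: mark the
 * system active, unpause dom0, then free and unmap .init before entering
 * the idle loop.
 */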
static void noinline init_done(void)
{
    void *va;
    unsigned long start, end;

    system_state = SYS_STATE_active;

    domain_unpause_by_systemcontroller(dom0);

    /* MUST be done prior to removing .init data. */
    unregister_init_virtual_region();

    /* Zero the .init code and data. */
    for ( va = __init_begin; va < _p(__init_end); va += PAGE_SIZE )
        clear_page(va);

    /* Destroy Xen's mappings, and reuse the pages. */
    if ( using_2M_mapping() )
    {
        start = (unsigned long)&__2M_init_start,
        end   = (unsigned long)&__2M_init_end;
    }
    else
    {
        start = (unsigned long)&__init_begin;
        end   = (unsigned long)&__init_end;
    }

    destroy_xen_mappings(start, end);
    init_xenheap_pages(__pa(start), __pa(end));
    printk("Freed %lukB init memory\n", (end - start) >> 10);

    startup_cpu_idle_loop();
}

/* Reinitialise all state referring to the old virtual address of the stack. */
static void __init noreturn reinit_bsp_stack(void)
{
    unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1));

    /* Update TSS and ISTs */
    load_system_tables();

    /* Update SYSCALL trampolines */
    percpu_traps_init();

    stack_base[0] = stack;
    memguard_guard_stack(stack);

    reset_stack_and_jump(init_done);
}

static bool __init loader_is_grub2(const char *loader_name)
{
    /* GRUB1="GNU GRUB 0.xx"; GRUB2="GRUB 1.xx" */
    const char *p = strstr(loader_name, "GRUB ");
    return (p != NULL) && (p[5] != '0');
}

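/*
 * Strip leading whitespace from a command line and, unless the bootloader
 * (GRUB2, or the PVH boot path) already omits it, also strip the image name.
 */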
static char * __init cmdline_cook(char *p, const char *loader_name)
{
    p = p ? : "";

    /* Strip leading whitespace. */
    while ( *p == ' ' )
        p++;

    /* GRUB2 and PVH don't include image name as first item on command line. */
    if ( xen_guest || loader_is_grub2(loader_name) )
        return p;

    /* Strip image name plus whitespace. */
    while ( (*p != ' ') && (*p != '\0') )
        p++;
    while ( *p == ' ' )
        p++;

    return p;
}

void __init noreturn __start_xen(unsigned long mbi_p)
{
    char *memmap_type = NULL;
    char *cmdline, *kextra, *loader;
    unsigned int initrdidx, domcr_flags = DOMCRF_s3_integrity;
    multiboot_info_t *mbi;
    module_t *mod;
    unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
    int i, j, e820_warn = 0, bytes = 0;
    bool acpi_boot_table_init_done = false, relocated = false;
    struct ns16550_defaults ns16550 = {
        .data_bits = 8,
        .parity    = 'n',
        .stop_bits = 1
    };
    struct xen_arch_domainconfig config = { .emulation_flags = 0 };

    /* Critical region without IDT or TSS. Any fault is deadly! */

    set_processor_id(0);
    set_current(INVALID_VCPU); /* debug sanity. */
    idle_vcpu[0] = current;

    percpu_init_areas();

    init_idt_traps();
    load_system_tables();

    smp_prepare_boot_cpu();
    sort_exception_tables();

    setup_virtual_regions(__start___ex_table, __stop___ex_table);

    /* Full exception support from here on in. */

    if ( pvh_boot )
    {
        ASSERT(mbi_p == 0);
        mbi = pvh_init();
    }
    else
        mbi = __va(mbi_p);

    mod = __va(mbi->mods_addr);

    loader = (mbi->flags & MBI_LOADERNAME)
        ? (char *)__va(mbi->boot_loader_name) : "unknown";

    /* Parse the command-line options. */
    cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
                           __va(mbi->cmdline) : NULL,
                           loader);
    if ( (kextra = strstr(cmdline, " -- ")) != NULL )
    {
        /*
         * Options after ' -- ' separator belong to dom0.
         *  1. Orphan dom0's options from Xen's command line.
         *  2. Skip all but final leading space from dom0's options.
         */
        *kextra = '\0';
        kextra += 3;
        while ( kextra[1] == ' ' ) kextra++;
    }
    cmdline_parse(cmdline);

    /* Must be after command line argument parsing and before
     * allocing any xenheap structures wanted in lower memory. */
    kexec_early_calculations();

    probe_hypervisor();

    parse_video_info();

    rdmsrl(MSR_EFER, this_cpu(efer));
    asm volatile ( "mov %%cr4,%0" : "=r" (get_cpu_info()->cr4) );

    /* We initialise the serial devices very early so we can get debugging. */
    ns16550.io_base = 0x3f8;
    ns16550.irq     = 4;
    ns16550_init(0, &ns16550);
    ns16550.io_base = 0x2f8;
    ns16550.irq     = 3;
    ns16550_init(1, &ns16550);
    ehci_dbgp_init();
    console_init_preirq();

    if ( pvh_boot )
        pvh_print_info();

    printk("Bootloader: %s\n", loader);

    printk("Command line: %s\n", cmdline);

    printk("Xen image load base address: %#lx\n", xen_phys_start);

#ifdef CONFIG_VIDEO
    printk("Video information:\n");

    /* Print VGA display mode information. */
    switch ( vga_console_info.video_type )
    {
    case XEN_VGATYPE_TEXT_MODE_3:
        printk(" VGA is text mode %dx%d, font 8x%d\n",
               vga_console_info.u.text_mode_3.columns,
               vga_console_info.u.text_mode_3.rows,
               vga_console_info.u.text_mode_3.font_height);
        break;
    case XEN_VGATYPE_VESA_LFB:
    case XEN_VGATYPE_EFI_LFB:
        printk(" VGA is graphics mode %dx%d, %d bpp\n",
               vga_console_info.u.vesa_lfb.width,
               vga_console_info.u.vesa_lfb.height,
               vga_console_info.u.vesa_lfb.bits_per_pixel);
        break;
    default:
        printk(" No VGA detected\n");
        break;
    }

    /* Print VBE/DDC EDID information. */
    if ( bootsym(boot_edid_caps) != 0x1313 )
    {
        u16 caps = bootsym(boot_edid_caps);
        printk(" VBE/DDC methods:%s%s%s; ",
               (caps & 1) ? " V1" : "",
               (caps & 2) ? " V2" : "",
               !(caps & 3) ? " none" : "");
        printk("EDID transfer time: %d seconds\n", caps >> 8);
        if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
        {
            printk(" EDID info not retrieved because ");
            if ( !(caps & 3) )
                printk("no DDC retrieval method detected\n");
            else if ( (caps >> 8) > 5 )
                printk("takes longer than 5 seconds\n");
            else
                printk("of reasons unknown\n");
        }
    }
#endif

    printk("Disc information:\n");
    printk(" Found %d MBR signatures\n",
           bootsym(boot_mbr_signature_nr));
    printk(" Found %d EDD information structures\n",
           bootsym(boot_edd_info_nr));

    /* Check that we have at least one Multiboot module. */
    if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
        panic("dom0 kernel not specified. Check bootloader configuration.");

    if ( pvh_boot )
    {
        /* pvh_init() already filled in e820_raw */
        memmap_type = "PVH-e820";
    }
    else if ( efi_enabled(EFI_LOADER) )
    {
        set_pdx_range(xen_phys_start >> PAGE_SHIFT,
                      (xen_phys_start + BOOTSTRAP_MAP_BASE) >> PAGE_SHIFT);

        /* Clean up boot loader identity mappings. */
        destroy_xen_mappings(xen_phys_start,
                             xen_phys_start + BOOTSTRAP_MAP_BASE);

        /* Make boot page tables match non-EFI boot. */
        l3_bootmap[l3_table_offset(BOOTSTRAP_MAP_BASE)] =
            l3e_from_paddr(__pa(l2_bootmap), __PAGE_HYPERVISOR);

        memmap_type = loader;
    }
    else if ( efi_enabled(EFI_BOOT) )
        memmap_type = "EFI";
    else if ( (e820_raw.nr_map =
                   copy_bios_e820(e820_raw.map,
                                  ARRAY_SIZE(e820_raw.map))) != 0 )
    {
        memmap_type = "Xen-e820";
    }
    else if ( mbi->flags & MBI_MEMMAP )
    {
        memmap_type = "Multiboot-e820";
        while ( bytes < mbi->mmap_length &&
                e820_raw.nr_map < ARRAY_SIZE(e820_raw.map) )
        {
            memory_map_t *map = __va(mbi->mmap_addr + bytes);

            /*
             * This is a gross workaround for a BIOS bug. Some bootloaders do
             * not write e820 map entries into pre-zeroed memory. This is
             * okay if the BIOS fills in all fields of the map entry, but
             * some broken BIOSes do not bother to write the high word of
             * the length field if the length is smaller than 4GB. We
             * detect and fix this by flagging sections below 4GB that
             * appear to be larger than 4GB in size.
             */
            if ( (map->base_addr_high == 0) && (map->length_high != 0) )
            {
                if ( !e820_warn )
                {
                    printk("WARNING: Buggy e820 map detected and fixed "
                           "(truncated length fields).\n");
                    e820_warn = 1;
                }
                map->length_high = 0;
            }

            e820_raw.map[e820_raw.nr_map].addr =
                ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
            e820_raw.map[e820_raw.nr_map].size =
                ((u64)map->length_high << 32) | (u64)map->length_low;
            e820_raw.map[e820_raw.nr_map].type = map->type;
            e820_raw.nr_map++;

            bytes += map->size + 4;
        }
    }
    else if ( bootsym(lowmem_kb) )
    {
        memmap_type = "Xen-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = bootsym(lowmem_kb) << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = bootsym(highmem_kb) << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else if ( mbi->flags & MBI_MEMLIMITS )
    {
        memmap_type = "Multiboot-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = mbi->mem_lower << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = mbi->mem_upper << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else
        panic("Bootloader provided no memory information.");

    /* Sanitise the raw E820 map to produce a final clean version. */
    max_page = raw_max_page = init_e820(memmap_type, &e820_raw);

    /* Create a temporary copy of the E820 map. */
    memcpy(&boot_e820, &e820, sizeof(e820));

    /* Early kexec reservation (explicit static start address). */
    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
    kexec_reserve_area(&boot_e820);

    initial_images = mod;
    nr_initial_images = mbi->mods_count;

    /*
     * Iterate backwards over all superpage-aligned RAM regions.
     *
     * We require superpage alignment because the boot allocator is not yet
     * initialised. Hence we can only map superpages in the address range
     * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
     * dynamic allocation of pagetables.
     *
     * As well as mapping superpages in that range, in preparation for
     * initialising the boot allocator, we also look for a region to which
     * we can relocate the dom0 kernel and other multiboot modules. Also, on
     * x86/64, we relocate Xen to higher memory.
     */
    for ( i = 0; !efi_enabled(EFI_LOADER) && i < mbi->mods_count; i++ )
    {
        if ( mod[i].mod_start & (PAGE_SIZE - 1) )
            panic("Bootloader didn't honor module alignment request.");
        mod[i].mod_end -= mod[i].mod_start;
        mod[i].mod_start >>= PAGE_SHIFT;
        mod[i].reserved = 0;
    }

    if ( xen_phys_start )
    {
        relocated = true;

        /*
         * This needs to remain in sync with xen_in_range() and the
         * respective reserve_e820_ram() invocation below.
         */
        mod[mbi->mods_count].mod_start = virt_to_mfn(_stext);
        mod[mbi->mods_count].mod_end = __2M_rwdata_end - _stext;
    }

    modules_headroom = bzimage_headroom(bootstrap_map(mod), mod->mod_end);
    bootstrap_map(NULL);

#ifndef highmem_start
    /* Don't allow split below 4Gb. */
    if ( highmem_start < GB(4) )
        highmem_start = 0;
    else /* align to L3 entry boundary */
        highmem_start &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
#endif

    for ( i = boot_e820.nr_map-1; i >= 0; i-- )
    {
        uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
        uint64_t end, limit = ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT;

        /* Superpage-aligned chunks from BOOTSTRAP_MAP_BASE. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, BOOTSTRAP_MAP_BASE);
        if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
            continue;

        if ( s < limit )
        {
            end = min(e, limit);
            set_pdx_range(s >> PAGE_SHIFT, end >> PAGE_SHIFT);
            map_pages_to_xen((unsigned long)__va(s), s >> PAGE_SHIFT,
                             (end - s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
        }

        if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                     1UL << (PAGE_SHIFT + 32)) )
            e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                    1UL << (PAGE_SHIFT + 32));
#define reloc_size ((__pa(__2M_rwdata_end) + mask) & ~mask)
        /* Is the region suitable for relocating Xen? */
        if ( !xen_phys_start && e <= limit )
        {
            /* Don't overlap with modules. */
            end = consider_modules(s, e, reloc_size + mask,
                                   mod, mbi->mods_count, -1);
            end &= ~mask;
        }
        else
            end = 0;
        if ( end > s )
        {
            l4_pgentry_t *pl4e;
            l3_pgentry_t *pl3e;
            l2_pgentry_t *pl2e;
            int i, j, k;

            /* Select relocation address. */
            e = end - reloc_size;
            xen_phys_start = e;
            bootsym(trampoline_xen_phys_start) = e;

            /*
             * Perform relocation to new physical address.
             * Before doing so we must sync static/global data with main memory
             * with a barrier(). After this we must *not* modify static/global
             * data until after we have switched to the relocated pagetables!
             */
            barrier();
            move_memory(e + XEN_IMG_OFFSET, XEN_IMG_OFFSET, _end - _start, 1);

            /* Walk initial pagetables, relocating page directory entries. */
            pl4e = __va(__pa(idle_pg_table));
            for ( i = 0; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
            {
                if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
                    continue;
                *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
                                        xen_phys_start);
                pl3e = l4e_to_l3e(*pl4e);
                for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
                {
                    /* Not present, 1GB mapping, or already relocated? */
                    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
                         (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
                         (l3e_get_pfn(*pl3e) > PFN_DOWN(xen_phys_start)) )
                        continue;
                    *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
                                            xen_phys_start);
                    pl2e = l3e_to_l2e(*pl3e);
                    for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
                    {
                        /* Not present, PSE, or already relocated? */
                        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                             (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
                             (l2e_get_pfn(*pl2e) > PFN_DOWN(xen_phys_start)) )
                            continue;
                        *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                                xen_phys_start);
                    }
                }
            }

            /* The only data mappings to be relocated are in the Xen area. */
            pl2e = __va(__pa(l2_xenmap));
            /*
             * Undo the temporary-hooking of the l1_identmap. __2M_text_start
             * is contained in this PTE.
             */
            BUG_ON(using_2M_mapping() &&
                   l2_table_offset((unsigned long)_erodata) ==
                   l2_table_offset((unsigned long)_stext));
            *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
            for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
            {
                unsigned int flags;

                if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                     (l2e_get_pfn(*pl2e) > PFN_DOWN(xen_phys_start)) )
                    continue;

                if ( !using_2M_mapping() )
                {
                    *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                            xen_phys_start);
                    continue;
                }

                if ( i < l2_table_offset((unsigned long)&__2M_text_end) )
                {
                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
                {
                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_init_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_init_end) )
                {
                    flags = PAGE_HYPERVISOR_RWX | _PAGE_PSE;
                }
                else if ( (i >= l2_table_offset((unsigned long)&__2M_rwdata_start) &&
                           i <  l2_table_offset((unsigned long)&__2M_rwdata_end)) )
                {
                    flags = PAGE_HYPERVISOR_RW | _PAGE_PSE;
                }
                else
                {
                    *pl2e = l2e_empty();
                    continue;
                }

                *pl2e = l2e_from_paddr(
                    l2e_get_paddr(*pl2e) + xen_phys_start, flags);
            }

            /* Re-sync the stack and then switch to relocated pagetables. */
            asm volatile (
                "rep movsq        ; " /* re-sync the stack */
                "movq %%cr4,%%rsi ; "
                "andb $0x7f,%%sil ; "
                "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
                "movq %[pg],%%cr3 ; " /* CR3 == new pagetables */
                "orb $0x80,%%sil  ; "
                "movq %%rsi,%%cr4   " /* CR4.PGE == 1 */
                : "=&S" (i), "=&D" (i), "=&c" (i) /* All outputs discarded. */
                : [pg] "r" (__pa(idle_pg_table)), "0" (cpu0_stack),
                  "1" (__va(__pa(cpu0_stack))), "2" (STACK_SIZE / 8)
                : "memory" );

            bootstrap_map(NULL);

            printk("New Xen image base address: %#lx\n", xen_phys_start);
        }

        /* Is the region suitable for relocating the multiboot modules? */
        for ( j = mbi->mods_count - 1; j >= 0; j-- )
        {
            unsigned long headroom = j ? 0 : modules_headroom;
            unsigned long size = PAGE_ALIGN(headroom + mod[j].mod_end);

            if ( mod[j].reserved )
                continue;

            /* Don't overlap with other modules (or Xen itself). */
            end = consider_modules(s, e, size, mod,
                                   mbi->mods_count + relocated, j);

            if ( highmem_start && end > highmem_start )
                continue;

            if ( s < end &&
                 (headroom ||
                  ((end - size) >> PAGE_SHIFT) > mod[j].mod_start) )
            {
                move_memory(end - size + headroom,
                            (uint64_t)mod[j].mod_start << PAGE_SHIFT,
                            mod[j].mod_end, 0);
                mod[j].mod_start = (end - size) >> PAGE_SHIFT;
                mod[j].mod_end += headroom;
                mod[j].reserved = 1;
            }
        }

#ifdef CONFIG_KEXEC
        /*
         * Looking backwards from the crash area limit, find a large
         * enough range that does not overlap with modules.
         */
        while ( !kexec_crash_area.start )
        {
            /* Don't overlap with modules (or Xen itself). */
            e = consider_modules(s, e, PAGE_ALIGN(kexec_crash_area.size), mod,
                                 mbi->mods_count + relocated, -1);
            if ( s >= e )
                break;
            if ( e > kexec_crash_area_limit )
            {
                e = kexec_crash_area_limit & PAGE_MASK;
                continue;
            }
            kexec_crash_area.start = (e - kexec_crash_area.size) & PAGE_MASK;
        }
#endif
    }

    if ( modules_headroom && !mod->reserved )
        panic("Not enough memory to relocate the dom0 kernel image.");
    for ( i = 0; i < mbi->mods_count; ++i )
    {
        uint64_t s = (uint64_t)mod[i].mod_start << PAGE_SHIFT;

        reserve_e820_ram(&boot_e820, s, s + PAGE_ALIGN(mod[i].mod_end));
    }

    if ( !xen_phys_start )
        panic("Not enough memory to relocate Xen.");

    /* This needs to remain in sync with xen_in_range(). */
    reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));

    /* Late kexec reservation (dynamic start address). */
    kexec_reserve_area(&boot_e820);

    setup_max_pdx(raw_max_page);
    if ( highmem_start )
        xenheap_max_mfn(PFN_DOWN(highmem_start - 1));

    /*
     * Walk every RAM region, map it in its entirety (on x86/64, at least),
     * and hand it over to the boot allocator.
     */
    for ( i = 0; i < boot_e820.nr_map; i++ )
    {
        uint64_t s, e, mask = PAGE_SIZE - 1;
        uint64_t map_s, map_e;

        /* Only page alignment required now. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, 1<<20);
        if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
            continue;

        if ( !acpi_boot_table_init_done &&
             s >= (1ULL << 32) &&
             !acpi_boot_table_init() )
        {
            acpi_boot_table_init_done = true;
            srat_parse_regions(s);
            setup_max_pdx(raw_max_page);
        }

        if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx )
        {
            if ( pfn_to_pdx(s >> PAGE_SHIFT) >= max_pdx )
            {
                for ( j = i - 1; ; --j )
                {
                    if ( boot_e820.map[j].type == E820_RAM )
                        break;
                    ASSERT(j);
                }
                map_e = boot_e820.map[j].addr + boot_e820.map[j].size;
                for ( j = 0; j < mbi->mods_count; ++j )
                {
                    uint64_t end = pfn_to_paddr(mod[j].mod_start) +
                                   mod[j].mod_end;

                    if ( map_e < end )
                        map_e = end;
                }
                if ( PFN_UP(map_e) < max_page )
                {
                    max_page = PFN_UP(map_e);
                    max_pdx = pfn_to_pdx(max_page - 1) + 1;
                }
                printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                       " %013"PRIx64"-%013"PRIx64"\n",
                       s, e);
                continue;
            }
            map_e = e;
            e = (pdx_to_pfn(max_pdx - 1) + 1ULL) << PAGE_SHIFT;
            printk(XENLOG_WARNING "Ignoring inaccessible memory range"
                   " %013"PRIx64"-%013"PRIx64"\n",
                   e, map_e);
        }

        set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);

        /* Need to create mappings above BOOTSTRAP_MAP_BASE. */
        map_s = max_t(uint64_t, s, BOOTSTRAP_MAP_BASE);
        map_e = min_t(uint64_t, e,
                      ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT);

        /* Pass mapped memory to allocator /before/ creating new mappings. */
        init_boot_pages(s, min(map_s, e));
        s = map_s;
        if ( s < map_e )
        {
            uint64_t mask = (1UL << L2_PAGETABLE_SHIFT) - 1;

            map_s = (s + mask) & ~mask;
            map_e &= ~mask;
            init_boot_pages(map_s, map_e);
        }

        if ( map_s > map_e )
            map_s = map_e = s;

        /* Create new mappings /before/ passing memory to the allocator. */
        if ( map_e < e )
        {
            uint64_t limit = __pa(HYPERVISOR_VIRT_END - 1) + 1;
            uint64_t end = min(e, limit);

            if ( map_e < end )
            {
                map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
                                 PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
                init_boot_pages(map_e, end);
                map_e = end;
            }
        }
        if ( map_e < e )
        {
            /* This range must not be passed to the boot allocator and
             * must also not be mapped with _PAGE_GLOBAL. */
            map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
                             PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
        }
        if ( s < map_s )
        {
            map_pages_to_xen((unsigned long)__va(s), s >> PAGE_SHIFT,
                             (map_s - s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
            init_boot_pages(s, map_s);
        }
    }

    for ( i = 0; i < mbi->mods_count; ++i )
    {
        set_pdx_range(mod[i].mod_start,
                      mod[i].mod_start + PFN_UP(mod[i].mod_end));
        map_pages_to_xen((unsigned long)mfn_to_virt(mod[i].mod_start),
                         mod[i].mod_start,
                         PFN_UP(mod[i].mod_end), PAGE_HYPERVISOR);
    }

#ifdef CONFIG_KEXEC
    if ( kexec_crash_area.size )
    {
        unsigned long s = PFN_DOWN(kexec_crash_area.start);
        unsigned long e = min(s + PFN_UP(kexec_crash_area.size),
                              PFN_UP(__pa(HYPERVISOR_VIRT_END - 1)));

        if ( e > s )
            map_pages_to_xen((unsigned long)__va(kexec_crash_area.start),
                             s, e - s, PAGE_HYPERVISOR);
    }
#endif

    xen_virt_end = ((unsigned long)_end + (1UL << L2_PAGETABLE_SHIFT) - 1) &
                   ~((1UL << L2_PAGETABLE_SHIFT) - 1);
    destroy_xen_mappings(xen_virt_end, XEN_VIRT_START + BOOTSTRAP_MAP_BASE);

    /*
     * If not using 2M mappings to gain suitable pagetable permissions
     * directly from the relocation above, remap the code/data
     * sections with decreased permissions.
     */
    if ( !using_2M_mapping() )
    {
        /* Mark .text as RX (avoiding the first 2M superpage). */
        modify_xen_mappings(XEN_VIRT_START + MB(2),
                            (unsigned long)&__2M_text_end,
                            PAGE_HYPERVISOR_RX);

        /* Mark .rodata as RO. */
        modify_xen_mappings((unsigned long)&__2M_rodata_start,
                            (unsigned long)&__2M_rodata_end,
                            PAGE_HYPERVISOR_RO);

        /* Mark .data and .bss as RW. */
        modify_xen_mappings((unsigned long)&__2M_rwdata_start,
                            (unsigned long)&__2M_rwdata_end,
                            PAGE_HYPERVISOR_RW);

        /* Drop the remaining mappings in the shattered superpage. */
        destroy_xen_mappings((unsigned long)&__2M_rwdata_end,
                             ROUNDUP((unsigned long)&__2M_rwdata_end, MB(2)));
    }

    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    printk("System RAM: %luMB (%lukB)\n",
           nr_pages >> (20 - PAGE_SHIFT),
           nr_pages << (PAGE_SHIFT - 10));
    total_pages = nr_pages;

    /* Sanity check for unwanted bloat of certain hypercall structures. */
    BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
                 sizeof(((struct xen_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
                 sizeof(((struct xen_domctl *)0)->u.pad));
    BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
                 sizeof(((struct xen_sysctl *)0)->u.pad));

    BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);

    BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
                 sizeof(((struct compat_platform_op *)0)->u.pad));
    BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
    BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);

    /* Check definitions in public headers match internal defs. */
    BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
    BUILD_BUG_ON(__HYPERVISOR_VIRT_END   != HYPERVISOR_VIRT_END);
    BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
    BUILD_BUG_ON(MACH2PHYS_VIRT_END   != RO_MPT_VIRT_END);

    init_frametable();

    if ( !acpi_boot_table_init_done )
        acpi_boot_table_init();

    acpi_numa_init();

    numa_initmem_init(0, raw_max_page);

    if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
    {
        unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
        uint64_t mask = PAGE_SIZE - 1;

        if ( !highmem_start )
            xenheap_max_mfn(limit);

        end_boot_allocator();

        /* Pass the remaining memory to the allocator. */
        for ( i = 0; i < boot_e820.nr_map; i++ )
        {
            uint64_t s, e;

            if ( boot_e820.map[i].type != E820_RAM )
                continue;
            s = (boot_e820.map[i].addr + mask) & ~mask;
            e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
            if ( PFN_DOWN(e) <= limit )
                continue;
            if ( PFN_DOWN(s) <= limit )
                s = pfn_to_paddr(limit + 1);
            init_domheap_pages(s, e);
        }

        if ( tmem_enabled() )
        {
            printk(XENLOG_WARNING
                   "TMEM physical RAM limit exceeded, disabling TMEM\n");
            tmem_disable();
        }
    }
    else
        end_boot_allocator();

    system_state = SYS_STATE_boot;
    /*
     * No calls involving ACPI code should go between the setting of
     * SYS_STATE_boot and vm_init() (or else acpi_os_{,un}map_memory()
     * will break).
     */
    vm_init();

    console_init_ring();
    vesa_init();

    softirq_init();
    tasklet_subsys_init();

    early_cpu_init();

    paging_init();

    tboot_probe();

    open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);

    if ( opt_watchdog )
        nmi_watchdog = NMI_LOCAL_APIC;

    find_smp_config();

    dmi_scan_machine();

    generic_apic_probe();

    acpi_boot_init();

    if ( smp_found_config )
        get_smp_config();

    if ( opt_nosmp )
    {
        max_cpus = 0;
        set_nr_cpu_ids(1);
    }
    else
    {
        set_nr_cpu_ids(max_cpus);
        max_cpus = nr_cpu_ids;
    }

    if ( xen_guest )
        hypervisor_setup();

    /* Low mappings were only needed for some BIOS table parsing. */
    zap_low_mappings();

    mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
                                  RANGESETF_prettyprint_hex);

    init_apic_mappings();

    normalise_cpu_order();

    init_cpu_to_node();

    x2apic_bsp_setup();

    init_IRQ();

    module_map = xmalloc_array(unsigned long, BITS_TO_LONGS(mbi->mods_count));
    bitmap_fill(module_map, mbi->mods_count);
    __clear_bit(0, module_map); /* Dom0 kernel is always first */

    xsm_multiboot_init(module_map, mbi, bootstrap_map);

    microcode_grab_module(module_map, mbi, bootstrap_map);

    timer_init();

    early_microcode_init();

    identify_cpu(&boot_cpu_data);

    set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);

    if ( !opt_smep )
        setup_clear_cpu_cap(X86_FEATURE_SMEP);
    if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
        setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
    if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
        set_in_cr4(X86_CR4_SMEP);

    if ( !opt_smap )
        setup_clear_cpu_cap(X86_FEATURE_SMAP);
    if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
        setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
    if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
        set_in_cr4(X86_CR4_SMAP);

    cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;

    if ( cpu_has_fsgsbase )
        set_in_cr4(X86_CR4_FSGSBASE);

    init_idle_domain();

    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
                                           &this_cpu(stubs).mfn);
    BUG_ON(!this_cpu(stubs.addr));

    trap_init();

    rcu_init();

    early_time_init();

    arch_init_memory();

    alternative_instructions();

    local_irq_enable();

    pt_pci_init();

    vesa_mtrr_init();

    acpi_mmcfg_init();

    early_msi_init();

    iommu_setup();    /* setup iommu if available */

    smp_prepare_cpus(max_cpus);

    spin_debug_enable();

    /*
     * Initialise higher-level timer functions. We do this fairly late
     * (after interrupts got enabled) because the time bases and scale
     * factors need to be updated regularly.
     */
    init_xen_time();

    initialize_keytable();

    console_init_postirq();

    system_state = SYS_STATE_smp_boot;

    do_presmp_initcalls();

    /*
     * NB: when running as a PV shim VCPUOP_up/down is wired to the shim
     * physical cpu_add/remove functions, so launch the guest with only
     * the BSP online and let it bring up the other CPUs as required.
     */
    if ( !pv_shim )
    {
        for_each_present_cpu ( i )
        {
            /* Set up cpu_to_node[]. */
            srat_detect_node(i);
            /* Set up node_to_cpumask based on cpu_to_node[]. */
            numa_add_cpu(i);

            if ( (num_online_cpus() < max_cpus) && !cpu_online(i) )
            {
                int ret = cpu_up(i);
                if ( ret != 0 )
                    printk("Failed to bring up CPU %u (error %d)\n", i, ret);
            }
        }
    }

    printk("Brought up %ld CPUs\n", (long)num_online_cpus());
    smp_cpus_done();

    do_initcalls();

    if ( opt_watchdog )
        watchdog_setup();

    if ( !tboot_protect_mem_regions() )
        panic("Could not protect TXT memory regions");

    init_guest_cpuid();
    init_guest_msr_policy();

    if ( dom0_pvh )
    {
        domcr_flags |= DOMCRF_hvm |
                       ((hvm_funcs.hap_supported && !opt_dom0_shadow) ?
                        DOMCRF_hap : 0);
        config.emulation_flags = XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC;
    }

    /* Create initial domain 0. */
    dom0 = domain_create(get_initial_domain_id(), domcr_flags, 0, &config);
    if ( IS_ERR(dom0) || (alloc_dom0_vcpu0(dom0) == NULL) )
        panic("Error creating domain 0");

    if ( !pv_shim )
        dom0->is_privileged = 1;
    dom0->target = NULL;

    /* Grab the DOM0 command line. */
    cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
    if ( (cmdline != NULL) || (kextra != NULL) )
    {
        static char __initdata dom0_cmdline[MAX_GUEST_CMDLINE];

        cmdline = cmdline_cook(cmdline, loader);
        safe_strcpy(dom0_cmdline, cmdline);

        if ( kextra != NULL )
            /* kextra always includes exactly one leading space. */
            safe_strcat(dom0_cmdline, kextra);

        /* Append any extra parameters. */
        if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
            safe_strcat(dom0_cmdline, " noapic");
        if ( (strlen(acpi_param) == 0) && acpi_disabled )
        {
            printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
            safe_strcpy(acpi_param, "off");
        }
        if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
        {
            safe_strcat(dom0_cmdline, " acpi=");
            safe_strcat(dom0_cmdline, acpi_param);
        }

        cmdline = dom0_cmdline;
    }

    if ( xen_cpuidle )
        xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;

    initrdidx = find_first_bit(module_map, mbi->mods_count);
    if ( bitmap_weight(module_map, mbi->mods_count) > 1 )
        printk(XENLOG_WARNING
               "Multiple initrd candidates, picking module #%u\n",
               initrdidx);

    /*
     * Temporarily clear SMAP in CR4 to allow user-accesses in
     * construct_dom0(). This saves a large number of corner-case
     * interactions with copy_from_user().
     */
    if ( cpu_has_smap )
    {
        cr4_pv32_mask &= ~X86_CR4_SMAP;
        write_cr4(read_cr4() & ~X86_CR4_SMAP);
    }

    printk("%sNX (Execute Disable) protection %sactive\n",
           cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
           cpu_has_nx ? "" : "not ");

    /*
     * We're going to setup domain0 using the module(s) that we stashed safely
     * above our heap. The second module, if present, is an initrd ramdisk.
     */
    if ( construct_dom0(dom0, mod, modules_headroom,
                        (initrdidx > 0) && (initrdidx < mbi->mods_count)
                        ? mod + initrdidx : NULL,
                        bootstrap_map, cmdline) != 0)
        panic("Could not set up DOM0 guest OS");

    if ( cpu_has_smap )
    {
        write_cr4(read_cr4() | X86_CR4_SMAP);
        cr4_pv32_mask |= X86_CR4_SMAP;
    }

    heap_init_late();

    init_trace_bufs();

    init_constructors();

    console_endboot();

    /* Hide UART from DOM0 if we're using it */
    serial_endboot();

    dmi_end_boot();

    setup_io_bitmap(dom0);

    /* Jump to the 1:1 virtual mappings of cpu0_stack. */
    asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
                  [stk] "g" (__va(__pa(get_stack_bottom()))),
                  [fn] "i" (reinit_bsp_stack) : "memory");
    unreachable();
}

void arch_get_xen_caps(xen_capabilities_info_t *info)
{
    /* Interface name is always xen-3.0-* for Xen-3.x. */
    int major = 3, minor = 0;
    char s[32];

    (*info)[0] = '\0';

    snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
    safe_strcat(*info, s);
    snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
    safe_strcat(*info, s);
    if ( hvm_enabled )
    {
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
        safe_strcat(*info, s);
    }
}

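/*
 * Report whether an MFN overlaps any part of Xen's own footprint: the
 * real-mode trampoline, .text/.rodata, or .data/.bss.
 */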
int __hwdom_init xen_in_range(unsigned long mfn)
{
    paddr_t start, end;
    int i;

    enum { region_s3, region_ro, region_rw, nr_regions };
    static struct {
        paddr_t s, e;
    } xen_regions[nr_regions] __hwdom_initdata;

    /* initialize first time */
    if ( !xen_regions[0].s )
    {
        /* S3 resume code (and other real mode trampoline code) */
        xen_regions[region_s3].s = bootsym_phys(trampoline_start);
        xen_regions[region_s3].e = bootsym_phys(trampoline_end);

        /*
         * This needs to remain in sync with the uses of the same symbols in
         * - __start_xen() (above)
         * - is_xen_fixed_mfn()
         * - tboot_shutdown()
         */

        /* hypervisor .text + .rodata */
        xen_regions[region_ro].s = __pa(&_stext);
        xen_regions[region_ro].e = __pa(&__2M_rodata_end);
        /* hypervisor .data + .bss */
        xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
        xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
    }

    start = (paddr_t)mfn << PAGE_SHIFT;
    end = start + PAGE_SIZE;
    for ( i = 0; i < nr_regions; i++ )
        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
            return 1;

    return 0;
}

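/*
 * rangeset_report_ranges() callback: clear the bits for ports [s, e] in the
 * domain's I/O bitmap, so accesses to them are not intercepted.
 */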
static int __hwdom_init io_bitmap_cb(unsigned long s, unsigned long e,
                                     void *ctx)
{
    struct domain *d = ctx;
    unsigned int i;

    ASSERT(e <= INT_MAX);
    for ( i = s; i <= e; i++ )
        __clear_bit(i, d->arch.hvm_domain.io_bitmap);

    return 0;
}

void __hwdom_init setup_io_bitmap(struct domain *d)
{
    int rc;

    if ( is_hvm_domain(d) )
    {
        bitmap_fill(d->arch.hvm_domain.io_bitmap, 0x10000);
        rc = rangeset_report_ranges(d->arch.ioport_caps, 0, 0x10000,
                                    io_bitmap_cb, d);
        BUG_ON(rc);
        /*
         * NB: we need to trap accesses to 0xcf8 in order to intercept
         * 4-byte accesses, which must be handled by Xen in order to
         * keep consistency.
         * Accesses to the 1-byte RTC ports also need to be trapped in
         * order to keep consistency with PV.
         */
        __set_bit(0xcf8, d->arch.hvm_domain.io_bitmap);
        __set_bit(RTC_PORT(0), d->arch.hvm_domain.io_bitmap);
        __set_bit(RTC_PORT(1), d->arch.hvm_domain.io_bitmap);
    }
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */