#include <xen/init.h>
#include <xen/lib.h>
#include <xen/err.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/domain.h>
#include <xen/serial.h>
#include <xen/softirq.h>
#include <xen/acpi.h>
#include <xen/efi.h>
#include <xen/console.h>
#include <xen/serial.h>
#include <xen/trace.h>
#include <xen/multiboot.h>
#include <xen/domain_page.h>
#include <xen/version.h>
#include <xen/gdbstub.h>
#include <xen/percpu.h>
#include <xen/hypercall.h>
#include <xen/keyhandler.h>
#include <xen/numa.h>
#include <xen/rcupdate.h>
#include <xen/vga.h>
#include <xen/dmi.h>
#include <xen/pfn.h>
#include <xen/nodemask.h>
#include <xen/tmem_xen.h>
#include <xen/virtual_region.h>
#include <xen/watchdog.h>
#include <public/version.h>
#include <compat/platform.h>
#include <compat/xen.h>
#include <xen/bitops.h>
#include <asm/smp.h>
#include <asm/processor.h>
#include <asm/mpspec.h>
#include <asm/apic.h>
#include <asm/msi.h>
#include <asm/desc.h>
#include <asm/paging.h>
#include <asm/e820.h>
#include <xen/kexec.h>
#include <asm/edd.h>
#include <xsm/xsm.h>
#include <asm/tboot.h>
#include <asm/bzimage.h> /* for bzimage_headroom */
#include <asm/mach-generic/mach_apic.h> /* for generic_apic_probe */
#include <asm/setup.h>
#include <xen/cpu.h>
#include <asm/nmi.h>
#include <asm/alternative.h>
#include <asm/mc146818rtc.h>
#include <asm/cpuid.h>
#include <asm/guest.h>

/* opt_nosmp: If true, secondary processors are ignored. */
static bool __initdata opt_nosmp;
boolean_param("nosmp", opt_nosmp);

/* maxcpus: maximum number of CPUs to activate. */
static unsigned int __initdata max_cpus;
integer_param("maxcpus", max_cpus);

unsigned long __read_mostly cr4_pv32_mask;

/* **** Linux config option: propagated to domain0. */
67 /* "acpi=off":    Sisables both ACPI table parsing and interpreter. */
68 /* "acpi=force":  Override the disable blacklist.                   */
69 /* "acpi=ht":     Limit ACPI just to boot-time to enable HT.        */
70 /* "acpi=noirq":  Disables ACPI interrupt routing.                  */
71 static int parse_acpi_param(const char *s);
72 custom_param("acpi", parse_acpi_param);
73 
74 /* **** Linux config option: propagated to domain0. */
75 /* noapic: Disable IOAPIC setup. */
76 boolean_param("noapic", skip_ioapic_setup);
77 
78 /* **** Linux config option: propagated to domain0. */
79 /* xen_cpuidle: xen control cstate. */
s8 __read_mostly xen_cpuidle = -1;
boolean_param("cpuidle", xen_cpuidle);

#ifndef NDEBUG
unsigned long __initdata highmem_start;
size_param("highmem-start", highmem_start);
#endif

cpumask_t __read_mostly cpu_present_map;

unsigned long __read_mostly xen_phys_start;

unsigned long __read_mostly xen_virt_end;

DEFINE_PER_CPU(struct tss_struct, init_tss);

char __section(".bss.stack_aligned") __aligned(STACK_SIZE)
    cpu0_stack[STACK_SIZE];

struct cpuinfo_x86 __read_mostly boot_cpu_data = { 0, 0, 0, 0, -1 };

unsigned long __read_mostly mmu_cr4_features = XEN_MINIMAL_CR4;

/* smep: Enable/disable Supervisor Mode Execution Protection (default on). */
#define SMEP_HVM_ONLY (-1)
static s8 __initdata opt_smep = 1;

/*
 * Initial domain placeholder. Needs to be global so it can be created in
 * __start_xen and unpaused in init_done.
 */
static struct domain *__initdata dom0;

static int __init parse_smep_param(const char *s)
{
    if ( !*s )
    {
        opt_smep = 1;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_smep = 0;
        return 0;
    case 1:
        opt_smep = 1;
        return 0;
    }

    if ( !strcmp(s, "hvm") )
        opt_smep = SMEP_HVM_ONLY;
    else
        return -EINVAL;

    return 0;
}
custom_param("smep", parse_smep_param);

/* smap: Enable/disable Supervisor Mode Access Prevention (default on). */
#define SMAP_HVM_ONLY (-1)
static s8 __initdata opt_smap = 1;

static int __init parse_smap_param(const char *s)
{
    if ( !*s )
    {
        opt_smap = 1;
        return 0;
    }

    switch ( parse_bool(s, NULL) )
    {
    case 0:
        opt_smap = 0;
        return 0;
    case 1:
        opt_smap = 1;
        return 0;
    }

    if ( !strcmp(s, "hvm") )
        opt_smap = SMAP_HVM_ONLY;
    else
        return -EINVAL;

    return 0;
}
custom_param("smap", parse_smap_param);

bool __read_mostly acpi_disabled;
bool __initdata acpi_force;
static char __initdata acpi_param[10] = "";

static int __init parse_acpi_param(const char *s)
{
    /* Save the parameter so it can be propagated to domain0. */
    safe_strcpy(acpi_param, s);

    /* Interpret the parameter for use within Xen. */
    if ( !parse_bool(s, NULL) )
    {
        disable_acpi();
    }
    else if ( !strcmp(s, "force") )
    {
        acpi_force = true;
        acpi_ht = 1;
        acpi_disabled = false;
    }
    else if ( !strcmp(s, "ht") )
    {
        if ( !acpi_force )
            disable_acpi();
        acpi_ht = 1;
    }
    else if ( !strcmp(s, "noirq") )
    {
        acpi_noirq_set();
    }
    else
        return -EINVAL;

    return 0;
}

static const module_t *__initdata initial_images;
static unsigned int __initdata nr_initial_images;

unsigned long __init initial_images_nrpages(nodeid_t node)
{
    unsigned long node_start = node_start_pfn(node);
    unsigned long node_end = node_end_pfn(node);
    unsigned long nr;
    unsigned int i;

    for ( nr = i = 0; i < nr_initial_images; ++i )
    {
        unsigned long start = initial_images[i].mod_start;
        unsigned long end = start + PFN_UP(initial_images[i].mod_end);

        if ( end > node_start && node_end > start )
            nr += min(node_end, end) - max(node_start, start);
    }

    return nr;
}

void __init discard_initial_images(void)
{
    unsigned int i;

    for ( i = 0; i < nr_initial_images; ++i )
    {
        uint64_t start = (uint64_t)initial_images[i].mod_start << PAGE_SHIFT;

        init_domheap_pages(start,
                           start + PAGE_ALIGN(initial_images[i].mod_end));
    }

    nr_initial_images = 0;
    initial_images = NULL;
}

extern char __init_begin[], __init_end[], __bss_start[], __bss_end[];

static void __init init_idle_domain(void)
{
    scheduler_init();
    set_current(idle_vcpu[0]);
    this_cpu(curr_vcpu) = current;
}

void srat_detect_node(int cpu)
{
    nodeid_t node;
    u32 apicid = x86_cpu_to_apicid[cpu];

    node = apicid < MAX_LOCAL_APIC ? apicid_to_node[apicid] : NUMA_NO_NODE;
    if ( node == NUMA_NO_NODE )
        node = 0;

    node_set_online(node);
    numa_set_node(cpu, node);

    if ( opt_cpu_info && acpi_numa > 0 )
        printk("CPU %d APIC %d -> Node %d\n", cpu, apicid, node);
}

/*
 * Sort CPUs by <node,package,core,thread> tuple. Fortunately this hierarchy is
 * reflected in the structure of modern APIC identifiers, so we sort based on
 * those. This is slightly complicated by the fact that the BSP must remain
 * CPU 0. Hence we do a variation on longest-prefix matching to do the best we
 * can while keeping CPU 0 static.
 */
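/*
 * Worked example (hypothetical APIC IDs): matching against APIC ID 0x12,
 * the XOR-and-keep-top-bit loop below reduces 0x12^0x13 to 0x01 but
 * 0x12^0x16 to 0x04, so 0x13 (the longer shared prefix) wins the
 * min_diff comparison.
 */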
static void __init normalise_cpu_order(void)
{
    unsigned int i, j, min_cpu;
    uint32_t apicid, diff, min_diff;

    for_each_present_cpu ( i )
    {
        apicid = x86_cpu_to_apicid[i];
        min_diff = min_cpu = ~0u;

        /*
         * Find remaining CPU with longest-prefix match on APIC ID.
         * Among identical longest-prefix matches, pick the smallest APIC ID.
         */
        for ( j = cpumask_next(i, &cpu_present_map);
              j < nr_cpu_ids;
              j = cpumask_next(j, &cpu_present_map) )
        {
            diff = x86_cpu_to_apicid[j] ^ apicid;
            while ( diff & (diff-1) )
                diff &= diff-1;
            if ( (diff < min_diff) ||
                 ((diff == min_diff) &&
                  (x86_cpu_to_apicid[j] < x86_cpu_to_apicid[min_cpu])) )
            {
                min_diff = diff;
                min_cpu = j;
            }
        }

        /* If no match then there must be no CPUs remaining to consider. */
        if ( min_cpu >= nr_cpu_ids )
        {
            BUG_ON(cpumask_next(i, &cpu_present_map) < nr_cpu_ids);
            break;
        }

        /* Switch the best-matching CPU with the next CPU in logical order. */
        j = cpumask_next(i, &cpu_present_map);
        apicid = x86_cpu_to_apicid[min_cpu];
        x86_cpu_to_apicid[min_cpu] = x86_cpu_to_apicid[j];
        x86_cpu_to_apicid[j] = apicid;
    }
}

#define BOOTSTRAP_MAP_BASE  (16UL << 20)
#define BOOTSTRAP_MAP_LIMIT (1UL << L3_PAGETABLE_SHIFT)

/*
 * Ensure a given physical memory range is present in the bootstrap mappings.
 * Use superpage mappings to ensure that pagetable memory needn't be allocated.
 */
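/*
 * Note: calling bootstrap_map(NULL) tears down all mappings made so far
 * and resets the allocation cursor to BOOTSTRAP_MAP_BASE for reuse.
 */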
static void *__init bootstrap_map(const module_t *mod)
{
    static unsigned long __initdata map_cur = BOOTSTRAP_MAP_BASE;
    uint64_t start, end, mask = (1L << L2_PAGETABLE_SHIFT) - 1;
    void *ret;

    if ( system_state != SYS_STATE_early_boot )
        return mod ? mfn_to_virt(mod->mod_start) : NULL;

    if ( !mod )
    {
        destroy_xen_mappings(BOOTSTRAP_MAP_BASE, BOOTSTRAP_MAP_LIMIT);
        map_cur = BOOTSTRAP_MAP_BASE;
        return NULL;
    }

    start = (uint64_t)mod->mod_start << PAGE_SHIFT;
    end = start + mod->mod_end;
    if ( start >= end )
        return NULL;

    ret = (void *)(map_cur + (unsigned long)(start & mask));
    start &= ~mask;
    end = (end + mask) & ~mask;
    if ( end - start > BOOTSTRAP_MAP_LIMIT - map_cur )
        return NULL;

    map_pages_to_xen(map_cur, start >> PAGE_SHIFT,
                     (end - start) >> PAGE_SHIFT, PAGE_HYPERVISOR);
    map_cur += end - start;
    return ret;
}

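/*
 * Copy size bytes of physical memory from src to dst through the bootstrap
 * mappings, one mapping block at a time. With keep set, the copy must fit
 * in a single block; the destination then stays mapped and a pointer to it
 * is returned (NULL otherwise).
 */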
static void *__init move_memory(
    uint64_t dst, uint64_t src, unsigned int size, bool keep)
{
    unsigned int blksz = BOOTSTRAP_MAP_LIMIT - BOOTSTRAP_MAP_BASE;
    unsigned int mask = (1L << L2_PAGETABLE_SHIFT) - 1;

    if ( src + size > BOOTSTRAP_MAP_BASE )
        blksz >>= 1;

    while ( size )
    {
        module_t mod;
        unsigned int soffs = src & mask;
        unsigned int doffs = dst & mask;
        unsigned int sz;
        void *d, *s;

        mod.mod_start = (src - soffs) >> PAGE_SHIFT;
        mod.mod_end = soffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        sz = mod.mod_end - soffs;
        s = bootstrap_map(&mod);

        mod.mod_start = (dst - doffs) >> PAGE_SHIFT;
        mod.mod_end = doffs + size;
        if ( mod.mod_end > blksz )
            mod.mod_end = blksz;
        if ( sz > mod.mod_end - doffs )
            sz = mod.mod_end - doffs;
        d = bootstrap_map(&mod);

        memmove(d + doffs, s + soffs, sz);

        dst += sz;
        src += sz;
        size -= sz;

        if ( keep )
            return size ? NULL : d + doffs;

        bootstrap_map(NULL);
    }

    return NULL;
}

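/*
 * Find space for a size-byte allocation inside [s,e) that does not overlap
 * any of the nr_mods modules (excluding this_mod). Recurses above and below
 * each overlapping module and returns the end address of a suitable hole,
 * or 0 if none exists.
 */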
static uint64_t __init consider_modules(
    uint64_t s, uint64_t e, uint32_t size, const module_t *mod,
    unsigned int nr_mods, unsigned int this_mod)
{
    unsigned int i;

    if ( s > e || e - s < size )
        return 0;

    for ( i = 0; i < nr_mods ; ++i )
    {
        uint64_t start = (uint64_t)mod[i].mod_start << PAGE_SHIFT;
        uint64_t end = start + PAGE_ALIGN(mod[i].mod_end);

        if ( i == this_mod )
            continue;

        if ( s < end && start < e )
        {
            end = consider_modules(end, e, size, mod + i + 1,
                                   nr_mods - i - 1, this_mod - i - 1);
            if ( end )
                return end;

            return consider_modules(s, start, size, mod + i + 1,
                                    nr_mods - i - 1, this_mod - i - 1);
        }
    }

    return e;
}

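/*
 * Derive max_pdx (and from it max_page) from the highest RAM page, clamping
 * it to what the direct map, frame table, M2P table, and page list encoding
 * can each cover.
 */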
static void __init setup_max_pdx(unsigned long top_page)
{
    max_pdx = pfn_to_pdx(top_page - 1) + 1;

    if ( max_pdx > (DIRECTMAP_SIZE >> PAGE_SHIFT) )
        max_pdx = DIRECTMAP_SIZE >> PAGE_SHIFT;

    if ( max_pdx > FRAMETABLE_NR )
        max_pdx = FRAMETABLE_NR;

    if ( max_pdx > MPT_VIRT_SIZE / sizeof(unsigned long) )
        max_pdx = MPT_VIRT_SIZE / sizeof(unsigned long);

#ifdef PAGE_LIST_NULL
    if ( max_pdx >= PAGE_LIST_NULL )
        max_pdx = PAGE_LIST_NULL - 1;
#endif

    max_page = pdx_to_pfn(max_pdx - 1) + 1;
}

/* A temporary copy of the e820 map that we can mess with during bootstrap. */
static struct e820map __initdata boot_e820;

#ifdef CONFIG_VIDEO
struct boot_video_info {
    u8  orig_x;             /* 0x00 */
    u8  orig_y;             /* 0x01 */
    u8  orig_video_mode;    /* 0x02 */
    u8  orig_video_cols;    /* 0x03 */
    u8  orig_video_lines;   /* 0x04 */
    u8  orig_video_isVGA;   /* 0x05 */
    u16 orig_video_points;  /* 0x06 */

    /* VESA graphic mode -- linear frame buffer */
    u32 capabilities;       /* 0x08 */
    u16 lfb_linelength;     /* 0x0c */
    u16 lfb_width;          /* 0x0e */
    u16 lfb_height;         /* 0x10 */
    u16 lfb_depth;          /* 0x12 */
    u32 lfb_base;           /* 0x14 */
    u32 lfb_size;           /* 0x18 */
    u8  red_size;           /* 0x1c */
    u8  red_pos;            /* 0x1d */
    u8  green_size;         /* 0x1e */
    u8  green_pos;          /* 0x1f */
    u8  blue_size;          /* 0x20 */
    u8  blue_pos;           /* 0x21 */
    u8  rsvd_size;          /* 0x22 */
    u8  rsvd_pos;           /* 0x23 */
    u16 vesapm_seg;         /* 0x24 */
    u16 vesapm_off;         /* 0x26 */
    u16 vesa_attrib;        /* 0x28 */
};
extern struct boot_video_info boot_vid_info;
#endif

static void __init parse_video_info(void)
{
#ifdef CONFIG_VIDEO
    struct boot_video_info *bvi = &bootsym(boot_vid_info);

    /* vga_console_info is filled directly on EFI platform. */
    if ( efi_enabled(EFI_BOOT) )
        return;

    if ( (bvi->orig_video_isVGA == 1) && (bvi->orig_video_mode == 3) )
    {
        vga_console_info.video_type = XEN_VGATYPE_TEXT_MODE_3;
        vga_console_info.u.text_mode_3.font_height = bvi->orig_video_points;
        vga_console_info.u.text_mode_3.cursor_x = bvi->orig_x;
        vga_console_info.u.text_mode_3.cursor_y = bvi->orig_y;
        vga_console_info.u.text_mode_3.rows = bvi->orig_video_lines;
        vga_console_info.u.text_mode_3.columns = bvi->orig_video_cols;
    }
    else if ( bvi->orig_video_isVGA == 0x23 )
    {
        vga_console_info.video_type = XEN_VGATYPE_VESA_LFB;
        vga_console_info.u.vesa_lfb.width = bvi->lfb_width;
        vga_console_info.u.vesa_lfb.height = bvi->lfb_height;
        vga_console_info.u.vesa_lfb.bytes_per_line = bvi->lfb_linelength;
        vga_console_info.u.vesa_lfb.bits_per_pixel = bvi->lfb_depth;
        vga_console_info.u.vesa_lfb.lfb_base = bvi->lfb_base;
        vga_console_info.u.vesa_lfb.lfb_size = bvi->lfb_size;
        vga_console_info.u.vesa_lfb.red_pos = bvi->red_pos;
        vga_console_info.u.vesa_lfb.red_size = bvi->red_size;
        vga_console_info.u.vesa_lfb.green_pos = bvi->green_pos;
        vga_console_info.u.vesa_lfb.green_size = bvi->green_size;
        vga_console_info.u.vesa_lfb.blue_pos = bvi->blue_pos;
        vga_console_info.u.vesa_lfb.blue_size = bvi->blue_size;
        vga_console_info.u.vesa_lfb.rsvd_pos = bvi->rsvd_pos;
        vga_console_info.u.vesa_lfb.rsvd_size = bvi->rsvd_size;
        vga_console_info.u.vesa_lfb.gbl_caps = bvi->capabilities;
        vga_console_info.u.vesa_lfb.mode_attrs = bvi->vesa_attrib;
    }
#endif
}

static void __init kexec_reserve_area(struct e820map *e820)
{
#ifdef CONFIG_KEXEC
    unsigned long kdump_start = kexec_crash_area.start;
    unsigned long kdump_size  = kexec_crash_area.size;
    static bool __initdata is_reserved = false;

    kdump_size = (kdump_size + PAGE_SIZE - 1) & PAGE_MASK;

    if ( (kdump_start == 0) || (kdump_size == 0) || is_reserved )
        return;

    is_reserved = true;

    if ( !reserve_e820_ram(e820, kdump_start, kdump_start + kdump_size) )
    {
        printk("Kdump: DISABLED (failed to reserve %luMB (%lukB) at %#lx)"
               "\n", kdump_size >> 20, kdump_size >> 10, kdump_start);
        kexec_crash_area.start = kexec_crash_area.size = 0;
    }
    else
    {
        printk("Kdump: %luMB (%lukB) at %#lx\n",
               kdump_size >> 20, kdump_size >> 10, kdump_start);
    }
#endif
}

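/*
 * True iff every major section boundary is 2M-aligned, i.e. the image layout
 * allows text/rodata/init/rwdata each to be mapped with 2M superpages
 * carrying section-appropriate permissions.
 */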
static inline bool using_2M_mapping(void)
{
    return !l1_table_offset((unsigned long)__2M_text_end) &&
           !l1_table_offset((unsigned long)__2M_rodata_start) &&
           !l1_table_offset((unsigned long)__2M_rodata_end) &&
           !l1_table_offset((unsigned long)__2M_init_start) &&
           !l1_table_offset((unsigned long)__2M_init_end) &&
           !l1_table_offset((unsigned long)__2M_rwdata_start) &&
           !l1_table_offset((unsigned long)__2M_rwdata_end);
}

static void noinline init_done(void)
{
    void *va;
    unsigned long start, end;

    system_state = SYS_STATE_active;

    domain_unpause_by_systemcontroller(dom0);

    /* MUST be done prior to removing .init data. */
    unregister_init_virtual_region();

    /* Zero the .init code and data. */
    for ( va = __init_begin; va < _p(__init_end); va += PAGE_SIZE )
        clear_page(va);

    /* Destroy Xen's mappings, and reuse the pages. */
    if ( using_2M_mapping() )
    {
        start = (unsigned long)&__2M_init_start,
        end   = (unsigned long)&__2M_init_end;
    }
    else
    {
        start = (unsigned long)&__init_begin;
        end   = (unsigned long)&__init_end;
    }

    destroy_xen_mappings(start, end);
    init_xenheap_pages(__pa(start), __pa(end));
    printk("Freed %lukB init memory\n", (end - start) >> 10);

    startup_cpu_idle_loop();
}

/* Reinitialise all state referring to the old virtual address of the stack. */
static void __init noreturn reinit_bsp_stack(void)
{
    unsigned long *stack = (void*)(get_stack_bottom() & ~(STACK_SIZE - 1));

    /* Update TSS and ISTs */
    load_system_tables();

    /* Update SYSCALL trampolines */
    percpu_traps_init();

    stack_base[0] = stack;
    memguard_guard_stack(stack);

    reset_stack_and_jump(init_done);
}

static bool __init loader_is_grub2(const char *loader_name)
{
    /* GRUB1="GNU GRUB 0.xx"; GRUB2="GRUB 1.xx" */
    const char *p = strstr(loader_name, "GRUB ");
    return (p != NULL) && (p[5] != '0');
}

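/*
 * Strip leading whitespace and, for loaders that prepend it (i.e. neither
 * GRUB2 nor PVH), the image name from the command line.
 */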
static char * __init cmdline_cook(char *p, const char *loader_name)
{
    p = p ? : "";

    /* Strip leading whitespace. */
    while ( *p == ' ' )
        p++;

    /* GRUB2 and PVH don't include image name as first item on command line. */
    if ( xen_guest || loader_is_grub2(loader_name) )
        return p;

    /* Strip image name plus whitespace. */
    while ( (*p != ' ') && (*p != '\0') )
        p++;
    while ( *p == ' ' )
        p++;

    return p;
}

void __init noreturn __start_xen(unsigned long mbi_p)
{
    char *memmap_type = NULL;
    char *cmdline, *kextra, *loader;
    unsigned int initrdidx, domcr_flags = DOMCRF_s3_integrity;
    multiboot_info_t *mbi;
    module_t *mod;
    unsigned long nr_pages, raw_max_page, modules_headroom, *module_map;
    int i, j, e820_warn = 0, bytes = 0;
    bool acpi_boot_table_init_done = false, relocated = false;
    struct ns16550_defaults ns16550 = {
        .data_bits = 8,
        .parity    = 'n',
        .stop_bits = 1
    };
    struct xen_arch_domainconfig config = { .emulation_flags = 0 };

    /* Critical region without IDT or TSS.  Any fault is deadly! */

    set_processor_id(0);
    set_current(INVALID_VCPU); /* debug sanity. */
    idle_vcpu[0] = current;

    percpu_init_areas();

    init_idt_traps();
    load_system_tables();

    smp_prepare_boot_cpu();
    sort_exception_tables();

    setup_virtual_regions(__start___ex_table, __stop___ex_table);

    /* Full exception support from here on in. */

    if ( pvh_boot )
    {
        ASSERT(mbi_p == 0);
        mbi = pvh_init();
    }
    else
        mbi = __va(mbi_p);

    mod = __va(mbi->mods_addr);

    loader = (mbi->flags & MBI_LOADERNAME)
        ? (char *)__va(mbi->boot_loader_name) : "unknown";

    /* Parse the command-line options. */
    cmdline = cmdline_cook((mbi->flags & MBI_CMDLINE) ?
                           __va(mbi->cmdline) : NULL,
                           loader);
    if ( (kextra = strstr(cmdline, " -- ")) != NULL )
    {
        /*
         * Options after ' -- ' separator belong to dom0.
         *  1. Orphan dom0's options from Xen's command line.
         *  2. Skip all but final leading space from dom0's options.
         */
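        /*
         * E.g. (hypothetical) "console=com1 -- console=hvc0" leaves Xen
         * with "console=com1" and hands " console=hvc0" to dom0.
         */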
        *kextra = '\0';
        kextra += 3;
        while ( kextra[1] == ' ' ) kextra++;
    }
    cmdline_parse(cmdline);

    /* Must be after command line argument parsing and before
     * allocing any xenheap structures wanted in lower memory. */
    kexec_early_calculations();

    probe_hypervisor();

    parse_video_info();

    rdmsrl(MSR_EFER, this_cpu(efer));
    asm volatile ( "mov %%cr4,%0" : "=r" (get_cpu_info()->cr4) );

    /* We initialise the serial devices very early so we can get debugging. */
    ns16550.io_base = 0x3f8;
    ns16550.irq     = 4;
    ns16550_init(0, &ns16550);
    ns16550.io_base = 0x2f8;
    ns16550.irq     = 3;
    ns16550_init(1, &ns16550);
    ehci_dbgp_init();
    console_init_preirq();

    if ( pvh_boot )
        pvh_print_info();

    printk("Bootloader: %s\n", loader);

    printk("Command line: %s\n", cmdline);

    printk("Xen image load base address: %#lx\n", xen_phys_start);

#ifdef CONFIG_VIDEO
    printk("Video information:\n");

    /* Print VGA display mode information. */
    switch ( vga_console_info.video_type )
    {
    case XEN_VGATYPE_TEXT_MODE_3:
        printk(" VGA is text mode %dx%d, font 8x%d\n",
               vga_console_info.u.text_mode_3.columns,
               vga_console_info.u.text_mode_3.rows,
               vga_console_info.u.text_mode_3.font_height);
        break;
    case XEN_VGATYPE_VESA_LFB:
    case XEN_VGATYPE_EFI_LFB:
        printk(" VGA is graphics mode %dx%d, %d bpp\n",
               vga_console_info.u.vesa_lfb.width,
               vga_console_info.u.vesa_lfb.height,
               vga_console_info.u.vesa_lfb.bits_per_pixel);
        break;
    default:
        printk(" No VGA detected\n");
        break;
    }

    /* Print VBE/DDC EDID information. */
    if ( bootsym(boot_edid_caps) != 0x1313 )
    {
        u16 caps = bootsym(boot_edid_caps);
        printk(" VBE/DDC methods:%s%s%s; ",
               (caps & 1) ? " V1" : "",
               (caps & 2) ? " V2" : "",
               !(caps & 3) ? " none" : "");
        printk("EDID transfer time: %d seconds\n", caps >> 8);
        if ( *(u32 *)bootsym(boot_edid_info) == 0x13131313 )
        {
            printk(" EDID info not retrieved because ");
            if ( !(caps & 3) )
                printk("no DDC retrieval method detected\n");
            else if ( (caps >> 8) > 5 )
                printk("takes longer than 5 seconds\n");
            else
                printk("of reasons unknown\n");
        }
    }
#endif

    printk("Disc information:\n");
    printk(" Found %d MBR signatures\n",
           bootsym(boot_mbr_signature_nr));
    printk(" Found %d EDD information structures\n",
           bootsym(boot_edd_info_nr));

    /* Check that we have at least one Multiboot module. */
    if ( !(mbi->flags & MBI_MODULES) || (mbi->mods_count == 0) )
        panic("dom0 kernel not specified. Check bootloader configuration.");

    if ( pvh_boot )
    {
        /* pvh_init() already filled in e820_raw */
        memmap_type = "PVH-e820";
    }
    else if ( efi_enabled(EFI_LOADER) )
    {
        set_pdx_range(xen_phys_start >> PAGE_SHIFT,
                      (xen_phys_start + BOOTSTRAP_MAP_BASE) >> PAGE_SHIFT);

        /* Clean up boot loader identity mappings. */
        destroy_xen_mappings(xen_phys_start,
                             xen_phys_start + BOOTSTRAP_MAP_BASE);

        /* Make boot page tables match non-EFI boot. */
        l3_bootmap[l3_table_offset(BOOTSTRAP_MAP_BASE)] =
            l3e_from_paddr(__pa(l2_bootmap), __PAGE_HYPERVISOR);

        memmap_type = loader;
    }
    else if ( efi_enabled(EFI_BOOT) )
        memmap_type = "EFI";
    else if ( (e820_raw.nr_map =
                   copy_bios_e820(e820_raw.map,
                                  ARRAY_SIZE(e820_raw.map))) != 0 )
    {
        memmap_type = "Xen-e820";
    }
    else if ( mbi->flags & MBI_MEMMAP )
    {
        memmap_type = "Multiboot-e820";
        while ( bytes < mbi->mmap_length &&
                e820_raw.nr_map < ARRAY_SIZE(e820_raw.map) )
        {
            memory_map_t *map = __va(mbi->mmap_addr + bytes);

            /*
             * This is a gross workaround for a BIOS bug. Some bootloaders do
             * not write e820 map entries into pre-zeroed memory. This is
             * okay if the BIOS fills in all fields of the map entry, but
             * some broken BIOSes do not bother to write the high word of
             * the length field if the length is smaller than 4GB. We
             * detect and fix this by flagging sections below 4GB that
             * appear to be larger than 4GB in size.
             */
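            /*
             * Example (hypothetical): an entry describing a 2GB region but
             * carrying a stale length_high of 1 would otherwise read as 6GB.
             */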
            if ( (map->base_addr_high == 0) && (map->length_high != 0) )
            {
                if ( !e820_warn )
                {
                    printk("WARNING: Buggy e820 map detected and fixed "
                           "(truncated length fields).\n");
                    e820_warn = 1;
                }
                map->length_high = 0;
            }

            e820_raw.map[e820_raw.nr_map].addr =
                ((u64)map->base_addr_high << 32) | (u64)map->base_addr_low;
            e820_raw.map[e820_raw.nr_map].size =
                ((u64)map->length_high << 32) | (u64)map->length_low;
            e820_raw.map[e820_raw.nr_map].type = map->type;
            e820_raw.nr_map++;

            bytes += map->size + 4;
        }
    }
    else if ( bootsym(lowmem_kb) )
    {
        memmap_type = "Xen-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = bootsym(lowmem_kb) << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = bootsym(highmem_kb) << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else if ( mbi->flags & MBI_MEMLIMITS )
    {
        memmap_type = "Multiboot-e801";
        e820_raw.map[0].addr = 0;
        e820_raw.map[0].size = mbi->mem_lower << 10;
        e820_raw.map[0].type = E820_RAM;
        e820_raw.map[1].addr = 0x100000;
        e820_raw.map[1].size = mbi->mem_upper << 10;
        e820_raw.map[1].type = E820_RAM;
        e820_raw.nr_map = 2;
    }
    else
        panic("Bootloader provided no memory information.");

    /* Sanitise the raw E820 map to produce a final clean version. */
    max_page = raw_max_page = init_e820(memmap_type, &e820_raw);

    /* Create a temporary copy of the E820 map. */
    memcpy(&boot_e820, &e820, sizeof(e820));

    /* Early kexec reservation (explicit static start address). */
    nr_pages = 0;
    for ( i = 0; i < e820.nr_map; i++ )
        if ( e820.map[i].type == E820_RAM )
            nr_pages += e820.map[i].size >> PAGE_SHIFT;
    set_kexec_crash_area_size((u64)nr_pages << PAGE_SHIFT);
    kexec_reserve_area(&boot_e820);

    initial_images = mod;
    nr_initial_images = mbi->mods_count;

    /*
     * Iterate backwards over all superpage-aligned RAM regions.
     *
     * We require superpage alignment because the boot allocator is not yet
     * initialised. Hence we can only map superpages in the address range
     * 0 to BOOTSTRAP_DIRECTMAP_END, as this is guaranteed not to require
     * dynamic allocation of pagetables.
     *
     * As well as mapping superpages in that range, in preparation for
     * initialising the boot allocator, we also look for a region to which
     * we can relocate the dom0 kernel and other multiboot modules. Also, on
     * x86/64, we relocate Xen to higher memory.
     */
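    /*
     * Note: BOOTSTRAP_MAP_BASE is 16MB and BOOTSTRAP_MAP_LIMIT is 1GB (one
     * L3 entry), so the bootstrap map window itself always fits within the
     * statically allocated boot pagetables.
     */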
    for ( i = 0; !efi_enabled(EFI_LOADER) && i < mbi->mods_count; i++ )
    {
        if ( mod[i].mod_start & (PAGE_SIZE - 1) )
            panic("Bootloader didn't honor module alignment request.");
        mod[i].mod_end -= mod[i].mod_start;
        mod[i].mod_start >>= PAGE_SHIFT;
        mod[i].reserved = 0;
    }

    if ( xen_phys_start )
    {
        relocated = true;

        /*
         * This needs to remain in sync with xen_in_range() and the
         * respective reserve_e820_ram() invocation below.
         */
        mod[mbi->mods_count].mod_start = virt_to_mfn(_stext);
        mod[mbi->mods_count].mod_end = __2M_rwdata_end - _stext;
    }

    modules_headroom = bzimage_headroom(bootstrap_map(mod), mod->mod_end);
    bootstrap_map(NULL);

#ifndef highmem_start
    /* Don't allow split below 4Gb. */
    if ( highmem_start < GB(4) )
        highmem_start = 0;
    else /* align to L3 entry boundary */
        highmem_start &= ~((1UL << L3_PAGETABLE_SHIFT) - 1);
#endif

    for ( i = boot_e820.nr_map-1; i >= 0; i-- )
    {
        uint64_t s, e, mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
        uint64_t end, limit = ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT;

        /* Superpage-aligned chunks from BOOTSTRAP_MAP_BASE. */
        s = (boot_e820.map[i].addr + mask) & ~mask;
        e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
        s = max_t(uint64_t, s, BOOTSTRAP_MAP_BASE);
        if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
            continue;

        if ( s < limit )
        {
            end = min(e, limit);
            set_pdx_range(s >> PAGE_SHIFT, end >> PAGE_SHIFT);
            map_pages_to_xen((unsigned long)__va(s), s >> PAGE_SHIFT,
                             (end - s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
        }

        if ( e > min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                     1UL << (PAGE_SHIFT + 32)) )
            e = min(HYPERVISOR_VIRT_END - DIRECTMAP_VIRT_START,
                    1UL << (PAGE_SHIFT + 32));
#define reloc_size ((__pa(__2M_rwdata_end) + mask) & ~mask)
        /* Is the region suitable for relocating Xen? */
        if ( !xen_phys_start && e <= limit )
        {
            /* Don't overlap with modules. */
            end = consider_modules(s, e, reloc_size + mask,
                                   mod, mbi->mods_count, -1);
            end &= ~mask;
        }
        else
            end = 0;
        if ( end > s )
        {
            l4_pgentry_t *pl4e;
            l3_pgentry_t *pl3e;
            l2_pgentry_t *pl2e;
            int i, j, k;

            /* Select relocation address. */
            e = end - reloc_size;
            xen_phys_start = e;
            bootsym(trampoline_xen_phys_start) = e;

            /*
             * Perform relocation to new physical address.
             * Before doing so we must sync static/global data with main memory
             * with a barrier(). After this we must *not* modify static/global
             * data until after we have switched to the relocated pagetables!
             */
            barrier();
            move_memory(e + XEN_IMG_OFFSET, XEN_IMG_OFFSET, _end - _start, 1);

            /* Walk initial pagetables, relocating page directory entries. */
            pl4e = __va(__pa(idle_pg_table));
            for ( i = 0 ; i < L4_PAGETABLE_ENTRIES; i++, pl4e++ )
            {
                if ( !(l4e_get_flags(*pl4e) & _PAGE_PRESENT) )
                    continue;
                *pl4e = l4e_from_intpte(l4e_get_intpte(*pl4e) +
                                        xen_phys_start);
                pl3e = l4e_to_l3e(*pl4e);
                for ( j = 0; j < L3_PAGETABLE_ENTRIES; j++, pl3e++ )
                {
                    /* Not present, 1GB mapping, or already relocated? */
                    if ( !(l3e_get_flags(*pl3e) & _PAGE_PRESENT) ||
                         (l3e_get_flags(*pl3e) & _PAGE_PSE) ||
                         (l3e_get_pfn(*pl3e) > PFN_DOWN(xen_phys_start)) )
                        continue;
                    *pl3e = l3e_from_intpte(l3e_get_intpte(*pl3e) +
                                            xen_phys_start);
                    pl2e = l3e_to_l2e(*pl3e);
                    for ( k = 0; k < L2_PAGETABLE_ENTRIES; k++, pl2e++ )
                    {
                        /* Not present, PSE, or already relocated? */
                        if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                             (l2e_get_flags(*pl2e) & _PAGE_PSE) ||
                             (l2e_get_pfn(*pl2e) > PFN_DOWN(xen_phys_start)) )
                            continue;
                        *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                                xen_phys_start);
                    }
                }
            }

            /* The only data mappings to be relocated are in the Xen area. */
            pl2e = __va(__pa(l2_xenmap));
            /*
             * Undo the temporary-hooking of the l1_identmap.  __2M_text_start
             * is contained in this PTE.
             */
            BUG_ON(using_2M_mapping() &&
                   l2_table_offset((unsigned long)_erodata) ==
                   l2_table_offset((unsigned long)_stext));
            *pl2e++ = l2e_from_pfn(xen_phys_start >> PAGE_SHIFT,
                                   PAGE_HYPERVISOR_RX | _PAGE_PSE);
            for ( i = 1; i < L2_PAGETABLE_ENTRIES; i++, pl2e++ )
            {
                unsigned int flags;

                if ( !(l2e_get_flags(*pl2e) & _PAGE_PRESENT) ||
                     (l2e_get_pfn(*pl2e) > PFN_DOWN(xen_phys_start)) )
                    continue;

                if ( !using_2M_mapping() )
                {
                    *pl2e = l2e_from_intpte(l2e_get_intpte(*pl2e) +
                                            xen_phys_start);
                    continue;
                }

                if ( i < l2_table_offset((unsigned long)&__2M_text_end) )
                {
                    flags = PAGE_HYPERVISOR_RX | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_rodata_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_rodata_end) )
                {
                    flags = PAGE_HYPERVISOR_RO | _PAGE_PSE;
                }
                else if ( i >= l2_table_offset((unsigned long)&__2M_init_start) &&
                          i <  l2_table_offset((unsigned long)&__2M_init_end) )
                {
                    flags = PAGE_HYPERVISOR_RWX | _PAGE_PSE;
                }
                else if ( (i >= l2_table_offset((unsigned long)&__2M_rwdata_start) &&
                           i <  l2_table_offset((unsigned long)&__2M_rwdata_end)) )
                {
                    flags = PAGE_HYPERVISOR_RW | _PAGE_PSE;
                }
                else
                {
                    *pl2e = l2e_empty();
                    continue;
                }

                *pl2e = l2e_from_paddr(
                    l2e_get_paddr(*pl2e) + xen_phys_start, flags);
            }

            /* Re-sync the stack and then switch to relocated pagetables. */
            asm volatile (
                "rep movsq        ; " /* re-sync the stack */
                "movq %%cr4,%%rsi ; "
                "andb $0x7f,%%sil ; "
                "movq %%rsi,%%cr4 ; " /* CR4.PGE == 0 */
                "movq %[pg],%%cr3 ; " /* CR3 == new pagetables */
                "orb $0x80,%%sil  ; "
                "movq %%rsi,%%cr4   " /* CR4.PGE == 1 */
                : "=&S" (i), "=&D" (i), "=&c" (i) /* All outputs discarded. */
                :  [pg] "r" (__pa(idle_pg_table)), "0" (cpu0_stack),
                   "1" (__va(__pa(cpu0_stack))), "2" (STACK_SIZE / 8)
                : "memory" );
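            /*
             * Note: clearing CR4.PGE above also flushes global TLB entries,
             * so no stale pre-relocation translations survive the CR3
             * switch.
             */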
1118 
1119             bootstrap_map(NULL);
1120 
1121             printk("New Xen image base address: %#lx\n", xen_phys_start);
1122         }
1123 
1124         /* Is the region suitable for relocating the multiboot modules? */
1125         for ( j = mbi->mods_count - 1; j >= 0; j-- )
1126         {
1127             unsigned long headroom = j ? 0 : modules_headroom;
1128             unsigned long size = PAGE_ALIGN(headroom + mod[j].mod_end);
1129 
1130             if ( mod[j].reserved )
1131                 continue;
1132 
1133             /* Don't overlap with other modules (or Xen itself). */
1134             end = consider_modules(s, e, size, mod,
1135                                    mbi->mods_count + relocated, j);
1136 
1137             if ( highmem_start && end > highmem_start )
1138                 continue;
1139 
1140             if ( s < end &&
1141                  (headroom ||
1142                   ((end - size) >> PAGE_SHIFT) > mod[j].mod_start) )
1143             {
1144                 move_memory(end - size + headroom,
1145                             (uint64_t)mod[j].mod_start << PAGE_SHIFT,
1146                             mod[j].mod_end, 0);
1147                 mod[j].mod_start = (end - size) >> PAGE_SHIFT;
1148                 mod[j].mod_end += headroom;
1149                 mod[j].reserved = 1;
1150             }
1151         }
1152 
1153 #ifdef CONFIG_KEXEC
1154         /*
1155          * Looking backwards from the crash area limit, find a large
1156          * enough range that does not overlap with modules.
1157          */
1158         while ( !kexec_crash_area.start )
1159         {
1160             /* Don't overlap with modules (or Xen itself). */
1161             e = consider_modules(s, e, PAGE_ALIGN(kexec_crash_area.size), mod,
1162                                  mbi->mods_count + relocated, -1);
1163             if ( s >= e )
1164                 break;
1165             if ( e > kexec_crash_area_limit )
1166             {
1167                 e = kexec_crash_area_limit & PAGE_MASK;
1168                 continue;
1169             }
1170             kexec_crash_area.start = (e - kexec_crash_area.size) & PAGE_MASK;
1171         }
1172 #endif
1173     }
1174 
1175     if ( modules_headroom && !mod->reserved )
1176         panic("Not enough memory to relocate the dom0 kernel image.");
1177     for ( i = 0; i < mbi->mods_count; ++i )
1178     {
1179         uint64_t s = (uint64_t)mod[i].mod_start << PAGE_SHIFT;
1180 
1181         reserve_e820_ram(&boot_e820, s, s + PAGE_ALIGN(mod[i].mod_end));
1182     }
1183 
1184     if ( !xen_phys_start )
1185         panic("Not enough memory to relocate Xen.");
1186 
1187     /* This needs to remain in sync with xen_in_range(). */
1188     reserve_e820_ram(&boot_e820, __pa(_stext), __pa(__2M_rwdata_end));
1189 
1190     /* Late kexec reservation (dynamic start address). */
1191     kexec_reserve_area(&boot_e820);
1192 
1193     setup_max_pdx(raw_max_page);
1194     if ( highmem_start )
1195         xenheap_max_mfn(PFN_DOWN(highmem_start - 1));
1196 
1197     /*
1198      * Walk every RAM region and map it in its entirety (on x86/64, at least)
1199      * and notify it to the boot allocator.
1200      */
1201     for ( i = 0; i < boot_e820.nr_map; i++ )
1202     {
1203         uint64_t s, e, mask = PAGE_SIZE - 1;
1204         uint64_t map_s, map_e;
1205 
1206         /* Only page alignment required now. */
1207         s = (boot_e820.map[i].addr + mask) & ~mask;
1208         e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
1209         s = max_t(uint64_t, s, 1<<20);
1210         if ( (boot_e820.map[i].type != E820_RAM) || (s >= e) )
1211             continue;
1212 
1213         if ( !acpi_boot_table_init_done &&
1214              s >= (1ULL << 32) &&
1215              !acpi_boot_table_init() )
1216         {
1217             acpi_boot_table_init_done = true;
1218             srat_parse_regions(s);
1219             setup_max_pdx(raw_max_page);
1220         }
1221 
1222         if ( pfn_to_pdx((e - 1) >> PAGE_SHIFT) >= max_pdx )
1223         {
1224             if ( pfn_to_pdx(s >> PAGE_SHIFT) >= max_pdx )
1225             {
1226                 for ( j = i - 1; ; --j )
1227                 {
1228                     if ( boot_e820.map[j].type == E820_RAM )
1229                         break;
1230                     ASSERT(j);
1231                 }
1232                 map_e = boot_e820.map[j].addr + boot_e820.map[j].size;
1233                 for ( j = 0; j < mbi->mods_count; ++j )
1234                 {
1235                     uint64_t end = pfn_to_paddr(mod[j].mod_start) +
1236                                    mod[j].mod_end;
1237 
1238                     if ( map_e < end )
1239                         map_e = end;
1240                 }
1241                 if ( PFN_UP(map_e) < max_page )
1242                 {
1243                     max_page = PFN_UP(map_e);
1244                     max_pdx = pfn_to_pdx(max_page - 1) + 1;
1245                 }
1246                 printk(XENLOG_WARNING "Ignoring inaccessible memory range"
1247                                       " %013"PRIx64"-%013"PRIx64"\n",
1248                        s, e);
1249                 continue;
1250             }
1251             map_e = e;
1252             e = (pdx_to_pfn(max_pdx - 1) + 1ULL) << PAGE_SHIFT;
1253             printk(XENLOG_WARNING "Ignoring inaccessible memory range"
1254                                   " %013"PRIx64"-%013"PRIx64"\n",
1255                    e, map_e);
1256         }
1257 
1258         set_pdx_range(s >> PAGE_SHIFT, e >> PAGE_SHIFT);
1259 
1260         /* Need to create mappings above BOOTSTRAP_MAP_BASE. */
1261         map_s = max_t(uint64_t, s, BOOTSTRAP_MAP_BASE);
1262         map_e = min_t(uint64_t, e,
1263                       ARRAY_SIZE(l2_identmap) << L2_PAGETABLE_SHIFT);
1264 
1265         /* Pass mapped memory to allocator /before/ creating new mappings. */
1266         init_boot_pages(s, min(map_s, e));
1267         s = map_s;
1268         if ( s < map_e )
1269         {
1270             uint64_t mask = (1UL << L2_PAGETABLE_SHIFT) - 1;
1271 
1272             map_s = (s + mask) & ~mask;
1273             map_e &= ~mask;
1274             init_boot_pages(map_s, map_e);
1275         }
1276 
1277         if ( map_s > map_e )
1278             map_s = map_e = s;
1279 
1280         /* Create new mappings /before/ passing memory to the allocator. */
1281         if ( map_e < e )
1282         {
1283             uint64_t limit = __pa(HYPERVISOR_VIRT_END - 1) + 1;
1284             uint64_t end = min(e, limit);
1285 
1286             if ( map_e < end )
1287             {
1288                 map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
1289                                  PFN_DOWN(end - map_e), PAGE_HYPERVISOR);
1290                 init_boot_pages(map_e, end);
1291                 map_e = end;
1292             }
1293         }
1294         if ( map_e < e )
1295         {
1296             /* This range must not be passed to the boot allocator and
1297              * must also not be mapped with _PAGE_GLOBAL. */
1298             map_pages_to_xen((unsigned long)__va(map_e), PFN_DOWN(map_e),
1299                              PFN_DOWN(e - map_e), __PAGE_HYPERVISOR_RW);
1300         }
1301         if ( s < map_s )
1302         {
1303             map_pages_to_xen((unsigned long)__va(s), s >> PAGE_SHIFT,
1304                              (map_s - s) >> PAGE_SHIFT, PAGE_HYPERVISOR);
1305             init_boot_pages(s, map_s);
1306         }
1307     }
1308 
1309     for ( i = 0; i < mbi->mods_count; ++i )
1310     {
1311         set_pdx_range(mod[i].mod_start,
1312                       mod[i].mod_start + PFN_UP(mod[i].mod_end));
1313         map_pages_to_xen((unsigned long)mfn_to_virt(mod[i].mod_start),
1314                          mod[i].mod_start,
1315                          PFN_UP(mod[i].mod_end), PAGE_HYPERVISOR);
1316     }
1317 
1318 #ifdef CONFIG_KEXEC
1319     if ( kexec_crash_area.size )
1320     {
1321         unsigned long s = PFN_DOWN(kexec_crash_area.start);
1322         unsigned long e = min(s + PFN_UP(kexec_crash_area.size),
1323                               PFN_UP(__pa(HYPERVISOR_VIRT_END - 1)));
1324 
1325         if ( e > s )
1326             map_pages_to_xen((unsigned long)__va(kexec_crash_area.start),
1327                              s, e - s, PAGE_HYPERVISOR);
1328     }
1329 #endif
1330 
1331     xen_virt_end = ((unsigned long)_end + (1UL << L2_PAGETABLE_SHIFT) - 1) &
1332                    ~((1UL << L2_PAGETABLE_SHIFT) - 1);
1333     destroy_xen_mappings(xen_virt_end, XEN_VIRT_START + BOOTSTRAP_MAP_BASE);
1334 
1335     /*
1336      * If not using 2M mappings to gain suitable pagetable permissions
1337      * directly from the relocation above, remap the code/data
1338      * sections with decreased permissions.
1339      */
1340     if ( !using_2M_mapping() )
1341     {
1342         /* Mark .text as RX (avoiding the first 2M superpage). */
1343         modify_xen_mappings(XEN_VIRT_START + MB(2),
1344                             (unsigned long)&__2M_text_end,
1345                             PAGE_HYPERVISOR_RX);
1346 
1347         /* Mark .rodata as RO. */
1348         modify_xen_mappings((unsigned long)&__2M_rodata_start,
1349                             (unsigned long)&__2M_rodata_end,
1350                             PAGE_HYPERVISOR_RO);
1351 
1352         /* Mark .data and .bss as RW. */
1353         modify_xen_mappings((unsigned long)&__2M_rwdata_start,
1354                             (unsigned long)&__2M_rwdata_end,
1355                             PAGE_HYPERVISOR_RW);
1356 
1357         /* Drop the remaining mappings in the shattered superpage. */
1358         destroy_xen_mappings((unsigned long)&__2M_rwdata_end,
1359                              ROUNDUP((unsigned long)&__2M_rwdata_end, MB(2)));
1360     }
1361 
1362     nr_pages = 0;
1363     for ( i = 0; i < e820.nr_map; i++ )
1364         if ( e820.map[i].type == E820_RAM )
1365             nr_pages += e820.map[i].size >> PAGE_SHIFT;
1366     printk("System RAM: %luMB (%lukB)\n",
1367            nr_pages >> (20 - PAGE_SHIFT),
1368            nr_pages << (PAGE_SHIFT - 10));
1369     total_pages = nr_pages;
1370 
1371     /* Sanity check for unwanted bloat of certain hypercall structures. */
1372     BUILD_BUG_ON(sizeof(((struct xen_platform_op *)0)->u) !=
1373                  sizeof(((struct xen_platform_op *)0)->u.pad));
1374     BUILD_BUG_ON(sizeof(((struct xen_domctl *)0)->u) !=
1375                  sizeof(((struct xen_domctl *)0)->u.pad));
1376     BUILD_BUG_ON(sizeof(((struct xen_sysctl *)0)->u) !=
1377                  sizeof(((struct xen_sysctl *)0)->u.pad));
1378 
1379     BUILD_BUG_ON(sizeof(start_info_t) > PAGE_SIZE);
1380     BUILD_BUG_ON(sizeof(shared_info_t) > PAGE_SIZE);
1381     BUILD_BUG_ON(sizeof(struct vcpu_info) != 64);
1382 
1383     BUILD_BUG_ON(sizeof(((struct compat_platform_op *)0)->u) !=
1384                  sizeof(((struct compat_platform_op *)0)->u.pad));
1385     BUILD_BUG_ON(sizeof(start_info_compat_t) > PAGE_SIZE);
1386     BUILD_BUG_ON(sizeof(struct compat_vcpu_info) != 64);
1387 
1388     /* Check definitions in public headers match internal defs. */
1389     BUILD_BUG_ON(__HYPERVISOR_VIRT_START != HYPERVISOR_VIRT_START);
1390     BUILD_BUG_ON(__HYPERVISOR_VIRT_END   != HYPERVISOR_VIRT_END);
1391     BUILD_BUG_ON(MACH2PHYS_VIRT_START != RO_MPT_VIRT_START);
1392     BUILD_BUG_ON(MACH2PHYS_VIRT_END   != RO_MPT_VIRT_END);
1393 
1394     init_frametable();
1395 
1396     if ( !acpi_boot_table_init_done )
1397         acpi_boot_table_init();
1398 
1399     acpi_numa_init();
1400 
1401     numa_initmem_init(0, raw_max_page);
1402 
1403     if ( max_page - 1 > virt_to_mfn(HYPERVISOR_VIRT_END - 1) )
1404     {
1405         unsigned long limit = virt_to_mfn(HYPERVISOR_VIRT_END - 1);
1406         uint64_t mask = PAGE_SIZE - 1;
1407 
1408         if ( !highmem_start )
1409             xenheap_max_mfn(limit);
1410 
1411         end_boot_allocator();
1412 
1413         /* Pass the remaining memory to the allocator. */
1414         for ( i = 0; i < boot_e820.nr_map; i++ )
1415         {
1416             uint64_t s, e;
1417 
1418             if ( boot_e820.map[i].type != E820_RAM )
1419                 continue;
1420             s = (boot_e820.map[i].addr + mask) & ~mask;
1421             e = (boot_e820.map[i].addr + boot_e820.map[i].size) & ~mask;
1422             if ( PFN_DOWN(e) <= limit )
1423                 continue;
1424             if ( PFN_DOWN(s) <= limit )
1425                 s = pfn_to_paddr(limit + 1);
1426             init_domheap_pages(s, e);
1427         }
1428 
1429         if ( tmem_enabled() )
1430         {
1431            printk(XENLOG_WARNING
1432                   "TMEM physical RAM limit exceeded, disabling TMEM\n");
1433            tmem_disable();
1434         }
1435     }
1436     else
1437         end_boot_allocator();
1438 
1439     system_state = SYS_STATE_boot;
1440     /*
1441      * No calls involving ACPI code should go between the setting of
1442      * SYS_STATE_boot and vm_init() (or else acpi_os_{,un}map_memory()
1443      * will break).
1444      */
1445     vm_init();
1446 
1447     console_init_ring();
1448     vesa_init();
1449 
1450     softirq_init();
1451     tasklet_subsys_init();
1452 
1453     early_cpu_init();
1454 
1455     paging_init();
1456 
1457     tboot_probe();
1458 
1459     open_softirq(NEW_TLBFLUSH_CLOCK_PERIOD_SOFTIRQ, new_tlbflush_clock_period);
1460 
1461     if ( opt_watchdog )
1462         nmi_watchdog = NMI_LOCAL_APIC;
1463 
1464     find_smp_config();
1465 
1466     dmi_scan_machine();
1467 
1468     generic_apic_probe();
1469 
1470     acpi_boot_init();
1471 
1472     if ( smp_found_config )
1473         get_smp_config();
1474 
1475     if ( opt_nosmp )
1476     {
1477         max_cpus = 0;
1478         set_nr_cpu_ids(1);
1479     }
1480     else
1481     {
1482         set_nr_cpu_ids(max_cpus);
1483         max_cpus = nr_cpu_ids;
1484     }
1485 
1486     if ( xen_guest )
1487         hypervisor_setup();
1488 
1489     /* Low mappings were only needed for some BIOS table parsing. */
1490     zap_low_mappings();
1491 
1492     mmio_ro_ranges = rangeset_new(NULL, "r/o mmio ranges",
1493                                   RANGESETF_prettyprint_hex);
1494 
1495     init_apic_mappings();
1496 
1497     normalise_cpu_order();
1498 
1499     init_cpu_to_node();
1500 
1501     x2apic_bsp_setup();
1502 
1503     init_IRQ();
1504 
1505     module_map = xmalloc_array(unsigned long, BITS_TO_LONGS(mbi->mods_count));
1506     bitmap_fill(module_map, mbi->mods_count);
1507     __clear_bit(0, module_map); /* Dom0 kernel is always first */
1508 
1509     xsm_multiboot_init(module_map, mbi, bootstrap_map);
1510 
1511     microcode_grab_module(module_map, mbi, bootstrap_map);
1512 
1513     timer_init();
1514 
1515     early_microcode_init();
1516 
1517     identify_cpu(&boot_cpu_data);
1518 
1519     set_in_cr4(X86_CR4_OSFXSR | X86_CR4_OSXMMEXCPT);
1520 
1521     if ( !opt_smep )
1522         setup_clear_cpu_cap(X86_FEATURE_SMEP);
1523     if ( cpu_has_smep && opt_smep != SMEP_HVM_ONLY )
1524         setup_force_cpu_cap(X86_FEATURE_XEN_SMEP);
1525     if ( boot_cpu_has(X86_FEATURE_XEN_SMEP) )
1526         set_in_cr4(X86_CR4_SMEP);
1527 
1528     if ( !opt_smap )
1529         setup_clear_cpu_cap(X86_FEATURE_SMAP);
1530     if ( cpu_has_smap && opt_smap != SMAP_HVM_ONLY )
1531         setup_force_cpu_cap(X86_FEATURE_XEN_SMAP);
1532     if ( boot_cpu_has(X86_FEATURE_XEN_SMAP) )
1533         set_in_cr4(X86_CR4_SMAP);
1534 
1535     cr4_pv32_mask = mmu_cr4_features & XEN_CR4_PV32_BITS;
1536 
1537     if ( cpu_has_fsgsbase )
1538         set_in_cr4(X86_CR4_FSGSBASE);
1539 
1540     init_idle_domain();
1541 
    this_cpu(stubs.addr) = alloc_stub_page(smp_processor_id(),
                                           &this_cpu(stubs).mfn);
    BUG_ON(!this_cpu(stubs.addr));

    trap_init();

    rcu_init();

    early_time_init();

    arch_init_memory();

    alternative_instructions();

    local_irq_enable();

    pt_pci_init();

    vesa_mtrr_init();

    acpi_mmcfg_init();

    early_msi_init();

    iommu_setup();    /* Set up the IOMMU if available. */

    smp_prepare_cpus(max_cpus);

    spin_debug_enable();

    /*
     * Initialise higher-level timer functions. We do this fairly late
     * (after interrupts got enabled) because the time bases and scale
     * factors need to be updated regularly.
     */
    init_xen_time();

    initialize_keytable();

    console_init_postirq();

    system_state = SYS_STATE_smp_boot;

    do_presmp_initcalls();

    /*
     * NB: when running as a PV shim, VCPUOP_up/down is wired to the shim's
     * physical cpu_add/remove functions, so launch the guest with only
     * the BSP online and let it bring up the other CPUs as required.
     */
    if ( !pv_shim )
    {
        for_each_present_cpu ( i )
        {
            /* Set up cpu_to_node[]. */
            srat_detect_node(i);
            /* Set up node_to_cpumask based on cpu_to_node[]. */
            numa_add_cpu(i);

            if ( (num_online_cpus() < max_cpus) && !cpu_online(i) )
            {
                int ret = cpu_up(i);

                if ( ret != 0 )
                    printk("Failed to bring up CPU %u (error %d)\n", i, ret);
            }
        }
    }

    printk("Brought up %ld CPUs\n", (long)num_online_cpus());
    smp_cpus_done();

    do_initcalls();

    if ( opt_watchdog )
        watchdog_setup();

    if ( !tboot_protect_mem_regions() )
        panic("Could not protect TXT memory regions");

    init_guest_cpuid();
    init_guest_msr_policy();

    if ( dom0_pvh )
    {
        domcr_flags |= DOMCRF_hvm |
                       ((hvm_funcs.hap_supported && !opt_dom0_shadow) ?
                         DOMCRF_hap : 0);
        config.emulation_flags = XEN_X86_EMU_LAPIC | XEN_X86_EMU_IOAPIC;
    }

    /* Create initial domain 0. */
    dom0 = domain_create(get_initial_domain_id(), domcr_flags, 0, &config);
    if ( IS_ERR(dom0) || (alloc_dom0_vcpu0(dom0) == NULL) )
        panic("Error creating domain 0");

    if ( !pv_shim )
        dom0->is_privileged = 1;
    dom0->target = NULL;

    /* Grab the DOM0 command line. */
    cmdline = (char *)(mod[0].string ? __va(mod[0].string) : NULL);
    if ( (cmdline != NULL) || (kextra != NULL) )
    {
        static char __initdata dom0_cmdline[MAX_GUEST_CMDLINE];

        cmdline = cmdline_cook(cmdline, loader);
        safe_strcpy(dom0_cmdline, cmdline);

        if ( kextra != NULL )
            /* kextra always includes exactly one leading space. */
            safe_strcat(dom0_cmdline, kextra);

        /* Append any extra parameters. */
        if ( skip_ioapic_setup && !strstr(dom0_cmdline, "noapic") )
            safe_strcat(dom0_cmdline, " noapic");
        if ( (strlen(acpi_param) == 0) && acpi_disabled )
        {
            printk("ACPI is disabled, notifying Domain 0 (acpi=off)\n");
            safe_strcpy(acpi_param, "off");
        }
        if ( (strlen(acpi_param) != 0) && !strstr(dom0_cmdline, "acpi=") )
        {
            safe_strcat(dom0_cmdline, " acpi=");
            safe_strcat(dom0_cmdline, acpi_param);
        }

        cmdline = dom0_cmdline;
    }

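    /*
     * With cpuidle active Xen manages C-states itself, so advertise Cx
     * control in the PM bits, allowing Dom0 to upload the relevant ACPI
     * data.
     */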
    if ( xen_cpuidle )
        xen_processor_pmbits |= XEN_PROCESSOR_PM_CX;

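    /*
     * Whatever remains set in module_map was not claimed earlier (XSM
     * policy, microcode); treat the first such module as Dom0's initrd.
     */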
    initrdidx = find_first_bit(module_map, mbi->mods_count);
    if ( bitmap_weight(module_map, mbi->mods_count) > 1 )
        printk(XENLOG_WARNING
               "Multiple initrd candidates, picking module #%u\n",
               initrdidx);

    /*
     * Temporarily clear SMAP in CR4 to allow user accesses in
     * construct_dom0(). This avoids a large number of corner-case
     * interactions with copy_from_user().
     */
    if ( cpu_has_smap )
    {
        cr4_pv32_mask &= ~X86_CR4_SMAP;
        write_cr4(read_cr4() & ~X86_CR4_SMAP);
    }

    printk("%sNX (Execute Disable) protection %sactive\n",
           cpu_has_nx ? XENLOG_INFO : XENLOG_WARNING "Warning: ",
           cpu_has_nx ? "" : "not ");

    /*
     * We're going to set up domain0 using the module(s) that we stashed
     * safely above our heap. The second module, if present, is an initrd
     * ramdisk.
     */
    if ( construct_dom0(dom0, mod, modules_headroom,
                        (initrdidx > 0) && (initrdidx < mbi->mods_count)
                        ? mod + initrdidx : NULL,
                        bootstrap_map, cmdline) != 0 )
        panic("Could not set up DOM0 guest OS");

    if ( cpu_has_smap )
    {
        write_cr4(read_cr4() | X86_CR4_SMAP);
        cr4_pv32_mask |= X86_CR4_SMAP;
    }

    heap_init_late();

    init_trace_bufs();

    init_constructors();

    console_endboot();

    /* Hide UART from DOM0 if we're using it */
    serial_endboot();

    dmi_end_boot();

    setup_io_bitmap(dom0);

    /*
     * Jump to the 1:1 virtual mappings of cpu0_stack; reinit_bsp_stack()
     * carries on from there and does not return here.
     */
    asm volatile ("mov %[stk], %%rsp; jmp %c[fn]" ::
                  [stk] "g" (__va(__pa(get_stack_bottom()))),
                  [fn] "i" (reinit_bsp_stack) : "memory");
    unreachable();
}

void arch_get_xen_caps(xen_capabilities_info_t *info)
{
    /* Interface name is always xen-3.0-* for Xen-3.x. */
    int major = 3, minor = 0;
    char s[32];

    (*info)[0] = '\0';

    snprintf(s, sizeof(s), "xen-%d.%d-x86_64 ", major, minor);
    safe_strcat(*info, s);
    snprintf(s, sizeof(s), "xen-%d.%d-x86_32p ", major, minor);
    safe_strcat(*info, s);
    if ( hvm_enabled )
    {
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32 ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_32p ", major, minor);
        safe_strcat(*info, s);
        snprintf(s, sizeof(s), "hvm-%d.%d-x86_64 ", major, minor);
        safe_strcat(*info, s);
    }
}

int __hwdom_init xen_in_range(unsigned long mfn)
{
    paddr_t start, end;
    int i;

    enum { region_s3, region_ro, region_rw, nr_regions };
    static struct {
        paddr_t s, e;
    } xen_regions[nr_regions] __hwdom_initdata;

    /* Initialise the regions the first time through. */
    if ( !xen_regions[0].s )
    {
        /* S3 resume code (and other real mode trampoline code) */
        xen_regions[region_s3].s = bootsym_phys(trampoline_start);
        xen_regions[region_s3].e = bootsym_phys(trampoline_end);

        /*
         * This needs to remain in sync with the uses of the same symbols in
         * - __start_xen() (above)
         * - is_xen_fixed_mfn()
         * - tboot_shutdown()
         */

        /* hypervisor .text + .rodata */
        xen_regions[region_ro].s = __pa(&_stext);
        xen_regions[region_ro].e = __pa(&__2M_rodata_end);
        /* hypervisor .data + .bss */
        xen_regions[region_rw].s = __pa(&__2M_rwdata_start);
        xen_regions[region_rw].e = __pa(&__2M_rwdata_end);
    }

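    /* Interval overlap test of the page against each Xen region. */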
    start = (paddr_t)mfn << PAGE_SHIFT;
    end = start + PAGE_SIZE;
    for ( i = 0; i < nr_regions; i++ )
        if ( (start < xen_regions[i].e) && (end > xen_regions[i].s) )
            return 1;

    return 0;
}

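/*
 * rangeset_report_ranges() callback: ports in [s, e] are ones the domain
 * may access directly, so clear their intercept bits in the I/O bitmap.
 */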
static int __hwdom_init io_bitmap_cb(unsigned long s, unsigned long e,
                                     void *ctx)
{
    struct domain *d = ctx;
    unsigned int i;

    ASSERT(e <= INT_MAX);
    for ( i = s; i <= e; i++ )
        __clear_bit(i, d->arch.hvm_domain.io_bitmap);

    return 0;
}

void __hwdom_init setup_io_bitmap(struct domain *d)
{
    int rc;

    if ( is_hvm_domain(d) )
    {
        bitmap_fill(d->arch.hvm_domain.io_bitmap, 0x10000);
        rc = rangeset_report_ranges(d->arch.ioport_caps, 0, 0x10000,
                                    io_bitmap_cb, d);
        BUG_ON(rc);
        /*
         * NB: we need to trap accesses to 0xcf8 in order to intercept
         * 4-byte accesses, which must be handled by Xen in order to
         * keep consistency.
         * Accesses to the 1-byte RTC ports also need to be trapped in
         * order to keep consistency with PV.
         */
        __set_bit(0xcf8, d->arch.hvm_domain.io_bitmap);
        __set_bit(RTC_PORT(0), d->arch.hvm_domain.io_bitmap);
        __set_bit(RTC_PORT(1), d->arch.hvm_domain.io_bitmap);
    }
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */