1 /******************************************************************************
2  * dom0_build.c
3  *
4  * Copyright (c) 2002-2005, K A Fraser
5  */
6 
7 #include <xen/init.h>
8 #include <xen/iocap.h>
9 #include <xen/libelf.h>
10 #include <xen/pfn.h>
11 #include <xen/sched.h>
12 #include <xen/sched-if.h>
13 #include <xen/softirq.h>
14 
15 #include <asm/dom0_build.h>
16 #include <asm/guest.h>
17 #include <asm/hpet.h>
18 #include <asm/io_apic.h>
19 #include <asm/p2m.h>
20 #include <asm/setup.h>
21 
/*
 * Page counts set by the "dom0_mem" command line parser (parse_dom0_mem()).
 * Negative values are resolved against available memory later, in
 * dom0_compute_nr_pages().
 */
/* <amt>: requested allocation; 0 means "not specified". */
static long __initdata dom0_nrpages;
/* min:<min_amt>: lower bound for the allocation. */
static long __initdata dom0_min_nrpages;
/* max:<max_amt>: upper bound for the allocation; unlimited by default. */
static long __initdata dom0_max_nrpages = LONG_MAX;
25 
26 /*
27  * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
28  *
29  * <min_amt>: The minimum amount of memory which should be allocated for dom0.
30  * <max_amt>: The maximum amount of memory which should be allocated for dom0.
31  * <amt>:     The precise amount of memory to allocate for dom0.
32  *
33  * Notes:
34  *  1. <amt> is clamped from below by <min_amt> and from above by available
35  *     memory and <max_amt>
36  *  2. <min_amt> is clamped from above by available memory and <max_amt>
37  *  3. <min_amt> is ignored if it is greater than <max_amt>
38  *  4. If <amt> is not specified, it is calculated as follows:
39  *     "All of memory is allocated to domain 0, minus 1/16th which is reserved
40  *      for uses such as DMA buffers (the reservation is clamped to 128MB)."
41  *
42  * Each value can be specified as positive or negative:
43  *  If +ve: The specified amount is an absolute value.
44  *  If -ve: The specified amount is subtracted from total available memory.
45  */
parse_amt(const char * s,const char ** ps)46 static long __init parse_amt(const char *s, const char **ps)
47 {
48     long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
49     return (*s == '-') ? -pages : pages;
50 }
51 
parse_dom0_mem(const char * s)52 static int __init parse_dom0_mem(const char *s)
53 {
54     /* xen-shim uses shim_mem parameter instead of dom0_mem */
55     if ( pv_shim )
56     {
57         printk("Ignoring dom0_mem param in pv-shim mode\n");
58         return 0;
59     }
60 
61     do {
62         if ( !strncmp(s, "min:", 4) )
63             dom0_min_nrpages = parse_amt(s+4, &s);
64         else if ( !strncmp(s, "max:", 4) )
65             dom0_max_nrpages = parse_amt(s+4, &s);
66         else
67             dom0_nrpages = parse_amt(s, &s);
68     } while ( *s++ == ',' );
69 
70     return s[-1] ? -EINVAL : 0;
71 }
72 custom_param("dom0_mem", parse_dom0_mem);
73 
74 static unsigned int __initdata opt_dom0_max_vcpus_min = 1;
75 static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX;
76 
parse_dom0_max_vcpus(const char * s)77 static int __init parse_dom0_max_vcpus(const char *s)
78 {
79     if ( *s == '-' )                   /* -M */
80         opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0);
81     else                               /* N, N-, or N-M */
82     {
83         opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0);
84         if ( opt_dom0_max_vcpus_min == 0 )
85             opt_dom0_max_vcpus_min = 1;
86         if ( !*s )                    /* N */
87             opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min;
88         else if ( *s++ == '-' && *s ) /* N-M */
89             opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0);
90     }
91 
92     return *s ? -EINVAL : 0;
93 }
94 custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);
95 
96 static __initdata unsigned int dom0_nr_pxms;
97 static __initdata unsigned int dom0_pxms[MAX_NUMNODES] =
98     { [0 ... MAX_NUMNODES - 1] = ~0 };
99 static __initdata bool dom0_affinity_relaxed;
100 
parse_dom0_nodes(const char * s)101 static int __init parse_dom0_nodes(const char *s)
102 {
103     do {
104         if ( isdigit(*s) )
105         {
106             if ( dom0_nr_pxms >= ARRAY_SIZE(dom0_pxms) )
107                 return -E2BIG;
108             dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0);
109             if ( !*s || *s == ',' )
110                 ++dom0_nr_pxms;
111         }
112         else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') )
113         {
114             dom0_affinity_relaxed = true;
115             s += 7;
116         }
117         else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') )
118         {
119             dom0_affinity_relaxed = false;
120             s += 6;
121         }
122         else
123             return -EINVAL;
124     } while ( *s++ == ',' );
125 
126     return s[-1] ? -EINVAL : 0;
127 }
128 custom_param("dom0_nodes", parse_dom0_nodes);
129 
130 static cpumask_t __initdata dom0_cpus;
131 
dom0_setup_vcpu(struct domain * d,unsigned int vcpu_id,unsigned int prev_cpu)132 struct vcpu *__init dom0_setup_vcpu(struct domain *d,
133                                     unsigned int vcpu_id,
134                                     unsigned int prev_cpu)
135 {
136     unsigned int cpu = cpumask_cycle(prev_cpu, &dom0_cpus);
137     struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu);
138 
139     if ( v )
140     {
141         if ( pv_shim )
142         {
143 
144             cpumask_setall(v->cpu_hard_affinity);
145             cpumask_setall(v->cpu_soft_affinity);
146         }
147         else
148         {
149             if ( !d->is_pinned && !dom0_affinity_relaxed )
150                 cpumask_copy(v->cpu_hard_affinity, &dom0_cpus);
151             cpumask_copy(v->cpu_soft_affinity, &dom0_cpus);
152         }
153     }
154 
155     return v;
156 }
157 
158 static nodemask_t __initdata dom0_nodes;
159 
dom0_max_vcpus(void)160 unsigned int __init dom0_max_vcpus(void)
161 {
162     unsigned int i, max_vcpus, limit;
163     nodeid_t node;
164 
165     if ( pv_shim )
166     {
167         nodes_setall(dom0_nodes);
168 
169         /*
170          * When booting in shim mode APs are not started until the guest brings
171          * other vCPUs up.
172          */
173         cpumask_set_cpu(0, &dom0_cpus);
174 
175         /* On PV shim mode allow the guest to have as many CPUs as available. */
176         return nr_cpu_ids;
177     }
178 
179 
180     for ( i = 0; i < dom0_nr_pxms; ++i )
181         if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE )
182             node_set(node, dom0_nodes);
183     nodes_and(dom0_nodes, dom0_nodes, node_online_map);
184     if ( nodes_empty(dom0_nodes) )
185         dom0_nodes = node_online_map;
186     for_each_node_mask ( node, dom0_nodes )
187         cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
188     cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid);
189     if ( cpumask_empty(&dom0_cpus) )
190         cpumask_copy(&dom0_cpus, cpupool0->cpu_valid);
191 
192     max_vcpus = cpumask_weight(&dom0_cpus);
193     if ( opt_dom0_max_vcpus_min > max_vcpus )
194         max_vcpus = opt_dom0_max_vcpus_min;
195     if ( opt_dom0_max_vcpus_max < max_vcpus )
196         max_vcpus = opt_dom0_max_vcpus_max;
197     limit = dom0_pvh ? HVM_MAX_VCPUS : MAX_VIRT_CPUS;
198     if ( max_vcpus > limit )
199         max_vcpus = limit;
200 
201     return max_vcpus;
202 }
203 
alloc_dom0_vcpu0(struct domain * dom0)204 struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0)
205 {
206     unsigned int max_vcpus = dom0_max_vcpus();
207 
208     dom0->node_affinity = dom0_nodes;
209     dom0->auto_node_affinity = !dom0_nr_pxms;
210 
211     dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus);
212     if ( !dom0->vcpu )
213         return NULL;
214     dom0->max_vcpus = max_vcpus;
215 
216     return dom0_setup_vcpu(dom0, 0,
217                            cpumask_last(&dom0_cpus) /* so it wraps around to first pcpu */);
218 }
219 
220 #ifdef CONFIG_SHADOW_PAGING
221 bool __initdata opt_dom0_shadow;
222 #endif
223 bool __initdata dom0_pvh;
224 
225 /*
226  * List of parameters that affect Dom0 creation:
227  *
228  *  - pvh               Create a PVHv2 Dom0.
229  *  - shadow            Use shadow paging for Dom0.
230  */
parse_dom0_param(const char * s)231 static int __init parse_dom0_param(const char *s)
232 {
233     const char *ss;
234     int rc = 0;
235 
236     do {
237 
238         ss = strchr(s, ',');
239         if ( !ss )
240             ss = strchr(s, '\0');
241 
242         if ( !strncmp(s, "pvh", ss - s) )
243             dom0_pvh = true;
244 #ifdef CONFIG_SHADOW_PAGING
245         else if ( !strncmp(s, "shadow", ss - s) )
246             opt_dom0_shadow = true;
247 #endif
248         else
249             rc = -EINVAL;
250 
251         s = ss + 1;
252     } while ( *ss );
253 
254     return rc;
255 }
256 custom_param("dom0", parse_dom0_param);
257 
/*
 * "dom0_ioports_disable": comma separated list of hexadecimal I/O ports or
 * <from>-<to> port ranges to which dom0 is denied access. Parsed by
 * process_dom0_ioports_disable().
 */
static char __initdata opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

/*
 * "ro-hpet": when set (the default) and the HPET flags request neither 4k nor
 * 64k page protection, dom0_setup_permissions() adds the HPET page to
 * mmio_ro_ranges.
 */
static bool __initdata ro_hpet = true;
boolean_param("ro-hpet", ro_hpet);

/*
 * Memory allocation flags for dom0. Not referenced in the visible part of
 * this file - presumably consumed by the PV/PVH domain builders; confirm
 * against the callers.
 */
unsigned int __initdata dom0_memflags = MEMF_no_dma|MEMF_exact_node;
265 
dom0_paging_pages(const struct domain * d,unsigned long nr_pages)266 unsigned long __init dom0_paging_pages(const struct domain *d,
267                                        unsigned long nr_pages)
268 {
269     /* Copied from: libxl_get_required_shadow_memory() */
270     unsigned long memkb = nr_pages * (PAGE_SIZE / 1024);
271 
272     memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
273 
274     return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
275 }
276 
/*
 * Compute the number of pages to allocate to dom0 initially, and set
 * d->max_pages from the "dom0_mem" maximum.
 *
 * d:          the domain being built.
 * parms:      ELF parameters of the dom0 kernel (p2m_base, virt_base,
 *             virt_kend and unmapped_initrd are read).
 * initrd_len: length of the initrd in bytes.
 *
 * Returns the number of pages for the initial allocation.
 */
unsigned long __init dom0_compute_nr_pages(
    struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
{
    nodeid_t node;
    unsigned long avail = 0, nr_pages, min_pages, max_pages;
    bool need_paging;

    /* Sum up what is available on the nodes dom0 is restricted to. */
    for_each_node_mask ( node, dom0_nodes )
        avail += avail_domheap_pages_region(node, 0, 0) +
                 initial_images_nrpages(node);

    /* Reserve memory for further dom0 vcpu-struct allocations... */
    avail -= (d->max_vcpus - 1UL)
             << get_order_from_bytes(sizeof(struct vcpu));
    /* ...and compat_l4's, if needed. */
    if ( is_pv_32bit_domain(d) )
        avail -= d->max_vcpus - 1;

    /* Reserve memory for iommu_dom0_init() (rough estimate). */
    if ( iommu_enabled )
    {
        unsigned int s;

        /*
         * Subtract max_pdx / 512, max_pdx / 512^2, ... - presumably one
         * page-table page per 512 entries at each level; confirm against
         * the IOMMU code.
         */
        for ( s = 9; s < BITS_PER_LONG; s += 9 )
            avail -= max_pdx >> s;
    }

    need_paging = is_hvm_domain(d) &&
        (!iommu_hap_pt_share || !paging_mode_hap(d));
    /*
     * The body runs once when no paging pool is needed. Otherwise it runs
     * twice: the first pass yields a preliminary nr_pages used to size the
     * paging pool, which is taken off avail before the second, final pass.
     */
    for ( ; ; need_paging = false )
    {
        nr_pages = dom0_nrpages;
        min_pages = dom0_min_nrpages;
        max_pages = dom0_max_nrpages;

        /*
         * If allocation isn't specified, reserve 1/16th of available memory
         * for things like DMA buffers. This reservation is clamped to a
         * maximum of 128MB.
         */
        if ( !nr_pages )
            nr_pages = -(pv_shim ? pv_shim_mem(avail)
                                 : min(avail / 16, 128UL << (20 - PAGE_SHIFT)));

        /* Negative specification means "all memory - specified amount". */
        if ( (long)nr_pages  < 0 ) nr_pages  += avail;
        if ( (long)min_pages < 0 ) min_pages += avail;
        if ( (long)max_pages < 0 ) max_pages += avail;

        /* Clamp according to min/max limits and available memory. */
        nr_pages = max(nr_pages, min_pages);
        nr_pages = min(nr_pages, max_pages);
        nr_pages = min(nr_pages, avail);

        if ( !need_paging )
            break;

        /* Reserve memory for shadow or HAP. */
        avail -= dom0_paging_pages(d, nr_pages);
    }

    /*
     * Only consider clipping for PV kernels without a p2m base, when no
     * explicit positive <amt> was given, and when either no positive minimum
     * was given or there is room above the minimum to clip.
     */
    if ( is_pv_domain(d) &&
         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
         ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) )
    {
        /*
         * Legacy Linux kernels (i.e. such without a XEN_ELFNOTE_INIT_P2M
         * note) require that there is enough virtual space beyond the initial
         * allocation to set up their initial page tables. This space is
         * roughly the same size as the p2m table, so make sure the initial
         * allocation doesn't consume more than about half the space that's
         * available between params.virt_base and the address space end.
         */
        unsigned long vstart, vend, end;
        /* sizeof(int) for a 32-bit PV guest, sizeof(long) otherwise. */
        size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long);

        vstart = parms->virt_base;
        vend = round_pgup(parms->virt_kend);
        if ( !parms->unmapped_initrd )
            vend += round_pgup(initrd_len);
        /* One sizeof_long-sized entry per page on top of kernel (+ initrd). */
        end = vend + nr_pages * sizeof_long;

        /* Double the span above vstart, to leave as much space again. */
        if ( end > vstart )
            end += end - vstart;
        /*
         * Clip if the computation wrapped around, or if the result lies
         * beyond the end of a guest address space narrower than ours.
         */
        if ( end <= vstart ||
             (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
        {
            /* Address space end; 0 stands for 2^BITS_PER_LONG here. */
            end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
            nr_pages = (end - vend) / (2 * sizeof_long);
            /* An explicit positive minimum still takes precedence. */
            if ( dom0_min_nrpages > 0 && nr_pages < min_pages )
                nr_pages = min_pages;
            printk("Dom0 memory clipped to %lu pages\n", nr_pages);
        }
    }

    /* The domain's page limit is the resolved maximum, capped at UINT_MAX. */
    d->max_pages = min_t(unsigned long, max_pages, UINT_MAX);

    return nr_pages;
}
376 
process_dom0_ioports_disable(struct domain * dom0)377 static void __init process_dom0_ioports_disable(struct domain *dom0)
378 {
379     unsigned long io_from, io_to;
380     char *t, *s = opt_dom0_ioports_disable;
381     const char *u;
382 
383     if ( *s == '\0' )
384         return;
385 
386     while ( (t = strsep(&s, ",")) != NULL )
387     {
388         io_from = simple_strtoul(t, &u, 16);
389         if ( u == t )
390         {
391         parse_error:
392             printk("Invalid ioport range <%s> "
393                    "in dom0_ioports_disable, skipping\n", t);
394             continue;
395         }
396 
397         if ( *u == '\0' )
398             io_to = io_from;
399         else if ( *u == '-' )
400             io_to = simple_strtoul(u + 1, &u, 16);
401         else
402             goto parse_error;
403 
404         if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
405             goto parse_error;
406 
407         printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
408             io_from, io_to);
409 
410         if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
411             BUG();
412     }
413 }
414 
dom0_setup_permissions(struct domain * d)415 int __init dom0_setup_permissions(struct domain *d)
416 {
417     unsigned long mfn;
418     unsigned int i;
419     int rc;
420 
421     if ( pv_shim )
422         return 0;
423 
424     /* The hardware domain is initially permitted full I/O capabilities. */
425     rc = ioports_permit_access(d, 0, 0xFFFF);
426     rc |= iomem_permit_access(d, 0UL, (1UL << (paddr_bits - PAGE_SHIFT)) - 1);
427     rc |= irqs_permit_access(d, 1, nr_irqs_gsi - 1);
428 
429     /* Modify I/O port access permissions. */
430 
431     /* Master Interrupt Controller (PIC). */
432     rc |= ioports_deny_access(d, 0x20, 0x21);
433     /* Slave Interrupt Controller (PIC). */
434     rc |= ioports_deny_access(d, 0xA0, 0xA1);
435     /* Interval Timer (PIT). */
436     rc |= ioports_deny_access(d, 0x40, 0x43);
437     /* PIT Channel 2 / PC Speaker Control. */
438     rc |= ioports_deny_access(d, 0x61, 0x61);
439     /* ACPI PM Timer. */
440     if ( pmtmr_ioport )
441         rc |= ioports_deny_access(d, pmtmr_ioport, pmtmr_ioport + 3);
442     /* PCI configuration space (NB. 0xcf8 has special treatment). */
443     rc |= ioports_deny_access(d, 0xcfc, 0xcff);
444     /* Command-line I/O ranges. */
445     process_dom0_ioports_disable(d);
446 
447     /* Modify I/O memory access permissions. */
448 
449     /* Local APIC. */
450     if ( mp_lapic_addr != 0 )
451     {
452         mfn = paddr_to_pfn(mp_lapic_addr);
453         rc |= iomem_deny_access(d, mfn, mfn);
454     }
455     /* I/O APICs. */
456     for ( i = 0; i < nr_ioapics; i++ )
457     {
458         mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
459         if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
460             rc |= iomem_deny_access(d, mfn, mfn);
461     }
462     /* MSI range. */
463     rc |= iomem_deny_access(d, paddr_to_pfn(MSI_ADDR_BASE_LO),
464                             paddr_to_pfn(MSI_ADDR_BASE_LO +
465                                          MSI_ADDR_DEST_ID_MASK));
466     /* HyperTransport range. */
467     if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
468         rc |= iomem_deny_access(d, paddr_to_pfn(0xfdULL << 32),
469                                 paddr_to_pfn((1ULL << 40) - 1));
470 
471     /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
472     for ( i = 0; i < e820.nr_map; i++ )
473     {
474         unsigned long sfn, efn;
475         sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
476         efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
477         if ( (e820.map[i].type == E820_UNUSABLE) &&
478              (e820.map[i].size != 0) &&
479              (sfn <= efn) )
480             rc |= iomem_deny_access(d, sfn, efn);
481     }
482 
483     /* Prevent access to HPET */
484     if ( hpet_address )
485     {
486         u8 prot_flags = hpet_flags & ACPI_HPET_PAGE_PROTECT_MASK;
487 
488         mfn = paddr_to_pfn(hpet_address);
489         if ( prot_flags == ACPI_HPET_PAGE_PROTECT4 )
490             rc |= iomem_deny_access(d, mfn, mfn);
491         else if ( prot_flags == ACPI_HPET_PAGE_PROTECT64 )
492             rc |= iomem_deny_access(d, mfn, mfn + 15);
493         else if ( ro_hpet )
494             rc |= rangeset_add_singleton(mmio_ro_ranges, mfn);
495     }
496 
497     return rc;
498 }
499 
construct_dom0(struct domain * d,const module_t * image,unsigned long image_headroom,module_t * initrd,void * (* bootstrap_map)(const module_t *),char * cmdline)500 int __init construct_dom0(struct domain *d, const module_t *image,
501                           unsigned long image_headroom, module_t *initrd,
502                           void *(*bootstrap_map)(const module_t *),
503                           char *cmdline)
504 {
505     int rc;
506 
507     /* Sanity! */
508     BUG_ON(!pv_shim && d->domain_id != 0);
509     BUG_ON(d->vcpu[0] == NULL);
510     BUG_ON(d->vcpu[0]->is_initialised);
511 
512     process_pending_softirqs();
513 
514 #ifdef CONFIG_SHADOW_PAGING
515     if ( opt_dom0_shadow && !dom0_pvh )
516     {
517         opt_dom0_shadow = false;
518         printk(XENLOG_WARNING "Shadow Dom0 requires PVH. Option ignored.\n");
519     }
520 #endif
521 
522     rc = (is_hvm_domain(d) ? dom0_construct_pvh : dom0_construct_pv)
523          (d, image, image_headroom, initrd, bootstrap_map, cmdline);
524     if ( rc )
525         return rc;
526 
527     /* Sanity! */
528     BUG_ON(!d->vcpu[0]->is_initialised);
529 
530     return 0;
531 }
532 
533 /*
534  * Local variables:
535  * mode: C
536  * c-file-style: "BSD"
537  * c-basic-offset: 4
538  * tab-width: 4
539  * indent-tabs-mode: nil
540  * End:
541  */
542