1 /******************************************************************************
2 * dom0_build.c
3 *
4 * Copyright (c) 2002-2005, K A Fraser
5 */
6
7 #include <xen/init.h>
8 #include <xen/iocap.h>
9 #include <xen/libelf.h>
10 #include <xen/pfn.h>
11 #include <xen/sched.h>
12 #include <xen/sched-if.h>
13 #include <xen/softirq.h>
14
15 #include <asm/dom0_build.h>
16 #include <asm/guest.h>
17 #include <asm/hpet.h>
18 #include <asm/io_apic.h>
19 #include <asm/p2m.h>
20 #include <asm/setup.h>
21
/*
 * Dom0 memory sizing in pages, written by parse_dom0_mem() and read by
 * dom0_compute_nr_pages().  A negative value is interpreted there as
 * "available memory minus this amount"; dom0_nrpages == 0 means no explicit
 * amount was given.  dom0_max_nrpages defaults to LONG_MAX, i.e. no limit.
 */
static long __initdata dom0_nrpages;
static long __initdata dom0_min_nrpages;
static long __initdata dom0_max_nrpages = LONG_MAX;
25
26 /*
27 * dom0_mem=[min:<min_amt>,][max:<max_amt>,][<amt>]
28 *
29 * <min_amt>: The minimum amount of memory which should be allocated for dom0.
30 * <max_amt>: The maximum amount of memory which should be allocated for dom0.
31 * <amt>: The precise amount of memory to allocate for dom0.
32 *
33 * Notes:
34 * 1. <amt> is clamped from below by <min_amt> and from above by available
35 * memory and <max_amt>
36 * 2. <min_amt> is clamped from above by available memory and <max_amt>
37 * 3. <min_amt> is ignored if it is greater than <max_amt>
38 * 4. If <amt> is not specified, it is calculated as follows:
39 * "All of memory is allocated to domain 0, minus 1/16th which is reserved
40 * for uses such as DMA buffers (the reservation is clamped to 128MB)."
41 *
42 * Each value can be specified as positive or negative:
43 * If +ve: The specified amount is an absolute value.
44 * If -ve: The specified amount is subtracted from total available memory.
45 */
parse_amt(const char * s,const char ** ps)46 static long __init parse_amt(const char *s, const char **ps)
47 {
48 long pages = parse_size_and_unit((*s == '-') ? s+1 : s, ps) >> PAGE_SHIFT;
49 return (*s == '-') ? -pages : pages;
50 }
51
parse_dom0_mem(const char * s)52 static int __init parse_dom0_mem(const char *s)
53 {
54 /* xen-shim uses shim_mem parameter instead of dom0_mem */
55 if ( pv_shim )
56 {
57 printk("Ignoring dom0_mem param in pv-shim mode\n");
58 return 0;
59 }
60
61 do {
62 if ( !strncmp(s, "min:", 4) )
63 dom0_min_nrpages = parse_amt(s+4, &s);
64 else if ( !strncmp(s, "max:", 4) )
65 dom0_max_nrpages = parse_amt(s+4, &s);
66 else
67 dom0_nrpages = parse_amt(s, &s);
68 } while ( *s++ == ',' );
69
70 return s[-1] ? -EINVAL : 0;
71 }
72 custom_param("dom0_mem", parse_dom0_mem);
73
/*
 * Lower and upper bounds for the number of dom0 vCPUs, set by
 * parse_dom0_max_vcpus() and applied as clamps in dom0_max_vcpus().
 * The defaults (1 .. UINT_MAX) impose no restriction of their own.
 */
static unsigned int __initdata opt_dom0_max_vcpus_min = 1;
static unsigned int __initdata opt_dom0_max_vcpus_max = UINT_MAX;
76
parse_dom0_max_vcpus(const char * s)77 static int __init parse_dom0_max_vcpus(const char *s)
78 {
79 if ( *s == '-' ) /* -M */
80 opt_dom0_max_vcpus_max = simple_strtoul(s + 1, &s, 0);
81 else /* N, N-, or N-M */
82 {
83 opt_dom0_max_vcpus_min = simple_strtoul(s, &s, 0);
84 if ( opt_dom0_max_vcpus_min == 0 )
85 opt_dom0_max_vcpus_min = 1;
86 if ( !*s ) /* N */
87 opt_dom0_max_vcpus_max = opt_dom0_max_vcpus_min;
88 else if ( *s++ == '-' && *s ) /* N-M */
89 opt_dom0_max_vcpus_max = simple_strtoul(s, &s, 0);
90 }
91
92 return *s ? -EINVAL : 0;
93 }
94 custom_param("dom0_max_vcpus", parse_dom0_max_vcpus);
95
/*
 * State filled in by parse_dom0_nodes() ("dom0_nodes" option):
 *  - dom0_pxms[] / dom0_nr_pxms: the numeric values given on the command
 *    line and how many were accepted; dom0_max_vcpus() translates each one
 *    with pxm_to_node().  Unused slots hold ~0.
 *  - dom0_affinity_relaxed: set by "relaxed", cleared by "strict";
 *    dom0_setup_vcpu() only copies dom0_cpus into a vCPU's hard affinity
 *    when this is false.
 */
static __initdata unsigned int dom0_nr_pxms;
static __initdata unsigned int dom0_pxms[MAX_NUMNODES] =
    { [0 ... MAX_NUMNODES - 1] = ~0 };
static __initdata bool dom0_affinity_relaxed;
100
parse_dom0_nodes(const char * s)101 static int __init parse_dom0_nodes(const char *s)
102 {
103 do {
104 if ( isdigit(*s) )
105 {
106 if ( dom0_nr_pxms >= ARRAY_SIZE(dom0_pxms) )
107 return -E2BIG;
108 dom0_pxms[dom0_nr_pxms] = simple_strtoul(s, &s, 0);
109 if ( !*s || *s == ',' )
110 ++dom0_nr_pxms;
111 }
112 else if ( !strncmp(s, "relaxed", 7) && (!s[7] || s[7] == ',') )
113 {
114 dom0_affinity_relaxed = true;
115 s += 7;
116 }
117 else if ( !strncmp(s, "strict", 6) && (!s[6] || s[6] == ',') )
118 {
119 dom0_affinity_relaxed = false;
120 s += 6;
121 }
122 else
123 return -EINVAL;
124 } while ( *s++ == ',' );
125
126 return s[-1] ? -EINVAL : 0;
127 }
128 custom_param("dom0_nodes", parse_dom0_nodes);
129
/*
 * Physical CPUs dom0's vCPUs are placed on.  Populated by dom0_max_vcpus()
 * and consumed by dom0_setup_vcpu() and alloc_dom0_vcpu0().
 */
static cpumask_t __initdata dom0_cpus;
131
dom0_setup_vcpu(struct domain * d,unsigned int vcpu_id,unsigned int prev_cpu)132 struct vcpu *__init dom0_setup_vcpu(struct domain *d,
133 unsigned int vcpu_id,
134 unsigned int prev_cpu)
135 {
136 unsigned int cpu = cpumask_cycle(prev_cpu, &dom0_cpus);
137 struct vcpu *v = alloc_vcpu(d, vcpu_id, cpu);
138
139 if ( v )
140 {
141 if ( pv_shim )
142 {
143
144 cpumask_setall(v->cpu_hard_affinity);
145 cpumask_setall(v->cpu_soft_affinity);
146 }
147 else
148 {
149 if ( !d->is_pinned && !dom0_affinity_relaxed )
150 cpumask_copy(v->cpu_hard_affinity, &dom0_cpus);
151 cpumask_copy(v->cpu_soft_affinity, &dom0_cpus);
152 }
153 }
154
155 return v;
156 }
157
/*
 * NUMA nodes selected for dom0.  Populated by dom0_max_vcpus(); copied into
 * the domain's node_affinity by alloc_dom0_vcpu0() and iterated by
 * dom0_compute_nr_pages() when summing available memory.
 */
static nodemask_t __initdata dom0_nodes;
159
dom0_max_vcpus(void)160 unsigned int __init dom0_max_vcpus(void)
161 {
162 unsigned int i, max_vcpus, limit;
163 nodeid_t node;
164
165 if ( pv_shim )
166 {
167 nodes_setall(dom0_nodes);
168
169 /*
170 * When booting in shim mode APs are not started until the guest brings
171 * other vCPUs up.
172 */
173 cpumask_set_cpu(0, &dom0_cpus);
174
175 /* On PV shim mode allow the guest to have as many CPUs as available. */
176 return nr_cpu_ids;
177 }
178
179
180 for ( i = 0; i < dom0_nr_pxms; ++i )
181 if ( (node = pxm_to_node(dom0_pxms[i])) != NUMA_NO_NODE )
182 node_set(node, dom0_nodes);
183 nodes_and(dom0_nodes, dom0_nodes, node_online_map);
184 if ( nodes_empty(dom0_nodes) )
185 dom0_nodes = node_online_map;
186 for_each_node_mask ( node, dom0_nodes )
187 cpumask_or(&dom0_cpus, &dom0_cpus, &node_to_cpumask(node));
188 cpumask_and(&dom0_cpus, &dom0_cpus, cpupool0->cpu_valid);
189 if ( cpumask_empty(&dom0_cpus) )
190 cpumask_copy(&dom0_cpus, cpupool0->cpu_valid);
191
192 max_vcpus = cpumask_weight(&dom0_cpus);
193 if ( opt_dom0_max_vcpus_min > max_vcpus )
194 max_vcpus = opt_dom0_max_vcpus_min;
195 if ( opt_dom0_max_vcpus_max < max_vcpus )
196 max_vcpus = opt_dom0_max_vcpus_max;
197 limit = dom0_pvh ? HVM_MAX_VCPUS : MAX_VIRT_CPUS;
198 if ( max_vcpus > limit )
199 max_vcpus = limit;
200
201 return max_vcpus;
202 }
203
alloc_dom0_vcpu0(struct domain * dom0)204 struct vcpu *__init alloc_dom0_vcpu0(struct domain *dom0)
205 {
206 unsigned int max_vcpus = dom0_max_vcpus();
207
208 dom0->node_affinity = dom0_nodes;
209 dom0->auto_node_affinity = !dom0_nr_pxms;
210
211 dom0->vcpu = xzalloc_array(struct vcpu *, max_vcpus);
212 if ( !dom0->vcpu )
213 return NULL;
214 dom0->max_vcpus = max_vcpus;
215
216 return dom0_setup_vcpu(dom0, 0,
217 cpumask_last(&dom0_cpus) /* so it wraps around to first pcpu */);
218 }
219
#ifdef CONFIG_SHADOW_PAGING
/* Set by "dom0=shadow"; cleared again by construct_dom0() if dom0 is not PVH. */
bool __initdata opt_dom0_shadow;
#endif
/* Set by "dom0=pvh". */
bool __initdata dom0_pvh;
224
225 /*
226 * List of parameters that affect Dom0 creation:
227 *
228 * - pvh Create a PVHv2 Dom0.
229 * - shadow Use shadow paging for Dom0.
230 */
parse_dom0_param(const char * s)231 static int __init parse_dom0_param(const char *s)
232 {
233 const char *ss;
234 int rc = 0;
235
236 do {
237
238 ss = strchr(s, ',');
239 if ( !ss )
240 ss = strchr(s, '\0');
241
242 if ( !strncmp(s, "pvh", ss - s) )
243 dom0_pvh = true;
244 #ifdef CONFIG_SHADOW_PAGING
245 else if ( !strncmp(s, "shadow", ss - s) )
246 opt_dom0_shadow = true;
247 #endif
248 else
249 rc = -EINVAL;
250
251 s = ss + 1;
252 } while ( *ss );
253
254 return rc;
255 }
256 custom_param("dom0", parse_dom0_param);
257
/*
 * Raw value of the "dom0_ioports_disable" option; parsed (and modified in
 * place by strsep()) in process_dom0_ioports_disable().
 */
static char __initdata opt_dom0_ioports_disable[200] = "";
string_param("dom0_ioports_disable", opt_dom0_ioports_disable);

/*
 * "ro-hpet" option, enabled by default.  When set, dom0_setup_permissions()
 * adds the HPET page to mmio_ro_ranges, provided the HPET flags select
 * neither ACPI_HPET_PAGE_PROTECT4 nor ACPI_HPET_PAGE_PROTECT64.
 */
static bool __initdata ro_hpet = true;
boolean_param("ro-hpet", ro_hpet);

/*
 * NOTE(review): not referenced elsewhere in this file; presumably the memory
 * allocation flags used by the dom0 builders -- confirm against callers.
 */
unsigned int __initdata dom0_memflags = MEMF_no_dma|MEMF_exact_node;
265
dom0_paging_pages(const struct domain * d,unsigned long nr_pages)266 unsigned long __init dom0_paging_pages(const struct domain *d,
267 unsigned long nr_pages)
268 {
269 /* Copied from: libxl_get_required_shadow_memory() */
270 unsigned long memkb = nr_pages * (PAGE_SIZE / 1024);
271
272 memkb = 4 * (256 * d->max_vcpus + 2 * (memkb / 1024));
273
274 return ((memkb + 1023) / 1024) << (20 - PAGE_SHIFT);
275 }
276
/*
 * Compute how many pages to allocate to dom0 initially.
 *
 * @d:          the domain being built; d->max_pages is set as a side effect.
 * @parms:      ELF parameters of the dom0 kernel (p2m_base, virt_base,
 *              virt_kend and unmapped_initrd are read).
 * @initrd_len: length of the initrd in bytes.
 *
 * The available memory on the nodes in dom0_nodes is summed, reduced by
 * estimates for later allocations (vCPU structures, compat L4s, IOMMU page
 * tables, paging memory) and then the "dom0_mem" settings are applied.
 * For PV kernels without an explicit p2m base the result may additionally
 * be clipped so that the kernel has enough virtual address space left.
 *
 * Returns the number of pages to allocate.
 */
unsigned long __init dom0_compute_nr_pages(
    struct domain *d, struct elf_dom_parms *parms, unsigned long initrd_len)
{
    nodeid_t node;
    unsigned long avail = 0, nr_pages, min_pages, max_pages;
    bool need_paging;

    /* Free domheap pages plus the pages occupied by the boot images. */
    for_each_node_mask ( node, dom0_nodes )
        avail += avail_domheap_pages_region(node, 0, 0) +
                 initial_images_nrpages(node);

    /* Reserve memory for further dom0 vcpu-struct allocations... */
    avail -= (d->max_vcpus - 1UL)
             << get_order_from_bytes(sizeof(struct vcpu));
    /* ...and compat_l4's, if needed. */
    if ( is_pv_32bit_domain(d) )
        avail -= d->max_vcpus - 1;

    /* Reserve memory for iommu_dom0_init() (rough estimate). */
    if ( iommu_enabled )
    {
        unsigned int s;

        /* One term per 9-bit shift step of max_pdx. */
        for ( s = 9; s < BITS_PER_LONG; s += 9 )
            avail -= max_pdx >> s;
    }

    /*
     * Paging memory is only reserved for HVM domains that do not share HAP
     * page tables with the IOMMU.  When it is needed the loop below runs
     * twice: the first pass yields a provisional nr_pages used to size the
     * paging reservation, which is taken out of avail before the second
     * pass recomputes the final values.
     */
    need_paging = is_hvm_domain(d) &&
        (!iommu_hap_pt_share || !paging_mode_hap(d));
    for ( ; ; need_paging = false )
    {
        /* The signed command line values are reinterpreted as unsigned. */
        nr_pages = dom0_nrpages;
        min_pages = dom0_min_nrpages;
        max_pages = dom0_max_nrpages;

        /*
         * If allocation isn't specified, reserve 1/16th of available memory
         * for things like DMA buffers. This reservation is clamped to a
         * maximum of 128MB.
         */
        if ( !nr_pages )
            nr_pages = -(pv_shim ? pv_shim_mem(avail)
                                 : min(avail / 16, 128UL << (20 - PAGE_SHIFT)));

        /* Negative specification means "all memory - specified amount". */
        if ( (long)nr_pages  < 0 ) nr_pages  += avail;
        if ( (long)min_pages < 0 ) min_pages += avail;
        if ( (long)max_pages < 0 ) max_pages += avail;

        /* Clamp according to min/max limits and available memory. */
        nr_pages = max(nr_pages, min_pages);
        nr_pages = min(nr_pages, max_pages);
        nr_pages = min(nr_pages, avail);

        if ( !need_paging )
            break;

        /* Reserve memory for shadow or HAP. */
        avail -= dom0_paging_pages(d, nr_pages);
    }

    /*
     * Only clip when no explicit positive amount was requested and the
     * result is not already pinned at an explicit positive minimum.
     */
    if ( is_pv_domain(d) &&
         (parms->p2m_base == UNSET_ADDR) && (dom0_nrpages <= 0) &&
         ((dom0_min_nrpages <= 0) || (nr_pages > min_pages)) )
    {
        /*
         * Legacy Linux kernels (i.e. those without a XEN_ELFNOTE_INIT_P2M
         * note) require that there is enough virtual space beyond the initial
         * allocation to set up their initial page tables. This space is
         * roughly the same size as the p2m table, so make sure the initial
         * allocation doesn't consume more than about half the space that's
         * available between params.virt_base and the address space end.
         */
        unsigned long vstart, vend, end;
        /* Size of a guest "long": 4 bytes for 32-bit PV, 8 otherwise. */
        size_t sizeof_long = is_pv_32bit_domain(d) ? sizeof(int) : sizeof(long);

        vstart = parms->virt_base;
        vend = round_pgup(parms->virt_kend);
        if ( !parms->unmapped_initrd )
            vend += round_pgup(initrd_len);
        /* End of the p2m table: one guest long per page. */
        end = vend + nr_pages * sizeof_long;

        /* Double the span from vstart to account for the page tables. */
        if ( end > vstart )
            end += end - vstart;
        /* Wrapped around, or beyond the guest's address space end? */
        if ( end <= vstart ||
             (sizeof_long < sizeof(end) && end > (1UL << (8 * sizeof_long))) )
        {
            /* 0 stands for the (wrapped) end of the 64-bit address space. */
            end = sizeof_long >= sizeof(end) ? 0 : 1UL << (8 * sizeof_long);
            nr_pages = (end - vend) / (2 * sizeof_long);
            /* An explicit positive minimum still takes precedence. */
            if ( dom0_min_nrpages > 0 && nr_pages < min_pages )
                nr_pages = min_pages;
            printk("Dom0 memory clipped to %lu pages\n", nr_pages);
        }
    }

    /* The value stored in d->max_pages is capped at UINT_MAX. */
    d->max_pages = min_t(unsigned long, max_pages, UINT_MAX);

    return nr_pages;
}
376
process_dom0_ioports_disable(struct domain * dom0)377 static void __init process_dom0_ioports_disable(struct domain *dom0)
378 {
379 unsigned long io_from, io_to;
380 char *t, *s = opt_dom0_ioports_disable;
381 const char *u;
382
383 if ( *s == '\0' )
384 return;
385
386 while ( (t = strsep(&s, ",")) != NULL )
387 {
388 io_from = simple_strtoul(t, &u, 16);
389 if ( u == t )
390 {
391 parse_error:
392 printk("Invalid ioport range <%s> "
393 "in dom0_ioports_disable, skipping\n", t);
394 continue;
395 }
396
397 if ( *u == '\0' )
398 io_to = io_from;
399 else if ( *u == '-' )
400 io_to = simple_strtoul(u + 1, &u, 16);
401 else
402 goto parse_error;
403
404 if ( (*u != '\0') || (io_to < io_from) || (io_to >= 65536) )
405 goto parse_error;
406
407 printk("Disabling dom0 access to ioport range %04lx-%04lx\n",
408 io_from, io_to);
409
410 if ( ioports_deny_access(dom0, io_from, io_to) != 0 )
411 BUG();
412 }
413 }
414
dom0_setup_permissions(struct domain * d)415 int __init dom0_setup_permissions(struct domain *d)
416 {
417 unsigned long mfn;
418 unsigned int i;
419 int rc;
420
421 if ( pv_shim )
422 return 0;
423
424 /* The hardware domain is initially permitted full I/O capabilities. */
425 rc = ioports_permit_access(d, 0, 0xFFFF);
426 rc |= iomem_permit_access(d, 0UL, (1UL << (paddr_bits - PAGE_SHIFT)) - 1);
427 rc |= irqs_permit_access(d, 1, nr_irqs_gsi - 1);
428
429 /* Modify I/O port access permissions. */
430
431 /* Master Interrupt Controller (PIC). */
432 rc |= ioports_deny_access(d, 0x20, 0x21);
433 /* Slave Interrupt Controller (PIC). */
434 rc |= ioports_deny_access(d, 0xA0, 0xA1);
435 /* Interval Timer (PIT). */
436 rc |= ioports_deny_access(d, 0x40, 0x43);
437 /* PIT Channel 2 / PC Speaker Control. */
438 rc |= ioports_deny_access(d, 0x61, 0x61);
439 /* ACPI PM Timer. */
440 if ( pmtmr_ioport )
441 rc |= ioports_deny_access(d, pmtmr_ioport, pmtmr_ioport + 3);
442 /* PCI configuration space (NB. 0xcf8 has special treatment). */
443 rc |= ioports_deny_access(d, 0xcfc, 0xcff);
444 /* Command-line I/O ranges. */
445 process_dom0_ioports_disable(d);
446
447 /* Modify I/O memory access permissions. */
448
449 /* Local APIC. */
450 if ( mp_lapic_addr != 0 )
451 {
452 mfn = paddr_to_pfn(mp_lapic_addr);
453 rc |= iomem_deny_access(d, mfn, mfn);
454 }
455 /* I/O APICs. */
456 for ( i = 0; i < nr_ioapics; i++ )
457 {
458 mfn = paddr_to_pfn(mp_ioapics[i].mpc_apicaddr);
459 if ( !rangeset_contains_singleton(mmio_ro_ranges, mfn) )
460 rc |= iomem_deny_access(d, mfn, mfn);
461 }
462 /* MSI range. */
463 rc |= iomem_deny_access(d, paddr_to_pfn(MSI_ADDR_BASE_LO),
464 paddr_to_pfn(MSI_ADDR_BASE_LO +
465 MSI_ADDR_DEST_ID_MASK));
466 /* HyperTransport range. */
467 if ( boot_cpu_data.x86_vendor == X86_VENDOR_AMD )
468 rc |= iomem_deny_access(d, paddr_to_pfn(0xfdULL << 32),
469 paddr_to_pfn((1ULL << 40) - 1));
470
471 /* Remove access to E820_UNUSABLE I/O regions above 1MB. */
472 for ( i = 0; i < e820.nr_map; i++ )
473 {
474 unsigned long sfn, efn;
475 sfn = max_t(unsigned long, paddr_to_pfn(e820.map[i].addr), 0x100ul);
476 efn = paddr_to_pfn(e820.map[i].addr + e820.map[i].size - 1);
477 if ( (e820.map[i].type == E820_UNUSABLE) &&
478 (e820.map[i].size != 0) &&
479 (sfn <= efn) )
480 rc |= iomem_deny_access(d, sfn, efn);
481 }
482
483 /* Prevent access to HPET */
484 if ( hpet_address )
485 {
486 u8 prot_flags = hpet_flags & ACPI_HPET_PAGE_PROTECT_MASK;
487
488 mfn = paddr_to_pfn(hpet_address);
489 if ( prot_flags == ACPI_HPET_PAGE_PROTECT4 )
490 rc |= iomem_deny_access(d, mfn, mfn);
491 else if ( prot_flags == ACPI_HPET_PAGE_PROTECT64 )
492 rc |= iomem_deny_access(d, mfn, mfn + 15);
493 else if ( ro_hpet )
494 rc |= rangeset_add_singleton(mmio_ro_ranges, mfn);
495 }
496
497 return rc;
498 }
499
construct_dom0(struct domain * d,const module_t * image,unsigned long image_headroom,module_t * initrd,void * (* bootstrap_map)(const module_t *),char * cmdline)500 int __init construct_dom0(struct domain *d, const module_t *image,
501 unsigned long image_headroom, module_t *initrd,
502 void *(*bootstrap_map)(const module_t *),
503 char *cmdline)
504 {
505 int rc;
506
507 /* Sanity! */
508 BUG_ON(!pv_shim && d->domain_id != 0);
509 BUG_ON(d->vcpu[0] == NULL);
510 BUG_ON(d->vcpu[0]->is_initialised);
511
512 process_pending_softirqs();
513
514 #ifdef CONFIG_SHADOW_PAGING
515 if ( opt_dom0_shadow && !dom0_pvh )
516 {
517 opt_dom0_shadow = false;
518 printk(XENLOG_WARNING "Shadow Dom0 requires PVH. Option ignored.\n");
519 }
520 #endif
521
522 rc = (is_hvm_domain(d) ? dom0_construct_pvh : dom0_construct_pv)
523 (d, image, image_headroom, initrd, bootstrap_map, cmdline);
524 if ( rc )
525 return rc;
526
527 /* Sanity! */
528 BUG_ON(!d->vcpu[0]->is_initialised);
529
530 return 0;
531 }
532
533 /*
534 * Local variables:
535 * mode: C
536 * c-file-style: "BSD"
537 * c-basic-offset: 4
538 * tab-width: 4
539 * indent-tabs-mode: nil
540 * End:
541 */
542