1 /*
2 * This supports booting another PV kernel from Mini-OS
3 *
 * The idea is to set it up using libxc, answer the day0 memory allocation
 * requests, and use a trampoline boot page to switch to the new page table.
6 *
7 * The procedure of the boot page is:
8 * - map itself at the target position (that may overwrite some C stuff, but we
9 * do not care any more)
10 * - jump there
11 * - switch to the target page table
12 * - unpin the old page table
13 * - jump to the new kernel
14 *
15 * Samuel Thibault <Samuel.Thibault@eu.citrix.com>, May 2008
16 */
17 #include <stdio.h>
18 #include <unistd.h>
19 #include <stdlib.h>
20 #include <sys/mman.h>
21
22 #include <xenctrl.h>
23 #include <xc_dom.h>
24
25 #include <kernel.h>
26 #include <console.h>
27 #include <os.h>
28 #include <blkfront.h>
29 #include <netfront.h>
30 #include <fbfront.h>
31 #include <tpmfront.h>
32 #include <shared.h>
33 #include <byteswap.h>
34
35 #include "mini-os.h"
36
37 #if 0
38 #define DEBUG(fmt, ...) printk(fmt, ## __VA_ARGS__)
39 #else
40 #define DEBUG(fmt, ...) (void)0
41 #endif
42
43 /* Assembly boot page from boot.S */
44 extern void _boot_page;
45 extern pgentry_t _boot_page_entry;
46 extern unsigned long _boot_pdmfn;
47 extern unsigned long _boot_stack, _boot_target, _boot_start_info, _boot_start;
48 extern xen_pfn_t _boot_oldpdmfn;
49 extern void _boot(void);
50
51 static unsigned long *pages;
52 static unsigned long *pages_mfns;
53 static xen_pfn_t *pages_moved2pfns;
54 static unsigned long allocated;
55
56 int pin_table(xc_interface *xc_handle, unsigned int type, unsigned long mfn,
57 domid_t dom);
58
/* Request tag and Extend ordinal used to build the command below. */
#define TPM_TAG_RQU_COMMAND 0x00C1
#define TPM_ORD_Extend      0x14

/*
 * Wire format of the PCR extend request.  The struct is packed so that it
 * can be handed to tpmfront_cmd() as a raw byte buffer; the multi-byte
 * fields are filled in through bswap_16()/bswap_32() by the sender.
 */
struct pcr_extend_cmd {
    uint16_t tag;               /* TPM_TAG_RQU_COMMAND */
    uint32_t size;              /* sizeof(struct pcr_extend_cmd) */
    uint32_t ord;               /* TPM_ORD_Extend */

    uint32_t pcr;               /* index of the PCR to extend */
    unsigned char hash[20];     /* SHA-1 digest to fold into the PCR */
} __attribute__((packed));

/* Wire format of the matching response; only `status` is inspected here. */
struct pcr_extend_rsp {
    uint16_t tag;
    uint32_t size;
    uint32_t status;            /* 0 means success */

    unsigned char hash[20];
} __attribute__((packed));

/*
 * Declared locally instead of including polarssl's header: that prototype
 * takes the input as unsigned char, which causes pointer type mismatches
 * at the call sites in this file.
 */
void sha1(const void *input, size_t ilen, unsigned char output[20]);
82
83 /* We need mfn to appear as target_pfn, so exchange with the MFN there */
do_exchange(struct xc_dom_image * dom,xen_pfn_t target_pfn,xen_pfn_t source_mfn)84 static void do_exchange(struct xc_dom_image *dom, xen_pfn_t target_pfn, xen_pfn_t source_mfn)
85 {
86 xen_pfn_t source_pfn;
87 xen_pfn_t target_mfn;
88
89 for (source_pfn = 0; source_pfn < start_info.nr_pages; source_pfn++)
90 if (dom->p2m_host[source_pfn] == source_mfn)
91 break;
92 ASSERT(source_pfn < start_info.nr_pages);
93
94 target_mfn = dom->p2m_host[target_pfn];
95
96 /* Put target MFN at source PFN */
97 dom->p2m_host[source_pfn] = target_mfn;
98
99 /* Put source MFN at target PFN */
100 dom->p2m_host[target_pfn] = source_mfn;
101 }
102
kexec_allocate(struct xc_dom_image * dom)103 int kexec_allocate(struct xc_dom_image *dom)
104 {
105 unsigned long new_allocated = dom->pfn_alloc_end - dom->rambase_pfn;
106 unsigned long i;
107
108 pages = realloc(pages, new_allocated * sizeof(*pages));
109 pages_mfns = realloc(pages_mfns, new_allocated * sizeof(*pages_mfns));
110 pages_moved2pfns = realloc(pages_moved2pfns, new_allocated * sizeof(*pages_moved2pfns));
111 for (i = allocated; i < new_allocated; i++) {
112 /* Exchange old page of PFN i with a newly allocated page. */
113 xen_pfn_t old_mfn = dom->p2m_host[i];
114 xen_pfn_t new_pfn;
115 xen_pfn_t new_mfn;
116
117 pages[i] = alloc_page();
118 memset((void*) pages[i], 0, PAGE_SIZE);
119 new_pfn = PHYS_PFN(to_phys(pages[i]));
120 pages_mfns[i] = new_mfn = pfn_to_mfn(new_pfn);
121
122 /*
123 * If PFN of newly allocated page (new_pfn) is less then currently
124 * requested PFN (i) then look for relevant PFN/MFN pair. In this
125 * situation dom->p2m_host[new_pfn] no longer contains proper MFN
126 * because original page with new_pfn was moved earlier
127 * to different location.
128 */
129 for (; new_pfn < i; new_pfn = pages_moved2pfns[new_pfn]);
130
131 /* Store destination PFN of currently requested page. */
132 pages_moved2pfns[i] = new_pfn;
133
134 /* Put old page at new PFN */
135 dom->p2m_host[new_pfn] = old_mfn;
136
137 /* Put new page at PFN i */
138 dom->p2m_host[i] = new_mfn;
139 }
140
141 allocated = new_allocated;
142
143 return 0;
144 }
145
146 /* Filled from mini-os command line or left as NULL */
147 char *vtpm_label;
148
tpm_hash2pcr(struct xc_dom_image * dom,char * cmdline)149 static void tpm_hash2pcr(struct xc_dom_image *dom, char *cmdline)
150 {
151 struct tpmfront_dev* tpm = init_tpmfront(NULL);
152 struct pcr_extend_rsp *resp;
153 size_t resplen = 0;
154 struct pcr_extend_cmd cmd;
155 int rv;
156
157 /*
158 * If vtpm_label was specified on the command line, require a vTPM to be
159 * attached and for the domain providing the vTPM to have the given
160 * label.
161 */
162 if (vtpm_label) {
163 char ctx[128];
164 if (!tpm) {
165 printf("No TPM found and vtpm_label specified, aborting!\n");
166 do_exit();
167 }
168 rv = evtchn_get_peercontext(tpm->evtchn, ctx, sizeof(ctx) - 1);
169 if (rv < 0) {
170 printf("Could not verify vtpm_label: %d\n", rv);
171 do_exit();
172 }
173 ctx[127] = 0;
174 rv = strcmp(ctx, vtpm_label);
175 if (rv && vtpm_label[0] == '*') {
176 int match_len = strlen(vtpm_label) - 1;
177 int offset = strlen(ctx) - match_len;
178 if (offset > 0)
179 rv = strcmp(ctx + offset, vtpm_label + 1);
180 }
181
182 if (rv) {
183 printf("Mismatched vtpm_label: '%s' != '%s'\n", ctx, vtpm_label);
184 do_exit();
185 }
186 } else if (!tpm) {
187 return;
188 }
189
190 cmd.tag = bswap_16(TPM_TAG_RQU_COMMAND);
191 cmd.size = bswap_32(sizeof(cmd));
192 cmd.ord = bswap_32(TPM_ORD_Extend);
193 cmd.pcr = bswap_32(4); // PCR #4 for kernel
194 sha1(dom->kernel_blob, dom->kernel_size, cmd.hash);
195
196 rv = tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), (void*)&resp, &resplen);
197 ASSERT(rv == 0 && resp->status == 0);
198
199 cmd.pcr = bswap_32(5); // PCR #5 for cmdline
200 sha1(cmdline, strlen(cmdline), cmd.hash);
201 rv = tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), (void*)&resp, &resplen);
202 ASSERT(rv == 0 && resp->status == 0);
203
204 cmd.pcr = bswap_32(5); // PCR #5 for initrd
205 sha1(dom->modules[0].blob, dom->modules[0].size, cmd.hash);
206 rv = tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), (void*)&resp, &resplen);
207 ASSERT(rv == 0 && resp->status == 0);
208
209 shutdown_tpmfront(tpm);
210 }
211
kexec(void * kernel,long kernel_size,void * module,long module_size,char * cmdline,unsigned long flags)212 void kexec(void *kernel, long kernel_size, void *module, long module_size, char *cmdline, unsigned long flags)
213 {
214 struct xc_dom_image *dom;
215 int rc;
216 domid_t domid = DOMID_SELF;
217 xen_pfn_t pfn;
218 xc_interface *xc_handle;
219 unsigned long i;
220 void *seg;
221 xen_pfn_t boot_page_mfn = virt_to_mfn(&_boot_page);
222 char features[] = "";
223 struct mmu_update *m2p_updates;
224 unsigned long nr_m2p_updates;
225
226 DEBUG("booting with cmdline %s\n", cmdline);
227 xc_handle = xc_interface_open(0,0,0);
228
229 dom = xc_dom_allocate(xc_handle, cmdline, features);
230 dom->allocate = kexec_allocate;
231
232 /* We are using guest owned memory, therefore no limits. */
233 xc_dom_kernel_max_size(dom, 0);
234 xc_dom_module_max_size(dom, 0);
235
236 dom->kernel_blob = kernel;
237 dom->kernel_size = kernel_size;
238
239 xc_dom_module_mem(dom, module, module_size, NULL);
240
241 dom->flags = flags;
242 dom->console_evtchn = start_info.console.domU.evtchn;
243 dom->xenstore_evtchn = start_info.store_evtchn;
244
245 tpm_hash2pcr(dom, cmdline);
246
247 if ( (rc = xc_dom_boot_xen_init(dom, xc_handle, domid)) != 0 ) {
248 printk("xc_dom_boot_xen_init returned %d\n", rc);
249 errnum = ERR_BOOT_FAILURE;
250 goto out;
251 }
252 if ( (rc = xc_dom_parse_image(dom)) != 0 ) {
253 printk("xc_dom_parse_image returned %d\n", rc);
254 errnum = ERR_BOOT_FAILURE;
255 goto out;
256 }
257
258 #ifdef __i386__
259 if (strcmp(dom->guest_type, "xen-3.0-x86_32p")) {
260 printk("can only boot x86 32 PAE kernels, not %s\n", dom->guest_type);
261 errnum = ERR_EXEC_FORMAT;
262 goto out;
263 }
264 #endif
265 #ifdef __x86_64__
266 if (strcmp(dom->guest_type, "xen-3.0-x86_64")) {
267 printk("can only boot x86 64 kernels, not %s\n", dom->guest_type);
268 errnum = ERR_EXEC_FORMAT;
269 goto out;
270 }
271 #endif
272
273 /* equivalent of xc_dom_mem_init */
274 if (xc_dom_set_arch_hooks(dom)) {
275 printk("xc_dom_set_arch_hooks failed\n");
276 errnum = ERR_EXEC_FORMAT;
277 goto out;
278 }
279 dom->total_pages = start_info.nr_pages;
280
281 /* equivalent of arch_setup_meminit */
282 dom->p2m_size = dom->total_pages;
283
284 /* setup initial p2m */
285 dom->p2m_host = malloc(sizeof(*dom->p2m_host) * dom->p2m_size);
286
287 /* Start with our current P2M */
288 for (i = 0; i < dom->p2m_size; i++)
289 dom->p2m_host[i] = pfn_to_mfn(i);
290
291 if ( (rc = xc_dom_build_image(dom)) != 0 ) {
292 printk("xc_dom_build_image returned %d\n", rc);
293 errnum = ERR_BOOT_FAILURE;
294 goto out;
295 }
296
297 /* copy hypercall page */
298 /* TODO: domctl instead, but requires privileges */
299 if (dom->parms.virt_hypercall != -1) {
300 pfn = PHYS_PFN(dom->parms.virt_hypercall - dom->parms.virt_base);
301 memcpy((void *) pages[pfn], hypercall_page, PAGE_SIZE);
302 }
303
304 /* Equivalent of xc_dom_boot_image */
305 dom->shared_info_mfn = PHYS_PFN(start_info.shared_info);
306
307 if (!xc_dom_compat_check(dom)) {
308 printk("xc_dom_compat_check failed\n");
309 errnum = ERR_EXEC_FORMAT;
310 goto out;
311 }
312
313 /* Move current console, xenstore and boot MFNs to the allocated place */
314 do_exchange(dom, dom->console_pfn, start_info.console.domU.mfn);
315 do_exchange(dom, dom->xenstore_pfn, start_info.store_mfn);
316 DEBUG("virt base at %llx\n", dom->parms.virt_base);
317 DEBUG("bootstack_pfn %lx\n", dom->bootstack_pfn);
318 _boot_target = dom->parms.virt_base + PFN_PHYS(dom->bootstack_pfn);
319 DEBUG("_boot_target %lx\n", _boot_target);
320 do_exchange(dom, PHYS_PFN(_boot_target - dom->parms.virt_base),
321 virt_to_mfn(&_boot_page));
322
323 /* Make sure the bootstrap page table does not RW-map any of our current
324 * page table frames */
325 if ( (rc = xc_dom_update_guest_p2m(dom))) {
326 printk("xc_dom_update_guest_p2m returned %d\n", rc);
327 errnum = ERR_BOOT_FAILURE;
328 goto out;
329 }
330
331 if ( dom->arch_hooks->setup_pgtables )
332 if ( (rc = dom->arch_hooks->setup_pgtables(dom))) {
333 printk("setup_pgtables returned %d\n", rc);
334 errnum = ERR_BOOT_FAILURE;
335 goto out;
336 }
337
338 /* start info page */
339 #undef start_info
340 if ( dom->arch_hooks->start_info )
341 dom->arch_hooks->start_info(dom);
342 #define start_info (start_info_union.start_info)
343
344 xc_dom_log_memory_footprint(dom);
345
346 /* Unmap libxc's projection of the boot page table */
347 seg = xc_dom_seg_to_ptr(dom, &dom->pgtables_seg);
348 munmap(seg, dom->pgtables_seg.vend - dom->pgtables_seg.vstart);
349 seg = xc_dom_seg_to_ptr(dom, &dom->p2m_seg);
350 munmap(seg, dom->p2m_seg.vend - dom->p2m_seg.vstart);
351
352 /* Unmap day0 pages to avoid having a r/w mapping of the future page table */
353 for (pfn = 0; pfn < allocated; pfn++)
354 munmap((void*) pages[pfn], PAGE_SIZE);
355
356 /* Pin the boot page table base */
357 if ( (rc = pin_table(dom->xch,
358 #ifdef __i386__
359 MMUEXT_PIN_L3_TABLE,
360 #endif
361 #ifdef __x86_64__
362 MMUEXT_PIN_L4_TABLE,
363 #endif
364 xc_dom_p2m(dom, dom->pgtables_seg.pfn),
365 dom->guest_domid)) != 0 ) {
366 printk("pin_table(%lx) returned %d\n", xc_dom_p2m(dom,
367 dom->pgtables_seg.pfn), rc);
368 errnum = ERR_BOOT_FAILURE;
369 goto out_remap;
370 }
371
372 /* We populate the Mini-OS page table here so that boot.S can just call
373 * update_va_mapping to project itself there. */
374 need_pgt(_boot_target);
375 DEBUG("day0 pages %lx\n", allocated);
376 DEBUG("boot target page %lx\n", _boot_target);
377 DEBUG("boot page %p\n", &_boot_page);
378 DEBUG("boot page mfn %lx\n", boot_page_mfn);
379 _boot_page_entry = PFN_PHYS(boot_page_mfn) | L1_PROT;
380 DEBUG("boot page entry %llx\n", _boot_page_entry);
381 _boot_oldpdmfn = virt_to_mfn(start_info.pt_base);
382 DEBUG("boot old pd mfn %lx\n", _boot_oldpdmfn);
383 DEBUG("boot pd virt %lx\n", dom->pgtables_seg.vstart);
384 _boot_pdmfn = dom->p2m_host[PHYS_PFN(dom->pgtables_seg.vstart - dom->parms.virt_base)];
385 DEBUG("boot pd mfn %lx\n", _boot_pdmfn);
386 _boot_stack = _boot_target + PAGE_SIZE;
387 DEBUG("boot stack %lx\n", _boot_stack);
388 _boot_start_info = dom->parms.virt_base + PFN_PHYS(dom->start_info_pfn);
389 DEBUG("boot start info %lx\n", _boot_start_info);
390 _boot_start = dom->parms.virt_entry;
391 DEBUG("boot start %lx\n", _boot_start);
392
393 /* Keep only useful entries */
394 for (nr_m2p_updates = pfn = 0; pfn < start_info.nr_pages; pfn++)
395 if (dom->p2m_host[pfn] != pfn_to_mfn(pfn))
396 nr_m2p_updates++;
397
398 m2p_updates = malloc(sizeof(*m2p_updates) * nr_m2p_updates);
399 for (i = pfn = 0; pfn < start_info.nr_pages; pfn++)
400 if (dom->p2m_host[pfn] != pfn_to_mfn(pfn)) {
401 m2p_updates[i].ptr = PFN_PHYS(dom->p2m_host[pfn]) | MMU_MACHPHYS_UPDATE;
402 m2p_updates[i].val = pfn;
403 i++;
404 }
405
406 for (i = 0; i < blk_nb; i++)
407 shutdown_blkfront(blk_dev[i]);
408 if (net_dev)
409 shutdown_netfront(net_dev);
410 if (kbd_dev)
411 shutdown_kbdfront(kbd_dev);
412 stop_kernel();
413
414 /* Update M2P */
415 if ((rc = HYPERVISOR_mmu_update(m2p_updates, nr_m2p_updates, NULL, DOMID_SELF)) < 0) {
416 xprintk("Could not update M2P\n");
417 ASSERT(0);
418 }
419
420 xprintk("go!\n");
421
422 /* Jump to trampoline boot page */
423 _boot();
424
425 ASSERT(0);
426
427 out_remap:
428 for (pfn = 0; pfn < allocated; pfn++)
429 do_map_frames(pages[pfn], &pages_mfns[pfn], 1, 0, 0, DOMID_SELF, 0, L1_PROT);
430 out:
431 xc_dom_release(dom);
432 for (pfn = 0; pfn < allocated; pfn++)
433 free_page((void*)pages[pfn]);
434 free(pages);
435 free(pages_mfns);
436 pages = NULL;
437 pages_mfns = NULL;
438 allocated = 0;
439 xc_interface_close(xc_handle );
440 }
441