1 /*
2  * This supports booting another PV kernel from Mini-OS
3  *
 * The idea is to set it up using libxc, answer day0 memory allocation
 * requests, and use a trampoline boot page to switch to the new page table.
6  *
7  * The procedure of the boot page is:
8  * - map itself at the target position (that may overwrite some C stuff, but we
9  *   do not care any more)
10  * - jump there
11  * - switch to the target page table
12  * - unpin the old page table
13  * - jump to the new kernel
14  *
15  * Samuel Thibault <Samuel.Thibault@eu.citrix.com>, May 2008
16  */
17 #include <stdio.h>
18 #include <unistd.h>
19 #include <stdlib.h>
20 #include <sys/mman.h>
21 
22 #include <xenctrl.h>
23 #include <xc_dom.h>
24 
25 #include <kernel.h>
26 #include <console.h>
27 #include <os.h>
28 #include <blkfront.h>
29 #include <netfront.h>
30 #include <fbfront.h>
31 #include <tpmfront.h>
32 #include <shared.h>
33 #include <byteswap.h>
34 
35 #include "mini-os.h"
36 
/*
 * Debug tracing for the kexec path.  Tracing is compiled out by default;
 * build with -DKEXEC_DEBUG to route DEBUG() to printk() without having to
 * edit this file.  The disabled form is a fully parenthesized no-op
 * expression, so it is safe wherever a single expression is expected and its
 * arguments are never evaluated.
 */
#ifdef KEXEC_DEBUG
#define DEBUG(fmt, ...) printk(fmt, ## __VA_ARGS__)
#else
#define DEBUG(fmt, ...) ((void)0)
#endif
42 
43 /* Assembly boot page from boot.S */
44 extern void _boot_page;
45 extern pgentry_t _boot_page_entry;
46 extern unsigned long _boot_pdmfn;
47 extern unsigned long _boot_stack, _boot_target, _boot_start_info, _boot_start;
48 extern xen_pfn_t _boot_oldpdmfn;
49 extern void _boot(void);
50 
51 static unsigned long *pages;
52 static unsigned long *pages_mfns;
53 static xen_pfn_t *pages_moved2pfns;
54 static unsigned long allocated;
55 
56 int pin_table(xc_interface *xc_handle, unsigned int type, unsigned long mfn,
57               domid_t dom);
58 
/* Tag and ordinal placed in the request built by tpm_hash2pcr(). */
#define TPM_TAG_RQU_COMMAND 0xC1
#define TPM_ORD_Extend 20

/*
 * Request handed to tpmfront_cmd() by tpm_hash2pcr().  Packed so that the
 * in-memory layout has no padding; tpm_hash2pcr() stores the multi-byte
 * fields byte-swapped.
 */
struct __attribute__((packed)) pcr_extend_cmd {
	uint16_t tag;		/* TPM_TAG_RQU_COMMAND */
	uint32_t size;		/* sizeof(struct pcr_extend_cmd) */
	uint32_t ord;		/* TPM_ORD_Extend */

	uint32_t pcr;		/* number of the PCR to extend */
	unsigned char hash[20];	/* digest written by sha1() */
};
70 
/*
 * Response buffer returned through tpmfront_cmd() for a pcr_extend_cmd.
 * Packed so that the in-memory layout has no padding.  tpm_hash2pcr() only
 * looks at the status field and requires it to be zero.
 */
struct __attribute__((packed)) pcr_extend_rsp {
	uint16_t tag;
	uint32_t size;
	uint32_t status;

	unsigned char hash[20];
};
78 
/*
 * Digest function used by tpm_hash2pcr(): called with a buffer, its length in
 * bytes and a 20-byte destination for the result.
 *
 * Declared here rather than imported from polarssl's header, since the
 * prototype there unhelpfully declares the input as unsigned char, which
 * causes pointer type mismatches.
 */
void sha1(const void *input, size_t ilen, unsigned char output[20]);
82 
83 /* We need mfn to appear as target_pfn, so exchange with the MFN there */
do_exchange(struct xc_dom_image * dom,xen_pfn_t target_pfn,xen_pfn_t source_mfn)84 static void do_exchange(struct xc_dom_image *dom, xen_pfn_t target_pfn, xen_pfn_t source_mfn)
85 {
86     xen_pfn_t source_pfn;
87     xen_pfn_t target_mfn;
88 
89     for (source_pfn = 0; source_pfn < start_info.nr_pages; source_pfn++)
90         if (dom->p2m_host[source_pfn] == source_mfn)
91             break;
92     ASSERT(source_pfn < start_info.nr_pages);
93 
94     target_mfn = dom->p2m_host[target_pfn];
95 
96     /* Put target MFN at source PFN */
97     dom->p2m_host[source_pfn] = target_mfn;
98 
99     /* Put source MFN at target PFN */
100     dom->p2m_host[target_pfn] = source_mfn;
101 }
102 
kexec_allocate(struct xc_dom_image * dom)103 int kexec_allocate(struct xc_dom_image *dom)
104 {
105     unsigned long new_allocated = dom->pfn_alloc_end - dom->rambase_pfn;
106     unsigned long i;
107 
108     pages = realloc(pages, new_allocated * sizeof(*pages));
109     pages_mfns = realloc(pages_mfns, new_allocated * sizeof(*pages_mfns));
110     pages_moved2pfns = realloc(pages_moved2pfns, new_allocated * sizeof(*pages_moved2pfns));
111     for (i = allocated; i < new_allocated; i++) {
112         /* Exchange old page of PFN i with a newly allocated page.  */
113         xen_pfn_t old_mfn = dom->p2m_host[i];
114         xen_pfn_t new_pfn;
115         xen_pfn_t new_mfn;
116 
117         pages[i] = alloc_page();
118         memset((void*) pages[i], 0, PAGE_SIZE);
119         new_pfn = PHYS_PFN(to_phys(pages[i]));
120         pages_mfns[i] = new_mfn = pfn_to_mfn(new_pfn);
121 
122 	/*
123 	 * If PFN of newly allocated page (new_pfn) is less then currently
124 	 * requested PFN (i) then look for relevant PFN/MFN pair. In this
125 	 * situation dom->p2m_host[new_pfn] no longer contains proper MFN
126 	 * because original page with new_pfn was moved earlier
127 	 * to different location.
128 	 */
129 	for (; new_pfn < i; new_pfn = pages_moved2pfns[new_pfn]);
130 
131 	/* Store destination PFN of currently requested page. */
132 	pages_moved2pfns[i] = new_pfn;
133 
134         /* Put old page at new PFN */
135         dom->p2m_host[new_pfn] = old_mfn;
136 
137         /* Put new page at PFN i */
138         dom->p2m_host[i] = new_mfn;
139     }
140 
141     allocated = new_allocated;
142 
143     return 0;
144 }
145 
146 /* Filled from mini-os command line or left as NULL */
147 char *vtpm_label;
148 
tpm_hash2pcr(struct xc_dom_image * dom,char * cmdline)149 static void tpm_hash2pcr(struct xc_dom_image *dom, char *cmdline)
150 {
151 	struct tpmfront_dev* tpm = init_tpmfront(NULL);
152 	struct pcr_extend_rsp *resp;
153 	size_t resplen = 0;
154 	struct pcr_extend_cmd cmd;
155 	int rv;
156 
157 	/*
158 	 * If vtpm_label was specified on the command line, require a vTPM to be
159 	 * attached and for the domain providing the vTPM to have the given
160 	 * label.
161 	 */
162 	if (vtpm_label) {
163 		char ctx[128];
164 		if (!tpm) {
165 			printf("No TPM found and vtpm_label specified, aborting!\n");
166 			do_exit();
167 		}
168 		rv = evtchn_get_peercontext(tpm->evtchn, ctx, sizeof(ctx) - 1);
169 		if (rv < 0) {
170 			printf("Could not verify vtpm_label: %d\n", rv);
171 			do_exit();
172 		}
173 		ctx[127] = 0;
174 		rv = strcmp(ctx, vtpm_label);
175 		if (rv && vtpm_label[0] == '*') {
176 			int match_len = strlen(vtpm_label) - 1;
177 			int offset = strlen(ctx) - match_len;
178 			if (offset > 0)
179 				rv = strcmp(ctx + offset, vtpm_label + 1);
180 		}
181 
182 		if (rv) {
183 			printf("Mismatched vtpm_label: '%s' != '%s'\n", ctx, vtpm_label);
184 			do_exit();
185 		}
186 	} else if (!tpm) {
187 		return;
188 	}
189 
190 	cmd.tag = bswap_16(TPM_TAG_RQU_COMMAND);
191 	cmd.size = bswap_32(sizeof(cmd));
192 	cmd.ord = bswap_32(TPM_ORD_Extend);
193 	cmd.pcr = bswap_32(4); // PCR #4 for kernel
194 	sha1(dom->kernel_blob, dom->kernel_size, cmd.hash);
195 
196 	rv = tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), (void*)&resp, &resplen);
197 	ASSERT(rv == 0 && resp->status == 0);
198 
199 	cmd.pcr = bswap_32(5); // PCR #5 for cmdline
200 	sha1(cmdline, strlen(cmdline), cmd.hash);
201 	rv = tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), (void*)&resp, &resplen);
202 	ASSERT(rv == 0 && resp->status == 0);
203 
204 	cmd.pcr = bswap_32(5); // PCR #5 for initrd
205 	sha1(dom->modules[0].blob, dom->modules[0].size, cmd.hash);
206 	rv = tpmfront_cmd(tpm, (void*)&cmd, sizeof(cmd), (void*)&resp, &resplen);
207 	ASSERT(rv == 0 && resp->status == 0);
208 
209 	shutdown_tpmfront(tpm);
210 }
211 
kexec(void * kernel,long kernel_size,void * module,long module_size,char * cmdline,unsigned long flags)212 void kexec(void *kernel, long kernel_size, void *module, long module_size, char *cmdline, unsigned long flags)
213 {
214     struct xc_dom_image *dom;
215     int rc;
216     domid_t domid = DOMID_SELF;
217     xen_pfn_t pfn;
218     xc_interface *xc_handle;
219     unsigned long i;
220     void *seg;
221     xen_pfn_t boot_page_mfn = virt_to_mfn(&_boot_page);
222     char features[] = "";
223     struct mmu_update *m2p_updates;
224     unsigned long nr_m2p_updates;
225 
226     DEBUG("booting with cmdline %s\n", cmdline);
227     xc_handle = xc_interface_open(0,0,0);
228 
229     dom = xc_dom_allocate(xc_handle, cmdline, features);
230     dom->allocate = kexec_allocate;
231 
232     /* We are using guest owned memory, therefore no limits. */
233     xc_dom_kernel_max_size(dom, 0);
234     xc_dom_module_max_size(dom, 0);
235 
236     dom->kernel_blob = kernel;
237     dom->kernel_size = kernel_size;
238 
239     xc_dom_module_mem(dom, module, module_size, NULL);
240 
241     dom->flags = flags;
242     dom->console_evtchn = start_info.console.domU.evtchn;
243     dom->xenstore_evtchn = start_info.store_evtchn;
244 
245     tpm_hash2pcr(dom, cmdline);
246 
247     if ( (rc = xc_dom_boot_xen_init(dom, xc_handle, domid)) != 0 ) {
248         printk("xc_dom_boot_xen_init returned %d\n", rc);
249         errnum = ERR_BOOT_FAILURE;
250         goto out;
251     }
252     if ( (rc = xc_dom_parse_image(dom)) != 0 ) {
253         printk("xc_dom_parse_image returned %d\n", rc);
254         errnum = ERR_BOOT_FAILURE;
255         goto out;
256     }
257 
258 #ifdef __i386__
259     if (strcmp(dom->guest_type, "xen-3.0-x86_32p")) {
260         printk("can only boot x86 32 PAE kernels, not %s\n", dom->guest_type);
261         errnum = ERR_EXEC_FORMAT;
262         goto out;
263     }
264 #endif
265 #ifdef __x86_64__
266     if (strcmp(dom->guest_type, "xen-3.0-x86_64")) {
267         printk("can only boot x86 64 kernels, not %s\n", dom->guest_type);
268         errnum = ERR_EXEC_FORMAT;
269         goto out;
270     }
271 #endif
272 
273     /* equivalent of xc_dom_mem_init */
274     if (xc_dom_set_arch_hooks(dom)) {
275         printk("xc_dom_set_arch_hooks failed\n");
276         errnum = ERR_EXEC_FORMAT;
277         goto out;
278     }
279     dom->total_pages = start_info.nr_pages;
280 
281     /* equivalent of arch_setup_meminit */
282     dom->p2m_size = dom->total_pages;
283 
284     /* setup initial p2m */
285     dom->p2m_host = malloc(sizeof(*dom->p2m_host) * dom->p2m_size);
286 
287     /* Start with our current P2M */
288     for (i = 0; i < dom->p2m_size; i++)
289         dom->p2m_host[i] = pfn_to_mfn(i);
290 
291     if ( (rc = xc_dom_build_image(dom)) != 0 ) {
292         printk("xc_dom_build_image returned %d\n", rc);
293         errnum = ERR_BOOT_FAILURE;
294         goto out;
295     }
296 
297     /* copy hypercall page */
298     /* TODO: domctl instead, but requires privileges */
299     if (dom->parms.virt_hypercall != -1) {
300         pfn = PHYS_PFN(dom->parms.virt_hypercall - dom->parms.virt_base);
301         memcpy((void *) pages[pfn], hypercall_page, PAGE_SIZE);
302     }
303 
304     /* Equivalent of xc_dom_boot_image */
305     dom->shared_info_mfn = PHYS_PFN(start_info.shared_info);
306 
307     if (!xc_dom_compat_check(dom)) {
308         printk("xc_dom_compat_check failed\n");
309         errnum = ERR_EXEC_FORMAT;
310         goto out;
311     }
312 
313     /* Move current console, xenstore and boot MFNs to the allocated place */
314     do_exchange(dom, dom->console_pfn, start_info.console.domU.mfn);
315     do_exchange(dom, dom->xenstore_pfn, start_info.store_mfn);
316     DEBUG("virt base at %llx\n", dom->parms.virt_base);
317     DEBUG("bootstack_pfn %lx\n", dom->bootstack_pfn);
318     _boot_target = dom->parms.virt_base + PFN_PHYS(dom->bootstack_pfn);
319     DEBUG("_boot_target %lx\n", _boot_target);
320     do_exchange(dom, PHYS_PFN(_boot_target - dom->parms.virt_base),
321             virt_to_mfn(&_boot_page));
322 
323     /* Make sure the bootstrap page table does not RW-map any of our current
324      * page table frames */
325     if ( (rc = xc_dom_update_guest_p2m(dom))) {
326         printk("xc_dom_update_guest_p2m returned %d\n", rc);
327         errnum = ERR_BOOT_FAILURE;
328         goto out;
329     }
330 
331     if ( dom->arch_hooks->setup_pgtables )
332         if ( (rc = dom->arch_hooks->setup_pgtables(dom))) {
333             printk("setup_pgtables returned %d\n", rc);
334             errnum = ERR_BOOT_FAILURE;
335             goto out;
336         }
337 
338     /* start info page */
339 #undef start_info
340     if ( dom->arch_hooks->start_info )
341         dom->arch_hooks->start_info(dom);
342 #define start_info (start_info_union.start_info)
343 
344     xc_dom_log_memory_footprint(dom);
345 
346     /* Unmap libxc's projection of the boot page table */
347     seg = xc_dom_seg_to_ptr(dom, &dom->pgtables_seg);
348     munmap(seg, dom->pgtables_seg.vend - dom->pgtables_seg.vstart);
349     seg = xc_dom_seg_to_ptr(dom, &dom->p2m_seg);
350     munmap(seg, dom->p2m_seg.vend - dom->p2m_seg.vstart);
351 
352     /* Unmap day0 pages to avoid having a r/w mapping of the future page table */
353     for (pfn = 0; pfn < allocated; pfn++)
354         munmap((void*) pages[pfn], PAGE_SIZE);
355 
356     /* Pin the boot page table base */
357     if ( (rc = pin_table(dom->xch,
358 #ifdef __i386__
359                 MMUEXT_PIN_L3_TABLE,
360 #endif
361 #ifdef __x86_64__
362                 MMUEXT_PIN_L4_TABLE,
363 #endif
364                 xc_dom_p2m(dom, dom->pgtables_seg.pfn),
365                 dom->guest_domid)) != 0 ) {
366         printk("pin_table(%lx) returned %d\n", xc_dom_p2m(dom,
367                dom->pgtables_seg.pfn), rc);
368         errnum = ERR_BOOT_FAILURE;
369         goto out_remap;
370     }
371 
372     /* We populate the Mini-OS page table here so that boot.S can just call
373      * update_va_mapping to project itself there.  */
374     need_pgt(_boot_target);
375     DEBUG("day0 pages %lx\n", allocated);
376     DEBUG("boot target page %lx\n", _boot_target);
377     DEBUG("boot page %p\n", &_boot_page);
378     DEBUG("boot page mfn %lx\n", boot_page_mfn);
379     _boot_page_entry = PFN_PHYS(boot_page_mfn) | L1_PROT;
380     DEBUG("boot page entry %llx\n", _boot_page_entry);
381     _boot_oldpdmfn = virt_to_mfn(start_info.pt_base);
382     DEBUG("boot old pd mfn %lx\n", _boot_oldpdmfn);
383     DEBUG("boot pd virt %lx\n", dom->pgtables_seg.vstart);
384     _boot_pdmfn = dom->p2m_host[PHYS_PFN(dom->pgtables_seg.vstart - dom->parms.virt_base)];
385     DEBUG("boot pd mfn %lx\n", _boot_pdmfn);
386     _boot_stack = _boot_target + PAGE_SIZE;
387     DEBUG("boot stack %lx\n", _boot_stack);
388     _boot_start_info = dom->parms.virt_base + PFN_PHYS(dom->start_info_pfn);
389     DEBUG("boot start info %lx\n", _boot_start_info);
390     _boot_start = dom->parms.virt_entry;
391     DEBUG("boot start %lx\n", _boot_start);
392 
393     /* Keep only useful entries */
394     for (nr_m2p_updates = pfn = 0; pfn < start_info.nr_pages; pfn++)
395         if (dom->p2m_host[pfn] != pfn_to_mfn(pfn))
396             nr_m2p_updates++;
397 
398     m2p_updates = malloc(sizeof(*m2p_updates) * nr_m2p_updates);
399     for (i = pfn = 0; pfn < start_info.nr_pages; pfn++)
400         if (dom->p2m_host[pfn] != pfn_to_mfn(pfn)) {
401             m2p_updates[i].ptr = PFN_PHYS(dom->p2m_host[pfn]) | MMU_MACHPHYS_UPDATE;
402             m2p_updates[i].val = pfn;
403             i++;
404         }
405 
406     for (i = 0; i < blk_nb; i++)
407         shutdown_blkfront(blk_dev[i]);
408     if (net_dev)
409         shutdown_netfront(net_dev);
410     if (kbd_dev)
411         shutdown_kbdfront(kbd_dev);
412     stop_kernel();
413 
414     /* Update M2P */
415     if ((rc = HYPERVISOR_mmu_update(m2p_updates, nr_m2p_updates, NULL, DOMID_SELF)) < 0) {
416         xprintk("Could not update M2P\n");
417         ASSERT(0);
418     }
419 
420     xprintk("go!\n");
421 
422     /* Jump to trampoline boot page */
423     _boot();
424 
425     ASSERT(0);
426 
427 out_remap:
428     for (pfn = 0; pfn < allocated; pfn++)
429         do_map_frames(pages[pfn], &pages_mfns[pfn], 1, 0, 0, DOMID_SELF, 0, L1_PROT);
430 out:
431     xc_dom_release(dom);
432     for (pfn = 0; pfn < allocated; pfn++)
433         free_page((void*)pages[pfn]);
434     free(pages);
435     free(pages_mfns);
436     pages = NULL;
437     pages_mfns = NULL;
438     allocated = 0;
439     xc_interface_close(xc_handle );
440 }
441