1 /******************************************************************************
2 * arch/x86/domain.c
3 *
4 * x86-specific domain handling (e.g., register setup and context switching).
5 */
6
7 /*
8 * Copyright (C) 1995 Linus Torvalds
9 *
10 * Pentium III FXSR, SSE support
11 * Gareth Hughes <gareth@valinux.com>, May 2000
12 */
13
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/domain.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <xen/iocap.h>
24 #include <xen/kernel.h>
25 #include <xen/hypercall.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <xen/cpu.h>
36 #include <xen/wait.h>
37 #include <xen/guest_access.h>
38 #include <xen/livepatch.h>
39 #include <public/sysctl.h>
40 #include <public/hvm/hvm_vcpu.h>
41 #include <asm/regs.h>
42 #include <asm/mc146818rtc.h>
43 #include <asm/system.h>
44 #include <asm/io.h>
45 #include <asm/processor.h>
46 #include <asm/desc.h>
47 #include <asm/i387.h>
48 #include <asm/xstate.h>
49 #include <asm/cpuidle.h>
50 #include <asm/mpspec.h>
51 #include <asm/ldt.h>
52 #include <asm/hvm/hvm.h>
53 #include <asm/hvm/nestedhvm.h>
54 #include <asm/hvm/support.h>
55 #include <asm/hvm/viridian.h>
56 #include <asm/debugreg.h>
57 #include <asm/msr.h>
58 #include <asm/traps.h>
59 #include <asm/nmi.h>
60 #include <asm/mce.h>
61 #include <asm/amd.h>
62 #include <xen/numa.h>
63 #include <xen/iommu.h>
64 #include <compat/vcpu.h>
65 #include <asm/psr.h>
66 #include <asm/pv/domain.h>
67 #include <asm/pv/mm.h>
68
69 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
70
71 static void default_idle(void);
72 void (*pm_idle) (void) __read_mostly = default_idle;
73 void (*dead_idle) (void) __read_mostly = default_dead_idle;
74
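/*
 * Idle with interrupts disabled so no wakeup can slip in between the final
 * haltable check and the halt itself; safe_halt() re-enables interrupts
 * atomically with entering the halted state.
 */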
75 static void default_idle(void)
76 {
77 local_irq_disable();
78 if ( cpu_is_haltable(smp_processor_id()) )
79 safe_halt();
80 else
81 local_irq_enable();
82 }
83
84 void default_dead_idle(void)
85 {
86 /*
87 * When going into S3 without flushing the caches, modified data may be
88 * held indefinitely by CPUs spinning here and then discarded by a
89 * subsequent INIT.
90 */
91 wbinvd();
92 for ( ; ; )
93 halt();
94 }
95
96 static void play_dead(void)
97 {
98 local_irq_disable();
99
100 /*
101 * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
102 * as they may be freed at any time. In this case, heap corruption or
103 * #PF can occur (when heap debugging is enabled). For example, even
104 * printk() can involve tasklet scheduling, which touches per-cpu vars.
105 *
106 * Consider very carefully when adding code to *dead_idle. Most hypervisor
107 * subsystems are unsafe to call.
108 */
109 cpu_exit_clear(smp_processor_id());
110
111 (*dead_idle)();
112 }
113
114 static void idle_loop(void)
115 {
116 unsigned int cpu = smp_processor_id();
117
118 for ( ; ; )
119 {
120 if ( cpu_is_offline(cpu) )
121 play_dead();
122
123 /* Are we here for running vcpu context tasklets, or for idling? */
124 if ( unlikely(tasklet_work_to_do(cpu)) )
125 do_tasklet();
126 /*
127 * Test softirqs twice --- first to see if we should even try scrubbing
128 * and then, after it is done, whether softirqs became pending
129 * while we were scrubbing.
130 */
131 else if ( !softirq_pending(cpu) && !scrub_free_pages() &&
132 !softirq_pending(cpu) )
133 pm_idle();
134 do_softirq();
135 /*
136 * We MUST be last (or before pm_idle). Otherwise after we get the
137 * softirq we would execute pm_idle (and sleep) and not patch.
138 */
139 check_for_livepatch_work();
140 }
141 }
142
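/*
 * Called on a pCPU being brought up: record the pCPU in the idle vCPU's
 * dirty cpumasks and switch onto the idle vCPU's stack.
 * reset_stack_and_jump() does not return.
 */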
143 void startup_cpu_idle_loop(void)
144 {
145 struct vcpu *v = current;
146
147 ASSERT(is_idle_vcpu(v));
148 cpumask_set_cpu(v->processor, v->domain->domain_dirty_cpumask);
149 cpumask_set_cpu(v->processor, v->vcpu_dirty_cpumask);
150
151 reset_stack_and_jump(idle_loop);
152 }
153
154 static void noreturn continue_idle_domain(struct vcpu *v)
155 {
156 reset_stack_and_jump(idle_loop);
157 }
158
159 void dump_pageframe_info(struct domain *d)
160 {
161 struct page_info *page;
162
163 printk("Memory pages belonging to domain %u:\n", d->domain_id);
164
165 if ( d->tot_pages >= 10 && d->is_dying < DOMDYING_dead )
166 {
167 printk(" DomPage list too long to display\n");
168 }
169 else
170 {
171 unsigned long total[MASK_EXTR(PGT_type_mask, PGT_type_mask) + 1] = {};
172
173 spin_lock(&d->page_alloc_lock);
174 page_list_for_each ( page, &d->page_list )
175 {
176 unsigned int index = MASK_EXTR(page->u.inuse.type_info,
177 PGT_type_mask);
178
179 if ( ++total[index] > 16 )
180 {
181 switch ( page->u.inuse.type_info & PGT_type_mask )
182 {
183 case PGT_none:
184 case PGT_writable_page:
185 continue;
186 }
187 }
188 printk(" DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
189 _p(page_to_mfn(page)),
190 page->count_info, page->u.inuse.type_info);
191 }
192 spin_unlock(&d->page_alloc_lock);
193 }
194
195 if ( is_hvm_domain(d) )
196 p2m_pod_dump_data(d);
197
198 spin_lock(&d->page_alloc_lock);
199 page_list_for_each ( page, &d->xenpage_list )
200 {
201 printk(" XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
202 _p(page_to_mfn(page)),
203 page->count_info, page->u.inuse.type_info);
204 }
205 spin_unlock(&d->page_alloc_lock);
206 }
207
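/*
 * Install the given guest memory access policy on @v, returning the previous
 * settings in @policy so that a second call with the same object restores
 * them (see update_runstate_area() for the usage pattern).
 */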
208 void update_guest_memory_policy(struct vcpu *v,
209 struct guest_memory_policy *policy)
210 {
211 smap_check_policy_t old_smap_policy = v->arch.smap_check_policy;
212 bool old_guest_mode = nestedhvm_is_n2(v);
213 bool new_guest_mode = policy->nested_guest_mode;
214
215 v->arch.smap_check_policy = policy->smap_policy;
216 policy->smap_policy = old_smap_policy;
217
218 /*
219 * When 'v' is in the nested guest mode, all guest copy
220 * functions/macros which finally call paging_gva_to_gfn()
221 * transfer data to/from L2 guest. If the copy is intended for L1
222 * guest, we must first clear the nested guest flag (by setting
223 * policy->nested_guest_mode to false) before the copy and then
224 * restore the nested guest flag (by setting
225 * policy->nested_guest_mode to true) after the copy.
226 */
227 if ( unlikely(old_guest_mode != new_guest_mode) )
228 {
229 if ( new_guest_mode )
230 nestedhvm_vcpu_enter_guestmode(v);
231 else
232 nestedhvm_vcpu_exit_guestmode(v);
233 policy->nested_guest_mode = old_guest_mode;
234 }
235 }
236
237 #ifndef CONFIG_BIGMEM
238 /*
239 * The hole may be at or above the 44-bit boundary, so we need to determine
240 * the total bit count until reaching 32 significant (not squashed out) bits
241 * in PFN representations.
242 * Note that the way "bits" gets initialized/updated/bounds-checked guarantees
243 * that the function will never return zero, and hence will never be called
244 * more than once (which is important due to it being deliberately placed in
245 * .init.text).
246 */
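/*
 * For instance, with no PFN hole (pfn_hole_mask == 0), sig starts out as 32
 * and the loop never iterates, so the result is 32 + PAGE_SHIFT = 44, i.e.
 * alloc_domain_struct() uses MEMF_bits(44) (allocations below 16TB).
 */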
247 static unsigned int __init noinline _domain_struct_bits(void)
248 {
249 unsigned int bits = 32 + PAGE_SHIFT;
250 unsigned int sig = hweight32(~pfn_hole_mask);
251 unsigned int mask = pfn_hole_mask >> 32;
252
253 for ( ; bits < BITS_PER_LONG && sig < 32; ++bits, mask >>= 1 )
254 if ( !(mask & 1) )
255 ++sig;
256
257 return bits;
258 }
259 #endif
260
261 struct domain *alloc_domain_struct(void)
262 {
263 struct domain *d;
264 unsigned int order = get_order_from_bytes(sizeof(*d));
265 #ifdef CONFIG_BIGMEM
266 const unsigned int bits = 0;
267 #else
268 /*
269 * We pack the PDX of the domain structure into a 32-bit field within
270 * the page_info structure. Hence the MEMF_bits() restriction.
271 */
272 static unsigned int __read_mostly bits;
273
274 if ( unlikely(!bits) )
275 bits = _domain_struct_bits();
276 #endif
277
278
279 #ifndef CONFIG_LOCK_PROFILE
280 BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE);
281 #endif
282 d = alloc_xenheap_pages(order, MEMF_bits(bits));
283 if ( d != NULL )
284 {
285 unsigned int sz;
286
287 for ( sz = 0; sz < (PAGE_SIZE << order); sz += PAGE_SIZE )
288 clear_page((void *)d + sz);
289 }
290 return d;
291 }
292
293 void free_domain_struct(struct domain *d)
294 {
295 lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d);
296 free_xenheap_page(d);
297 }
298
299 struct vcpu *alloc_vcpu_struct(void)
300 {
301 struct vcpu *v;
302 /*
303 * This structure contains embedded PAE PDPTEs, used when an HVM guest
304 * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
305 * may require that the shadow CR3 points below 4GB, and hence the whole
306 * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
307 */
308 BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE);
309 v = alloc_xenheap_pages(0, MEMF_bits(32));
310 if ( v != NULL )
311 clear_page(v);
312 return v;
313 }
314
315 void free_vcpu_struct(struct vcpu *v)
316 {
317 free_xenheap_page(v);
318 }
319
320 int vcpu_initialise(struct vcpu *v)
321 {
322 struct domain *d = v->domain;
323 int rc;
324
325 v->arch.flags = TF_kernel_mode;
326
327 rc = mapcache_vcpu_init(v);
328 if ( rc )
329 return rc;
330
331 if ( !is_idle_domain(d) )
332 {
333 paging_vcpu_init(v);
334
335 if ( (rc = vcpu_init_fpu(v)) != 0 )
336 return rc;
337
338 vmce_init_vcpu(v);
339 }
340 else if ( (rc = xstate_alloc_save_area(v)) != 0 )
341 return rc;
342
343 spin_lock_init(&v->arch.vpmu.vpmu_lock);
344
345 if ( is_hvm_domain(d) )
346 rc = hvm_vcpu_initialise(v);
347 else if ( !is_idle_domain(d) )
348 rc = pv_vcpu_initialise(v);
349 else
350 {
351 /* Idle domain */
352 v->arch.cr3 = __pa(idle_pg_table);
353 rc = 0;
354 v->arch.msr = ZERO_BLOCK_PTR; /* Catch stray misuses */
355 }
356
357 if ( rc )
358 goto fail;
359
360 if ( !is_idle_domain(v->domain) )
361 {
362 vpmu_initialise(v);
363
364 if ( (rc = init_vcpu_msr_policy(v)) )
365 goto fail;
366 }
367
368 return rc;
369
370 fail:
371 vcpu_destroy_fpu(v);
372 xfree(v->arch.msr);
373 v->arch.msr = NULL;
374
375 return rc;
376 }
377
378 void vcpu_destroy(struct vcpu *v)
379 {
380 xfree(v->arch.vm_event);
381 v->arch.vm_event = NULL;
382
383 vcpu_destroy_fpu(v);
384
385 xfree(v->arch.msr);
386 v->arch.msr = NULL;
387
388 if ( !is_idle_domain(v->domain) )
389 vpmu_destroy(v);
390
391 if ( is_hvm_vcpu(v) )
392 hvm_vcpu_destroy(v);
393 else
394 pv_vcpu_destroy(v);
395 }
396
397 static bool emulation_flags_ok(const struct domain *d, uint32_t emflags)
398 {
399
400 if ( is_hvm_domain(d) )
401 {
402 if ( is_hardware_domain(d) &&
403 emflags != (XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC) )
404 return false;
405 if ( !is_hardware_domain(d) && emflags &&
406 emflags != XEN_X86_EMU_ALL && emflags != XEN_X86_EMU_LAPIC )
407 return false;
408 }
409 else if ( emflags != 0 && emflags != XEN_X86_EMU_PIT )
410 {
411 /* PV or classic PVH. */
412 return false;
413 }
414
415 return true;
416 }
417
418 int arch_domain_create(struct domain *d, unsigned int domcr_flags,
419 struct xen_arch_domainconfig *config)
420 {
421 bool paging_initialised = false;
422 int rc;
423
424 if ( config == NULL && !is_idle_domain(d) )
425 return -EINVAL;
426
427 d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
428
429 INIT_LIST_HEAD(&d->arch.pdev_list);
430
431 d->arch.relmem = RELMEM_not_started;
432 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
433
434 if ( d->domain_id && !is_idle_domain(d) &&
435 cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) )
436 {
437 if ( !opt_allow_unsafe )
438 {
439 printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU"
440 " for security reasons.\n");
441 return -EPERM;
442 }
443 printk(XENLOG_G_WARNING
444 "Dom%d may compromise security on this CPU.\n",
445 d->domain_id);
446 }
447
448 if ( is_idle_domain(d) )
449 {
450 d->arch.emulation_flags = 0;
451 d->arch.cpuid = ZERO_BLOCK_PTR; /* Catch stray misuses. */
452 d->arch.msr = ZERO_BLOCK_PTR;
453 }
454 else
455 {
456 uint32_t emflags;
457
458 if ( is_hardware_domain(d) && is_pv_domain(d) )
459 config->emulation_flags |= XEN_X86_EMU_PIT;
460
461 emflags = config->emulation_flags;
462 if ( emflags & ~XEN_X86_EMU_ALL )
463 {
464 printk(XENLOG_G_ERR "d%d: Invalid emulation bitmap: %#x\n",
465 d->domain_id, emflags);
466 return -EINVAL;
467 }
468
469 if ( !emulation_flags_ok(d, emflags) )
470 {
471 printk(XENLOG_G_ERR "d%d: Xen does not allow %s domain creation "
472 "with the current selection of emulators: %#x\n",
473 d->domain_id, is_hvm_domain(d) ? "HVM" : "PV", emflags);
474 return -EOPNOTSUPP;
475 }
476 d->arch.emulation_flags = emflags;
477 }
478
479 mapcache_domain_init(d);
480
481 HYPERVISOR_COMPAT_VIRT_START(d) =
482 is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
483
484 if ( !is_idle_domain(d) )
485 {
486 /* Need to determine if HAP is enabled before initialising paging */
487 if ( is_hvm_domain(d) )
488 d->arch.hvm_domain.hap_enabled =
489 hvm_funcs.hap_supported && (domcr_flags & DOMCRF_hap);
490
491 if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
492 goto fail;
493 paging_initialised = 1;
494
495 if ( (rc = init_domain_cpuid_policy(d)) )
496 goto fail;
497
498 if ( (rc = init_domain_msr_policy(d)) )
499 goto fail;
500
501 d->arch.ioport_caps =
502 rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
503 rc = -ENOMEM;
504 if ( d->arch.ioport_caps == NULL )
505 goto fail;
506
507 /*
508 * The shared_info machine address must fit in a 32-bit field within a
509 * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
510 */
511 if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
512 goto fail;
513
514 clear_page(d->shared_info);
515 share_xen_page_with_guest(
516 virt_to_page(d->shared_info), d, XENSHARE_writable);
517
518 if ( (rc = init_domain_irq_mapping(d)) != 0 )
519 goto fail;
520
521 if ( (rc = iommu_domain_init(d)) != 0 )
522 goto fail;
523 }
524 spin_lock_init(&d->arch.e820_lock);
525
526 psr_domain_init(d);
527
528 if ( is_hvm_domain(d) )
529 {
530 if ( (rc = hvm_domain_initialise(d, domcr_flags, config)) != 0 )
531 goto fail;
532 }
533 else if ( is_idle_domain(d) )
534 {
535 static const struct arch_csw idle_csw = {
536 .from = paravirt_ctxt_switch_from,
537 .to = paravirt_ctxt_switch_to,
538 .tail = continue_idle_domain,
539 };
540
541 d->arch.ctxt_switch = &idle_csw;
542 }
543 else
544 {
545 if ( (rc = pv_domain_initialise(d, domcr_flags, config)) != 0 )
546 goto fail;
547 }
548
549 /* initialize default tsc behavior in case tools don't */
550 tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
551 spin_lock_init(&d->arch.vtsc_lock);
552
553 /* PV/PVH guests get an emulated PIT too for video BIOSes to use. */
554 pit_init(d, cpu_khz);
555
556 /*
557 * If the FPU does not save FCS/FDS then we can always
558 * save/restore the 64-bit FIP/FDP and ignore the selectors.
559 */
560 d->arch.x87_fip_width = cpu_has_fpu_sel ? 0 : 8;
561
562 return 0;
563
564 fail:
565 d->is_dying = DOMDYING_dead;
566 psr_domain_free(d);
567 iommu_domain_destroy(d);
568 cleanup_domain_irq_mapping(d);
569 free_xenheap_page(d->shared_info);
570 xfree(d->arch.cpuid);
571 xfree(d->arch.msr);
572 if ( paging_initialised )
573 paging_final_teardown(d);
574 free_perdomain_mappings(d);
575
576 return rc;
577 }
578
579 void arch_domain_destroy(struct domain *d)
580 {
581 if ( is_hvm_domain(d) )
582 hvm_domain_destroy(d);
583
584 xfree(d->arch.e820);
585 xfree(d->arch.cpuid);
586 xfree(d->arch.msr);
587
588 free_domain_pirqs(d);
589 if ( !is_idle_domain(d) )
590 iommu_domain_destroy(d);
591
592 paging_final_teardown(d);
593
594 if ( is_pv_domain(d) )
595 pv_domain_destroy(d);
596 free_perdomain_mappings(d);
597
598 free_xenheap_page(d->shared_info);
599 cleanup_domain_irq_mapping(d);
600
601 psr_domain_free(d);
602 }
603
604 void arch_domain_shutdown(struct domain *d)
605 {
606 if ( has_viridian_time_ref_count(d) )
607 viridian_time_ref_count_freeze(d);
608 }
609
610 void arch_domain_pause(struct domain *d)
611 {
612 if ( has_viridian_time_ref_count(d) )
613 viridian_time_ref_count_freeze(d);
614 }
615
616 void arch_domain_unpause(struct domain *d)
617 {
618 if ( has_viridian_time_ref_count(d) )
619 viridian_time_ref_count_thaw(d);
620 }
621
622 int arch_domain_soft_reset(struct domain *d)
623 {
624 struct page_info *page = virt_to_page(d->shared_info), *new_page;
625 int ret = 0;
626 struct domain *owner;
627 unsigned long mfn, gfn;
628 p2m_type_t p2mt;
629 unsigned int i;
630
631 /* Soft reset is supported for HVM domains only. */
632 if ( !is_hvm_domain(d) )
633 return -EINVAL;
634
635 hvm_domain_soft_reset(d);
636
637 spin_lock(&d->event_lock);
638 for ( i = 0; i < d->nr_pirqs ; i++ )
639 {
640 if ( domain_pirq_to_emuirq(d, i) != IRQ_UNBOUND )
641 {
642 ret = unmap_domain_pirq_emuirq(d, i);
643 if ( ret )
644 break;
645 }
646 }
647 spin_unlock(&d->event_lock);
648
649 if ( ret )
650 return ret;
651
652 /*
653 * The shared_info page needs to be replaced with a new page, otherwise we
654 * will get a hole if the domain does XENMAPSPACE_shared_info.
655 */
656
657 owner = page_get_owner_and_reference(page);
658 ASSERT( owner == d );
659
660 mfn = page_to_mfn(page);
661 gfn = mfn_to_gmfn(d, mfn);
662
663 /*
664 * gfn == INVALID_GFN indicates that the shared_info page was never mapped
665 * to the domain's address space and there is nothing to replace.
666 */
667 if ( gfn == gfn_x(INVALID_GFN) )
668 goto exit_put_page;
669
670 if ( mfn_x(get_gfn_query(d, gfn, &p2mt)) != mfn )
671 {
672 printk(XENLOG_G_ERR "Failed to get Dom%d's shared_info GFN (%lx)\n",
673 d->domain_id, gfn);
674 ret = -EINVAL;
675 goto exit_put_page;
676 }
677
678 new_page = alloc_domheap_page(d, 0);
679 if ( !new_page )
680 {
681 printk(XENLOG_G_ERR "Failed to alloc a page to replace"
682 " Dom%d's shared_info frame %lx\n", d->domain_id, gfn);
683 ret = -ENOMEM;
684 goto exit_put_gfn;
685 }
686
687 ret = guest_physmap_remove_page(d, _gfn(gfn), _mfn(mfn), PAGE_ORDER_4K);
688 if ( ret )
689 {
690 printk(XENLOG_G_ERR "Failed to remove Dom%d's shared_info frame %lx\n",
691 d->domain_id, gfn);
692 free_domheap_page(new_page);
693 goto exit_put_gfn;
694 }
695
696 ret = guest_physmap_add_page(d, _gfn(gfn), _mfn(page_to_mfn(new_page)),
697 PAGE_ORDER_4K);
698 if ( ret )
699 {
700 printk(XENLOG_G_ERR "Failed to add a page to replace"
701 " Dom%d's shared_info frame %lx\n", d->domain_id, gfn);
702 free_domheap_page(new_page);
703 }
704 exit_put_gfn:
705 put_gfn(d, gfn);
706 exit_put_page:
707 put_page(page);
708
709 return ret;
710 }
711
712 /*
713 * These are the masks of CR4 bits (subject to hardware availability) which a
714 * PV guest may not legitimately attempt to modify.
715 */
716 static unsigned long __read_mostly pv_cr4_mask, compat_pv_cr4_mask;
717
718 static int __init init_pv_cr4_masks(void)
719 {
720 unsigned long common_mask = ~X86_CR4_TSD;
721
722 /*
723 * All PV guests may attempt to modify TSD, DE and OSXSAVE.
724 */
725 if ( cpu_has_de )
726 common_mask &= ~X86_CR4_DE;
727 if ( cpu_has_xsave )
728 common_mask &= ~X86_CR4_OSXSAVE;
729
730 pv_cr4_mask = compat_pv_cr4_mask = common_mask;
731
732 /*
733 * 64bit PV guests may attempt to modify FSGSBASE.
734 */
735 if ( cpu_has_fsgsbase )
736 pv_cr4_mask &= ~X86_CR4_FSGSBASE;
737
738 return 0;
739 }
740 __initcall(init_pv_cr4_masks);
741
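/*
 * Audit a PV guest's requested CR4 value: bits covered by the applicable
 * mask are forced to Xen's current settings (with a warning if the guest
 * tried to change them), while bits outside the mask are taken from the
 * guest-supplied value.
 */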
742 unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
743 {
744 unsigned long hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
745 unsigned long mask = is_pv_32bit_vcpu(v) ? compat_pv_cr4_mask : pv_cr4_mask;
746
747 if ( (guest_cr4 & mask) != (hv_cr4 & mask) )
748 printk(XENLOG_G_WARNING
749 "d%d attempted to change %pv's CR4 flags %08lx -> %08lx\n",
750 current->domain->domain_id, v, hv_cr4, guest_cr4);
751
752 return (hv_cr4 & mask) | (guest_cr4 & ~mask);
753 }
754
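/*
 * Build-time check that the native and compat vcpu_guest_context layouts
 * agree on the FPU save area; fpu_ctxt is temporarily redirected to its
 * embedded byte array so CHECK_FIELD_() can compare the two definitions.
 */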
755 #define xen_vcpu_guest_context vcpu_guest_context
756 #define fpu_ctxt fpu_ctxt.x
757 CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt);
758 #undef fpu_ctxt
759 #undef xen_vcpu_guest_context
760
761 /* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. */
762 int arch_set_info_guest(
763 struct vcpu *v, vcpu_guest_context_u c)
764 {
765 struct domain *d = v->domain;
766 unsigned long cr3_gfn;
767 struct page_info *cr3_page;
768 unsigned long flags, cr4;
769 unsigned int i;
770 int rc = 0, compat;
771
772 /* The context is a compat-mode one if the target domain is compat-mode;
773 * we expect the tools to DTRT even in compat-mode callers. */
774 compat = is_pv_32bit_domain(d);
775
776 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
777 flags = c(flags);
778
779 if ( is_pv_domain(d) )
780 {
781 if ( !compat )
782 {
783 if ( !is_canonical_address(c.nat->user_regs.rip) ||
784 !is_canonical_address(c.nat->user_regs.rsp) ||
785 !is_canonical_address(c.nat->kernel_sp) ||
786 (c.nat->ldt_ents && !is_canonical_address(c.nat->ldt_base)) ||
787 !is_canonical_address(c.nat->fs_base) ||
788 !is_canonical_address(c.nat->gs_base_kernel) ||
789 !is_canonical_address(c.nat->gs_base_user) ||
790 !is_canonical_address(c.nat->event_callback_eip) ||
791 !is_canonical_address(c.nat->syscall_callback_eip) ||
792 !is_canonical_address(c.nat->failsafe_callback_eip) )
793 return -EINVAL;
794
795 fixup_guest_stack_selector(d, c.nat->user_regs.ss);
796 fixup_guest_stack_selector(d, c.nat->kernel_ss);
797 fixup_guest_code_selector(d, c.nat->user_regs.cs);
798
799 for ( i = 0; i < ARRAY_SIZE(c.nat->trap_ctxt); i++ )
800 {
801 if ( !is_canonical_address(c.nat->trap_ctxt[i].address) )
802 return -EINVAL;
803 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
804 }
805
806 if ( !__addr_ok(c.nat->ldt_base) )
807 return -EINVAL;
808 }
809 else
810 {
811 fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
812 fixup_guest_stack_selector(d, c.cmp->kernel_ss);
813 fixup_guest_code_selector(d, c.cmp->user_regs.cs);
814 fixup_guest_code_selector(d, c.cmp->event_callback_cs);
815 fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
816
817 for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ )
818 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
819 }
820
821 /* LDT safety checks. */
822 if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
823 (c(ldt_ents) > 8192) )
824 return -EINVAL;
825 }
826
827 v->fpu_initialised = !!(flags & VGCF_I387_VALID);
828
829 v->arch.flags &= ~TF_kernel_mode;
830 if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
831 v->arch.flags |= TF_kernel_mode;
832
833 v->arch.vgc_flags = flags;
834
835 if ( flags & VGCF_I387_VALID )
836 {
837 memcpy(v->arch.fpu_ctxt, &c.nat->fpu_ctxt, sizeof(c.nat->fpu_ctxt));
838 if ( v->arch.xsave_area )
839 v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;
840 }
841 else if ( v->arch.xsave_area )
842 {
843 v->arch.xsave_area->xsave_hdr.xstate_bv = 0;
844 v->arch.xsave_area->fpu_sse.mxcsr = MXCSR_DEFAULT;
845 }
846 else
847 {
848 typeof(v->arch.xsave_area->fpu_sse) *fpu_sse = v->arch.fpu_ctxt;
849
850 memset(fpu_sse, 0, sizeof(*fpu_sse));
851 fpu_sse->fcw = FCW_DEFAULT;
852 fpu_sse->mxcsr = MXCSR_DEFAULT;
853 }
854 if ( v->arch.xsave_area )
855 v->arch.xsave_area->xsave_hdr.xcomp_bv = 0;
856
857 if ( !compat )
858 {
859 memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
860 if ( is_pv_domain(d) )
861 memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt,
862 sizeof(c.nat->trap_ctxt));
863 }
864 else
865 {
866 XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs);
867 if ( is_pv_domain(d) )
868 {
869 for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
870 XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
871 c.cmp->trap_ctxt + i);
872 }
873 }
874
875 if ( is_hvm_domain(d) )
876 {
877 for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
878 v->arch.debugreg[i] = c(debugreg[i]);
879
880 hvm_set_info_guest(v);
881 goto out;
882 }
883
884 init_int80_direct_trap(v);
885
886 /* IOPL privileges are virtualised. */
887 v->arch.pv_vcpu.iopl = v->arch.user_regs.eflags & X86_EFLAGS_IOPL;
888 v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL;
889
890 /* Ensure real hardware interrupts are enabled. */
891 v->arch.user_regs.eflags |= X86_EFLAGS_IF;
892
893 if ( !v->is_initialised )
894 {
895 if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
896 return -EINVAL;
897
898 v->arch.pv_vcpu.ldt_base = c(ldt_base);
899 v->arch.pv_vcpu.ldt_ents = c(ldt_ents);
900 }
901 else
902 {
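/*
 * The vcpu has already been initialised: refuse any context that would
 * change its paging or descriptor-table setup (CR3s, GDT frames, LDT)
 * behind its back.
 */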
903 unsigned long pfn = pagetable_get_pfn(v->arch.guest_table);
904 bool fail;
905
906 if ( !compat )
907 {
908 fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3];
909 if ( pagetable_is_null(v->arch.guest_table_user) )
910 fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel);
911 else
912 {
913 pfn = pagetable_get_pfn(v->arch.guest_table_user);
914 fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1];
915 }
916 } else {
917 l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn));
918
919 pfn = l4e_get_pfn(*l4tab);
920 unmap_domain_page(l4tab);
921 fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3];
922 }
923
924 for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i )
925 fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]);
926 fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents);
927
928 fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base);
929 fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents);
930
931 if ( fail )
932 return -EOPNOTSUPP;
933 }
934
935 v->arch.pv_vcpu.kernel_ss = c(kernel_ss);
936 v->arch.pv_vcpu.kernel_sp = c(kernel_sp);
937 for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i )
938 v->arch.pv_vcpu.ctrlreg[i] = c(ctrlreg[i]);
939
940 v->arch.pv_vcpu.event_callback_eip = c(event_callback_eip);
941 v->arch.pv_vcpu.failsafe_callback_eip = c(failsafe_callback_eip);
942 if ( !compat )
943 {
944 v->arch.pv_vcpu.syscall_callback_eip = c.nat->syscall_callback_eip;
945 v->arch.pv_vcpu.fs_base = c.nat->fs_base;
946 v->arch.pv_vcpu.gs_base_kernel = c.nat->gs_base_kernel;
947 v->arch.pv_vcpu.gs_base_user = c.nat->gs_base_user;
948 }
949 else
950 {
951 v->arch.pv_vcpu.event_callback_cs = c(event_callback_cs);
952 v->arch.pv_vcpu.failsafe_callback_cs = c(failsafe_callback_cs);
953 }
954
955 /* Only CR0.TS is modifiable by guest or admin. */
956 v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS;
957 v->arch.pv_vcpu.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
958
959 cr4 = v->arch.pv_vcpu.ctrlreg[4];
960 v->arch.pv_vcpu.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) :
961 real_cr4_to_pv_guest_cr4(mmu_cr4_features);
962
963 memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg));
964 for ( i = 0; i < 8; i++ )
965 (void)set_debugreg(v, i, c(debugreg[i]));
966
967 if ( v->is_initialised )
968 goto out;
969
970 if ( v->vcpu_id == 0 )
971 {
972 /*
973 * In the restore case we need to deal with L4 pages which got
974 * initialized with m2p_strict still clear (and which hence lack the
975 * correct initial RO_MPT_VIRT_{START,END} L4 entry).
976 */
977 if ( d != current->domain && !VM_ASSIST(d, m2p_strict) &&
978 is_pv_domain(d) && !is_pv_32bit_domain(d) &&
979 test_bit(VMASST_TYPE_m2p_strict, &c.nat->vm_assist) &&
980 atomic_read(&d->arch.pv_domain.nr_l4_pages) )
981 {
982 bool done = false;
983
984 spin_lock_recursive(&d->page_alloc_lock);
985
986 for ( i = 0; ; )
987 {
988 struct page_info *page = page_list_remove_head(&d->page_list);
989
990 if ( page_lock(page) )
991 {
992 if ( (page->u.inuse.type_info & PGT_type_mask) ==
993 PGT_l4_page_table )
994 done = !fill_ro_mpt(_mfn(page_to_mfn(page)));
995
996 page_unlock(page);
997 }
998
999 page_list_add_tail(page, &d->page_list);
1000
1001 if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
1002 break;
1003 }
1004
1005 spin_unlock_recursive(&d->page_alloc_lock);
1006
1007 if ( !done )
1008 return -ERESTART;
1009 }
1010
1011 d->vm_assist = c(vm_assist);
1012 }
1013
1014 rc = put_old_guest_table(current);
1015 if ( rc )
1016 return rc;
1017
1018 if ( !compat )
1019 rc = (int)pv_set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
1020 else
1021 {
1022 unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames)];
1023 unsigned int n = (c.cmp->gdt_ents + 511) / 512;
1024
1025 if ( n > ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames) )
1026 return -EINVAL;
1027 for ( i = 0; i < n; ++i )
1028 gdt_frames[i] = c.cmp->gdt_frames[i];
1029 rc = (int)pv_set_gdt(v, gdt_frames, c.cmp->gdt_ents);
1030 }
1031 if ( rc != 0 )
1032 return rc;
1033
1034 set_bit(_VPF_in_reset, &v->pause_flags);
1035
1036 if ( !compat )
1037 cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]);
1038 else
1039 cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]);
1040 cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
1041
1042 if ( !cr3_page )
1043 rc = -EINVAL;
1044 else if ( paging_mode_refcounts(d) )
1045 /* nothing */;
1046 else if ( cr3_page == v->arch.old_guest_table )
1047 {
1048 v->arch.old_guest_table = NULL;
1049 put_page(cr3_page);
1050 }
1051 else
1052 {
1053 if ( !compat )
1054 rc = put_old_guest_table(v);
1055 if ( !rc )
1056 rc = get_page_type_preemptible(cr3_page,
1057 !compat ? PGT_root_page_table
1058 : PGT_l3_page_table);
1059 switch ( rc )
1060 {
1061 case -EINTR:
1062 rc = -ERESTART;
1063 case -ERESTART:
1064 break;
1065 case 0:
1066 if ( !compat && !VM_ASSIST(d, m2p_strict) &&
1067 !paging_mode_refcounts(d) )
1068 fill_ro_mpt(_mfn(cr3_gfn));
1069 break;
1070 default:
1071 if ( cr3_page == current->arch.old_guest_table )
1072 cr3_page = NULL;
1073 break;
1074 }
1075 }
1076 if ( rc )
1077 /* handled below */;
1078 else if ( !compat )
1079 {
1080 v->arch.guest_table = pagetable_from_page(cr3_page);
1081 if ( c.nat->ctrlreg[1] )
1082 {
1083 cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]);
1084 cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
1085
1086 if ( !cr3_page )
1087 rc = -EINVAL;
1088 else if ( !paging_mode_refcounts(d) )
1089 {
1090 rc = get_page_type_preemptible(cr3_page, PGT_root_page_table);
1091 switch ( rc )
1092 {
1093 case -EINTR:
1094 rc = -ERESTART;
1095 /* Fallthrough */
1096 case -ERESTART:
1097 v->arch.old_guest_ptpg = NULL;
1098 v->arch.old_guest_table =
1099 pagetable_get_page(v->arch.guest_table);
1100 v->arch.guest_table = pagetable_null();
1101 break;
1102 default:
1103 if ( cr3_page == current->arch.old_guest_table )
1104 cr3_page = NULL;
1105 break;
1106 case 0:
1107 if ( VM_ASSIST(d, m2p_strict) )
1108 zap_ro_mpt(_mfn(cr3_gfn));
1109 break;
1110 }
1111 }
1112 if ( !rc )
1113 v->arch.guest_table_user = pagetable_from_page(cr3_page);
1114 }
1115 }
1116 else
1117 {
1118 l4_pgentry_t *l4tab;
1119
1120 l4tab = map_domain_page(pagetable_get_mfn(v->arch.guest_table));
1121 *l4tab = l4e_from_pfn(page_to_mfn(cr3_page),
1122 _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
1123 unmap_domain_page(l4tab);
1124 }
1125 if ( rc )
1126 {
1127 if ( cr3_page )
1128 put_page(cr3_page);
1129 pv_destroy_gdt(v);
1130 return rc;
1131 }
1132
1133 clear_bit(_VPF_in_reset, &v->pause_flags);
1134
1135 if ( v->vcpu_id == 0 )
1136 update_domain_wallclock_time(d);
1137
1138 /* Don't redo final setup */
1139 v->is_initialised = 1;
1140
1141 if ( paging_mode_enabled(d) )
1142 paging_update_paging_modes(v);
1143
1144 update_cr3(v);
1145
1146 out:
1147 if ( flags & VGCF_online )
1148 clear_bit(_VPF_down, &v->pause_flags);
1149 else
1150 set_bit(_VPF_down, &v->pause_flags);
1151 return 0;
1152 #undef c
1153 }
1154
1155 int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1156 {
1157 int rc;
1158
1159 if ( is_hvm_vcpu(v) )
1160 {
1161 struct domain *d = v->domain;
1162 struct vcpu_hvm_context ctxt;
1163
1164 if ( copy_from_guest(&ctxt, arg, 1) )
1165 return -EFAULT;
1166
1167 domain_lock(d);
1168 rc = v->is_initialised ? -EEXIST : arch_set_info_hvm_guest(v, &ctxt);
1169 domain_unlock(d);
1170 }
1171 else
1172 rc = default_initialise_vcpu(v, arg);
1173
1174 return rc;
1175 }
1176
1177 int arch_vcpu_reset(struct vcpu *v)
1178 {
1179 if ( is_pv_vcpu(v) )
1180 {
1181 pv_destroy_gdt(v);
1182 return vcpu_destroy_pagetables(v);
1183 }
1184
1185 vcpu_end_shutdown_deferral(v);
1186 return 0;
1187 }
1188
1189 long
1190 arch_do_vcpu_op(
1191 int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1192 {
1193 long rc = 0;
1194
1195 switch ( cmd )
1196 {
1197 case VCPUOP_register_vcpu_time_memory_area:
1198 {
1199 struct vcpu_register_time_memory_area area;
1200
1201 rc = -EFAULT;
1202 if ( copy_from_guest(&area, arg, 1) )
1203 break;
1204
1205 if ( !guest_handle_okay(area.addr.h, 1) )
1206 break;
1207
1208 rc = 0;
1209 v->arch.time_info_guest = area.addr.h;
1210
1211 force_update_vcpu_system_time(v);
1212
1213 break;
1214 }
1215
1216 case VCPUOP_get_physid:
1217 {
1218 struct vcpu_get_physid cpu_id;
1219
1220 rc = -EINVAL;
1221 if ( !is_pinned_vcpu(v) )
1222 break;
1223
1224 cpu_id.phys_id =
1225 (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
1226 ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
1227
1228 rc = -EFAULT;
1229 if ( copy_to_guest(arg, &cpu_id, 1) )
1230 break;
1231
1232 rc = 0;
1233 break;
1234 }
1235
1236 default:
1237 rc = -ENOSYS;
1238 break;
1239 }
1240
1241 return rc;
1242 }
1243
1244 /*
1245 * Loading a nul selector does not clear bases and limits on AMD CPUs. Be on
1246 * the safe side and re-initialize both to flat segment values before loading
1247 * a nul selector.
1248 */
1249 #define preload_segment(seg, value) do { \
1250 if ( !((value) & ~3) && \
1251 boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) \
1252 asm volatile ( "movl %k0, %%" #seg \
1253 :: "r" (FLAT_USER_DS32) ); \
1254 } while ( false )
1255
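/*
 * Load a selector with fault recovery: evaluates to 1 on success, or to 0
 * if the load faulted, in which case the fixup path loads a nul selector
 * instead.
 */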
1256 #define loadsegment(seg,value) ({ \
1257 int __r = 1; \
1258 asm volatile ( \
1259 "1: movl %k1,%%" #seg "\n2:\n" \
1260 ".section .fixup,\"ax\"\n" \
1261 "3: xorl %k0,%k0\n" \
1262 " movl %k0,%%" #seg "\n" \
1263 " jmp 2b\n" \
1264 ".previous\n" \
1265 _ASM_EXTABLE(1b, 3b) \
1266 : "=r" (__r) : "r" (value), "0" (__r) );\
1267 __r; })
1268
1269 /*
1270 * save_segments() writes a mask of segments which are dirty (non-zero),
1271 * allowing load_segments() to avoid some expensive segment loads and
1272 * MSR writes.
1273 */
1274 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1275 #define DIRTY_DS 0x01
1276 #define DIRTY_ES 0x02
1277 #define DIRTY_FS 0x04
1278 #define DIRTY_GS 0x08
1279 #define DIRTY_FS_BASE 0x10
1280 #define DIRTY_GS_BASE_USER 0x20
1281
1282 static void load_segments(struct vcpu *n)
1283 {
1284 struct cpu_user_regs *uregs = &n->arch.user_regs;
1285 int all_segs_okay = 1;
1286 unsigned int dirty_segment_mask, cpu = smp_processor_id();
1287
1288 /* Load and clear the dirty segment mask. */
1289 dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1290 per_cpu(dirty_segment_mask, cpu) = 0;
1291
1292 /* Either selector != 0 ==> reload. */
1293 if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
1294 {
1295 preload_segment(ds, uregs->ds);
1296 all_segs_okay &= loadsegment(ds, uregs->ds);
1297 }
1298
1299 /* Either selector != 0 ==> reload. */
1300 if ( unlikely((dirty_segment_mask & DIRTY_ES) | uregs->es) )
1301 {
1302 preload_segment(es, uregs->es);
1303 all_segs_okay &= loadsegment(es, uregs->es);
1304 }
1305
1306 /* Either selector != 0 ==> reload. */
1307 if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
1308 {
1309 all_segs_okay &= loadsegment(fs, uregs->fs);
1310 /* non-nul selector updates fs_base */
1311 if ( uregs->fs & ~3 )
1312 dirty_segment_mask &= ~DIRTY_FS_BASE;
1313 }
1314
1315 /* Either selector != 0 ==> reload. */
1316 if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
1317 {
1318 all_segs_okay &= loadsegment(gs, uregs->gs);
1319 /* non-nul selector updates gs_base_user */
1320 if ( uregs->gs & ~3 )
1321 dirty_segment_mask &= ~DIRTY_GS_BASE_USER;
1322 }
1323
1324 if ( !is_pv_32bit_vcpu(n) )
1325 {
1326 /* This can only be non-zero if selector is NULL. */
1327 if ( n->arch.pv_vcpu.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
1328 wrfsbase(n->arch.pv_vcpu.fs_base);
1329
1330 /* Most kernels have non-zero GS base, so don't bother testing. */
1331 /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1332 wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv_vcpu.gs_base_kernel);
1333
1334 /* This can only be non-zero if selector is NULL. */
1335 if ( n->arch.pv_vcpu.gs_base_user |
1336 (dirty_segment_mask & DIRTY_GS_BASE_USER) )
1337 wrgsbase(n->arch.pv_vcpu.gs_base_user);
1338
1339 /* If in kernel mode then switch the GS bases around. */
1340 if ( (n->arch.flags & TF_kernel_mode) )
1341 asm volatile ( "swapgs" );
1342 }
1343
1344 if ( unlikely(!all_segs_okay) )
1345 {
1346 struct pv_vcpu *pv = &n->arch.pv_vcpu;
1347 struct cpu_user_regs *regs = guest_cpu_user_regs();
1348 unsigned long *rsp =
1349 (unsigned long *)(((n->arch.flags & TF_kernel_mode)
1350 ? regs->rsp : pv->kernel_sp) & ~0xf);
1351 unsigned long cs_and_mask, rflags;
1352
1353 /* Fold upcall mask and architectural IOPL into RFLAGS.IF. */
1354 rflags = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1355 rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1356 if ( VM_ASSIST(n->domain, architectural_iopl) )
1357 rflags |= n->arch.pv_vcpu.iopl;
1358
1359 if ( is_pv_32bit_vcpu(n) )
1360 {
1361 unsigned int *esp = ring_1(regs) ?
1362 (unsigned int *)regs->rsp :
1363 (unsigned int *)pv->kernel_sp;
1364 int ret = 0;
1365
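/*
 * Build the compat failsafe callback frame: if not already in ring 1,
 * switch to the guest kernel stack and push ss/esp, then eflags, cs (with
 * the upcall mask folded into the upper 16 bits), eip and the four data
 * selectors.
 */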
1366 /* CS longword also contains full evtchn_upcall_mask. */
1367 cs_and_mask = (unsigned short)regs->cs |
1368 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1369
1370 if ( !ring_1(regs) )
1371 {
1372 ret = put_user(regs->ss, esp-1);
1373 ret |= put_user(regs->esp, esp-2);
1374 esp -= 2;
1375 }
1376
1377 if ( ret |
1378 put_user(rflags, esp-1) |
1379 put_user(cs_and_mask, esp-2) |
1380 put_user(regs->eip, esp-3) |
1381 put_user(uregs->gs, esp-4) |
1382 put_user(uregs->fs, esp-5) |
1383 put_user(uregs->es, esp-6) |
1384 put_user(uregs->ds, esp-7) )
1385 {
1386 gprintk(XENLOG_ERR,
1387 "error while creating compat failsafe callback frame\n");
1388 domain_crash(n->domain);
1389 }
1390
1391 if ( n->arch.vgc_flags & VGCF_failsafe_disables_events )
1392 vcpu_info(n, evtchn_upcall_mask) = 1;
1393
1394 regs->entry_vector |= TRAP_syscall;
1395 regs->eflags &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT|
1396 X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1397 regs->ss = FLAT_COMPAT_KERNEL_SS;
1398 regs->esp = (unsigned long)(esp-7);
1399 regs->cs = FLAT_COMPAT_KERNEL_CS;
1400 regs->eip = pv->failsafe_callback_eip;
1401 return;
1402 }
1403
1404 if ( !(n->arch.flags & TF_kernel_mode) )
1405 toggle_guest_mode(n);
1406 else
1407 regs->cs &= ~3;
1408
1409 /* CS longword also contains full evtchn_upcall_mask. */
1410 cs_and_mask = (unsigned long)regs->cs |
1411 ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1412
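/*
 * Build the 64-bit failsafe callback frame: ss, rsp, rflags, cs (with the
 * upcall mask in the upper 32 bits), rip, the four data selectors, and
 * r11/rcx.
 */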
1413 if ( put_user(regs->ss, rsp- 1) |
1414 put_user(regs->rsp, rsp- 2) |
1415 put_user(rflags, rsp- 3) |
1416 put_user(cs_and_mask, rsp- 4) |
1417 put_user(regs->rip, rsp- 5) |
1418 put_user(uregs->gs, rsp- 6) |
1419 put_user(uregs->fs, rsp- 7) |
1420 put_user(uregs->es, rsp- 8) |
1421 put_user(uregs->ds, rsp- 9) |
1422 put_user(regs->r11, rsp-10) |
1423 put_user(regs->rcx, rsp-11) )
1424 {
1425 gprintk(XENLOG_ERR,
1426 "error while creating failsafe callback frame\n");
1427 domain_crash(n->domain);
1428 }
1429
1430 if ( n->arch.vgc_flags & VGCF_failsafe_disables_events )
1431 vcpu_info(n, evtchn_upcall_mask) = 1;
1432
1433 regs->entry_vector |= TRAP_syscall;
1434 regs->rflags &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1435 X86_EFLAGS_NT|X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1436 regs->ss = FLAT_KERNEL_SS;
1437 regs->rsp = (unsigned long)(rsp-11);
1438 regs->cs = FLAT_KERNEL_CS;
1439 regs->rip = pv->failsafe_callback_eip;
1440 }
1441 }
1442
1443 static void save_segments(struct vcpu *v)
1444 {
1445 struct cpu_user_regs *regs = &v->arch.user_regs;
1446 unsigned int dirty_segment_mask = 0;
1447
1448 regs->ds = read_sreg(ds);
1449 regs->es = read_sreg(es);
1450 regs->fs = read_sreg(fs);
1451 regs->gs = read_sreg(gs);
1452
1453 if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
1454 {
1455 v->arch.pv_vcpu.fs_base = __rdfsbase();
1456 if ( v->arch.flags & TF_kernel_mode )
1457 v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
1458 else
1459 v->arch.pv_vcpu.gs_base_user = __rdgsbase();
1460 }
1461
1462 if ( regs->ds )
1463 dirty_segment_mask |= DIRTY_DS;
1464
1465 if ( regs->es )
1466 dirty_segment_mask |= DIRTY_ES;
1467
1468 if ( regs->fs || is_pv_32bit_vcpu(v) )
1469 {
1470 dirty_segment_mask |= DIRTY_FS;
1471 /* non-nul selector kills fs_base */
1472 if ( regs->fs & ~3 )
1473 v->arch.pv_vcpu.fs_base = 0;
1474 }
1475 if ( v->arch.pv_vcpu.fs_base )
1476 dirty_segment_mask |= DIRTY_FS_BASE;
1477
1478 if ( regs->gs || is_pv_32bit_vcpu(v) )
1479 {
1480 dirty_segment_mask |= DIRTY_GS;
1481 /* non-nul selector kills gs_base_user */
1482 if ( regs->gs & ~3 )
1483 v->arch.pv_vcpu.gs_base_user = 0;
1484 }
1485 if ( v->arch.flags & TF_kernel_mode ? v->arch.pv_vcpu.gs_base_kernel
1486 : v->arch.pv_vcpu.gs_base_user )
1487 dirty_segment_mask |= DIRTY_GS_BASE_USER;
1488
1489 this_cpu(dirty_segment_mask) = dirty_segment_mask;
1490 }
1491
1492 void paravirt_ctxt_switch_from(struct vcpu *v)
1493 {
1494 save_segments(v);
1495
1496 /*
1497 * Disable debug breakpoints. We do this aggressively because if we switch
1498 * to an HVM guest we may load DR0-DR3 with values that can cause #DE
1499 * inside Xen, before we get a chance to reload DR7, and this cannot always
1500 * safely be handled.
1501 */
1502 if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
1503 write_debugreg(7, 0);
1504 }
1505
1506 void paravirt_ctxt_switch_to(struct vcpu *v)
1507 {
1508 unsigned long cr4;
1509
1510 cr4 = pv_guest_cr4_to_real_cr4(v);
1511 if ( unlikely(cr4 != read_cr4()) )
1512 write_cr4(cr4);
1513
1514 if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
1515 activate_debugregs(v);
1516
1517 if ( (v->domain->arch.tsc_mode == TSC_MODE_PVRDTSCP) &&
1518 boot_cpu_has(X86_FEATURE_RDTSCP) )
1519 write_rdtscp_aux(v->domain->arch.incarnation);
1520 }
1521
1522 /* Update per-VCPU guest runstate shared memory area (if registered). */
1523 bool update_runstate_area(struct vcpu *v)
1524 {
1525 bool rc;
1526 struct guest_memory_policy policy =
1527 { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false };
1528 void __user *guest_handle = NULL;
1529
1530 if ( guest_handle_is_null(runstate_guest(v)) )
1531 return true;
1532
1533 update_guest_memory_policy(v, &policy);
1534
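/*
 * If the guest asked for it, follow the XEN_RUNSTATE_UPDATE protocol: set
 * the update bit in the top byte of state_entry_time, barrier, copy the new
 * runstate, then barrier and clear the bit so the guest can detect a torn
 * read.
 */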
1535 if ( VM_ASSIST(v->domain, runstate_update_flag) )
1536 {
1537 guest_handle = has_32bit_shinfo(v->domain)
1538 ? &v->runstate_guest.compat.p->state_entry_time + 1
1539 : &v->runstate_guest.native.p->state_entry_time + 1;
1540 guest_handle--;
1541 v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
1542 __raw_copy_to_guest(guest_handle,
1543 (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
1544 smp_wmb();
1545 }
1546
1547 if ( has_32bit_shinfo(v->domain) )
1548 {
1549 struct compat_vcpu_runstate_info info;
1550
1551 XLAT_vcpu_runstate_info(&info, &v->runstate);
1552 __copy_to_guest(v->runstate_guest.compat, &info, 1);
1553 rc = true;
1554 }
1555 else
1556 rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) !=
1557 sizeof(v->runstate);
1558
1559 if ( guest_handle )
1560 {
1561 v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
1562 smp_wmb();
1563 __raw_copy_to_guest(guest_handle,
1564 (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
1565 }
1566
1567 update_guest_memory_policy(v, &policy);
1568
1569 return rc;
1570 }
1571
1572 static void _update_runstate_area(struct vcpu *v)
1573 {
1574 if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
1575 !(v->arch.flags & TF_kernel_mode) )
1576 v->arch.pv_vcpu.need_update_runstate_area = 1;
1577 }
1578
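/*
 * Only non-idle PV domains carry a full per-domain GDT mapping; HVM vcpus
 * and the idle domain run on Xen's plain per-CPU GDT.
 */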
1579 static inline bool need_full_gdt(const struct domain *d)
1580 {
1581 return is_pv_domain(d) && !is_idle_domain(d);
1582 }
1583
1584 static void __context_switch(void)
1585 {
1586 struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1587 unsigned int cpu = smp_processor_id();
1588 struct vcpu *p = per_cpu(curr_vcpu, cpu);
1589 struct vcpu *n = current;
1590 struct domain *pd = p->domain, *nd = n->domain;
1591 struct desc_struct *gdt;
1592 struct desc_ptr gdt_desc;
1593
1594 ASSERT(p != n);
1595 ASSERT(cpumask_empty(n->vcpu_dirty_cpumask));
1596
1597 if ( !is_idle_domain(pd) )
1598 {
1599 memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES);
1600 vcpu_save_fpu(p);
1601 pd->arch.ctxt_switch->from(p);
1602 }
1603
1604 /*
1605 * Mark this CPU in next domain's dirty cpumasks before calling
1606 * ctxt_switch_to(). This avoids a race on things like EPT flushing,
1607 * which is synchronised on that function.
1608 */
1609 if ( pd != nd )
1610 cpumask_set_cpu(cpu, nd->domain_dirty_cpumask);
1611 cpumask_set_cpu(cpu, n->vcpu_dirty_cpumask);
1612
1613 if ( !is_idle_domain(nd) )
1614 {
1615 memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
1616 if ( cpu_has_xsave )
1617 {
1618 u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;
1619
1620 if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
1621 BUG();
1622
1623 if ( cpu_has_xsaves && is_hvm_vcpu(n) )
1624 set_msr_xss(n->arch.hvm_vcpu.msr_xss);
1625 }
1626 vcpu_restore_fpu_eager(n);
1627 nd->arch.ctxt_switch->to(n);
1628 }
1629
1630 psr_ctxt_switch_to(nd);
1631
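/*
 * Select the native or compat per-CPU GDT for the incoming vcpu. For PV
 * guests needing the full GDT, also point the reserved slots of the
 * per-domain GDT mapping at these per-CPU frames.
 */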
1632 gdt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) :
1633 per_cpu(compat_gdt_table, cpu);
1634 if ( need_full_gdt(nd) )
1635 {
1636 unsigned long mfn = virt_to_mfn(gdt);
1637 l1_pgentry_t *pl1e = pv_gdt_ptes(n);
1638 unsigned int i;
1639
1640 for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
1641 l1e_write(pl1e + FIRST_RESERVED_GDT_PAGE + i,
1642 l1e_from_pfn(mfn + i, __PAGE_HYPERVISOR_RW));
1643 }
1644
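/*
 * If the outgoing vcpu used the full per-domain GDT and the incoming one
 * will not reuse the same virtual mapping, drop back to the plain per-CPU
 * GDT before switching page tables, so the live GDT never points into the
 * old address space.
 */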
1645 if ( need_full_gdt(pd) &&
1646 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
1647 {
1648 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1649 gdt_desc.base = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
1650 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1651 }
1652
1653 write_ptbase(n);
1654
1655 if ( need_full_gdt(nd) &&
1656 ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
1657 {
1658 gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
1659 gdt_desc.base = GDT_VIRT_START(n);
1660 asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
1661 }
1662
1663 if ( pd != nd )
1664 cpumask_clear_cpu(cpu, pd->domain_dirty_cpumask);
1665 cpumask_clear_cpu(cpu, p->vcpu_dirty_cpumask);
1666
1667 per_cpu(curr_vcpu, cpu) = n;
1668 }
1669
1670
1671 void context_switch(struct vcpu *prev, struct vcpu *next)
1672 {
1673 unsigned int cpu = smp_processor_id();
1674 const struct domain *prevd = prev->domain, *nextd = next->domain;
1675 cpumask_t dirty_mask;
1676
1677 ASSERT(local_irq_is_enabled());
1678
1679 cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask);
1680 /* Allow at most one CPU at a time to be dirty. */
1681 ASSERT(cpumask_weight(&dirty_mask) <= 1);
1682 if ( unlikely(!cpumask_test_cpu(cpu, &dirty_mask) &&
1683 !cpumask_empty(&dirty_mask)) )
1684 {
1685 /* Other cpus call __sync_local_execstate from flush ipi handler. */
1686 flush_tlb_mask(&dirty_mask);
1687 }
1688
1689 if ( prev != next )
1690 {
1691 _update_runstate_area(prev);
1692 vpmu_switch_from(prev);
1693 np2m_schedule(NP2M_SCHEDLE_OUT);
1694 }
1695
1696 if ( is_hvm_domain(prevd) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
1697 pt_save_timer(prev);
1698
1699 local_irq_disable();
1700
1701 set_current(next);
1702
1703 if ( (per_cpu(curr_vcpu, cpu) == next) ||
1704 (is_idle_domain(nextd) && cpu_online(cpu)) )
1705 {
1706 local_irq_enable();
1707 }
1708 else
1709 {
1710 __context_switch();
1711
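/*
 * Ensure EFER.SCE is set when entering PV context from somewhere that may
 * have run with it clear (idle, HVM, or a PV guest of different bitness).
 */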
1712 if ( is_pv_domain(nextd) &&
1713 (is_idle_domain(prevd) ||
1714 is_hvm_domain(prevd) ||
1715 is_pv_32bit_domain(prevd) != is_pv_32bit_domain(nextd)) )
1716 {
1717 uint64_t efer = read_efer();
1718 if ( !(efer & EFER_SCE) )
1719 write_efer(efer | EFER_SCE);
1720 }
1721
1722 /* Re-enable interrupts before restoring state which may fault. */
1723 local_irq_enable();
1724
1725 if ( is_pv_domain(nextd) )
1726 {
1727 load_LDT(next);
1728 load_segments(next);
1729 }
1730
1731 ctxt_switch_levelling(next);
1732 }
1733
1734 context_saved(prev);
1735
1736 if ( prev != next )
1737 {
1738 _update_runstate_area(next);
1739
1740 /* Must be done with interrupts enabled */
1741 vpmu_switch_to(next);
1742 np2m_schedule(NP2M_SCHEDLE_IN);
1743 }
1744
1745 /* Ensure that the vcpu has an up-to-date time base. */
1746 update_vcpu_system_time(next);
1747
1748 /*
1749 * Schedule tail *should* be a terminal function pointer, but leave a
1750 * bug frame around just in case it returns, to save going back into the
1751 * context switching code and leaving a far more subtle crash to diagnose.
1752 */
1753 nextd->arch.ctxt_switch->tail(next);
1754 BUG();
1755 }
1756
1757 void continue_running(struct vcpu *same)
1758 {
1759 /* See the comment above. */
1760 same->domain->arch.ctxt_switch->tail(same);
1761 BUG();
1762 }
1763
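/*
 * Xen context switches lazily: after switching to the idle vCPU, the
 * previous guest's state may still be loaded on this CPU. Force it to be
 * saved now if that is the case.
 */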
1764 int __sync_local_execstate(void)
1765 {
1766 unsigned long flags;
1767 int switch_required;
1768
1769 local_irq_save(flags);
1770
1771 switch_required = (this_cpu(curr_vcpu) != current);
1772
1773 if ( switch_required )
1774 {
1775 ASSERT(current == idle_vcpu[smp_processor_id()]);
1776 __context_switch();
1777 }
1778
1779 local_irq_restore(flags);
1780
1781 return switch_required;
1782 }
1783
1784 void sync_local_execstate(void)
1785 {
1786 (void)__sync_local_execstate();
1787 }
1788
1789 void sync_vcpu_execstate(struct vcpu *v)
1790 {
1791 if ( cpumask_test_cpu(smp_processor_id(), v->vcpu_dirty_cpumask) )
1792 sync_local_execstate();
1793
1794 /* Other cpus call __sync_local_execstate from flush ipi handler. */
1795 flush_tlb_mask(v->vcpu_dirty_cpumask);
1796 }
1797
1798 static int relinquish_memory(
1799 struct domain *d, struct page_list_head *list, unsigned long type)
1800 {
1801 struct page_info *page;
1802 unsigned long x, y;
1803 int ret = 0;
1804
1805 /* Use a recursive lock, as we may enter 'free_domheap_page'. */
1806 spin_lock_recursive(&d->page_alloc_lock);
1807
1808 while ( (page = page_list_remove_head(list)) )
1809 {
1810 /* Grab a reference to the page so it won't disappear from under us. */
1811 if ( unlikely(!get_page(page, d)) )
1812 {
1813 /* Couldn't get a reference -- someone is freeing this page. */
1814 page_list_add_tail(page, &d->arch.relmem_list);
1815 continue;
1816 }
1817
1818 if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
1819 ret = put_page_and_type_preemptible(page);
1820 switch ( ret )
1821 {
1822 case 0:
1823 break;
1824 case -ERESTART:
1825 case -EINTR:
1826 ret = -ERESTART;
1827 page_list_add(page, list);
1828 set_bit(_PGT_pinned, &page->u.inuse.type_info);
1829 put_page(page);
1830 goto out;
1831 default:
1832 BUG();
1833 }
1834
1835 if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
1836 put_page(page);
1837
1838 /*
1839 * Forcibly invalidate top-most, still valid page tables at this point
1840 * to break circular 'linear page table' references as well as clean up
1841 * partially validated pages. This is okay because MMU structures are
1842 * not shared across domains and this domain is now dead. Thus top-most
1843 * valid tables are not in use so a non-zero count means circular
1844 * reference or partially validated.
1845 */
1846 y = page->u.inuse.type_info;
1847 for ( ; ; )
1848 {
1849 x = y;
1850 if ( likely((x & PGT_type_mask) != type) ||
1851 likely(!(x & (PGT_validated|PGT_partial))) )
1852 break;
1853
1854 y = cmpxchg(&page->u.inuse.type_info, x,
1855 x & ~(PGT_validated|PGT_partial));
1856 if ( likely(y == x) )
1857 {
1858 /* No need for atomic update of type_info here: no one else updates it. */
1859 switch ( ret = free_page_type(page, x, 1) )
1860 {
1861 case 0:
1862 break;
1863 case -EINTR:
1864 page_list_add(page, list);
1865 page->u.inuse.type_info |= PGT_validated;
1866 if ( x & PGT_partial )
1867 put_page(page);
1868 put_page(page);
1869 ret = -ERESTART;
1870 goto out;
1871 case -ERESTART:
1872 page_list_add(page, list);
1873 page->u.inuse.type_info |= PGT_partial;
1874 if ( x & PGT_partial )
1875 put_page(page);
1876 goto out;
1877 default:
1878 BUG();
1879 }
1880 if ( x & PGT_partial )
1881 {
1882 page->u.inuse.type_info--;
1883 put_page(page);
1884 }
1885 break;
1886 }
1887 }
1888
1889 /* Put the page on the list and /then/ potentially free it. */
1890 page_list_add_tail(page, &d->arch.relmem_list);
1891 put_page(page);
1892
1893 if ( hypercall_preempt_check() )
1894 {
1895 ret = -ERESTART;
1896 goto out;
1897 }
1898 }
1899
1900 /* list is empty at this point. */
1901 page_list_move(list, &d->arch.relmem_list);
1902
1903 out:
1904 spin_unlock_recursive(&d->page_alloc_lock);
1905 return ret;
1906 }
1907
1908 int domain_relinquish_resources(struct domain *d)
1909 {
1910 int ret;
1911 struct vcpu *v;
1912
1913 BUG_ON(!cpumask_empty(d->domain_dirty_cpumask));
1914
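/*
 * Relinquishing is a preemptible state machine: d->arch.relmem records how
 * far we have got, and returning -ERESTART lets the operation be continued
 * from that point on the next invocation.
 */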
1915 switch ( d->arch.relmem )
1916 {
1917 case RELMEM_not_started:
1918 ret = pci_release_devices(d);
1919 if ( ret )
1920 return ret;
1921
1922 /* Tear down paging-assistance stuff. */
1923 ret = paging_teardown(d);
1924 if ( ret )
1925 return ret;
1926
1927 /* Drop the in-use references to page-table bases. */
1928 for_each_vcpu ( d, v )
1929 {
1930 ret = vcpu_destroy_pagetables(v);
1931 if ( ret )
1932 return ret;
1933 }
1934
1935 if ( is_pv_domain(d) )
1936 {
1937 for_each_vcpu ( d, v )
1938 {
1939 /*
1940 * Relinquish GDT mappings. No need for explicit unmapping of
1941 * the LDT as it automatically gets squashed with the guest
1942 * mappings.
1943 */
1944 pv_destroy_gdt(v);
1945 }
1946 }
1947
1948 if ( d->arch.pirq_eoi_map != NULL )
1949 {
1950 unmap_domain_page_global(d->arch.pirq_eoi_map);
1951 put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
1952 d->arch.pirq_eoi_map = NULL;
1953 d->arch.auto_unmask = 0;
1954 }
1955
1956 d->arch.relmem = RELMEM_shared;
1957 /* fallthrough */
1958
1959 case RELMEM_shared:
1960
1961 if ( is_hvm_domain(d) )
1962 {
1963 /* If the domain has shared pages, relinquish them allowing
1964 * for preemption. */
1965 ret = relinquish_shared_pages(d);
1966 if ( ret )
1967 return ret;
1968 }
1969
1970 d->arch.relmem = RELMEM_xen;
1971
1972 spin_lock(&d->page_alloc_lock);
1973 page_list_splice(&d->arch.relmem_list, &d->page_list);
1974 INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
1975 spin_unlock(&d->page_alloc_lock);
1976
1977 /* Fallthrough. Relinquish every page of memory. */
1978 case RELMEM_xen:
1979 ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
1980 if ( ret )
1981 return ret;
1982 d->arch.relmem = RELMEM_l4;
1983 /* fallthrough */
1984
1985 case RELMEM_l4:
1986 ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
1987 if ( ret )
1988 return ret;
1989 d->arch.relmem = RELMEM_l3;
1990 /* fallthrough */
1991
1992 case RELMEM_l3:
1993 ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
1994 if ( ret )
1995 return ret;
1996 d->arch.relmem = RELMEM_l2;
1997 /* fallthrough */
1998
1999 case RELMEM_l2:
2000 ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
2001 if ( ret )
2002 return ret;
2003 d->arch.relmem = RELMEM_done;
2004 /* fallthrough */
2005
2006 case RELMEM_done:
2007 break;
2008
2009 default:
2010 BUG();
2011 }
2012
2013 pit_deinit(d);
2014
2015 if ( is_hvm_domain(d) )
2016 hvm_domain_relinquish_resources(d);
2017
2018 return 0;
2019 }
2020
2021 void arch_dump_domain_info(struct domain *d)
2022 {
2023 paging_dump_domain_info(d);
2024 }
2025
2026 void arch_dump_vcpu_info(struct vcpu *v)
2027 {
2028 paging_dump_vcpu_info(v);
2029
2030 vpmu_dump(v);
2031 }
2032
2033 void vcpu_kick(struct vcpu *v)
2034 {
2035 /*
2036 * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
2037 * pending flag. These values may fluctuate (after all, we hold no
2038 * locks) but the key insight is that each change will cause
2039 * evtchn_upcall_pending to be polled.
2040 *
2041 * NB2. We save the running flag across the unblock to avoid a needless
2042 * IPI for domains that we IPI'd to unblock.
2043 */
2044 bool running = v->is_running;
2045
2046 vcpu_unblock(v);
2047 if ( running && (in_irq() || (v != current)) )
2048 cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
2049 }
2050
2051 void vcpu_mark_events_pending(struct vcpu *v)
2052 {
2053 int already_pending = test_and_set_bit(
2054 0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));
2055
2056 if ( already_pending )
2057 return;
2058
2059 if ( is_hvm_vcpu(v) )
2060 hvm_assert_evtchn_irq(v);
2061 else
2062 vcpu_kick(v);
2063 }
2064
2065 static void vcpu_kick_softirq(void)
2066 {
2067 /*
2068 * Nothing to do here: we merely prevent notifiers from racing with checks
2069 * executed on return to guest context with interrupts enabled. See, for
2070 * example, xxx_intr_assist() executed on return to HVM guest context.
2071 */
2072 }
2073
2074 static int __init init_vcpu_kick_softirq(void)
2075 {
2076 open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
2077 return 0;
2078 }
2079 __initcall(init_vcpu_kick_softirq);
2080
2081
2082 /*
2083 * Local variables:
2084 * mode: C
2085 * c-file-style: "BSD"
2086 * c-basic-offset: 4
2087 * tab-width: 4
2088 * indent-tabs-mode: nil
2089 * End:
2090 */
2091