1 /******************************************************************************
2  * arch/x86/domain.c
3  *
4  * x86-specific domain handling (e.g., register setup and context switching).
5  */
6 
7 /*
8  *  Copyright (C) 1995  Linus Torvalds
9  *
10  *  Pentium III FXSR, SSE support
11  *  Gareth Hughes <gareth@valinux.com>, May 2000
12  */
13 
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/errno.h>
17 #include <xen/sched.h>
18 #include <xen/domain.h>
19 #include <xen/smp.h>
20 #include <xen/delay.h>
21 #include <xen/softirq.h>
22 #include <xen/grant_table.h>
23 #include <xen/iocap.h>
24 #include <xen/kernel.h>
25 #include <xen/hypercall.h>
26 #include <xen/multicall.h>
27 #include <xen/irq.h>
28 #include <xen/event.h>
29 #include <xen/console.h>
30 #include <xen/percpu.h>
31 #include <xen/compat.h>
32 #include <xen/acpi.h>
33 #include <xen/pci.h>
34 #include <xen/paging.h>
35 #include <xen/cpu.h>
36 #include <xen/wait.h>
37 #include <xen/guest_access.h>
38 #include <xen/livepatch.h>
39 #include <public/sysctl.h>
40 #include <public/hvm/hvm_vcpu.h>
41 #include <asm/regs.h>
42 #include <asm/mc146818rtc.h>
43 #include <asm/system.h>
44 #include <asm/io.h>
45 #include <asm/processor.h>
46 #include <asm/desc.h>
47 #include <asm/i387.h>
48 #include <asm/xstate.h>
49 #include <asm/cpuidle.h>
50 #include <asm/mpspec.h>
51 #include <asm/ldt.h>
52 #include <asm/hvm/hvm.h>
53 #include <asm/hvm/nestedhvm.h>
54 #include <asm/hvm/support.h>
55 #include <asm/hvm/viridian.h>
56 #include <asm/debugreg.h>
57 #include <asm/msr.h>
58 #include <asm/traps.h>
59 #include <asm/nmi.h>
60 #include <asm/mce.h>
61 #include <asm/amd.h>
62 #include <xen/numa.h>
63 #include <xen/iommu.h>
64 #include <compat/vcpu.h>
65 #include <asm/psr.h>
66 #include <asm/pv/domain.h>
67 #include <asm/pv/mm.h>
68 
69 DEFINE_PER_CPU(struct vcpu *, curr_vcpu);
70 
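/*
 * Hooks for the idle loop.  pm_idle is what an online idle vCPU executes
 * between bouts of work; dead_idle is what an offlined CPU spins in.  Both
 * default to the simple HLT-based implementations below and may be replaced
 * during boot (e.g. by the cpuidle / mwait-idle drivers).
 */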
71 static void default_idle(void);
72 void (*pm_idle) (void) __read_mostly = default_idle;
73 void (*dead_idle) (void) __read_mostly = default_dead_idle;
74 
75 static void default_idle(void)
76 {
77     local_irq_disable();
78     if ( cpu_is_haltable(smp_processor_id()) )
79         safe_halt();
80     else
81         local_irq_enable();
82 }
83 
84 void default_dead_idle(void)
85 {
86     /*
87      * When going into S3, unless caches are flushed, modified data may be
88      * held indefinitely by the CPUs spinning here and then discarded by
89      * a subsequent INIT.
90      */
91     wbinvd();
92     for ( ; ; )
93         halt();
94 }
95 
96 static void play_dead(void)
97 {
98     local_irq_disable();
99 
100     /*
101      * NOTE: After cpu_exit_clear, per-cpu variables are no longer accessible,
102      * as they may be freed at any time. In this case, heap corruption or
103      * #PF can occur (when heap debugging is enabled). For example, even
104      * printk() can involve tasklet scheduling, which touches per-cpu vars.
105      *
106      * Consider very carefully when adding code to *dead_idle. Most hypervisor
107      * subsystems are unsafe to call.
108      */
109     cpu_exit_clear(smp_processor_id());
110 
111     (*dead_idle)();
112 }
113 
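/*
 * The per-CPU idle loop.  Entered via reset_stack_and_jump() from
 * startup_cpu_idle_loop() / continue_idle_domain(); never returns.
 */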
114 static void idle_loop(void)
115 {
116     unsigned int cpu = smp_processor_id();
117 
118     for ( ; ; )
119     {
120         if ( cpu_is_offline(cpu) )
121             play_dead();
122 
123         /* Are we here for running vcpu context tasklets, or for idling? */
124         if ( unlikely(tasklet_work_to_do(cpu)) )
125             do_tasklet();
126         /*
127          * Test softirqs twice --- first to see if we should even try scrubbing
128          * and then, after it is done, whether softirqs became pending
129          * while we were scrubbing.
130          */
131         else if ( !softirq_pending(cpu) && !scrub_free_pages()  &&
132                     !softirq_pending(cpu) )
133             pm_idle();
134         do_softirq();
135         /*
136          * We MUST be last (or before pm_idle). Otherwise after we get the
137          * softirq we would execute pm_idle (and sleep) and not patch.
138          */
139         check_for_livepatch_work();
140     }
141 }
142 
143 void startup_cpu_idle_loop(void)
144 {
145     struct vcpu *v = current;
146 
147     ASSERT(is_idle_vcpu(v));
148     cpumask_set_cpu(v->processor, v->domain->domain_dirty_cpumask);
149     cpumask_set_cpu(v->processor, v->vcpu_dirty_cpumask);
150 
151     reset_stack_and_jump(idle_loop);
152 }
153 
154 static void noreturn continue_idle_domain(struct vcpu *v)
155 {
156     reset_stack_and_jump(idle_loop);
157 }
158 
159 void dump_pageframe_info(struct domain *d)
160 {
161     struct page_info *page;
162 
163     printk("Memory pages belonging to domain %u:\n", d->domain_id);
164 
165     if ( d->tot_pages >= 10 && d->is_dying < DOMDYING_dead )
166     {
167         printk("    DomPage list too long to display\n");
168     }
169     else
170     {
171         unsigned long total[MASK_EXTR(PGT_type_mask, PGT_type_mask) + 1] = {};
172 
173         spin_lock(&d->page_alloc_lock);
174         page_list_for_each ( page, &d->page_list )
175         {
176             unsigned int index = MASK_EXTR(page->u.inuse.type_info,
177                                            PGT_type_mask);
178 
179             if ( ++total[index] > 16 )
180             {
181                 switch ( page->u.inuse.type_info & PGT_type_mask )
182                 {
183                 case PGT_none:
184                 case PGT_writable_page:
185                     continue;
186                 }
187             }
188             printk("    DomPage %p: caf=%08lx, taf=%" PRtype_info "\n",
189                    _p(page_to_mfn(page)),
190                    page->count_info, page->u.inuse.type_info);
191         }
192         spin_unlock(&d->page_alloc_lock);
193     }
194 
195     if ( is_hvm_domain(d) )
196         p2m_pod_dump_data(d);
197 
198     spin_lock(&d->page_alloc_lock);
199     page_list_for_each ( page, &d->xenpage_list )
200     {
201         printk("    XenPage %p: caf=%08lx, taf=%" PRtype_info "\n",
202                _p(page_to_mfn(page)),
203                page->count_info, page->u.inuse.type_info);
204     }
205     spin_unlock(&d->page_alloc_lock);
206 }
207 
208 void update_guest_memory_policy(struct vcpu *v,
209                                 struct guest_memory_policy *policy)
210 {
211     smap_check_policy_t old_smap_policy = v->arch.smap_check_policy;
212     bool old_guest_mode = nestedhvm_is_n2(v);
213     bool new_guest_mode = policy->nested_guest_mode;
214 
215     v->arch.smap_check_policy = policy->smap_policy;
216     policy->smap_policy = old_smap_policy;
217 
218     /*
219      * When 'v' is in the nested guest mode, all guest copy
220      * functions/macros which finally call paging_gva_to_gfn()
221      * transfer data to/from L2 guest. If the copy is intended for L1
222      * guest, we must first clear the nested guest flag (by setting
223      * policy->nested_guest_mode to false) before the copy and then
224      * restore the nested guest flag (by setting
225      * policy->nested_guest_mode to true) after the copy.
226      */
227     if ( unlikely(old_guest_mode != new_guest_mode) )
228     {
229         if ( new_guest_mode )
230             nestedhvm_vcpu_enter_guestmode(v);
231         else
232             nestedhvm_vcpu_exit_guestmode(v);
233         policy->nested_guest_mode = old_guest_mode;
234     }
235 }
236 
237 #ifndef CONFIG_BIGMEM
238 /*
239  * The hole may be at or above the 44-bit boundary, so we need to determine
240  * the total bit count until reaching 32 significant (not squashed out) bits
241  * in PFN representations.
242  * Note that the way "bits" gets initialized/updated/bounds-checked guarantees
243  * that the function will never return zero, and hence will never be called
244  * more than once (which is important due to it being deliberately placed in
245  * .init.text).
246  */
247 static unsigned int __init noinline _domain_struct_bits(void)
248 {
249     unsigned int bits = 32 + PAGE_SHIFT;
250     unsigned int sig = hweight32(~pfn_hole_mask);
251     unsigned int mask = pfn_hole_mask >> 32;
252 
253     for ( ; bits < BITS_PER_LONG && sig < 32; ++bits, mask >>= 1 )
254         if ( !(mask & 1) )
255             ++sig;
256 
257     return bits;
258 }
259 #endif
260 
261 struct domain *alloc_domain_struct(void)
262 {
263     struct domain *d;
264     unsigned int order = get_order_from_bytes(sizeof(*d));
265 #ifdef CONFIG_BIGMEM
266     const unsigned int bits = 0;
267 #else
268     /*
269      * We pack the PDX of the domain structure into a 32-bit field within
270      * the page_info structure. Hence the MEMF_bits() restriction.
271      */
272     static unsigned int __read_mostly bits;
273 
274     if ( unlikely(!bits) )
275          bits = _domain_struct_bits();
276 #endif
277 
278 
279 #ifndef CONFIG_LOCK_PROFILE
280     BUILD_BUG_ON(sizeof(*d) > PAGE_SIZE);
281 #endif
282     d = alloc_xenheap_pages(order, MEMF_bits(bits));
283     if ( d != NULL )
284     {
285         unsigned int sz;
286 
287         for ( sz = 0; sz < (PAGE_SIZE << order); sz += PAGE_SIZE )
288             clear_page((void *)d + sz);
289     }
290     return d;
291 }
292 
293 void free_domain_struct(struct domain *d)
294 {
295     lock_profile_deregister_struct(LOCKPROF_TYPE_PERDOM, d);
296     free_xenheap_page(d);
297 }
298 
299 struct vcpu *alloc_vcpu_struct(void)
300 {
301     struct vcpu *v;
302     /*
303      * This structure contains embedded PAE PDPTEs, used when an HVM guest
304      * runs on shadow pagetables outside of 64-bit mode. In this case the CPU
305      * may require that the shadow CR3 points below 4GB, and hence the whole
306      * structure must satisfy this restriction. Thus we specify MEMF_bits(32).
307      */
308     BUILD_BUG_ON(sizeof(*v) > PAGE_SIZE);
309     v = alloc_xenheap_pages(0, MEMF_bits(32));
310     if ( v != NULL )
311         clear_page(v);
312     return v;
313 }
314 
315 void free_vcpu_struct(struct vcpu *v)
316 {
317     free_xenheap_page(v);
318 }
319 
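/*
 * Architecture-specific part of vCPU initialisation, invoked from the
 * common vCPU creation path.
 */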
320 int vcpu_initialise(struct vcpu *v)
321 {
322     struct domain *d = v->domain;
323     int rc;
324 
325     v->arch.flags = TF_kernel_mode;
326 
327     rc = mapcache_vcpu_init(v);
328     if ( rc )
329         return rc;
330 
331     if ( !is_idle_domain(d) )
332     {
333         paging_vcpu_init(v);
334 
335         if ( (rc = vcpu_init_fpu(v)) != 0 )
336             return rc;
337 
338         vmce_init_vcpu(v);
339     }
340     else if ( (rc = xstate_alloc_save_area(v)) != 0 )
341         return rc;
342 
343     spin_lock_init(&v->arch.vpmu.vpmu_lock);
344 
345     if ( is_hvm_domain(d) )
346         rc = hvm_vcpu_initialise(v);
347     else if ( !is_idle_domain(d) )
348         rc = pv_vcpu_initialise(v);
349     else
350     {
351         /* Idle domain */
352         v->arch.cr3 = __pa(idle_pg_table);
353         rc = 0;
354         v->arch.msr = ZERO_BLOCK_PTR; /* Catch stray misuses */
355     }
356 
357     if ( rc )
358         goto fail;
359 
360     if ( !is_idle_domain(v->domain) )
361     {
362         vpmu_initialise(v);
363 
364         if ( (rc = init_vcpu_msr_policy(v)) )
365             goto fail;
366     }
367 
368     return rc;
369 
370  fail:
371     vcpu_destroy_fpu(v);
372     xfree(v->arch.msr);
373     v->arch.msr = NULL;
374 
375     return rc;
376 }
377 
378 void vcpu_destroy(struct vcpu *v)
379 {
380     xfree(v->arch.vm_event);
381     v->arch.vm_event = NULL;
382 
383     vcpu_destroy_fpu(v);
384 
385     xfree(v->arch.msr);
386     v->arch.msr = NULL;
387 
388     if ( !is_idle_domain(v->domain) )
389         vpmu_destroy(v);
390 
391     if ( is_hvm_vcpu(v) )
392         hvm_vcpu_destroy(v);
393     else
394         pv_vcpu_destroy(v);
395 }
396 
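/*
 * Sanity-check the requested set of emulated devices against what Xen
 * supports for this type of domain.
 */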
397 static bool emulation_flags_ok(const struct domain *d, uint32_t emflags)
398 {
399 
400     if ( is_hvm_domain(d) )
401     {
402         if ( is_hardware_domain(d) &&
403              emflags != (XEN_X86_EMU_LAPIC|XEN_X86_EMU_IOAPIC) )
404             return false;
405         if ( !is_hardware_domain(d) && emflags &&
406              emflags != XEN_X86_EMU_ALL && emflags != XEN_X86_EMU_LAPIC )
407             return false;
408     }
409     else if ( emflags != 0 && emflags != XEN_X86_EMU_PIT )
410     {
411         /* PV or classic PVH. */
412         return false;
413     }
414 
415     return true;
416 }
417 
418 int arch_domain_create(struct domain *d, unsigned int domcr_flags,
419                        struct xen_arch_domainconfig *config)
420 {
421     bool paging_initialised = false;
422     int rc;
423 
424     if ( config == NULL && !is_idle_domain(d) )
425         return -EINVAL;
426 
427     d->arch.s3_integrity = !!(domcr_flags & DOMCRF_s3_integrity);
428 
429     INIT_LIST_HEAD(&d->arch.pdev_list);
430 
431     d->arch.relmem = RELMEM_not_started;
432     INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
433 
434     if ( d->domain_id && !is_idle_domain(d) &&
435          cpu_has_amd_erratum(&boot_cpu_data, AMD_ERRATUM_121) )
436     {
437         if ( !opt_allow_unsafe )
438         {
439             printk(XENLOG_G_ERR "Xen does not allow DomU creation on this CPU"
440                    " for security reasons.\n");
441             return -EPERM;
442         }
443         printk(XENLOG_G_WARNING
444                "Dom%d may compromise security on this CPU.\n",
445                d->domain_id);
446     }
447 
448     if ( is_idle_domain(d) )
449     {
450         d->arch.emulation_flags = 0;
451         d->arch.cpuid = ZERO_BLOCK_PTR; /* Catch stray misuses. */
452         d->arch.msr = ZERO_BLOCK_PTR;
453     }
454     else
455     {
456         uint32_t emflags;
457 
458         if ( is_hardware_domain(d) && is_pv_domain(d) )
459             config->emulation_flags |= XEN_X86_EMU_PIT;
460 
461         emflags = config->emulation_flags;
462         if ( emflags & ~XEN_X86_EMU_ALL )
463         {
464             printk(XENLOG_G_ERR "d%d: Invalid emulation bitmap: %#x\n",
465                    d->domain_id, emflags);
466             return -EINVAL;
467         }
468 
469         if ( !emulation_flags_ok(d, emflags) )
470         {
471             printk(XENLOG_G_ERR "d%d: Xen does not allow %s domain creation "
472                    "with the current selection of emulators: %#x\n",
473                    d->domain_id, is_hvm_domain(d) ? "HVM" : "PV", emflags);
474             return -EOPNOTSUPP;
475         }
476         d->arch.emulation_flags = emflags;
477     }
478 
479     mapcache_domain_init(d);
480 
481     HYPERVISOR_COMPAT_VIRT_START(d) =
482         is_pv_domain(d) ? __HYPERVISOR_COMPAT_VIRT_START : ~0u;
483 
484     if ( !is_idle_domain(d) )
485     {
486         /* Need to determine if HAP is enabled before initialising paging */
487         if ( is_hvm_domain(d) )
488             d->arch.hvm_domain.hap_enabled =
489                 hvm_funcs.hap_supported && (domcr_flags & DOMCRF_hap);
490 
491         if ( (rc = paging_domain_init(d, domcr_flags)) != 0 )
492             goto fail;
493         paging_initialised = 1;
494 
495         if ( (rc = init_domain_cpuid_policy(d)) )
496             goto fail;
497 
498         if ( (rc = init_domain_msr_policy(d)) )
499             goto fail;
500 
501         d->arch.ioport_caps =
502             rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
503         rc = -ENOMEM;
504         if ( d->arch.ioport_caps == NULL )
505             goto fail;
506 
507         /*
508          * The shared_info machine address must fit in a 32-bit field within a
509          * 32-bit guest's start_info structure. Hence we specify MEMF_bits(32).
510          */
511         if ( (d->shared_info = alloc_xenheap_pages(0, MEMF_bits(32))) == NULL )
512             goto fail;
513 
514         clear_page(d->shared_info);
515         share_xen_page_with_guest(
516             virt_to_page(d->shared_info), d, XENSHARE_writable);
517 
518         if ( (rc = init_domain_irq_mapping(d)) != 0 )
519             goto fail;
520 
521         if ( (rc = iommu_domain_init(d)) != 0 )
522             goto fail;
523     }
524     spin_lock_init(&d->arch.e820_lock);
525 
526     psr_domain_init(d);
527 
528     if ( is_hvm_domain(d) )
529     {
530         if ( (rc = hvm_domain_initialise(d, domcr_flags, config)) != 0 )
531             goto fail;
532     }
533     else if ( is_idle_domain(d) )
534     {
535         static const struct arch_csw idle_csw = {
536             .from = paravirt_ctxt_switch_from,
537             .to   = paravirt_ctxt_switch_to,
538             .tail = continue_idle_domain,
539         };
540 
541         d->arch.ctxt_switch = &idle_csw;
542     }
543     else
544     {
545         if ( (rc = pv_domain_initialise(d, domcr_flags, config)) != 0 )
546             goto fail;
547     }
548 
549     /* initialize default tsc behavior in case tools don't */
550     tsc_set_info(d, TSC_MODE_DEFAULT, 0UL, 0, 0);
551     spin_lock_init(&d->arch.vtsc_lock);
552 
553     /* PV/PVH guests get an emulated PIT too for video BIOSes to use. */
554     pit_init(d, cpu_khz);
555 
556     /*
557      * If the FPU does not save FCS/FDS then we can always
558      * save/restore the 64-bit FIP/FDP and ignore the selectors.
559      */
560     d->arch.x87_fip_width = cpu_has_fpu_sel ? 0 : 8;
561 
562     return 0;
563 
564  fail:
565     d->is_dying = DOMDYING_dead;
566     psr_domain_free(d);
567     iommu_domain_destroy(d);
568     cleanup_domain_irq_mapping(d);
569     free_xenheap_page(d->shared_info);
570     xfree(d->arch.cpuid);
571     xfree(d->arch.msr);
572     if ( paging_initialised )
573         paging_final_teardown(d);
574     free_perdomain_mappings(d);
575 
576     return rc;
577 }
578 
579 void arch_domain_destroy(struct domain *d)
580 {
581     if ( is_hvm_domain(d) )
582         hvm_domain_destroy(d);
583 
584     xfree(d->arch.e820);
585     xfree(d->arch.cpuid);
586     xfree(d->arch.msr);
587 
588     free_domain_pirqs(d);
589     if ( !is_idle_domain(d) )
590         iommu_domain_destroy(d);
591 
592     paging_final_teardown(d);
593 
594     if ( is_pv_domain(d) )
595         pv_domain_destroy(d);
596     free_perdomain_mappings(d);
597 
598     free_xenheap_page(d->shared_info);
599     cleanup_domain_irq_mapping(d);
600 
601     psr_domain_free(d);
602 }
603 
604 void arch_domain_shutdown(struct domain *d)
605 {
606     if ( has_viridian_time_ref_count(d) )
607         viridian_time_ref_count_freeze(d);
608 }
609 
610 void arch_domain_pause(struct domain *d)
611 {
612     if ( has_viridian_time_ref_count(d) )
613         viridian_time_ref_count_freeze(d);
614 }
615 
616 void arch_domain_unpause(struct domain *d)
617 {
618     if ( has_viridian_time_ref_count(d) )
619         viridian_time_ref_count_thaw(d);
620 }
621 
622 int arch_domain_soft_reset(struct domain *d)
623 {
624     struct page_info *page = virt_to_page(d->shared_info), *new_page;
625     int ret = 0;
626     struct domain *owner;
627     unsigned long mfn, gfn;
628     p2m_type_t p2mt;
629     unsigned int i;
630 
631     /* Soft reset is supported for HVM domains only. */
632     if ( !is_hvm_domain(d) )
633         return -EINVAL;
634 
635     hvm_domain_soft_reset(d);
636 
637     spin_lock(&d->event_lock);
638     for ( i = 0; i < d->nr_pirqs ; i++ )
639     {
640         if ( domain_pirq_to_emuirq(d, i) != IRQ_UNBOUND )
641         {
642             ret = unmap_domain_pirq_emuirq(d, i);
643             if ( ret )
644                 break;
645         }
646     }
647     spin_unlock(&d->event_lock);
648 
649     if ( ret )
650         return ret;
651 
652     /*
653      * The shared_info page needs to be replaced with a new page, otherwise we
654      * will get a hole if the domain does XENMAPSPACE_shared_info.
655      */
656 
657     owner = page_get_owner_and_reference(page);
658     ASSERT( owner == d );
659 
660     mfn = page_to_mfn(page);
661     gfn = mfn_to_gmfn(d, mfn);
662 
663     /*
664      * gfn == INVALID_GFN indicates that the shared_info page was never mapped
665      * to the domain's address space and there is nothing to replace.
666      */
667     if ( gfn == gfn_x(INVALID_GFN) )
668         goto exit_put_page;
669 
670     if ( mfn_x(get_gfn_query(d, gfn, &p2mt)) != mfn )
671     {
672         printk(XENLOG_G_ERR "Failed to get Dom%d's shared_info GFN (%lx)\n",
673                d->domain_id, gfn);
674         ret = -EINVAL;
675         goto exit_put_page;
676     }
677 
678     new_page = alloc_domheap_page(d, 0);
679     if ( !new_page )
680     {
681         printk(XENLOG_G_ERR "Failed to alloc a page to replace"
682                " Dom%d's shared_info frame %lx\n", d->domain_id, gfn);
683         ret = -ENOMEM;
684         goto exit_put_gfn;
685     }
686 
687     ret = guest_physmap_remove_page(d, _gfn(gfn), _mfn(mfn), PAGE_ORDER_4K);
688     if ( ret )
689     {
690         printk(XENLOG_G_ERR "Failed to remove Dom%d's shared_info frame %lx\n",
691                d->domain_id, gfn);
692         free_domheap_page(new_page);
693         goto exit_put_gfn;
694     }
695 
696     ret = guest_physmap_add_page(d, _gfn(gfn), _mfn(page_to_mfn(new_page)),
697                                  PAGE_ORDER_4K);
698     if ( ret )
699     {
700         printk(XENLOG_G_ERR "Failed to add a page to replace"
701                " Dom%d's shared_info frame %lx\n", d->domain_id, gfn);
702         free_domheap_page(new_page);
703     }
704  exit_put_gfn:
705     put_gfn(d, gfn);
706  exit_put_page:
707     put_page(page);
708 
709     return ret;
710 }
711 
712 /*
713  * These are the masks of CR4 bits (subject to hardware availability) which a
714  * PV guest may not legitimately attempt to modify.
715  */
716 static unsigned long __read_mostly pv_cr4_mask, compat_pv_cr4_mask;
717 
718 static int __init init_pv_cr4_masks(void)
719 {
720     unsigned long common_mask = ~X86_CR4_TSD;
721 
722     /*
723      * All PV guests may attempt to modify TSD, DE and OSXSAVE.
724      */
725     if ( cpu_has_de )
726         common_mask &= ~X86_CR4_DE;
727     if ( cpu_has_xsave )
728         common_mask &= ~X86_CR4_OSXSAVE;
729 
730     pv_cr4_mask = compat_pv_cr4_mask = common_mask;
731 
732     /*
733      * 64bit PV guests may attempt to modify FSGSBASE.
734      */
735     if ( cpu_has_fsgsbase )
736         pv_cr4_mask &= ~X86_CR4_FSGSBASE;
737 
738     return 0;
739 }
740 __initcall(init_pv_cr4_masks);
741 
742 unsigned long pv_guest_cr4_fixup(const struct vcpu *v, unsigned long guest_cr4)
743 {
744     unsigned long hv_cr4 = real_cr4_to_pv_guest_cr4(read_cr4());
745     unsigned long mask = is_pv_32bit_vcpu(v) ? compat_pv_cr4_mask : pv_cr4_mask;
746 
747     if ( (guest_cr4 & mask) != (hv_cr4 & mask) )
748         printk(XENLOG_G_WARNING
749                "d%d attempted to change %pv's CR4 flags %08lx -> %08lx\n",
750                current->domain->domain_id, v, hv_cr4, guest_cr4);
751 
752     return (hv_cr4 & mask) | (guest_cr4 & ~mask);
753 }
754 
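/*
 * Build-time check that the native and compat views of the fpu_ctxt field
 * are compatible; the temporary #defines map the names the checking
 * machinery expects onto the real ones.
 */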
755 #define xen_vcpu_guest_context vcpu_guest_context
756 #define fpu_ctxt fpu_ctxt.x
757 CHECK_FIELD_(struct, vcpu_guest_context, fpu_ctxt);
758 #undef fpu_ctxt
759 #undef xen_vcpu_guest_context
760 
761 /* Called by XEN_DOMCTL_setvcpucontext and VCPUOP_initialise. */
762 int arch_set_info_guest(
763     struct vcpu *v, vcpu_guest_context_u c)
764 {
765     struct domain *d = v->domain;
766     unsigned long cr3_gfn;
767     struct page_info *cr3_page;
768     unsigned long flags, cr4;
769     unsigned int i;
770     int rc = 0, compat;
771 
772     /* The context is a compat-mode one if the target domain is compat-mode;
773      * we expect the tools to DTRT even in compat-mode callers. */
774     compat = is_pv_32bit_domain(d);
775 
776 #define c(fld) (compat ? (c.cmp->fld) : (c.nat->fld))
777     flags = c(flags);
778 
779     if ( is_pv_domain(d) )
780     {
781         if ( !compat )
782         {
783             if ( !is_canonical_address(c.nat->user_regs.rip) ||
784                  !is_canonical_address(c.nat->user_regs.rsp) ||
785                  !is_canonical_address(c.nat->kernel_sp) ||
786                  (c.nat->ldt_ents && !is_canonical_address(c.nat->ldt_base)) ||
787                  !is_canonical_address(c.nat->fs_base) ||
788                  !is_canonical_address(c.nat->gs_base_kernel) ||
789                  !is_canonical_address(c.nat->gs_base_user) ||
790                  !is_canonical_address(c.nat->event_callback_eip) ||
791                  !is_canonical_address(c.nat->syscall_callback_eip) ||
792                  !is_canonical_address(c.nat->failsafe_callback_eip) )
793                 return -EINVAL;
794 
795             fixup_guest_stack_selector(d, c.nat->user_regs.ss);
796             fixup_guest_stack_selector(d, c.nat->kernel_ss);
797             fixup_guest_code_selector(d, c.nat->user_regs.cs);
798 
799             for ( i = 0; i < ARRAY_SIZE(c.nat->trap_ctxt); i++ )
800             {
801                 if ( !is_canonical_address(c.nat->trap_ctxt[i].address) )
802                     return -EINVAL;
803                 fixup_guest_code_selector(d, c.nat->trap_ctxt[i].cs);
804             }
805 
806             if ( !__addr_ok(c.nat->ldt_base) )
807                 return -EINVAL;
808         }
809         else
810         {
811             fixup_guest_stack_selector(d, c.cmp->user_regs.ss);
812             fixup_guest_stack_selector(d, c.cmp->kernel_ss);
813             fixup_guest_code_selector(d, c.cmp->user_regs.cs);
814             fixup_guest_code_selector(d, c.cmp->event_callback_cs);
815             fixup_guest_code_selector(d, c.cmp->failsafe_callback_cs);
816 
817             for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); i++ )
818                 fixup_guest_code_selector(d, c.cmp->trap_ctxt[i].cs);
819         }
820 
821         /* LDT safety checks. */
822         if ( ((c(ldt_base) & (PAGE_SIZE - 1)) != 0) ||
823              (c(ldt_ents) > 8192) )
824             return -EINVAL;
825     }
826 
827     v->fpu_initialised = !!(flags & VGCF_I387_VALID);
828 
829     v->arch.flags &= ~TF_kernel_mode;
830     if ( (flags & VGCF_in_kernel) || is_hvm_domain(d)/*???*/ )
831         v->arch.flags |= TF_kernel_mode;
832 
833     v->arch.vgc_flags = flags;
834 
835     if ( flags & VGCF_I387_VALID )
836     {
837         memcpy(v->arch.fpu_ctxt, &c.nat->fpu_ctxt, sizeof(c.nat->fpu_ctxt));
838         if ( v->arch.xsave_area )
839             v->arch.xsave_area->xsave_hdr.xstate_bv = XSTATE_FP_SSE;
840     }
841     else if ( v->arch.xsave_area )
842     {
843         v->arch.xsave_area->xsave_hdr.xstate_bv = 0;
844         v->arch.xsave_area->fpu_sse.mxcsr = MXCSR_DEFAULT;
845     }
846     else
847     {
848         typeof(v->arch.xsave_area->fpu_sse) *fpu_sse = v->arch.fpu_ctxt;
849 
850         memset(fpu_sse, 0, sizeof(*fpu_sse));
851         fpu_sse->fcw = FCW_DEFAULT;
852         fpu_sse->mxcsr = MXCSR_DEFAULT;
853     }
854     if ( v->arch.xsave_area )
855         v->arch.xsave_area->xsave_hdr.xcomp_bv = 0;
856 
857     if ( !compat )
858     {
859         memcpy(&v->arch.user_regs, &c.nat->user_regs, sizeof(c.nat->user_regs));
860         if ( is_pv_domain(d) )
861             memcpy(v->arch.pv_vcpu.trap_ctxt, c.nat->trap_ctxt,
862                    sizeof(c.nat->trap_ctxt));
863     }
864     else
865     {
866         XLAT_cpu_user_regs(&v->arch.user_regs, &c.cmp->user_regs);
867         if ( is_pv_domain(d) )
868         {
869             for ( i = 0; i < ARRAY_SIZE(c.cmp->trap_ctxt); ++i )
870                 XLAT_trap_info(v->arch.pv_vcpu.trap_ctxt + i,
871                                c.cmp->trap_ctxt + i);
872         }
873     }
874 
875     if ( is_hvm_domain(d) )
876     {
877         for ( i = 0; i < ARRAY_SIZE(v->arch.debugreg); ++i )
878             v->arch.debugreg[i] = c(debugreg[i]);
879 
880         hvm_set_info_guest(v);
881         goto out;
882     }
883 
884     init_int80_direct_trap(v);
885 
886     /* IOPL privileges are virtualised. */
887     v->arch.pv_vcpu.iopl = v->arch.user_regs.eflags & X86_EFLAGS_IOPL;
888     v->arch.user_regs.eflags &= ~X86_EFLAGS_IOPL;
889 
890     /* Ensure real hardware interrupts are enabled. */
891     v->arch.user_regs.eflags |= X86_EFLAGS_IF;
892 
893     if ( !v->is_initialised )
894     {
895         if ( !compat && !(flags & VGCF_in_kernel) && !c.nat->ctrlreg[1] )
896             return -EINVAL;
897 
898         v->arch.pv_vcpu.ldt_base = c(ldt_base);
899         v->arch.pv_vcpu.ldt_ents = c(ldt_ents);
900     }
901     else
902     {
903         unsigned long pfn = pagetable_get_pfn(v->arch.guest_table);
904         bool fail;
905 
906         if ( !compat )
907         {
908             fail = xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[3];
909             if ( pagetable_is_null(v->arch.guest_table_user) )
910                 fail |= c.nat->ctrlreg[1] || !(flags & VGCF_in_kernel);
911             else
912             {
913                 pfn = pagetable_get_pfn(v->arch.guest_table_user);
914                 fail |= xen_pfn_to_cr3(pfn) != c.nat->ctrlreg[1];
915             }
916         } else {
917             l4_pgentry_t *l4tab = map_domain_page(_mfn(pfn));
918 
919             pfn = l4e_get_pfn(*l4tab);
920             unmap_domain_page(l4tab);
921             fail = compat_pfn_to_cr3(pfn) != c.cmp->ctrlreg[3];
922         }
923 
924         for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames); ++i )
925             fail |= v->arch.pv_vcpu.gdt_frames[i] != c(gdt_frames[i]);
926         fail |= v->arch.pv_vcpu.gdt_ents != c(gdt_ents);
927 
928         fail |= v->arch.pv_vcpu.ldt_base != c(ldt_base);
929         fail |= v->arch.pv_vcpu.ldt_ents != c(ldt_ents);
930 
931         if ( fail )
932            return -EOPNOTSUPP;
933     }
934 
935     v->arch.pv_vcpu.kernel_ss = c(kernel_ss);
936     v->arch.pv_vcpu.kernel_sp = c(kernel_sp);
937     for ( i = 0; i < ARRAY_SIZE(v->arch.pv_vcpu.ctrlreg); ++i )
938         v->arch.pv_vcpu.ctrlreg[i] = c(ctrlreg[i]);
939 
940     v->arch.pv_vcpu.event_callback_eip = c(event_callback_eip);
941     v->arch.pv_vcpu.failsafe_callback_eip = c(failsafe_callback_eip);
942     if ( !compat )
943     {
944         v->arch.pv_vcpu.syscall_callback_eip = c.nat->syscall_callback_eip;
945         v->arch.pv_vcpu.fs_base = c.nat->fs_base;
946         v->arch.pv_vcpu.gs_base_kernel = c.nat->gs_base_kernel;
947         v->arch.pv_vcpu.gs_base_user = c.nat->gs_base_user;
948     }
949     else
950     {
951         v->arch.pv_vcpu.event_callback_cs = c(event_callback_cs);
952         v->arch.pv_vcpu.failsafe_callback_cs = c(failsafe_callback_cs);
953     }
954 
955     /* Only CR0.TS is modifiable by guest or admin. */
956     v->arch.pv_vcpu.ctrlreg[0] &= X86_CR0_TS;
957     v->arch.pv_vcpu.ctrlreg[0] |= read_cr0() & ~X86_CR0_TS;
958 
959     cr4 = v->arch.pv_vcpu.ctrlreg[4];
960     v->arch.pv_vcpu.ctrlreg[4] = cr4 ? pv_guest_cr4_fixup(v, cr4) :
961         real_cr4_to_pv_guest_cr4(mmu_cr4_features);
962 
963     memset(v->arch.debugreg, 0, sizeof(v->arch.debugreg));
964     for ( i = 0; i < 8; i++ )
965         (void)set_debugreg(v, i, c(debugreg[i]));
966 
967     if ( v->is_initialised )
968         goto out;
969 
970     if ( v->vcpu_id == 0 )
971     {
972         /*
973          * In the restore case we need to deal with L4 pages which got
974          * initialized with m2p_strict still clear (and which hence lack the
975          * correct initial RO_MPT_VIRT_{START,END} L4 entry).
976          */
977         if ( d != current->domain && !VM_ASSIST(d, m2p_strict) &&
978              is_pv_domain(d) && !is_pv_32bit_domain(d) &&
979              test_bit(VMASST_TYPE_m2p_strict, &c.nat->vm_assist) &&
980              atomic_read(&d->arch.pv_domain.nr_l4_pages) )
981         {
982             bool done = false;
983 
984             spin_lock_recursive(&d->page_alloc_lock);
985 
986             for ( i = 0; ; )
987             {
988                 struct page_info *page = page_list_remove_head(&d->page_list);
989 
990                 if ( page_lock(page) )
991                 {
992                     if ( (page->u.inuse.type_info & PGT_type_mask) ==
993                          PGT_l4_page_table )
994                         done = !fill_ro_mpt(_mfn(page_to_mfn(page)));
995 
996                     page_unlock(page);
997                 }
998 
999                 page_list_add_tail(page, &d->page_list);
1000 
1001                 if ( done || (!(++i & 0xff) && hypercall_preempt_check()) )
1002                     break;
1003             }
1004 
1005             spin_unlock_recursive(&d->page_alloc_lock);
1006 
1007             if ( !done )
1008                 return -ERESTART;
1009         }
1010 
1011         d->vm_assist = c(vm_assist);
1012     }
1013 
1014     rc = put_old_guest_table(current);
1015     if ( rc )
1016         return rc;
1017 
1018     if ( !compat )
1019         rc = (int)pv_set_gdt(v, c.nat->gdt_frames, c.nat->gdt_ents);
1020     else
1021     {
1022         unsigned long gdt_frames[ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames)];
1023         unsigned int n = (c.cmp->gdt_ents + 511) / 512;
1024 
1025         if ( n > ARRAY_SIZE(v->arch.pv_vcpu.gdt_frames) )
1026             return -EINVAL;
1027         for ( i = 0; i < n; ++i )
1028             gdt_frames[i] = c.cmp->gdt_frames[i];
1029         rc = (int)pv_set_gdt(v, gdt_frames, c.cmp->gdt_ents);
1030     }
1031     if ( rc != 0 )
1032         return rc;
1033 
1034     set_bit(_VPF_in_reset, &v->pause_flags);
1035 
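    /*
     * Pick up the new vCPU's (kernel) page table, taking a type reference
     * on it unless reference counting is handled by the paging code
     * (paging_mode_refcounts).
     */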
1036     if ( !compat )
1037         cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[3]);
1038     else
1039         cr3_gfn = compat_cr3_to_pfn(c.cmp->ctrlreg[3]);
1040     cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
1041 
1042     if ( !cr3_page )
1043         rc = -EINVAL;
1044     else if ( paging_mode_refcounts(d) )
1045         /* nothing */;
1046     else if ( cr3_page == v->arch.old_guest_table )
1047     {
1048         v->arch.old_guest_table = NULL;
1049         put_page(cr3_page);
1050     }
1051     else
1052     {
1053         if ( !compat )
1054             rc = put_old_guest_table(v);
1055         if ( !rc )
1056             rc = get_page_type_preemptible(cr3_page,
1057                                            !compat ? PGT_root_page_table
1058                                                    : PGT_l3_page_table);
1059         switch ( rc )
1060         {
1061         case -EINTR:
1062             rc = -ERESTART;
1063         case -ERESTART:
1064             break;
1065         case 0:
1066             if ( !compat && !VM_ASSIST(d, m2p_strict) &&
1067                  !paging_mode_refcounts(d) )
1068                 fill_ro_mpt(_mfn(cr3_gfn));
1069             break;
1070         default:
1071             if ( cr3_page == current->arch.old_guest_table )
1072                 cr3_page = NULL;
1073             break;
1074         }
1075     }
1076     if ( rc )
1077         /* handled below */;
1078     else if ( !compat )
1079     {
1080         v->arch.guest_table = pagetable_from_page(cr3_page);
1081         if ( c.nat->ctrlreg[1] )
1082         {
1083             cr3_gfn = xen_cr3_to_pfn(c.nat->ctrlreg[1]);
1084             cr3_page = get_page_from_gfn(d, cr3_gfn, NULL, P2M_ALLOC);
1085 
1086             if ( !cr3_page )
1087                 rc = -EINVAL;
1088             else if ( !paging_mode_refcounts(d) )
1089             {
1090                 rc = get_page_type_preemptible(cr3_page, PGT_root_page_table);
1091                 switch ( rc )
1092                 {
1093                 case -EINTR:
1094                     rc = -ERESTART;
1095                     /* Fallthrough */
1096                 case -ERESTART:
1097                     v->arch.old_guest_ptpg = NULL;
1098                     v->arch.old_guest_table =
1099                         pagetable_get_page(v->arch.guest_table);
1100                     v->arch.guest_table = pagetable_null();
1101                     break;
1102                 default:
1103                     if ( cr3_page == current->arch.old_guest_table )
1104                         cr3_page = NULL;
1105                     break;
1106                 case 0:
1107                     if ( VM_ASSIST(d, m2p_strict) )
1108                         zap_ro_mpt(_mfn(cr3_gfn));
1109                     break;
1110                 }
1111             }
1112             if ( !rc )
1113                v->arch.guest_table_user = pagetable_from_page(cr3_page);
1114         }
1115     }
1116     else
1117     {
1118         l4_pgentry_t *l4tab;
1119 
1120         l4tab = map_domain_page(pagetable_get_mfn(v->arch.guest_table));
1121         *l4tab = l4e_from_pfn(page_to_mfn(cr3_page),
1122             _PAGE_PRESENT|_PAGE_RW|_PAGE_USER|_PAGE_ACCESSED);
1123         unmap_domain_page(l4tab);
1124     }
1125     if ( rc )
1126     {
1127         if ( cr3_page )
1128             put_page(cr3_page);
1129         pv_destroy_gdt(v);
1130         return rc;
1131     }
1132 
1133     clear_bit(_VPF_in_reset, &v->pause_flags);
1134 
1135     if ( v->vcpu_id == 0 )
1136         update_domain_wallclock_time(d);
1137 
1138     /* Don't redo final setup */
1139     v->is_initialised = 1;
1140 
1141     if ( paging_mode_enabled(d) )
1142         paging_update_paging_modes(v);
1143 
1144     update_cr3(v);
1145 
1146  out:
1147     if ( flags & VGCF_online )
1148         clear_bit(_VPF_down, &v->pause_flags);
1149     else
1150         set_bit(_VPF_down, &v->pause_flags);
1151     return 0;
1152 #undef c
1153 }
1154 
1155 int arch_initialise_vcpu(struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1156 {
1157     int rc;
1158 
1159     if ( is_hvm_vcpu(v) )
1160     {
1161         struct domain *d = v->domain;
1162         struct vcpu_hvm_context ctxt;
1163 
1164         if ( copy_from_guest(&ctxt, arg, 1) )
1165             return -EFAULT;
1166 
1167         domain_lock(d);
1168         rc = v->is_initialised ? -EEXIST : arch_set_info_hvm_guest(v, &ctxt);
1169         domain_unlock(d);
1170     }
1171     else
1172         rc = default_initialise_vcpu(v, arg);
1173 
1174     return rc;
1175 }
1176 
1177 int arch_vcpu_reset(struct vcpu *v)
1178 {
1179     if ( is_pv_vcpu(v) )
1180     {
1181         pv_destroy_gdt(v);
1182         return vcpu_destroy_pagetables(v);
1183     }
1184 
1185     vcpu_end_shutdown_deferral(v);
1186     return 0;
1187 }
1188 
1189 long
1190 arch_do_vcpu_op(
1191     int cmd, struct vcpu *v, XEN_GUEST_HANDLE_PARAM(void) arg)
1192 {
1193     long rc = 0;
1194 
1195     switch ( cmd )
1196     {
1197     case VCPUOP_register_vcpu_time_memory_area:
1198     {
1199         struct vcpu_register_time_memory_area area;
1200 
1201         rc = -EFAULT;
1202         if ( copy_from_guest(&area, arg, 1) )
1203             break;
1204 
1205         if ( !guest_handle_okay(area.addr.h, 1) )
1206             break;
1207 
1208         rc = 0;
1209         v->arch.time_info_guest = area.addr.h;
1210 
1211         force_update_vcpu_system_time(v);
1212 
1213         break;
1214     }
1215 
1216     case VCPUOP_get_physid:
1217     {
1218         struct vcpu_get_physid cpu_id;
1219 
1220         rc = -EINVAL;
1221         if ( !is_pinned_vcpu(v) )
1222             break;
1223 
1224         cpu_id.phys_id =
1225             (uint64_t)x86_cpu_to_apicid[v->vcpu_id] |
1226             ((uint64_t)acpi_get_processor_id(v->vcpu_id) << 32);
1227 
1228         rc = -EFAULT;
1229         if ( copy_to_guest(arg, &cpu_id, 1) )
1230             break;
1231 
1232         rc = 0;
1233         break;
1234     }
1235 
1236     default:
1237         rc = -ENOSYS;
1238         break;
1239     }
1240 
1241     return rc;
1242 }
1243 
1244 /*
1245  * Loading a nul selector does not clear bases and limits on AMD CPUs. Be on
1246  * the safe side and re-initialize both to flat segment values before loading
1247  * a nul selector.
1248  */
1249 #define preload_segment(seg, value) do {              \
1250     if ( !((value) & ~3) &&                           \
1251          boot_cpu_data.x86_vendor == X86_VENDOR_AMD ) \
1252         asm volatile ( "movl %k0, %%" #seg            \
1253                        :: "r" (FLAT_USER_DS32) );     \
1254 } while ( false )
1255 
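/*
 * Load a selector into a segment register.  Evaluates to 1 on success; if
 * the load faults, the fixup path loads the nul selector instead and the
 * expression evaluates to 0.
 */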
1256 #define loadsegment(seg,value) ({               \
1257     int __r = 1;                                \
1258     asm volatile (                              \
1259         "1: movl %k1,%%" #seg "\n2:\n"          \
1260         ".section .fixup,\"ax\"\n"              \
1261         "3: xorl %k0,%k0\n"                     \
1262         "   movl %k0,%%" #seg "\n"              \
1263         "   jmp 2b\n"                           \
1264         ".previous\n"                           \
1265         _ASM_EXTABLE(1b, 3b)                    \
1266         : "=r" (__r) : "r" (value), "0" (__r) );\
1267     __r; })
1268 
1269 /*
1270  * save_segments() writes a mask of segments which are dirty (non-zero),
1271  * allowing load_segments() to avoid some expensive segment loads and
1272  * MSR writes.
1273  */
1274 static DEFINE_PER_CPU(unsigned int, dirty_segment_mask);
1275 #define DIRTY_DS           0x01
1276 #define DIRTY_ES           0x02
1277 #define DIRTY_FS           0x04
1278 #define DIRTY_GS           0x08
1279 #define DIRTY_FS_BASE      0x10
1280 #define DIRTY_GS_BASE_USER 0x20
1281 
1282 static void load_segments(struct vcpu *n)
1283 {
1284     struct cpu_user_regs *uregs = &n->arch.user_regs;
1285     int all_segs_okay = 1;
1286     unsigned int dirty_segment_mask, cpu = smp_processor_id();
1287 
1288     /* Load and clear the dirty segment mask. */
1289     dirty_segment_mask = per_cpu(dirty_segment_mask, cpu);
1290     per_cpu(dirty_segment_mask, cpu) = 0;
1291 
1292     /* Either selector != 0 ==> reload. */
1293     if ( unlikely((dirty_segment_mask & DIRTY_DS) | uregs->ds) )
1294     {
1295         preload_segment(ds, uregs->ds);
1296         all_segs_okay &= loadsegment(ds, uregs->ds);
1297     }
1298 
1299     /* Either selector != 0 ==> reload. */
1300     if ( unlikely((dirty_segment_mask & DIRTY_ES) | uregs->es) )
1301     {
1302         preload_segment(es, uregs->es);
1303         all_segs_okay &= loadsegment(es, uregs->es);
1304     }
1305 
1306     /* Either selector != 0 ==> reload. */
1307     if ( unlikely((dirty_segment_mask & DIRTY_FS) | uregs->fs) )
1308     {
1309         all_segs_okay &= loadsegment(fs, uregs->fs);
1310         /* non-nul selector updates fs_base */
1311         if ( uregs->fs & ~3 )
1312             dirty_segment_mask &= ~DIRTY_FS_BASE;
1313     }
1314 
1315     /* Either selector != 0 ==> reload. */
1316     if ( unlikely((dirty_segment_mask & DIRTY_GS) | uregs->gs) )
1317     {
1318         all_segs_okay &= loadsegment(gs, uregs->gs);
1319         /* non-nul selector updates gs_base_user */
1320         if ( uregs->gs & ~3 )
1321             dirty_segment_mask &= ~DIRTY_GS_BASE_USER;
1322     }
1323 
1324     if ( !is_pv_32bit_vcpu(n) )
1325     {
1326         /* This can only be non-zero if selector is NULL. */
1327         if ( n->arch.pv_vcpu.fs_base | (dirty_segment_mask & DIRTY_FS_BASE) )
1328             wrfsbase(n->arch.pv_vcpu.fs_base);
1329 
1330         /* Most kernels have non-zero GS base, so don't bother testing. */
1331         /* (This is also a serialising instruction, avoiding AMD erratum #88.) */
1332         wrmsrl(MSR_SHADOW_GS_BASE, n->arch.pv_vcpu.gs_base_kernel);
1333 
1334         /* This can only be non-zero if selector is NULL. */
1335         if ( n->arch.pv_vcpu.gs_base_user |
1336              (dirty_segment_mask & DIRTY_GS_BASE_USER) )
1337             wrgsbase(n->arch.pv_vcpu.gs_base_user);
1338 
1339         /* If in kernel mode then switch the GS bases around. */
1340         if ( (n->arch.flags & TF_kernel_mode) )
1341             asm volatile ( "swapgs" );
1342     }
1343 
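    /*
     * At least one selector failed to load.  Build a failsafe callback
     * frame on the guest stack and enter the guest's failsafe callback
     * handler so it can recover its own selectors.
     */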
1344     if ( unlikely(!all_segs_okay) )
1345     {
1346         struct pv_vcpu *pv = &n->arch.pv_vcpu;
1347         struct cpu_user_regs *regs = guest_cpu_user_regs();
1348         unsigned long *rsp =
1349             (unsigned long *)(((n->arch.flags & TF_kernel_mode)
1350                                ? regs->rsp : pv->kernel_sp) & ~0xf);
1351         unsigned long cs_and_mask, rflags;
1352 
1353         /* Fold upcall mask and architectural IOPL into RFLAGS.IF. */
1354         rflags  = regs->rflags & ~(X86_EFLAGS_IF|X86_EFLAGS_IOPL);
1355         rflags |= !vcpu_info(n, evtchn_upcall_mask) << 9;
1356         if ( VM_ASSIST(n->domain, architectural_iopl) )
1357             rflags |= n->arch.pv_vcpu.iopl;
1358 
1359         if ( is_pv_32bit_vcpu(n) )
1360         {
1361             unsigned int *esp = ring_1(regs) ?
1362                                 (unsigned int *)regs->rsp :
1363                                 (unsigned int *)pv->kernel_sp;
1364             int ret = 0;
1365 
1366             /* CS longword also contains full evtchn_upcall_mask. */
1367             cs_and_mask = (unsigned short)regs->cs |
1368                 ((unsigned int)vcpu_info(n, evtchn_upcall_mask) << 16);
1369 
1370             if ( !ring_1(regs) )
1371             {
1372                 ret  = put_user(regs->ss,       esp-1);
1373                 ret |= put_user(regs->esp,      esp-2);
1374                 esp -= 2;
1375             }
1376 
1377             if ( ret |
1378                  put_user(rflags,              esp-1) |
1379                  put_user(cs_and_mask,         esp-2) |
1380                  put_user(regs->eip,           esp-3) |
1381                  put_user(uregs->gs,           esp-4) |
1382                  put_user(uregs->fs,           esp-5) |
1383                  put_user(uregs->es,           esp-6) |
1384                  put_user(uregs->ds,           esp-7) )
1385             {
1386                 gprintk(XENLOG_ERR,
1387                         "error while creating compat failsafe callback frame\n");
1388                 domain_crash(n->domain);
1389             }
1390 
1391             if ( n->arch.vgc_flags & VGCF_failsafe_disables_events )
1392                 vcpu_info(n, evtchn_upcall_mask) = 1;
1393 
1394             regs->entry_vector |= TRAP_syscall;
1395             regs->eflags       &= ~(X86_EFLAGS_VM|X86_EFLAGS_RF|X86_EFLAGS_NT|
1396                                     X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1397             regs->ss            = FLAT_COMPAT_KERNEL_SS;
1398             regs->esp           = (unsigned long)(esp-7);
1399             regs->cs            = FLAT_COMPAT_KERNEL_CS;
1400             regs->eip           = pv->failsafe_callback_eip;
1401             return;
1402         }
1403 
1404         if ( !(n->arch.flags & TF_kernel_mode) )
1405             toggle_guest_mode(n);
1406         else
1407             regs->cs &= ~3;
1408 
1409         /* CS longword also contains full evtchn_upcall_mask. */
1410         cs_and_mask = (unsigned long)regs->cs |
1411             ((unsigned long)vcpu_info(n, evtchn_upcall_mask) << 32);
1412 
1413         if ( put_user(regs->ss,            rsp- 1) |
1414              put_user(regs->rsp,           rsp- 2) |
1415              put_user(rflags,              rsp- 3) |
1416              put_user(cs_and_mask,         rsp- 4) |
1417              put_user(regs->rip,           rsp- 5) |
1418              put_user(uregs->gs,           rsp- 6) |
1419              put_user(uregs->fs,           rsp- 7) |
1420              put_user(uregs->es,           rsp- 8) |
1421              put_user(uregs->ds,           rsp- 9) |
1422              put_user(regs->r11,           rsp-10) |
1423              put_user(regs->rcx,           rsp-11) )
1424         {
1425             gprintk(XENLOG_ERR,
1426                     "error while creating failsafe callback frame\n");
1427             domain_crash(n->domain);
1428         }
1429 
1430         if ( n->arch.vgc_flags & VGCF_failsafe_disables_events )
1431             vcpu_info(n, evtchn_upcall_mask) = 1;
1432 
1433         regs->entry_vector |= TRAP_syscall;
1434         regs->rflags       &= ~(X86_EFLAGS_AC|X86_EFLAGS_VM|X86_EFLAGS_RF|
1435                                 X86_EFLAGS_NT|X86_EFLAGS_IOPL|X86_EFLAGS_TF);
1436         regs->ss            = FLAT_KERNEL_SS;
1437         regs->rsp           = (unsigned long)(rsp-11);
1438         regs->cs            = FLAT_KERNEL_CS;
1439         regs->rip           = pv->failsafe_callback_eip;
1440     }
1441 }
1442 
1443 static void save_segments(struct vcpu *v)
1444 {
1445     struct cpu_user_regs *regs = &v->arch.user_regs;
1446     unsigned int dirty_segment_mask = 0;
1447 
1448     regs->ds = read_sreg(ds);
1449     regs->es = read_sreg(es);
1450     regs->fs = read_sreg(fs);
1451     regs->gs = read_sreg(gs);
1452 
1453     if ( cpu_has_fsgsbase && !is_pv_32bit_vcpu(v) )
1454     {
1455         v->arch.pv_vcpu.fs_base = __rdfsbase();
1456         if ( v->arch.flags & TF_kernel_mode )
1457             v->arch.pv_vcpu.gs_base_kernel = __rdgsbase();
1458         else
1459             v->arch.pv_vcpu.gs_base_user = __rdgsbase();
1460     }
1461 
1462     if ( regs->ds )
1463         dirty_segment_mask |= DIRTY_DS;
1464 
1465     if ( regs->es )
1466         dirty_segment_mask |= DIRTY_ES;
1467 
1468     if ( regs->fs || is_pv_32bit_vcpu(v) )
1469     {
1470         dirty_segment_mask |= DIRTY_FS;
1471         /* non-nul selector kills fs_base */
1472         if ( regs->fs & ~3 )
1473             v->arch.pv_vcpu.fs_base = 0;
1474     }
1475     if ( v->arch.pv_vcpu.fs_base )
1476         dirty_segment_mask |= DIRTY_FS_BASE;
1477 
1478     if ( regs->gs || is_pv_32bit_vcpu(v) )
1479     {
1480         dirty_segment_mask |= DIRTY_GS;
1481         /* non-nul selector kills gs_base_user */
1482         if ( regs->gs & ~3 )
1483             v->arch.pv_vcpu.gs_base_user = 0;
1484     }
1485     if ( v->arch.flags & TF_kernel_mode ? v->arch.pv_vcpu.gs_base_kernel
1486                                         : v->arch.pv_vcpu.gs_base_user )
1487         dirty_segment_mask |= DIRTY_GS_BASE_USER;
1488 
1489     this_cpu(dirty_segment_mask) = dirty_segment_mask;
1490 }
1491 
1492 void paravirt_ctxt_switch_from(struct vcpu *v)
1493 {
1494     save_segments(v);
1495 
1496     /*
1497      * Disable debug breakpoints. We do this aggressively because if we switch
1498      * to an HVM guest we may load DR0-DR3 with values that can cause #DB
1499      * inside Xen, before we get a chance to reload DR7, and this cannot always
1500      * safely be handled.
1501      */
1502     if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
1503         write_debugreg(7, 0);
1504 }
1505 
1506 void paravirt_ctxt_switch_to(struct vcpu *v)
1507 {
1508     unsigned long cr4;
1509 
1510     cr4 = pv_guest_cr4_to_real_cr4(v);
1511     if ( unlikely(cr4 != read_cr4()) )
1512         write_cr4(cr4);
1513 
1514     if ( unlikely(v->arch.debugreg[7] & DR7_ACTIVE_MASK) )
1515         activate_debugregs(v);
1516 
1517     if ( (v->domain->arch.tsc_mode ==  TSC_MODE_PVRDTSCP) &&
1518          boot_cpu_has(X86_FEATURE_RDTSCP) )
1519         write_rdtscp_aux(v->domain->arch.incarnation);
1520 }
1521 
1522 /* Update per-VCPU guest runstate shared memory area (if registered). */
1523 bool update_runstate_area(struct vcpu *v)
1524 {
1525     bool rc;
1526     struct guest_memory_policy policy =
1527         { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false };
1528     void __user *guest_handle = NULL;
1529 
1530     if ( guest_handle_is_null(runstate_guest(v)) )
1531         return true;
1532 
1533     update_guest_memory_policy(v, &policy);
1534 
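    /*
     * If the guest requested it, bracket the copy with the
     * XEN_RUNSTATE_UPDATE flag (set before the copy, cleared afterwards,
     * each with a write barrier) so it can detect a partially updated area.
     */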
1535     if ( VM_ASSIST(v->domain, runstate_update_flag) )
1536     {
1537         guest_handle = has_32bit_shinfo(v->domain)
1538             ? &v->runstate_guest.compat.p->state_entry_time + 1
1539             : &v->runstate_guest.native.p->state_entry_time + 1;
1540         guest_handle--;
1541         v->runstate.state_entry_time |= XEN_RUNSTATE_UPDATE;
1542         __raw_copy_to_guest(guest_handle,
1543                             (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
1544         smp_wmb();
1545     }
1546 
1547     if ( has_32bit_shinfo(v->domain) )
1548     {
1549         struct compat_vcpu_runstate_info info;
1550 
1551         XLAT_vcpu_runstate_info(&info, &v->runstate);
1552         __copy_to_guest(v->runstate_guest.compat, &info, 1);
1553         rc = true;
1554     }
1555     else
1556         rc = __copy_to_guest(runstate_guest(v), &v->runstate, 1) !=
1557              sizeof(v->runstate);
1558 
1559     if ( guest_handle )
1560     {
1561         v->runstate.state_entry_time &= ~XEN_RUNSTATE_UPDATE;
1562         smp_wmb();
1563         __raw_copy_to_guest(guest_handle,
1564                             (void *)(&v->runstate.state_entry_time + 1) - 1, 1);
1565     }
1566 
1567     update_guest_memory_policy(v, &policy);
1568 
1569     return rc;
1570 }
1571 
1572 static void _update_runstate_area(struct vcpu *v)
1573 {
1574     if ( !update_runstate_area(v) && is_pv_vcpu(v) &&
1575          !(v->arch.flags & TF_kernel_mode) )
1576         v->arch.pv_vcpu.need_update_runstate_area = 1;
1577 }
1578 
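/*
 * Only non-idle PV vCPUs need the full GDT (including the frames supplied
 * by the guest) mapped and loaded; HVM and idle vCPUs run on Xen's default
 * per-CPU GDT.
 */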
1579 static inline bool need_full_gdt(const struct domain *d)
1580 {
1581     return is_pv_domain(d) && !is_idle_domain(d);
1582 }
1583 
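/*
 * The heavy-weight half of a context switch: saves the outgoing vCPU's
 * register and FPU state, updates the dirty cpumasks, restores the incoming
 * vCPU's state, and switches GDT mappings and page tables before updating
 * curr_vcpu.  Called with interrupts disabled and 'current' already set to
 * the incoming vCPU.
 */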
1584 static void __context_switch(void)
1585 {
1586     struct cpu_user_regs *stack_regs = guest_cpu_user_regs();
1587     unsigned int          cpu = smp_processor_id();
1588     struct vcpu          *p = per_cpu(curr_vcpu, cpu);
1589     struct vcpu          *n = current;
1590     struct domain        *pd = p->domain, *nd = n->domain;
1591     struct desc_struct   *gdt;
1592     struct desc_ptr       gdt_desc;
1593 
1594     ASSERT(p != n);
1595     ASSERT(cpumask_empty(n->vcpu_dirty_cpumask));
1596 
1597     if ( !is_idle_domain(pd) )
1598     {
1599         memcpy(&p->arch.user_regs, stack_regs, CTXT_SWITCH_STACK_BYTES);
1600         vcpu_save_fpu(p);
1601         pd->arch.ctxt_switch->from(p);
1602     }
1603 
1604     /*
1605      * Mark this CPU in next domain's dirty cpumasks before calling
1606      * ctxt_switch_to(). This avoids a race on things like EPT flushing,
1607      * which is synchronised on that function.
1608      */
1609     if ( pd != nd )
1610         cpumask_set_cpu(cpu, nd->domain_dirty_cpumask);
1611     cpumask_set_cpu(cpu, n->vcpu_dirty_cpumask);
1612 
1613     if ( !is_idle_domain(nd) )
1614     {
1615         memcpy(stack_regs, &n->arch.user_regs, CTXT_SWITCH_STACK_BYTES);
1616         if ( cpu_has_xsave )
1617         {
1618             u64 xcr0 = n->arch.xcr0 ?: XSTATE_FP_SSE;
1619 
1620             if ( xcr0 != get_xcr0() && !set_xcr0(xcr0) )
1621                 BUG();
1622 
1623             if ( cpu_has_xsaves && is_hvm_vcpu(n) )
1624                 set_msr_xss(n->arch.hvm_vcpu.msr_xss);
1625         }
1626         vcpu_restore_fpu_eager(n);
1627         nd->arch.ctxt_switch->to(n);
1628     }
1629 
1630     psr_ctxt_switch_to(nd);
1631 
1632     gdt = !is_pv_32bit_domain(nd) ? per_cpu(gdt_table, cpu) :
1633                                     per_cpu(compat_gdt_table, cpu);
    if ( need_full_gdt(nd) )
    {
        unsigned long mfn = virt_to_mfn(gdt);
        l1_pgentry_t *pl1e = pv_gdt_ptes(n);
        unsigned int i;

        for ( i = 0; i < NR_RESERVED_GDT_PAGES; i++ )
            l1e_write(pl1e + FIRST_RESERVED_GDT_PAGE + i,
                      l1e_from_pfn(mfn + i, __PAGE_HYPERVISOR_RW));
    }

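    /*
     * If the outgoing context used the per-vcpu virtual GDT mapping and the
     * incoming one will not use the same mapping, switch to the directly
     * mapped GDT before changing page tables; after write_ptbase() the new
     * vcpu's virtual mapping can be loaded.
     */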
    if ( need_full_gdt(pd) &&
         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(nd)) )
    {
        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
        gdt_desc.base  = (unsigned long)(gdt - FIRST_RESERVED_GDT_ENTRY);
        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
    }

    write_ptbase(n);

    if ( need_full_gdt(nd) &&
         ((p->vcpu_id != n->vcpu_id) || !need_full_gdt(pd)) )
    {
        gdt_desc.limit = LAST_RESERVED_GDT_BYTE;
        gdt_desc.base = GDT_VIRT_START(n);
        asm volatile ( "lgdt %0" : : "m" (gdt_desc) );
    }

    if ( pd != nd )
        cpumask_clear_cpu(cpu, pd->domain_dirty_cpumask);
    cpumask_clear_cpu(cpu, p->vcpu_dirty_cpumask);

    per_cpu(curr_vcpu, cpu) = n;
}

void context_switch(struct vcpu *prev, struct vcpu *next)
{
    unsigned int cpu = smp_processor_id();
    const struct domain *prevd = prev->domain, *nextd = next->domain;
    cpumask_t dirty_mask;

    ASSERT(local_irq_is_enabled());

    cpumask_copy(&dirty_mask, next->vcpu_dirty_cpumask);
    /* Allow at most one CPU at a time to be dirty. */
    ASSERT(cpumask_weight(&dirty_mask) <= 1);
    if ( unlikely(!cpumask_test_cpu(cpu, &dirty_mask) &&
                  !cpumask_empty(&dirty_mask)) )
    {
        /*
         * Other CPUs call __sync_local_execstate from the flush IPI handler.
         */
        flush_tlb_mask(&dirty_mask);
    }

    if ( prev != next )
    {
        _update_runstate_area(prev);
        vpmu_switch_from(prev);
        np2m_schedule(NP2M_SCHEDLE_OUT);
    }

    if ( is_hvm_domain(prevd) && !list_empty(&prev->arch.hvm_vcpu.tm_list) )
        pt_save_timer(prev);

    local_irq_disable();

    set_current(next);

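    /*
     * Lazy context switch: if the incoming vcpu's state is already loaded on
     * this CPU, or we are switching to the idle vcpu, skip __context_switch()
     * and leave the outgoing register state in place; it is only synced
     * later (e.g. via sync_local_execstate()) if actually needed.
     */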
    if ( (per_cpu(curr_vcpu, cpu) == next) ||
         (is_idle_domain(nextd) && cpu_online(cpu)) )
    {
        local_irq_enable();
    }
    else
    {
        __context_switch();

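        /*
         * Coming from idle, HVM, or a PV guest of different bitness, EFER.SCE
         * (SYSCALL enable) may have been left clear; set it so the SYSCALL
         * entry path is usable for the incoming PV guest.
         */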
        if ( is_pv_domain(nextd) &&
             (is_idle_domain(prevd) ||
              is_hvm_domain(prevd) ||
              is_pv_32bit_domain(prevd) != is_pv_32bit_domain(nextd)) )
        {
            uint64_t efer = read_efer();
            if ( !(efer & EFER_SCE) )
                write_efer(efer | EFER_SCE);
        }

        /* Re-enable interrupts before restoring state which may fault. */
        local_irq_enable();

        if ( is_pv_domain(nextd) )
        {
            load_LDT(next);
            load_segments(next);
        }

        ctxt_switch_levelling(next);
    }

    context_saved(prev);

    if ( prev != next )
    {
        _update_runstate_area(next);

        /* Must be done with interrupts enabled */
        vpmu_switch_to(next);
        np2m_schedule(NP2M_SCHEDLE_IN);
    }

    /* Ensure that the vcpu has an up-to-date time base. */
    update_vcpu_system_time(next);

    /*
     * Schedule tail *should* be a terminal function pointer, but leave a
     * bug frame around just in case it returns, to save going back into the
     * context switching code and leaving a far more subtle crash to diagnose.
     */
    nextd->arch.ctxt_switch->tail(next);
    BUG();
}

void continue_running(struct vcpu *same)
{
    /* See the comment above. */
    same->domain->arch.ctxt_switch->tail(same);
    BUG();
}

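/*
 * Complete any lazy context switch still pending on this CPU: if the loaded
 * register state belongs to a vcpu other than the (idle) current one, do the
 * real __context_switch() now.  Returns whether a switch was performed.
 */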
int __sync_local_execstate(void)
{
    unsigned long flags;
    int switch_required;

    local_irq_save(flags);

    switch_required = (this_cpu(curr_vcpu) != current);

    if ( switch_required )
    {
        ASSERT(current == idle_vcpu[smp_processor_id()]);
        __context_switch();
    }

    local_irq_restore(flags);

    return switch_required;
}

void sync_local_execstate(void)
{
    (void)__sync_local_execstate();
}

void sync_vcpu_execstate(struct vcpu *v)
{
    if ( cpumask_test_cpu(smp_processor_id(), v->vcpu_dirty_cpumask) )
        sync_local_execstate();

    /* Other CPUs call __sync_local_execstate from the flush IPI handler. */
    flush_tlb_mask(v->vcpu_dirty_cpumask);
}

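/*
 * Drop the pin and allocation references held on each page on 'list', and
 * forcibly invalidate still-validated pages of page-table type 'type' to
 * break circular references.  Preemptible: may return -ERESTART, with
 * progress recorded on d->arch.relmem_list for a later continuation.
 */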
static int relinquish_memory(
    struct domain *d, struct page_list_head *list, unsigned long type)
{
    struct page_info  *page;
    unsigned long     x, y;
    int               ret = 0;

    /* Use a recursive lock, as we may enter 'free_domheap_page'. */
    spin_lock_recursive(&d->page_alloc_lock);

    while ( (page = page_list_remove_head(list)) )
    {
        /* Grab a reference to the page so it won't disappear from under us. */
        if ( unlikely(!get_page(page, d)) )
        {
            /* Couldn't get a reference -- someone is freeing this page. */
            page_list_add_tail(page, &d->arch.relmem_list);
            continue;
        }

        if ( test_and_clear_bit(_PGT_pinned, &page->u.inuse.type_info) )
            ret = put_page_and_type_preemptible(page);
        switch ( ret )
        {
        case 0:
            break;
        case -ERESTART:
        case -EINTR:
            ret = -ERESTART;
            page_list_add(page, list);
            set_bit(_PGT_pinned, &page->u.inuse.type_info);
            put_page(page);
            goto out;
        default:
            BUG();
        }

        if ( test_and_clear_bit(_PGC_allocated, &page->count_info) )
            put_page(page);

        /*
         * Forcibly invalidate top-most, still valid page tables at this point
         * to break circular 'linear page table' references as well as clean up
         * partially validated pages. This is okay because MMU structures are
         * not shared across domains and this domain is now dead. Thus top-most
         * valid tables are not in use so a non-zero count means circular
         * reference or partially validated.
         */
        y = page->u.inuse.type_info;
        for ( ; ; )
        {
            x = y;
            if ( likely((x & PGT_type_mask) != type) ||
                 likely(!(x & (PGT_validated|PGT_partial))) )
                break;

            y = cmpxchg(&page->u.inuse.type_info, x,
                        x & ~(PGT_validated|PGT_partial));
            if ( likely(y == x) )
            {
                /*
                 * No need for atomic update of type_info here: no one else
                 * updates it.
                 */
                switch ( ret = free_page_type(page, x, 1) )
                {
                case 0:
                    break;
                case -EINTR:
                    page_list_add(page, list);
                    page->u.inuse.type_info |= PGT_validated;
                    if ( x & PGT_partial )
                        put_page(page);
                    put_page(page);
                    ret = -ERESTART;
                    goto out;
                case -ERESTART:
                    page_list_add(page, list);
                    page->u.inuse.type_info |= PGT_partial;
                    if ( x & PGT_partial )
                        put_page(page);
                    goto out;
                default:
                    BUG();
                }
                if ( x & PGT_partial )
                {
                    page->u.inuse.type_info--;
                    put_page(page);
                }
                break;
            }
        }

        /* Put the page on the list and /then/ potentially free it. */
        page_list_add_tail(page, &d->arch.relmem_list);
        put_page(page);

        if ( hypercall_preempt_check() )
        {
            ret = -ERESTART;
            goto out;
        }
    }

    /* list is empty at this point. */
    page_list_move(list, &d->arch.relmem_list);

 out:
    spin_unlock_recursive(&d->page_alloc_lock);
    return ret;
}

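/*
 * Preemptible teardown of a dying domain's memory: d->arch.relmem records
 * how far we have got, and each completed stage falls through to the next,
 * so a continuation after -ERESTART resumes where the previous call stopped.
 */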
int domain_relinquish_resources(struct domain *d)
{
    int ret;
    struct vcpu *v;

    BUG_ON(!cpumask_empty(d->domain_dirty_cpumask));

    switch ( d->arch.relmem )
    {
    case RELMEM_not_started:
        ret = pci_release_devices(d);
        if ( ret )
            return ret;

        /* Tear down paging-assistance stuff. */
        ret = paging_teardown(d);
        if ( ret )
            return ret;

        /* Drop the in-use references to page-table bases. */
        for_each_vcpu ( d, v )
        {
            ret = vcpu_destroy_pagetables(v);
            if ( ret )
                return ret;
        }

        if ( is_pv_domain(d) )
        {
            for_each_vcpu ( d, v )
            {
                /*
                 * Relinquish GDT mappings. No need for explicit unmapping of
                 * the LDT as it automatically gets squashed with the guest
                 * mappings.
                 */
                pv_destroy_gdt(v);
            }
        }

        if ( d->arch.pirq_eoi_map != NULL )
        {
            unmap_domain_page_global(d->arch.pirq_eoi_map);
            put_page_and_type(mfn_to_page(d->arch.pirq_eoi_map_mfn));
            d->arch.pirq_eoi_map = NULL;
            d->arch.auto_unmask = 0;
        }

        d->arch.relmem = RELMEM_shared;
        /* fallthrough */

    case RELMEM_shared:

        if ( is_hvm_domain(d) )
        {
            /* If the domain has shared pages, relinquish them allowing
             * for preemption. */
            ret = relinquish_shared_pages(d);
            if ( ret )
                return ret;
        }

        d->arch.relmem = RELMEM_xen;

        spin_lock(&d->page_alloc_lock);
        page_list_splice(&d->arch.relmem_list, &d->page_list);
        INIT_PAGE_LIST_HEAD(&d->arch.relmem_list);
        spin_unlock(&d->page_alloc_lock);

        /* Fallthrough. Relinquish every page of memory. */
    case RELMEM_xen:
        ret = relinquish_memory(d, &d->xenpage_list, ~0UL);
        if ( ret )
            return ret;
        d->arch.relmem = RELMEM_l4;
        /* fallthrough */

    case RELMEM_l4:
        ret = relinquish_memory(d, &d->page_list, PGT_l4_page_table);
        if ( ret )
            return ret;
        d->arch.relmem = RELMEM_l3;
        /* fallthrough */

    case RELMEM_l3:
        ret = relinquish_memory(d, &d->page_list, PGT_l3_page_table);
        if ( ret )
            return ret;
        d->arch.relmem = RELMEM_l2;
        /* fallthrough */

    case RELMEM_l2:
        ret = relinquish_memory(d, &d->page_list, PGT_l2_page_table);
        if ( ret )
            return ret;
        d->arch.relmem = RELMEM_done;
        /* fallthrough */

    case RELMEM_done:
        break;

    default:
        BUG();
    }

    pit_deinit(d);

    if ( is_hvm_domain(d) )
        hvm_domain_relinquish_resources(d);

    return 0;
}

void arch_dump_domain_info(struct domain *d)
{
    paging_dump_domain_info(d);
}

void arch_dump_vcpu_info(struct vcpu *v)
{
    paging_dump_vcpu_info(v);

    vpmu_dump(v);
}

void vcpu_kick(struct vcpu *v)
{
    /*
     * NB1. 'pause_flags' and 'processor' must be checked /after/ update of
     * pending flag. These values may fluctuate (after all, we hold no
     * locks) but the key insight is that each change will cause
     * evtchn_upcall_pending to be polled.
     *
     * NB2. We save the running flag across the unblock to avoid a needless
     * IPI for domains that we IPI'd to unblock.
     */
    bool running = v->is_running;

    vcpu_unblock(v);
    if ( running && (in_irq() || (v != current)) )
        cpu_raise_softirq(v->processor, VCPU_KICK_SOFTIRQ);
}

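/*
 * Mark an event-channel upcall as pending for 'v' and, unless one was
 * already pending, notify it: HVM vcpus get a virtual interrupt asserted,
 * PV vcpus are kicked.
 */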
void vcpu_mark_events_pending(struct vcpu *v)
{
    int already_pending = test_and_set_bit(
        0, (unsigned long *)&vcpu_info(v, evtchn_upcall_pending));

    if ( already_pending )
        return;

    if ( is_hvm_vcpu(v) )
        hvm_assert_evtchn_irq(v);
    else
        vcpu_kick(v);
}

static void vcpu_kick_softirq(void)
{
    /*
     * Nothing to do here: we merely prevent notifiers from racing with checks
     * executed on return to guest context with interrupts enabled. See, for
     * example, xxx_intr_assist() executed on return to HVM guest context.
     */
}

static int __init init_vcpu_kick_softirq(void)
{
    open_softirq(VCPU_KICK_SOFTIRQ, vcpu_kick_softirq);
    return 0;
}
__initcall(init_vcpu_kick_softirq);


/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */