/******************************************************************************
 * domctl.c
 *
 * Domain management operations. For use by node control stack.
 *
 * Copyright (c) 2002-2006, K A Fraser
 */

#include <xen/types.h>
#include <xen/lib.h>
#include <xen/err.h>
#include <xen/mm.h>
#include <xen/sched.h>
#include <xen/sched-if.h>
#include <xen/domain.h>
#include <xen/event.h>
#include <xen/grant_table.h>
#include <xen/domain_page.h>
#include <xen/trace.h>
#include <xen/console.h>
#include <xen/iocap.h>
#include <xen/rcupdate.h>
#include <xen/guest_access.h>
#include <xen/bitmap.h>
#include <xen/paging.h>
#include <xen/hypercall.h>
#include <xen/vm_event.h>
#include <xen/monitor.h>
#include <asm/current.h>
#include <asm/irq.h>
#include <asm/page.h>
#include <asm/p2m.h>
#include <public/domctl.h>
#include <xsm/xsm.h>

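/*
 * domctl_lock serialises domctl operations system-wide; vcpu_alloc_lock
 * additionally guards VCPU allocation against concurrent updates of the
 * data it depends on (see the XEN_DOMCTL_max_vcpus handler below).
 */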
static DEFINE_SPINLOCK(domctl_lock);
DEFINE_SPINLOCK(vcpu_alloc_lock);

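/*
 * Copy a Xen-internal bitmap of @nbits bits into a guest-supplied
 * struct xenctl_bitmap.  The copy is truncated to the guest's nr_bits if
 * that is smaller, and any guest bytes beyond @nbits are zero-filled.
 */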
static int bitmap_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_bitmap,
                                   const unsigned long *bitmap,
                                   unsigned int nbits)
{
    unsigned int guest_bytes, copy_bytes, i;
    uint8_t zero = 0;
    int err = 0;
    uint8_t *bytemap = xmalloc_array(uint8_t, (nbits + 7) / 8);

    if ( !bytemap )
        return -ENOMEM;

    guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8;
    copy_bytes  = min_t(unsigned int, guest_bytes, (nbits + 7) / 8);

    bitmap_long_to_byte(bytemap, bitmap, nbits);

    if ( copy_bytes != 0 )
        if ( copy_to_guest(xenctl_bitmap->bitmap, bytemap, copy_bytes) )
            err = -EFAULT;

    for ( i = copy_bytes; !err && i < guest_bytes; i++ )
        if ( copy_to_guest_offset(xenctl_bitmap->bitmap, i, &zero, 1) )
            err = -EFAULT;

    xfree(bytemap);

    return err;
}

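/*
 * The inverse of the above: copy a guest-supplied xenctl_bitmap into a
 * Xen-internal bitmap of @nbits bits, masking off any stray bits in the
 * guest's final partial byte.
 */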
static int xenctl_bitmap_to_bitmap(unsigned long *bitmap,
                                   const struct xenctl_bitmap *xenctl_bitmap,
                                   unsigned int nbits)
{
    unsigned int guest_bytes, copy_bytes;
    int err = 0;
    uint8_t *bytemap = xzalloc_array(uint8_t, (nbits + 7) / 8);

    if ( !bytemap )
        return -ENOMEM;

    guest_bytes = (xenctl_bitmap->nr_bits + 7) / 8;
    copy_bytes  = min_t(unsigned int, guest_bytes, (nbits + 7) / 8);

    if ( copy_bytes != 0 )
    {
        if ( copy_from_guest(bytemap, xenctl_bitmap->bitmap, copy_bytes) )
            err = -EFAULT;
        if ( (xenctl_bitmap->nr_bits & 7) && (guest_bytes == copy_bytes) )
            bytemap[guest_bytes-1] &= ~(0xff << (xenctl_bitmap->nr_bits & 7));
    }

    if ( !err )
        bitmap_byte_to_long(bitmap, bytemap, nbits);

    xfree(bytemap);

    return err;
}

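/* Thin wrappers binding the generic helpers above to cpumask_t/nodemask_t. */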
int cpumask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_cpumap,
                             const cpumask_t *cpumask)
{
    return bitmap_to_xenctl_bitmap(xenctl_cpumap, cpumask_bits(cpumask),
                                   nr_cpu_ids);
}

int xenctl_bitmap_to_cpumask(cpumask_var_t *cpumask,
                             const struct xenctl_bitmap *xenctl_cpumap)
{
    int err = 0;

    if ( alloc_cpumask_var(cpumask) ) {
        err = xenctl_bitmap_to_bitmap(cpumask_bits(*cpumask), xenctl_cpumap,
                                      nr_cpu_ids);
        /* In case of error, cleanup is up to us, as the caller won't care! */
        if ( err )
            free_cpumask_var(*cpumask);
    }
    else
        err = -ENOMEM;

    return err;
}

static int nodemask_to_xenctl_bitmap(struct xenctl_bitmap *xenctl_nodemap,
                                     const nodemask_t *nodemask)
{
    return bitmap_to_xenctl_bitmap(xenctl_nodemap, nodes_addr(*nodemask),
                                   MAX_NUMNODES);
}

static int xenctl_bitmap_to_nodemask(nodemask_t *nodemask,
                                     const struct xenctl_bitmap *xenctl_nodemap)
{
    return xenctl_bitmap_to_bitmap(nodes_addr(*nodemask), xenctl_nodemap,
                                   MAX_NUMNODES);
}

static inline int is_free_domid(domid_t dom)
{
    struct domain *d;

    if ( dom >= DOMID_FIRST_RESERVED )
        return 0;

    if ( (d = rcu_lock_domain_by_id(dom)) == NULL )
        return 1;

    rcu_unlock_domain(d);
    return 0;
}

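/* Fill a xen_domctl_getdomaininfo with a snapshot of @d's current state. */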
void getdomaininfo(struct domain *d, struct xen_domctl_getdomaininfo *info)
{
    struct vcpu *v;
    u64 cpu_time = 0;
    int flags = XEN_DOMINF_blocked;
    struct vcpu_runstate_info runstate;

    info->domain = d->domain_id;
    info->max_vcpu_id = XEN_INVALID_MAX_VCPU_ID;
    info->nr_online_vcpus = 0;
    info->ssidref = 0;

    /*
     * - domain is marked as blocked only if all its vcpus are blocked
     * - domain is marked as running if any of its vcpus is running
     */
    for_each_vcpu ( d, v )
    {
        vcpu_runstate_get(v, &runstate);
        cpu_time += runstate.time[RUNSTATE_running];
        info->max_vcpu_id = v->vcpu_id;
        if ( !(v->pause_flags & VPF_down) )
        {
            if ( !(v->pause_flags & VPF_blocked) )
                flags &= ~XEN_DOMINF_blocked;
            if ( v->is_running )
                flags |= XEN_DOMINF_running;
            info->nr_online_vcpus++;
        }
    }

    info->cpu_time = cpu_time;

    info->flags = (info->nr_online_vcpus ? flags : 0) |
        ((d->is_dying == DOMDYING_dead) ? XEN_DOMINF_dying     : 0) |
        (d->is_shut_down                ? XEN_DOMINF_shutdown  : 0) |
        (d->controller_pause_count > 0  ? XEN_DOMINF_paused    : 0) |
        (d->debugger_attached           ? XEN_DOMINF_debugged  : 0) |
        (d->is_xenstore                 ? XEN_DOMINF_xs_domain : 0) |
        d->shutdown_code << XEN_DOMINF_shutdownshift;

    switch ( d->guest_type )
    {
    case guest_type_hvm:
        info->flags |= XEN_DOMINF_hvm_guest;
        break;
    default:
        break;
    }

    xsm_security_domaininfo(d, info);

    info->tot_pages         = d->tot_pages;
    info->max_pages         = d->max_pages;
    info->outstanding_pages = d->outstanding_pages;
    info->shr_pages         = atomic_read(&d->shr_pages);
    info->paged_pages       = atomic_read(&d->paged_pages);
    info->shared_info_frame = mfn_to_gmfn(d, virt_to_mfn(d->shared_info));
    BUG_ON(SHARED_M2P(info->shared_info_frame));

    info->cpupool = d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;

    memcpy(info->handle, d->handle, sizeof(xen_domain_handle_t));

    arch_get_domain_info(d, info);
}

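/*
 * Try to take the global domctl lock.  Returns 1 on success; returns 0
 * (without blocking) if the caller should back off, typically by creating
 * a hypercall continuation and retrying later.
 */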
bool_t domctl_lock_acquire(void)
{
    /*
     * Caller may try to pause its own VCPUs. We must prevent deadlock
     * against other non-domctl routines which try to do the same.
     */
    if ( !spin_trylock(&current->domain->hypercall_deadlock_mutex) )
        return 0;

    /*
     * Trylock here is paranoia if we have multiple privileged domains. Then
     * we could have one domain trying to pause another which is spinning
     * on domctl_lock -- results in deadlock.
     */
    if ( spin_trylock(&domctl_lock) )
        return 1;

    spin_unlock(&current->domain->hypercall_deadlock_mutex);
    return 0;
}

void domctl_lock_release(void)
{
    spin_unlock(&domctl_lock);
    spin_unlock(&current->domain->hypercall_deadlock_mutex);
}

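/*
 * An affinity request is invalid if no flag is set, or if a flag is set
 * without a corresponding guest bitmap being supplied.
 */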
static inline
int vcpuaffinity_params_invalid(const struct xen_domctl_vcpuaffinity *vcpuaff)
{
    return vcpuaff->flags == 0 ||
           ((vcpuaff->flags & XEN_VCPUAFFINITY_HARD) &&
            guest_handle_is_null(vcpuaff->cpumap_hard.bitmap)) ||
           ((vcpuaff->flags & XEN_VCPUAFFINITY_SOFT) &&
            guest_handle_is_null(vcpuaff->cpumap_soft.bitmap));
}

void vnuma_destroy(struct vnuma_info *vnuma)
{
    if ( vnuma )
    {
        xfree(vnuma->vmemrange);
        xfree(vnuma->vcpu_to_vnode);
        xfree(vnuma->vdistance);
        xfree(vnuma->vnode_to_pnode);
        xfree(vnuma);
    }
}

/*
 * Allocate memory for a vNUMA topology.
 * The caller has to make sure that the domain's max_pages
 * and number of vcpus are already set.
 * Verifies that no single allocation exceeds PAGE_SIZE.
 */
static struct vnuma_info *vnuma_alloc(unsigned int nr_vnodes,
                                      unsigned int nr_ranges,
                                      unsigned int nr_vcpus)
{
    struct vnuma_info *vnuma;

    /*
     * Check if any of the allocations are bigger than PAGE_SIZE.
     * See XSA-77.
     */
    if ( nr_vnodes * nr_vnodes > (PAGE_SIZE / sizeof(*vnuma->vdistance)) ||
         nr_ranges > (PAGE_SIZE / sizeof(*vnuma->vmemrange)) )
        return ERR_PTR(-EINVAL);

    /*
     * If allocations become larger than PAGE_SIZE, these allocations
     * should be split into PAGE_SIZE allocations due to XSA-77.
     */
    vnuma = xmalloc(struct vnuma_info);
    if ( !vnuma )
        return ERR_PTR(-ENOMEM);

    vnuma->vdistance = xmalloc_array(unsigned int, nr_vnodes * nr_vnodes);
    vnuma->vcpu_to_vnode = xmalloc_array(unsigned int, nr_vcpus);
    vnuma->vnode_to_pnode = xmalloc_array(nodeid_t, nr_vnodes);
    vnuma->vmemrange = xmalloc_array(xen_vmemrange_t, nr_ranges);

    if ( vnuma->vdistance == NULL || vnuma->vmemrange == NULL ||
         vnuma->vcpu_to_vnode == NULL || vnuma->vnode_to_pnode == NULL )
    {
        vnuma_destroy(vnuma);
        return ERR_PTR(-ENOMEM);
    }

    return vnuma;
}

/*
 * Construct vNUMA topology from uinfo.
 */
static struct vnuma_info *vnuma_init(const struct xen_domctl_vnuma *uinfo,
                                     const struct domain *d)
{
    unsigned int i, nr_vnodes;
    int ret = -EINVAL;
    struct vnuma_info *info;

    nr_vnodes = uinfo->nr_vnodes;

    if ( nr_vnodes == 0 || uinfo->nr_vcpus != d->max_vcpus || uinfo->pad != 0 )
        return ERR_PTR(ret);

    info = vnuma_alloc(nr_vnodes, uinfo->nr_vmemranges, d->max_vcpus);
    if ( IS_ERR(info) )
        return info;

    ret = -EFAULT;

    if ( copy_from_guest(info->vdistance, uinfo->vdistance,
                         nr_vnodes * nr_vnodes) )
        goto vnuma_fail;

    if ( copy_from_guest(info->vmemrange, uinfo->vmemrange,
                         uinfo->nr_vmemranges) )
        goto vnuma_fail;

    if ( copy_from_guest(info->vcpu_to_vnode, uinfo->vcpu_to_vnode,
                         d->max_vcpus) )
        goto vnuma_fail;

    ret = -E2BIG;
    for ( i = 0; i < d->max_vcpus; ++i )
        if ( info->vcpu_to_vnode[i] >= nr_vnodes )
            goto vnuma_fail;

    for ( i = 0; i < nr_vnodes; ++i )
    {
        unsigned int pnode;

        ret = -EFAULT;
        if ( copy_from_guest_offset(&pnode, uinfo->vnode_to_pnode, i, 1) )
            goto vnuma_fail;
        ret = -E2BIG;
        if ( pnode >= MAX_NUMNODES )
            goto vnuma_fail;
        info->vnode_to_pnode[i] = pnode;
    }

    info->nr_vnodes = nr_vnodes;
    info->nr_vmemranges = uinfo->nr_vmemranges;

    /* Check that vmemranges flags are zero. */
    ret = -EINVAL;
    for ( i = 0; i < info->nr_vmemranges; i++ )
        if ( info->vmemrange[i].flags != 0 )
            goto vnuma_fail;

    return info;

 vnuma_fail:
    vnuma_destroy(info);
    return ERR_PTR(ret);
}

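/* Top-level handler for the XEN_DOMCTL hypercall. */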
long do_domctl(XEN_GUEST_HANDLE_PARAM(xen_domctl_t) u_domctl)
{
    long ret = 0;
    bool_t copyback = 0;
    struct xen_domctl curop, *op = &curop;
    struct domain *d;

    if ( copy_from_guest(op, u_domctl, 1) )
        return -EFAULT;

    if ( op->interface_version != XEN_DOMCTL_INTERFACE_VERSION )
        return -EACCES;

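    /*
     * Look up the target domain.  Note the deliberately unusual placement of
     * the case labels below: createdomain, gdbsx_guestmemio, and
     * test_assign_device with DOMID_INVALID proceed with d == NULL, while
     * every other command requires op->domain to name an existing domain.
     */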
    switch ( op->cmd )
    {
    case XEN_DOMCTL_test_assign_device:
        if ( op->domain == DOMID_INVALID )
        {
    case XEN_DOMCTL_createdomain:
    case XEN_DOMCTL_gdbsx_guestmemio:
            d = NULL;
            break;
        }
        /* fall through */
    default:
        d = rcu_lock_domain_by_id(op->domain);
        if ( !d && op->cmd != XEN_DOMCTL_getdomaininfo )
            return -ESRCH;
    }

    ret = xsm_domctl(XSM_OTHER, d, op->cmd);
    if ( ret )
        goto domctl_out_unlock_domonly;

    if ( !domctl_lock_acquire() )
    {
        if ( d )
            rcu_unlock_domain(d);
        return hypercall_create_continuation(
            __HYPERVISOR_domctl, "h", u_domctl);
    }

    switch ( op->cmd )
    {

    case XEN_DOMCTL_setvcpucontext:
    {
        vcpu_guest_context_u c = { .nat = NULL };
        unsigned int vcpu = op->u.vcpucontext.vcpu;
        struct vcpu *v;

        ret = -EINVAL;
        if ( (d == current->domain) || /* no domain_pause() */
             (vcpu >= d->max_vcpus) || ((v = d->vcpu[vcpu]) == NULL) )
            break;

        if ( guest_handle_is_null(op->u.vcpucontext.ctxt) )
        {
            ret = vcpu_reset(v);
            if ( ret == -ERESTART )
                ret = hypercall_create_continuation(
                          __HYPERVISOR_domctl, "h", u_domctl);
            break;
        }

#ifdef CONFIG_COMPAT
        BUILD_BUG_ON(sizeof(struct vcpu_guest_context)
                     < sizeof(struct compat_vcpu_guest_context));
#endif
        ret = -ENOMEM;
        if ( (c.nat = alloc_vcpu_guest_context()) == NULL )
            break;

#ifdef CONFIG_COMPAT
        if ( !is_pv_32bit_domain(d) )
            ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
        else
            ret = copy_from_guest(c.cmp,
                                  guest_handle_cast(op->u.vcpucontext.ctxt,
                                                    void), 1);
#else
        ret = copy_from_guest(c.nat, op->u.vcpucontext.ctxt, 1);
#endif
        ret = ret ? -EFAULT : 0;

        if ( ret == 0 )
        {
            domain_pause(d);
            ret = arch_set_info_guest(v, c);
            domain_unpause(d);

            if ( ret == -ERESTART )
                ret = hypercall_create_continuation(
                          __HYPERVISOR_domctl, "h", u_domctl);
        }

        free_vcpu_guest_context(c.nat);
        break;
    }

    case XEN_DOMCTL_pausedomain:
        ret = -EINVAL;
        if ( d != current->domain )
            ret = domain_pause_by_systemcontroller(d);
        break;

    case XEN_DOMCTL_unpausedomain:
        ret = domain_unpause_by_systemcontroller(d);
        break;

    case XEN_DOMCTL_resumedomain:
        if ( d == current->domain ) /* no domain_pause() */
            ret = -EINVAL;
        else
            domain_resume(d);
        break;

    case XEN_DOMCTL_createdomain:
    {
        domid_t        dom;
        static domid_t rover = 0;
        unsigned int domcr_flags;

        ret = -EINVAL;
        if ( (op->u.createdomain.flags &
             ~(XEN_DOMCTL_CDF_hvm_guest
               | XEN_DOMCTL_CDF_hap
               | XEN_DOMCTL_CDF_s3_integrity
               | XEN_DOMCTL_CDF_oos_off
               | XEN_DOMCTL_CDF_xs_domain)) )
            break;

        dom = op->domain;
        if ( (dom > 0) && (dom < DOMID_FIRST_RESERVED) )
        {
            ret = -EINVAL;
            if ( !is_free_domid(dom) )
                break;
        }
        else
        {
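            /*
             * No specific domid requested: scan forward from the last
             * allocated id ("rover"), wrapping within
             * [1, DOMID_FIRST_RESERVED), until a free id is found.
             */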
            for ( dom = rover + 1; dom != rover; dom++ )
            {
                if ( dom == DOMID_FIRST_RESERVED )
                    dom = 1;
                if ( is_free_domid(dom) )
                    break;
            }

            ret = -ENOMEM;
            if ( dom == rover )
                break;

            rover = dom;
        }

        domcr_flags = 0;
        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hvm_guest )
            domcr_flags |= DOMCRF_hvm;
        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_hap )
            domcr_flags |= DOMCRF_hap;
        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_s3_integrity )
            domcr_flags |= DOMCRF_s3_integrity;
        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_oos_off )
            domcr_flags |= DOMCRF_oos_off;
        if ( op->u.createdomain.flags & XEN_DOMCTL_CDF_xs_domain )
            domcr_flags |= DOMCRF_xs_domain;

        d = domain_create(dom, domcr_flags, op->u.createdomain.ssidref,
                          &op->u.createdomain.config);
        if ( IS_ERR(d) )
        {
            ret = PTR_ERR(d);
            d = NULL;
            break;
        }

        ret = 0;

        memcpy(d->handle, op->u.createdomain.handle,
               sizeof(xen_domain_handle_t));

        op->domain = d->domain_id;
        copyback = 1;
        d = NULL;
        break;
    }

    case XEN_DOMCTL_max_vcpus:
    {
        unsigned int i, max = op->u.max_vcpus.max, cpu;
        cpumask_t *online;

        ret = -EINVAL;
        if ( (d == current->domain) || /* no domain_pause() */
             (max > domain_max_vcpus(d)) )
            break;

        /* Until Xenoprof can dynamically grow its vcpu-s array... */
        if ( d->xenoprof )
        {
            ret = -EAGAIN;
            break;
        }

        /* Needed, for example, to ensure writable p.t. state is synced. */
        domain_pause(d);

        /*
         * Certain operations (e.g. CPU microcode updates) modify data which
         * is used during VCPU allocation/initialization.
         */
        while ( !spin_trylock(&vcpu_alloc_lock) )
        {
            if ( hypercall_preempt_check() )
            {
                ret = hypercall_create_continuation(
                    __HYPERVISOR_domctl, "h", u_domctl);
                goto maxvcpu_out_novcpulock;
            }
        }

        /* We cannot reduce maximum VCPUs. */
        ret = -EINVAL;
        if ( (max < d->max_vcpus) && (d->vcpu[max] != NULL) )
            goto maxvcpu_out;

        /*
         * For now don't allow increasing the vcpu count from a non-zero
         * value: This code and all readers of d->vcpu would otherwise need
         * to be converted to use RCU, but at present there's no tools side
         * code path that would issue such a request.
         */
        ret = -EBUSY;
        if ( (d->max_vcpus > 0) && (max > d->max_vcpus) )
            goto maxvcpu_out;

        ret = -ENOMEM;
        online = cpupool_domain_cpumask(d);
        if ( max > d->max_vcpus )
        {
            struct vcpu **vcpus;

            BUG_ON(d->vcpu != NULL);
            BUG_ON(d->max_vcpus != 0);

            if ( (vcpus = xzalloc_array(struct vcpu *, max)) == NULL )
                goto maxvcpu_out;

            /* Install vcpu array /then/ update max_vcpus. */
            d->vcpu = vcpus;
            smp_wmb();
            d->max_vcpus = max;
        }

        for ( i = 0; i < max; i++ )
        {
            if ( d->vcpu[i] != NULL )
                continue;

            cpu = (i == 0) ?
                cpumask_any(online) :
                cpumask_cycle(d->vcpu[i-1]->processor, online);

            if ( alloc_vcpu(d, i, cpu) == NULL )
                goto maxvcpu_out;
        }

        ret = 0;

    maxvcpu_out:
        spin_unlock(&vcpu_alloc_lock);

    maxvcpu_out_novcpulock:
        domain_unpause(d);
        break;
    }

    case XEN_DOMCTL_soft_reset:
        if ( d == current->domain ) /* no domain_pause() */
        {
            ret = -EINVAL;
            break;
        }
        ret = domain_soft_reset(d);
        break;

    case XEN_DOMCTL_destroydomain:
        ret = domain_kill(d);
        if ( ret == -ERESTART )
            ret = hypercall_create_continuation(
                __HYPERVISOR_domctl, "h", u_domctl);
        break;

    case XEN_DOMCTL_setnodeaffinity:
    {
        nodemask_t new_affinity;

        ret = xenctl_bitmap_to_nodemask(&new_affinity,
                                        &op->u.nodeaffinity.nodemap);
        if ( !ret )
            ret = domain_set_node_affinity(d, &new_affinity);
        break;
    }

    case XEN_DOMCTL_getnodeaffinity:
        ret = nodemask_to_xenctl_bitmap(&op->u.nodeaffinity.nodemap,
                                        &d->node_affinity);
        break;

    case XEN_DOMCTL_setvcpuaffinity:
    case XEN_DOMCTL_getvcpuaffinity:
    {
        struct vcpu *v;
        struct xen_domctl_vcpuaffinity *vcpuaff = &op->u.vcpuaffinity;

        ret = -EINVAL;
        if ( vcpuaff->vcpu >= d->max_vcpus )
            break;

        ret = -ESRCH;
        if ( (v = d->vcpu[vcpuaff->vcpu]) == NULL )
            break;

        ret = -EINVAL;
        if ( vcpuaffinity_params_invalid(vcpuaff) )
            break;

        if ( op->cmd == XEN_DOMCTL_setvcpuaffinity )
        {
            cpumask_var_t new_affinity, old_affinity;
            cpumask_t *online = cpupool_domain_cpumask(v->domain);

            /*
             * We want to be able to restore hard affinity if we are trying
             * to set both and changing soft affinity (which happens later,
             * when hard affinity has already been changed successfully)
             * fails.
             */
            if ( !alloc_cpumask_var(&old_affinity) )
            {
                ret = -ENOMEM;
                break;
            }
            cpumask_copy(old_affinity, v->cpu_hard_affinity);

            if ( !alloc_cpumask_var(&new_affinity) )
            {
                free_cpumask_var(old_affinity);
                ret = -ENOMEM;
                break;
            }

            /* Undo a stuck SCHED_pin_override? */
            if ( vcpuaff->flags & XEN_VCPUAFFINITY_FORCE )
                vcpu_pin_override(v, -1);

            ret = 0;

            /*
             * We both set a new affinity and report back to the caller what
             * the scheduler will be effectively using.
             */
            if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
            {
                ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
                                              &vcpuaff->cpumap_hard,
                                              nr_cpu_ids);
                if ( !ret )
                    ret = vcpu_set_hard_affinity(v, new_affinity);
                if ( ret )
                    goto setvcpuaffinity_out;

                /*
                 * For hard affinity, what we return is the intersection of
                 * cpupool's online mask and the new hard affinity.
                 */
                cpumask_and(new_affinity, online, v->cpu_hard_affinity);
                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
                                               new_affinity);
            }
            if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
            {
                ret = xenctl_bitmap_to_bitmap(cpumask_bits(new_affinity),
                                              &vcpuaff->cpumap_soft,
                                              nr_cpu_ids);
                if ( !ret )
                    ret = vcpu_set_soft_affinity(v, new_affinity);
                if ( ret )
                {
                    /*
                     * Since we're returning error, the caller expects nothing
                     * happened, so we rollback the changes to hard affinity
                     * (if any).
                     */
                    if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
                        vcpu_set_hard_affinity(v, old_affinity);
                    goto setvcpuaffinity_out;
                }

                /*
                 * For soft affinity, we return the intersection between the
                 * new soft affinity, the cpupool's online map and the (new)
                 * hard affinity.
                 */
                cpumask_and(new_affinity, new_affinity, online);
                cpumask_and(new_affinity, new_affinity, v->cpu_hard_affinity);
                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
                                               new_affinity);
            }

 setvcpuaffinity_out:
            free_cpumask_var(new_affinity);
            free_cpumask_var(old_affinity);
        }
        else
        {
            if ( vcpuaff->flags & XEN_VCPUAFFINITY_HARD )
                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_hard,
                                               v->cpu_hard_affinity);
            if ( vcpuaff->flags & XEN_VCPUAFFINITY_SOFT )
                ret = cpumask_to_xenctl_bitmap(&vcpuaff->cpumap_soft,
                                               v->cpu_soft_affinity);
        }
        break;
    }

    case XEN_DOMCTL_scheduler_op:
        ret = sched_adjust(d, &op->u.scheduler_op);
        copyback = 1;
        break;

    case XEN_DOMCTL_getdomaininfo:
    {
        domid_t dom = DOMID_INVALID;

        if ( !d )
        {
            ret = -EINVAL;
            if ( op->domain >= DOMID_FIRST_RESERVED )
                break;

            rcu_read_lock(&domlist_read_lock);

            dom = op->domain;
            for_each_domain ( d )
                if ( d->domain_id >= dom )
                    break;
        }

        ret = -ESRCH;
        if ( d == NULL )
            goto getdomaininfo_out;

        ret = xsm_getdomaininfo(XSM_HOOK, d);
        if ( ret )
            goto getdomaininfo_out;

        getdomaininfo(d, &op->u.getdomaininfo);

        op->domain = op->u.getdomaininfo.domain;
        copyback = 1;

    getdomaininfo_out:
        /* When d was non-NULL upon entry, no cleanup is needed. */
        if ( dom == DOMID_INVALID )
            break;

        rcu_read_unlock(&domlist_read_lock);
        d = NULL;
        break;
    }

    case XEN_DOMCTL_getvcpucontext:
    {
        vcpu_guest_context_u c = { .nat = NULL };
        struct vcpu         *v;

        ret = -EINVAL;
        if ( op->u.vcpucontext.vcpu >= d->max_vcpus ||
             (v = d->vcpu[op->u.vcpucontext.vcpu]) == NULL ||
             v == current ) /* no vcpu_pause() */
            goto getvcpucontext_out;

        ret = -ENODATA;
        if ( !v->is_initialised )
            goto getvcpucontext_out;

#ifdef CONFIG_COMPAT
        BUILD_BUG_ON(sizeof(struct vcpu_guest_context)
                     < sizeof(struct compat_vcpu_guest_context));
#endif
        ret = -ENOMEM;
        if ( (c.nat = xzalloc(struct vcpu_guest_context)) == NULL )
            goto getvcpucontext_out;

        vcpu_pause(v);

        arch_get_info_guest(v, c);
        ret = 0;

        vcpu_unpause(v);

#ifdef CONFIG_COMPAT
        if ( !is_pv_32bit_domain(d) )
            ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
        else
            ret = copy_to_guest(guest_handle_cast(op->u.vcpucontext.ctxt,
                                                  void), c.cmp, 1);
#else
        ret = copy_to_guest(op->u.vcpucontext.ctxt, c.nat, 1);
#endif

        if ( ret )
            ret = -EFAULT;
        copyback = 1;

    getvcpucontext_out:
        xfree(c.nat);
        break;
    }

    case XEN_DOMCTL_getvcpuinfo:
    {
        struct vcpu   *v;
        struct vcpu_runstate_info runstate;

        ret = -EINVAL;
        if ( op->u.getvcpuinfo.vcpu >= d->max_vcpus )
            break;

        ret = -ESRCH;
        if ( (v = d->vcpu[op->u.getvcpuinfo.vcpu]) == NULL )
            break;

        vcpu_runstate_get(v, &runstate);

        op->u.getvcpuinfo.online   = !(v->pause_flags & VPF_down);
        op->u.getvcpuinfo.blocked  = !!(v->pause_flags & VPF_blocked);
        op->u.getvcpuinfo.running  = v->is_running;
        op->u.getvcpuinfo.cpu_time = runstate.time[RUNSTATE_running];
        op->u.getvcpuinfo.cpu      = v->processor;
        ret = 0;
        copyback = 1;
        break;
    }

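    /*
     * max_memkb is expressed in KiB; shifting right by (PAGE_SHIFT - 10)
     * converts it to a page count before clamping it into d->max_pages.
     */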
    case XEN_DOMCTL_max_mem:
    {
        uint64_t new_max = op->u.max_mem.max_memkb >> (PAGE_SHIFT - 10);

        spin_lock(&d->page_alloc_lock);
        /*
         * NB. We removed a check that new_max >= current tot_pages; this means
         * that the domain will now be allowed to "ratchet" down to new_max. In
         * the meantime, while tot > max, all new allocations are disallowed.
         */
        d->max_pages = min(new_max, (uint64_t)(typeof(d->max_pages))-1);
        spin_unlock(&d->page_alloc_lock);
        break;
    }

    case XEN_DOMCTL_setdomainhandle:
        memcpy(d->handle, op->u.setdomainhandle.handle,
               sizeof(xen_domain_handle_t));
        break;

    case XEN_DOMCTL_setdebugging:
        if ( unlikely(d == current->domain) ) /* no domain_pause() */
            ret = -EINVAL;
        else
        {
            domain_pause(d);
            d->debugger_attached = !!op->u.setdebugging.enable;
            domain_unpause(d); /* causes guest to latch new status */
        }
        break;

    case XEN_DOMCTL_irq_permission:
    {
        unsigned int pirq = op->u.irq_permission.pirq, irq;
        int allow = op->u.irq_permission.allow_access;

        if ( pirq >= current->domain->nr_pirqs )
        {
            ret = -EINVAL;
            break;
        }
        irq = pirq_access_permitted(current->domain, pirq);
        if ( !irq || xsm_irq_permission(XSM_HOOK, d, irq, allow) )
            ret = -EPERM;
        else if ( allow )
            ret = irq_permit_access(d, irq);
        else
            ret = irq_deny_access(d, irq);
        break;
    }

    case XEN_DOMCTL_iomem_permission:
    {
        unsigned long mfn = op->u.iomem_permission.first_mfn;
        unsigned long nr_mfns = op->u.iomem_permission.nr_mfns;
        int allow = op->u.iomem_permission.allow_access;

        ret = -EINVAL;
        if ( (mfn + nr_mfns - 1) < mfn ) /* wrap? */
            break;

        if ( !iomem_access_permitted(current->domain,
                                     mfn, mfn + nr_mfns - 1) ||
             xsm_iomem_permission(XSM_HOOK, d, mfn, mfn + nr_mfns - 1, allow) )
            ret = -EPERM;
        else if ( allow )
            ret = iomem_permit_access(d, mfn, mfn + nr_mfns - 1);
        else
            ret = iomem_deny_access(d, mfn, mfn + nr_mfns - 1);
        if ( !ret )
            memory_type_changed(d);
        break;
    }

    case XEN_DOMCTL_memory_mapping:
    {
        unsigned long gfn = op->u.memory_mapping.first_gfn;
        unsigned long mfn = op->u.memory_mapping.first_mfn;
        unsigned long nr_mfns = op->u.memory_mapping.nr_mfns;
        unsigned long mfn_end = mfn + nr_mfns - 1;
        int add = op->u.memory_mapping.add_mapping;

        ret = -EINVAL;
        if ( mfn_end < mfn || /* wrap? */
             ((mfn | mfn_end) >> (paddr_bits - PAGE_SHIFT)) ||
             (gfn + nr_mfns - 1) < gfn ) /* wrap? */
            break;

#ifndef CONFIG_X86 /* XXX ARM!? */
        ret = -E2BIG;
        /* Must break hypercall up as this could take a while. */
        if ( nr_mfns > 64 )
            break;
#endif

        ret = -EPERM;
        if ( !iomem_access_permitted(current->domain, mfn, mfn_end) ||
             !iomem_access_permitted(d, mfn, mfn_end) )
            break;

        ret = xsm_iomem_mapping(XSM_HOOK, d, mfn, mfn_end, add);
        if ( ret )
            break;

        if ( add )
        {
            printk(XENLOG_G_DEBUG
                   "memory_map:add: dom%d gfn=%lx mfn=%lx nr=%lx\n",
                   d->domain_id, gfn, mfn, nr_mfns);

            ret = map_mmio_regions(d, _gfn(gfn), nr_mfns, _mfn(mfn));
            if ( ret < 0 )
                printk(XENLOG_G_WARNING
                       "memory_map:fail: dom%d gfn=%lx mfn=%lx nr=%lx ret:%ld\n",
                       d->domain_id, gfn, mfn, nr_mfns, ret);
        }
        else
        {
            printk(XENLOG_G_DEBUG
                   "memory_map:remove: dom%d gfn=%lx mfn=%lx nr=%lx\n",
                   d->domain_id, gfn, mfn, nr_mfns);

            ret = unmap_mmio_regions(d, _gfn(gfn), nr_mfns, _mfn(mfn));
            if ( ret < 0 && is_hardware_domain(current->domain) )
                printk(XENLOG_ERR
                       "memory_map: error %ld removing dom%d access to [%lx,%lx]\n",
                       ret, d->domain_id, mfn, mfn_end);
        }
        /* Do this unconditionally to cover errors on above failure paths. */
        memory_type_changed(d);
        break;
    }

    case XEN_DOMCTL_settimeoffset:
        domain_set_time_offset(d, op->u.settimeoffset.time_offset_seconds);
        break;

    case XEN_DOMCTL_set_target:
    {
        struct domain *e;

        ret = -ESRCH;
        e = get_domain_by_id(op->u.set_target.target);
        if ( e == NULL )
            break;

        ret = -EINVAL;
        if ( (d == e) || (d->target != NULL) )
        {
            put_domain(e);
            break;
        }

        ret = -EOPNOTSUPP;
        if ( is_hvm_domain(e) )
            ret = xsm_set_target(XSM_HOOK, d, e);
        if ( ret )
        {
            put_domain(e);
            break;
        }

        /* Hold reference on @e until we destroy @d. */
        d->target = e;
        break;
    }

    case XEN_DOMCTL_subscribe:
        d->suspend_evtchn = op->u.subscribe.port;
        break;

    case XEN_DOMCTL_vm_event_op:
        ret = vm_event_domctl(d, &op->u.vm_event_op,
                              guest_handle_cast(u_domctl, void));
        copyback = 1;
        break;

#ifdef CONFIG_HAS_MEM_ACCESS
    case XEN_DOMCTL_set_access_required:
        if ( unlikely(current->domain == d) ) /* no domain_pause() */
            ret = -EPERM;
        else
        {
            domain_pause(d);
            p2m_get_hostp2m(d)->access_required =
                op->u.access_required.access_required;
            domain_unpause(d);
        }
        break;
#endif

    case XEN_DOMCTL_set_virq_handler:
        ret = set_global_virq_handler(d, op->u.set_virq_handler.virq);
        break;

    case XEN_DOMCTL_set_max_evtchn:
        d->max_evtchn_port = min_t(unsigned int,
                                   op->u.set_max_evtchn.max_port,
                                   INT_MAX);
        break;

    case XEN_DOMCTL_setvnumainfo:
    {
        struct vnuma_info *vnuma;

        vnuma = vnuma_init(&op->u.vnuma, d);
        if ( IS_ERR(vnuma) )
        {
            ret = PTR_ERR(vnuma);
            break;
        }

        /* Overwrite the domain's vNUMA topology. */
        write_lock(&d->vnuma_rwlock);
        vnuma_destroy(d->vnuma);
        d->vnuma = vnuma;
        write_unlock(&d->vnuma_rwlock);

        break;
    }

    case XEN_DOMCTL_monitor_op:
        ret = monitor_domctl(d, &op->u.monitor_op);
        if ( !ret )
            copyback = 1;
        break;

    case XEN_DOMCTL_set_gnttab_limits:
        ret = grant_table_set_limits(d, op->u.set_gnttab_limits.grant_frames,
                                     op->u.set_gnttab_limits.maptrack_frames);
        break;

    default:
        ret = arch_do_domctl(op, d, u_domctl);
        break;
    }

    domctl_lock_release();

 domctl_out_unlock_domonly:
    if ( d )
        rcu_unlock_domain(d);

    if ( copyback && __copy_to_guest(u_domctl, op, 1) )
        ret = -EFAULT;

    return ret;
}

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */