/******************************************************************************
 * cpupool.c
 *
 * Generic cpupool-handling functions.
 *
 * Cpupools are a feature to have configurable scheduling domains. Each
 * cpupool runs its own scheduler on a dedicated set of physical cpus.
 * A domain is bound to one cpupool at any time, but it can be moved to
 * another cpupool.
 *
 * (C) 2009, Juergen Gross, Fujitsu Technology Solutions
 */

#include <xen/cpu.h>
#include <xen/cpumask.h>
#include <xen/guest_access.h>
#include <xen/hypfs.h>
#include <xen/init.h>
#include <xen/keyhandler.h>
#include <xen/lib.h>
#include <xen/list.h>
#include <xen/param.h>
#include <xen/percpu.h>
#include <xen/sched.h>
#include <xen/warning.h>

#include "private.h"

struct cpupool *cpupool0;                /* Initial cpupool with Dom0 */
cpumask_t cpupool_free_cpus;             /* cpus not in any cpupool */

static LIST_HEAD(cpupool_list);          /* linked list, sorted by poolid */
static unsigned int n_cpupools;

static int cpupool_moving_cpu = -1;
static struct cpupool *cpupool_cpu_moving = NULL;
static cpumask_t cpupool_locked_cpus;

/* This lock nests inside sysctl or hypfs lock. */
static DEFINE_SPINLOCK(cpupool_lock);

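/*
 * Scheduling granularity: opt_sched_granularity selects the resource unit
 * (cpu, core or socket), while sched_granularity caches the resulting number
 * of cpus per scheduling resource as established at boot by
 * cpupool_gran_init().
 */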
static enum sched_gran __read_mostly opt_sched_granularity = SCHED_GRAN_cpu;
static unsigned int __read_mostly sched_granularity = 1;

#define SCHED_GRAN_NAME_LEN  8
struct sched_gran_name {
    enum sched_gran mode;
    char name[SCHED_GRAN_NAME_LEN];
};

static const struct sched_gran_name sg_name[] = {
    {SCHED_GRAN_cpu, "cpu"},
    {SCHED_GRAN_core, "core"},
    {SCHED_GRAN_socket, "socket"},
};

static const char *sched_gran_get_name(enum sched_gran mode)
{
    const char *name = "";
    unsigned int i;

    for ( i = 0; i < ARRAY_SIZE(sg_name); i++ )
    {
        if ( mode == sg_name[i].mode )
        {
            name = sg_name[i].name;
            break;
        }
    }

    return name;
}

static void sched_gran_print(enum sched_gran mode, unsigned int gran)
{
    printk("Scheduling granularity: %s, %u CPU%s per sched-resource\n",
           sched_gran_get_name(mode), gran, gran == 1 ? "" : "s");
}

#ifdef CONFIG_HAS_SCHED_GRANULARITY
static int sched_gran_get(const char *str, enum sched_gran *mode)
{
    unsigned int i;

    for ( i = 0; i < ARRAY_SIZE(sg_name); i++ )
    {
        if ( strcmp(sg_name[i].name, str) == 0 )
        {
            *mode = sg_name[i].mode;
            return 0;
        }
    }

    return -EINVAL;
}

static int __init cf_check sched_select_granularity(const char *str)
{
    return sched_gran_get(str, &opt_sched_granularity);
}
custom_param("sched-gran", sched_select_granularity);
#elif defined(CONFIG_HYPFS)
static int sched_gran_get(const char *str, enum sched_gran *mode)
{
    return -EINVAL;
}
#endif

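/*
 * Return the number of cpus per scheduling resource for the given mode, or
 * 0 if the topology is asymmetric (i.e. not all online cpus have the same
 * number of siblings at that granularity).
 */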
static unsigned int cpupool_check_granularity(enum sched_gran mode)
{
    unsigned int cpu;
    unsigned int siblings, gran = 0;

    if ( mode == SCHED_GRAN_cpu )
        return 1;

    for_each_online_cpu ( cpu )
    {
        siblings = cpumask_weight(sched_get_opt_cpumask(mode, cpu));
        if ( gran == 0 )
            gran = siblings;
        else if ( gran != siblings )
            return 0;
    }

    return gran;
}

/* Setup data for selected scheduler granularity. */
static void __init cpupool_gran_init(void)
{
    unsigned int gran = 0;
    const char *fallback = NULL;

    while ( gran == 0 )
    {
        gran = cpupool_check_granularity(opt_sched_granularity);

        if ( gran == 0 )
        {
            switch ( opt_sched_granularity )
            {
            case SCHED_GRAN_core:
                opt_sched_granularity = SCHED_GRAN_cpu;
                fallback = "Asymmetric cpu configuration.\n"
                           "Falling back to sched-gran=cpu.\n";
                break;
            case SCHED_GRAN_socket:
                opt_sched_granularity = SCHED_GRAN_core;
                fallback = "Asymmetric cpu configuration.\n"
                           "Falling back to sched-gran=core.\n";
                break;
            default:
                ASSERT_UNREACHABLE();
                break;
            }
        }
    }

    if ( fallback )
        warning_add(fallback);

    if ( opt_sched_granularity != SCHED_GRAN_cpu )
        sched_disable_smt_switching = true;

    sched_granularity = gran;
    sched_gran_print(opt_sched_granularity, sched_granularity);
}

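/* Granularity of a pool; free (unpooled) cpus are handled as single cpus. */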
unsigned int cpupool_get_granularity(const struct cpupool *c)
{
    return c ? c->sched_gran : 1;
}

static void free_cpupool_struct(struct cpupool *c)
{
    if ( c )
    {
        free_cpumask_var(c->res_valid);
        free_cpumask_var(c->cpu_valid);
    }
    xfree(c);
}

static struct cpupool *alloc_cpupool_struct(void)
{
    struct cpupool *c = xzalloc(struct cpupool);

    if ( !c )
        return NULL;

    if ( !zalloc_cpumask_var(&c->cpu_valid) ||
         !zalloc_cpumask_var(&c->res_valid) )
    {
        free_cpupool_struct(c);
        c = NULL;
    }

    return c;
}

/*
 * Find a cpupool by its id. To be called with the cpupool lock held.
 * If exact is not specified, the first cpupool with an id larger than or
 * equal to the searched id is returned.
 * Returns NULL if not found.
 */
static struct cpupool *__cpupool_find_by_id(unsigned int id, bool exact)
{
    struct cpupool *q;

    ASSERT(spin_is_locked(&cpupool_lock));

    list_for_each_entry(q, &cpupool_list, list)
        if ( q->cpupool_id == id || (!exact && q->cpupool_id > id) )
            return q;

    return NULL;
}

static struct cpupool *cpupool_find_by_id(unsigned int poolid)
{
    return __cpupool_find_by_id(poolid, true);
}

static struct cpupool *__cpupool_get_by_id(unsigned int poolid, bool exact)
{
    struct cpupool *c;
    spin_lock(&cpupool_lock);
    c = __cpupool_find_by_id(poolid, exact);
    if ( c != NULL )
        atomic_inc(&c->refcnt);
    spin_unlock(&cpupool_lock);
    return c;
}

struct cpupool *cpupool_get_by_id(unsigned int poolid)
{
    return __cpupool_get_by_id(poolid, true);
}

static struct cpupool *cpupool_get_next_by_id(unsigned int poolid)
{
    return __cpupool_get_by_id(poolid, false);
}

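/*
 * Drop a reference to a cpupool. The structure (and its scheduler instance)
 * is freed when the last reference, normally the one held on behalf of
 * cpupool_destroy(), is released.
 */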
void cpupool_put(struct cpupool *pool)
{
    if ( !atomic_dec_and_test(&pool->refcnt) )
        return;
    scheduler_free(pool->sched);
    free_cpupool_struct(pool);
}

/*
 * create a new cpupool with specified poolid and scheduler
 * returns pointer to new cpupool structure if okay, ERR_PTR(-errno) otherwise
 * possible failures:
 * - no memory
 * - poolid already used
 * - unknown scheduler
 * - too many cpupools existing already
 */
static struct cpupool *cpupool_create(unsigned int poolid,
                                      unsigned int sched_id)
{
    struct cpupool *c;
    struct cpupool *q;
    int ret;

    if ( (c = alloc_cpupool_struct()) == NULL )
        return ERR_PTR(-ENOMEM);

    /* One reference for caller, one reference for cpupool_destroy(). */
    atomic_set(&c->refcnt, 2);

    debugtrace_printk("cpupool_create(pool=%u,sched=%u)\n", poolid, sched_id);

    spin_lock(&cpupool_lock);

    /* Don't allow too many cpupools. */
    if ( n_cpupools >= 2 * nr_cpu_ids )
    {
        ret = -ENOSPC;
        goto unlock;
    }
    n_cpupools++;

    if ( poolid != CPUPOOLID_NONE )
    {
        q = __cpupool_find_by_id(poolid, false);
        if ( !q )
            list_add_tail(&c->list, &cpupool_list);
        else
        {
            list_add_tail(&c->list, &q->list);
            if ( q->cpupool_id == poolid )
            {
                ret = -EEXIST;
                goto err;
            }
        }

        c->cpupool_id = poolid;
    }
    else
    {
        /* Cpupool 0 is created with specified id at boot and never removed. */
        ASSERT(!list_empty(&cpupool_list));

        q = list_last_entry(&cpupool_list, struct cpupool, list);
        /* In case of wrap search for first free id. */
        if ( q->cpupool_id == CPUPOOLID_NONE - 1 )
        {
            list_for_each_entry(q, &cpupool_list, list)
                if ( q->cpupool_id + 1 != list_next_entry(q, list)->cpupool_id )
                    break;
        }

        list_add(&c->list, &q->list);

        c->cpupool_id = q->cpupool_id + 1;
    }

    c->sched = scheduler_alloc(sched_id);
    if ( IS_ERR(c->sched) )
    {
        ret = PTR_ERR(c->sched);
        goto err;
    }

    c->sched->cpupool = c;
    c->gran = opt_sched_granularity;
    c->sched_gran = sched_granularity;

    spin_unlock(&cpupool_lock);

    debugtrace_printk("Created cpupool %u with scheduler %s (%s)\n",
                      c->cpupool_id, c->sched->name, c->sched->opt_name);

    return c;

 err:
    list_del(&c->list);
    n_cpupools--;

 unlock:
    spin_unlock(&cpupool_lock);

    free_cpupool_struct(c);

    return ERR_PTR(ret);
}
/*
 * destroys the given cpupool
 * returns 0 on success, -EBUSY otherwise
 * possible failures:
 * - pool still in use
 * - cpus still assigned to pool
 */
static int cpupool_destroy(struct cpupool *c)
{
    spin_lock(&cpupool_lock);

    if ( (c->n_dom != 0) || cpumask_weight(c->cpu_valid) )
    {
        spin_unlock(&cpupool_lock);
        return -EBUSY;
    }

    n_cpupools--;
    list_del(&c->list);

    spin_unlock(&cpupool_lock);

    cpupool_put(c);

    debugtrace_printk("cpupool_destroy(pool=%u)\n", c->cpupool_id);
    return 0;
}

/*
 * Move domain to another cpupool
 */
static int cpupool_move_domain_locked(struct domain *d, struct cpupool *c)
{
    int ret;

    if ( unlikely(d->cpupool == c) )
        return 0;

    d->cpupool->n_dom--;
    ret = sched_move_domain(d, c);
    if ( ret )
        d->cpupool->n_dom++;
    else
        c->n_dom++;

    return ret;
}
int cpupool_move_domain(struct domain *d, struct cpupool *c)
{
    int ret;

    spin_lock(&cpupool_lock);

    ret = cpupool_move_domain_locked(d, c);

    spin_unlock(&cpupool_lock);

    return ret;
}

/* Update affinities of all domains in a cpupool. */
static void cpupool_update_node_affinity(const struct cpupool *c,
                                         struct affinity_masks *masks)
{
    struct affinity_masks local_masks;
    struct domain *d;

    if ( !masks )
    {
        if ( !alloc_affinity_masks(&local_masks) )
            return;
        masks = &local_masks;
    }

    rcu_read_lock(&domlist_read_lock);

    for_each_domain_in_cpupool(d, c)
        domain_update_node_aff(d, masks);

    rcu_read_unlock(&domlist_read_lock);

    if ( masks == &local_masks )
        free_affinity_masks(masks);
}

/*
 * assign a specific cpu to a cpupool
 * cpupool_lock must be held
 */
static int cpupool_assign_cpu_locked(struct cpupool *c, unsigned int cpu)
{
    int ret;
    const cpumask_t *cpus;

    cpus = sched_get_opt_cpumask(c->gran, cpu);

    if ( (cpupool_moving_cpu == cpu) && (c != cpupool_cpu_moving) )
        return -EADDRNOTAVAIL;
    ret = schedule_cpu_add(cpumask_first(cpus), c);
    if ( ret )
        return ret;

    rcu_read_lock(&sched_res_rculock);

    cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
    if ( cpupool_moving_cpu == cpu )
    {
        cpupool_moving_cpu = -1;
        cpupool_put(cpupool_cpu_moving);
        cpupool_cpu_moving = NULL;
    }
    cpumask_or(c->cpu_valid, c->cpu_valid, cpus);
    cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);

    rcu_read_unlock(&sched_res_rculock);

    cpupool_update_node_affinity(c, NULL);

    return 0;
}

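/*
 * Unassigning a cpu is done in two phases: cpupool_unassign_cpu_start()
 * detaches the cpu from the pool's cpumasks and records it in
 * cpupool_moving_cpu/cpupool_cpu_moving, while the actual removal from the
 * scheduler happens here, possibly on another cpu (see
 * cpupool_unassign_cpu() below).
 */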
static int cpupool_unassign_cpu_finish(struct cpupool *c,
                                       struct cpu_rm_data *mem)
{
    int cpu = cpupool_moving_cpu;
    const cpumask_t *cpus;
    struct affinity_masks *masks = mem ? &mem->affinity : NULL;
    int ret;

    if ( c != cpupool_cpu_moving )
        return -EADDRNOTAVAIL;

    rcu_read_lock(&domlist_read_lock);
    ret = cpu_disable_scheduler(cpu);
    rcu_read_unlock(&domlist_read_lock);

    rcu_read_lock(&sched_res_rculock);
    cpus = get_sched_res(cpu)->cpus;
    cpumask_or(&cpupool_free_cpus, &cpupool_free_cpus, cpus);

    /*
     * cpu_disable_scheduler() returning an error doesn't require resetting
     * cpupool_free_cpus' cpu bit. All error cases should be of temporary
     * nature and tools will retry the operation. Even if the number of
     * retries may be limited, the in-between state can easily be repaired
     * by adding the cpu to the cpupool again.
     */
    if ( !ret )
    {
        ret = schedule_cpu_rm(cpu, mem);
        if ( ret )
            cpumask_andnot(&cpupool_free_cpus, &cpupool_free_cpus, cpus);
        else
        {
            cpupool_moving_cpu = -1;
            cpupool_put(cpupool_cpu_moving);
            cpupool_cpu_moving = NULL;
        }
    }
    rcu_read_unlock(&sched_res_rculock);

    cpupool_update_node_affinity(c, masks);

    return ret;
}

static int cpupool_unassign_cpu_start(struct cpupool *c, unsigned int cpu)
{
    int ret;
    struct domain *d;
    const cpumask_t *cpus;

    spin_lock(&cpupool_lock);
    ret = -EADDRNOTAVAIL;
    if ( ((cpupool_moving_cpu != -1) || !cpumask_test_cpu(cpu, c->cpu_valid))
         && (cpu != cpupool_moving_cpu) )
        goto out;

    ret = 0;
    rcu_read_lock(&sched_res_rculock);
    cpus = get_sched_res(cpu)->cpus;

    if ( (c->n_dom > 0) &&
         (cpumask_weight(c->cpu_valid) == cpumask_weight(cpus)) &&
         (cpu != cpupool_moving_cpu) )
    {
        rcu_read_lock(&domlist_read_lock);
        for_each_domain_in_cpupool(d, c)
        {
            if ( !d->is_dying && system_state == SYS_STATE_active )
            {
                ret = -EBUSY;
                break;
            }
            ret = cpupool_move_domain_locked(d, cpupool0);
            if ( ret )
                break;
        }
        rcu_read_unlock(&domlist_read_lock);
        if ( ret )
            goto out_rcu;
    }
    cpupool_moving_cpu = cpu;
    atomic_inc(&c->refcnt);
    cpupool_cpu_moving = c;
    cpumask_andnot(c->cpu_valid, c->cpu_valid, cpus);
    cpumask_and(c->res_valid, c->cpu_valid, &sched_res_mask);

 out_rcu:
    rcu_read_unlock(&sched_res_rculock);
 out:
    spin_unlock(&cpupool_lock);

    return ret;
}


static long cf_check cpupool_unassign_cpu_helper(void *info)
{
    struct cpupool *c = info;
    long ret;

    debugtrace_printk("cpupool_unassign_cpu(pool=%u,cpu=%d)\n",
                      cpupool_cpu_moving->cpupool_id, cpupool_moving_cpu);
    spin_lock(&cpupool_lock);

    ret = cpupool_unassign_cpu_finish(c, NULL);

    spin_unlock(&cpupool_lock);
    debugtrace_printk("cpupool_unassign_cpu ret=%ld\n", ret);

    return ret;
}

/*
 * unassign a specific cpu from a cpupool
 * We must be sure not to run on the cpu to be unassigned! To achieve this,
 * the main functionality is performed via continue_hypercall_on_cpu() on a
 * specific cpu.
 * If the cpu to be removed is the last one of the cpupool, no active domain
 * must be bound to the cpupool. Dying domains are moved to cpupool0 as they
 * might be zombies.
 * possible failures:
 * - last cpu and still active domains in cpupool
 * - cpu just being unplugged
 * - attempt to remove boot cpu from cpupool0
 */
static int cpupool_unassign_cpu(struct cpupool *c, unsigned int cpu)
{
    int work_cpu;
    int ret;
    unsigned int master_cpu;

    debugtrace_printk("cpupool_unassign_cpu(pool=%u,cpu=%d)\n",
                      c->cpupool_id, cpu);

    /*
     * Cpu0 must remain in cpupool0, otherwise some operations like moving cpus
     * between cpupools, cpu hotplug, destroying cpupools, shutdown of the host,
     * might not work in a sane way.
     */
    if ( (!c->cpupool_id && !cpu) || !cpu_online(cpu) )
        return -EINVAL;

    master_cpu = sched_get_resource_cpu(cpu);
    ret = cpupool_unassign_cpu_start(c, master_cpu);
    if ( ret )
    {
        debugtrace_printk("cpupool_unassign_cpu(pool=%u,cpu=%d) ret %d\n",
                          c->cpupool_id, cpu, ret);
        return ret;
    }

    work_cpu = sched_get_resource_cpu(smp_processor_id());
    if ( work_cpu == master_cpu )
    {
        work_cpu = cpumask_first(cpupool0->cpu_valid);
        if ( work_cpu == master_cpu )
            work_cpu = cpumask_last(cpupool0->cpu_valid);
    }
    return continue_hypercall_on_cpu(work_cpu, cpupool_unassign_cpu_helper, c);
}

/*
 * add a new domain to a cpupool
 * possible failures:
 * - pool does not exist
 * - no cpu assigned to pool
 */
int cpupool_add_domain(struct domain *d, unsigned int poolid)
{
    struct cpupool *c;
    int rc;
    int n_dom = 0;

    spin_lock(&cpupool_lock);
    c = cpupool_find_by_id(poolid);
    if ( c == NULL )
        rc = -ESRCH;
    else if ( !cpumask_weight(c->cpu_valid) )
        rc = -ENODEV;
    else
    {
        c->n_dom++;
        n_dom = c->n_dom;
        d->cpupool = c;
        rc = 0;
    }
    spin_unlock(&cpupool_lock);
    debugtrace_printk("cpupool_add_domain(dom=%d,pool=%u) n_dom %d rc %d\n",
                      d->domain_id, poolid, n_dom, rc);
    return rc;
}

/*
 * remove a domain from a cpupool
 */
void cpupool_rm_domain(struct domain *d)
{
    unsigned int cpupool_id;
    int n_dom;

    if ( d->cpupool == NULL )
        return;
    spin_lock(&cpupool_lock);
    cpupool_id = d->cpupool->cpupool_id;
    d->cpupool->n_dom--;
    n_dom = d->cpupool->n_dom;
    d->cpupool = NULL;
    spin_unlock(&cpupool_lock);
    debugtrace_printk("cpupool_rm_domain(dom=%d,pool=%u) n_dom %d\n",
                      d->domain_id, cpupool_id, n_dom);
    return;
}

/*
 * Called to add a cpu to a pool. CPUs being hot-plugged are added to pool0,
 * as they must have been in there when unplugged.
 */
static int cpupool_cpu_add(unsigned int cpu)
{
    int ret = 0;
    const cpumask_t *cpus;

    spin_lock(&cpupool_lock);
    cpumask_clear_cpu(cpu, &cpupool_locked_cpus);
    cpumask_set_cpu(cpu, &cpupool_free_cpus);

    /*
     * If we are not resuming, we are hot-plugging a cpu, in which case we
     * add it to pool0, as it certainly was there when it was hot-unplugged
     * (or unplugging would have failed) and that is the default behavior
     * anyway.
     */
    rcu_read_lock(&sched_res_rculock);
    get_sched_res(cpu)->cpupool = NULL;

    cpus = sched_get_opt_cpumask(cpupool0->gran, cpu);
    if ( cpumask_subset(cpus, &cpupool_free_cpus) &&
         cpumask_weight(cpus) == cpupool_get_granularity(cpupool0) )
        ret = cpupool_assign_cpu_locked(cpupool0, cpu);

    rcu_read_unlock(&sched_res_rculock);

    spin_unlock(&cpupool_lock);

    return ret;
}

/*
 * This function is called in stop_machine context, so we can be sure no
 * non-idle vcpu is active on the system.
 */
static void cpupool_cpu_remove(unsigned int cpu, struct cpu_rm_data *mem)
{
    int ret;

    ASSERT(is_idle_vcpu(current));

    if ( !cpumask_test_cpu(cpu, &cpupool_free_cpus) )
    {
        ret = cpupool_unassign_cpu_finish(cpupool0, mem);
        BUG_ON(ret);
    }
    cpumask_clear_cpu(cpu, &cpupool_free_cpus);
}

/*
 * Called before a CPU is being removed from the system.
 * Removing a CPU is allowed for free CPUs or CPUs in Pool-0 (those are moved
 * to free cpus actually before removing them).
 * The CPU is locked, to forbid adding it again to another cpupool.
 */
static int cpupool_cpu_remove_prologue(unsigned int cpu)
{
    int ret = 0;
    cpumask_t *cpus;
    unsigned int master_cpu;

    spin_lock(&cpupool_lock);

    rcu_read_lock(&sched_res_rculock);
    cpus = get_sched_res(cpu)->cpus;
    master_cpu = sched_get_resource_cpu(cpu);
    if ( cpumask_intersects(cpus, &cpupool_locked_cpus) )
        ret = -EBUSY;
    else
        cpumask_set_cpu(cpu, &cpupool_locked_cpus);
    rcu_read_unlock(&sched_res_rculock);

    spin_unlock(&cpupool_lock);

    if ( ret )
        return ret;

    if ( cpumask_test_cpu(master_cpu, cpupool0->cpu_valid) )
    {
        /* Cpupool0 is populated only after all cpus are up. */
        ASSERT(system_state == SYS_STATE_active);

        ret = cpupool_unassign_cpu_start(cpupool0, master_cpu);
    }
    else if ( !cpumask_test_cpu(master_cpu, &cpupool_free_cpus) )
        ret = -ENODEV;

    return ret;
}

/*
 * Called during resume for all cpus which didn't come up again. The cpu must
 * be removed from the cpupool it is assigned to. In case a cpupool will be
 * left without cpu we move all domains of that cpupool to cpupool0.
 * As we are called with all domains still frozen there is no need to take the
 * cpupool lock here.
 */
static void cpupool_cpu_remove_forced(unsigned int cpu)
{
    struct cpupool *c;
    int ret;
    unsigned int master_cpu = sched_get_resource_cpu(cpu);

    list_for_each_entry(c, &cpupool_list, list)
    {
        if ( cpumask_test_cpu(master_cpu, c->cpu_valid) )
        {
            ret = cpupool_unassign_cpu_start(c, master_cpu);
            BUG_ON(ret);
            ret = cpupool_unassign_cpu_finish(c, NULL);
            BUG_ON(ret);
        }
    }

    cpumask_clear_cpu(cpu, &cpupool_free_cpus);

    rcu_read_lock(&sched_res_rculock);
    sched_rm_cpu(cpu);
    rcu_read_unlock(&sched_res_rculock);
}

/*
 * do cpupool related sysctl operations
 */
int cpupool_do_sysctl(struct xen_sysctl_cpupool_op *op)
{
    int ret = 0;
    struct cpupool *c;

    switch ( op->op )
    {

    case XEN_SYSCTL_CPUPOOL_OP_CREATE:
    {
        unsigned int poolid;

        poolid = (op->cpupool_id == XEN_SYSCTL_CPUPOOL_PAR_ANY) ?
            CPUPOOLID_NONE : op->cpupool_id;
        c = cpupool_create(poolid, op->sched_id);
        if ( IS_ERR(c) )
            ret = PTR_ERR(c);
        else
        {
            op->cpupool_id = c->cpupool_id;
            cpupool_put(c);
        }
    }
    break;

    case XEN_SYSCTL_CPUPOOL_OP_DESTROY:
    {
        c = cpupool_get_by_id(op->cpupool_id);
        ret = -ENOENT;
        if ( c == NULL )
            break;
        ret = cpupool_destroy(c);
        cpupool_put(c);
    }
    break;

    case XEN_SYSCTL_CPUPOOL_OP_INFO:
    {
        c = cpupool_get_next_by_id(op->cpupool_id);
        ret = -ENOENT;
        if ( c == NULL )
            break;
        op->cpupool_id = c->cpupool_id;
        op->sched_id = c->sched->sched_id;
        op->n_dom = c->n_dom;
        ret = cpumask_to_xenctl_bitmap(&op->cpumap, c->cpu_valid);
        cpupool_put(c);
    }
    break;

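    /*
     * XEN_SYSCTL_CPUPOOL_OP_ADDCPU: with XEN_SYSCTL_CPUPOOL_PAR_ANY the first
     * free cpu whose complete granularity unit (e.g. all core siblings) is
     * free gets picked; otherwise the requested cpu is validated to be
     * online, fully free at the pool's granularity and not locked for
     * unplugging.
     */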
    case XEN_SYSCTL_CPUPOOL_OP_ADDCPU:
    {
        unsigned int cpu;
        const cpumask_t *cpus;

        cpu = op->cpu;
        debugtrace_printk("cpupool_assign_cpu(pool=%u,cpu=%u)\n",
                          op->cpupool_id, cpu);

        spin_lock(&cpupool_lock);

        c = cpupool_find_by_id(op->cpupool_id);
        ret = -ENOENT;
        if ( c == NULL )
            goto addcpu_out;
        if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
        {
            for_each_cpu ( cpu, &cpupool_free_cpus )
            {
                cpus = sched_get_opt_cpumask(c->gran, cpu);
                if ( cpumask_subset(cpus, &cpupool_free_cpus) )
                    break;
            }
            ret = -ENODEV;
            if ( cpu >= nr_cpu_ids )
                goto addcpu_out;
        }
        ret = -EINVAL;
        if ( cpu >= nr_cpu_ids )
            goto addcpu_out;
        ret = -ENODEV;
        if ( !cpu_online(cpu) )
            goto addcpu_out;
        cpus = sched_get_opt_cpumask(c->gran, cpu);
        if ( !cpumask_subset(cpus, &cpupool_free_cpus) ||
             cpumask_intersects(cpus, &cpupool_locked_cpus) )
            goto addcpu_out;
        ret = cpupool_assign_cpu_locked(c, cpu);

    addcpu_out:
        spin_unlock(&cpupool_lock);
        debugtrace_printk("cpupool_assign_cpu(pool=%u,cpu=%u) ret %d\n",
                          op->cpupool_id, cpu, ret);

    }
    break;

    case XEN_SYSCTL_CPUPOOL_OP_RMCPU:
    {
        unsigned int cpu;

        c = cpupool_get_by_id(op->cpupool_id);
        ret = -ENOENT;
        if ( c == NULL )
            break;
        cpu = op->cpu;
        if ( cpu == XEN_SYSCTL_CPUPOOL_PAR_ANY )
            cpu = cpumask_last(c->cpu_valid);
        ret = (cpu < nr_cpu_ids) ? cpupool_unassign_cpu(c, cpu) : -EINVAL;
        cpupool_put(c);
    }
    break;

    case XEN_SYSCTL_CPUPOOL_OP_MOVEDOMAIN:
    {
        struct domain *d;

        ret = rcu_lock_remote_domain_by_id(op->domid, &d);
        if ( ret )
            break;
        if ( d->cpupool == NULL )
        {
            ret = -EINVAL;
            rcu_unlock_domain(d);
            break;
        }
        if ( op->cpupool_id == d->cpupool->cpupool_id )
        {
            ret = 0;
            rcu_unlock_domain(d);
            break;
        }
        debugtrace_printk("cpupool move_domain(dom=%d)->pool=%u\n",
                          d->domain_id, op->cpupool_id);
        ret = -ENOENT;
        spin_lock(&cpupool_lock);

        c = cpupool_find_by_id(op->cpupool_id);
        if ( (c != NULL) && cpumask_weight(c->cpu_valid) )
            ret = cpupool_move_domain_locked(d, c);

        spin_unlock(&cpupool_lock);
        debugtrace_printk("cpupool move_domain(dom=%d)->pool=%u ret %d\n",
                          d->domain_id, op->cpupool_id, ret);
        rcu_unlock_domain(d);
    }
    break;

    case XEN_SYSCTL_CPUPOOL_OP_FREEINFO:
    {
        ret = cpumask_to_xenctl_bitmap(
            &op->cpumap, &cpupool_free_cpus);
    }
    break;

    default:
        ret = -ENOSYS;
        break;
    }

    return ret;
}

unsigned int cpupool_get_id(const struct domain *d)
{
    return d->cpupool ? d->cpupool->cpupool_id : CPUPOOLID_NONE;
}

const cpumask_t *cpupool_valid_cpus(const struct cpupool *pool)
{
    return pool->cpu_valid;
}

void cf_check dump_runq(unsigned char key)
{
    s_time_t         now = NOW();
    struct cpupool *c;

    spin_lock(&cpupool_lock);

    printk("sched_smt_power_savings: %s\n",
           sched_smt_power_savings ? "enabled" : "disabled");
    printk("NOW=%"PRI_stime"\n", now);

    printk("Online Cpus: %*pbl\n", CPUMASK_PR(&cpu_online_map));
    if ( !cpumask_empty(&cpupool_free_cpus) )
    {
        printk("Free Cpus: %*pbl\n", CPUMASK_PR(&cpupool_free_cpus));
        schedule_dump(NULL);
    }

    list_for_each_entry(c, &cpupool_list, list)
    {
        printk("Cpupool %u:\n", c->cpupool_id);
        printk("Cpus: %*pbl\n", CPUMASK_PR(c->cpu_valid));
        sched_gran_print(c->gran, cpupool_get_granularity(c));
        schedule_dump(c);
    }

    spin_unlock(&cpupool_lock);
}

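/*
 * CPU hotplug notifier: the static cpu_rm_data pointer carries the memory
 * preallocated in CPU_DOWN_PREPARE over to CPU_DYING (where the cpu is taken
 * out of its cpupool in stop_machine context) and is released again in
 * CPU_DEAD or CPU_DOWN_FAILED.
 */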
static int cf_check cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    static struct cpu_rm_data *mem;

    unsigned int cpu = (unsigned long)hcpu;
    int rc = 0;

    switch ( action )
    {
    case CPU_DOWN_FAILED:
        if ( system_state <= SYS_STATE_active )
        {
            if ( mem )
            {
                free_cpu_rm_data(mem, cpu);
                mem = NULL;
            }
            rc = cpupool_cpu_add(cpu);
        }
        break;
    case CPU_ONLINE:
        if ( system_state <= SYS_STATE_active )
            rc = cpupool_cpu_add(cpu);
        else
            sched_migrate_timers(cpu);
        break;
    case CPU_DOWN_PREPARE:
        /* Suspend/Resume don't change assignments of cpus to cpupools. */
        if ( system_state <= SYS_STATE_active )
        {
            rc = cpupool_cpu_remove_prologue(cpu);
            if ( !rc )
            {
                ASSERT(!mem);
                mem = alloc_cpu_rm_data(cpu, true);
                rc = mem ? 0 : -ENOMEM;
            }
        }
        break;
    case CPU_DYING:
        /* Suspend/Resume don't change assignments of cpus to cpupools. */
        if ( system_state <= SYS_STATE_active )
        {
            ASSERT(mem);
            cpupool_cpu_remove(cpu, mem);
        }
        break;
    case CPU_DEAD:
        if ( system_state <= SYS_STATE_active )
        {
            ASSERT(mem);
            free_cpu_rm_data(mem, cpu);
            mem = NULL;
        }
        break;
    case CPU_RESUME_FAILED:
        cpupool_cpu_remove_forced(cpu);
        break;
    default:
        break;
    }

    return notifier_from_errno(rc);
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

#ifdef CONFIG_HYPFS

static HYPFS_DIR_INIT(cpupool_pooldir, "%u");

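/*
 * Hypfs interface: each existing cpupool is presented as a dynamically
 * generated directory /cpupool/<id>, currently containing the "sched-gran"
 * leaf (e.g. readable/writable via the xenhypfs tool in dom0).
 */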
static int cf_check cpupool_dir_read(
    const struct hypfs_entry *entry, XEN_GUEST_HANDLE_PARAM(void) uaddr)
{
    int ret = 0;
    struct cpupool *c;
    struct hypfs_dyndir_id *data;

    data = hypfs_get_dyndata();

    list_for_each_entry(c, &cpupool_list, list)
    {
        data->id = c->cpupool_id;
        data->data = c;

        ret = hypfs_read_dyndir_id_entry(&cpupool_pooldir, c->cpupool_id,
                                         list_is_last(&c->list, &cpupool_list),
                                         &uaddr);
        if ( ret )
            break;
    }

    return ret;
}

static unsigned int cf_check cpupool_dir_getsize(
    const struct hypfs_entry *entry)
{
    const struct cpupool *c;
    unsigned int size = 0;

    list_for_each_entry(c, &cpupool_list, list)
        size += hypfs_dynid_entry_size(entry, c->cpupool_id);

    return size;
}

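/*
 * enter/exit hooks for the cpupool directory: the cpupool_lock is taken for
 * the whole duration of a hypfs operation on the directory (hence
 * cpupool_lock nesting inside the hypfs lock), and per-request dyndata is
 * used to hand the selected pool to the leaf handlers.
 */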
static const struct hypfs_entry *cf_check cpupool_dir_enter(
    const struct hypfs_entry *entry)
{
    struct hypfs_dyndir_id *data;

    data = hypfs_alloc_dyndata(struct hypfs_dyndir_id);
    if ( !data )
        return ERR_PTR(-ENOMEM);
    data->id = CPUPOOLID_NONE;

    spin_lock(&cpupool_lock);

    return entry;
}

static void cf_check cpupool_dir_exit(const struct hypfs_entry *entry)
{
    spin_unlock(&cpupool_lock);

    hypfs_free_dyndata();
}

static struct hypfs_entry *cf_check cpupool_dir_findentry(
    const struct hypfs_entry_dir *dir, const char *name, unsigned int name_len)
{
    unsigned long id;
    const char *end;
    struct cpupool *cpupool;

    id = simple_strtoul(name, &end, 10);
    if ( end != name + name_len || id > UINT_MAX )
        return ERR_PTR(-ENOENT);

    cpupool = __cpupool_find_by_id(id, true);

    if ( !cpupool )
        return ERR_PTR(-ENOENT);

    return hypfs_gen_dyndir_id_entry(&cpupool_pooldir, id, cpupool);
}

static int cf_check cpupool_gran_read(
    const struct hypfs_entry *entry, XEN_GUEST_HANDLE_PARAM(void) uaddr)
{
    const struct hypfs_dyndir_id *data;
    const struct cpupool *cpupool;
    const char *gran;

    data = hypfs_get_dyndata();
    cpupool = data->data;
    ASSERT(cpupool);

    gran = sched_gran_get_name(cpupool->gran);

    if ( !*gran )
        return -ENOENT;

    return copy_to_guest(uaddr, gran, strlen(gran) + 1) ? -EFAULT : 0;
}

static unsigned int cf_check hypfs_gran_getsize(const struct hypfs_entry *entry)
{
    const struct hypfs_dyndir_id *data;
    const struct cpupool *cpupool;
    const char *gran;

    data = hypfs_get_dyndata();
    cpupool = data->data;
    ASSERT(cpupool);

    gran = sched_gran_get_name(cpupool->gran);

    return strlen(gran) + 1;
}

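/*
 * Write handler for /cpupool/<id>/sched-gran: accepts a NUL-terminated
 * granularity name ("cpu", "core" or "socket"), which must both be known and
 * match the current cpu topology, and only while the pool has no cpus
 * assigned.
 */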
static int cf_check cpupool_gran_write(
    struct hypfs_entry_leaf *leaf, XEN_GUEST_HANDLE_PARAM(const_void) uaddr,
    unsigned int ulen)
{
    const struct hypfs_dyndir_id *data;
    struct cpupool *cpupool;
    enum sched_gran gran;
    unsigned int sched_gran = 0;
    char name[SCHED_GRAN_NAME_LEN];
    int ret = 0;

    if ( ulen > SCHED_GRAN_NAME_LEN )
        return -ENOSPC;

    if ( copy_from_guest(name, uaddr, ulen) )
        return -EFAULT;

    if ( memchr(name, 0, ulen) == (name + ulen - 1) )
        sched_gran = sched_gran_get(name, &gran) ?
                     0 : cpupool_check_granularity(gran);
    if ( sched_gran == 0 )
        return -EINVAL;

    data = hypfs_get_dyndata();
    cpupool = data->data;
    ASSERT(cpupool);

    /* Guarded by the cpupool_lock taken in cpupool_dir_enter(). */
    if ( !cpumask_empty(cpupool->cpu_valid) )
        ret = -EBUSY;
    else
    {
        cpupool->gran = gran;
        cpupool->sched_gran = sched_gran;
    }

    return ret;
}

static const struct hypfs_funcs cpupool_gran_funcs = {
    .enter = hypfs_node_enter,
    .exit = hypfs_node_exit,
    .read = cpupool_gran_read,
    .write = cpupool_gran_write,
    .getsize = hypfs_gran_getsize,
    .findentry = hypfs_leaf_findentry,
};

static HYPFS_VARSIZE_INIT(cpupool_gran, XEN_HYPFS_TYPE_STRING, "sched-gran",
                          SCHED_GRAN_NAME_LEN, &cpupool_gran_funcs);
static char granstr[SCHED_GRAN_NAME_LEN] = {
    [0 ... SCHED_GRAN_NAME_LEN - 2] = '?',
    [SCHED_GRAN_NAME_LEN - 1] = 0
};

static const struct hypfs_funcs cpupool_dir_funcs = {
    .enter = cpupool_dir_enter,
    .exit = cpupool_dir_exit,
    .read = cpupool_dir_read,
    .write = hypfs_write_deny,
    .getsize = cpupool_dir_getsize,
    .findentry = cpupool_dir_findentry,
};

static HYPFS_DIR_INIT_FUNC(cpupool_dir, "cpupool", &cpupool_dir_funcs);

static void cpupool_hypfs_init(void)
{
    hypfs_add_dir(&hypfs_root, &cpupool_dir, true);
    hypfs_add_dyndir(&cpupool_dir, &cpupool_pooldir);
    hypfs_string_set_reference(&cpupool_gran, granstr);
    hypfs_add_leaf(&cpupool_pooldir, &cpupool_gran, true);
}

#else /* CONFIG_HYPFS */

static void cpupool_hypfs_init(void)
{
}

#endif /* CONFIG_HYPFS */

struct cpupool *__init cpupool_create_pool(unsigned int pool_id, int sched_id)
{
    struct cpupool *pool;

    if ( sched_id < 0 )
        sched_id = scheduler_get_default()->sched_id;

    pool = cpupool_create(pool_id, sched_id);

    BUG_ON(IS_ERR(pool));
    cpupool_put(pool);

    return pool;
}

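/*
 * Boot time initialization: set up the scheduling granularity and hypfs
 * nodes, register the cpu notifier, let boot time cpupools (if configured,
 * e.g. via device tree) be created, and finally assign every online cpu to
 * its designated pool.
 */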
static int __init cf_check cpupool_init(void)
{
    unsigned int cpu;

    cpupool_gran_init();

    cpupool_hypfs_init();

    register_cpu_notifier(&cpu_nfb);

    btcpupools_dtb_parse();

    btcpupools_allocate_pools();

    spin_lock(&cpupool_lock);

    cpumask_copy(&cpupool_free_cpus, &cpu_online_map);

    for_each_cpu ( cpu, &cpupool_free_cpus )
    {
        unsigned int pool_id = btcpupools_get_cpupool_id(cpu);
        struct cpupool *pool = cpupool_find_by_id(pool_id);

        ASSERT(pool);
        cpupool_assign_cpu_locked(pool, cpu);
    }

    spin_unlock(&cpupool_lock);

    return 0;
}
__initcall(cpupool_init);

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */