/*
 * xen/common/sched_null.c
 *
 *  Copyright (c) 2017, Dario Faggioli, Citrix Ltd
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; If not, see <http://www.gnu.org/licenses/>.
 */

/*
 * The 'null' scheduler always chooses to run, on each pCPU, either nothing
 * (i.e., the pCPU stays idle) or always the same unit.
 *
 * It is aimed at supporting static scenarios, where there are always
 * fewer units than pCPUs (and the units don't need to move among pCPUs
 * for any reason), with the least possible overhead.
 *
 * Typical use cases are embedded applications, but also HPC, especially
 * if the scheduler is used inside a cpupool.
 */
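
/*
 * For instance (one possible setup, not the only one), the whole system can
 * use this scheduler by booting Xen with "sched=null", or just a subset of
 * the pCPUs can, by putting them in a dedicated cpupool, e.g.:
 *
 *   xl cpupool-create name=\"pool-null\" sched=\"null\" cpus=[\"4-7\"]
 *   xl cpupool-migrate mydomain pool-null
 *
 * (The cpupool commands above, and the pool and domain names in them, are
 * only an illustrative sketch; check the xl / xlcpupool.cfg documentation
 * for the exact syntax.)
 */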

#include <xen/sched.h>
#include <xen/softirq.h>
#include <xen/trace.h>

#include "private.h"

/*
 * null tracing events. Check include/public/trace.h for more details.
 */
#define TRC_SNULL_PICKED_CPU    TRC_SCHED_CLASS_EVT(SNULL, 1)
#define TRC_SNULL_UNIT_ASSIGN   TRC_SCHED_CLASS_EVT(SNULL, 2)
#define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3)
#define TRC_SNULL_MIGRATE       TRC_SCHED_CLASS_EVT(SNULL, 4)
#define TRC_SNULL_SCHEDULE      TRC_SCHED_CLASS_EVT(SNULL, 5)
#define TRC_SNULL_TASKLET       TRC_SCHED_CLASS_EVT(SNULL, 6)

/*
 * Locking:
 * - Scheduler-lock (a.k.a. runqueue lock):
 *  + is per-pCPU;
 *  + serializes assignment and deassignment of units to a pCPU.
 * - Private data lock (a.k.a. private scheduler lock):
 *  + is scheduler-wide;
 *  + serializes accesses to the list of domains in this scheduler.
 * - Waitqueue lock:
 *  + is scheduler-wide;
 *  + serializes accesses to the list of units waiting to be assigned
 *    to pCPUs.
 *
 * Ordering is: private lock, runqueue lock, waitqueue lock. Put differently,
 * the waitqueue lock nests inside the runqueue lock, which nests inside the
 * private lock. More specifically:
 *  + if we need both the runqueue and the private lock, we must acquire the
 *    private lock first;
 *  + if we need both the runqueue and the waitqueue lock, we must acquire
 *    the runqueue lock first;
 *  + if we need both the private and the waitqueue lock, we must acquire
 *    the private lock first;
 *  + if we already own a runqueue lock, we must never acquire
 *    the private lock;
 *  + if we already own the waitqueue lock, we must never acquire
 *    the runqueue lock or the private lock.
 */
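
/*
 * As an illustrative sketch of the ordering above (not a real code path,
 * and assuming prv is this scheduler's null_private), a caller needing
 * both the private and the waitqueue lock would do:
 *
 *     spin_lock(&prv->lock);
 *     spin_lock(&prv->waitq_lock);
 *     ... access prv->ndom and/or prv->waitq ...
 *     spin_unlock(&prv->waitq_lock);
 *     spin_unlock(&prv->lock);
 *
 * This is, for instance, the pattern followed by null_dump() below.
 */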

/*
 * System-wide private data
 */
struct null_private {
    spinlock_t lock;        /* scheduler lock; nests inside cpupool_lock */
    struct list_head ndom;  /* Domains of this scheduler                 */
    struct list_head waitq; /* units not assigned to any pCPU            */
    spinlock_t waitq_lock;  /* serializes waitq; nests inside runq locks */
    cpumask_t cpus_free;    /* CPUs without a unit associated to them    */
};

/*
 * Physical CPU
 */
struct null_pcpu {
    struct sched_unit *unit;
};

/*
 * Schedule unit
 */
struct null_unit {
    struct list_head waitq_elem;
    struct sched_unit *unit;
};

/*
 * Domain
 */
struct null_dom {
    struct list_head ndom_elem;
    struct domain *dom;
};

/*
 * Accessor helper functions
 */
static inline struct null_private *null_priv(const struct scheduler *ops)
{
    return ops->sched_data;
}

static inline struct null_unit *null_unit(const struct sched_unit *unit)
{
    return unit->priv;
}

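/*
 * Check whether unit is allowed, at the given affinity balancing step
 * (i.e., considering either its hard or its soft affinity), to run on cpu,
 * also accounting for the online pCPUs of its domain's cpupool. Note that
 * this clobbers cpumask_scratch_cpu(cpu).
 */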
static inline bool unit_check_affinity(struct sched_unit *unit,
                                       unsigned int cpu,
                                       unsigned int balance_step)
{
    affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu));
    cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                cpupool_domain_master_cpumask(unit->domain));

    return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu));
}

static int cf_check null_init(struct scheduler *ops)
{
    struct null_private *prv;

    printk("Initializing null scheduler\n"
           "WARNING: This is experimental software in development.\n"
           "Use at your own risk.\n");

    prv = xzalloc(struct null_private);
    if ( prv == NULL )
        return -ENOMEM;

    spin_lock_init(&prv->lock);
    spin_lock_init(&prv->waitq_lock);
    INIT_LIST_HEAD(&prv->ndom);
    INIT_LIST_HEAD(&prv->waitq);

    ops->sched_data = prv;

    return 0;
}

static void cf_check null_deinit(struct scheduler *ops)
{
    xfree(ops->sched_data);
    ops->sched_data = NULL;
}

static void init_pdata(struct null_private *prv, struct null_pcpu *npc,
                       unsigned int cpu)
{
    /* Mark the pCPU as free, and with no unit assigned */
    cpumask_set_cpu(cpu, &prv->cpus_free);
    npc->unit = NULL;
}

static void cf_check null_deinit_pdata(
    const struct scheduler *ops, void *pcpu, int cpu)
{
    struct null_private *prv = null_priv(ops);
    struct null_pcpu *npc = pcpu;

    ASSERT(npc);

    cpumask_clear_cpu(cpu, &prv->cpus_free);
    npc->unit = NULL;
}

static void *cf_check null_alloc_pdata(const struct scheduler *ops, int cpu)
{
    struct null_pcpu *npc;

    npc = xzalloc(struct null_pcpu);
    if ( npc == NULL )
        return ERR_PTR(-ENOMEM);

    return npc;
}

static void cf_check null_free_pdata(
    const struct scheduler *ops, void *pcpu, int cpu)
{
    xfree(pcpu);
}

static void *cf_check null_alloc_udata(
    const struct scheduler *ops, struct sched_unit *unit, void *dd)
{
    struct null_unit *nvc;

    nvc = xzalloc(struct null_unit);
    if ( nvc == NULL )
        return NULL;

    INIT_LIST_HEAD(&nvc->waitq_elem);
    nvc->unit = unit;

    SCHED_STAT_CRANK(unit_alloc);

    return nvc;
}

static void cf_check null_free_udata(const struct scheduler *ops, void *priv)
{
    struct null_unit *nvc = priv;

    xfree(nvc);
}

static void *cf_check null_alloc_domdata(
    const struct scheduler *ops, struct domain *d)
{
    struct null_private *prv = null_priv(ops);
    struct null_dom *ndom;
    unsigned long flags;

    ndom = xzalloc(struct null_dom);
    if ( ndom == NULL )
        return ERR_PTR(-ENOMEM);

    ndom->dom = d;

    spin_lock_irqsave(&prv->lock, flags);
    list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom);
    spin_unlock_irqrestore(&prv->lock, flags);

    return ndom;
}

static void cf_check null_free_domdata(const struct scheduler *ops, void *data)
{
    struct null_dom *ndom = data;
    struct null_private *prv = null_priv(ops);

    if ( ndom )
    {
        unsigned long flags;

        spin_lock_irqsave(&prv->lock, flags);
        list_del_init(&ndom->ndom_elem);
        spin_unlock_irqrestore(&prv->lock, flags);

        xfree(ndom);
    }
}

/*
 * unit to pCPU assignment and placement. This _only_ happens:
 *  - on insert,
 *  - on migrate.
 *
 * Insert occurs when a unit joins this scheduler for the first time
 * (e.g., when the domain it's part of is moved to the scheduler's
 * cpupool).
 *
 * Migration may be necessary if a pCPU (with a unit assigned to it)
 * is removed from the scheduler's cpupool.
 *
 * So this is not part of any hot path.
 */
static struct sched_resource *
pick_res(const struct null_private *prv, const struct sched_unit *unit)
{
    unsigned int bs;
    unsigned int cpu = sched_unit_master(unit), new_cpu;
    const cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain);
    const struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;

    ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));

    for_each_affinity_balance_step( bs )
    {
        if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
            continue;

        affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus);

        /*
         * If our processor is free, or we are assigned to it, and it is also
         * still valid and part of our affinity, just go for it.
         * (Note that we may call unit_check_affinity(), but we deliberately
         * don't, so we get to keep in the scratch cpumask what we have just
         * put in it.)
         */
        if ( likely((npc->unit == NULL || npc->unit == unit)
                    && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
        {
            new_cpu = cpu;
            goto out;
        }

        /* If not, just go for a free pCPU, within our affinity, if any */
        cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                    &prv->cpus_free);
        new_cpu = cpumask_first(cpumask_scratch_cpu(cpu));

        if ( likely(new_cpu != nr_cpu_ids) )
            goto out;
    }

    /*
     * If we didn't find any free pCPU, just pick any valid pcpu, even if
     * it has another unit assigned. This will happen during shutdown and
     * suspend/resume, but it may also happen during "normal operation", if
     * all the pCPUs are busy.
     *
     * In fact, there must always be something sane in v->processor, or
     * unit_schedule_lock() and friends won't work. This is not a problem,
     * as we will actually assign the unit to the pCPU we return from here,
     * only if the pCPU is free.
     */
    cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity);
    new_cpu = cpumask_any(cpumask_scratch_cpu(cpu));

 out:
    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t unit, dom;
            uint32_t new_cpu;
        } d = {
            .unit    = unit->unit_id,
            .dom     = unit->domain->domain_id,
            .new_cpu = new_cpu,
        };

        trace_time(TRC_SNULL_PICKED_CPU, sizeof(d), &d);
    }

    return get_sched_res(new_cpu);
}

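/*
 * Record the assignment of unit to cpu: point the pCPU's private data at
 * the unit, make cpu's sched_resource the unit's resource, and mark the
 * pCPU as no longer free.
 */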
static void unit_assign(struct null_private *prv, struct sched_unit *unit,
                        unsigned int cpu)
{
    struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;

    ASSERT(is_unit_online(unit));

    npc->unit = unit;
    sched_set_res(unit, get_sched_res(cpu));
    cpumask_clear_cpu(cpu, &prv->cpus_free);

    dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t unit, dom;
            uint32_t cpu;
        } d = {
            .unit = unit->unit_id,
            .dom  = unit->domain->domain_id,
            .cpu  = cpu,
        };

        trace_time(TRC_SNULL_UNIT_ASSIGN, sizeof(d), &d);
    }
}

/* Returns true if a cpu was tickled */
static bool unit_deassign(struct null_private *prv, const struct sched_unit *unit)
{
    unsigned int bs;
    unsigned int cpu = sched_unit_master(unit);
    struct null_unit *wvc;
    struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;

    ASSERT(list_empty(&null_unit(unit)->waitq_elem));
    ASSERT(npc->unit == unit);
    ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free));

    npc->unit = NULL;
    cpumask_set_cpu(cpu, &prv->cpus_free);

    dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain,
            unit->unit_id);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t unit, dom;
            uint32_t cpu;
        } d = {
            .unit = unit->unit_id,
            .dom  = unit->domain->domain_id,
            .cpu  = cpu,
        };

        trace_time(TRC_SNULL_UNIT_DEASSIGN, sizeof(d), &d);
    }

    spin_lock(&prv->waitq_lock);

    /*
     * Now that the pCPU is free, let's see if there is someone waiting that
     * is suitable to be assigned to it (prioritizing units that have
     * soft-affinity with cpu).
     */
    for_each_affinity_balance_step( bs )
    {
        list_for_each_entry( wvc, &prv->waitq, waitq_elem )
        {
            if ( bs == BALANCE_SOFT_AFFINITY &&
                 !has_soft_affinity(wvc->unit) )
                continue;

            if ( unit_check_affinity(wvc->unit, cpu, bs) )
            {
                list_del_init(&wvc->waitq_elem);
                unit_assign(prv, wvc->unit, cpu);
                cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
                spin_unlock(&prv->waitq_lock);
                return true;
            }
        }
    }
    spin_unlock(&prv->waitq_lock);

    return false;
}

/* Change the scheduler of cpu to us (null). */
static spinlock_t *cf_check null_switch_sched(
    struct scheduler *new_ops, unsigned int cpu, void *pdata, void *vdata)
{
    struct sched_resource *sr = get_sched_res(cpu);
    struct null_private *prv = null_priv(new_ops);
    const struct null_unit *nvc = vdata;

    ASSERT(nvc && is_idle_unit(nvc->unit));

    sched_idle_unit(cpu)->priv = vdata;

    /*
     * We are holding the runqueue lock already (it's been taken in
     * schedule_cpu_switch()). It actually may or may not be the 'right'
     * one for this cpu, but that is ok for preventing races.
     */
    ASSERT(!local_irq_is_enabled());

    init_pdata(prv, pdata, cpu);

    return &sr->_lock;
}

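/*
 * Pick a pCPU for a unit that is joining the scheduler and, if that pCPU is
 * free, assign the unit to it. If we raced with someone else and the pCPU
 * turns out to be busy, retry while there still are free pCPUs; failing
 * that, park the unit in the waitqueue.
 */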
static void cf_check null_unit_insert(
    const struct scheduler *ops, struct sched_unit *unit)
{
    struct null_private *prv = null_priv(ops);
    struct null_unit *nvc = null_unit(unit);
    struct null_pcpu *npc;
    unsigned int cpu;
    spinlock_t *lock;

    ASSERT(!is_idle_unit(unit));

    lock = unit_schedule_lock_irq(unit);

    if ( unlikely(!is_unit_online(unit)) )
    {
        unit_schedule_unlock_irq(lock, unit);
        return;
    }

 retry:
    sched_set_res(unit, pick_res(prv, unit));
    cpu = sched_unit_master(unit);
    npc = get_sched_res(cpu)->sched_priv;

    spin_unlock(lock);

    lock = unit_schedule_lock(unit);

    cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
                cpupool_domain_master_cpumask(unit->domain));

    /* If the pCPU is free, we assign unit to it */
    if ( likely(npc->unit == NULL) )
    {
        /*
         * Insert is followed by vcpu_wake(), so there's no need to poke
         * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that.
         */
        unit_assign(prv, unit, cpu);
    }
    else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
    {
        /*
         * If the pCPU is not free (e.g., because we raced with another
         * insert or a migrate), but there are other free pCPUs, we can
         * try to pick again.
         */
        goto retry;
    }
    else
    {
        /*
         * If the pCPU is not free, and there aren't any (valid) others,
         * we have no alternative but to go into the waitqueue.
         */
        spin_lock(&prv->waitq_lock);
        list_add_tail(&nvc->waitq_elem, &prv->waitq);
        dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n",
                unit->domain, unit->unit_id);
        spin_unlock(&prv->waitq_lock);
    }
    spin_unlock_irq(lock);

    SCHED_STAT_CRANK(unit_insert);
}

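/*
 * Remove a unit from the scheduler: take it out of the waitqueue if it is
 * parked there, or free up its pCPU (possibly handing that pCPU over to a
 * waiting unit).
 */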
static void cf_check null_unit_remove(
    const struct scheduler *ops, struct sched_unit *unit)
{
    struct null_private *prv = null_priv(ops);
    struct null_unit *nvc = null_unit(unit);
    struct null_pcpu *npc;
    unsigned int cpu;
    spinlock_t *lock;

    ASSERT(!is_idle_unit(unit));

    lock = unit_schedule_lock_irq(unit);

    /* If offline, the unit shouldn't be assigned, nor in the waitqueue */
    if ( unlikely(!is_unit_online(unit)) )
    {
        npc = unit->res->sched_priv;
        ASSERT(npc->unit != unit);
        ASSERT(list_empty(&nvc->waitq_elem));
        goto out;
    }

    /* If unit is in waitqueue, just get it out of there and bail */
    if ( unlikely(!list_empty(&nvc->waitq_elem)) )
    {
        spin_lock(&prv->waitq_lock);
        list_del_init(&nvc->waitq_elem);
        spin_unlock(&prv->waitq_lock);

        goto out;
    }

    cpu = sched_unit_master(unit);
    npc = get_sched_res(cpu)->sched_priv;
    if ( npc->unit == unit )
        unit_deassign(prv, unit);

 out:
    unit_schedule_unlock_irq(lock, unit);

    SCHED_STAT_CRANK(unit_remove);
}

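/*
 * A waking unit that is still assigned to its pCPU only needs that pCPU to
 * be tickled. If it is assigned nowhere (e.g., it is coming back online),
 * grab its current pCPU if that is free and affinities allow it; otherwise,
 * park the unit in the waitqueue and tickle the free pCPUs it can run on.
 */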
static void cf_check null_unit_wake(
    const struct scheduler *ops, struct sched_unit *unit)
{
    struct null_private *prv = null_priv(ops);
    struct null_unit *nvc = null_unit(unit);
    unsigned int cpu = sched_unit_master(unit);
    struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;

    ASSERT(!is_idle_unit(unit));

    if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
    {
        SCHED_STAT_CRANK(unit_wake_running);
        return;
    }

    if ( unlikely(!list_empty(&nvc->waitq_elem)) )
    {
        /* Not exactly "on runq", but close enough for reusing the counter */
        SCHED_STAT_CRANK(unit_wake_onrunq);
        return;
    }

    if ( likely(unit_runnable(unit)) )
        SCHED_STAT_CRANK(unit_wake_runnable);
    else
        SCHED_STAT_CRANK(unit_wake_not_runnable);

    if ( likely(npc->unit == unit) )
    {
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
        return;
    }

    /*
     * If a unit is neither on a pCPU nor in the waitqueue, it means it was
     * offline and is now coming back online. If we're lucky, and its
     * previous resource is free (and affinities match), we can just
     * assign the unit to it (we own the proper lock already) and be done.
     */
    if ( npc->unit == NULL &&
         unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) )
    {
        if ( !has_soft_affinity(unit) ||
             unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) )
        {
            unit_assign(prv, unit, cpu);
            cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
            return;
        }
    }

    /*
     * If the resource is not free (or affinities do not match) we need
     * to assign unit to some other one, but we can't do it here, as:
     * - we don't own the proper lock,
     * - we can't change v->processor under vcpu_wake()'s feet.
     * So we add it to the waitqueue, and tickle all the free CPUs (if any)
     * on which unit can run. The first one that schedules will pick it up.
     */
    spin_lock(&prv->waitq_lock);
    list_add_tail(&nvc->waitq_elem, &prv->waitq);
    spin_unlock(&prv->waitq_lock);

    cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
                cpupool_domain_master_cpumask(unit->domain));
    cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
                &prv->cpus_free);

    if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
        dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
                unit->domain->domain_id, unit->unit_id);
    else
        cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ);
}

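/*
 * A unit going to sleep normally just needs its pCPU to reschedule (and
 * pick up the idle unit). If the unit is in the process of being offlined,
 * also drop it from the waitqueue, or free up its pCPU.
 */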
static void cf_check null_unit_sleep(
    const struct scheduler *ops, struct sched_unit *unit)
{
    struct null_private *prv = null_priv(ops);
    unsigned int cpu = sched_unit_master(unit);
    struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
    bool tickled = false;

    ASSERT(!is_idle_unit(unit));

    /*
     * Check if the unit is in the process of being offlined. If yes,
     * we need to remove it from either its pCPU or the waitqueue.
     */
    if ( unlikely(!is_unit_online(unit)) )
    {
        struct null_unit *nvc = null_unit(unit);

        if ( unlikely(!list_empty(&nvc->waitq_elem)) )
        {
            spin_lock(&prv->waitq_lock);
            list_del_init(&nvc->waitq_elem);
            spin_unlock(&prv->waitq_lock);
        }
        else if ( npc->unit == unit )
            tickled = unit_deassign(prv, unit);
    }

    /* If unit is not assigned to a pCPU, or is not running, no need to bother */
    if ( likely(!tickled && curr_on_cpu(cpu) == unit) )
        cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);

    SCHED_STAT_CRANK(unit_sleep);
}

static struct sched_resource *cf_check
null_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
{
    ASSERT(!is_idle_unit(unit));
    return pick_res(null_priv(ops), unit);
}

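/*
 * Move unit towards new_cpu: free up the pCPU it was assigned to (possibly
 * handing it over to a waiting unit), then either take new_cpu, if it is
 * free and affinities allow it, or park the unit in the waitqueue.
 */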
static void cf_check null_unit_migrate(
    const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu)
{
    struct null_private *prv = null_priv(ops);
    struct null_unit *nvc = null_unit(unit);
    struct null_pcpu *npc;

    ASSERT(!is_idle_unit(unit));

    if ( sched_unit_master(unit) == new_cpu )
        return;

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t unit, dom;
            uint16_t cpu, new_cpu;
        } d = {
            .unit    = unit->unit_id,
            .dom     = unit->domain->domain_id,
            .cpu     = sched_unit_master(unit),
            .new_cpu = new_cpu,
        };

        trace_time(TRC_SNULL_MIGRATE, sizeof(d), &d);
    }

    /*
     * If unit is assigned to a pCPU, then such pCPU becomes free, and we
     * should look in the waitqueue if anyone else can be assigned to it.
     */
    npc = unit->res->sched_priv;
    if ( likely(npc->unit == unit) )
    {
        unit_deassign(prv, unit);
        SCHED_STAT_CRANK(migrate_running);
    }
    else if ( !list_empty(&nvc->waitq_elem) )
        SCHED_STAT_CRANK(migrate_on_runq);

    SCHED_STAT_CRANK(migrated);

    /*
     * If a unit is (going) offline, we want it to be neither assigned
     * to a pCPU, nor in the waitqueue.
     *
     * If it was on a cpu, we've removed it from there above. If it is
     * in the waitqueue, we remove it from there now. And then we bail.
     */
    if ( unlikely(!is_unit_online(unit)) )
    {
        spin_lock(&prv->waitq_lock);
        list_del_init(&nvc->waitq_elem);
        spin_unlock(&prv->waitq_lock);
        goto out;
    }

    /*
     * Let's now consider new_cpu, which is where unit is being sent. It can
     * be either free, or have a unit already assigned to it.
     *
     * In the former case we should assign unit to it, and try to get it to
     * run, if possible, according to affinity.
     *
     * In the latter case, all we can do is park unit in the waitqueue.
     */
    npc = get_sched_res(new_cpu)->sched_priv;
    if ( npc->unit == NULL &&
         unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) )
    {
        /* unit might have been in the waitqueue, so remove it */
        spin_lock(&prv->waitq_lock);
        list_del_init(&nvc->waitq_elem);
        spin_unlock(&prv->waitq_lock);

        unit_assign(prv, unit, new_cpu);
    }
    else
    {
        /* Put unit in the waitqueue, if it wasn't there already */
        spin_lock(&prv->waitq_lock);
        if ( list_empty(&nvc->waitq_elem) )
        {
            list_add_tail(&nvc->waitq_elem, &prv->waitq);
            dprintk(XENLOG_G_WARNING,
                    "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain,
                    unit->unit_id);
        }
        spin_unlock(&prv->waitq_lock);
    }

    /*
     * Whatever the outcome of all the above, we always at least override the
     * unit's resource. This is especially important for shutdown or
     * suspend/resume paths, when it is important to let our caller
     * (cpu_disable_scheduler()) know that the migration did happen, at least
     * to the best of our ability. In case of suspend, any temporary
     * inconsistency caused by this will be fixed up during resume.
     */
 out:
    sched_set_res(unit, get_sched_res(new_cpu));
}

#ifndef NDEBUG
static inline void null_unit_check(struct sched_unit *unit)
{
    struct null_unit * const nvc = null_unit(unit);
    struct null_dom * const ndom = unit->domain->sched_priv;

    BUG_ON(nvc->unit != unit);

    if ( ndom )
        BUG_ON(is_idle_unit(unit));
    else
        BUG_ON(!is_idle_unit(unit));

    SCHED_STAT_CRANK(unit_check);
}
#define NULL_UNIT_CHECK(unit)  (null_unit_check(unit))
#else
#define NULL_UNIT_CHECK(unit)
#endif


/*
 * The simplest scheduling function of all time! We either return:
 *  - the unit assigned to the pCPU, if there's one and it can run;
 *  - the idle unit, otherwise.
 */
static void cf_check null_schedule(
    const struct scheduler *ops, struct sched_unit *prev, s_time_t now,
    bool tasklet_work_scheduled)
{
    unsigned int bs;
    const unsigned int cur_cpu = smp_processor_id();
    const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
    struct null_pcpu *npc = get_sched_res(sched_cpu)->sched_priv;
    struct null_private *prv = null_priv(ops);
    struct null_unit *wvc;

    SCHED_STAT_CRANK(schedule);
    NULL_UNIT_CHECK(current->sched_unit);

    if ( unlikely(tb_init_done) )
    {
        struct {
            uint16_t tasklet, cpu;
            int16_t unit, dom;
        } d = {
            .tasklet = tasklet_work_scheduled,
            .cpu     = cur_cpu,
        };

        if ( npc->unit == NULL )
        {
            d.unit = d.dom = -1;
        }
        else
        {
            d.unit = npc->unit->unit_id;
            d.dom = npc->unit->domain->domain_id;
        }

        trace_time(TRC_SNULL_SCHEDULE, sizeof(d), &d);
    }

    if ( tasklet_work_scheduled )
    {
        TRACE_TIME(TRC_SNULL_TASKLET);
        prev->next_task = sched_idle_unit(sched_cpu);
    }
    else
        prev->next_task = npc->unit;
    prev->next_time = -1;

    /*
     * We may be new in the cpupool, or just coming back online, in which
     * case there may be units in the waitqueue that we can assign to this
     * pCPU and run.
     */
    if ( unlikely(prev->next_task == NULL) )
    {
        bool unit_found;

        spin_lock(&prv->waitq_lock);

        if ( list_empty(&prv->waitq) )
            goto unlock;

        /*
         * We scan the waitqueue twice, for prioritizing units that have
         * soft-affinity with cpu. This may look like something expensive to
         * do here in null_schedule(), but it's actually fine, because we do
         * it only in cases where a pcpu has no unit associated (e.g., as
         * said above, the cpu has just joined a cpupool).
         */
        unit_found = false;
        for_each_affinity_balance_step( bs )
        {
            list_for_each_entry( wvc, &prv->waitq, waitq_elem )
            {
                if ( bs == BALANCE_SOFT_AFFINITY &&
                     !has_soft_affinity(wvc->unit) )
                    continue;

                if ( unit_check_affinity(wvc->unit, sched_cpu, bs) )
                {
                    spinlock_t *lock;

                    unit_found = true;

                    /*
                     * If the unit in the waitqueue has just come up online,
                     * we risk racing with vcpu_wake(). To avoid this, sync
                     * on the spinlock that vcpu_wake() holds (but only with
                     * trylock, to avoid deadlock).
                     */
                    lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit));

                    /*
                     * We know the vcpu's lock is not this resource's lock. In
                     * fact, if it were, since this cpu is free, vcpu_wake()
                     * would have assigned the unit to here directly.
                     */
                    ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock);

                    if ( lock )
                    {
                        unit_assign(prv, wvc->unit, sched_cpu);
                        list_del_init(&wvc->waitq_elem);
                        prev->next_task = wvc->unit;
                        spin_unlock(lock);
                        goto unlock;
                    }
                }
            }
        }
        /*
         * If we did find a unit with suitable affinity in the waitqueue, but
         * we could not pick it up (due to lock contention), and hence we are
         * still free, plan for another try. In fact, we don't want such unit
         * to be stuck in the waitqueue, when there are free cpus where it
         * could run.
         */
        if ( unlikely( unit_found && prev->next_task == NULL &&
                       !list_empty(&prv->waitq)) )
            cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ);
 unlock:
        spin_unlock(&prv->waitq_lock);

        if ( prev->next_task == NULL &&
             !cpumask_test_cpu(sched_cpu, &prv->cpus_free) )
            cpumask_set_cpu(sched_cpu, &prv->cpus_free);
    }

    if ( unlikely(prev->next_task == NULL ||
                  !unit_runnable_state(prev->next_task)) )
        prev->next_task = sched_idle_unit(sched_cpu);

    NULL_UNIT_CHECK(prev->next_task);

    prev->next_task->migrated = false;
}

static inline void dump_unit(const struct null_private *prv,
                             const struct null_unit *nvc)
{
    printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id,
            nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ?
                                sched_unit_master(nvc->unit) : -1);
}

static void cf_check null_dump_pcpu(const struct scheduler *ops, int cpu)
{
    struct null_private *prv = null_priv(ops);
    const struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
    const struct null_unit *nvc;
    spinlock_t *lock;
    unsigned long flags;

    lock = pcpu_schedule_lock_irqsave(cpu, &flags);

    printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}",
           cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
           CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
    if ( npc->unit != NULL )
        printk(", unit=%pdv%d", npc->unit->domain, npc->unit->unit_id);
    printk("\n");

    /* current unit (nothing to say if that's the idle unit) */
    nvc = null_unit(curr_on_cpu(cpu));
    if ( nvc && !is_idle_unit(nvc->unit) )
    {
        printk("\trun: ");
        dump_unit(prv, nvc);
        printk("\n");
    }

    pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
}

static void cf_check null_dump(const struct scheduler *ops)
{
    struct null_private *prv = null_priv(ops);
    struct list_head *iter;
    unsigned long flags;
    unsigned int loop;

    spin_lock_irqsave(&prv->lock, flags);

    printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free));

    printk("Domain info:\n");
    loop = 0;
    list_for_each( iter, &prv->ndom )
    {
        struct null_dom *ndom;
        struct sched_unit *unit;

        ndom = list_entry(iter, struct null_dom, ndom_elem);

        printk("\tDomain: %d\n", ndom->dom->domain_id);
        for_each_sched_unit( ndom->dom, unit )
        {
            struct null_unit * const nvc = null_unit(unit);
            spinlock_t *lock;

            lock = unit_schedule_lock(unit);

            printk("\t%3d: ", ++loop);
            dump_unit(prv, nvc);
            printk("\n");

            unit_schedule_unlock(lock, unit);
        }
    }

    printk("Waitqueue: ");
    loop = 0;
    spin_lock(&prv->waitq_lock);
    list_for_each( iter, &prv->waitq )
    {
        struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem);

        if ( loop++ != 0 )
            printk(", ");
        if ( loop % 24 == 0 )
            printk("\n\t");
        printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id);
    }
    printk("\n");
    spin_unlock(&prv->waitq_lock);

    spin_unlock_irqrestore(&prv->lock, flags);
}

static const struct scheduler sched_null_def = {
    .name           = "null Scheduler",
    .opt_name       = "null",
    .sched_id       = XEN_SCHEDULER_NULL,
    .sched_data     = NULL,

    .init           = null_init,
    .deinit         = null_deinit,
    .alloc_pdata    = null_alloc_pdata,
    .free_pdata     = null_free_pdata,
    .switch_sched   = null_switch_sched,
    .deinit_pdata   = null_deinit_pdata,

    .alloc_udata    = null_alloc_udata,
    .free_udata     = null_free_udata,
    .alloc_domdata  = null_alloc_domdata,
    .free_domdata   = null_free_domdata,

    .insert_unit    = null_unit_insert,
    .remove_unit    = null_unit_remove,

    .wake           = null_unit_wake,
    .sleep          = null_unit_sleep,
    .pick_resource  = null_res_pick,
    .migrate        = null_unit_migrate,
    .do_schedule    = null_schedule,

    .dump_cpu_state = null_dump_pcpu,
    .dump_settings  = null_dump,
};

REGISTER_SCHEDULER(sched_null_def);