1 /*
2 * xen/common/sched_null.c
3 *
4 * Copyright (c) 2017, Dario Faggioli, Citrix Ltd
5 *
6 * This program is free software; you can redistribute it and/or
7 * modify it under the terms of the GNU General Public
8 * License v2 as published by the Free Software Foundation.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
13 * General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public
16 * License along with this program; If not, see <http://www.gnu.org/licenses/>.
17 */
18
19 /*
20  * The 'null' scheduler always chooses to run, on each pCPU, either nothing
21  * (i.e., the pCPU stays idle) or always the same unit.
22  *
23  * It is aimed at supporting static scenarios, where there are always
24  * fewer units than pCPUs (and the units don't need to move among pCPUs
25  * for any reason), with the least possible overhead.
26  *
27  * Typical use cases are embedded applications, but also HPC, especially
28  * when the scheduler is used inside a cpupool.
29 */
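/*
 * Example (a sketch of assumed typical usage; double-check the exact syntax
 * against the xlcpupool.cfg and xen-command-line documentation for the
 * version in use): confine the null scheduler to a dedicated cpupool with
 * an xl configuration file such as
 *
 *     name  = "pool-null"
 *     sched = "null"
 *     cpus  = ["2", "3"]
 *
 * passed to `xl cpupool-create`, or select it system-wide by booting Xen
 * with the "sched=null" command line parameter.
 */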
30
31 #include <xen/sched.h>
32 #include <xen/softirq.h>
33 #include <xen/trace.h>
34
35 #include "private.h"
36
37 /*
38 * null tracing events. Check include/public/trace.h for more details.
39 */
40 #define TRC_SNULL_PICKED_CPU TRC_SCHED_CLASS_EVT(SNULL, 1)
41 #define TRC_SNULL_UNIT_ASSIGN TRC_SCHED_CLASS_EVT(SNULL, 2)
42 #define TRC_SNULL_UNIT_DEASSIGN TRC_SCHED_CLASS_EVT(SNULL, 3)
43 #define TRC_SNULL_MIGRATE TRC_SCHED_CLASS_EVT(SNULL, 4)
44 #define TRC_SNULL_SCHEDULE TRC_SCHED_CLASS_EVT(SNULL, 5)
45 #define TRC_SNULL_TASKLET TRC_SCHED_CLASS_EVT(SNULL, 6)
46
47 /*
48 * Locking:
49 * - Scheduler-lock (a.k.a. runqueue lock):
50 * + is per-pCPU;
51 * + serializes assignment and deassignment of units to a pCPU.
52 * - Private data lock (a.k.a. private scheduler lock):
53 * + is scheduler-wide;
54 * + serializes accesses to the list of domains in this scheduler.
55 * - Waitqueue lock:
56 * + is scheduler-wide;
57  * + serializes accesses to the list of units waiting to be assigned
58 * to pCPUs.
59 *
60  * Ordering is: private lock, runqueue lock, waitqueue lock. Or, put
61  * differently, the waitqueue lock nests inside the runqueue lock, which
62  * nests inside the private lock. More specifically:
63  * + if we need both the runqueue and the private locks, we must acquire
64  * the private lock first;
65  * + if we need both the runqueue and the waitqueue locks, we must acquire
66  * the runqueue lock first;
67  * + if we need both the private and the waitqueue locks, we must acquire
68  * the private lock first;
69 * + if we already own a runqueue lock, we must never acquire
70 * the private lock;
71 * + if we already own the waitqueue lock, we must never acquire
72 * the runqueue lock or the private lock.
73 */
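/*
 * Illustrative sketch only (a hypothetical helper, not used anywhere in
 * this file): the single legal order in which all three locks described
 * above may be taken. Real code paths use the IRQ-safe variants and the
 * unit_/pcpu_schedule_lock() wrappers where appropriate.
 */
static inline void null_lock_ordering_example(spinlock_t *priv_lock,
                                              spinlock_t *runq_lock,
                                              spinlock_t *waitq_lock)
{
    spin_lock(priv_lock);   /* 1. private (scheduler-wide) lock, outermost */
    spin_lock(runq_lock);   /* 2. per-pCPU runqueue lock */
    spin_lock(waitq_lock);  /* 3. waitqueue lock, innermost */

    /* ... critical section needing all three locks ... */

    spin_unlock(waitq_lock);
    spin_unlock(runq_lock);
    spin_unlock(priv_lock);
}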
74
75 /*
76 * System-wide private data
77 */
78 struct null_private {
79 spinlock_t lock; /* scheduler lock; nests inside cpupool_lock */
80 struct list_head ndom; /* Domains of this scheduler */
81 struct list_head waitq; /* units not assigned to any pCPU */
82 spinlock_t waitq_lock; /* serializes waitq; nests inside runq locks */
83 cpumask_t cpus_free; /* CPUs without a unit associated to them */
84 };
85
86 /*
87 * Physical CPU
88 */
89 struct null_pcpu {
90 struct sched_unit *unit;
91 };
92
93 /*
94 * Schedule unit
95 */
96 struct null_unit {
97 struct list_head waitq_elem;
98 struct sched_unit *unit;
99 };
100
101 /*
102 * Domain
103 */
104 struct null_dom {
105 struct list_head ndom_elem;
106 struct domain *dom;
107 };
108
109 /*
110  * Accessor helper functions
111 */
112 static inline struct null_private *null_priv(const struct scheduler *ops)
113 {
114 return ops->sched_data;
115 }
116
117 static inline struct null_unit *null_unit(const struct sched_unit *unit)
118 {
119 return unit->priv;
120 }
121
122 static inline bool unit_check_affinity(struct sched_unit *unit,
123 unsigned int cpu,
124 unsigned int balance_step)
125 {
126 affinity_balance_cpumask(unit, balance_step, cpumask_scratch_cpu(cpu));
127 cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
128 cpupool_domain_master_cpumask(unit->domain));
129
130 return cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu));
131 }
132
133 static int cf_check null_init(struct scheduler *ops)
134 {
135 struct null_private *prv;
136
137 printk("Initializing null scheduler\n"
138 "WARNING: This is experimental software in development.\n"
139 "Use at your own risk.\n");
140
141 prv = xzalloc(struct null_private);
142 if ( prv == NULL )
143 return -ENOMEM;
144
145 spin_lock_init(&prv->lock);
146 spin_lock_init(&prv->waitq_lock);
147 INIT_LIST_HEAD(&prv->ndom);
148 INIT_LIST_HEAD(&prv->waitq);
149
150 ops->sched_data = prv;
151
152 return 0;
153 }
154
155 static void cf_check null_deinit(struct scheduler *ops)
156 {
157 xfree(ops->sched_data);
158 ops->sched_data = NULL;
159 }
160
161 static void init_pdata(struct null_private *prv, struct null_pcpu *npc,
162 unsigned int cpu)
163 {
164 /* Mark the pCPU as free, and with no unit assigned */
165 cpumask_set_cpu(cpu, &prv->cpus_free);
166 npc->unit = NULL;
167 }
168
169 static void cf_check null_deinit_pdata(
170 const struct scheduler *ops, void *pcpu, int cpu)
171 {
172 struct null_private *prv = null_priv(ops);
173 struct null_pcpu *npc = pcpu;
174
175 ASSERT(npc);
176
177 cpumask_clear_cpu(cpu, &prv->cpus_free);
178 npc->unit = NULL;
179 }
180
181 static void *cf_check null_alloc_pdata(const struct scheduler *ops, int cpu)
182 {
183 struct null_pcpu *npc;
184
185 npc = xzalloc(struct null_pcpu);
186 if ( npc == NULL )
187 return ERR_PTR(-ENOMEM);
188
189 return npc;
190 }
191
192 static void cf_check null_free_pdata(
193 const struct scheduler *ops, void *pcpu, int cpu)
194 {
195 xfree(pcpu);
196 }
197
198 static void *cf_check null_alloc_udata(
199 const struct scheduler *ops, struct sched_unit *unit, void *dd)
200 {
201 struct null_unit *nvc;
202
203 nvc = xzalloc(struct null_unit);
204 if ( nvc == NULL )
205 return NULL;
206
207 INIT_LIST_HEAD(&nvc->waitq_elem);
208 nvc->unit = unit;
209
210 SCHED_STAT_CRANK(unit_alloc);
211
212 return nvc;
213 }
214
215 static void cf_check null_free_udata(const struct scheduler *ops, void *priv)
216 {
217 struct null_unit *nvc = priv;
218
219 xfree(nvc);
220 }
221
222 static void *cf_check null_alloc_domdata(
223 const struct scheduler *ops, struct domain *d)
224 {
225 struct null_private *prv = null_priv(ops);
226 struct null_dom *ndom;
227 unsigned long flags;
228
229 ndom = xzalloc(struct null_dom);
230 if ( ndom == NULL )
231 return ERR_PTR(-ENOMEM);
232
233 ndom->dom = d;
234
235 spin_lock_irqsave(&prv->lock, flags);
236 list_add_tail(&ndom->ndom_elem, &null_priv(ops)->ndom);
237 spin_unlock_irqrestore(&prv->lock, flags);
238
239 return ndom;
240 }
241
242 static void cf_check null_free_domdata(const struct scheduler *ops, void *data)
243 {
244 struct null_dom *ndom = data;
245 struct null_private *prv = null_priv(ops);
246
247 if ( ndom )
248 {
249 unsigned long flags;
250
251 spin_lock_irqsave(&prv->lock, flags);
252 list_del_init(&ndom->ndom_elem);
253 spin_unlock_irqrestore(&prv->lock, flags);
254
255 xfree(ndom);
256 }
257 }
258
259 /*
260 * unit to pCPU assignment and placement. This _only_ happens:
261 * - on insert,
262 * - on migrate.
263 *
264 * Insert occurs when a unit joins this scheduler for the first time
265 * (e.g., when the domain it's part of is moved to the scheduler's
266 * cpupool).
267 *
268 * Migration may be necessary if a pCPU (with a unit assigned to it)
269 * is removed from the scheduler's cpupool.
270 *
271 * So this is not part of any hot path.
272 */
273 static struct sched_resource *
274 pick_res(const struct null_private *prv, const struct sched_unit *unit)
275 {
276 unsigned int bs;
277 unsigned int cpu = sched_unit_master(unit), new_cpu;
278 const cpumask_t *cpus = cpupool_domain_master_cpumask(unit->domain);
279 const struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
280
281 ASSERT(spin_is_locked(get_sched_res(cpu)->schedule_lock));
282
283 for_each_affinity_balance_step( bs )
284 {
285 if ( bs == BALANCE_SOFT_AFFINITY && !has_soft_affinity(unit) )
286 continue;
287
288 affinity_balance_cpumask(unit, bs, cpumask_scratch_cpu(cpu));
289 cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu), cpus);
290
291 /*
292 * If our processor is free, or we are assigned to it, and it is also
293 * still valid and part of our affinity, just go for it.
294  * (Note that we could call unit_check_affinity() here, but we deliberately
295  * don't, so that we get to keep in the scratch cpumask what we have just
296 * put in it.)
297 */
298 if ( likely((npc->unit == NULL || npc->unit == unit)
299 && cpumask_test_cpu(cpu, cpumask_scratch_cpu(cpu))) )
300 {
301 new_cpu = cpu;
302 goto out;
303 }
304
305 /* If not, just go for a free pCPU, within our affinity, if any */
306 cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
307 &prv->cpus_free);
308 new_cpu = cpumask_first(cpumask_scratch_cpu(cpu));
309
310 if ( likely(new_cpu != nr_cpu_ids) )
311 goto out;
312 }
313
314 /*
315 * If we didn't find any free pCPU, just pick any valid pcpu, even if
316 * it has another unit assigned. This will happen during shutdown and
317 * suspend/resume, but it may also happen during "normal operation", if
318 * all the pCPUs are busy.
319 *
320 * In fact, there must always be something sane in v->processor, or
321  * unit_schedule_lock() and friends won't work. This is not a problem,
322  * as we will only actually assign the unit to the pCPU returned from
323  * here if that pCPU is free.
324 */
325 cpumask_and(cpumask_scratch_cpu(cpu), cpus, unit->cpu_hard_affinity);
326 new_cpu = cpumask_any(cpumask_scratch_cpu(cpu));
327
328 out:
329 if ( unlikely(tb_init_done) )
330 {
331 struct {
332 uint16_t unit, dom;
333 uint32_t new_cpu;
334 } d = {
335 .unit = unit->unit_id,
336 .dom = unit->domain->domain_id,
337 .new_cpu = new_cpu,
338 };
339
340 trace_time(TRC_SNULL_PICKED_CPU, sizeof(d), &d);
341 }
342
343 return get_sched_res(new_cpu);
344 }
345
346 static void unit_assign(struct null_private *prv, struct sched_unit *unit,
347 unsigned int cpu)
348 {
349 struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
350
351 ASSERT(is_unit_online(unit));
352
353 npc->unit = unit;
354 sched_set_res(unit, get_sched_res(cpu));
355 cpumask_clear_cpu(cpu, &prv->cpus_free);
356
357 dprintk(XENLOG_G_INFO, "%d <-- %pdv%d\n", cpu, unit->domain, unit->unit_id);
358
359 if ( unlikely(tb_init_done) )
360 {
361 struct {
362 uint16_t unit, dom;
363 uint32_t cpu;
364 } d = {
365 .unit = unit->unit_id,
366 .dom = unit->domain->domain_id,
367 .cpu = cpu,
368 };
369
370 trace_time(TRC_SNULL_UNIT_ASSIGN, sizeof(d), &d);
371 }
372 }
373
374 /* Returns true if a cpu was tickled */
375 static bool unit_deassign(struct null_private *prv, const struct sched_unit *unit)
376 {
377 unsigned int bs;
378 unsigned int cpu = sched_unit_master(unit);
379 struct null_unit *wvc;
380 struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
381
382 ASSERT(list_empty(&null_unit(unit)->waitq_elem));
383 ASSERT(npc->unit == unit);
384 ASSERT(!cpumask_test_cpu(cpu, &prv->cpus_free));
385
386 npc->unit = NULL;
387 cpumask_set_cpu(cpu, &prv->cpus_free);
388
389 dprintk(XENLOG_G_INFO, "%d <-- NULL (%pdv%d)\n", cpu, unit->domain,
390 unit->unit_id);
391
392 if ( unlikely(tb_init_done) )
393 {
394 struct {
395 uint16_t unit, dom;
396 uint32_t cpu;
397 } d = {
398 .unit = unit->unit_id,
399 .dom = unit->domain->domain_id,
400 .cpu = cpu,
401 };
402
403 trace_time(TRC_SNULL_UNIT_DEASSIGN, sizeof(d), &d);
404 }
405
406 spin_lock(&prv->waitq_lock);
407
408 /*
409  * Now that the pCPU is free again, let's see if there is someone waiting,
410 * suitable to be assigned to it (prioritizing units that have
411 * soft-affinity with cpu).
412 */
413 for_each_affinity_balance_step( bs )
414 {
415 list_for_each_entry( wvc, &prv->waitq, waitq_elem )
416 {
417 if ( bs == BALANCE_SOFT_AFFINITY &&
418 !has_soft_affinity(wvc->unit) )
419 continue;
420
421 if ( unit_check_affinity(wvc->unit, cpu, bs) )
422 {
423 list_del_init(&wvc->waitq_elem);
424 unit_assign(prv, wvc->unit, cpu);
425 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
426 spin_unlock(&prv->waitq_lock);
427 return true;
428 }
429 }
430 }
431 spin_unlock(&prv->waitq_lock);
432
433 return false;
434 }
435
436 /* Change the scheduler of cpu to us (null). */
437 static spinlock_t *cf_check null_switch_sched(
438 struct scheduler *new_ops, unsigned int cpu, void *pdata, void *vdata)
439 {
440 struct sched_resource *sr = get_sched_res(cpu);
441 struct null_private *prv = null_priv(new_ops);
442 const struct null_unit *nvc = vdata;
443
444 ASSERT(nvc && is_idle_unit(nvc->unit));
445
446 sched_idle_unit(cpu)->priv = vdata;
447
448 /*
449 * We are holding the runqueue lock already (it's been taken in
450 * schedule_cpu_switch()). It actually may or may not be the 'right'
451 * one for this cpu, but that is ok for preventing races.
452 */
453 ASSERT(!local_irq_is_enabled());
454
455 init_pdata(prv, pdata, cpu);
456
457 return &sr->_lock;
458 }
459
460 static void cf_check null_unit_insert(
461 const struct scheduler *ops, struct sched_unit *unit)
462 {
463 struct null_private *prv = null_priv(ops);
464 struct null_unit *nvc = null_unit(unit);
465 struct null_pcpu *npc;
466 unsigned int cpu;
467 spinlock_t *lock;
468
469 ASSERT(!is_idle_unit(unit));
470
471 lock = unit_schedule_lock_irq(unit);
472
473 if ( unlikely(!is_unit_online(unit)) )
474 {
475 unit_schedule_unlock_irq(lock, unit);
476 return;
477 }
478
479 retry:
480 sched_set_res(unit, pick_res(prv, unit));
481 cpu = sched_unit_master(unit);
482 npc = get_sched_res(cpu)->sched_priv;
483
484 spin_unlock(lock);
485
486 lock = unit_schedule_lock(unit);
487
488 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
489 cpupool_domain_master_cpumask(unit->domain));
490
491 /* If the pCPU is free, we assign unit to it */
492 if ( likely(npc->unit == NULL) )
493 {
494 /*
495 * Insert is followed by vcpu_wake(), so there's no need to poke
496 * the pcpu with the SCHEDULE_SOFTIRQ, as wake will do that.
497 */
498 unit_assign(prv, unit, cpu);
499 }
500 else if ( cpumask_intersects(&prv->cpus_free, cpumask_scratch_cpu(cpu)) )
501 {
502 /*
503 * If the pCPU is not free (e.g., because we raced with another
504 * insert or a migrate), but there are other free pCPUs, we can
505 * try to pick again.
506 */
507 goto retry;
508 }
509 else
510 {
511 /*
512 * If the pCPU is not free, and there aren't any (valid) others,
513  * we have no alternative but to go into the waitqueue.
514 */
515 spin_lock(&prv->waitq_lock);
516 list_add_tail(&nvc->waitq_elem, &prv->waitq);
517 dprintk(XENLOG_G_WARNING, "WARNING: %pdv%d not assigned to any CPU!\n",
518 unit->domain, unit->unit_id);
519 spin_unlock(&prv->waitq_lock);
520 }
521 spin_unlock_irq(lock);
522
523 SCHED_STAT_CRANK(unit_insert);
524 }
525
526 static void cf_check null_unit_remove(
527 const struct scheduler *ops, struct sched_unit *unit)
528 {
529 struct null_private *prv = null_priv(ops);
530 struct null_unit *nvc = null_unit(unit);
531 struct null_pcpu *npc;
532 unsigned int cpu;
533 spinlock_t *lock;
534
535 ASSERT(!is_idle_unit(unit));
536
537 lock = unit_schedule_lock_irq(unit);
538
539 /* If offline, the unit shouldn't be assigned, nor in the waitqueue */
540 if ( unlikely(!is_unit_online(unit)) )
541 {
542 npc = unit->res->sched_priv;
543 ASSERT(npc->unit != unit);
544 ASSERT(list_empty(&nvc->waitq_elem));
545 goto out;
546 }
547
548 /* If unit is in waitqueue, just get it out of there and bail */
549 if ( unlikely(!list_empty(&nvc->waitq_elem)) )
550 {
551 spin_lock(&prv->waitq_lock);
552 list_del_init(&nvc->waitq_elem);
553 spin_unlock(&prv->waitq_lock);
554
555 goto out;
556 }
557
558 cpu = sched_unit_master(unit);
559 npc = get_sched_res(cpu)->sched_priv;
560 if ( npc->unit == unit )
561 unit_deassign(prv, unit);
562
563 out:
564 unit_schedule_unlock_irq(lock, unit);
565
566 SCHED_STAT_CRANK(unit_remove);
567 }
568
569 static void cf_check null_unit_wake(
570 const struct scheduler *ops, struct sched_unit *unit)
571 {
572 struct null_private *prv = null_priv(ops);
573 struct null_unit *nvc = null_unit(unit);
574 unsigned int cpu = sched_unit_master(unit);
575 struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
576
577 ASSERT(!is_idle_unit(unit));
578
579 if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
580 {
581 SCHED_STAT_CRANK(unit_wake_running);
582 return;
583 }
584
585 if ( unlikely(!list_empty(&nvc->waitq_elem)) )
586 {
587 /* Not exactly "on runq", but close enough for reusing the counter */
588 SCHED_STAT_CRANK(unit_wake_onrunq);
589 return;
590 }
591
592 if ( likely(unit_runnable(unit)) )
593 SCHED_STAT_CRANK(unit_wake_runnable);
594 else
595 SCHED_STAT_CRANK(unit_wake_not_runnable);
596
597 if ( likely(npc->unit == unit) )
598 {
599 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
600 return;
601 }
602
603 /*
604 * If a unit is neither on a pCPU nor in the waitqueue, it means it was
605  * offline, and that it is now coming back online. If we're lucky,
606 * and its previous resource is free (and affinities match), we can just
607 * assign the unit to it (we own the proper lock already) and be done.
608 */
609 if ( npc->unit == NULL &&
610 unit_check_affinity(unit, cpu, BALANCE_HARD_AFFINITY) )
611 {
612 if ( !has_soft_affinity(unit) ||
613 unit_check_affinity(unit, cpu, BALANCE_SOFT_AFFINITY) )
614 {
615 unit_assign(prv, unit, cpu);
616 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
617 return;
618 }
619 }
620
621 /*
622  * If the resource is not free (or affinities do not match), we need
623  * to assign the unit to some other one, but we can't do that here, as:
624 * - we don't own the proper lock,
625 * - we can't change v->processor under vcpu_wake()'s feet.
626 * So we add it to the waitqueue, and tickle all the free CPUs (if any)
627 * on which unit can run. The first one that schedules will pick it up.
628 */
629 spin_lock(&prv->waitq_lock);
630 list_add_tail(&nvc->waitq_elem, &prv->waitq);
631 spin_unlock(&prv->waitq_lock);
632
633 cpumask_and(cpumask_scratch_cpu(cpu), unit->cpu_hard_affinity,
634 cpupool_domain_master_cpumask(unit->domain));
635 cpumask_and(cpumask_scratch_cpu(cpu), cpumask_scratch_cpu(cpu),
636 &prv->cpus_free);
637
638 if ( cpumask_empty(cpumask_scratch_cpu(cpu)) )
639 dprintk(XENLOG_G_WARNING, "WARNING: d%dv%d not assigned to any CPU!\n",
640 unit->domain->domain_id, unit->unit_id);
641 else
642 cpumask_raise_softirq(cpumask_scratch_cpu(cpu), SCHEDULE_SOFTIRQ);
643 }
644
645 static void cf_check null_unit_sleep(
646 const struct scheduler *ops, struct sched_unit *unit)
647 {
648 struct null_private *prv = null_priv(ops);
649 unsigned int cpu = sched_unit_master(unit);
650 struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
651 bool tickled = false;
652
653 ASSERT(!is_idle_unit(unit));
654
655 /*
656 * Check if the unit is in the process of being offlined. If yes,
657 * we need to remove it from either its pCPU or the waitqueue.
658 */
659 if ( unlikely(!is_unit_online(unit)) )
660 {
661 struct null_unit *nvc = null_unit(unit);
662
663 if ( unlikely(!list_empty(&nvc->waitq_elem)) )
664 {
665 spin_lock(&prv->waitq_lock);
666 list_del_init(&nvc->waitq_elem);
667 spin_unlock(&prv->waitq_lock);
668 }
669 else if ( npc->unit == unit )
670 tickled = unit_deassign(prv, unit);
671 }
672
673 /* If unit is not assigned to a pCPU, or is not running, no need to bother */
674 if ( likely(!tickled && curr_on_cpu(cpu) == unit) )
675 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
676
677 SCHED_STAT_CRANK(unit_sleep);
678 }
679
680 static struct sched_resource *cf_check
681 null_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
682 {
683 ASSERT(!is_idle_unit(unit));
684 return pick_res(null_priv(ops), unit);
685 }
686
687 static void cf_check null_unit_migrate(
688 const struct scheduler *ops, struct sched_unit *unit, unsigned int new_cpu)
689 {
690 struct null_private *prv = null_priv(ops);
691 struct null_unit *nvc = null_unit(unit);
692 struct null_pcpu *npc;
693
694 ASSERT(!is_idle_unit(unit));
695
696 if ( sched_unit_master(unit) == new_cpu )
697 return;
698
699 if ( unlikely(tb_init_done) )
700 {
701 struct {
702 uint16_t unit, dom;
703 uint16_t cpu, new_cpu;
704 } d = {
705 .unit = unit->unit_id,
706 .dom = unit->domain->domain_id,
707 .cpu = sched_unit_master(unit),
708 .new_cpu = new_cpu,
709 };
710
711 trace_time(TRC_SNULL_MIGRATE, sizeof(d), &d);
712 }
713
714 /*
715  * If the unit is assigned to a pCPU, then that pCPU becomes free, and
716  * we should check the waitqueue for anyone else that can be assigned to it.
717 */
718 npc = unit->res->sched_priv;
719 if ( likely(npc->unit == unit) )
720 {
721 unit_deassign(prv, unit);
722 SCHED_STAT_CRANK(migrate_running);
723 }
724 else if ( !list_empty(&nvc->waitq_elem) )
725 SCHED_STAT_CRANK(migrate_on_runq);
726
727 SCHED_STAT_CRANK(migrated);
728
729 /*
730 * If a unit is (going) offline, we want it to be neither assigned
731 * to a pCPU, nor in the waitqueue.
732 *
733 * If it was on a cpu, we've removed it from there above. If it is
734 * in the waitqueue, we remove it from there now. And then we bail.
735 */
736 if ( unlikely(!is_unit_online(unit)) )
737 {
738 spin_lock(&prv->waitq_lock);
739 list_del_init(&nvc->waitq_elem);
740 spin_unlock(&prv->waitq_lock);
741 goto out;
742 }
743
744 /*
745 * Let's now consider new_cpu, which is where unit is being sent. It can be
746 * either free, or have a unit already assigned to it.
747 *
748  * In the former case, we should assign the unit to it, and try to get it
749  * to run, if affinity allows.
750  *
751  * In the latter case, all we can do is park the unit in the waitqueue.
752 */
753 npc = get_sched_res(new_cpu)->sched_priv;
754 if ( npc->unit == NULL &&
755 unit_check_affinity(unit, new_cpu, BALANCE_HARD_AFFINITY) )
756 {
757 /* unit might have been in the waitqueue, so remove it */
758 spin_lock(&prv->waitq_lock);
759 list_del_init(&nvc->waitq_elem);
760 spin_unlock(&prv->waitq_lock);
761
762 unit_assign(prv, unit, new_cpu);
763 }
764 else
765 {
766 /* Put unit in the waitqueue, if it wasn't there already */
767 spin_lock(&prv->waitq_lock);
768 if ( list_empty(&nvc->waitq_elem) )
769 {
770 list_add_tail(&nvc->waitq_elem, &prv->waitq);
771 dprintk(XENLOG_G_WARNING,
772 "WARNING: %pdv%d not assigned to any CPU!\n", unit->domain,
773 unit->unit_id);
774 }
775 spin_unlock(&prv->waitq_lock);
776 }
777
778 /*
779  * Whatever happened above, we always at least override v->processor.
780  * This is especially important on the shutdown and suspend/resume paths,
781  * where we must let our caller (cpu_disable_scheduler()) know that the
782  * migration did happen, to the best of our possibilities at least.
783  * In case of suspend, any temporary inconsistency caused by this
784  * will be fixed up during resume.
785 */
786 out:
787 sched_set_res(unit, get_sched_res(new_cpu));
788 }
789
790 #ifndef NDEBUG
791 static inline void null_unit_check(struct sched_unit *unit)
792 {
793 struct null_unit * const nvc = null_unit(unit);
794 struct null_dom * const ndom = unit->domain->sched_priv;
795
796 BUG_ON(nvc->unit != unit);
797
798 if ( ndom )
799 BUG_ON(is_idle_unit(unit));
800 else
801 BUG_ON(!is_idle_unit(unit));
802
803 SCHED_STAT_CRANK(unit_check);
804 }
805 #define NULL_UNIT_CHECK(unit) (null_unit_check(unit))
806 #else
807 #define NULL_UNIT_CHECK(unit)
808 #endif
809
810
811 /*
812  * The simplest scheduling function of all time! We either return:
813 * - the unit assigned to the pCPU, if there's one and it can run;
814 * - the idle unit, otherwise.
815 */
816 static void cf_check null_schedule(
817 const struct scheduler *ops, struct sched_unit *prev, s_time_t now,
818 bool tasklet_work_scheduled)
819 {
820 unsigned int bs;
821 const unsigned int cur_cpu = smp_processor_id();
822 const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
823 struct null_pcpu *npc = get_sched_res(sched_cpu)->sched_priv;
824 struct null_private *prv = null_priv(ops);
825 struct null_unit *wvc;
826
827 SCHED_STAT_CRANK(schedule);
828 NULL_UNIT_CHECK(current->sched_unit);
829
830 if ( unlikely(tb_init_done) )
831 {
832 struct {
833 uint16_t tasklet, cpu;
834 int16_t unit, dom;
835 } d = {
836 .tasklet = tasklet_work_scheduled,
837 .cpu = cur_cpu,
838 };
839
840 if ( npc->unit == NULL )
841 {
842 d.unit = d.dom = -1;
843 }
844 else
845 {
846 d.unit = npc->unit->unit_id;
847 d.dom = npc->unit->domain->domain_id;
848 }
849
850 trace_time(TRC_SNULL_SCHEDULE, sizeof(d), &d);
851 }
852
853 if ( tasklet_work_scheduled )
854 {
855 TRACE_TIME(TRC_SNULL_TASKLET);
856 prev->next_task = sched_idle_unit(sched_cpu);
857 }
858 else
859 prev->next_task = npc->unit;
860 prev->next_time = -1;
861
862 /*
863  * We may be new in the cpupool, or just coming back online. In that
864  * case, there may be units in the waitqueue that we can assign to
865  * ourselves and run.
866 */
867 if ( unlikely(prev->next_task == NULL) )
868 {
869 bool unit_found;
870
871 spin_lock(&prv->waitq_lock);
872
873 if ( list_empty(&prv->waitq) )
874 goto unlock;
875
876 /*
877  * We scan the waitqueue twice, so as to prioritize units that have
878  * soft-affinity with the cpu. This may look expensive to do here in
879  * null_schedule(), but it's actually fine, because we only do it
880  * when a pcpu has no unit associated with it (e.g., as said above,
881  * when the cpu has just joined a cpupool).
882 */
883 unit_found = false;
884 for_each_affinity_balance_step( bs )
885 {
886 list_for_each_entry( wvc, &prv->waitq, waitq_elem )
887 {
888 if ( bs == BALANCE_SOFT_AFFINITY &&
889 !has_soft_affinity(wvc->unit) )
890 continue;
891
892 if ( unit_check_affinity(wvc->unit, sched_cpu, bs) )
893 {
894 spinlock_t *lock;
895
896 unit_found = true;
897
898 /*
899  * If the unit in the waitqueue has just come online,
900  * we risk racing with vcpu_wake(). To avoid this, sync
901  * on the spinlock that vcpu_wake() holds, but only with
902  * trylock, to avoid deadlock.
903 */
904 lock = pcpu_schedule_trylock(sched_unit_master(wvc->unit));
905
906 /*
907 * We know the vcpu's lock is not this resource's lock. In
908 * fact, if it were, since this cpu is free, vcpu_wake()
909  * would have assigned the unit here directly.
910 */
911 ASSERT(lock != get_sched_res(sched_cpu)->schedule_lock);
912
913 if ( lock ) {
914 unit_assign(prv, wvc->unit, sched_cpu);
915 list_del_init(&wvc->waitq_elem);
916 prev->next_task = wvc->unit;
917 spin_unlock(lock);
918 goto unlock;
919 }
920 }
921 }
922 }
923 /*
924 * If we did find a unit with suitable affinity in the waitqueue, but
925 * we could not pick it up (due to lock contention), and hence we are
926  * still free, plan for another try. In fact, we don't want such a unit
927  * to be stuck in the waitqueue when there are free cpus where it
928 * could run.
929 */
930 if ( unlikely( unit_found && prev->next_task == NULL &&
931 !list_empty(&prv->waitq)) )
932 cpu_raise_softirq(cur_cpu, SCHEDULE_SOFTIRQ);
933 unlock:
934 spin_unlock(&prv->waitq_lock);
935
936 if ( prev->next_task == NULL &&
937 !cpumask_test_cpu(sched_cpu, &prv->cpus_free) )
938 cpumask_set_cpu(sched_cpu, &prv->cpus_free);
939 }
940
941 if ( unlikely(prev->next_task == NULL ||
942 !unit_runnable_state(prev->next_task)) )
943 prev->next_task = sched_idle_unit(sched_cpu);
944
945 NULL_UNIT_CHECK(prev->next_task);
946
947 prev->next_task->migrated = false;
948 }
949
950 static inline void dump_unit(const struct null_private *prv,
951 const struct null_unit *nvc)
952 {
953 printk("[%i.%i] pcpu=%d", nvc->unit->domain->domain_id,
954 nvc->unit->unit_id, list_empty(&nvc->waitq_elem) ?
955 sched_unit_master(nvc->unit) : -1);
956 }
957
958 static void cf_check null_dump_pcpu(const struct scheduler *ops, int cpu)
959 {
960 struct null_private *prv = null_priv(ops);
961 const struct null_pcpu *npc = get_sched_res(cpu)->sched_priv;
962 const struct null_unit *nvc;
963 spinlock_t *lock;
964 unsigned long flags;
965
966 lock = pcpu_schedule_lock_irqsave(cpu, &flags);
967
968 printk("CPU[%02d] sibling={%*pbl}, core={%*pbl}",
969 cpu, CPUMASK_PR(per_cpu(cpu_sibling_mask, cpu)),
970 CPUMASK_PR(per_cpu(cpu_core_mask, cpu)));
971 if ( npc->unit != NULL )
972 printk(", unit=%pdv%d", npc->unit->domain, npc->unit->unit_id);
973 printk("\n");
974
975 /* current unit (nothing to say if that's the idle unit) */
976 nvc = null_unit(curr_on_cpu(cpu));
977 if ( nvc && !is_idle_unit(nvc->unit) )
978 {
979 printk("\trun: ");
980 dump_unit(prv, nvc);
981 printk("\n");
982 }
983
984 pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
985 }
986
987 static void cf_check null_dump(const struct scheduler *ops)
988 {
989 struct null_private *prv = null_priv(ops);
990 struct list_head *iter;
991 unsigned long flags;
992 unsigned int loop;
993
994 spin_lock_irqsave(&prv->lock, flags);
995
996 printk("\tcpus_free = %*pbl\n", CPUMASK_PR(&prv->cpus_free));
997
998 printk("Domain info:\n");
999 loop = 0;
1000 list_for_each( iter, &prv->ndom )
1001 {
1002 struct null_dom *ndom;
1003 struct sched_unit *unit;
1004
1005 ndom = list_entry(iter, struct null_dom, ndom_elem);
1006
1007 printk("\tDomain: %d\n", ndom->dom->domain_id);
1008 for_each_sched_unit( ndom->dom, unit )
1009 {
1010 struct null_unit * const nvc = null_unit(unit);
1011 spinlock_t *lock;
1012
1013 lock = unit_schedule_lock(unit);
1014
1015 printk("\t%3d: ", ++loop);
1016 dump_unit(prv, nvc);
1017 printk("\n");
1018
1019 unit_schedule_unlock(lock, unit);
1020 }
1021 }
1022
1023 printk("Waitqueue: ");
1024 loop = 0;
1025 spin_lock(&prv->waitq_lock);
1026 list_for_each( iter, &prv->waitq )
1027 {
1028 struct null_unit *nvc = list_entry(iter, struct null_unit, waitq_elem);
1029
1030 if ( loop++ != 0 )
1031 printk(", ");
1032 if ( loop % 24 == 0 )
1033 printk("\n\t");
1034 printk("%pdv%d", nvc->unit->domain, nvc->unit->unit_id);
1035 }
1036 printk("\n");
1037 spin_unlock(&prv->waitq_lock);
1038
1039 spin_unlock_irqrestore(&prv->lock, flags);
1040 }
1041
1042 static const struct scheduler sched_null_def = {
1043 .name = "null Scheduler",
1044 .opt_name = "null",
1045 .sched_id = XEN_SCHEDULER_NULL,
1046 .sched_data = NULL,
1047
1048 .init = null_init,
1049 .deinit = null_deinit,
1050 .alloc_pdata = null_alloc_pdata,
1051 .free_pdata = null_free_pdata,
1052 .switch_sched = null_switch_sched,
1053 .deinit_pdata = null_deinit_pdata,
1054
1055 .alloc_udata = null_alloc_udata,
1056 .free_udata = null_free_udata,
1057 .alloc_domdata = null_alloc_domdata,
1058 .free_domdata = null_free_domdata,
1059
1060 .insert_unit = null_unit_insert,
1061 .remove_unit = null_unit_remove,
1062
1063 .wake = null_unit_wake,
1064 .sleep = null_unit_sleep,
1065 .pick_resource = null_res_pick,
1066 .migrate = null_unit_migrate,
1067 .do_schedule = null_schedule,
1068
1069 .dump_cpu_state = null_dump_pcpu,
1070 .dump_settings = null_dump,
1071 };
1072
1073 REGISTER_SCHEDULER(sched_null_def);
1074