1 /*****************************************************************************
2 * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
 * EDF scheduling is a real-time scheduling algorithm used in embedded systems.
4 *
5 * by Sisu Xi, 2013, Washington University in Saint Louis
6 * Meng Xu, 2014-2016, University of Pennsylvania
7 *
8 * Conversion toward event driven model by Tianyang Chen
9 * and Dagaen Golomb, 2016, University of Pennsylvania
10 *
 * based on the code of the Credit scheduler
12 */
13
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/sched.h>
17 #include <xen/domain.h>
18 #include <xen/delay.h>
19 #include <xen/event.h>
20 #include <xen/time.h>
21 #include <xen/timer.h>
22 #include <xen/perfc.h>
23 #include <xen/softirq.h>
24 #include <asm/atomic.h>
25 #include <xen/errno.h>
26 #include <xen/trace.h>
27 #include <xen/cpu.h>
28 #include <xen/keyhandler.h>
30 #include <xen/err.h>
31 #include <xen/guest_access.h>
32
33 #include "private.h"
34
35 /*
36 * TODO:
37 *
 * Migration compensation and resistance, like credit2, to better use caches;
39 * Lock Holder Problem, using yield?
40 * Self switch problem: UNITs of the same domain may preempt each other;
41 */
42
43 /*
44 * Design:
45 *
 * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
 * theory from the real-time field.
 * At any scheduling point, the UNIT with the earliest deadline has the highest
 * priority. The scheduler always picks the highest-priority UNIT to run on a
 * feasible PCPU.
 * A PCPU is feasible if the UNIT can run on it and (the PCPU is idle or has a
 * lower-priority UNIT running on it).
52 *
 * Each UNIT has a dedicated period, budget and an extratime flag.
 * The deadline of a UNIT is at the end of each period;
 * a UNIT has its budget replenished at the beginning of each period;
 * while scheduled, a UNIT burns its budget.
 * A UNIT needs to finish its budget before its deadline in each period;
 * it discards its unused budget at the end of each period.
 * When a UNIT runs out of budget in a period, if its extratime flag is set,
 * the UNIT increases its priority_level by 1 and refills its budget;
 * otherwise, it has to wait until the next period.
62 *
 * Each UNIT is implemented as a deferrable server.
 * When a UNIT has a task running on it, its budget is continuously burned;
 * when a UNIT has no task but budget left, its budget is preserved.
66 *
67 * Queue scheme:
68 * A global runqueue and a global depletedqueue for each CPU pool.
69 * The runqueue holds all runnable UNITs with budget,
70 * sorted by priority_level and deadline;
71 * The depletedqueue holds all UNITs without budget, unsorted;
72 *
 * Note: cpumasks and cpupools are supported.
74 */
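
/*
 * For illustration (the numbers are only an example, not a recommendation):
 * a UNIT with period = 10ms and budget = 4ms gets a fresh 4ms of budget and
 * a new deadline every 10ms, i.e. a 40% CPU reservation. If it exhausts the
 * 4ms and its extratime flag is clear, it must wait for the next
 * replenishment; if extratime is set, its budget is refilled immediately but
 * its priority_level becomes 1, so it only runs when no level-0 UNIT with
 * budget is runnable.
 */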
75
76 /*
77 * Locking:
78 * A global system lock is used to protect the RunQ and DepletedQ.
79 * The global lock is referenced by sched_res->schedule_lock
80 * from all physical cpus.
81 *
 * The lock is already grabbed when calling the wake/sleep/schedule functions
 * in schedule.c.
 *
 * The functions that manipulate the RunQ and need to grab the lock are:
 *    unit_insert, unit_remove, context_saved, runq_insert
87 */
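
/*
 * A minimal sketch of the locking pattern used throughout this file (these
 * are the generic scheduling helpers already used by the functions below):
 *
 *     spinlock_t *lock = unit_schedule_lock_irq(unit);
 *     ... manipulate the RunQ / DepletedQ / replenishment queue ...
 *     unit_schedule_unlock_irq(lock, unit);
 */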
88
89
90 /*
91 * Default parameters:
 * The default period and budget are 10 ms and 4 ms, respectively.
93 */
94 #define RTDS_DEFAULT_PERIOD (MICROSECS(10000))
95 #define RTDS_DEFAULT_BUDGET (MICROSECS(4000))
96
97 /*
 * Max period: max delta of the time type; the period is added to the time
 * a unit activates, so it must not overflow.
100 * Min period: 10 us, considering the scheduling overhead (when period is
101 * too low, scheduling is invoked too frequently, causing high overhead).
102 */
103 #define RTDS_MAX_PERIOD (STIME_DELTA_MAX)
104 #define RTDS_MIN_PERIOD (MICROSECS(10))
105
106 /*
107 * Min budget: 10 us, considering the scheduling overhead (when budget is
108 * consumed too fast, scheduling is invoked too frequently, causing
109 * high overhead).
110 */
111 #define RTDS_MIN_BUDGET (MICROSECS(10))
112
113 /*
 * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
 * the next deadline, repeated addition is faster if the difference between
 * cur_deadline and now is small. If the difference is bigger than
 * 1024 * period, use a division and a multiplication instead.
118 */
119 #define UPDATE_LIMIT_SHIFT 10
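
/*
 * E.g. (illustrative numbers): with period = 10ms, the threshold is
 * 1024 * 10ms ~= 10s. A unit that wakes up 23ms after its old deadline
 * takes the addition path (three additions); one that wakes up 20s later
 * takes the division/multiplication path instead.
 */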
120
121 /*
122 * Flags
123 */
124 /*
125 * RTDS_scheduled: Is this unit either running on, or context-switching off,
126 * a physical cpu?
127 * + Accessed only with global lock held.
128 * + Set when chosen as next in rt_schedule().
129 * + Cleared after context switch has been saved in rt_context_saved()
130 * + Checked in unit_wake to see if we can add to the Runqueue, or if we should
131 * set RTDS_delayed_runq_add
132 * + Checked to be false in runq_insert.
133 */
134 #define __RTDS_scheduled 1
135 #define RTDS_scheduled (1<<__RTDS_scheduled)
136 /*
 * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
 * once it's done being context switched out?
 * + Set when scheduling out in rt_schedule() if prev is runnable
140 * + Set in rt_unit_wake if it finds RTDS_scheduled set
141 * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
142 * and clears the bit.
143 */
144 #define __RTDS_delayed_runq_add 2
145 #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
146
147 /*
 * RTDS_depleted: Has this unit run out of budget?
 * This flag is
 * + set in burn_budget() if a unit has zero budget left;
 * + cleared and checked in the replenishment handler,
 *   for the units that are being replenished.
153 */
154 #define __RTDS_depleted 3
155 #define RTDS_depleted (1<<__RTDS_depleted)
156
157 /*
158 * RTDS_extratime: Can the unit run in the time that is
159 * not part of any real-time reservation, and would therefore
160 * be otherwise left idle?
161 */
162 #define __RTDS_extratime 4
163 #define RTDS_extratime (1<<__RTDS_extratime)
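
/*
 * For reference, the flag values above work out to:
 *   RTDS_scheduled        = 1 << 1 = 0x02
 *   RTDS_delayed_runq_add = 1 << 2 = 0x04
 *   RTDS_depleted         = 1 << 3 = 0x08
 *   RTDS_extratime        = 1 << 4 = 0x10
 * so, e.g., flags=0x12 in rt_dump_unit() output means "extratime-capable
 * and currently scheduled on a pCPU".
 */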
164
165 /*
166 * rt tracing events ("only" 512 available!). Check
167 * include/public/trace.h for more details.
168 */
169 #define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1)
170 #define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2)
171 #define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3)
172 #define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
173 #define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5)
174 #define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6)
175
176 static void cf_check repl_timer_handler(void *data);
177
178 /*
 * System-wide private data, including the global RunQueue/DepletedQ.
180 * Global lock is referenced by sched_res->schedule_lock from all
181 * physical cpus. It can be grabbed via unit_schedule_lock_irq()
182 */
183 struct rt_private {
184 spinlock_t lock; /* the global coarse-grained lock */
    struct list_head sdom;      /* list of available domains, used for dump */
186
187 struct list_head runq; /* ordered list of runnable units */
188 struct list_head depletedq; /* unordered list of depleted units */
189
190 struct timer repl_timer; /* replenishment timer */
191 struct list_head replq; /* ordered list of units that need replenishment */
192
    cpumask_t tickled;          /* cpus that have been tickled */
194 };
195
196 /*
 * Scheduling unit
198 */
199 struct rt_unit {
200 struct list_head q_elem; /* on the runq/depletedq list */
201 struct list_head replq_elem; /* on the replenishment events list */
202
203 /* UNIT parameters, in nanoseconds */
204 s_time_t period;
205 s_time_t budget;
206
    /* UNIT current information, in nanoseconds */
208 s_time_t cur_budget; /* current budget */
209 s_time_t last_start; /* last start time */
210 s_time_t cur_deadline; /* current deadline for EDF */
211
212 /* Up-pointers */
213 struct rt_dom *sdom;
214 struct sched_unit *unit;
215
216 unsigned priority_level;
217
218 unsigned flags; /* mark __RTDS_scheduled, etc.. */
219 };
220
221 /*
222 * Domain
223 */
224 struct rt_dom {
225 struct list_head sdom_elem; /* link list on rt_priv */
226 struct domain *dom; /* pointer to upper domain */
227 };
228
229 /*
230 * Useful inline functions
231 */
static inline struct rt_private *rt_priv(const struct scheduler *ops)
233 {
234 return ops->sched_data;
235 }
236
static inline struct rt_unit *rt_unit(const struct sched_unit *unit)
238 {
239 return unit->priv;
240 }
241
static inline struct list_head *rt_runq(const struct scheduler *ops)
243 {
244 return &rt_priv(ops)->runq;
245 }
246
static inline struct list_head *rt_depletedq(const struct scheduler *ops)
248 {
249 return &rt_priv(ops)->depletedq;
250 }
251
static inline struct list_head *rt_replq(const struct scheduler *ops)
253 {
254 return &rt_priv(ops)->replq;
255 }
256
static inline bool has_extratime(const struct rt_unit *svc)
258 {
259 return svc->flags & RTDS_extratime;
260 }
261
262 /*
263 * Helper functions for manipulating the runqueue, the depleted queue,
264 * and the replenishment events queue.
265 */
266 static int
unit_on_q(const struct rt_unit *svc)
268 {
269 return !list_empty(&svc->q_elem);
270 }
271
272 static struct rt_unit *cf_check
q_elem(struct list_head *elem)
274 {
275 return list_entry(elem, struct rt_unit, q_elem);
276 }
277
278 static struct rt_unit *cf_check
replq_elem(struct list_head *elem)
280 {
281 return list_entry(elem, struct rt_unit, replq_elem);
282 }
283
284 static int
unit_on_replq(const struct rt_unit *svc)
286 {
287 return !list_empty(&svc->replq_elem);
288 }
289
290 /*
291 * If v1 priority >= v2 priority, return value > 0
292 * Otherwise, return value < 0
293 */
294 static s_time_t
compare_unit_priority(const struct rt_unit *v1, const struct rt_unit *v2)
296 {
297 int prio = v2->priority_level - v1->priority_level;
298
299 if ( prio == 0 )
300 return v2->cur_deadline - v1->cur_deadline;
301
302 return prio;
303 }
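
/*
 * Example (illustrative values): with equal priority_level, a unit with
 * cur_deadline = 30ms beats one with cur_deadline = 50ms (positive return
 * value). A unit at priority_level 0 always beats one at priority_level 1,
 * whatever their deadlines, since level 1 is only reached by burning
 * extratime.
 */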
304
305 /*
306 * Debug related code, dump unit/cpu information
307 */
308 static void
rt_dump_unit(const struct scheduler *ops, const struct rt_unit *svc)
310 {
311 cpumask_t *cpupool_mask, *mask;
312
313 ASSERT(svc != NULL);
314 /* idle unit */
    if ( svc->sdom == NULL )
316 {
317 printk("\n");
318 return;
319 }
320
321 /*
322 * We can't just use 'cpumask_scratch' because the dumping can
323 * happen from a pCPU outside of this scheduler's cpupool, and
324 * hence it's not right to use its pCPU's scratch mask.
325 * On the other hand, it is safe to use sched_unit_master(svc->unit)'s
326 * own scratch space, since we hold the runqueue lock.
327 */
328 mask = cpumask_scratch_cpu(sched_unit_master(svc->unit));
329
330 cpupool_mask = cpupool_domain_master_cpumask(svc->unit->domain);
331 cpumask_and(mask, cpupool_mask, svc->unit->cpu_hard_affinity);
332 printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
333 " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
334 " \t\t priority_level=%d has_extratime=%d\n"
335 " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%*pbl\n",
336 svc->unit->domain->domain_id,
337 svc->unit->unit_id,
338 sched_unit_master(svc->unit),
339 svc->period,
340 svc->budget,
341 svc->cur_budget,
342 svc->cur_deadline,
343 svc->last_start,
344 svc->priority_level,
345 has_extratime(svc),
346 unit_on_q(svc),
347 unit_runnable(svc->unit),
348 svc->flags, CPUMASK_PR(mask));
349 }
350
351 static void cf_check
rt_dump_pcpu(const struct scheduler *ops, int cpu)
353 {
354 struct rt_private *prv = rt_priv(ops);
355 const struct rt_unit *svc;
356 unsigned long flags;
357
358 spin_lock_irqsave(&prv->lock, flags);
359 printk("CPU[%02d]\n", cpu);
360 /* current UNIT (nothing to say if that's the idle unit). */
361 svc = rt_unit(curr_on_cpu(cpu));
362 if ( svc && !is_idle_unit(svc->unit) )
363 {
364 rt_dump_unit(ops, svc);
365 }
366 spin_unlock_irqrestore(&prv->lock, flags);
367 }
368
369 static void cf_check
rt_dump(const struct scheduler *ops)
371 {
372 struct list_head *runq, *depletedq, *replq, *iter;
373 struct rt_private *prv = rt_priv(ops);
374 const struct rt_unit *svc;
375 const struct rt_dom *sdom;
376 unsigned long flags;
377
378 spin_lock_irqsave(&prv->lock, flags);
379
380 if ( list_empty(&prv->sdom) )
381 goto out;
382
383 runq = rt_runq(ops);
384 depletedq = rt_depletedq(ops);
385 replq = rt_replq(ops);
386
387 printk("Global RunQueue info:\n");
388 list_for_each ( iter, runq )
389 {
390 svc = q_elem(iter);
391 rt_dump_unit(ops, svc);
392 }
393
394 printk("Global DepletedQueue info:\n");
395 list_for_each ( iter, depletedq )
396 {
397 svc = q_elem(iter);
398 rt_dump_unit(ops, svc);
399 }
400
401 printk("Global Replenishment Events info:\n");
402 list_for_each ( iter, replq )
403 {
404 svc = replq_elem(iter);
405 rt_dump_unit(ops, svc);
406 }
407
408 printk("Domain info:\n");
409 list_for_each ( iter, &prv->sdom )
410 {
411 const struct sched_unit *unit;
412
413 sdom = list_entry(iter, struct rt_dom, sdom_elem);
414 printk("\tdomain: %d\n", sdom->dom->domain_id);
415
416 for_each_sched_unit ( sdom->dom, unit )
417 {
418 svc = rt_unit(unit);
419 rt_dump_unit(ops, svc);
420 }
421 }
422
423 out:
424 spin_unlock_irqrestore(&prv->lock, flags);
425 }
426
427 /*
428 * update deadline and budget when now >= cur_deadline
429 * it needs to be updated to the deadline of the current period
430 */
431 static void
rt_update_deadline(s_time_t now, struct rt_unit *svc)
433 {
434 ASSERT(now >= svc->cur_deadline);
435 ASSERT(svc->period != 0);
436
437 if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
438 {
439 do
440 svc->cur_deadline += svc->period;
441 while ( svc->cur_deadline <= now );
442 }
443 else
444 {
445 long count = ((now - svc->cur_deadline) / svc->period) + 1;
446 svc->cur_deadline += count * svc->period;
447 }
448
449 /*
    /*
     * svc may be scheduled to run immediately after it misses its deadline.
     * Then rt_update_deadline is called before rt_schedule, which should
     * only deduct the time spent in the current period from the budget.
     */
454 svc->last_start = now;
455 svc->cur_budget = svc->budget;
456 svc->priority_level = 0;
457
458 if ( unlikely(tb_init_done) )
459 {
460 struct {
461 uint16_t unit, dom;
462 uint32_t priority_level;
463 uint64_t cur_deadline, cur_budget;
464 } d = {
465 .dom = svc->unit->domain->domain_id,
466 .unit = svc->unit->unit_id,
467 .priority_level = svc->priority_level,
468 .cur_deadline = svc->cur_deadline,
469 .cur_budget = svc->cur_budget,
470 };
471
472 trace_time(TRC_RTDS_BUDGET_REPLENISH, sizeof(d), &d);
473 }
474
475 return;
476 }
477
478 /*
479 * Helpers for removing and inserting an unit in a queue
480 * that is being kept ordered by the units' deadlines (as EDF
481 * mandates).
482 *
 * For callers' convenience, the removal helper returns true if the unit
 * removed was the one at the front of the queue; similarly, the insertion
 * helper returns true if the inserted unit ended up at the front of the
 * queue (i.e., in both cases, if the unit with the earliest deadline is
 * what we are dealing with).
489 */
490 static inline bool
deadline_queue_remove(struct list_head *queue, struct list_head *elem)
492 {
493 bool first = false;
494
495 if ( queue->next != elem )
496 first = true;
497
498 list_del_init(elem);
499 return !first;
500 }
501
502 static inline bool
deadline_queue_insert(struct rt_unit * (*qelem)(struct list_head *),
504 struct rt_unit *svc, struct list_head *elem,
505 struct list_head *queue)
506 {
507 struct list_head *iter;
508 bool first = true;
509
510 list_for_each ( iter, queue )
511 {
512 const struct rt_unit * iter_svc = (*qelem)(iter);
513 if ( compare_unit_priority(svc, iter_svc) > 0 )
514 break;
515 first = false;
516 }
517 list_add_tail(elem, iter);
518 return first;
519 }
520 #define deadline_runq_insert(...) \
521 deadline_queue_insert(&q_elem, ##__VA_ARGS__)
522 #define deadline_replq_insert(...) \
523 deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
524
525 static inline void
q_remove(struct rt_unit *svc)
527 {
528 ASSERT( unit_on_q(svc) );
529 list_del_init(&svc->q_elem);
530 }
531
532 static inline void
replq_remove(const struct scheduler *ops, struct rt_unit *svc)
534 {
535 struct rt_private *prv = rt_priv(ops);
536 struct list_head *replq = rt_replq(ops);
537
538 ASSERT( unit_on_replq(svc) );
539
540 if ( deadline_queue_remove(replq, &svc->replq_elem) )
541 {
542 /*
543 * The replenishment timer needs to be set to fire when a
544 * replenishment for the unit at the front of the replenishment
         * queue is due. If that is the unit we just removed, we may
546 * need to reprogram the timer.
547 */
548 if ( !list_empty(replq) )
549 {
550 const struct rt_unit *svc_next = replq_elem(replq->next);
551 set_timer(&prv->repl_timer, svc_next->cur_deadline);
552 }
553 else
554 stop_timer(&prv->repl_timer);
555 }
556 }
557
558 /*
559 * Insert svc with budget in RunQ according to EDF:
560 * units with smaller deadlines go first.
561 * Insert svc without budget in DepletedQ unsorted;
562 */
563 static void
runq_insert(const struct scheduler *ops, struct rt_unit *svc)
565 {
566 struct rt_private *prv = rt_priv(ops);
567 struct list_head *runq = rt_runq(ops);
568
569 ASSERT( spin_is_locked(&prv->lock) );
570 ASSERT( !unit_on_q(svc) );
571 ASSERT( unit_on_replq(svc) );
572
573 /* add svc to runq if svc still has budget or its extratime is set */
574 if ( svc->cur_budget > 0 ||
575 has_extratime(svc) )
576 deadline_runq_insert(svc, &svc->q_elem, runq);
577 else
578 list_add(&svc->q_elem, &prv->depletedq);
579 }
580
581 static void
replq_insert(const struct scheduler *ops, struct rt_unit *svc)
583 {
584 struct list_head *replq = rt_replq(ops);
585 struct rt_private *prv = rt_priv(ops);
586
587 ASSERT( !unit_on_replq(svc) );
588
589 /*
590 * The timer may be re-programmed if svc is inserted
591 * at the front of the event list.
592 */
593 if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
594 set_timer(&prv->repl_timer, svc->cur_deadline);
595 }
596
597 /*
 * Removes and re-inserts an event into the replenishment queue.
599 * The aim is to update its position inside the queue, as its
600 * deadline (and hence its replenishment time) could have
601 * changed.
602 */
603 static void
replq_reinsert(const struct scheduler *ops, struct rt_unit *svc)
605 {
606 struct list_head *replq = rt_replq(ops);
607 const struct rt_unit *rearm_svc = svc;
608 bool rearm = false;
609
610 ASSERT( unit_on_replq(svc) );
611
612 /*
613 * If svc was at the front of the replenishment queue, we certainly
614 * need to re-program the timer, and we want to use the deadline of
615 * the unit which is now at the front of the queue (which may still
616 * be svc or not).
617 *
618 * We may also need to re-program, if svc has been put at the front
619 * of the replenishment queue when being re-inserted.
620 */
621 if ( deadline_queue_remove(replq, &svc->replq_elem) )
622 {
623 deadline_replq_insert(svc, &svc->replq_elem, replq);
624 rearm_svc = replq_elem(replq->next);
625 rearm = true;
626 }
627 else
628 rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
629
630 if ( rearm )
631 set_timer(&rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
632 }
633
634 /*
 * Pick a valid resource for the unit.
 * A valid resource for a unit is the intersection of the unit's affinity
 * and the available resources.
638 */
639 static struct sched_resource *cf_check
rt_res_pick_locked(const struct sched_unit *unit, unsigned int locked_cpu)
641 {
642 cpumask_t *cpus = cpumask_scratch_cpu(locked_cpu);
643 const cpumask_t *online;
644 int cpu;
645
646 online = cpupool_domain_master_cpumask(unit->domain);
647 cpumask_and(cpus, online, unit->cpu_hard_affinity);
648
649 cpu = cpumask_test_cpu(sched_unit_master(unit), cpus)
650 ? sched_unit_master(unit)
651 : cpumask_cycle(sched_unit_master(unit), cpus);
652 ASSERT( !cpumask_empty(cpus) && cpumask_test_cpu(cpu, cpus) );
653
654 return get_sched_res(cpu);
655 }
656
657 /*
 * Pick a valid resource for the unit.
 * A valid resource for a unit is the intersection of the unit's affinity
 * and the available resources.
661 */
662 static struct sched_resource *cf_check
rt_res_pick(const struct scheduler *ops, const struct sched_unit *unit)
664 {
665 struct sched_resource *res;
666
667 res = rt_res_pick_locked(unit, unit->res->master_cpu);
668
669 return res;
670 }
671
672 /*
673 * Init/Free related code
674 */
675 static int cf_check
rt_init(struct scheduler *ops)
677 {
678 int rc = -ENOMEM;
679 struct rt_private *prv = xzalloc(struct rt_private);
680
681 printk("Initializing RTDS scheduler\n"
682 "WARNING: This is experimental software in development.\n"
683 "Use at your own risk.\n");
684
685 if ( prv == NULL )
686 goto err;
687
688 spin_lock_init(&prv->lock);
689 INIT_LIST_HEAD(&prv->sdom);
690 INIT_LIST_HEAD(&prv->runq);
691 INIT_LIST_HEAD(&prv->depletedq);
692 INIT_LIST_HEAD(&prv->replq);
693
694 ops->sched_data = prv;
695 rc = 0;
696
697 err:
698 if ( rc )
699 xfree(prv);
700
701 return rc;
702 }
703
704 static void cf_check
rt_deinit(struct scheduler *ops)
706 {
707 struct rt_private *prv = rt_priv(ops);
708
709 ASSERT(prv->repl_timer.status == TIMER_STATUS_invalid ||
710 prv->repl_timer.status == TIMER_STATUS_killed);
711
712 ops->sched_data = NULL;
713 xfree(prv);
714 }
715
716 /* Change the scheduler of cpu to us (RTDS). */
717 static spinlock_t *cf_check
rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
719 void *pdata, void *vdata)
720 {
721 struct rt_private *prv = rt_priv(new_ops);
722 struct rt_unit *svc = vdata;
723
724 ASSERT(!pdata && svc && is_idle_unit(svc->unit));
725
726 /*
727 * We are holding the runqueue lock already (it's been taken in
728 * schedule_cpu_switch()). It's actually the runqueue lock of
729 * another scheduler, but that is how things need to be, for
730 * preventing races.
731 */
732 ASSERT(get_sched_res(cpu)->schedule_lock != &prv->lock);
733
734 /*
735 * If we are the absolute first cpu being switched toward this
736 * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
737 * first one that is added back to the cpupool that had all its cpus
738 * removed (in which case we'll see TIMER_STATUS_killed), it's our
739 * job to (re)initialize the timer.
740 */
741 if ( prv->repl_timer.status == TIMER_STATUS_invalid ||
742 prv->repl_timer.status == TIMER_STATUS_killed )
743 {
744 init_timer(&prv->repl_timer, repl_timer_handler, (void *)new_ops, cpu);
745 dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
746 }
747
748 sched_idle_unit(cpu)->priv = vdata;
749
750 return &prv->lock;
751 }
752
static void move_repl_timer(struct rt_private *prv, unsigned int old_cpu)
754 {
755 cpumask_t *online = get_sched_res(old_cpu)->cpupool->res_valid;
756 unsigned int new_cpu = cpumask_cycle(old_cpu, online);
757
758 /*
     * Make sure the timer runs on one of the cpus that are still available
     * to this scheduler. If there aren't any left, it means it's time to
     * just kill it.
762 */
763 if ( new_cpu >= nr_cpu_ids )
764 {
765 kill_timer(&prv->repl_timer);
766 dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", old_cpu);
767 }
768 else
769 {
770 migrate_timer(&prv->repl_timer, new_cpu);
771 }
772 }
773
774 static void cf_check
rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
776 {
777 unsigned long flags;
778 struct rt_private *prv = rt_priv(ops);
779
780 spin_lock_irqsave(&prv->lock, flags);
781
782 if ( prv->repl_timer.cpu == cpu )
783 move_repl_timer(prv, cpu);
784
785 spin_unlock_irqrestore(&prv->lock, flags);
786 }
787
788 static void cf_check
rt_move_timers(const struct scheduler *ops, struct sched_resource *sr)
790 {
791 unsigned long flags;
792 struct rt_private *prv = rt_priv(ops);
793 unsigned int old_cpu;
794
795 spin_lock_irqsave(&prv->lock, flags);
796
797 old_cpu = prv->repl_timer.cpu;
798 if ( prv->repl_timer.status != TIMER_STATUS_invalid &&
799 prv->repl_timer.status != TIMER_STATUS_killed &&
800 !cpumask_test_cpu(old_cpu, sr->cpupool->res_valid) )
801 move_repl_timer(prv, old_cpu);
802
803 spin_unlock_irqrestore(&prv->lock, flags);
804 }
805
806 static void *cf_check
rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
808 {
809 unsigned long flags;
810 struct rt_dom *sdom;
811 struct rt_private * prv = rt_priv(ops);
812
813 sdom = xzalloc(struct rt_dom);
814 if ( sdom == NULL )
815 return ERR_PTR(-ENOMEM);
816
817 INIT_LIST_HEAD(&sdom->sdom_elem);
818 sdom->dom = dom;
819
820 /* spinlock here to insert the dom */
821 spin_lock_irqsave(&prv->lock, flags);
822 list_add_tail(&sdom->sdom_elem, &(prv->sdom));
823 spin_unlock_irqrestore(&prv->lock, flags);
824
825 return sdom;
826 }
827
828 static void cf_check
rt_free_domdata(const struct scheduler *ops, void *data)
830 {
831 struct rt_dom *sdom = data;
832 struct rt_private *prv = rt_priv(ops);
833
834 if ( sdom )
835 {
836 unsigned long flags;
837
838 spin_lock_irqsave(&prv->lock, flags);
839 list_del_init(&sdom->sdom_elem);
840 spin_unlock_irqrestore(&prv->lock, flags);
841
842 xfree(sdom);
843 }
844 }
845
846 static void * cf_check
rt_alloc_udata(const struct scheduler *ops, struct sched_unit *unit, void *dd)
848 {
849 struct rt_unit *svc;
850
851 /* Allocate per-UNIT info */
852 svc = xzalloc(struct rt_unit);
853 if ( svc == NULL )
854 return NULL;
855
856 INIT_LIST_HEAD(&svc->q_elem);
857 INIT_LIST_HEAD(&svc->replq_elem);
858 svc->flags = 0U;
859 svc->sdom = dd;
860 svc->unit = unit;
861 svc->last_start = 0;
862
863 __set_bit(__RTDS_extratime, &svc->flags);
864 svc->priority_level = 0;
865 svc->period = RTDS_DEFAULT_PERIOD;
866 if ( !is_idle_unit(unit) )
867 svc->budget = RTDS_DEFAULT_BUDGET;
868
869 SCHED_STAT_CRANK(unit_alloc);
870
871 return svc;
872 }
873
874 static void cf_check
rt_free_udata(const struct scheduler *ops, void *priv)
876 {
877 struct rt_unit *svc = priv;
878
879 xfree(svc);
880 }
881
882 /*
 * Called from sched_init_vcpu() and, when moving a domain to a new cpupool,
 * from sched_move_domain() in schedule.c.
 * It inserts the units of the moving domain into the RunQ of the
 * scheduler of the destination cpupool.
888 */
889 static void cf_check
rt_unit_insert(const struct scheduler *ops, struct sched_unit *unit)
891 {
892 struct rt_unit *svc = rt_unit(unit);
893 s_time_t now;
894 spinlock_t *lock;
895 unsigned int cpu = smp_processor_id();
896
897 BUG_ON( is_idle_unit(unit) );
898
899 /* This is safe because unit isn't yet being scheduled */
900 lock = pcpu_schedule_lock_irq(cpu);
901 sched_set_res(unit, rt_res_pick_locked(unit, cpu));
902 pcpu_schedule_unlock_irq(lock, cpu);
903
904 lock = unit_schedule_lock_irq(unit);
905
906 now = NOW();
907 if ( now >= svc->cur_deadline )
908 rt_update_deadline(now, svc);
909
910 if ( !unit_on_q(svc) && unit_runnable(unit) )
911 {
912 replq_insert(ops, svc);
913
914 if ( !unit->is_running )
915 runq_insert(ops, svc);
916 }
917 unit_schedule_unlock_irq(lock, unit);
918
919 SCHED_STAT_CRANK(unit_insert);
920 }
921
922 /*
923 * Remove rt_unit svc from the old scheduler in source cpupool.
924 */
925 static void cf_check
rt_unit_remove(const struct scheduler *ops, struct sched_unit *unit)
927 {
928 struct rt_unit * const svc = rt_unit(unit);
929 struct rt_dom * const sdom = svc->sdom;
930 spinlock_t *lock;
931
932 SCHED_STAT_CRANK(unit_remove);
933
934 BUG_ON( sdom == NULL );
935
936 lock = unit_schedule_lock_irq(unit);
937 if ( unit_on_q(svc) )
938 q_remove(svc);
939
940 if ( unit_on_replq(svc) )
941 replq_remove(ops,svc);
942
943 unit_schedule_unlock_irq(lock, unit);
944 }
945
946 /*
947 * Burn budget in nanosecond granularity
948 */
949 static void
burn_budget(const struct scheduler *ops, struct rt_unit *svc, s_time_t now)
951 {
952 s_time_t delta;
953
954 /* don't burn budget for idle UNIT */
955 if ( is_idle_unit(svc->unit) )
956 return;
957
958 /* burn at nanoseconds level */
959 delta = now - svc->last_start;
960 /*
961 * delta < 0 only happens in nested virtualization;
962 * TODO: how should we handle delta < 0 in a better way?
963 */
964 if ( delta < 0 )
965 {
966 printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
967 __func__, delta);
968 svc->last_start = now;
969 return;
970 }
971
972 svc->cur_budget -= delta;
973 svc->last_start = now;
974
975 if ( svc->cur_budget <= 0 )
976 {
977 if ( has_extratime(svc) )
978 {
979 svc->priority_level++;
980 svc->cur_budget = svc->budget;
981 }
982 else
983 {
984 svc->cur_budget = 0;
985 __set_bit(__RTDS_depleted, &svc->flags);
986 }
987 }
988
989 if ( unlikely(tb_init_done) )
990 {
991 struct __packed {
992 uint16_t unit, dom;
993 uint64_t cur_budget;
994 uint32_t delta;
995 uint32_t priority_level;
996 uint32_t has_extratime;
997 } d = {
998 .unit = svc->unit->unit_id,
999 .dom = svc->unit->domain->domain_id,
1000 .cur_budget = svc->cur_budget,
1001 .delta = delta, /* TODO: truncation? */
1002 .priority_level = svc->priority_level,
1003 .has_extratime = !!(svc->flags & RTDS_extratime),
1004 };
1005
1006 trace_time(TRC_RTDS_BUDGET_BURN, sizeof(d), &d);
1007 }
1008 }
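
/*
 * E.g. (illustrative): a unit with cur_budget = 2ms that last started at
 * t = 100ms and is charged at now = 103ms has overrun by 1ms: cur_budget
 * drops to -1ms, so it is either marked RTDS_depleted (and ends up on the
 * DepletedQ at the next scheduling decision), or, with extratime set,
 * refilled on the spot at priority_level + 1.
 */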
1009
1010 /*
 * The RunQ is sorted. Pick the first unit within the cpumask; if there is
 * none, return NULL. The lock is grabbed before calling this function.
1013 */
1014 static struct rt_unit *
runq_pick(const struct scheduler *ops, const cpumask_t *mask, unsigned int cpu)
1016 {
1017 struct list_head *runq = rt_runq(ops);
1018 struct list_head *iter;
1019 struct rt_unit *svc = NULL;
1020 struct rt_unit *iter_svc = NULL;
1021 cpumask_t *cpu_common = cpumask_scratch_cpu(cpu);
1022 const cpumask_t *online;
1023
1024 list_for_each ( iter, runq )
1025 {
1026 iter_svc = q_elem(iter);
1027
1028 /* mask cpu_hard_affinity & cpupool & mask */
1029 online = cpupool_domain_master_cpumask(iter_svc->unit->domain);
1030 cpumask_and(cpu_common, online, iter_svc->unit->cpu_hard_affinity);
1031 cpumask_and(cpu_common, mask, cpu_common);
1032 if ( cpumask_empty(cpu_common) )
1033 continue;
1034
1035 ASSERT( iter_svc->cur_budget > 0 );
1036
1037 svc = iter_svc;
1038 break;
1039 }
1040
1041 if ( unlikely(tb_init_done) && svc )
1042 {
1043 struct __packed {
1044 uint16_t unit, dom;
1045 uint64_t cur_deadline, cur_budget;
1046 } d = {
1047 .unit = svc->unit->unit_id,
1048 .dom = svc->unit->domain->domain_id,
1049 .cur_deadline = svc->cur_deadline,
1050 .cur_budget = svc->cur_budget,
1051 };
1052
1053 trace_time(TRC_RTDS_RUNQ_PICK, sizeof(d), &d);
1054 }
1055
1056 return svc;
1057 }
1058
1059 /*
1060 * schedule function for rt scheduler.
1061 * The lock is already grabbed in schedule.c, no need to lock here
1062 */
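/*
 * In outline (as implemented below): charge the outgoing unit for the time
 * it just ran, then either switch to the idle unit (if tasklet work is
 * pending) or pick the earliest-deadline runnable unit from the RunQ,
 * keeping the current unit if it is runnable, still has budget and has
 * strictly higher priority than the candidate (or the candidate is idle).
 */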
1063 static void cf_check
rt_schedule(const struct scheduler *ops, struct sched_unit *currunit,
1065 s_time_t now, bool tasklet_work_scheduled)
1066 {
1067 const unsigned int cur_cpu = smp_processor_id();
1068 const unsigned int sched_cpu = sched_get_resource_cpu(cur_cpu);
1069 struct rt_private *prv = rt_priv(ops);
1070 struct rt_unit *const scurr = rt_unit(currunit);
1071 struct rt_unit *snext = NULL;
1072 bool migrated = false;
1073
1074 if ( unlikely(tb_init_done) )
1075 {
1076 struct {
1077 uint16_t cpu;
1078 uint8_t tasklet, tickled:4, idle:4;
1079 } d = {
1080 .cpu = cur_cpu,
1081 .tasklet = tasklet_work_scheduled,
1082 .tickled = cpumask_test_cpu(sched_cpu, &prv->tickled),
1083 .idle = is_idle_unit(currunit),
1084 };
1085
1086 trace_time(TRC_RTDS_SCHEDULE, sizeof(d), &d);
1087 }
1088
    /* Clear the tickled bit now that we've been scheduled */
1090 cpumask_clear_cpu(sched_cpu, &prv->tickled);
1091
    /* burn_budget() returns early for the idle UNIT */
1093 burn_budget(ops, scurr, now);
1094
1095 if ( tasklet_work_scheduled )
1096 {
1097 TRACE_TIME(TRC_RTDS_SCHED_TASKLET);
1098 snext = rt_unit(sched_idle_unit(sched_cpu));
1099 }
1100 else
1101 {
1102 while ( true )
1103 {
1104 snext = runq_pick(ops, cpumask_of(sched_cpu), cur_cpu);
1105
1106 if ( snext == NULL )
1107 {
1108 snext = rt_unit(sched_idle_unit(sched_cpu));
1109 break;
1110 }
1111 if ( unit_runnable_state(snext->unit) )
1112 break;
1113
1114 q_remove(snext);
1115 replq_remove(ops, snext);
1116 }
1117
1118 /* if scurr has higher priority and budget, still pick scurr */
1119 if ( !is_idle_unit(currunit) &&
1120 unit_runnable_state(currunit) &&
1121 scurr->cur_budget > 0 &&
1122 ( is_idle_unit(snext->unit) ||
1123 compare_unit_priority(scurr, snext) > 0 ) )
1124 snext = scurr;
1125 }
1126
1127 if ( snext != scurr &&
1128 !is_idle_unit(currunit) &&
1129 unit_runnable(currunit) )
1130 __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
1131
1132 snext->last_start = now;
1133 currunit->next_time = -1; /* if an idle unit is picked */
1134 if ( !is_idle_unit(snext->unit) )
1135 {
1136 if ( snext != scurr )
1137 {
1138 q_remove(snext);
1139 __set_bit(__RTDS_scheduled, &snext->flags);
1140 }
1141 if ( sched_unit_master(snext->unit) != sched_cpu )
1142 {
1143 sched_set_res(snext->unit, get_sched_res(sched_cpu));
1144 migrated = true;
1145 }
1146 /* Invoke the scheduler next time. */
1147 currunit->next_time = snext->cur_budget;
1148 }
1149 currunit->next_task = snext->unit;
1150 snext->unit->migrated = migrated;
1151 }
1152
1153 /*
1154 * Remove UNIT from RunQ
1155 * The lock is already grabbed in schedule.c, no need to lock here
1156 */
1157 static void cf_check
rt_unit_sleep(const struct scheduler *ops, struct sched_unit *unit)
1159 {
1160 struct rt_unit * const svc = rt_unit(unit);
1161
1162 BUG_ON( is_idle_unit(unit) );
1163 SCHED_STAT_CRANK(unit_sleep);
1164
1165 if ( curr_on_cpu(sched_unit_master(unit)) == unit )
1166 cpu_raise_softirq(sched_unit_master(unit), SCHEDULE_SOFTIRQ);
1167 else if ( unit_on_q(svc) )
1168 {
1169 q_remove(svc);
1170 replq_remove(ops, svc);
1171 }
1172 else if ( svc->flags & RTDS_delayed_runq_add )
1173 __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
1174 }
1175
1176 /*
1177 * Pick a cpu where to run an unit,
1178 * possibly kicking out the unit running there
1179 * Called by wake() and context_saved()
 * We have a runnable candidate here; the kick logic is:
 * among all the cpus within the unit's affinity,
 * 1) if there is any idle CPU, kick it;
 *    for cache benefit, we check new->cpu first;
 * 2) if all pcpus are busy, pick the lowest-priority running unit among
 *    them and, if the candidate has higher priority, kick that cpu.
 *
 * TODO:
 * 1) what if the two units belong to the same domain?
 *    Preempting a unit of the same domain introduces more overhead.
1191 *
1192 * lock is grabbed before calling this function
1193 */
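/*
 * E.g. (illustrative): if 'new' has a 20ms deadline, its affinity spans
 * CPUs 0-2, CPU1 is idle and CPU2 runs a unit with a 50ms deadline, step 1)
 * tickles CPU1. Were no CPU idle, step 2) would tickle CPU2, since its
 * running unit has the latest deadline and that deadline is later than 20ms.
 */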
1194 static void
runq_tickle(const struct scheduler *ops, const struct rt_unit *new)
1196 {
1197 struct rt_private *prv = rt_priv(ops);
1198 const struct rt_unit *latest_deadline_unit = NULL; /* lowest priority */
1199 const struct rt_unit *iter_svc;
1200 const struct sched_unit *iter_unit;
1201 int cpu = 0, cpu_to_tickle = 0;
1202 cpumask_t *not_tickled = cpumask_scratch_cpu(smp_processor_id());
1203 const cpumask_t *online;
1204
1205 if ( new == NULL || is_idle_unit(new->unit) )
1206 return;
1207
1208 online = cpupool_domain_master_cpumask(new->unit->domain);
1209 cpumask_and(not_tickled, online, new->unit->cpu_hard_affinity);
1210 cpumask_andnot(not_tickled, not_tickled, &prv->tickled);
1211
1212 /*
     * 1) If there are any idle CPUs, kick one.
     *    For cache benefit, we first check new->cpu.
     *    The same loop also finds the one with the lowest priority.
1216 */
1217 cpu = cpumask_test_or_cycle(sched_unit_master(new->unit), not_tickled);
    while ( cpu != nr_cpu_ids )
1219 {
1220 iter_unit = curr_on_cpu(cpu);
1221 if ( is_idle_unit(iter_unit) )
1222 {
1223 SCHED_STAT_CRANK(tickled_idle_cpu);
1224 cpu_to_tickle = cpu;
1225 goto out;
1226 }
1227 iter_svc = rt_unit(iter_unit);
1228 if ( latest_deadline_unit == NULL ||
1229 compare_unit_priority(iter_svc, latest_deadline_unit) < 0 )
1230 latest_deadline_unit = iter_svc;
1231
1232 cpumask_clear_cpu(cpu, not_tickled);
1233 cpu = cpumask_cycle(cpu, not_tickled);
1234 }
1235
    /* 2) candidate has higher priority, kick out the lowest-priority unit */
1237 if ( latest_deadline_unit != NULL &&
1238 compare_unit_priority(latest_deadline_unit, new) < 0 )
1239 {
1240 SCHED_STAT_CRANK(tickled_busy_cpu);
1241 cpu_to_tickle = sched_unit_master(latest_deadline_unit->unit);
1242 goto out;
1243 }
1244
1245 /* didn't tickle any cpu */
1246 SCHED_STAT_CRANK(tickled_no_cpu);
1247 return;
1248 out:
1249 if ( unlikely(tb_init_done) )
1250 {
1251 struct {
1252 uint16_t cpu, _pad;
1253 } d = {
1254 .cpu = cpu_to_tickle,
1255 };
1256
1257 trace_time(TRC_RTDS_TICKLE, sizeof(d), &d);
1258 }
1259
1260 cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
1261 cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
1262 return;
1263 }
1264
1265 /*
 * Should always wake up a runnable unit and put it back on the RunQ.
 * Check priority to decide whether to raise an interrupt.
 * The lock is already grabbed in schedule.c, no need to lock here.
 * TODO: what if the two units belong to the same domain?
1270 */
1271 static void cf_check
rt_unit_wake(const struct scheduler *ops, struct sched_unit *unit)
1273 {
1274 struct rt_unit * const svc = rt_unit(unit);
1275 s_time_t now;
1276 bool missed;
1277
1278 BUG_ON( is_idle_unit(unit) );
1279
1280 if ( unlikely(curr_on_cpu(sched_unit_master(unit)) == unit) )
1281 {
1282 SCHED_STAT_CRANK(unit_wake_running);
1283 return;
1284 }
1285
1286 /* on RunQ/DepletedQ, just update info is ok */
1287 if ( unlikely(unit_on_q(svc)) )
1288 {
1289 SCHED_STAT_CRANK(unit_wake_onrunq);
1290 return;
1291 }
1292
1293 if ( likely(unit_runnable(unit)) )
1294 SCHED_STAT_CRANK(unit_wake_runnable);
1295 else
1296 SCHED_STAT_CRANK(unit_wake_not_runnable);
1297
1298 /*
1299 * If a deadline passed while svc was asleep/blocked, we need new
1300 * scheduling parameters (a new deadline and full budget).
1301 */
1302 now = NOW();
1303
1304 missed = ( now >= svc->cur_deadline );
1305 if ( missed )
1306 rt_update_deadline(now, svc);
1307
1308 /*
1309 * If context hasn't been saved for this unit yet, we can't put it on
1310 * the run-queue/depleted-queue. Instead, we set the appropriate flag,
     * the unit will be put back on the queue after the context has been
     * saved (in rt_context_saved()).
1313 */
1314 if ( unlikely(svc->flags & RTDS_scheduled) )
1315 {
1316 __set_bit(__RTDS_delayed_runq_add, &svc->flags);
1317 /*
         * The unit is waking up already, and we didn't even have the time to
1319 * remove its next replenishment event from the replenishment queue
1320 * when it blocked! No big deal. If we did not miss the deadline in
1321 * the meantime, let's just leave it there. If we did, let's remove it
1322 * and queue a new one (to occur at our new deadline).
1323 */
1324 if ( missed )
1325 replq_reinsert(ops, svc);
1326 return;
1327 }
1328
1329 /* Replenishment event got cancelled when we blocked. Add it back. */
1330 replq_insert(ops, svc);
1331 /* insert svc to runq/depletedq because svc is not in queue now */
1332 runq_insert(ops, svc);
1333
1334 runq_tickle(ops, svc);
1335 }
1336
1337 /*
 * scurr has finished its context switch; insert it back into the RunQ,
 * and then pick the highest-priority unit from the runq to run.
1340 */
1341 static void cf_check
rt_context_saved(const struct scheduler *ops, struct sched_unit *unit)
1343 {
1344 struct rt_unit *svc = rt_unit(unit);
1345 spinlock_t *lock = unit_schedule_lock_irq(unit);
1346
1347 __clear_bit(__RTDS_scheduled, &svc->flags);
    /* Do not insert the idle unit into the runq */
1349 if ( is_idle_unit(unit) )
1350 goto out;
1351
1352 if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
1353 likely(unit_runnable(unit)) )
1354 {
1355 runq_insert(ops, svc);
1356 runq_tickle(ops, svc);
1357 }
1358 else
1359 replq_remove(ops, svc);
1360
1361 out:
1362 unit_schedule_unlock_irq(lock, unit);
1363 }
1364
1365 /*
 * Set/get the scheduling parameters of each unit of a domain
1367 */
1368 static int cf_check
rt_dom_cntl(
1370 const struct scheduler *ops,
1371 struct domain *d,
1372 struct xen_domctl_scheduler_op *op)
1373 {
1374 struct rt_private *prv = rt_priv(ops);
1375 struct rt_unit *svc;
1376 const struct sched_unit *unit;
1377 unsigned long flags;
1378 int rc = 0;
1379 struct xen_domctl_schedparam_vcpu local_sched;
1380 s_time_t period, budget;
1381 uint32_t index = 0;
1382
1383 switch ( op->cmd )
1384 {
1385 case XEN_DOMCTL_SCHEDOP_getinfo:
1386 /* Return the default parameters. */
1387 op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
1388 op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
1389 break;
1390 case XEN_DOMCTL_SCHEDOP_putinfo:
1391 if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
1392 {
1393 rc = -EINVAL;
1394 break;
1395 }
1396 spin_lock_irqsave(&prv->lock, flags);
1397 for_each_sched_unit ( d, unit )
1398 {
1399 svc = rt_unit(unit);
            svc->period = MICROSECS(op->u.rtds.period); /* convert to nanosec */
1401 svc->budget = MICROSECS(op->u.rtds.budget);
1402 }
1403 spin_unlock_irqrestore(&prv->lock, flags);
1404 break;
1405 case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1406 case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1407 while ( index < op->u.v.nr_vcpus )
1408 {
1409 if ( copy_from_guest_offset(&local_sched,
1410 op->u.v.vcpus, index, 1) )
1411 {
1412 rc = -EFAULT;
1413 break;
1414 }
1415 if ( local_sched.vcpuid >= d->max_vcpus ||
1416 d->vcpu[local_sched.vcpuid] == NULL )
1417 {
1418 rc = -EINVAL;
1419 break;
1420 }
1421
1422 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
1423 {
1424 spin_lock_irqsave(&prv->lock, flags);
1425 svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
1426 local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
1427 local_sched.u.rtds.period = svc->period / MICROSECS(1);
1428 if ( has_extratime(svc) )
1429 local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
1430 else
1431 local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
1432 spin_unlock_irqrestore(&prv->lock, flags);
1433
1434 if ( copy_to_guest_offset(op->u.v.vcpus, index,
1435 &local_sched, 1) )
1436 {
1437 rc = -EFAULT;
1438 break;
1439 }
1440 }
1441 else
1442 {
1443 period = MICROSECS(local_sched.u.rtds.period);
1444 budget = MICROSECS(local_sched.u.rtds.budget);
1445 if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
1446 budget > period || period < RTDS_MIN_PERIOD )
1447 {
1448 rc = -EINVAL;
1449 break;
1450 }
1451
1452 spin_lock_irqsave(&prv->lock, flags);
1453 svc = rt_unit(d->vcpu[local_sched.vcpuid]->sched_unit);
1454 svc->period = period;
1455 svc->budget = budget;
1456 if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
1457 __set_bit(__RTDS_extratime, &svc->flags);
1458 else
1459 __clear_bit(__RTDS_extratime, &svc->flags);
1460 spin_unlock_irqrestore(&prv->lock, flags);
1461 }
            /* Process at most 64 vCPUs without checking for preemptions. */
1463 if ( (++index > 63) && hypercall_preempt_check() )
1464 break;
1465 }
1466 if ( !rc )
            /* Notify the caller how many vCPUs have been processed. */
1468 op->u.v.nr_vcpus = index;
1469 break;
1470 }
1471
1472 return rc;
1473 }
1474
1475 /*
1476 * The replenishment timer handler picks units
1477 * from the replq and does the actual replenishment.
1478 */
static void cf_check repl_timer_handler(void *data)
1480 {
1481 s_time_t now;
1482 const struct scheduler *ops = data;
1483 struct rt_private *prv = rt_priv(ops);
1484 struct list_head *replq = rt_replq(ops);
1485 struct list_head *runq = rt_runq(ops);
1486 struct list_head *iter, *tmp;
1487 struct rt_unit *svc;
1488 LIST_HEAD(tmp_replq);
1489
1490 spin_lock_irq(&prv->lock);
1491
1492 now = NOW();
1493
1494 /*
1495 * Do the replenishment and move replenished units
1496 * to the temporary list to tickle.
1497 * If svc is on run queue, we need to put it at
1498 * the correct place since its deadline changes.
1499 */
1500 list_for_each_safe ( iter, tmp, replq )
1501 {
1502 svc = replq_elem(iter);
1503
1504 if ( now < svc->cur_deadline )
1505 break;
1506
1507 list_del(&svc->replq_elem);
1508 rt_update_deadline(now, svc);
1509 list_add(&svc->replq_elem, &tmp_replq);
1510
1511 if ( unit_on_q(svc) )
1512 {
1513 q_remove(svc);
1514 runq_insert(ops, svc);
1515 }
1516 }
1517
1518 /*
1519 * Iterate through the list of updated units.
1520 * If an updated unit is running, tickle the head of the
1521 * runqueue if it has a higher priority.
1522 * If an updated unit was depleted and on the runqueue, tickle it.
     * Finally, reinsert the units back into the replenishment events list.
1524 */
1525 list_for_each_safe ( iter, tmp, &tmp_replq )
1526 {
1527 svc = replq_elem(iter);
1528
1529 if ( curr_on_cpu(sched_unit_master(svc->unit)) == svc->unit &&
1530 !list_empty(runq) )
1531 {
1532 struct rt_unit *next_on_runq = q_elem(runq->next);
1533
1534 if ( compare_unit_priority(svc, next_on_runq) < 0 )
1535 runq_tickle(ops, next_on_runq);
1536 }
1537 else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
1538 unit_on_q(svc) )
1539 runq_tickle(ops, svc);
1540
1541 list_del(&svc->replq_elem);
1542 deadline_replq_insert(svc, &svc->replq_elem, replq);
1543 }
1544
1545 /*
1546 * If there are units left in the replenishment event list,
1547 * set the next replenishment to happen at the deadline of
1548 * the one in the front.
1549 */
1550 if ( !list_empty(replq) )
1551 set_timer(&prv->repl_timer, replq_elem(replq->next)->cur_deadline);
1552
1553 spin_unlock_irq(&prv->lock);
1554 }
1555
1556 static const struct scheduler sched_rtds_def = {
1557 .name = "SMP RTDS Scheduler",
1558 .opt_name = "rtds",
1559 .sched_id = XEN_SCHEDULER_RTDS,
1560 .sched_data = NULL,
1561
1562 .dump_cpu_state = rt_dump_pcpu,
1563 .dump_settings = rt_dump,
1564 .init = rt_init,
1565 .deinit = rt_deinit,
1566 .switch_sched = rt_switch_sched,
1567 .deinit_pdata = rt_deinit_pdata,
1568 .alloc_domdata = rt_alloc_domdata,
1569 .free_domdata = rt_free_domdata,
1570 .alloc_udata = rt_alloc_udata,
1571 .free_udata = rt_free_udata,
1572 .insert_unit = rt_unit_insert,
1573 .remove_unit = rt_unit_remove,
1574
1575 .adjust = rt_dom_cntl,
1576
1577 .pick_resource = rt_res_pick,
1578 .do_schedule = rt_schedule,
1579 .sleep = rt_unit_sleep,
1580 .wake = rt_unit_wake,
1581 .context_saved = rt_context_saved,
1582 .move_timers = rt_move_timers,
1583 };
1584
1585 REGISTER_SCHEDULER(sched_rtds_def);
1586