1 /*****************************************************************************
2 * Preemptive Global Earliest Deadline First (EDF) scheduler for Xen
3 * EDF scheduling is a real-time scheduling algorithm used in the embedded field.
4 *
5 * by Sisu Xi, 2013, Washington University in Saint Louis
6 * Meng Xu, 2014-2016, University of Pennsylvania
7 *
8 * Conversion toward event driven model by Tianyang Chen
9 * and Dagaen Golomb, 2016, University of Pennsylvania
10 *
11 * based on the code of the Credit scheduler
12 */
13
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/sched.h>
17 #include <xen/domain.h>
18 #include <xen/delay.h>
19 #include <xen/event.h>
20 #include <xen/time.h>
21 #include <xen/timer.h>
22 #include <xen/perfc.h>
23 #include <xen/sched-if.h>
24 #include <xen/softirq.h>
25 #include <asm/atomic.h>
26 #include <xen/errno.h>
27 #include <xen/trace.h>
28 #include <xen/cpu.h>
29 #include <xen/keyhandler.h>
30 #include <xen/trace.h>
31 #include <xen/err.h>
32 #include <xen/guest_access.h>
33
34 /*
35 * TODO:
36 *
37 * Migration compensation and resistance, like credit2, to better use the cache;
38 * Lock Holder Problem, using yield?
39 * Self switch problem: VCPUs of the same domain may preempt each other;
40 */
41
42 /*
43 * Design:
44 *
45 * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
46 * theory from the real-time field.
47 * At any scheduling point, the VCPU with the earlier deadline has higher priority.
48 * The scheduler always picks the highest-priority VCPU to run on a feasible PCPU.
49 * A PCPU is feasible if the VCPU can run on this PCPU and (the PCPU is idle or
50 * has a lower-priority VCPU running on it).
51 *
52 * Each VCPU has a dedicated period, budget and an extratime flag.
53 * The deadline of a VCPU is at the end of each period;
54 * A VCPU has its budget replenished at the beginning of each period;
55 * While scheduled, a VCPU burns its budget.
56 * The VCPU needs to finish its budget before its deadline in each period;
57 * The VCPU discards its unused budget at the end of each period.
58 * When a VCPU runs out of budget in a period, if its extratime flag is set,
59 * the VCPU increases its priority_level by 1 and refills its budget; otherwise,
60 * it has to wait until the next period.
61 *
62 * Each VCPU is implemented as a deferrable server.
63 * When a VCPU has a task running on it, its budget is continuously burned;
64 * When a VCPU has no task but with budget left, its budget is preserved.
65 *
66 * Queue scheme:
67 * A global runqueue and a global depletedqueue for each CPU pool.
68 * The runqueue holds all runnable VCPUs with budget,
69 * sorted by priority_level and deadline;
70 * The depletedqueue holds all VCPUs without budget, unsorted;
71 *
72 * Note: cpumask and cpupool are supported.
73 */
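/*
 * Illustrative example of the model above (the numbers are made up, not
 * taken from any real workload): a VCPU with period = 10ms and budget = 4ms
 * whose current period started at t = 20ms has cur_deadline = 30ms and gets
 * a fresh cur_budget of 4ms at t = 20ms. If it exhausts that budget at
 * t = 26ms:
 * - without the extratime flag it is parked on the depletedqueue until the
 *   replenishment at t = 30ms;
 * - with the extratime flag it keeps running, but at priority_level 1, so
 *   any VCPU still at priority_level 0 (i.e., within its reservation) is
 *   preferred over it.
 */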
74
75 /*
76 * Locking:
77 * A global system lock is used to protect the RunQ and DepletedQ.
78 * The global lock is referenced by schedule_data.schedule_lock
79 * from all physical cpus.
80 *
81 * The lock is already grabbed when calling the wake/sleep/schedule functions
82 * in schedule.c
83 *
84 * The functions that manipulate the RunQ and need to grab the lock are:
85 * vcpu_insert, vcpu_remove, context_saved, runq_insert
86 */
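/*
 * For reference, functions that are not entered via schedule.c (e.g. the
 * dump handlers and rt_dom_cntl() below) take the global lock themselves,
 * following the usual pattern (sketch):
 *
 *     spin_lock_irqsave(&prv->lock, flags);
 *     ... manipulate runq / depletedq / replq ...
 *     spin_unlock_irqrestore(&prv->lock, flags);
 */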
87
88
89 /*
90 * Default parameters:
91 * The default period and budget are 10 ms and 4 ms, respectively.
92 */
93 #define RTDS_DEFAULT_PERIOD (MICROSECS(10000))
94 #define RTDS_DEFAULT_BUDGET (MICROSECS(4000))
95
96 /*
97 * Max period: the max delta of the time type; the period is added to the
98 * time at which a vcpu activates, so this must not overflow.
99 * Min period: 10 us, considering the scheduling overhead (when period is
100 * too low, scheduling is invoked too frequently, causing high overhead).
101 */
102 #define RTDS_MAX_PERIOD (STIME_DELTA_MAX)
103 #define RTDS_MIN_PERIOD (MICROSECS(10))
104
105 /*
106 * Min budget: 10 us, considering the scheduling overhead (when budget is
107 * consumed too fast, scheduling is invoked too frequently, causing
108 * high overhead).
109 */
110 #define RTDS_MIN_BUDGET (MICROSECS(10))
111
112 /*
113 * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
114 * the next deadline, performing addition could be faster if the difference
115 * between cur_deadline and now is small. If the difference is bigger than
116 * 1024 * period, use multiplication.
117 */
118 #define UPDATE_LIMIT_SHIFT 10
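/*
 * Worked example (hypothetical numbers): with period = 10ms, the additive
 * loop in rt_update_deadline() is used while now lags cur_deadline by less
 * than 1024 * 10ms ~= 10.24s, i.e. the common case of a vcpu that has been
 * blocked only briefly. A vcpu that slept for, say, a minute takes the
 * division path instead: count = (60s / 10ms) + 1 additions are folded into
 * a single multiply-and-add.
 */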
119
120 /*
121 * Flags
122 */
123 /*
124 * RTDS_scheduled: Is this vcpu either running on, or context-switching off,
125 * a physical cpu?
126 * + Accessed only with global lock held.
127 * + Set when chosen as next in rt_schedule().
128 * + Cleared after context switch has been saved in rt_context_saved()
129 * + Checked in vcpu_wake to see if we can add to the Runqueue, or if we should
130 * set RTDS_delayed_runq_add
131 * + Checked to be false in runq_insert.
132 */
133 #define __RTDS_scheduled 1
134 #define RTDS_scheduled (1<<__RTDS_scheduled)
135 /*
136 * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
137 * once it's done being context switched out?
138 * + Set when scheduling out in rt_schedule() if prev is runnable
139 * + Set in rt_vcpu_wake if it finds RTDS_scheduled set
140 * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
141 * and clears the bit.
142 */
143 #define __RTDS_delayed_runq_add 2
144 #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
145
146 /*
147 * RTDS_depleted: Has this vcpu run out of budget?
148 * This flag is
149 * + set in burn_budget() if a vcpu has zero budget left;
150 * + cleared and checked in the replenishment handler,
151 * for the vcpus that are being replenished.
152 */
153 #define __RTDS_depleted 3
154 #define RTDS_depleted (1<<__RTDS_depleted)
155
156 /*
157 * RTDS_extratime: Can the vcpu run in the time that is
158 * not part of any real-time reservation, and would therefore
159 * be otherwise left idle?
160 */
161 #define __RTDS_extratime 4
162 #define RTDS_extratime (1<<__RTDS_extratime)
163
164 /*
165 * rt tracing events ("only" 512 available!). Check
166 * include/public/trace.h for more details.
167 */
168 #define TRC_RTDS_TICKLE TRC_SCHED_CLASS_EVT(RTDS, 1)
169 #define TRC_RTDS_RUNQ_PICK TRC_SCHED_CLASS_EVT(RTDS, 2)
170 #define TRC_RTDS_BUDGET_BURN TRC_SCHED_CLASS_EVT(RTDS, 3)
171 #define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
172 #define TRC_RTDS_SCHED_TASKLET TRC_SCHED_CLASS_EVT(RTDS, 5)
173 #define TRC_RTDS_SCHEDULE TRC_SCHED_CLASS_EVT(RTDS, 6)
174
175 static void repl_timer_handler(void *data);
176
177 /*
178 * System-wide private data, including the global RunQueue/DepletedQ.
179 * Global lock is referenced by schedule_data.schedule_lock from all
180 * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
181 */
182 struct rt_private {
183 spinlock_t lock; /* the global coarse-grained lock */
184 struct list_head sdom; /* list of available domains, used for dump */
185
186 struct list_head runq; /* ordered list of runnable vcpus */
187 struct list_head depletedq; /* unordered list of depleted vcpus */
188
189 struct timer *repl_timer; /* replenishment timer */
190 struct list_head replq; /* ordered list of vcpus that need replenishment */
191
192 cpumask_t tickled; /* cpus that have been tickled */
193 };
194
195 /*
196 * Virtual CPU
197 */
198 struct rt_vcpu {
199 struct list_head q_elem; /* on the runq/depletedq list */
200 struct list_head replq_elem; /* on the replenishment events list */
201
202 /* VCPU parameters, in nanoseconds */
203 s_time_t period;
204 s_time_t budget;
205
206 /* VCPU's current information, in nanoseconds */
207 s_time_t cur_budget; /* current budget */
208 s_time_t last_start; /* last start time */
209 s_time_t cur_deadline; /* current deadline for EDF */
210
211 /* Up-pointers */
212 struct rt_dom *sdom;
213 struct vcpu *vcpu;
214
215 unsigned priority_level;
216
217 unsigned flags; /* mark __RTDS_scheduled, etc.. */
218 };
219
220 /*
221 * Domain
222 */
223 struct rt_dom {
224 struct list_head sdom_elem; /* link list on rt_priv */
225 struct domain *dom; /* pointer to upper domain */
226 };
227
228 /*
229 * Useful inline functions
230 */
231 static inline struct rt_private *rt_priv(const struct scheduler *ops)
232 {
233 return ops->sched_data;
234 }
235
236 static inline struct rt_vcpu *rt_vcpu(const struct vcpu *vcpu)
237 {
238 return vcpu->sched_priv;
239 }
240
241 static inline struct rt_dom *rt_dom(const struct domain *dom)
242 {
243 return dom->sched_priv;
244 }
245
246 static inline struct list_head *rt_runq(const struct scheduler *ops)
247 {
248 return &rt_priv(ops)->runq;
249 }
250
251 static inline struct list_head *rt_depletedq(const struct scheduler *ops)
252 {
253 return &rt_priv(ops)->depletedq;
254 }
255
256 static inline struct list_head *rt_replq(const struct scheduler *ops)
257 {
258 return &rt_priv(ops)->replq;
259 }
260
261 static inline bool has_extratime(const struct rt_vcpu *svc)
262 {
263 return svc->flags & RTDS_extratime;
264 }
265
266 /*
267 * Helper functions for manipulating the runqueue, the depleted queue,
268 * and the replenishment events queue.
269 */
270 static int
271 vcpu_on_q(const struct rt_vcpu *svc)
272 {
273 return !list_empty(&svc->q_elem);
274 }
275
276 static struct rt_vcpu *
277 q_elem(struct list_head *elem)
278 {
279 return list_entry(elem, struct rt_vcpu, q_elem);
280 }
281
282 static struct rt_vcpu *
283 replq_elem(struct list_head *elem)
284 {
285 return list_entry(elem, struct rt_vcpu, replq_elem);
286 }
287
288 static int
289 vcpu_on_replq(const struct rt_vcpu *svc)
290 {
291 return !list_empty(&svc->replq_elem);
292 }
293
294 /*
295 * If v1's priority is higher than v2's, the return value is > 0;
296 * if they are equal it is 0, otherwise it is < 0
297 */
298 static s_time_t
299 compare_vcpu_priority(const struct rt_vcpu *v1, const struct rt_vcpu *v2)
300 {
301 int prio = v2->priority_level - v1->priority_level;
302
303 if ( prio == 0 )
304 return v2->cur_deadline - v1->cur_deadline;
305
306 return prio;
307 }
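/*
 * Example (made-up values): v1 = { priority_level 0, cur_deadline 30ms } and
 * v2 = { priority_level 0, cur_deadline 50ms } gives 50ms - 30ms > 0, so v1
 * (the earlier deadline) wins. If v1 were running on extratime at
 * priority_level 1 while v2 is still at level 0, the level difference
 * dominates and the result is negative, whatever the deadlines are.
 */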
308
309 /*
310 * Debug related code, dump vcpu/cpu information
311 */
312 static void
313 rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
314 {
315 cpumask_t *cpupool_mask, *mask;
316
317 ASSERT(svc != NULL);
318 /* idle vcpu */
319 if( svc->sdom == NULL )
320 {
321 printk("\n");
322 return;
323 }
324
325 /*
326 * We can't just use 'cpumask_scratch' because the dumping can
327 * happen from a pCPU outside of this scheduler's cpupool, and
328 * hence it's not right to use its pCPU's scratch mask.
329 * On the other hand, it is safe to use svc->vcpu->processor's
330 * own scratch space, since we hold the runqueue lock.
331 */
332 mask = cpumask_scratch_cpu(svc->vcpu->processor);
333
334 cpupool_mask = cpupool_domain_cpumask(svc->vcpu->domain);
335 cpumask_and(mask, cpupool_mask, svc->vcpu->cpu_hard_affinity);
336 cpulist_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch), mask);
337 printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
338 " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
339 " \t\t priority_level=%d has_extratime=%d\n"
340 " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%s\n",
341 svc->vcpu->domain->domain_id,
342 svc->vcpu->vcpu_id,
343 svc->vcpu->processor,
344 svc->period,
345 svc->budget,
346 svc->cur_budget,
347 svc->cur_deadline,
348 svc->last_start,
349 svc->priority_level,
350 has_extratime(svc),
351 vcpu_on_q(svc),
352 vcpu_runnable(svc->vcpu),
353 svc->flags,
354 keyhandler_scratch);
355 }
356
357 static void
358 rt_dump_pcpu(const struct scheduler *ops, int cpu)
359 {
360 struct rt_private *prv = rt_priv(ops);
361 struct rt_vcpu *svc;
362 unsigned long flags;
363
364 spin_lock_irqsave(&prv->lock, flags);
365 printk("CPU[%02d]\n", cpu);
366 /* current VCPU (nothing to say if that's the idle vcpu). */
367 svc = rt_vcpu(curr_on_cpu(cpu));
368 if ( svc && !is_idle_vcpu(svc->vcpu) )
369 {
370 rt_dump_vcpu(ops, svc);
371 }
372 spin_unlock_irqrestore(&prv->lock, flags);
373 }
374
375 static void
376 rt_dump(const struct scheduler *ops)
377 {
378 struct list_head *runq, *depletedq, *replq, *iter;
379 struct rt_private *prv = rt_priv(ops);
380 struct rt_vcpu *svc;
381 struct rt_dom *sdom;
382 unsigned long flags;
383
384 spin_lock_irqsave(&prv->lock, flags);
385
386 if ( list_empty(&prv->sdom) )
387 goto out;
388
389 runq = rt_runq(ops);
390 depletedq = rt_depletedq(ops);
391 replq = rt_replq(ops);
392
393 printk("Global RunQueue info:\n");
394 list_for_each ( iter, runq )
395 {
396 svc = q_elem(iter);
397 rt_dump_vcpu(ops, svc);
398 }
399
400 printk("Global DepletedQueue info:\n");
401 list_for_each ( iter, depletedq )
402 {
403 svc = q_elem(iter);
404 rt_dump_vcpu(ops, svc);
405 }
406
407 printk("Global Replenishment Events info:\n");
408 list_for_each ( iter, replq )
409 {
410 svc = replq_elem(iter);
411 rt_dump_vcpu(ops, svc);
412 }
413
414 printk("Domain info:\n");
415 list_for_each ( iter, &prv->sdom )
416 {
417 struct vcpu *v;
418
419 sdom = list_entry(iter, struct rt_dom, sdom_elem);
420 printk("\tdomain: %d\n", sdom->dom->domain_id);
421
422 for_each_vcpu ( sdom->dom, v )
423 {
424 svc = rt_vcpu(v);
425 rt_dump_vcpu(ops, svc);
426 }
427 }
428
429 out:
430 spin_unlock_irqrestore(&prv->lock, flags);
431 }
432
433 /*
434 * Update the deadline and budget when now >= cur_deadline;
435 * the deadline needs to be advanced to that of the current period.
436 */
437 static void
438 rt_update_deadline(s_time_t now, struct rt_vcpu *svc)
439 {
440 ASSERT(now >= svc->cur_deadline);
441 ASSERT(svc->period != 0);
442
443 if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
444 {
445 do
446 svc->cur_deadline += svc->period;
447 while ( svc->cur_deadline <= now );
448 }
449 else
450 {
451 long count = ((now - svc->cur_deadline) / svc->period) + 1;
452 svc->cur_deadline += count * svc->period;
453 }
454
455 /*
456 * svc may be scheduled to run immediately after it misses its deadline.
457 * Then rt_update_deadline is called before rt_schedule, which
458 * should only deduct the time spent in the current period from the budget
459 */
460 svc->last_start = now;
461 svc->cur_budget = svc->budget;
462 svc->priority_level = 0;
463
464 /* TRACE */
465 {
466 struct __packed {
467 unsigned vcpu:16, dom:16;
468 unsigned priority_level;
469 uint64_t cur_deadline, cur_budget;
470 } d;
471 d.dom = svc->vcpu->domain->domain_id;
472 d.vcpu = svc->vcpu->vcpu_id;
473 d.priority_level = svc->priority_level;
474 d.cur_deadline = (uint64_t) svc->cur_deadline;
475 d.cur_budget = (uint64_t) svc->cur_budget;
476 trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
477 sizeof(d),
478 (unsigned char *) &d);
479 }
480
481 return;
482 }
483
484 /*
485 * Helpers for removing and inserting a vcpu in a queue
486 * that is being kept ordered by the vcpus' deadlines (as EDF
487 * mandates).
488 *
489 * For callers' convenience, the vcpu removing helper returns
490 * true if the vcpu removed was the one at the front of the
491 * queue; similarly, the inserting helper returns true if the
492 * inserted one ended up at the front of the queue (i.e., in both
493 * cases, if the vcpu with the earliest deadline is what we
494 * are dealing with).
495 */
496 static inline bool
497 deadline_queue_remove(struct list_head *queue, struct list_head *elem)
498 {
499 int pos = 0;
500
501 if ( queue->next != elem )
502 pos = 1;
503
504 list_del_init(elem);
505 return !pos;
506 }
507
508 static inline bool
509 deadline_queue_insert(struct rt_vcpu * (*qelem)(struct list_head *),
510 struct rt_vcpu *svc, struct list_head *elem,
511 struct list_head *queue)
512 {
513 struct list_head *iter;
514 int pos = 0;
515
516 list_for_each ( iter, queue )
517 {
518 struct rt_vcpu * iter_svc = (*qelem)(iter);
519 if ( compare_vcpu_priority(svc, iter_svc) > 0 )
520 break;
521 pos++;
522 }
523 list_add_tail(elem, iter);
524 return !pos;
525 }
526 #define deadline_runq_insert(...) \
527 deadline_queue_insert(&q_elem, ##__VA_ARGS__)
528 #define deadline_replq_insert(...) \
529 deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
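/*
 * Typical use of the helpers above (sketch; see the real callers below):
 *
 *     if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
 *         // svc ended up at the front, so its replenishment is now the
 *         // earliest one: reprogram the timer.
 *         set_timer(prv->repl_timer, svc->cur_deadline);
 *
 * deadline_runq_insert() works the same way on the runq: a true return
 * value means the inserted vcpu now heads the queue.
 */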
530
531 static inline void
532 q_remove(struct rt_vcpu *svc)
533 {
534 ASSERT( vcpu_on_q(svc) );
535 list_del_init(&svc->q_elem);
536 }
537
538 static inline void
539 replq_remove(const struct scheduler *ops, struct rt_vcpu *svc)
540 {
541 struct rt_private *prv = rt_priv(ops);
542 struct list_head *replq = rt_replq(ops);
543
544 ASSERT( vcpu_on_replq(svc) );
545
546 if ( deadline_queue_remove(replq, &svc->replq_elem) )
547 {
548 /*
549 * The replenishment timer needs to be set to fire when a
550 * replenishment for the vcpu at the front of the replenishment
551 * queue is due. If it is such vcpu that we just removed, we may
552 * need to reprogram the timer.
553 */
554 if ( !list_empty(replq) )
555 {
556 struct rt_vcpu *svc_next = replq_elem(replq->next);
557 set_timer(prv->repl_timer, svc_next->cur_deadline);
558 }
559 else
560 stop_timer(prv->repl_timer);
561 }
562 }
563
564 /*
565 * Insert svc with remaining budget into the RunQ according to EDF:
566 * vcpus with earlier deadlines go first.
567 * Insert svc without budget into the DepletedQ, unsorted;
568 */
569 static void
570 runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
571 {
572 struct rt_private *prv = rt_priv(ops);
573 struct list_head *runq = rt_runq(ops);
574
575 ASSERT( spin_is_locked(&prv->lock) );
576 ASSERT( !vcpu_on_q(svc) );
577 ASSERT( vcpu_on_replq(svc) );
578
579 /* add svc to runq if svc still has budget or its extratime is set */
580 if ( svc->cur_budget > 0 ||
581 has_extratime(svc) )
582 deadline_runq_insert(svc, &svc->q_elem, runq);
583 else
584 list_add(&svc->q_elem, &prv->depletedq);
585 }
586
587 static void
588 replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
589 {
590 struct list_head *replq = rt_replq(ops);
591 struct rt_private *prv = rt_priv(ops);
592
593 ASSERT( !vcpu_on_replq(svc) );
594
595 /*
596 * The timer may be re-programmed if svc is inserted
597 * at the front of the event list.
598 */
599 if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
600 set_timer(prv->repl_timer, svc->cur_deadline);
601 }
602
603 /*
604 * Removes and re-inserts an event in the replenishment queue.
605 * The aim is to update its position inside the queue, as its
606 * deadline (and hence its replenishment time) could have
607 * changed.
608 */
609 static void
610 replq_reinsert(const struct scheduler *ops, struct rt_vcpu *svc)
611 {
612 struct list_head *replq = rt_replq(ops);
613 struct rt_vcpu *rearm_svc = svc;
614 bool_t rearm = 0;
615
616 ASSERT( vcpu_on_replq(svc) );
617
618 /*
619 * If svc was at the front of the replenishment queue, we certainly
620 * need to re-program the timer, and we want to use the deadline of
621 * the vcpu which is now at the front of the queue (which may still
622 * be svc or not).
623 *
624 * We may also need to re-program, if svc has been put at the front
625 * of the replenishment queue when being re-inserted.
626 */
627 if ( deadline_queue_remove(replq, &svc->replq_elem) )
628 {
629 deadline_replq_insert(svc, &svc->replq_elem, replq);
630 rearm_svc = replq_elem(replq->next);
631 rearm = 1;
632 }
633 else
634 rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
635
636 if ( rearm )
637 set_timer(rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
638 }
639
640 /*
641 * Pick a valid CPU for the vcpu vc
642 * The valid CPUs of a vcpu are the intersection of the vcpu's affinity
643 * and the available cpus
644 */
645 static int
646 rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
647 {
648 cpumask_t cpus;
649 cpumask_t *online;
650 int cpu;
651
652 online = cpupool_domain_cpumask(vc->domain);
653 cpumask_and(&cpus, online, vc->cpu_hard_affinity);
654
655 cpu = cpumask_test_cpu(vc->processor, &cpus)
656 ? vc->processor
657 : cpumask_cycle(vc->processor, &cpus);
658 ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
659
660 return cpu;
661 }
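/*
 * Example for rt_cpu_pick() above (made-up masks): a vcpu hard-pinned to
 * cpus {2,3} in a pool whose online cpus are {0-3} has cpus = {2,3}. If
 * vc->processor is already 3 it is kept (good for cache warmth); if it is
 * 0, which is not in the intersection, cpumask_cycle() picks the next valid
 * cpu after 0, i.e. 2.
 */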
662
663 /*
664 * Init/Free related code
665 */
666 static int
667 rt_init(struct scheduler *ops)
668 {
669 int rc = -ENOMEM;
670 struct rt_private *prv = xzalloc(struct rt_private);
671
672 printk("Initializing RTDS scheduler\n"
673 "WARNING: This is experimental software in development.\n"
674 "Use at your own risk.\n");
675
676 if ( prv == NULL )
677 goto err;
678
679 prv->repl_timer = xzalloc(struct timer);
680 if ( prv->repl_timer == NULL )
681 goto err;
682
683 spin_lock_init(&prv->lock);
684 INIT_LIST_HEAD(&prv->sdom);
685 INIT_LIST_HEAD(&prv->runq);
686 INIT_LIST_HEAD(&prv->depletedq);
687 INIT_LIST_HEAD(&prv->replq);
688
689 cpumask_clear(&prv->tickled);
690
691 ops->sched_data = prv;
692 rc = 0;
693
694 err:
695 if ( rc && prv )
696 {
697 xfree(prv->repl_timer);
698 xfree(prv);
699 }
700
701 return rc;
702 }
703
704 static void
705 rt_deinit(struct scheduler *ops)
706 {
707 struct rt_private *prv = rt_priv(ops);
708
709 ASSERT(prv->repl_timer->status == TIMER_STATUS_invalid ||
710 prv->repl_timer->status == TIMER_STATUS_killed);
711 xfree(prv->repl_timer);
712
713 ops->sched_data = NULL;
714 xfree(prv);
715 }
716
717 /*
718 * Point per_cpu spinlock to the global system lock;
719 * All cpus share the same global system lock
720 */
721 static void
722 rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
723 {
724 struct rt_private *prv = rt_priv(ops);
725 spinlock_t *old_lock;
726 unsigned long flags;
727
728 old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
729
730 /*
731 * TIMER_STATUS_invalid means we are the first cpu that sees the timer
732 * allocated but not initialized, and so it's up to us to initialize it.
733 */
734 if ( prv->repl_timer->status == TIMER_STATUS_invalid )
735 {
736 init_timer(prv->repl_timer, repl_timer_handler, (void*) ops, cpu);
737 dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
738 }
739
740 /* Move the scheduler lock to our global runqueue lock. */
741 per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
742
743 /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */
744 spin_unlock_irqrestore(old_lock, flags);
745 }
746
747 /* Change the scheduler of cpu to us (RTDS). */
748 static void
749 rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
750 void *pdata, void *vdata)
751 {
752 struct rt_private *prv = rt_priv(new_ops);
753 struct rt_vcpu *svc = vdata;
754
755 ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
756
757 /*
758 * We are holding the runqueue lock already (it's been taken in
759 * schedule_cpu_switch()). It's actually the runqueue lock of
760 * another scheduler, but that is how things need to be, for
761 * preventing races.
762 */
763 ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->lock);
764
765 /*
766 * If we are the absolute first cpu being switched toward this
767 * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
768 * first one that is added back to the cpupool that had all its cpus
769 * removed (in which case we'll see TIMER_STATUS_killed), it's our
770 * job to (re)initialize the timer.
771 */
772 if ( prv->repl_timer->status == TIMER_STATUS_invalid ||
773 prv->repl_timer->status == TIMER_STATUS_killed )
774 {
775 init_timer(prv->repl_timer, repl_timer_handler, (void*) new_ops, cpu);
776 dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
777 }
778
779 idle_vcpu[cpu]->sched_priv = vdata;
780 per_cpu(scheduler, cpu) = new_ops;
781 per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
782
783 /*
784 * (Re?)route this pCPU's scheduler lock to our global lock as the /last/
785 * thing. In fact, if it is free (and it can be) we want anyone that
786 * manages to take it to find all the initializations we've done above in place.
787 */
788 smp_mb();
789 per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
790 }
791
792 static void
793 rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
794 {
795 unsigned long flags;
796 struct rt_private *prv = rt_priv(ops);
797
798 spin_lock_irqsave(&prv->lock, flags);
799
800 if ( prv->repl_timer->cpu == cpu )
801 {
802 struct cpupool *c = per_cpu(cpupool, cpu);
803 unsigned int new_cpu = cpumask_cycle(cpu, cpupool_online_cpumask(c));
804
805 /*
806 * Make sure the timer runs on one of the cpus that are still available
807 * to this scheduler. If there aren't any left, it means it's time
808 * to just kill it.
809 */
810 if ( new_cpu >= nr_cpu_ids )
811 {
812 kill_timer(prv->repl_timer);
813 dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
814 }
815 else
816 {
817 migrate_timer(prv->repl_timer, new_cpu);
818 }
819 }
820
821 spin_unlock_irqrestore(&prv->lock, flags);
822 }
823
824 static void *
825 rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
826 {
827 unsigned long flags;
828 struct rt_dom *sdom;
829 struct rt_private * prv = rt_priv(ops);
830
831 sdom = xzalloc(struct rt_dom);
832 if ( sdom == NULL )
833 return NULL;
834
835 INIT_LIST_HEAD(&sdom->sdom_elem);
836 sdom->dom = dom;
837
838 /* spinlock here to insert the dom */
839 spin_lock_irqsave(&prv->lock, flags);
840 list_add_tail(&sdom->sdom_elem, &(prv->sdom));
841 spin_unlock_irqrestore(&prv->lock, flags);
842
843 return sdom;
844 }
845
846 static void
847 rt_free_domdata(const struct scheduler *ops, void *data)
848 {
849 unsigned long flags;
850 struct rt_dom *sdom = data;
851 struct rt_private *prv = rt_priv(ops);
852
853 spin_lock_irqsave(&prv->lock, flags);
854 list_del_init(&sdom->sdom_elem);
855 spin_unlock_irqrestore(&prv->lock, flags);
856 xfree(data);
857 }
858
859 static int
860 rt_dom_init(const struct scheduler *ops, struct domain *dom)
861 {
862 struct rt_dom *sdom;
863
864 /* IDLE Domain does not link on rt_private */
865 if ( is_idle_domain(dom) )
866 return 0;
867
868 sdom = rt_alloc_domdata(ops, dom);
869 if ( sdom == NULL )
870 return -ENOMEM;
871
872 dom->sched_priv = sdom;
873
874 return 0;
875 }
876
877 static void
878 rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
879 {
880 rt_free_domdata(ops, rt_dom(dom));
881 }
882
883 static void *
884 rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
885 {
886 struct rt_vcpu *svc;
887
888 /* Allocate per-VCPU info */
889 svc = xzalloc(struct rt_vcpu);
890 if ( svc == NULL )
891 return NULL;
892
893 INIT_LIST_HEAD(&svc->q_elem);
894 INIT_LIST_HEAD(&svc->replq_elem);
895 svc->flags = 0U;
896 svc->sdom = dd;
897 svc->vcpu = vc;
898 svc->last_start = 0;
899
900 __set_bit(__RTDS_extratime, &svc->flags);
901 svc->priority_level = 0;
902 svc->period = RTDS_DEFAULT_PERIOD;
903 if ( !is_idle_vcpu(vc) )
904 svc->budget = RTDS_DEFAULT_BUDGET;
905
906 SCHED_STAT_CRANK(vcpu_alloc);
907
908 return svc;
909 }
910
911 static void
912 rt_free_vdata(const struct scheduler *ops, void *priv)
913 {
914 struct rt_vcpu *svc = priv;
915
916 xfree(svc);
917 }
918
919 /*
920 * This is called in sched_move_domain() and sched_init_vcpu()
921 * in schedule.c,
922 * when moving a domain to a new cpupool.
923 * It inserts the vcpus of the moving domain into the scheduler's RunQ
924 * in the destination cpupool.
925 */
926 static void
927 rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
928 {
929 struct rt_vcpu *svc = rt_vcpu(vc);
930 s_time_t now;
931 spinlock_t *lock;
932
933 BUG_ON( is_idle_vcpu(vc) );
934
935 /* This is safe because vc isn't yet being scheduled */
936 vc->processor = rt_cpu_pick(ops, vc);
937
938 lock = vcpu_schedule_lock_irq(vc);
939
940 now = NOW();
941 if ( now >= svc->cur_deadline )
942 rt_update_deadline(now, svc);
943
944 if ( !vcpu_on_q(svc) && vcpu_runnable(vc) )
945 {
946 replq_insert(ops, svc);
947
948 if ( !vc->is_running )
949 runq_insert(ops, svc);
950 }
951 vcpu_schedule_unlock_irq(lock, vc);
952
953 SCHED_STAT_CRANK(vcpu_insert);
954 }
955
956 /*
957 * Remove rt_vcpu svc from the old scheduler in source cpupool.
958 */
959 static void
960 rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
961 {
962 struct rt_vcpu * const svc = rt_vcpu(vc);
963 struct rt_dom * const sdom = svc->sdom;
964 spinlock_t *lock;
965
966 SCHED_STAT_CRANK(vcpu_remove);
967
968 BUG_ON( sdom == NULL );
969
970 lock = vcpu_schedule_lock_irq(vc);
971 if ( vcpu_on_q(svc) )
972 q_remove(svc);
973
974 if ( vcpu_on_replq(svc) )
975 replq_remove(ops,svc);
976
977 vcpu_schedule_unlock_irq(lock, vc);
978 }
979
980 /*
981 * Burn budget in nanosecond granularity
982 */
983 static void
984 burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
985 {
986 s_time_t delta;
987
988 /* don't burn budget for idle VCPU */
989 if ( is_idle_vcpu(svc->vcpu) )
990 return;
991
992 /* burn at nanoseconds level */
993 delta = now - svc->last_start;
994 /*
995 * delta < 0 only happens in nested virtualization;
996 * TODO: how should we handle delta < 0 in a better way?
997 */
998 if ( delta < 0 )
999 {
1000 printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
1001 __func__, delta);
1002 svc->last_start = now;
1003 return;
1004 }
1005
1006 svc->cur_budget -= delta;
1007 svc->last_start = now;
1008
1009 if ( svc->cur_budget <= 0 )
1010 {
1011 if ( has_extratime(svc) )
1012 {
1013 svc->priority_level++;
1014 svc->cur_budget = svc->budget;
1015 }
1016 else
1017 {
1018 svc->cur_budget = 0;
1019 __set_bit(__RTDS_depleted, &svc->flags);
1020 }
1021 }
1022
1023 /* TRACE */
1024 {
1025 struct __packed {
1026 unsigned vcpu:16, dom:16;
1027 uint64_t cur_budget;
1028 int delta;
1029 unsigned priority_level;
1030 bool has_extratime;
1031 } d;
1032 d.dom = svc->vcpu->domain->domain_id;
1033 d.vcpu = svc->vcpu->vcpu_id;
1034 d.cur_budget = (uint64_t) svc->cur_budget;
1035 d.delta = delta;
1036 d.priority_level = svc->priority_level;
1037 d.has_extratime = svc->flags & RTDS_extratime;
1038 trace_var(TRC_RTDS_BUDGET_BURN, 1,
1039 sizeof(d),
1040 (unsigned char *) &d);
1041 }
1042 }
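/*
 * Example of the accounting above (illustrative numbers): a vcpu with
 * budget = 4ms that last started at t = 100ms and is descheduled at
 * t = 102.5ms burns delta = 2.5ms, leaving cur_budget = 1.5ms. If it runs
 * until cur_budget hits 0 instead, then without extratime the depleted
 * flag is set and it waits for the replenishment timer; with extratime it
 * is refilled on the spot, but demoted to the next priority_level.
 */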
1043
1044 /*
1045 * The RunQ is sorted. Pick the first vcpu allowed by the cpumask; if there is none, return NULL.
1046 * The lock is grabbed before calling this function
1047 */
1048 static struct rt_vcpu *
1049 runq_pick(const struct scheduler *ops, const cpumask_t *mask)
1050 {
1051 struct list_head *runq = rt_runq(ops);
1052 struct list_head *iter;
1053 struct rt_vcpu *svc = NULL;
1054 struct rt_vcpu *iter_svc = NULL;
1055 cpumask_t cpu_common;
1056 cpumask_t *online;
1057
1058 list_for_each ( iter, runq )
1059 {
1060 iter_svc = q_elem(iter);
1061
1062 /* mask cpu_hard_affinity & cpupool & mask */
1063 online = cpupool_domain_cpumask(iter_svc->vcpu->domain);
1064 cpumask_and(&cpu_common, online, iter_svc->vcpu->cpu_hard_affinity);
1065 cpumask_and(&cpu_common, mask, &cpu_common);
1066 if ( cpumask_empty(&cpu_common) )
1067 continue;
1068
1069 ASSERT( iter_svc->cur_budget > 0 );
1070
1071 svc = iter_svc;
1072 break;
1073 }
1074
1075 /* TRACE */
1076 {
1077 if( svc != NULL )
1078 {
1079 struct __packed {
1080 unsigned vcpu:16, dom:16;
1081 uint64_t cur_deadline, cur_budget;
1082 } d;
1083 d.dom = svc->vcpu->domain->domain_id;
1084 d.vcpu = svc->vcpu->vcpu_id;
1085 d.cur_deadline = (uint64_t) svc->cur_deadline;
1086 d.cur_budget = (uint64_t) svc->cur_budget;
1087 trace_var(TRC_RTDS_RUNQ_PICK, 1,
1088 sizeof(d),
1089 (unsigned char *) &d);
1090 }
1091 }
1092
1093 return svc;
1094 }
1095
1096 /*
1097 * schedule function for rt scheduler.
1098 * The lock is already grabbed in schedule.c, no need to lock here
1099 */
1100 static struct task_slice
1101 rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
1102 {
1103 const int cpu = smp_processor_id();
1104 struct rt_private *prv = rt_priv(ops);
1105 struct rt_vcpu *const scurr = rt_vcpu(current);
1106 struct rt_vcpu *snext = NULL;
1107 struct task_slice ret = { .migrated = 0 };
1108
1109 /* TRACE */
1110 {
1111 struct __packed {
1112 unsigned cpu:16, tasklet:8, tickled:4, idle:4;
1113 } d;
1114 d.cpu = cpu;
1115 d.tasklet = tasklet_work_scheduled;
1116 d.tickled = cpumask_test_cpu(cpu, &prv->tickled);
1117 d.idle = is_idle_vcpu(current);
1118 trace_var(TRC_RTDS_SCHEDULE, 1,
1119 sizeof(d),
1120 (unsigned char *)&d);
1121 }
1122
1123 /* clear the tickled bit now that we've been scheduled */
1124 cpumask_clear_cpu(cpu, &prv->tickled);
1125
1126 /* burn_budget returns immediately for the idle VCPU */
1127 burn_budget(ops, scurr, now);
1128
1129 if ( tasklet_work_scheduled )
1130 {
1131 trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0, NULL);
1132 snext = rt_vcpu(idle_vcpu[cpu]);
1133 }
1134 else
1135 {
1136 snext = runq_pick(ops, cpumask_of(cpu));
1137 if ( snext == NULL )
1138 snext = rt_vcpu(idle_vcpu[cpu]);
1139
1140 /* if scurr has higher priority and budget, still pick scurr */
1141 if ( !is_idle_vcpu(current) &&
1142 vcpu_runnable(current) &&
1143 scurr->cur_budget > 0 &&
1144 ( is_idle_vcpu(snext->vcpu) ||
1145 compare_vcpu_priority(scurr, snext) > 0 ) )
1146 snext = scurr;
1147 }
1148
1149 if ( snext != scurr &&
1150 !is_idle_vcpu(current) &&
1151 vcpu_runnable(current) )
1152 __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
1153
1154 snext->last_start = now;
1155 ret.time = -1; /* if an idle vcpu is picked */
1156 if ( !is_idle_vcpu(snext->vcpu) )
1157 {
1158 if ( snext != scurr )
1159 {
1160 q_remove(snext);
1161 __set_bit(__RTDS_scheduled, &snext->flags);
1162 }
1163 if ( snext->vcpu->processor != cpu )
1164 {
1165 snext->vcpu->processor = cpu;
1166 ret.migrated = 1;
1167 }
1168 ret.time = snext->cur_budget; /* invoke the scheduler next time */
1169 }
1170 ret.task = snext->vcpu;
1171
1172 return ret;
1173 }
1174
1175 /*
1176 * Remove VCPU from RunQ
1177 * The lock is already grabbed in schedule.c, no need to lock here
1178 */
1179 static void
1180 rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
1181 {
1182 struct rt_vcpu * const svc = rt_vcpu(vc);
1183
1184 BUG_ON( is_idle_vcpu(vc) );
1185 SCHED_STAT_CRANK(vcpu_sleep);
1186
1187 if ( curr_on_cpu(vc->processor) == vc )
1188 cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
1189 else if ( vcpu_on_q(svc) )
1190 {
1191 q_remove(svc);
1192 replq_remove(ops, svc);
1193 }
1194 else if ( svc->flags & RTDS_delayed_runq_add )
1195 __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
1196 }
1197
1198 /*
1199 * Pick a cpu where to run a vcpu,
1200 * possibly kicking out the vcpu running there
1201 * Called by wake() and context_saved()
1202 * We have a runnable candidate here; the kick logic is:
1203 * Among all the cpus that are within the cpu affinity:
1204 * 1) if there are any idle CPUs, kick one.
1205 *    For cache benefit, we check new->cpu first.
1206 * 2) otherwise all pcpus are busy;
1207 *    among all the running vcpus, pick the lowest-priority one
1208 *    and kick it if the candidate has higher priority.
1209 *
1210 * TODO:
1211 * 1) what if these two vcpus belong to the same domain?
1212 *    Replacing a vcpu belonging to the same domain introduces more overhead.
1213 *
1214 * lock is grabbed before calling this function
1215 */
1216 static void
1217 runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
1218 {
1219 struct rt_private *prv = rt_priv(ops);
1220 struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
1221 struct rt_vcpu *iter_svc;
1222 struct vcpu *iter_vc;
1223 int cpu = 0, cpu_to_tickle = 0;
1224 cpumask_t not_tickled;
1225 cpumask_t *online;
1226
1227 if ( new == NULL || is_idle_vcpu(new->vcpu) )
1228 return;
1229
1230 online = cpupool_domain_cpumask(new->vcpu->domain);
1231 cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
1232 cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
1233
1234 /*
1235 * 1) If there are any idle CPUs, kick one.
1236 * For cache benefit, we first search new->cpu.
1237 * The same loop also finds the one with the lowest priority.
1238 */
1239 cpu = cpumask_test_or_cycle(new->vcpu->processor, &not_tickled);
1240 while ( cpu != nr_cpu_ids )
1241 {
1242 iter_vc = curr_on_cpu(cpu);
1243 if ( is_idle_vcpu(iter_vc) )
1244 {
1245 SCHED_STAT_CRANK(tickled_idle_cpu);
1246 cpu_to_tickle = cpu;
1247 goto out;
1248 }
1249 iter_svc = rt_vcpu(iter_vc);
1250 if ( latest_deadline_vcpu == NULL ||
1251 compare_vcpu_priority(iter_svc, latest_deadline_vcpu) < 0 )
1252 latest_deadline_vcpu = iter_svc;
1253
1254 cpumask_clear_cpu(cpu, &not_tickled);
1255 cpu = cpumask_cycle(cpu, &not_tickled);
1256 }
1257
1258 /* 2) the candidate has higher priority; kick out the lowest-priority vcpu */
1259 if ( latest_deadline_vcpu != NULL &&
1260 compare_vcpu_priority(latest_deadline_vcpu, new) < 0 )
1261 {
1262 SCHED_STAT_CRANK(tickled_busy_cpu);
1263 cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
1264 goto out;
1265 }
1266
1267 /* didn't tickle any cpu */
1268 SCHED_STAT_CRANK(tickled_no_cpu);
1269 return;
1270 out:
1271 /* TRACE */
1272 {
1273 struct {
1274 unsigned cpu:16, pad:16;
1275 } d;
1276 d.cpu = cpu_to_tickle;
1277 d.pad = 0;
1278 trace_var(TRC_RTDS_TICKLE, 1,
1279 sizeof(d),
1280 (unsigned char *)&d);
1281 }
1282
1283 cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
1284 cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
1285 return;
1286 }
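/*
 * Example of the tickling decision (hypothetical 4-pCPU pool, new allowed
 * to run anywhere): the loop starts at new->vcpu->processor; if some
 * not-yet-tickled pCPU is idle it is tickled right away. If all of them are
 * busy, the pCPU running the lowest-priority (latest-deadline) vcpu is
 * tickled, but only if new would actually preempt it, i.e. if
 * compare_vcpu_priority(latest_deadline_vcpu, new) < 0. pCPUs already in
 * prv->tickled are skipped, so two near-simultaneous wakeups do not both
 * kick the same cpu.
 */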
1287
1288 /*
1289 * Should always wake up a runnable vcpu and put it back on the RunQ.
1290 * Check its priority to decide whether to tickle a pcpu.
1291 * The lock is already grabbed in schedule.c, no need to lock here
1292 * TODO: what if these two vcpus belong to the same domain?
1293 */
1294 static void
1295 rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
1296 {
1297 struct rt_vcpu * const svc = rt_vcpu(vc);
1298 s_time_t now;
1299 bool_t missed;
1300
1301 BUG_ON( is_idle_vcpu(vc) );
1302
1303 if ( unlikely(curr_on_cpu(vc->processor) == vc) )
1304 {
1305 SCHED_STAT_CRANK(vcpu_wake_running);
1306 return;
1307 }
1308
1309 /* already on the RunQ/DepletedQ; just updating the info is ok */
1310 if ( unlikely(vcpu_on_q(svc)) )
1311 {
1312 SCHED_STAT_CRANK(vcpu_wake_onrunq);
1313 return;
1314 }
1315
1316 if ( likely(vcpu_runnable(vc)) )
1317 SCHED_STAT_CRANK(vcpu_wake_runnable);
1318 else
1319 SCHED_STAT_CRANK(vcpu_wake_not_runnable);
1320
1321 /*
1322 * If a deadline passed while svc was asleep/blocked, we need new
1323 * scheduling parameters (a new deadline and full budget).
1324 */
1325 now = NOW();
1326
1327 missed = ( now >= svc->cur_deadline );
1328 if ( missed )
1329 rt_update_deadline(now, svc);
1330
1331 /*
1332 * If context hasn't been saved for this vcpu yet, we can't put it on
1333 * the run-queue/depleted-queue. Instead, we set the appropriate flag,
1334 * and the vcpu will be put back on the queue after the context has been
1335 * saved (in rt_context_saved()).
1336 */
1337 if ( unlikely(svc->flags & RTDS_scheduled) )
1338 {
1339 __set_bit(__RTDS_delayed_runq_add, &svc->flags);
1340 /*
1341 * The vcpu is waking up already, and we didn't even have the time to
1342 * remove its next replenishment event from the replenishment queue
1343 * when it blocked! No big deal. If we did not miss the deadline in
1344 * the meantime, let's just leave it there. If we did, let's remove it
1345 * and queue a new one (to occur at our new deadline).
1346 */
1347 if ( missed )
1348 replq_reinsert(ops, svc);
1349 return;
1350 }
1351
1352 /* Replenishment event got cancelled when we blocked. Add it back. */
1353 replq_insert(ops, svc);
1354 /* insert svc to runq/depletedq because svc is not in queue now */
1355 runq_insert(ops, svc);
1356
1357 runq_tickle(ops, svc);
1358 }
1359
1360 /*
1361 * scurr has finished context switch, insert it back to the RunQ,
1362 * and then pick the highest priority vcpu from runq to run
1363 */
1364 static void
1365 rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
1366 {
1367 struct rt_vcpu *svc = rt_vcpu(vc);
1368 spinlock_t *lock = vcpu_schedule_lock_irq(vc);
1369
1370 __clear_bit(__RTDS_scheduled, &svc->flags);
1371 /* do not insert the idle vcpu into the runq */
1372 if ( is_idle_vcpu(vc) )
1373 goto out;
1374
1375 if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
1376 likely(vcpu_runnable(vc)) )
1377 {
1378 runq_insert(ops, svc);
1379 runq_tickle(ops, svc);
1380 }
1381 else
1382 replq_remove(ops, svc);
1383
1384 out:
1385 vcpu_schedule_unlock_irq(lock, vc);
1386 }
1387
1388 /*
1389 * set/get each vcpu info of each domain
1390 */
1391 static int
1392 rt_dom_cntl(
1393 const struct scheduler *ops,
1394 struct domain *d,
1395 struct xen_domctl_scheduler_op *op)
1396 {
1397 struct rt_private *prv = rt_priv(ops);
1398 struct rt_vcpu *svc;
1399 struct vcpu *v;
1400 unsigned long flags;
1401 int rc = 0;
1402 struct xen_domctl_schedparam_vcpu local_sched;
1403 s_time_t period, budget;
1404 uint32_t index = 0;
1405
1406 switch ( op->cmd )
1407 {
1408 case XEN_DOMCTL_SCHEDOP_getinfo:
1409 /* Return the default parameters. */
1410 op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
1411 op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
1412 break;
1413 case XEN_DOMCTL_SCHEDOP_putinfo:
1414 if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
1415 {
1416 rc = -EINVAL;
1417 break;
1418 }
1419 spin_lock_irqsave(&prv->lock, flags);
1420 for_each_vcpu ( d, v )
1421 {
1422 svc = rt_vcpu(v);
1423 svc->period = MICROSECS(op->u.rtds.period); /* transfer to nanosec */
1424 svc->budget = MICROSECS(op->u.rtds.budget);
1425 }
1426 spin_unlock_irqrestore(&prv->lock, flags);
1427 break;
1428 case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1429 case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1430 while ( index < op->u.v.nr_vcpus )
1431 {
1432 if ( copy_from_guest_offset(&local_sched,
1433 op->u.v.vcpus, index, 1) )
1434 {
1435 rc = -EFAULT;
1436 break;
1437 }
1438 if ( local_sched.vcpuid >= d->max_vcpus ||
1439 d->vcpu[local_sched.vcpuid] == NULL )
1440 {
1441 rc = -EINVAL;
1442 break;
1443 }
1444
1445 if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
1446 {
1447 spin_lock_irqsave(&prv->lock, flags);
1448 svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
1449 local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
1450 local_sched.u.rtds.period = svc->period / MICROSECS(1);
1451 if ( has_extratime(svc) )
1452 local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
1453 else
1454 local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
1455 spin_unlock_irqrestore(&prv->lock, flags);
1456
1457 if ( copy_to_guest_offset(op->u.v.vcpus, index,
1458 &local_sched, 1) )
1459 {
1460 rc = -EFAULT;
1461 break;
1462 }
1463 }
1464 else
1465 {
1466 period = MICROSECS(local_sched.u.rtds.period);
1467 budget = MICROSECS(local_sched.u.rtds.budget);
1468 if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
1469 budget > period || period < RTDS_MIN_PERIOD )
1470 {
1471 rc = -EINVAL;
1472 break;
1473 }
1474
1475 spin_lock_irqsave(&prv->lock, flags);
1476 svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
1477 svc->period = period;
1478 svc->budget = budget;
1479 if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
1480 __set_bit(__RTDS_extratime, &svc->flags);
1481 else
1482 __clear_bit(__RTDS_extratime, &svc->flags);
1483 spin_unlock_irqrestore(&prv->lock, flags);
1484 }
1485 /* Process at most 64 vCPUs without checking for preemptions. */
1486 if ( (++index > 63) && hypercall_preempt_check() )
1487 break;
1488 }
1489 if ( !rc )
1490 /* notify upper caller how many vcpus have been processed. */
1491 op->u.v.nr_vcpus = index;
1492 break;
1493 }
1494
1495 return rc;
1496 }
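/*
 * Note on units for the hypercall interface above: the toolstack passes
 * period and budget in microseconds, while rt_vcpu stores nanoseconds.
 * E.g. a requested period of 10000us becomes MICROSECS(10000) = 10ms
 * internally, and is range-checked against RTDS_MIN_PERIOD/RTDS_MAX_PERIOD
 * and RTDS_MIN_BUDGET before being written to svc->period/svc->budget.
 */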
1497
1498 /*
1499 * The replenishment timer handler picks vcpus
1500 * from the replq and does the actual replenishment.
1501 */
1502 static void repl_timer_handler(void *data){
1503 s_time_t now;
1504 struct scheduler *ops = data;
1505 struct rt_private *prv = rt_priv(ops);
1506 struct list_head *replq = rt_replq(ops);
1507 struct list_head *runq = rt_runq(ops);
1508 struct timer *repl_timer = prv->repl_timer;
1509 struct list_head *iter, *tmp;
1510 struct rt_vcpu *svc;
1511 LIST_HEAD(tmp_replq);
1512
1513 spin_lock_irq(&prv->lock);
1514
1515 now = NOW();
1516
1517 /*
1518 * Do the replenishment and move replenished vcpus
1519 * to the temporary list to tickle.
1520 * If svc is on run queue, we need to put it at
1521 * the correct place since its deadline changes.
1522 */
1523 list_for_each_safe ( iter, tmp, replq )
1524 {
1525 svc = replq_elem(iter);
1526
1527 if ( now < svc->cur_deadline )
1528 break;
1529
1530 list_del(&svc->replq_elem);
1531 rt_update_deadline(now, svc);
1532 list_add(&svc->replq_elem, &tmp_replq);
1533
1534 if ( vcpu_on_q(svc) )
1535 {
1536 q_remove(svc);
1537 runq_insert(ops, svc);
1538 }
1539 }
1540
1541 /*
1542 * Iterate through the list of updated vcpus.
1543 * If an updated vcpu is running, tickle the head of the
1544 * runqueue if it has a higher priority.
1545 * If an updated vcpu was depleted and on the runqueue, tickle it.
1546 * Finally, reinsert the vcpus back into the replenishment events list.
1547 */
1548 list_for_each_safe ( iter, tmp, &tmp_replq )
1549 {
1550 svc = replq_elem(iter);
1551
1552 if ( curr_on_cpu(svc->vcpu->processor) == svc->vcpu &&
1553 !list_empty(runq) )
1554 {
1555 struct rt_vcpu *next_on_runq = q_elem(runq->next);
1556
1557 if ( compare_vcpu_priority(svc, next_on_runq) < 0 )
1558 runq_tickle(ops, next_on_runq);
1559 }
1560 else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
1561 vcpu_on_q(svc) )
1562 runq_tickle(ops, svc);
1563
1564 list_del(&svc->replq_elem);
1565 deadline_replq_insert(svc, &svc->replq_elem, replq);
1566 }
1567
1568 /*
1569 * If there are vcpus left in the replenishment event list,
1570 * set the next replenishment to happen at the deadline of
1571 * the one in the front.
1572 */
1573 if ( !list_empty(replq) )
1574 set_timer(repl_timer, replq_elem(replq->next)->cur_deadline);
1575
1576 spin_unlock_irq(&prv->lock);
1577 }
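/*
 * Timeline example for the handler above (made-up numbers): three vcpus
 * with replenishments due at t = 30ms, 32ms and 40ms sit on the replq in
 * that order, and the timer is armed for t = 30ms. When it fires shortly
 * after t = 30ms, only the first vcpu is replenished (the loop breaks at
 * the t = 32ms entry); assuming a 10ms period, it is re-queued with its new
 * deadline of 40ms, the relevant pCPU may be tickled, and the timer is
 * re-armed for t = 32ms.
 */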
1578
1579 static const struct scheduler sched_rtds_def = {
1580 .name = "SMP RTDS Scheduler",
1581 .opt_name = "rtds",
1582 .sched_id = XEN_SCHEDULER_RTDS,
1583 .sched_data = NULL,
1584
1585 .dump_cpu_state = rt_dump_pcpu,
1586 .dump_settings = rt_dump,
1587 .init = rt_init,
1588 .deinit = rt_deinit,
1589 .init_pdata = rt_init_pdata,
1590 .switch_sched = rt_switch_sched,
1591 .deinit_pdata = rt_deinit_pdata,
1592 .alloc_domdata = rt_alloc_domdata,
1593 .free_domdata = rt_free_domdata,
1594 .init_domain = rt_dom_init,
1595 .destroy_domain = rt_dom_destroy,
1596 .alloc_vdata = rt_alloc_vdata,
1597 .free_vdata = rt_free_vdata,
1598 .insert_vcpu = rt_vcpu_insert,
1599 .remove_vcpu = rt_vcpu_remove,
1600
1601 .adjust = rt_dom_cntl,
1602
1603 .pick_cpu = rt_cpu_pick,
1604 .do_schedule = rt_schedule,
1605 .sleep = rt_vcpu_sleep,
1606 .wake = rt_vcpu_wake,
1607 .context_saved = rt_context_saved,
1608 };
1609
1610 REGISTER_SCHEDULER(sched_rtds_def);
1611