1 /*****************************************************************************
2  * Preemptive Global Earliest Deadline First  (EDF) scheduler for Xen
3  * EDF scheduling is a real-time scheduling algorithm used in the embedded field.
4  *
5  * by Sisu Xi, 2013, Washington University in Saint Louis
6  * Meng Xu, 2014-2016, University of Pennsylvania
7  *
8  * Conversion toward event driven model by Tianyang Chen
9  * and Dagaen Golomb, 2016, University of Pennsylvania
10  *
11  * based on the code of the credit scheduler
12  */
13 
14 #include <xen/init.h>
15 #include <xen/lib.h>
16 #include <xen/sched.h>
17 #include <xen/domain.h>
18 #include <xen/delay.h>
19 #include <xen/event.h>
20 #include <xen/time.h>
21 #include <xen/timer.h>
22 #include <xen/perfc.h>
23 #include <xen/sched-if.h>
24 #include <xen/softirq.h>
25 #include <asm/atomic.h>
26 #include <xen/errno.h>
27 #include <xen/trace.h>
28 #include <xen/cpu.h>
29 #include <xen/keyhandler.h>
30 #include <xen/trace.h>
31 #include <xen/err.h>
32 #include <xen/guest_access.h>
33 
34 /*
35  * TODO:
36  *
37  * Migration compensation and resistance, like credit2, to make better use of the cache;
38  * Lock Holder Problem, using yield?
39  * Self switch problem: VCPUs of the same domain may preempt each other;
40  */
41 
42 /*
43  * Design:
44  *
45  * This scheduler follows the Preemptive Global Earliest Deadline First (EDF)
46  * theory in the real-time field.
47  * At any scheduling point, the VCPU with the earlier deadline has higher priority.
48  * The scheduler always picks the highest-priority VCPU to run on a feasible PCPU.
49  * A PCPU is feasible if the VCPU can run on this PCPU and the PCPU is either
50  * idle or has a lower-priority VCPU running on it.
51  *
52  * Each VCPU has a dedicated period, budget and an extratime flag.
53  * The deadline of a VCPU is at the end of each period;
54  * A VCPU has its budget replenished at the beginning of each period;
55  * While scheduled, a VCPU burns its budget.
56  * The VCPU needs to finish its budget before its deadline in each period;
57  * The VCPU discards its unused budget at the end of each period.
58  * When a VCPU runs out of budget in a period, if its extratime flag is set,
59  * the VCPU increases its priority_level by 1 and refills its budget; otherwise,
60  * it has to wait until next period.
61  *
62  * Each VCPU is implemented as a deferrable server.
63  * When a VCPU has a task running on it, its budget is continuously burned;
64  * When a VCPU has no task but with budget left, its budget is preserved.
65  *
66  * Queue scheme:
67  * A global runqueue and a global depletedqueue for each CPU pool.
68  * The runqueue holds all runnable VCPUs with budget,
69  * sorted by priority_level and deadline;
70  * The depletedqueue holds all VCPUs without budget, unsorted;
71  *
72  * Note: cpumask and cpupool are supported.
73  */
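
/*
 * Illustrative example (hypothetical values, not taken from the code):
 * VCPU A has (period, budget) = (10ms, 4ms) and VCPU B has (20ms, 5ms),
 * both replenished at t = 0.  A's current deadline is t = 10ms and B's is
 * t = 20ms, so A has the earlier deadline and is picked first; once A has
 * burned its 4ms of budget, B runs.  At t = 10ms A gets a fresh 4ms budget
 * and a new deadline of t = 20ms; A and B are then compared first by
 * priority_level and, on a tie, by these deadlines.
 */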
74 
75 /*
76  * Locking:
77  * A global system lock is used to protect the RunQ and DepletedQ.
78  * The global lock is referenced by schedule_data.schedule_lock
79  * from all physical cpus.
80  *
81  * The lock is already grabbed when calling the wake/sleep/schedule functions
82  * in schedule.c
83  *
84  * The functions that involve the RunQ and need to grab the lock are:
85  *    vcpu_insert, vcpu_remove, context_saved, runq_insert
86  */
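
/*
 * For example (as done in this file): rt_dom_cntl() and the dump routines
 * take the global lock directly with spin_lock_irqsave(&prv->lock, flags),
 * while the vcpu insert/remove paths reach the same lock through
 * vcpu_schedule_lock_irq(), since schedule_data.schedule_lock points at it.
 */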
87 
88 
89 /*
90  * Default parameters:
91  * The default period and budget are 10 ms and 4 ms, respectively.
92  */
93 #define RTDS_DEFAULT_PERIOD     (MICROSECS(10000))
94 #define RTDS_DEFAULT_BUDGET     (MICROSECS(4000))
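
/*
 * With these defaults a VCPU reserves 4ms of CPU time in every 10ms window,
 * i.e. a utilization of 40% (an illustration of the parameters above, not a
 * recommendation).
 */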
95 
96 /*
97  * Max period: the max delta of the time type; because the period is added to
98  * the time a vcpu activates, it must not overflow.
99  * Min period: 10 us, considering the scheduling overhead (when period is
100  * too low, scheduling is invoked too frequently, causing high overhead).
101  */
102 #define RTDS_MAX_PERIOD     (STIME_DELTA_MAX)
103 #define RTDS_MIN_PERIOD     (MICROSECS(10))
104 
105 /*
106  * Min budget: 10 us, considering the scheduling overhead (when budget is
107  * consumed too fast, scheduling is invoked too frequently, causing
108  * high overhead).
109  */
110 #define RTDS_MIN_BUDGET     (MICROSECS(10))
111 
112 /*
113  * UPDATE_LIMIT_SHIFT: a constant used in rt_update_deadline(). When finding
114  * the next deadline, performing addition could be faster if the difference
115  * between cur_deadline and now is small. If the difference is bigger than
116  * 1024 * period, use multiplication.
117  */
118 #define UPDATE_LIMIT_SHIFT      10
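
/*
 * For example (illustrative numbers): with a 10ms period, the repeated-
 * addition path in rt_update_deadline() is used as long as "now" is within
 * 1024 * 10ms ~= 10.24s of cur_deadline; if the vcpu has been behind longer
 * than that, a single division computes how many periods were missed and
 * the deadline is advanced in one step.
 */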
119 
120 /*
121  * Flags
122  */
123 /*
124  * RTDS_scheduled: Is this vcpu either running on, or context-switching off,
125  * a physical cpu?
126  * + Accessed only with global lock held.
127  * + Set when chosen as next in rt_schedule().
128  * + Cleared after context switch has been saved in rt_context_saved()
129  * + Checked in vcpu_wake to see if we can add to the Runqueue, or if we should
130  *   set RTDS_delayed_runq_add
131  * + Checked to be false in runq_insert.
132  */
133 #define __RTDS_scheduled            1
134 #define RTDS_scheduled (1<<__RTDS_scheduled)
135 /*
136  * RTDS_delayed_runq_add: Do we need to add this to the RunQ/DepletedQ
137  * once it's done being context switched out?
138  * + Set when scheduling out in rt_schedule() if prev is runnable
139  * + Set in rt_vcpu_wake if it finds RTDS_scheduled set
140  * + Read in rt_context_saved(). If set, it adds prev to the Runqueue/DepletedQ
141  *   and clears the bit.
142  */
143 #define __RTDS_delayed_runq_add     2
144 #define RTDS_delayed_runq_add (1<<__RTDS_delayed_runq_add)
145 
146 /*
147  * RTDS_depleted: Has this vcpu run out of budget?
148  * This flag is
149  * + set in burn_budget() if a vcpu has zero budget left;
150  * + cleared and checked in the replenishment handler,
151  *   for the vcpus that are being replenished.
152  */
153 #define __RTDS_depleted     3
154 #define RTDS_depleted (1<<__RTDS_depleted)
155 
156 /*
157  * RTDS_extratime: Can the vcpu run in the time that is
158  * not part of any real-time reservation, and would therefore
159  * be otherwise left idle?
160  */
161 #define __RTDS_extratime    4
162 #define RTDS_extratime (1<<__RTDS_extratime)
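
/*
 * Note on usage (as done throughout this file): the __RTDS_* constants are
 * bit numbers, for use with __set_bit()/__clear_bit()/__test_and_clear_bit()
 * on svc->flags, while the RTDS_* constants are the corresponding masks, for
 * direct tests such as "svc->flags & RTDS_extratime".
 */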
163 
164 /*
165  * rt tracing events ("only" 512 available!). Check
166  * include/public/trace.h for more details.
167  */
168 #define TRC_RTDS_TICKLE           TRC_SCHED_CLASS_EVT(RTDS, 1)
169 #define TRC_RTDS_RUNQ_PICK        TRC_SCHED_CLASS_EVT(RTDS, 2)
170 #define TRC_RTDS_BUDGET_BURN      TRC_SCHED_CLASS_EVT(RTDS, 3)
171 #define TRC_RTDS_BUDGET_REPLENISH TRC_SCHED_CLASS_EVT(RTDS, 4)
172 #define TRC_RTDS_SCHED_TASKLET    TRC_SCHED_CLASS_EVT(RTDS, 5)
173 #define TRC_RTDS_SCHEDULE         TRC_SCHED_CLASS_EVT(RTDS, 6)
174 
175 static void repl_timer_handler(void *data);
176 
177 /*
178  * System-wide private data, including the global RunQueue/DepletedQ.
179  * Global lock is referenced by schedule_data.schedule_lock from all
180  * physical cpus. It can be grabbed via vcpu_schedule_lock_irq()
181  */
182 struct rt_private {
183     spinlock_t lock;            /* the global coarse-grained lock */
184     struct list_head sdom;      /* list of available domains, used for dump */
185 
186     struct list_head runq;      /* ordered list of runnable vcpus */
187     struct list_head depletedq; /* unordered list of depleted vcpus */
188 
189     struct timer *repl_timer;   /* replenishment timer */
190     struct list_head replq;     /* ordered list of vcpus that need replenishment */
191 
192     cpumask_t tickled;          /* cpus that have been tickled */
193 };
194 
195 /*
196  * Virtual CPU
197  */
198 struct rt_vcpu {
199     struct list_head q_elem;     /* on the runq/depletedq list */
200     struct list_head replq_elem; /* on the replenishment events list */
201 
202     /* VCPU parameters, in nanoseconds */
203     s_time_t period;
204     s_time_t budget;
205 
206     /* VCPU's current information, in nanoseconds */
207     s_time_t cur_budget;         /* current budget */
208     s_time_t last_start;         /* last start time */
209     s_time_t cur_deadline;       /* current deadline for EDF */
210 
211     /* Up-pointers */
212     struct rt_dom *sdom;
213     struct vcpu *vcpu;
214 
215     unsigned priority_level;
216 
217     unsigned flags;              /* mark __RTDS_scheduled, etc.. */
218 };
219 
220 /*
221  * Domain
222  */
223 struct rt_dom {
224     struct list_head sdom_elem; /* element on the rt_priv sdom list */
225     struct domain *dom;         /* pointer to upper domain */
226 };
227 
228 /*
229  * Useful inline functions
230  */
231 static inline struct rt_private *rt_priv(const struct scheduler *ops)
232 {
233     return ops->sched_data;
234 }
235 
236 static inline struct rt_vcpu *rt_vcpu(const struct vcpu *vcpu)
237 {
238     return vcpu->sched_priv;
239 }
240 
241 static inline struct rt_dom *rt_dom(const struct domain *dom)
242 {
243     return dom->sched_priv;
244 }
245 
246 static inline struct list_head *rt_runq(const struct scheduler *ops)
247 {
248     return &rt_priv(ops)->runq;
249 }
250 
251 static inline struct list_head *rt_depletedq(const struct scheduler *ops)
252 {
253     return &rt_priv(ops)->depletedq;
254 }
255 
256 static inline struct list_head *rt_replq(const struct scheduler *ops)
257 {
258     return &rt_priv(ops)->replq;
259 }
260 
261 static inline bool has_extratime(const struct rt_vcpu *svc)
262 {
263     return svc->flags & RTDS_extratime;
264 }
265 
266 /*
267  * Helper functions for manipulating the runqueue, the depleted queue,
268  * and the replenishment events queue.
269  */
270 static int
271 vcpu_on_q(const struct rt_vcpu *svc)
272 {
273    return !list_empty(&svc->q_elem);
274 }
275 
276 static struct rt_vcpu *
277 q_elem(struct list_head *elem)
278 {
279     return list_entry(elem, struct rt_vcpu, q_elem);
280 }
281 
282 static struct rt_vcpu *
283 replq_elem(struct list_head *elem)
284 {
285     return list_entry(elem, struct rt_vcpu, replq_elem);
286 }
287 
288 static int
289 vcpu_on_replq(const struct rt_vcpu *svc)
290 {
291     return !list_empty(&svc->replq_elem);
292 }
293 
294 /*
295  * If v1's priority is higher than v2's, the return value is > 0;
296  * if they are equal, it is 0; otherwise, it is < 0.
297  */
298 static s_time_t
299 compare_vcpu_priority(const struct rt_vcpu *v1, const struct rt_vcpu *v2)
300 {
301     int prio = v2->priority_level - v1->priority_level;
302 
303     if ( prio == 0 )
304         return v2->cur_deadline - v1->cur_deadline;
305 
306     return prio;
307 }
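
/*
 * For example (hypothetical values): if v1 and v2 are both at
 * priority_level 0, with v1->cur_deadline = 20ms and v2->cur_deadline = 30ms,
 * the function returns 30ms - 20ms > 0, i.e. v1 (earlier deadline) has the
 * higher priority.  A vcpu running at a higher priority_level (i.e., in
 * extratime) always loses to one at a lower level, regardless of deadlines.
 */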
308 
309 /*
310  * Debug related code, dump vcpu/cpu information
311  */
312 static void
313 rt_dump_vcpu(const struct scheduler *ops, const struct rt_vcpu *svc)
314 {
315     cpumask_t *cpupool_mask, *mask;
316 
317     ASSERT(svc != NULL);
318     /* idle vcpu */
319     if( svc->sdom == NULL )
320     {
321         printk("\n");
322         return;
323     }
324 
325     /*
326      * We can't just use 'cpumask_scratch' because the dumping can
327      * happen from a pCPU outside of this scheduler's cpupool, and
328      * hence it's not right to use its pCPU's scratch mask.
329      * On the other hand, it is safe to use svc->vcpu->processor's
330      * own scratch space, since we hold the runqueue lock.
331      */
332     mask = cpumask_scratch_cpu(svc->vcpu->processor);
333 
334     cpupool_mask = cpupool_domain_cpumask(svc->vcpu->domain);
335     cpumask_and(mask, cpupool_mask, svc->vcpu->cpu_hard_affinity);
336     cpulist_scnprintf(keyhandler_scratch, sizeof(keyhandler_scratch), mask);
337     printk("[%5d.%-2u] cpu %u, (%"PRI_stime", %"PRI_stime"),"
338            " cur_b=%"PRI_stime" cur_d=%"PRI_stime" last_start=%"PRI_stime"\n"
339            " \t\t priority_level=%d has_extratime=%d\n"
340            " \t\t onQ=%d runnable=%d flags=%x effective hard_affinity=%s\n",
341             svc->vcpu->domain->domain_id,
342             svc->vcpu->vcpu_id,
343             svc->vcpu->processor,
344             svc->period,
345             svc->budget,
346             svc->cur_budget,
347             svc->cur_deadline,
348             svc->last_start,
349             svc->priority_level,
350             has_extratime(svc),
351             vcpu_on_q(svc),
352             vcpu_runnable(svc->vcpu),
353             svc->flags,
354             keyhandler_scratch);
355 }
356 
357 static void
358 rt_dump_pcpu(const struct scheduler *ops, int cpu)
359 {
360     struct rt_private *prv = rt_priv(ops);
361     struct rt_vcpu *svc;
362     unsigned long flags;
363 
364     spin_lock_irqsave(&prv->lock, flags);
365     printk("CPU[%02d]\n", cpu);
366     /* current VCPU (nothing to say if that's the idle vcpu). */
367     svc = rt_vcpu(curr_on_cpu(cpu));
368     if ( svc && !is_idle_vcpu(svc->vcpu) )
369     {
370         rt_dump_vcpu(ops, svc);
371     }
372     spin_unlock_irqrestore(&prv->lock, flags);
373 }
374 
375 static void
376 rt_dump(const struct scheduler *ops)
377 {
378     struct list_head *runq, *depletedq, *replq, *iter;
379     struct rt_private *prv = rt_priv(ops);
380     struct rt_vcpu *svc;
381     struct rt_dom *sdom;
382     unsigned long flags;
383 
384     spin_lock_irqsave(&prv->lock, flags);
385 
386     if ( list_empty(&prv->sdom) )
387         goto out;
388 
389     runq = rt_runq(ops);
390     depletedq = rt_depletedq(ops);
391     replq = rt_replq(ops);
392 
393     printk("Global RunQueue info:\n");
394     list_for_each ( iter, runq )
395     {
396         svc = q_elem(iter);
397         rt_dump_vcpu(ops, svc);
398     }
399 
400     printk("Global DepletedQueue info:\n");
401     list_for_each ( iter, depletedq )
402     {
403         svc = q_elem(iter);
404         rt_dump_vcpu(ops, svc);
405     }
406 
407     printk("Global Replenishment Events info:\n");
408     list_for_each ( iter, replq )
409     {
410         svc = replq_elem(iter);
411         rt_dump_vcpu(ops, svc);
412     }
413 
414     printk("Domain info:\n");
415     list_for_each ( iter, &prv->sdom )
416     {
417         struct vcpu *v;
418 
419         sdom = list_entry(iter, struct rt_dom, sdom_elem);
420         printk("\tdomain: %d\n", sdom->dom->domain_id);
421 
422         for_each_vcpu ( sdom->dom, v )
423         {
424             svc = rt_vcpu(v);
425             rt_dump_vcpu(ops, svc);
426         }
427     }
428 
429  out:
430     spin_unlock_irqrestore(&prv->lock, flags);
431 }
432 
433 /*
434  * Update the deadline and budget when now >= cur_deadline;
435  * the deadline needs to be advanced to that of the current period.
436  */
437 static void
438 rt_update_deadline(s_time_t now, struct rt_vcpu *svc)
439 {
440     ASSERT(now >= svc->cur_deadline);
441     ASSERT(svc->period != 0);
442 
443     if ( svc->cur_deadline + (svc->period << UPDATE_LIMIT_SHIFT) > now )
444     {
445         do
446             svc->cur_deadline += svc->period;
447         while ( svc->cur_deadline <= now );
448     }
449     else
450     {
451         long count = ((now - svc->cur_deadline) / svc->period) + 1;
452         svc->cur_deadline += count * svc->period;
453     }
454 
455     /*
456      * svc may be scheduled to run immediately after it misses its deadline.
457      * Then rt_update_deadline is called before rt_schedule, which
458      * should only deduct the time spent in the current period from the budget.
459      */
460     svc->last_start = now;
461     svc->cur_budget = svc->budget;
462     svc->priority_level = 0;
463 
464     /* TRACE */
465     {
466         struct __packed {
467             unsigned vcpu:16, dom:16;
468             unsigned priority_level;
469             uint64_t cur_deadline, cur_budget;
470         } d;
471         d.dom = svc->vcpu->domain->domain_id;
472         d.vcpu = svc->vcpu->vcpu_id;
473         d.priority_level = svc->priority_level;
474         d.cur_deadline = (uint64_t) svc->cur_deadline;
475         d.cur_budget = (uint64_t) svc->cur_budget;
476         trace_var(TRC_RTDS_BUDGET_REPLENISH, 1,
477                   sizeof(d),
478                   (unsigned char *) &d);
479     }
480 
481     return;
482 }
483 
484 /*
485  * Helpers for removing and inserting a vcpu in a queue
486  * that is being kept ordered by the vcpus' deadlines (as EDF
487  * mandates).
488  *
489  * For callers' convenience, the vcpu removing helper returns
490  * true if the vcpu removed was the one at the front of the
491  * queue; similarly, the inserting helper returns true if the
492  * inserted one ended up at the front of the queue (i.e., in both
493  * cases, if the vcpu with the earliest deadline is what we
494  * are dealing with).
495  */
496 static inline bool
497 deadline_queue_remove(struct list_head *queue, struct list_head *elem)
498 {
499     int pos = 0;
500 
501     if ( queue->next != elem )
502         pos = 1;
503 
504     list_del_init(elem);
505     return !pos;
506 }
507 
508 static inline bool
509 deadline_queue_insert(struct rt_vcpu * (*qelem)(struct list_head *),
510                       struct rt_vcpu *svc, struct list_head *elem,
511                       struct list_head *queue)
512 {
513     struct list_head *iter;
514     int pos = 0;
515 
516     list_for_each ( iter, queue )
517     {
518         struct rt_vcpu * iter_svc = (*qelem)(iter);
519         if ( compare_vcpu_priority(svc, iter_svc) > 0 )
520             break;
521         pos++;
522     }
523     list_add_tail(elem, iter);
524     return !pos;
525 }
526 #define deadline_runq_insert(...) \
527   deadline_queue_insert(&q_elem, ##__VA_ARGS__)
528 #define deadline_replq_insert(...) \
529   deadline_queue_insert(&replq_elem, ##__VA_ARGS__)
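
/*
 * For instance, the call "deadline_runq_insert(svc, &svc->q_elem, runq)"
 * used below expands to
 * "deadline_queue_insert(&q_elem, svc, &svc->q_elem, runq)", so the same
 * ordered-insert logic serves both the run queue and the replenishment
 * queue, differing only in which list_head-to-rt_vcpu accessor is passed.
 */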
530 
531 static inline void
532 q_remove(struct rt_vcpu *svc)
533 {
534     ASSERT( vcpu_on_q(svc) );
535     list_del_init(&svc->q_elem);
536 }
537 
538 static inline void
539 replq_remove(const struct scheduler *ops, struct rt_vcpu *svc)
540 {
541     struct rt_private *prv = rt_priv(ops);
542     struct list_head *replq = rt_replq(ops);
543 
544     ASSERT( vcpu_on_replq(svc) );
545 
546     if ( deadline_queue_remove(replq, &svc->replq_elem) )
547     {
548         /*
549          * The replenishment timer needs to be set to fire when a
550          * replenishment for the vcpu at the front of the replenishment
551          * queue is due. If that is the vcpu we just removed, we may
552          * need to reprogram the timer.
553          */
554         if ( !list_empty(replq) )
555         {
556             struct rt_vcpu *svc_next = replq_elem(replq->next);
557             set_timer(prv->repl_timer, svc_next->cur_deadline);
558         }
559         else
560             stop_timer(prv->repl_timer);
561     }
562 }
563 
564 /*
565  * Insert svc with remaining budget into the RunQ according to EDF:
566  * vcpus with smaller deadlines go first.
567  * Insert svc without budget into the DepletedQ, unsorted.
568  */
569 static void
570 runq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
571 {
572     struct rt_private *prv = rt_priv(ops);
573     struct list_head *runq = rt_runq(ops);
574 
575     ASSERT( spin_is_locked(&prv->lock) );
576     ASSERT( !vcpu_on_q(svc) );
577     ASSERT( vcpu_on_replq(svc) );
578 
579     /* add svc to runq if svc still has budget or its extratime is set */
580     if ( svc->cur_budget > 0 ||
581          has_extratime(svc) )
582         deadline_runq_insert(svc, &svc->q_elem, runq);
583     else
584         list_add(&svc->q_elem, &prv->depletedq);
585 }
586 
587 static void
588 replq_insert(const struct scheduler *ops, struct rt_vcpu *svc)
589 {
590     struct list_head *replq = rt_replq(ops);
591     struct rt_private *prv = rt_priv(ops);
592 
593     ASSERT( !vcpu_on_replq(svc) );
594 
595     /*
596      * The timer may be re-programmed if svc is inserted
597      * at the front of the event list.
598      */
599     if ( deadline_replq_insert(svc, &svc->replq_elem, replq) )
600         set_timer(prv->repl_timer, svc->cur_deadline);
601 }
602 
603 /*
604  * Removes and re-inserts an event to the replenishment queue.
605  * The aim is to update its position inside the queue, as its
606  * deadline (and hence its replenishment time) could have
607  * changed.
608  */
609 static void
610 replq_reinsert(const struct scheduler *ops, struct rt_vcpu *svc)
611 {
612     struct list_head *replq = rt_replq(ops);
613     struct rt_vcpu *rearm_svc = svc;
614     bool_t rearm = 0;
615 
616     ASSERT( vcpu_on_replq(svc) );
617 
618     /*
619      * If svc was at the front of the replenishment queue, we certainly
620      * need to re-program the timer, and we want to use the deadline of
621      * the vcpu which is now at the front of the queue (which may still
622      * be svc or not).
623      *
624      * We may also need to re-program, if svc has been put at the front
625      * of the replenishment queue when being re-inserted.
626      */
627     if ( deadline_queue_remove(replq, &svc->replq_elem) )
628     {
629         deadline_replq_insert(svc, &svc->replq_elem, replq);
630         rearm_svc = replq_elem(replq->next);
631         rearm = 1;
632     }
633     else
634         rearm = deadline_replq_insert(svc, &svc->replq_elem, replq);
635 
636     if ( rearm )
637         set_timer(rt_priv(ops)->repl_timer, rearm_svc->cur_deadline);
638 }
639 
640 /*
641  * Pick a valid CPU for the vcpu vc
642  * A valid CPU for a vcpu is in the intersection of the vcpu's affinity
643  * and the available cpus.
644  */
645 static int
646 rt_cpu_pick(const struct scheduler *ops, struct vcpu *vc)
647 {
648     cpumask_t cpus;
649     cpumask_t *online;
650     int cpu;
651 
652     online = cpupool_domain_cpumask(vc->domain);
653     cpumask_and(&cpus, online, vc->cpu_hard_affinity);
654 
655     cpu = cpumask_test_cpu(vc->processor, &cpus)
656             ? vc->processor
657             : cpumask_cycle(vc->processor, &cpus);
658     ASSERT( !cpumask_empty(&cpus) && cpumask_test_cpu(cpu, &cpus) );
659 
660     return cpu;
661 }
662 
663 /*
664  * Init/Free related code
665  */
666 static int
667 rt_init(struct scheduler *ops)
668 {
669     int rc = -ENOMEM;
670     struct rt_private *prv = xzalloc(struct rt_private);
671 
672     printk("Initializing RTDS scheduler\n"
673            "WARNING: This is experimental software in development.\n"
674            "Use at your own risk.\n");
675 
676     if ( prv == NULL )
677         goto err;
678 
679     prv->repl_timer = xzalloc(struct timer);
680     if ( prv->repl_timer == NULL )
681         goto err;
682 
683     spin_lock_init(&prv->lock);
684     INIT_LIST_HEAD(&prv->sdom);
685     INIT_LIST_HEAD(&prv->runq);
686     INIT_LIST_HEAD(&prv->depletedq);
687     INIT_LIST_HEAD(&prv->replq);
688 
689     cpumask_clear(&prv->tickled);
690 
691     ops->sched_data = prv;
692     rc = 0;
693 
694  err:
695     if ( rc && prv )
696     {
697         xfree(prv->repl_timer);
698         xfree(prv);
699     }
700 
701     return rc;
702 }
703 
704 static void
705 rt_deinit(struct scheduler *ops)
706 {
707     struct rt_private *prv = rt_priv(ops);
708 
709     ASSERT(prv->repl_timer->status == TIMER_STATUS_invalid ||
710            prv->repl_timer->status == TIMER_STATUS_killed);
711     xfree(prv->repl_timer);
712 
713     ops->sched_data = NULL;
714     xfree(prv);
715 }
716 
717 /*
718  * Point the per_cpu spinlock to the global system lock;
719  * all cpus share the same global system lock.
720  */
721 static void
722 rt_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
723 {
724     struct rt_private *prv = rt_priv(ops);
725     spinlock_t *old_lock;
726     unsigned long flags;
727 
728     old_lock = pcpu_schedule_lock_irqsave(cpu, &flags);
729 
730     /*
731      * TIMER_STATUS_invalid means we are the first cpu that sees the timer
732      * allocated but not initialized, and so it's up to us to initialize it.
733      */
734     if ( prv->repl_timer->status == TIMER_STATUS_invalid )
735     {
736         init_timer(prv->repl_timer, repl_timer_handler, (void*) ops, cpu);
737         dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
738     }
739 
740     /* Move the scheduler lock to our global runqueue lock.  */
741     per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
742 
743     /* _Not_ pcpu_schedule_unlock(): per_cpu().schedule_lock changed! */
744     spin_unlock_irqrestore(old_lock, flags);
745 }
746 
747 /* Change the scheduler of cpu to us (RTDS). */
748 static void
749 rt_switch_sched(struct scheduler *new_ops, unsigned int cpu,
750                 void *pdata, void *vdata)
751 {
752     struct rt_private *prv = rt_priv(new_ops);
753     struct rt_vcpu *svc = vdata;
754 
755     ASSERT(!pdata && svc && is_idle_vcpu(svc->vcpu));
756 
757     /*
758      * We are holding the runqueue lock already (it's been taken in
759      * schedule_cpu_switch()). It's actually the runqueue lock of
760      * another scheduler, but that is how things need to be, for
761      * preventing races.
762      */
763     ASSERT(per_cpu(schedule_data, cpu).schedule_lock != &prv->lock);
764 
765     /*
766      * If we are the absolute first cpu being switched toward this
767      * scheduler (in which case we'll see TIMER_STATUS_invalid), or the
768      * first one that is added back to the cpupool that had all its cpus
769      * removed (in which case we'll see TIMER_STATUS_killed), it's our
770      * job to (re)initialize the timer.
771      */
772     if ( prv->repl_timer->status == TIMER_STATUS_invalid ||
773          prv->repl_timer->status == TIMER_STATUS_killed )
774     {
775         init_timer(prv->repl_timer, repl_timer_handler, (void*) new_ops, cpu);
776         dprintk(XENLOG_DEBUG, "RTDS: timer initialized on cpu %u\n", cpu);
777     }
778 
779     idle_vcpu[cpu]->sched_priv = vdata;
780     per_cpu(scheduler, cpu) = new_ops;
781     per_cpu(schedule_data, cpu).sched_priv = NULL; /* no pdata */
782 
783     /*
784      * (Re?)route the lock to the per pCPU lock as /last/ thing. In fact,
785      * if it is free (and it can be) we want that anyone that manages
786      * taking it, find all the initializations we've done above in place.
787      */
788     smp_mb();
789     per_cpu(schedule_data, cpu).schedule_lock = &prv->lock;
790 }
791 
792 static void
793 rt_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
794 {
795     unsigned long flags;
796     struct rt_private *prv = rt_priv(ops);
797 
798     spin_lock_irqsave(&prv->lock, flags);
799 
800     if ( prv->repl_timer->cpu == cpu )
801     {
802         struct cpupool *c = per_cpu(cpupool, cpu);
803         unsigned int new_cpu = cpumask_cycle(cpu, cpupool_online_cpumask(c));
804 
805         /*
806          * Make sure the timer runs on one of the cpus that are still available
807          * to this scheduler. If there aren't any left, it means it's time
808          * to just kill it.
809          */
810         if ( new_cpu >= nr_cpu_ids )
811         {
812             kill_timer(prv->repl_timer);
813             dprintk(XENLOG_DEBUG, "RTDS: timer killed on cpu %d\n", cpu);
814         }
815         else
816         {
817             migrate_timer(prv->repl_timer, new_cpu);
818         }
819     }
820 
821     spin_unlock_irqrestore(&prv->lock, flags);
822 }
823 
824 static void *
825 rt_alloc_domdata(const struct scheduler *ops, struct domain *dom)
826 {
827     unsigned long flags;
828     struct rt_dom *sdom;
829     struct rt_private * prv = rt_priv(ops);
830 
831     sdom = xzalloc(struct rt_dom);
832     if ( sdom == NULL )
833         return NULL;
834 
835     INIT_LIST_HEAD(&sdom->sdom_elem);
836     sdom->dom = dom;
837 
838     /* spinlock here to insert the dom */
839     spin_lock_irqsave(&prv->lock, flags);
840     list_add_tail(&sdom->sdom_elem, &(prv->sdom));
841     spin_unlock_irqrestore(&prv->lock, flags);
842 
843     return sdom;
844 }
845 
846 static void
847 rt_free_domdata(const struct scheduler *ops, void *data)
848 {
849     unsigned long flags;
850     struct rt_dom *sdom = data;
851     struct rt_private *prv = rt_priv(ops);
852 
853     spin_lock_irqsave(&prv->lock, flags);
854     list_del_init(&sdom->sdom_elem);
855     spin_unlock_irqrestore(&prv->lock, flags);
856     xfree(data);
857 }
858 
859 static int
860 rt_dom_init(const struct scheduler *ops, struct domain *dom)
861 {
862     struct rt_dom *sdom;
863 
864     /* IDLE Domain does not link on rt_private */
865     if ( is_idle_domain(dom) )
866         return 0;
867 
868     sdom = rt_alloc_domdata(ops, dom);
869     if ( sdom == NULL )
870         return -ENOMEM;
871 
872     dom->sched_priv = sdom;
873 
874     return 0;
875 }
876 
877 static void
878 rt_dom_destroy(const struct scheduler *ops, struct domain *dom)
879 {
880     rt_free_domdata(ops, rt_dom(dom));
881 }
882 
883 static void *
884 rt_alloc_vdata(const struct scheduler *ops, struct vcpu *vc, void *dd)
885 {
886     struct rt_vcpu *svc;
887 
888     /* Allocate per-VCPU info */
889     svc = xzalloc(struct rt_vcpu);
890     if ( svc == NULL )
891         return NULL;
892 
893     INIT_LIST_HEAD(&svc->q_elem);
894     INIT_LIST_HEAD(&svc->replq_elem);
895     svc->flags = 0U;
896     svc->sdom = dd;
897     svc->vcpu = vc;
898     svc->last_start = 0;
899 
900     __set_bit(__RTDS_extratime, &svc->flags);
901     svc->priority_level = 0;
902     svc->period = RTDS_DEFAULT_PERIOD;
903     if ( !is_idle_vcpu(vc) )
904         svc->budget = RTDS_DEFAULT_BUDGET;
905 
906     SCHED_STAT_CRANK(vcpu_alloc);
907 
908     return svc;
909 }
910 
911 static void
912 rt_free_vdata(const struct scheduler *ops, void *priv)
913 {
914     struct rt_vcpu *svc = priv;
915 
916     xfree(svc);
917 }
918 
919 /*
920  * Called by sched_move_domain() and sched_init_vcpu()
921  * in schedule.c.
922  * When a domain is moved to a new cpupool, this inserts the vcpus
923  * of the moving domain into the RunQ of the scheduler of the
924  * destination cpupool.
925  */
926 static void
927 rt_vcpu_insert(const struct scheduler *ops, struct vcpu *vc)
928 {
929     struct rt_vcpu *svc = rt_vcpu(vc);
930     s_time_t now;
931     spinlock_t *lock;
932 
933     BUG_ON( is_idle_vcpu(vc) );
934 
935     /* This is safe because vc isn't yet being scheduled */
936     vc->processor = rt_cpu_pick(ops, vc);
937 
938     lock = vcpu_schedule_lock_irq(vc);
939 
940     now = NOW();
941     if ( now >= svc->cur_deadline )
942         rt_update_deadline(now, svc);
943 
944     if ( !vcpu_on_q(svc) && vcpu_runnable(vc) )
945     {
946         replq_insert(ops, svc);
947 
948         if ( !vc->is_running )
949             runq_insert(ops, svc);
950     }
951     vcpu_schedule_unlock_irq(lock, vc);
952 
953     SCHED_STAT_CRANK(vcpu_insert);
954 }
955 
956 /*
957  * Remove rt_vcpu svc from the old scheduler in source cpupool.
958  */
959 static void
960 rt_vcpu_remove(const struct scheduler *ops, struct vcpu *vc)
961 {
962     struct rt_vcpu * const svc = rt_vcpu(vc);
963     struct rt_dom * const sdom = svc->sdom;
964     spinlock_t *lock;
965 
966     SCHED_STAT_CRANK(vcpu_remove);
967 
968     BUG_ON( sdom == NULL );
969 
970     lock = vcpu_schedule_lock_irq(vc);
971     if ( vcpu_on_q(svc) )
972         q_remove(svc);
973 
974     if ( vcpu_on_replq(svc) )
975         replq_remove(ops,svc);
976 
977     vcpu_schedule_unlock_irq(lock, vc);
978 }
979 
980 /*
981  * Burn budget in nanosecond granularity
982  */
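
/*
 * For example (illustrative numbers): a vcpu with cur_budget = 4ms that has
 * run for 1.5ms since last_start ends up with cur_budget = 2.5ms.  Once
 * cur_budget reaches 0, either the budget is refilled at a higher
 * priority_level (extratime case) or the vcpu is marked RTDS_depleted and
 * waits for its next replenishment.
 */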
983 static void
984 burn_budget(const struct scheduler *ops, struct rt_vcpu *svc, s_time_t now)
985 {
986     s_time_t delta;
987 
988     /* don't burn budget for idle VCPU */
989     if ( is_idle_vcpu(svc->vcpu) )
990         return;
991 
992     /* burn at nanoseconds level */
993     delta = now - svc->last_start;
994     /*
995      * delta < 0 only happens in nested virtualization;
996      * TODO: how should we handle delta < 0 in a better way?
997      */
998     if ( delta < 0 )
999     {
1000         printk("%s, ATTENTION: now is behind last_start! delta=%"PRI_stime"\n",
1001                 __func__, delta);
1002         svc->last_start = now;
1003         return;
1004     }
1005 
1006     svc->cur_budget -= delta;
1007     svc->last_start = now;
1008 
1009     if ( svc->cur_budget <= 0 )
1010     {
1011         if ( has_extratime(svc) )
1012         {
1013             svc->priority_level++;
1014             svc->cur_budget = svc->budget;
1015         }
1016         else
1017         {
1018             svc->cur_budget = 0;
1019             __set_bit(__RTDS_depleted, &svc->flags);
1020         }
1021     }
1022 
1023     /* TRACE */
1024     {
1025         struct __packed {
1026             unsigned vcpu:16, dom:16;
1027             uint64_t cur_budget;
1028             int delta;
1029             unsigned priority_level;
1030             bool has_extratime;
1031         } d;
1032         d.dom = svc->vcpu->domain->domain_id;
1033         d.vcpu = svc->vcpu->vcpu_id;
1034         d.cur_budget = (uint64_t) svc->cur_budget;
1035         d.delta = delta;
1036         d.priority_level = svc->priority_level;
1037         d.has_extratime = svc->flags & RTDS_extratime;
1038         trace_var(TRC_RTDS_BUDGET_BURN, 1,
1039                   sizeof(d),
1040                   (unsigned char *) &d);
1041     }
1042 }
1043 
1044 /*
1045  * The RunQ is sorted; pick the first vcpu that can run within the cpumask.
1046  * If there is none, return NULL. The lock is grabbed before calling this function.
1047  */
1048 static struct rt_vcpu *
1049 runq_pick(const struct scheduler *ops, const cpumask_t *mask)
1050 {
1051     struct list_head *runq = rt_runq(ops);
1052     struct list_head *iter;
1053     struct rt_vcpu *svc = NULL;
1054     struct rt_vcpu *iter_svc = NULL;
1055     cpumask_t cpu_common;
1056     cpumask_t *online;
1057 
1058     list_for_each ( iter, runq )
1059     {
1060         iter_svc = q_elem(iter);
1061 
1062         /* mask cpu_hard_affinity & cpupool & mask */
1063         online = cpupool_domain_cpumask(iter_svc->vcpu->domain);
1064         cpumask_and(&cpu_common, online, iter_svc->vcpu->cpu_hard_affinity);
1065         cpumask_and(&cpu_common, mask, &cpu_common);
1066         if ( cpumask_empty(&cpu_common) )
1067             continue;
1068 
1069         ASSERT( iter_svc->cur_budget > 0 );
1070 
1071         svc = iter_svc;
1072         break;
1073     }
1074 
1075     /* TRACE */
1076     {
1077         if( svc != NULL )
1078         {
1079             struct __packed {
1080                 unsigned vcpu:16, dom:16;
1081                 uint64_t cur_deadline, cur_budget;
1082             } d;
1083             d.dom = svc->vcpu->domain->domain_id;
1084             d.vcpu = svc->vcpu->vcpu_id;
1085             d.cur_deadline = (uint64_t) svc->cur_deadline;
1086             d.cur_budget = (uint64_t) svc->cur_budget;
1087             trace_var(TRC_RTDS_RUNQ_PICK, 1,
1088                       sizeof(d),
1089                       (unsigned char *) &d);
1090         }
1091     }
1092 
1093     return svc;
1094 }
1095 
1096 /*
1097  * schedule function for rt scheduler.
1098  * The lock is already grabbed in schedule.c, no need to lock here
1099  */
1100 static struct task_slice
1101 rt_schedule(const struct scheduler *ops, s_time_t now, bool_t tasklet_work_scheduled)
1102 {
1103     const int cpu = smp_processor_id();
1104     struct rt_private *prv = rt_priv(ops);
1105     struct rt_vcpu *const scurr = rt_vcpu(current);
1106     struct rt_vcpu *snext = NULL;
1107     struct task_slice ret = { .migrated = 0 };
1108 
1109     /* TRACE */
1110     {
1111         struct __packed {
1112             unsigned cpu:16, tasklet:8, tickled:4, idle:4;
1113         } d;
1114         d.cpu = cpu;
1115         d.tasklet = tasklet_work_scheduled;
1116         d.tickled = cpumask_test_cpu(cpu, &prv->tickled);
1117         d.idle = is_idle_vcpu(current);
1118         trace_var(TRC_RTDS_SCHEDULE, 1,
1119                   sizeof(d),
1120                   (unsigned char *)&d);
1121     }
1122 
1123     /* clear the tickled bit now that we've been scheduled */
1124     cpumask_clear_cpu(cpu, &prv->tickled);
1125 
1126     /* burn_budget() returns immediately for the idle VCPU */
1127     burn_budget(ops, scurr, now);
1128 
1129     if ( tasklet_work_scheduled )
1130     {
1131         trace_var(TRC_RTDS_SCHED_TASKLET, 1, 0,  NULL);
1132         snext = rt_vcpu(idle_vcpu[cpu]);
1133     }
1134     else
1135     {
1136         snext = runq_pick(ops, cpumask_of(cpu));
1137         if ( snext == NULL )
1138             snext = rt_vcpu(idle_vcpu[cpu]);
1139 
1140         /* if scurr has higher priority and budget, still pick scurr */
1141         if ( !is_idle_vcpu(current) &&
1142              vcpu_runnable(current) &&
1143              scurr->cur_budget > 0 &&
1144              ( is_idle_vcpu(snext->vcpu) ||
1145                compare_vcpu_priority(scurr, snext) > 0 ) )
1146             snext = scurr;
1147     }
1148 
1149     if ( snext != scurr &&
1150          !is_idle_vcpu(current) &&
1151          vcpu_runnable(current) )
1152         __set_bit(__RTDS_delayed_runq_add, &scurr->flags);
1153 
1154     snext->last_start = now;
1155     ret.time =  -1; /* if an idle vcpu is picked */
1156     if ( !is_idle_vcpu(snext->vcpu) )
1157     {
1158         if ( snext != scurr )
1159         {
1160             q_remove(snext);
1161             __set_bit(__RTDS_scheduled, &snext->flags);
1162         }
1163         if ( snext->vcpu->processor != cpu )
1164         {
1165             snext->vcpu->processor = cpu;
1166             ret.migrated = 1;
1167         }
1168         ret.time = snext->cur_budget; /* invoke the scheduler next time */
1169     }
1170     ret.task = snext->vcpu;
1171 
1172     return ret;
1173 }
1174 
1175 /*
1176  * Remove VCPU from RunQ
1177  * The lock is already grabbed in schedule.c, no need to lock here
1178  */
1179 static void
1180 rt_vcpu_sleep(const struct scheduler *ops, struct vcpu *vc)
1181 {
1182     struct rt_vcpu * const svc = rt_vcpu(vc);
1183 
1184     BUG_ON( is_idle_vcpu(vc) );
1185     SCHED_STAT_CRANK(vcpu_sleep);
1186 
1187     if ( curr_on_cpu(vc->processor) == vc )
1188         cpu_raise_softirq(vc->processor, SCHEDULE_SOFTIRQ);
1189     else if ( vcpu_on_q(svc) )
1190     {
1191         q_remove(svc);
1192         replq_remove(ops, svc);
1193     }
1194     else if ( svc->flags & RTDS_delayed_runq_add )
1195         __clear_bit(__RTDS_delayed_runq_add, &svc->flags);
1196 }
1197 
1198 /*
1199  * Pick a cpu where to run a vcpu,
1200  * possibly kicking out the vcpu running there
1201  * Called by wake() and context_saved()
1202  * We have a running candidate here, the kick logic is:
1203  * Among all the cpus that are within the cpu affinity
1204  * 1) if there are any idle CPUs, kick one.
1205  *    For cache benefit, we check new->cpu first.
1206  * 2) now all pcpus are busy;
1207  *    among all the running vcpus, pick lowest priority one
1208  *    if snext has higher priority, kick it.
1209  *
1210  * TODO:
1211  * 1) what if these two vcpus belong to the same domain?
1212  *    replacing a vcpu of the same domain introduces more overhead
1213  *
1214  * lock is grabbed before calling this function
1215  */
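
/*
 * Illustrative scenario (hypothetical): "new" wakes up with a deadline of
 * 15ms while pcpu0 is idle and pcpu1 runs a vcpu with a 30ms deadline; the
 * idle pcpu0 is tickled.  If instead both pcpus were busy with deadlines of
 * 12ms and 30ms, the pcpu running the 30ms-deadline (lowest priority) vcpu
 * would be tickled, since "new" has the earlier deadline.
 */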
1216 static void
1217 runq_tickle(const struct scheduler *ops, struct rt_vcpu *new)
1218 {
1219     struct rt_private *prv = rt_priv(ops);
1220     struct rt_vcpu *latest_deadline_vcpu = NULL; /* lowest priority */
1221     struct rt_vcpu *iter_svc;
1222     struct vcpu *iter_vc;
1223     int cpu = 0, cpu_to_tickle = 0;
1224     cpumask_t not_tickled;
1225     cpumask_t *online;
1226 
1227     if ( new == NULL || is_idle_vcpu(new->vcpu) )
1228         return;
1229 
1230     online = cpupool_domain_cpumask(new->vcpu->domain);
1231     cpumask_and(&not_tickled, online, new->vcpu->cpu_hard_affinity);
1232     cpumask_andnot(&not_tickled, &not_tickled, &prv->tickled);
1233 
1234     /*
1235      * 1) If there are any idle CPUs, kick one.
1236      *    For cache benefit, we first search new->cpu.
1237      *    The same loop also finds the one with the lowest priority.
1238      */
1239     cpu = cpumask_test_or_cycle(new->vcpu->processor, &not_tickled);
1240     while ( cpu!= nr_cpu_ids )
1241     {
1242         iter_vc = curr_on_cpu(cpu);
1243         if ( is_idle_vcpu(iter_vc) )
1244         {
1245             SCHED_STAT_CRANK(tickled_idle_cpu);
1246             cpu_to_tickle = cpu;
1247             goto out;
1248         }
1249         iter_svc = rt_vcpu(iter_vc);
1250         if ( latest_deadline_vcpu == NULL ||
1251              compare_vcpu_priority(iter_svc, latest_deadline_vcpu) < 0 )
1252             latest_deadline_vcpu = iter_svc;
1253 
1254         cpumask_clear_cpu(cpu, &not_tickled);
1255         cpu = cpumask_cycle(cpu, &not_tickled);
1256     }
1257 
1258     /* 2) candidate has higher priority, kick out the lowest priority vcpu */
1259     if ( latest_deadline_vcpu != NULL &&
1260          compare_vcpu_priority(latest_deadline_vcpu, new) < 0 )
1261     {
1262         SCHED_STAT_CRANK(tickled_busy_cpu);
1263         cpu_to_tickle = latest_deadline_vcpu->vcpu->processor;
1264         goto out;
1265     }
1266 
1267     /* didn't tickle any cpu */
1268     SCHED_STAT_CRANK(tickled_no_cpu);
1269     return;
1270  out:
1271     /* TRACE */
1272     {
1273         struct {
1274             unsigned cpu:16, pad:16;
1275         } d;
1276         d.cpu = cpu_to_tickle;
1277         d.pad = 0;
1278         trace_var(TRC_RTDS_TICKLE, 1,
1279                   sizeof(d),
1280                   (unsigned char *)&d);
1281     }
1282 
1283     cpumask_set_cpu(cpu_to_tickle, &prv->tickled);
1284     cpu_raise_softirq(cpu_to_tickle, SCHEDULE_SOFTIRQ);
1285     return;
1286 }
1287 
1288 /*
1289  * Should always wake up a runnable vcpu and put it back on the RunQ.
1290  * Check priority to raise interrupt
1291  * The lock is already grabbed in schedule.c, no need to lock here
1292  * TODO: what if these two vcpus belong to the same domain?
1293  */
1294 static void
1295 rt_vcpu_wake(const struct scheduler *ops, struct vcpu *vc)
1296 {
1297     struct rt_vcpu * const svc = rt_vcpu(vc);
1298     s_time_t now;
1299     bool_t missed;
1300 
1301     BUG_ON( is_idle_vcpu(vc) );
1302 
1303     if ( unlikely(curr_on_cpu(vc->processor) == vc) )
1304     {
1305         SCHED_STAT_CRANK(vcpu_wake_running);
1306         return;
1307     }
1308 
1309     /* already on the RunQ/DepletedQ; just updating the info is OK */
1310     if ( unlikely(vcpu_on_q(svc)) )
1311     {
1312         SCHED_STAT_CRANK(vcpu_wake_onrunq);
1313         return;
1314     }
1315 
1316     if ( likely(vcpu_runnable(vc)) )
1317         SCHED_STAT_CRANK(vcpu_wake_runnable);
1318     else
1319         SCHED_STAT_CRANK(vcpu_wake_not_runnable);
1320 
1321     /*
1322      * If a deadline passed while svc was asleep/blocked, we need new
1323      * scheduling parameters (a new deadline and full budget).
1324      */
1325     now = NOW();
1326 
1327     missed = ( now >= svc->cur_deadline );
1328     if ( missed )
1329         rt_update_deadline(now, svc);
1330 
1331     /*
1332      * If context hasn't been saved for this vcpu yet, we can't put it on
1333      * the run-queue/depleted-queue. Instead, we set the appropriate flag,
1334      * the vcpu will be put back on the queue after the context has been saved
1335      * (in rt_context_saved()).
1336      */
1337     if ( unlikely(svc->flags & RTDS_scheduled) )
1338     {
1339         __set_bit(__RTDS_delayed_runq_add, &svc->flags);
1340         /*
1341          * The vcpu is waking up already, and we didn't even have the time to
1342          * remove its next replenishment event from the replenishment queue
1343          * when it blocked! No big deal. If we did not miss the deadline in
1344          * the meantime, let's just leave it there. If we did, let's remove it
1345          * and queue a new one (to occur at our new deadline).
1346          */
1347         if ( missed )
1348            replq_reinsert(ops, svc);
1349         return;
1350     }
1351 
1352     /* Replenishment event got cancelled when we blocked. Add it back. */
1353     replq_insert(ops, svc);
1354     /* insert svc to runq/depletedq because svc is not in queue now */
1355     runq_insert(ops, svc);
1356 
1357     runq_tickle(ops, svc);
1358 }
1359 
1360 /*
1361  * scurr has finished its context switch; insert it back into the RunQ,
1362  * and then pick the highest-priority vcpu from the runq to run
1363  */
1364 static void
1365 rt_context_saved(const struct scheduler *ops, struct vcpu *vc)
1366 {
1367     struct rt_vcpu *svc = rt_vcpu(vc);
1368     spinlock_t *lock = vcpu_schedule_lock_irq(vc);
1369 
1370     __clear_bit(__RTDS_scheduled, &svc->flags);
1371     /* do not insert the idle vcpu into the runq */
1372     if ( is_idle_vcpu(vc) )
1373         goto out;
1374 
1375     if ( __test_and_clear_bit(__RTDS_delayed_runq_add, &svc->flags) &&
1376          likely(vcpu_runnable(vc)) )
1377     {
1378         runq_insert(ops, svc);
1379         runq_tickle(ops, svc);
1380     }
1381     else
1382         replq_remove(ops, svc);
1383 
1384 out:
1385     vcpu_schedule_unlock_irq(lock, vc);
1386 }
1387 
1388 /*
1389  * set/get each vcpu info of each domain
1390  */
1391 static int
1392 rt_dom_cntl(
1393     const struct scheduler *ops,
1394     struct domain *d,
1395     struct xen_domctl_scheduler_op *op)
1396 {
1397     struct rt_private *prv = rt_priv(ops);
1398     struct rt_vcpu *svc;
1399     struct vcpu *v;
1400     unsigned long flags;
1401     int rc = 0;
1402     struct xen_domctl_schedparam_vcpu local_sched;
1403     s_time_t period, budget;
1404     uint32_t index = 0;
1405 
1406     switch ( op->cmd )
1407     {
1408     case XEN_DOMCTL_SCHEDOP_getinfo:
1409         /* Return the default parameters. */
1410         op->u.rtds.period = RTDS_DEFAULT_PERIOD / MICROSECS(1);
1411         op->u.rtds.budget = RTDS_DEFAULT_BUDGET / MICROSECS(1);
1412         break;
1413     case XEN_DOMCTL_SCHEDOP_putinfo:
1414         if ( op->u.rtds.period == 0 || op->u.rtds.budget == 0 )
1415         {
1416             rc = -EINVAL;
1417             break;
1418         }
1419         spin_lock_irqsave(&prv->lock, flags);
1420         for_each_vcpu ( d, v )
1421         {
1422             svc = rt_vcpu(v);
1423             svc->period = MICROSECS(op->u.rtds.period); /* convert to nanoseconds */
1424             svc->budget = MICROSECS(op->u.rtds.budget);
1425         }
1426         spin_unlock_irqrestore(&prv->lock, flags);
1427         break;
1428     case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1429     case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1430         while ( index < op->u.v.nr_vcpus )
1431         {
1432             if ( copy_from_guest_offset(&local_sched,
1433                                         op->u.v.vcpus, index, 1) )
1434             {
1435                 rc = -EFAULT;
1436                 break;
1437             }
1438             if ( local_sched.vcpuid >= d->max_vcpus ||
1439                  d->vcpu[local_sched.vcpuid] == NULL )
1440             {
1441                 rc = -EINVAL;
1442                 break;
1443             }
1444 
1445             if ( op->cmd == XEN_DOMCTL_SCHEDOP_getvcpuinfo )
1446             {
1447                 spin_lock_irqsave(&prv->lock, flags);
1448                 svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
1449                 local_sched.u.rtds.budget = svc->budget / MICROSECS(1);
1450                 local_sched.u.rtds.period = svc->period / MICROSECS(1);
1451                 if ( has_extratime(svc) )
1452                     local_sched.u.rtds.flags |= XEN_DOMCTL_SCHEDRT_extra;
1453                 else
1454                     local_sched.u.rtds.flags &= ~XEN_DOMCTL_SCHEDRT_extra;
1455                 spin_unlock_irqrestore(&prv->lock, flags);
1456 
1457                 if ( copy_to_guest_offset(op->u.v.vcpus, index,
1458                                           &local_sched, 1) )
1459                 {
1460                     rc = -EFAULT;
1461                     break;
1462                 }
1463             }
1464             else
1465             {
1466                 period = MICROSECS(local_sched.u.rtds.period);
1467                 budget = MICROSECS(local_sched.u.rtds.budget);
1468                 if ( period > RTDS_MAX_PERIOD || budget < RTDS_MIN_BUDGET ||
1469                      budget > period || period < RTDS_MIN_PERIOD )
1470                 {
1471                     rc = -EINVAL;
1472                     break;
1473                 }
1474 
1475                 spin_lock_irqsave(&prv->lock, flags);
1476                 svc = rt_vcpu(d->vcpu[local_sched.vcpuid]);
1477                 svc->period = period;
1478                 svc->budget = budget;
1479                 if ( local_sched.u.rtds.flags & XEN_DOMCTL_SCHEDRT_extra )
1480                     __set_bit(__RTDS_extratime, &svc->flags);
1481                 else
1482                     __clear_bit(__RTDS_extratime, &svc->flags);
1483                 spin_unlock_irqrestore(&prv->lock, flags);
1484             }
1485             /* Process at most 64 vCPUs without checking for preemption. */
1486             if ( (++index > 63) && hypercall_preempt_check() )
1487                 break;
1488         }
1489         if ( !rc )
1490             /* notify upper caller how many vcpus have been processed. */
1491             op->u.v.nr_vcpus = index;
1492         break;
1493     }
1494 
1495     return rc;
1496 }
1497 
1498 /*
1499  * The replenishment timer handler picks vcpus
1500  * from the replq and does the actual replenishment.
1501  */
1502 static void repl_timer_handler(void *data){
1503     s_time_t now;
1504     struct scheduler *ops = data;
1505     struct rt_private *prv = rt_priv(ops);
1506     struct list_head *replq = rt_replq(ops);
1507     struct list_head *runq = rt_runq(ops);
1508     struct timer *repl_timer = prv->repl_timer;
1509     struct list_head *iter, *tmp;
1510     struct rt_vcpu *svc;
1511     LIST_HEAD(tmp_replq);
1512 
1513     spin_lock_irq(&prv->lock);
1514 
1515     now = NOW();
1516 
1517     /*
1518      * Do the replenishment and move replenished vcpus
1519      * to the temporary list to tickle.
1520      * If svc is on run queue, we need to put it at
1521      * the correct place since its deadline changes.
1522      */
1523     list_for_each_safe ( iter, tmp, replq )
1524     {
1525         svc = replq_elem(iter);
1526 
1527         if ( now < svc->cur_deadline )
1528             break;
1529 
1530         list_del(&svc->replq_elem);
1531         rt_update_deadline(now, svc);
1532         list_add(&svc->replq_elem, &tmp_replq);
1533 
1534         if ( vcpu_on_q(svc) )
1535         {
1536             q_remove(svc);
1537             runq_insert(ops, svc);
1538         }
1539     }
1540 
1541     /*
1542      * Iterate through the list of updated vcpus.
1543      * If an updated vcpu is running, tickle the head of the
1544      * runqueue if it has a higher priority.
1545      * If an updated vcpu was depleted and on the runqueue, tickle it.
1546      * Finally, reinsert the vcpus into the replenishment events list.
1547      */
1548     list_for_each_safe ( iter, tmp, &tmp_replq )
1549     {
1550         svc = replq_elem(iter);
1551 
1552         if ( curr_on_cpu(svc->vcpu->processor) == svc->vcpu &&
1553              !list_empty(runq) )
1554         {
1555             struct rt_vcpu *next_on_runq = q_elem(runq->next);
1556 
1557             if ( compare_vcpu_priority(svc, next_on_runq) < 0 )
1558                 runq_tickle(ops, next_on_runq);
1559         }
1560         else if ( __test_and_clear_bit(__RTDS_depleted, &svc->flags) &&
1561                   vcpu_on_q(svc) )
1562             runq_tickle(ops, svc);
1563 
1564         list_del(&svc->replq_elem);
1565         deadline_replq_insert(svc, &svc->replq_elem, replq);
1566     }
1567 
1568     /*
1569      * If there are vcpus left in the replenishment event list,
1570      * set the next replenishment to happen at the deadline of
1571      * the one in the front.
1572      */
1573     if ( !list_empty(replq) )
1574         set_timer(repl_timer, replq_elem(replq->next)->cur_deadline);
1575 
1576     spin_unlock_irq(&prv->lock);
1577 }
1578 
1579 static const struct scheduler sched_rtds_def = {
1580     .name           = "SMP RTDS Scheduler",
1581     .opt_name       = "rtds",
1582     .sched_id       = XEN_SCHEDULER_RTDS,
1583     .sched_data     = NULL,
1584 
1585     .dump_cpu_state = rt_dump_pcpu,
1586     .dump_settings  = rt_dump,
1587     .init           = rt_init,
1588     .deinit         = rt_deinit,
1589     .init_pdata     = rt_init_pdata,
1590     .switch_sched   = rt_switch_sched,
1591     .deinit_pdata   = rt_deinit_pdata,
1592     .alloc_domdata  = rt_alloc_domdata,
1593     .free_domdata   = rt_free_domdata,
1594     .init_domain    = rt_dom_init,
1595     .destroy_domain = rt_dom_destroy,
1596     .alloc_vdata    = rt_alloc_vdata,
1597     .free_vdata     = rt_free_vdata,
1598     .insert_vcpu    = rt_vcpu_insert,
1599     .remove_vcpu    = rt_vcpu_remove,
1600 
1601     .adjust         = rt_dom_cntl,
1602 
1603     .pick_cpu       = rt_cpu_pick,
1604     .do_schedule    = rt_schedule,
1605     .sleep          = rt_vcpu_sleep,
1606     .wake           = rt_vcpu_wake,
1607     .context_saved  = rt_context_saved,
1608 };
1609 
1610 REGISTER_SCHEDULER(sched_rtds_def);
1611