1 /****************************************************************************
2  * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3  * (C) 2002-2003 University of Cambridge
4  * (C) 2004      - Mark Williamson - Intel Research Cambridge
5  ****************************************************************************
6  *
7  *        File: common/schedule.c
8  *      Author: Rolf Neugebauer & Keir Fraser
9  *              Updated for generic API by Mark Williamson
10  *
11  * Description: Generic CPU scheduling code
12  *              implements support functionality for the Xen scheduler API.
13  *
14  */
15 
16 #ifndef COMPAT
17 #include <xen/init.h>
18 #include <xen/lib.h>
19 #include <xen/sched.h>
20 #include <xen/domain.h>
21 #include <xen/delay.h>
22 #include <xen/event.h>
23 #include <xen/time.h>
24 #include <xen/timer.h>
25 #include <xen/perfc.h>
26 #include <xen/sched-if.h>
27 #include <xen/softirq.h>
28 #include <xen/trace.h>
29 #include <xen/mm.h>
30 #include <xen/err.h>
31 #include <xen/guest_access.h>
32 #include <xen/hypercall.h>
33 #include <xen/multicall.h>
34 #include <xen/cpu.h>
35 #include <xen/preempt.h>
37 #include <public/sched.h>
38 #include <xsm/xsm.h>
40 
41 /* opt_sched: scheduler - default to configured value */
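/* It can be overridden on the Xen command line, e.g. "sched=credit2". */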
42 static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
43 string_param("sched", opt_sched);
44 
45 /* If sched_smt_power_savings is set, the scheduler will give preference
46  * to a partially idle package over a fully idle package when picking a
47  * pCPU on which to schedule a vCPU.
48  */
49 bool_t sched_smt_power_savings = 0;
50 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
51 
52 /* Default scheduling rate limit: 1ms.
53  * The behavior when sched_ratelimit_us is greater than
54  * sched_credit_tslice_ms is undefined. */
55 int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
56 integer_param("sched_ratelimit_us", sched_ratelimit_us);
57 /* Various timer handlers. */
58 static void s_timer_fn(void *unused);
59 static void vcpu_periodic_timer_fn(void *data);
60 static void vcpu_singleshot_timer_fn(void *data);
61 static void poll_timer_fn(void *data);
62 
63 /* This is global for now so that private implementations can reach it */
64 DEFINE_PER_CPU(struct schedule_data, schedule_data);
65 DEFINE_PER_CPU(struct scheduler *, scheduler);
66 
67 /* Scratch space for cpumasks. */
68 DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
69 
70 extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
71 #define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
72 #define schedulers __start_schedulers_array
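/*
 * Each scheduler implementation contributes a pointer to this
 * linker-generated array (via REGISTER_SCHEDULER() in xen/sched-if.h);
 * scheduler_init() below walks it to pick the boot-time scheduler.
 */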
73 
74 static struct scheduler __read_mostly ops;
75 
76 #define SCHED_OP(opsptr, fn, ...)                                          \
77          (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ )  \
78           : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
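/*
 * For example, SCHED_OP(vcpu_scheduler(v), wake, v) invokes the scheduler's
 * wake hook if it is implemented, and otherwise evaluates to a zero value
 * of the hook's return type.
 */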
79 
80 static inline struct scheduler *dom_scheduler(const struct domain *d)
81 {
82     if ( likely(d->cpupool != NULL) )
83         return d->cpupool->sched;
84 
85     /*
86      * If d->cpupool is NULL, this is the idle domain. This is special
87      * because the idle domain does not really belong to any cpupool, and,
88      * hence, does not really have a scheduler.
89      *
90      * This is (should be!) only called like this for allocating the idle
91      * vCPUs for the first time, during boot, in which case what we want
92      * is the default scheduler that was chosen at boot.
93      */
94     ASSERT(is_idle_domain(d));
95     return &ops;
96 }
97 
98 static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
99 {
100     struct domain *d = v->domain;
101 
102     if ( likely(d->cpupool != NULL) )
103         return d->cpupool->sched;
104 
105     /*
106      * If d->cpupool is NULL, this is a vCPU of the idle domain. And this
107      * case is special because the idle domain does not really belong to
108      * a cpupool and, hence, doesn't really have a scheduler. In fact, its
109      * vCPUs (may) run on pCPUs which are in different pools, with different
110      * schedulers.
111      *
112      * What we want, in this case, is the scheduler of the pCPU where this
113      * particular idle vCPU is running. And, since v->processor never changes
114      * for idle vCPUs, it is safe to use it, with no locks, to figure that out.
115      */
116     ASSERT(is_idle_domain(d));
117     return per_cpu(scheduler, v->processor);
118 }
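/*
 * The set of pCPUs available to the vCPU's domain, i.e. (normally) the
 * online CPUs of its cpupool.
 */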
119 #define VCPU2ONLINE(_v) cpupool_domain_cpumask((_v)->domain)
120 
121 static inline void trace_runstate_change(struct vcpu *v, int new_state)
122 {
123     struct { uint32_t vcpu:16, domain:16; } d;
124     uint32_t event;
125 
126     if ( likely(!tb_init_done) )
127         return;
128 
129     d.vcpu = v->vcpu_id;
130     d.domain = v->domain->domain_id;
131 
132     event = TRC_SCHED_RUNSTATE_CHANGE;
133     event |= ( v->runstate.state & 0x3 ) << 8;
134     event |= ( new_state & 0x3 ) << 4;
135 
136     __trace_var(event, 1/*tsc*/, sizeof(d), &d);
137 }
138 
139 static inline void trace_continue_running(struct vcpu *v)
140 {
141     struct { uint32_t vcpu:16, domain:16; } d;
142 
143     if ( likely(!tb_init_done) )
144         return;
145 
146     d.vcpu = v->vcpu_id;
147     d.domain = v->domain->domain_id;
148 
149     __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
150 }
151 
152 static inline void vcpu_urgent_count_update(struct vcpu *v)
153 {
154     if ( is_idle_vcpu(v) )
155         return;
156 
157     if ( unlikely(v->is_urgent) )
158     {
159         if ( !(v->pause_flags & VPF_blocked) ||
160              !test_bit(v->vcpu_id, v->domain->poll_mask) )
161         {
162             v->is_urgent = 0;
163             atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
164         }
165     }
166     else
167     {
168         if ( unlikely(v->pause_flags & VPF_blocked) &&
169              unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
170         {
171             v->is_urgent = 1;
172             atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
173         }
174     }
175 }
176 
177 static inline void vcpu_runstate_change(
178     struct vcpu *v, int new_state, s_time_t new_entry_time)
179 {
180     s_time_t delta;
181 
182     ASSERT(v->runstate.state != new_state);
183     ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
184 
185     vcpu_urgent_count_update(v);
186 
187     trace_runstate_change(v, new_state);
188 
189     delta = new_entry_time - v->runstate.state_entry_time;
190     if ( delta > 0 )
191     {
192         v->runstate.time[v->runstate.state] += delta;
193         v->runstate.state_entry_time = new_entry_time;
194     }
195 
196     v->runstate.state = new_state;
197 }
198 
199 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
200 {
201     spinlock_t *lock = likely(v == current) ? NULL : vcpu_schedule_lock_irq(v);
202     s_time_t delta;
203 
204     memcpy(runstate, &v->runstate, sizeof(*runstate));
205     delta = NOW() - runstate->state_entry_time;
206     if ( delta > 0 )
207         runstate->time[runstate->state] += delta;
208 
209     if ( unlikely(lock != NULL) )
210         vcpu_schedule_unlock_irq(lock, v);
211 }
212 
213 uint64_t get_cpu_idle_time(unsigned int cpu)
214 {
215     struct vcpu_runstate_info state = { 0 };
216     struct vcpu *v = idle_vcpu[cpu];
217 
218     if ( cpu_online(cpu) && v )
219         vcpu_runstate_get(v, &state);
220 
221     return state.time[RUNSTATE_running];
222 }
223 
224 /*
225  * If locks are different, take the one with the lower address first.
226  * This avoids dead- or live-locks when this code is running on both
227  * cpus at the same time.
228  */
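/*
 * E.g. if one CPU wants locks A and B while another wants B and A, both will
 * attempt the lower-addressed lock first, so neither can end up spinning on
 * a lock while holding the one its peer needs.
 */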
229 static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
230                                    unsigned long *flags)
231 {
232     if ( lock1 == lock2 )
233     {
234         spin_lock_irqsave(lock1, *flags);
235     }
236     else if ( lock1 < lock2 )
237     {
238         spin_lock_irqsave(lock1, *flags);
239         spin_lock(lock2);
240     }
241     else
242     {
243         spin_lock_irqsave(lock2, *flags);
244         spin_lock(lock1);
245     }
246 }
247 
248 static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
249                                      unsigned long flags)
250 {
251     if ( lock1 != lock2 )
252         spin_unlock(lock2);
253     spin_unlock_irqrestore(lock1, flags);
254 }
255 
256 int sched_init_vcpu(struct vcpu *v, unsigned int processor)
257 {
258     struct domain *d = v->domain;
259 
260     /*
261      * Initialize processor and affinity settings. The idler, and potentially
262      * domain-0 VCPUs, are pinned onto their respective physical CPUs.
263      */
264     v->processor = processor;
265     if ( is_idle_domain(d) || d->is_pinned )
266         cpumask_copy(v->cpu_hard_affinity, cpumask_of(processor));
267     else
268         cpumask_setall(v->cpu_hard_affinity);
269 
270     cpumask_setall(v->cpu_soft_affinity);
271 
272     /* Initialise the per-vcpu timers. */
273     init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
274                v, v->processor);
275     init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
276                v, v->processor);
277     init_timer(&v->poll_timer, poll_timer_fn,
278                v, v->processor);
279 
280     v->sched_priv = SCHED_OP(dom_scheduler(d), alloc_vdata, v,
281                              d->sched_priv);
282     if ( v->sched_priv == NULL )
283         return 1;
284 
285     /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
286     if ( is_idle_domain(d) )
287     {
288         per_cpu(schedule_data, v->processor).curr = v;
289         v->is_running = 1;
290     }
291     else
292     {
293         SCHED_OP(dom_scheduler(d), insert_vcpu, v);
294     }
295 
296     return 0;
297 }
298 
299 static void sched_move_irqs(struct vcpu *v)
300 {
301     arch_move_irqs(v);
302     evtchn_move_pirqs(v);
303 }
304 
305 int sched_move_domain(struct domain *d, struct cpupool *c)
306 {
307     struct vcpu *v;
308     unsigned int new_p;
309     void **vcpu_priv;
310     void *domdata;
311     void *vcpudata;
312     struct scheduler *old_ops;
313     void *old_domdata;
314 
315     for_each_vcpu ( d, v )
316     {
317         if ( v->affinity_broken )
318             return -EBUSY;
319     }
320 
321     domdata = SCHED_OP(c->sched, alloc_domdata, d);
322     if ( domdata == NULL )
323         return -ENOMEM;
324 
325     vcpu_priv = xzalloc_array(void *, d->max_vcpus);
326     if ( vcpu_priv == NULL )
327     {
328         SCHED_OP(c->sched, free_domdata, domdata);
329         return -ENOMEM;
330     }
331 
332     for_each_vcpu ( d, v )
333     {
334         vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
335         if ( vcpu_priv[v->vcpu_id] == NULL )
336         {
337             for_each_vcpu ( d, v )
338                 xfree(vcpu_priv[v->vcpu_id]);
339             xfree(vcpu_priv);
340             SCHED_OP(c->sched, free_domdata, domdata);
341             return -ENOMEM;
342         }
343     }
344 
345     domain_pause(d);
346 
347     old_ops = dom_scheduler(d);
348     old_domdata = d->sched_priv;
349 
350     for_each_vcpu ( d, v )
351     {
352         SCHED_OP(old_ops, remove_vcpu, v);
353     }
354 
355     d->cpupool = c;
356     d->sched_priv = domdata;
357 
358     new_p = cpumask_first(c->cpu_valid);
359     for_each_vcpu ( d, v )
360     {
361         spinlock_t *lock;
362 
363         vcpudata = v->sched_priv;
364 
365         migrate_timer(&v->periodic_timer, new_p);
366         migrate_timer(&v->singleshot_timer, new_p);
367         migrate_timer(&v->poll_timer, new_p);
368 
369         cpumask_setall(v->cpu_hard_affinity);
370         cpumask_setall(v->cpu_soft_affinity);
371 
372         lock = vcpu_schedule_lock_irq(v);
373         v->processor = new_p;
374         /*
375          * With v->processor modified we must not
376          * - make any further changes assuming we hold the scheduler lock,
377          * - use vcpu_schedule_unlock_irq().
378          */
379         spin_unlock_irq(lock);
380 
381         v->sched_priv = vcpu_priv[v->vcpu_id];
382         if ( !d->is_dying )
383             sched_move_irqs(v);
384 
385         new_p = cpumask_cycle(new_p, c->cpu_valid);
386 
387         SCHED_OP(c->sched, insert_vcpu, v);
388 
389         SCHED_OP(old_ops, free_vdata, vcpudata);
390     }
391 
392     domain_update_node_affinity(d);
393 
394     domain_unpause(d);
395 
396     SCHED_OP(old_ops, free_domdata, old_domdata);
397 
398     xfree(vcpu_priv);
399 
400     return 0;
401 }
402 
403 void sched_destroy_vcpu(struct vcpu *v)
404 {
405     kill_timer(&v->periodic_timer);
406     kill_timer(&v->singleshot_timer);
407     kill_timer(&v->poll_timer);
408     if ( test_and_clear_bool(v->is_urgent) )
409         atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
410     SCHED_OP(vcpu_scheduler(v), remove_vcpu, v);
411     SCHED_OP(vcpu_scheduler(v), free_vdata, v->sched_priv);
412 }
413 
414 int sched_init_domain(struct domain *d, int poolid)
415 {
416     int ret;
417 
418     ASSERT(d->cpupool == NULL);
419 
420     if ( (ret = cpupool_add_domain(d, poolid)) )
421         return ret;
422 
423     SCHED_STAT_CRANK(dom_init);
424     TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
425     return SCHED_OP(dom_scheduler(d), init_domain, d);
426 }
427 
428 void sched_destroy_domain(struct domain *d)
429 {
430     ASSERT(d->cpupool != NULL || is_idle_domain(d));
431 
432     SCHED_STAT_CRANK(dom_destroy);
433     TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
434     SCHED_OP(dom_scheduler(d), destroy_domain, d);
435 
436     cpupool_rm_domain(d);
437 }
438 
439 void vcpu_sleep_nosync(struct vcpu *v)
440 {
441     unsigned long flags;
442     spinlock_t *lock;
443 
444     TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
445 
446     lock = vcpu_schedule_lock_irqsave(v, &flags);
447 
448     if ( likely(!vcpu_runnable(v)) )
449     {
450         if ( v->runstate.state == RUNSTATE_runnable )
451             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
452 
453         SCHED_OP(vcpu_scheduler(v), sleep, v);
454     }
455 
456     vcpu_schedule_unlock_irqrestore(lock, flags, v);
457 }
458 
459 void vcpu_sleep_sync(struct vcpu *v)
460 {
461     vcpu_sleep_nosync(v);
462 
463     while ( !vcpu_runnable(v) && v->is_running )
464         cpu_relax();
465 
466     sync_vcpu_execstate(v);
467 }
468 
469 void vcpu_wake(struct vcpu *v)
470 {
471     unsigned long flags;
472     spinlock_t *lock;
473 
474     TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
475 
476     lock = vcpu_schedule_lock_irqsave(v, &flags);
477 
478     if ( likely(vcpu_runnable(v)) )
479     {
480         if ( v->runstate.state >= RUNSTATE_blocked )
481             vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
482         SCHED_OP(vcpu_scheduler(v), wake, v);
483     }
484     else if ( !(v->pause_flags & VPF_blocked) )
485     {
486         if ( v->runstate.state == RUNSTATE_blocked )
487             vcpu_runstate_change(v, RUNSTATE_offline, NOW());
488     }
489 
490     vcpu_schedule_unlock_irqrestore(lock, flags, v);
491 }
492 
493 void vcpu_unblock(struct vcpu *v)
494 {
495     if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
496         return;
497 
498     /* Polling period ends when a VCPU is unblocked. */
499     if ( unlikely(v->poll_evtchn != 0) )
500     {
501         v->poll_evtchn = 0;
502         /*
503          * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
504          * this VCPU (and it then going back to sleep on poll_mask).
505          * Test-and-clear is idiomatic and ensures clear_bit not reordered.
506          */
507         if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
508             clear_bit(_VPF_blocked, &v->pause_flags);
509     }
510 
511     vcpu_wake(v);
512 }
513 
514 /*
515  * Do the actual movement of a vcpu from old to new CPU. Locks for *both*
516  * CPUs need to have been taken already when calling this!
517  */
518 static void vcpu_move_locked(struct vcpu *v, unsigned int new_cpu)
519 {
520     unsigned int old_cpu = v->processor;
521 
522     /*
523      * Transfer urgency status to new CPU before switching CPUs, as
524      * once the switch occurs, v->is_urgent is no longer protected by
525      * the per-CPU scheduler lock we are holding.
526      */
527     if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
528     {
529         atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
530         atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
531     }
532 
533     /*
534      * Actual CPU switch to new CPU.  This is safe because the lock
535      * pointer can't change while the current lock is held.
536      */
537     if ( vcpu_scheduler(v)->migrate )
538         SCHED_OP(vcpu_scheduler(v), migrate, v, new_cpu);
539     else
540         v->processor = new_cpu;
541 }
542 
543 /*
544  * Move a vcpu from its current processor to a target new processor,
545  * without asking the scheduler to do any placement. This is intended
546  * to be called from special contexts, where things are quiet
547  * enough that no contention is supposed to happen (i.e., during
548  * shutdown or software suspend, like ACPI S3).
549  */
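/* (Used from cpu_disable_scheduler() when the system is suspending.) */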
550 static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu)
551 {
552     unsigned long flags;
553     spinlock_t *lock, *new_lock;
554 
555     ASSERT(system_state == SYS_STATE_suspend);
556     ASSERT(!vcpu_runnable(v) && (atomic_read(&v->pause_count) ||
557                                  atomic_read(&v->domain->pause_count)));
558 
559     lock = per_cpu(schedule_data, v->processor).schedule_lock;
560     new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
561 
562     sched_spin_lock_double(lock, new_lock, &flags);
563     ASSERT(new_cpu != v->processor);
564     vcpu_move_locked(v, new_cpu);
565     sched_spin_unlock_double(lock, new_lock, flags);
566 
567     sched_move_irqs(v);
568 }
569 
570 static void vcpu_migrate(struct vcpu *v)
571 {
572     unsigned long flags;
573     unsigned int old_cpu, new_cpu;
574     spinlock_t *old_lock, *new_lock;
575     bool_t pick_called = 0;
576 
577     old_cpu = new_cpu = v->processor;
578     for ( ; ; )
579     {
580         /*
581          * We need another iteration if the pre-calculated lock addresses
582          * are no longer correct after re-evaluating the old and new cpu
583          * while holding the locks.
584          */
585         old_lock = per_cpu(schedule_data, old_cpu).schedule_lock;
586         new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
587 
588         sched_spin_lock_double(old_lock, new_lock, &flags);
589 
590         old_cpu = v->processor;
591         if ( old_lock == per_cpu(schedule_data, old_cpu).schedule_lock )
592         {
593             /*
594              * If we selected a CPU on the previous iteration, check if it
595              * remains suitable for running this vCPU.
596              */
597             if ( pick_called &&
598                  (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) &&
599                  cpumask_test_cpu(new_cpu, v->cpu_hard_affinity) &&
600                  cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) )
601                 break;
602 
603             /* Select a new CPU. */
604             new_cpu = SCHED_OP(vcpu_scheduler(v), pick_cpu, v);
605             if ( (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) &&
606                  cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) )
607                 break;
608             pick_called = 1;
609         }
610         else
611         {
612             /*
613              * We do not hold the scheduler lock appropriate for this vCPU.
614              * Thus we cannot select a new CPU on this iteration. Try again.
615              */
616             pick_called = 0;
617         }
618 
619         sched_spin_unlock_double(old_lock, new_lock, flags);
620     }
621 
622     /*
623      * NB. Check of v->is_running happens /after/ setting migration flag
624      * because they both happen in (different) spinlock regions, and those
625      * regions are strictly serialised.
626      */
627     if ( v->is_running ||
628          !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
629     {
630         sched_spin_unlock_double(old_lock, new_lock, flags);
631         return;
632     }
633 
634     vcpu_move_locked(v, new_cpu);
635 
636     sched_spin_unlock_double(old_lock, new_lock, flags);
637 
638     if ( old_cpu != new_cpu )
639         sched_move_irqs(v);
640 
641     /* Wake on new CPU. */
642     vcpu_wake(v);
643 }
644 
645 /*
646  * Force a VCPU through a deschedule/reschedule path.
647  * For example, using this when setting the periodic timer period means that
648  * most periodic-timer state need only be touched from within the scheduler
649  * which can thus be done without need for synchronisation.
650  */
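/*
 * (For instance, callers such as the VCPUOP_set_periodic_timer handler are
 * expected to invoke this after updating v->periodic_period.)
 */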
651 void vcpu_force_reschedule(struct vcpu *v)
652 {
653     spinlock_t *lock = vcpu_schedule_lock_irq(v);
654 
655     if ( v->is_running )
656         set_bit(_VPF_migrating, &v->pause_flags);
657     vcpu_schedule_unlock_irq(lock, v);
658 
659     if ( v->pause_flags & VPF_migrating )
660     {
661         vcpu_sleep_nosync(v);
662         vcpu_migrate(v);
663     }
664 }
665 
666 void restore_vcpu_affinity(struct domain *d)
667 {
668     unsigned int cpu = smp_processor_id();
669     struct vcpu *v;
670 
671     ASSERT(system_state == SYS_STATE_resume);
672 
673     for_each_vcpu ( d, v )
674     {
675         spinlock_t *lock;
676 
677         ASSERT(!vcpu_runnable(v));
678 
679         lock = vcpu_schedule_lock_irq(v);
680 
681         if ( v->affinity_broken )
682         {
683             cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
684             v->affinity_broken = 0;
685 
686         }
687 
688         /*
689          * During suspend (in cpu_disable_scheduler()), we moved every vCPU
690          * to the BSP (which, as of now, is pCPU 0), as a temporary measure
691          * to allow the nonboot processors to have their data structures
692          * freed and go to sleep. But nothing guarantees that the BSP is a
693          * valid pCPU for a particular domain.
694          *
695          * Therefore, here, before actually unpausing the domains, we should
696          * set v->processor of each of their vCPUs to something that will
697          * make sense for the scheduler of the cpupool they are in.
698          */
699         cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity,
700                     cpupool_domain_cpumask(v->domain));
701         v->processor = cpumask_any(cpumask_scratch_cpu(cpu));
702 
703         spin_unlock_irq(lock);
704 
705         lock = vcpu_schedule_lock_irq(v);
706         v->processor = SCHED_OP(vcpu_scheduler(v), pick_cpu, v);
707         spin_unlock_irq(lock);
708     }
709 
710     domain_update_node_affinity(d);
711 }
712 
713 /*
714  * This function is used by cpu_hotplug code from stop_machine context
715  * and from cpupools to switch schedulers on a cpu.
716  */
717 int cpu_disable_scheduler(unsigned int cpu)
718 {
719     struct domain *d;
720     struct vcpu *v;
721     struct cpupool *c;
722     cpumask_t online_affinity;
723     unsigned int new_cpu;
724     int ret = 0;
725 
726     c = per_cpu(cpupool, cpu);
727     if ( c == NULL )
728         return ret;
729 
730     /*
731      * We'd need the domain RCU lock, but:
732      *  - when we are called from cpupool code, it's acquired there already;
733      *  - when we are called for CPU teardown, we're in stop-machine context,
734      *    so that's not a problem.
735      */
736     for_each_domain_in_cpupool ( d, c )
737     {
738         for_each_vcpu ( d, v )
739         {
740             unsigned long flags;
741             spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
742 
743             cpumask_and(&online_affinity, v->cpu_hard_affinity, c->cpu_valid);
744             if ( cpumask_empty(&online_affinity) &&
745                  cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
746             {
747                 if ( v->affinity_broken )
748                 {
749                     /* The vcpu is temporarily pinned, can't move it. */
750                     vcpu_schedule_unlock_irqrestore(lock, flags, v);
751                     ret = -EADDRINUSE;
752                     break;
753                 }
754 
755                 if ( system_state == SYS_STATE_suspend )
756                 {
757                     cpumask_copy(v->cpu_hard_affinity_saved,
758                                  v->cpu_hard_affinity);
759                     v->affinity_broken = 1;
760                 }
761                 else
762                     printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
763 
764                 cpumask_setall(v->cpu_hard_affinity);
765             }
766 
767             if ( v->processor != cpu )
768             {
769                 /* The vcpu is not on this cpu, so we can move on. */
770                 vcpu_schedule_unlock_irqrestore(lock, flags, v);
771                 continue;
772             }
773 
774             /* If it is on this cpu, we must send it away. */
775             if ( unlikely(system_state == SYS_STATE_suspend) )
776             {
777                 vcpu_schedule_unlock_irqrestore(lock, flags, v);
778 
779                 /*
780                  * If we are doing a shutdown/suspend, it is not necessary to
781                  * ask the scheduler to chime in. In fact:
782                  *  * there is no reason for it: the end result we are after
783                  *    is just 'all the vcpus on the boot pcpu, and no vcpu
784                  *    anywhere else', so let's just go for it;
785                  *  * it's wrong, for cpupools with only non-boot pcpus, as
786                  *    the scheduler would always fail to send the vcpus away
787                  *    from the last online (non boot) pcpu!
788                  *
789                  * Therefore, in the shutdown/suspend case, we just pick up
790                  * one (still) online pcpu. Note that, at this stage, all
791                  * domains (including dom0) have been paused already, so we
792                  * do not expect any vcpu activity at all.
793                  */
794                 cpumask_andnot(&online_affinity, &cpu_online_map,
795                                cpumask_of(cpu));
796                 BUG_ON(cpumask_empty(&online_affinity));
797                 /*
798                  * As boot cpu is, usually, pcpu #0, using cpumask_first()
799                  * will make us converge quicker.
800                  */
801                 new_cpu = cpumask_first(&online_affinity);
802                 vcpu_move_nosched(v, new_cpu);
803             }
804             else
805             {
806                 /*
807                  * OTOH, if the system is still live, and we are here because
808                  * we are doing some cpupool manipulations:
809                  *  * we want to call the scheduler, and let it re-evaluate
810                  *    the placement of the vcpu, taking into account the new
811                  *    cpupool configuration;
812                  *  * the scheduler will always find a suitable solution, or
813                  *    things would have failed before getting in here.
814                  */
815                 set_bit(_VPF_migrating, &v->pause_flags);
816                 vcpu_schedule_unlock_irqrestore(lock, flags, v);
817                 vcpu_sleep_nosync(v);
818                 vcpu_migrate(v);
819 
820                 /*
821                  * The only caveat, in this case, is a vcpu that is active in
822                  * the hypervisor and isn't migratable. In that case, the caller
823                  * should try again after releasing and reacquiring all locks.
824                  */
825                 if ( v->processor == cpu )
826                     ret = -EAGAIN;
827             }
828         }
829     }
830 
831     return ret;
832 }
833 
834 static int vcpu_set_affinity(
835     struct vcpu *v, const cpumask_t *affinity, cpumask_t *which)
836 {
837     spinlock_t *lock;
838     int ret = 0;
839 
840     lock = vcpu_schedule_lock_irq(v);
841 
842     if ( v->affinity_broken )
843         ret = -EBUSY;
844     else
845     {
846         cpumask_copy(which, affinity);
847 
848         /*
849          * Always ask the scheduler to re-evaluate placement
850          * when changing the affinity.
851          */
852         set_bit(_VPF_migrating, &v->pause_flags);
853     }
854 
855     vcpu_schedule_unlock_irq(lock, v);
856 
857     domain_update_node_affinity(v->domain);
858 
859     if ( v->pause_flags & VPF_migrating )
860     {
861         vcpu_sleep_nosync(v);
862         vcpu_migrate(v);
863     }
864 
865     return ret;
866 }
867 
868 int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
869 {
870     cpumask_t online_affinity;
871     cpumask_t *online;
872 
873     if ( v->domain->is_pinned )
874         return -EINVAL;
875 
876     online = VCPU2ONLINE(v);
877     cpumask_and(&online_affinity, affinity, online);
878     if ( cpumask_empty(&online_affinity) )
879         return -EINVAL;
880 
881     return vcpu_set_affinity(v, affinity, v->cpu_hard_affinity);
882 }
883 
884 int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
885 {
886     return vcpu_set_affinity(v, affinity, v->cpu_soft_affinity);
887 }
888 
889 /* Block the currently-executing domain until a pertinent event occurs. */
890 void vcpu_block(void)
891 {
892     struct vcpu *v = current;
893 
894     set_bit(_VPF_blocked, &v->pause_flags);
895 
896     arch_vcpu_block(v);
897 
898     /* Check for events /after/ blocking: avoids wakeup waiting race. */
899     if ( local_events_need_delivery() )
900     {
901         clear_bit(_VPF_blocked, &v->pause_flags);
902     }
903     else
904     {
905         TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
906         raise_softirq(SCHEDULE_SOFTIRQ);
907     }
908 }
909 
910 static void vcpu_block_enable_events(void)
911 {
912     local_event_delivery_enable();
913     vcpu_block();
914 }
915 
916 static long do_poll(struct sched_poll *sched_poll)
917 {
918     struct vcpu   *v = current;
919     struct domain *d = v->domain;
920     evtchn_port_t  port;
921     long           rc;
922     unsigned int   i;
923 
924     /* Fairly arbitrary limit. */
925     if ( sched_poll->nr_ports > 128 )
926         return -EINVAL;
927 
928     if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
929         return -EFAULT;
930 
931     set_bit(_VPF_blocked, &v->pause_flags);
932     v->poll_evtchn = -1;
933     set_bit(v->vcpu_id, d->poll_mask);
934 
935     arch_vcpu_block(v);
936 
937 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
938     /* Check for events /after/ setting flags: avoids wakeup waiting race. */
939     smp_mb();
940 
941     /*
942      * Someone may have seen we are blocked but not that we are polling, or
943      * vice versa. We are certainly being woken, so clean up and bail. Beyond
944      * this point others can be guaranteed to clean up for us if they wake us.
945      */
946     rc = 0;
947     if ( (v->poll_evtchn == 0) ||
948          !test_bit(_VPF_blocked, &v->pause_flags) ||
949          !test_bit(v->vcpu_id, d->poll_mask) )
950         goto out;
951 #endif
952 
953     rc = 0;
954     if ( local_events_need_delivery() )
955         goto out;
956 
957     for ( i = 0; i < sched_poll->nr_ports; i++ )
958     {
959         rc = -EFAULT;
960         if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
961             goto out;
962 
963         rc = -EINVAL;
964         if ( port >= d->max_evtchns )
965             goto out;
966 
967         rc = 0;
968         if ( evtchn_port_is_pending(d, port) )
969             goto out;
970     }
971 
972     if ( sched_poll->nr_ports == 1 )
973         v->poll_evtchn = port;
974 
975     if ( sched_poll->timeout != 0 )
976         set_timer(&v->poll_timer, sched_poll->timeout);
977 
978     TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
979     raise_softirq(SCHEDULE_SOFTIRQ);
980 
981     return 0;
982 
983  out:
984     v->poll_evtchn = 0;
985     clear_bit(v->vcpu_id, d->poll_mask);
986     clear_bit(_VPF_blocked, &v->pause_flags);
987     return rc;
988 }
989 
990 /* Voluntarily yield the processor for this allocation. */
991 long vcpu_yield(void)
992 {
993     struct vcpu *v = current;
994     spinlock_t *lock = vcpu_schedule_lock_irq(v);
995 
996     SCHED_OP(vcpu_scheduler(v), yield, v);
997     vcpu_schedule_unlock_irq(lock, v);
998 
999     SCHED_STAT_CRANK(vcpu_yield);
1000 
1001     TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
1002     raise_softirq(SCHEDULE_SOFTIRQ);
1003     return 0;
1004 }
1005 
1006 static void domain_watchdog_timeout(void *data)
1007 {
1008     struct domain *d = data;
1009 
1010     if ( d->is_shutting_down || d->is_dying )
1011         return;
1012 
1013     printk("Watchdog timer fired for domain %u\n", d->domain_id);
1014     domain_shutdown(d, SHUTDOWN_watchdog);
1015 }
1016 
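/*
 * SCHEDOP_watchdog helper: id == 0 allocates a fresh watchdog slot and
 * returns its 1-based handle; a non-zero id re-arms that slot with the new
 * timeout, or disarms and frees it if the timeout is 0.
 */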
1017 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
1018 {
1019     if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
1020         return -EINVAL;
1021 
1022     spin_lock(&d->watchdog_lock);
1023 
1024     if ( id == 0 )
1025     {
1026         for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
1027         {
1028             if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
1029                 continue;
1030             set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1031             break;
1032         }
1033         spin_unlock(&d->watchdog_lock);
1034         return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1;
1035     }
1036 
1037     id -= 1;
1038     if ( !test_bit(id, &d->watchdog_inuse_map) )
1039     {
1040         spin_unlock(&d->watchdog_lock);
1041         return -EINVAL;
1042     }
1043 
1044     if ( timeout == 0 )
1045     {
1046         stop_timer(&d->watchdog_timer[id]);
1047         clear_bit(id, &d->watchdog_inuse_map);
1048     }
1049     else
1050     {
1051         set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1052     }
1053 
1054     spin_unlock(&d->watchdog_lock);
1055     return 0;
1056 }
1057 
1058 void watchdog_domain_init(struct domain *d)
1059 {
1060     unsigned int i;
1061 
1062     spin_lock_init(&d->watchdog_lock);
1063 
1064     d->watchdog_inuse_map = 0;
1065 
1066     for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1067         init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
1068 }
1069 
1070 void watchdog_domain_destroy(struct domain *d)
1071 {
1072     unsigned int i;
1073 
1074     for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1075         kill_timer(&d->watchdog_timer[i]);
1076 }
1077 
1078 int vcpu_pin_override(struct vcpu *v, int cpu)
1079 {
1080     spinlock_t *lock;
1081     int ret = -EINVAL;
1082 
1083     lock = vcpu_schedule_lock_irq(v);
1084 
1085     if ( cpu < 0 )
1086     {
1087         if ( v->affinity_broken )
1088         {
1089             cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
1090             v->affinity_broken = 0;
1091             set_bit(_VPF_migrating, &v->pause_flags);
1092             ret = 0;
1093         }
1094     }
1095     else if ( cpu < nr_cpu_ids )
1096     {
1097         if ( v->affinity_broken )
1098             ret = -EBUSY;
1099         else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
1100         {
1101             cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
1102             v->affinity_broken = 1;
1103             cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu));
1104             set_bit(_VPF_migrating, &v->pause_flags);
1105             ret = 0;
1106         }
1107     }
1108 
1109     vcpu_schedule_unlock_irq(lock, v);
1110 
1111     domain_update_node_affinity(v->domain);
1112 
1113     if ( v->pause_flags & VPF_migrating )
1114     {
1115         vcpu_sleep_nosync(v);
1116         vcpu_migrate(v);
1117     }
1118 
1119     return ret;
1120 }
1121 
1122 typedef long ret_t;
1123 
1124 #endif /* !COMPAT */
1125 
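/*
 * Hypercall entry point for the SCHEDOP_* operations. A guest reaches this
 * via its sched_op hypercall, e.g. (illustrative guest-side code):
 *
 *     struct sched_shutdown s = { .reason = SHUTDOWN_poweroff };
 *     HYPERVISOR_sched_op(SCHEDOP_shutdown, &s);
 */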
1126 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
1127 {
1128     ret_t ret = 0;
1129 
1130     switch ( cmd )
1131     {
1132     case SCHEDOP_yield:
1133     {
1134         ret = vcpu_yield();
1135         break;
1136     }
1137 
1138     case SCHEDOP_block:
1139     {
1140         vcpu_block_enable_events();
1141         break;
1142     }
1143 
1144     case SCHEDOP_shutdown:
1145     {
1146         struct sched_shutdown sched_shutdown;
1147 
1148         ret = -EFAULT;
1149         if ( copy_from_guest(&sched_shutdown, arg, 1) )
1150             break;
1151 
1152         TRACE_3D(TRC_SCHED_SHUTDOWN,
1153                  current->domain->domain_id, current->vcpu_id,
1154                  sched_shutdown.reason);
1155         ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason);
1156 
1157         break;
1158     }
1159 
1160     case SCHEDOP_shutdown_code:
1161     {
1162         struct sched_shutdown sched_shutdown;
1163         struct domain *d = current->domain;
1164 
1165         ret = -EFAULT;
1166         if ( copy_from_guest(&sched_shutdown, arg, 1) )
1167             break;
1168 
1169         TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
1170                  d->domain_id, current->vcpu_id, sched_shutdown.reason);
1171 
1172         spin_lock(&d->shutdown_lock);
1173         if ( d->shutdown_code == SHUTDOWN_CODE_INVALID )
1174             d->shutdown_code = (u8)sched_shutdown.reason;
1175         spin_unlock(&d->shutdown_lock);
1176 
1177         ret = 0;
1178         break;
1179     }
1180 
1181     case SCHEDOP_poll:
1182     {
1183         struct sched_poll sched_poll;
1184 
1185         ret = -EFAULT;
1186         if ( copy_from_guest(&sched_poll, arg, 1) )
1187             break;
1188 
1189         ret = do_poll(&sched_poll);
1190 
1191         break;
1192     }
1193 
1194     case SCHEDOP_remote_shutdown:
1195     {
1196         struct domain *d;
1197         struct sched_remote_shutdown sched_remote_shutdown;
1198 
1199         ret = -EFAULT;
1200         if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
1201             break;
1202 
1203         ret = -ESRCH;
1204         d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
1205         if ( d == NULL )
1206             break;
1207 
1208         ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
1209         if ( likely(!ret) )
1210             domain_shutdown(d, sched_remote_shutdown.reason);
1211 
1212         rcu_unlock_domain(d);
1213 
1214         break;
1215     }
1216 
1217     case SCHEDOP_watchdog:
1218     {
1219         struct sched_watchdog sched_watchdog;
1220 
1221         ret = -EFAULT;
1222         if ( copy_from_guest(&sched_watchdog, arg, 1) )
1223             break;
1224 
1225         ret = domain_watchdog(
1226             current->domain, sched_watchdog.id, sched_watchdog.timeout);
1227         break;
1228     }
1229 
1230     case SCHEDOP_pin_override:
1231     {
1232         struct sched_pin_override sched_pin_override;
1233 
1234         ret = -EPERM;
1235         if ( !is_hardware_domain(current->domain) )
1236             break;
1237 
1238         ret = -EFAULT;
1239         if ( copy_from_guest(&sched_pin_override, arg, 1) )
1240             break;
1241 
1242         ret = vcpu_pin_override(current, sched_pin_override.pcpu);
1243 
1244         break;
1245     }
1246 
1247     default:
1248         ret = -ENOSYS;
1249     }
1250 
1251     return ret;
1252 }
1253 
1254 #ifndef COMPAT
1255 
1256 /* Per-vcpu oneshot-timer hypercall. */
1257 long do_set_timer_op(s_time_t timeout)
1258 {
1259     struct vcpu *v = current;
1260     s_time_t offset = timeout - NOW();
1261 
1262     if ( timeout == 0 )
1263     {
1264         stop_timer(&v->singleshot_timer);
1265     }
1266     else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
1267               unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
1268     {
1269         /*
1270          * Linux workaround: occasionally we will see timeouts a long way in
1271          * the future due to wrapping in Linux's jiffy time handling. We check
1272          * for timeouts wrapped negative, and for positive timeouts more than
1273          * about 13 days in the future (2^50ns). The correct fix is to trigger
1274          * an interrupt immediately (since Linux in fact has pending work to
1275          * do in this situation). However, older guests also set a long timeout
1276          * when they have *no* pending timers at all: setting an immediate
1277          * timeout in this case can burn a lot of CPU. We therefore go for a
1278          * reasonable middleground of triggering a timer event in 100ms.
1279          */
1280         gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
1281                  timeout);
1282         set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
1283     }
1284     else
1285     {
1286         migrate_timer(&v->singleshot_timer, smp_processor_id());
1287         set_timer(&v->singleshot_timer, timeout);
1288     }
1289 
1290     return 0;
1291 }
1292 
1293 /* sched_id - fetch ID of current scheduler */
1294 int sched_id(void)
1295 {
1296     return ops.sched_id;
1297 }
1298 
1299 /* Adjust scheduling parameter for a given domain. */
1300 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
1301 {
1302     long ret;
1303 
1304     ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd);
1305     if ( ret )
1306         return ret;
1307 
1308     if ( op->sched_id != dom_scheduler(d)->sched_id )
1309         return -EINVAL;
1310 
1311     switch ( op->cmd )
1312     {
1313     case XEN_DOMCTL_SCHEDOP_putinfo:
1314     case XEN_DOMCTL_SCHEDOP_getinfo:
1315     case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1316     case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1317         break;
1318     default:
1319         return -EINVAL;
1320     }
1321 
1322     /* NB: the pluggable scheduler code needs to take care
1323      * of locking by itself. */
1324     if ( (ret = SCHED_OP(dom_scheduler(d), adjust, d, op)) == 0 )
1325         TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
1326 
1327     return ret;
1328 }
1329 
1330 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
1331 {
1332     struct cpupool *pool;
1333     int rc;
1334 
1335     rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd);
1336     if ( rc )
1337         return rc;
1338 
1339     if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) &&
1340          (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) )
1341         return -EINVAL;
1342 
1343     pool = cpupool_get_by_id(op->cpupool_id);
1344     if ( pool == NULL )
1345         return -ESRCH;
1346 
1347     rc = ((op->sched_id == pool->sched->sched_id)
1348           ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL);
1349 
1350     cpupool_put(pool);
1351 
1352     return rc;
1353 }
1354 
1355 static void vcpu_periodic_timer_work(struct vcpu *v)
1356 {
1357     s_time_t now = NOW();
1358     s_time_t periodic_next_event;
1359 
1360     if ( v->periodic_period == 0 )
1361         return;
1362 
1363     periodic_next_event = v->periodic_last_event + v->periodic_period;
1364 
1365     if ( now >= periodic_next_event )
1366     {
1367         send_timer_event(v);
1368         v->periodic_last_event = now;
1369         periodic_next_event = now + v->periodic_period;
1370     }
1371 
1372     migrate_timer(&v->periodic_timer, smp_processor_id());
1373     set_timer(&v->periodic_timer, periodic_next_event);
1374 }
1375 
1376 /*
1377  * The main function
1378  * - deschedule the current domain (scheduler independent).
1379  * - pick a new domain (scheduler dependent).
1380  */
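/*
 * Runs in softirq context, on SCHEDULE_SOFTIRQ, which is raised e.g. by
 * s_timer_fn(), vcpu_block(), do_poll() and vcpu_yield() in this file.
 */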
1381 static void schedule(void)
1382 {
1383     struct vcpu          *prev = current, *next = NULL;
1384     s_time_t              now;
1385     struct scheduler     *sched;
1386     unsigned long        *tasklet_work = &this_cpu(tasklet_work_to_do);
1387     bool_t                tasklet_work_scheduled = 0;
1388     struct schedule_data *sd;
1389     spinlock_t           *lock;
1390     struct task_slice     next_slice;
1391     int cpu = smp_processor_id();
1392 
1393     ASSERT_NOT_IN_ATOMIC();
1394 
1395     SCHED_STAT_CRANK(sched_run);
1396 
1397     sd = &this_cpu(schedule_data);
1398 
1399     /* Update tasklet scheduling status. */
1400     switch ( *tasklet_work )
1401     {
1402     case TASKLET_enqueued:
1403         set_bit(_TASKLET_scheduled, tasklet_work);
1404         /* fallthrough */
1405     case TASKLET_enqueued|TASKLET_scheduled:
1406         tasklet_work_scheduled = 1;
1407         break;
1408     case TASKLET_scheduled:
1409         clear_bit(_TASKLET_scheduled, tasklet_work);
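        /* Fall through: the previously scheduled tasklet work has completed. */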
1410     case 0:
1411         /*tasklet_work_scheduled = 0;*/
1412         break;
1413     default:
1414         BUG();
1415     }
1416 
1417     lock = pcpu_schedule_lock_irq(cpu);
1418 
1419     now = NOW();
1420 
1421     stop_timer(&sd->s_timer);
1422 
1423     /* get policy-specific decision on scheduling... */
1424     sched = this_cpu(scheduler);
1425     next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
1426 
1427     next = next_slice.task;
1428 
1429     sd->curr = next;
1430 
1431     if ( next_slice.time >= 0 ) /* -ve means no limit */
1432         set_timer(&sd->s_timer, now + next_slice.time);
1433 
1434     if ( unlikely(prev == next) )
1435     {
1436         pcpu_schedule_unlock_irq(lock, cpu);
1437         TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
1438                  next->domain->domain_id, next->vcpu_id,
1439                  now - prev->runstate.state_entry_time,
1440                  next_slice.time);
1441         trace_continue_running(next);
1442         return continue_running(prev);
1443     }
1444 
1445     TRACE_3D(TRC_SCHED_SWITCH_INFPREV,
1446              prev->domain->domain_id, prev->vcpu_id,
1447              now - prev->runstate.state_entry_time);
1448     TRACE_4D(TRC_SCHED_SWITCH_INFNEXT,
1449              next->domain->domain_id, next->vcpu_id,
1450              (next->runstate.state == RUNSTATE_runnable) ?
1451              (now - next->runstate.state_entry_time) : 0,
1452              next_slice.time);
1453 
1454     ASSERT(prev->runstate.state == RUNSTATE_running);
1455 
1456     TRACE_4D(TRC_SCHED_SWITCH,
1457              prev->domain->domain_id, prev->vcpu_id,
1458              next->domain->domain_id, next->vcpu_id);
1459 
1460     vcpu_runstate_change(
1461         prev,
1462         ((prev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
1463          (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
1464         now);
1465     prev->last_run_time = now;
1466 
1467     ASSERT(next->runstate.state != RUNSTATE_running);
1468     vcpu_runstate_change(next, RUNSTATE_running, now);
1469 
1470     /*
1471      * NB. Don't add any trace records from here until the actual context
1472      * switch, else lost_records resume will not work properly.
1473      */
1474 
1475     ASSERT(!next->is_running);
1476     next->is_running = 1;
1477 
1478     pcpu_schedule_unlock_irq(lock, cpu);
1479 
1480     SCHED_STAT_CRANK(sched_ctx);
1481 
1482     stop_timer(&prev->periodic_timer);
1483 
1484     if ( next_slice.migrated )
1485         sched_move_irqs(next);
1486 
1487     vcpu_periodic_timer_work(next);
1488 
1489     context_switch(prev, next);
1490 }
1491 
1492 void context_saved(struct vcpu *prev)
1493 {
1494     /* Clear running flag /after/ writing context to memory. */
1495     smp_wmb();
1496 
1497     prev->is_running = 0;
1498 
1499     /* Check for migration request /after/ clearing running flag. */
1500     smp_mb();
1501 
1502     SCHED_OP(vcpu_scheduler(prev), context_saved, prev);
1503 
1504     if ( unlikely(prev->pause_flags & VPF_migrating) )
1505         vcpu_migrate(prev);
1506 }
1507 
1508 /* The scheduler timer: force a run through the scheduler */
1509 static void s_timer_fn(void *unused)
1510 {
1511     raise_softirq(SCHEDULE_SOFTIRQ);
1512     SCHED_STAT_CRANK(sched_irq);
1513 }
1514 
1515 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
1516 static void vcpu_periodic_timer_fn(void *data)
1517 {
1518     struct vcpu *v = data;
1519     vcpu_periodic_timer_work(v);
1520 }
1521 
1522 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
1523 static void vcpu_singleshot_timer_fn(void *data)
1524 {
1525     struct vcpu *v = data;
1526     send_timer_event(v);
1527 }
1528 
1529 /* SCHEDOP_poll timeout callback. */
1530 static void poll_timer_fn(void *data)
1531 {
1532     struct vcpu *v = data;
1533 
1534     if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
1535         vcpu_unblock(v);
1536 }
1537 
1538 static int cpu_schedule_up(unsigned int cpu)
1539 {
1540     struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1541     void *sched_priv;
1542 
1543     per_cpu(scheduler, cpu) = &ops;
1544     spin_lock_init(&sd->_lock);
1545     sd->schedule_lock = &sd->_lock;
1546     sd->curr = idle_vcpu[cpu];
1547     init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
1548     atomic_set(&sd->urgent_count, 0);
1549 
1550     /* Boot CPU is dealt with later in scheduler_init(). */
1551     if ( cpu == 0 )
1552         return 0;
1553 
1554     if ( idle_vcpu[cpu] == NULL )
1555         alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
1556     else
1557     {
1558         struct vcpu *idle = idle_vcpu[cpu];
1559 
1560         /*
1561          * During (ACPI?) suspend the idle vCPU for this pCPU is not freed,
1562          * while its scheduler specific data (what is pointed by sched_priv)
1563          * is. Also, at this stage of the resume path, we attach the pCPU
1564          * to the default scheduler, no matter in what cpupool it was before
1565          * suspend. To avoid inconsistency, let's allocate default scheduler
1566          * data for the idle vCPU here. If the pCPU was in a different pool
1567          * with a different scheduler, it is schedule_cpu_switch(), invoked
1568          * later, that will set things up as appropriate.
1569          */
1570         ASSERT(idle->sched_priv == NULL);
1571 
1572         idle->sched_priv = SCHED_OP(&ops, alloc_vdata, idle,
1573                                     idle->domain->sched_priv);
1574         if ( idle->sched_priv == NULL )
1575             return -ENOMEM;
1576     }
1577     if ( idle_vcpu[cpu] == NULL )
1578         return -ENOMEM;
1579 
1580     /*
1581      * We don't want to risk calling xfree() on an sd->sched_priv
1582      * (e.g., inside free_pdata, from cpu_schedule_down() called
1583      * during CPU_UP_CANCELLED) that contains an IS_ERR value.
1584      */
1585     sched_priv = SCHED_OP(&ops, alloc_pdata, cpu);
1586     if ( IS_ERR(sched_priv) )
1587         return PTR_ERR(sched_priv);
1588 
1589     sd->sched_priv = sched_priv;
1590 
1591     return 0;
1592 }
1593 
1594 static void cpu_schedule_down(unsigned int cpu)
1595 {
1596     struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1597     struct scheduler *sched = per_cpu(scheduler, cpu);
1598 
1599     SCHED_OP(sched, free_pdata, sd->sched_priv, cpu);
1600     SCHED_OP(sched, free_vdata, idle_vcpu[cpu]->sched_priv);
1601 
1602     idle_vcpu[cpu]->sched_priv = NULL;
1603     sd->sched_priv = NULL;
1604 
1605     kill_timer(&sd->s_timer);
1606 }
1607 
1608 static int cpu_schedule_callback(
1609     struct notifier_block *nfb, unsigned long action, void *hcpu)
1610 {
1611     unsigned int cpu = (unsigned long)hcpu;
1612     struct scheduler *sched = per_cpu(scheduler, cpu);
1613     struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1614     int rc = 0;
1615 
1616     /*
1617      * From the scheduler perspective, bringing up a pCPU requires
1618      * allocating and initializing the per-pCPU scheduler specific data,
1619      * as well as "registering" this pCPU to the scheduler (which may
1620      * involve modifying some scheduler wide data structures).
1621      * This happens by calling the alloc_pdata and init_pdata hooks, in
1622      * this order. A scheduler that does not need to allocate any per-pCPU
1623      * data can avoid implementing alloc_pdata. init_pdata may, however, be
1624      * necessary/useful in this case too (e.g., it can contain the "register
1625      * the pCPU to the scheduler" part). alloc_pdata (if present) is called
1626      * during CPU_UP_PREPARE. init_pdata (if present) is called during
1627      * CPU_STARTING.
1628      *
1629      * On the other hand, at teardown, we need to reverse what has been done
1630      * during initialization, and then free the per-pCPU specific data. This
1631      * happens by calling the deinit_pdata and free_pdata hooks, in this
1632      * order. If no per-pCPU memory was allocated, there is no need to
1633      * provide an implementation of free_pdata. deinit_pdata may, however,
1634      * be necessary/useful in this case too (e.g., it can undo something done
1635      * on scheduler wide data structure during init_pdata). Both deinit_pdata
1636      * and free_pdata are called during CPU_DEAD.
1637      *
1638      * If something goes wrong during bringup, we go to CPU_UP_CANCELLED
1639      * *before* having called init_pdata. In this case, as there is no
1640      * initialization needing undoing, only free_pdata should be called.
1641      * This means it is possible to call free_pdata just after alloc_pdata,
1642      * without a init_pdata/deinit_pdata "cycle" in between the two.
1643      *
1644      * So, in summary, the usage pattern should look like either
1645      *  - alloc_pdata-->init_pdata-->deinit_pdata-->free_pdata, or
1646      *  - alloc_pdata-->free_pdata.
1647      */
1648     switch ( action )
1649     {
1650     case CPU_STARTING:
1651         SCHED_OP(sched, init_pdata, sd->sched_priv, cpu);
1652         break;
1653     case CPU_UP_PREPARE:
1654         rc = cpu_schedule_up(cpu);
1655         break;
1656     case CPU_DEAD:
1657         SCHED_OP(sched, deinit_pdata, sd->sched_priv, cpu);
1658         /* Fallthrough */
1659     case CPU_UP_CANCELED:
1660         cpu_schedule_down(cpu);
1661         break;
1662     default:
1663         break;
1664     }
1665 
1666     return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1667 }
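/*
 * Illustrative sketch (not compiled): a minimal set of per-pCPU hooks for a
 * hypothetical scheduler, following the lifecycle documented in the comment
 * above. The "example_*" names and the private structures are made up, the
 * prototypes are inferred from the SCHED_OP() call sites in this file, and
 * locking is omitted for brevity. Call ordering:
 *   CPU_UP_PREPARE  -> example_alloc_pdata()
 *   CPU_STARTING    -> example_init_pdata()
 *   CPU_DEAD        -> example_deinit_pdata(), then example_free_pdata()
 *   CPU_UP_CANCELED -> example_free_pdata() only (init_pdata never ran)
 */
#if 0
struct example_pcpu    { struct list_head elem; unsigned int cpu; };
struct example_private { struct list_head pcpus; };

static void *example_alloc_pdata(const struct scheduler *ops, int cpu)
{
    /* Pure allocation: no scheduler-wide state is touched here. */
    struct example_pcpu *epc = xzalloc(struct example_pcpu);

    if ( epc == NULL )
        return ERR_PTR(-ENOMEM);

    epc->cpu = cpu;
    return epc;
}

static void example_init_pdata(const struct scheduler *ops, void *pdata, int cpu)
{
    /* "Register" the pCPU: hook pdata into scheduler-wide structures. */
    struct example_private *prv = ops->sched_data;
    struct example_pcpu *epc = pdata;

    list_add(&epc->elem, &prv->pcpus);
}

static void example_deinit_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    /* Undo init_pdata: unhook from scheduler-wide structures. */
    struct example_pcpu *epc = pcpu;

    list_del(&epc->elem);
}

static void example_free_pdata(const struct scheduler *ops, void *pcpu, int cpu)
{
    /* Pure deallocation; safe right after alloc_pdata on CPU_UP_CANCELED. */
    xfree(pcpu);
}
#endif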
1668 
1669 static struct notifier_block cpu_schedule_nfb = {
1670     .notifier_call = cpu_schedule_callback
1671 };
1672 
1673 /* Initialise the data structures. */
1674 void __init scheduler_init(void)
1675 {
1676     struct domain *idle_domain;
1677     int i;
1678 
1679     open_softirq(SCHEDULE_SOFTIRQ, schedule);
1680 
1681     for ( i = 0; i < NUM_SCHEDULERS; i++)
1682     {
1683         if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 )
1684             schedulers[i] = NULL;
1685         else if ( !ops.name && !strcmp(schedulers[i]->opt_name, opt_sched) )
1686             ops = *schedulers[i];
1687     }
1688 
1689     if ( !ops.name )
1690     {
1691         printk("Could not find scheduler: %s\n", opt_sched);
1692         for ( i = 0; i < NUM_SCHEDULERS; i++ )
1693             if ( schedulers[i] &&
1694                  !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) )
1695             {
1696                 ops = *schedulers[i];
1697                 break;
1698             }
1699         BUG_ON(!ops.name);
1700         printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
1701     }
1702 
1703     if ( cpu_schedule_up(0) )
1704         BUG();
1705     register_cpu_notifier(&cpu_schedule_nfb);
1706 
1707     printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
1708     if ( SCHED_OP(&ops, init) )
1709         panic("scheduler returned error on init");
1710 
1711     if ( sched_ratelimit_us &&
1712          (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
1713           || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
1714     {
1715         printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
1716                " Resetting to default %u\n",
1717                XEN_SYSCTL_SCHED_RATELIMIT_MIN,
1718                XEN_SYSCTL_SCHED_RATELIMIT_MAX,
1719                SCHED_DEFAULT_RATELIMIT_US);
1720         sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
1721     }
1722 
1723     idle_domain = domain_create(DOMID_IDLE, 0, 0, NULL);
1724     BUG_ON(IS_ERR(idle_domain));
1725     idle_domain->vcpu = idle_vcpu;
1726     idle_domain->max_vcpus = nr_cpu_ids;
1727     if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
1728         BUG();
1729     this_cpu(schedule_data).sched_priv = SCHED_OP(&ops, alloc_pdata, 0);
1730     BUG_ON(IS_ERR(this_cpu(schedule_data).sched_priv));
1731     SCHED_OP(&ops, init_pdata, this_cpu(schedule_data).sched_priv, 0);
1732 }
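/*
 * Illustrative sketch (not compiled): the shape of a struct scheduler entry
 * that scheduler_init() scans for in the schedulers[] array. Only fields
 * actually consumed in this file are shown; the "example" names, and the
 * wiring to the hooks sketched earlier, are hypothetical. With such an entry
 * present, booting Xen with "sched=example" would select it via the opt_name
 * comparison above, falling back to CONFIG_SCHED_DEFAULT otherwise.
 */
#if 0
static const struct scheduler sched_example_def = {
    .name         = "Example Scheduler",
    .opt_name     = "example",              /* matched against opt_sched */
    /* .sched_id must be one of the XEN_SCHEDULER_* constants from the
     * public headers, so that scheduler_alloc() can find this entry. */
    .alloc_pdata  = example_alloc_pdata,    /* per-pCPU hooks sketched above */
    .init_pdata   = example_init_pdata,
    .deinit_pdata = example_deinit_pdata,
    .free_pdata   = example_free_pdata,
    /* ... plus init/deinit, the per-vCPU hooks, do_schedule, etc. ... */
};
#endif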
1733 
1734 /*
1735  * Move a pCPU outside of the influence of the scheduler of its current
1736  * cpupool, or subject it to the scheduler of a new cpupool.
1737  *
1738  * For the pCPUs that are removed from their cpupool, their scheduler becomes
1739  * &ops (the default scheduler, selected at boot, which also services the
1740  * default cpupool). However, as these pCPUs are not really part of any pool,
1741  * there won't be any scheduling event on them, not even from the default
1742  * scheduler. Basically, they will just sit idle until they are explicitly
1743  * added back to a cpupool.
1744  */
1745 int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
1746 {
1747     struct vcpu *idle;
1748     void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
1749     struct scheduler *old_ops = per_cpu(scheduler, cpu);
1750     struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
1751     struct cpupool *old_pool = per_cpu(cpupool, cpu);
1752     spinlock_t *old_lock;
1753 
1754     /*
1755      * pCPUs only move from a valid cpupool to free (i.e., out of any pool),
1756      * or from free to a valid cpupool. In the former case (which happens when
1757      * c is NULL), we want the CPU to have been marked as free already, as
1758      * well as to not be valid for the source pool any longer, when we get to
1759      * here. In the latter case (which happens when c is a valid cpupool), we
1760      * want the CPU to still be marked as free, as well as to not yet be valid
1761      * for the destination pool.
1762      */
1763     ASSERT(c != old_pool && (c != NULL || old_pool != NULL));
1764     ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
1765     ASSERT((c == NULL && !cpumask_test_cpu(cpu, old_pool->cpu_valid)) ||
1766            (c != NULL && !cpumask_test_cpu(cpu, c->cpu_valid)));
1767 
1768     if ( old_ops == new_ops )
1769         goto out;
1770 
1771     /*
1772      * To set up the cpu for the new scheduler we need:
1773      *  - a valid instance of per-CPU scheduler specific data, as it is
1774      *    allocated by SCHED_OP(alloc_pdata). Note that we do not want to
1775      *    initialize it yet (i.e., we are not calling SCHED_OP(init_pdata)).
1776      *    That will be done by the target scheduler, in SCHED_OP(switch_sched),
1777      *    in proper ordering and with locking.
1778      *  - a valid instance of per-vCPU scheduler specific data, for the idle
1779      *    vCPU of cpu. That is what the target scheduler will use for the
1780      *    sched_priv field of the per-vCPU info of the idle domain.
1781      */
1782     idle = idle_vcpu[cpu];
1783     ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
1784     if ( IS_ERR(ppriv) )
1785         return PTR_ERR(ppriv);
1786     vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
1787     if ( vpriv == NULL )
1788     {
1789         SCHED_OP(new_ops, free_pdata, ppriv, cpu);
1790         return -ENOMEM;
1791     }
1792 
1793     SCHED_OP(old_ops, tick_suspend, cpu);
1794 
1795     /*
1796      * The actual switch, including (if necessary) the rerouting of the
1797      * scheduler lock to whatever new_ops prefers, needs to happen in one
1798      * critical section, protected by old_ops' lock, or races are possible.
1799      * It is, in fact, the lock of another scheduler that we are taking (the
1800      * scheduler of the cpupool that cpu still belongs to). But that is ok:
1801      * anyone trying to schedule on this cpu will spin until we release that
1802      * lock (at the bottom of this function). Once it acquires the lock
1803      * (thanks to the retry loop inside the *_schedule_lock() helpers,
1804      * sketched after this function), it will notice that the lock itself
1805      * has changed, and will retry with the new, correctly remapped one.
1806      */
1807     old_lock = pcpu_schedule_lock_irq(cpu);
1808 
1809     vpriv_old = idle->sched_priv;
1810     ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
1811     SCHED_OP(new_ops, switch_sched, cpu, ppriv, vpriv);
1812 
1813     /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
1814     spin_unlock_irq(old_lock);
1815 
1816     SCHED_OP(new_ops, tick_resume, cpu);
1817 
1818     SCHED_OP(old_ops, deinit_pdata, ppriv_old, cpu);
1819 
1820     SCHED_OP(old_ops, free_vdata, vpriv_old);
1821     SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
1822 
1823  out:
1824     per_cpu(cpupool, cpu) = c;
1825     /* When a cpu is added to a pool, trigger it to go pick up some work */
1826     if ( c != NULL )
1827         cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
1828 
1829     return 0;
1830 }
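/*
 * Illustrative sketch (not compiled) of the retry pattern referred to in the
 * comment above: the pcpu/vcpu schedule-lock helpers sample the per-CPU lock
 * pointer, acquire it, and, if switch_sched has meanwhile re-routed
 * schedule_lock to the new scheduler's lock, drop it and try again. This is a
 * simplified rendering of the idea, not the actual helper from sched-if.h
 * (which also comes in irq/irqsave variants).
 */
#if 0
static spinlock_t *example_pcpu_schedule_lock(unsigned int cpu)
{
    spinlock_t *lock;

    for ( ; ; )
    {
        lock = per_cpu(schedule_data, cpu).schedule_lock;
        spin_lock(lock);
        /* Still the current lock? Then no remapping raced with us: done. */
        if ( lock == per_cpu(schedule_data, cpu).schedule_lock )
            return lock;
        /* The lock was remapped while we were spinning: retry on the new one. */
        spin_unlock(lock);
    }
}
#endif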
1831 
1832 struct scheduler *scheduler_get_default(void)
1833 {
1834     return &ops;
1835 }
1836 
1837 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
1838 {
1839     int i;
1840     struct scheduler *sched;
1841 
1842     for ( i = 0; i < NUM_SCHEDULERS; i++ )
1843         if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
1844             goto found;
1845     *perr = -ENOENT;
1846     return NULL;
1847 
1848  found:
1849     *perr = -ENOMEM;
1850     if ( (sched = xmalloc(struct scheduler)) == NULL )
1851         return NULL;
1852     memcpy(sched, schedulers[i], sizeof(*sched));
1853     if ( (*perr = SCHED_OP(sched, init)) != 0 )
1854     {
1855         xfree(sched);
1856         sched = NULL;
1857     }
1858 
1859     return sched;
1860 }
1861 
1862 void scheduler_free(struct scheduler *sched)
1863 {
1864     BUG_ON(sched == &ops);
1865     SCHED_OP(sched, deinit);
1866     xfree(sched);
1867 }
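/*
 * Illustrative sketch (not compiled): the intended pairing of
 * scheduler_alloc() and scheduler_free(), roughly as a cpupool-style caller
 * would use them. The "example_pool" structure and functions are
 * hypothetical; the error values mirror the -ENOENT / -ENOMEM / init-failure
 * paths above.
 */
#if 0
struct example_pool { struct scheduler *sched; };

static int example_pool_create(struct example_pool *pool, unsigned int sched_id)
{
    int err;

    pool->sched = scheduler_alloc(sched_id, &err);
    if ( pool->sched == NULL )
        return err;

    return 0;
}

static void example_pool_destroy(struct example_pool *pool)
{
    /* Runs the scheduler's deinit hook and frees the copy made by xmalloc(). */
    scheduler_free(pool->sched);
    pool->sched = NULL;
}
#endif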
1868 
1869 void schedule_dump(struct cpupool *c)
1870 {
1871     unsigned int      i;
1872     struct scheduler *sched;
1873     cpumask_t        *cpus;
1874 
1875     /* Locking, if necessary, must be handled within each scheduler. */
1876 
1877     if ( c != NULL )
1878     {
1879         sched = c->sched;
1880         cpus = c->cpu_valid;
1881         printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
1882         SCHED_OP(sched, dump_settings);
1883     }
1884     else
1885     {
1886         sched = &ops;
1887         cpus = &cpupool_free_cpus;
1888     }
1889 
1890     if ( sched->dump_cpu_state != NULL )
1891     {
1892         printk("CPUs info:\n");
1893         for_each_cpu (i, cpus)
1894             SCHED_OP(sched, dump_cpu_state, i);
1895     }
1896 }
1897 
1898 void sched_tick_suspend(void)
1899 {
1900     struct scheduler *sched;
1901     unsigned int cpu = smp_processor_id();
1902 
1903     sched = per_cpu(scheduler, cpu);
1904     SCHED_OP(sched, tick_suspend, cpu);
1905     rcu_idle_enter(cpu);
1906     rcu_idle_timer_start();
1907 }
1908 
1909 void sched_tick_resume(void)
1910 {
1911     struct scheduler *sched;
1912     unsigned int cpu = smp_processor_id();
1913 
1914     rcu_idle_timer_stop();
1915     rcu_idle_exit(cpu);
1916     sched = per_cpu(scheduler, cpu);
1917     SCHED_OP(sched, tick_resume, cpu);
1918 }
1919 
1920 void wait(void)
1921 {
1922     schedule();
1923 }
1924 
1925 #ifdef CONFIG_COMPAT
1926 #include "compat/schedule.c"
1927 #endif
1928 
1929 #endif /* !COMPAT */
1930 
1931 /*
1932  * Local variables:
1933  * mode: C
1934  * c-file-style: "BSD"
1935  * c-basic-offset: 4
1936  * tab-width: 4
1937  * indent-tabs-mode: nil
1938  * End:
1939  */
1940