1 /****************************************************************************
2 * (C) 2002-2003 - Rolf Neugebauer - Intel Research Cambridge
3 * (C) 2002-2003 University of Cambridge
4 * (C) 2004 - Mark Williamson - Intel Research Cambridge
5 ****************************************************************************
6 *
7 * File: common/schedule.c
8 * Author: Rolf Neugebauer & Keir Fraser
9 * Updated for generic API by Mark Williamson
10 *
11 * Description: Generic CPU scheduling code
12 * implements support functionality for the Xen scheduler API.
13 *
14 */
15
16 #ifndef COMPAT
17 #include <xen/init.h>
18 #include <xen/lib.h>
19 #include <xen/sched.h>
20 #include <xen/domain.h>
21 #include <xen/delay.h>
22 #include <xen/event.h>
23 #include <xen/time.h>
24 #include <xen/timer.h>
25 #include <xen/perfc.h>
26 #include <xen/sched-if.h>
27 #include <xen/softirq.h>
28 #include <xen/trace.h>
29 #include <xen/mm.h>
30 #include <xen/err.h>
31 #include <xen/guest_access.h>
32 #include <xen/hypercall.h>
33 #include <xen/multicall.h>
34 #include <xen/cpu.h>
35 #include <xen/preempt.h>
36 #include <xen/event.h>
37 #include <public/sched.h>
38 #include <xsm/xsm.h>
39 #include <xen/err.h>
40
41 /* opt_sched: scheduler - default to configured value */
42 static char __initdata opt_sched[10] = CONFIG_SCHED_DEFAULT;
43 string_param("sched", opt_sched);
44
45 /* If sched_smt_power_savings is set, the scheduler will give preference to a
46 * partially idle package over a fully idle one when picking a pCPU on which
47 * to schedule a vCPU.
48 */
49 bool_t sched_smt_power_savings = 0;
50 boolean_param("sched_smt_power_savings", sched_smt_power_savings);
51
52 /* Default scheduling rate limit: 1ms
53 * The behavior when sched_ratelimit_us is greater than sched_credit_tslice_ms is undefined.
54 */
55 int sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
56 integer_param("sched_ratelimit_us", sched_ratelimit_us);
57 /* Various timer handlers. */
58 static void s_timer_fn(void *unused);
59 static void vcpu_periodic_timer_fn(void *data);
60 static void vcpu_singleshot_timer_fn(void *data);
61 static void poll_timer_fn(void *data);
62
63 /* This is global for now so that private implementations can reach it */
64 DEFINE_PER_CPU(struct schedule_data, schedule_data);
65 DEFINE_PER_CPU(struct scheduler *, scheduler);
66
67 /* Scratch space for cpumasks. */
68 DEFINE_PER_CPU(cpumask_t, cpumask_scratch);
69
70 extern const struct scheduler *__start_schedulers_array[], *__end_schedulers_array[];
71 #define NUM_SCHEDULERS (__end_schedulers_array - __start_schedulers_array)
72 #define schedulers __start_schedulers_array
73
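/* The default scheduler, selected at boot time (see scheduler_init() below). */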
74 static struct scheduler __read_mostly ops;
75
76 #define SCHED_OP(opsptr, fn, ...) \
77 (( (opsptr)->fn != NULL ) ? (opsptr)->fn(opsptr, ##__VA_ARGS__ ) \
78 : (typeof((opsptr)->fn(opsptr, ##__VA_ARGS__)))0 )
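/*
 * SCHED_OP() dispatches to a per-scheduler hook only if the scheduler
 * implements it; a missing hook makes the whole expression evaluate to a
 * zero of the hook's return type. As an illustrative sketch (mirroring the
 * real call site in sched_init_domain() below):
 *
 *     ret = SCHED_OP(dom_scheduler(d), init_domain, d);
 *
 * either calls d's scheduler's init_domain hook, or yields 0 if the
 * scheduler does not provide one.
 */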
79
80 static inline struct scheduler *dom_scheduler(const struct domain *d)
81 {
82 if ( likely(d->cpupool != NULL) )
83 return d->cpupool->sched;
84
85 /*
86 * If d->cpupool is NULL, this is the idle domain. This is special
87 * because the idle domain does not really belong to any cpupool, and,
88 * hence, does not really have a scheduler.
89 *
90 * This is (should be!) only called like this for allocating the idle
91 * vCPUs for the first time, during boot, in which case what we want
92 * is the default scheduler, i.e., the one that has been chosen at boot.
93 */
94 ASSERT(is_idle_domain(d));
95 return &ops;
96 }
97
98 static inline struct scheduler *vcpu_scheduler(const struct vcpu *v)
99 {
100 struct domain *d = v->domain;
101
102 if ( likely(d->cpupool != NULL) )
103 return d->cpupool->sched;
104
105 /*
106 * If d->cpupool is NULL, this is a vCPU of the idle domain. And this
107 * case is special because the idle domain does not really belong to
108 * a cpupool and, hence, doesn't really have a scheduler. In fact, its
109 * vCPUs (may) run on pCPUs which are in different pools, with different
110 * schedulers.
111 *
112 * What we want, in this case, is the scheduler of the pCPU where this
113 * particular idle vCPU is running. And, since v->processor never changes
114 * for idle vCPUs, it is safe to use it, with no locks, to figure that out.
115 */
116 ASSERT(is_idle_domain(d));
117 return per_cpu(scheduler, v->processor);
118 }
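/* The set of online pCPUs usable by a vCPU: the cpumask of its domain's cpupool. */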
119 #define VCPU2ONLINE(_v) cpupool_domain_cpumask((_v)->domain)
120
121 static inline void trace_runstate_change(struct vcpu *v, int new_state)
122 {
123 struct { uint32_t vcpu:16, domain:16; } d;
124 uint32_t event;
125
126 if ( likely(!tb_init_done) )
127 return;
128
129 d.vcpu = v->vcpu_id;
130 d.domain = v->domain->domain_id;
131
132 event = TRC_SCHED_RUNSTATE_CHANGE;
133 event |= ( v->runstate.state & 0x3 ) << 8;
134 event |= ( new_state & 0x3 ) << 4;
135
136 __trace_var(event, 1/*tsc*/, sizeof(d), &d);
137 }
138
139 static inline void trace_continue_running(struct vcpu *v)
140 {
141 struct { uint32_t vcpu:16, domain:16; } d;
142
143 if ( likely(!tb_init_done) )
144 return;
145
146 d.vcpu = v->vcpu_id;
147 d.domain = v->domain->domain_id;
148
149 __trace_var(TRC_SCHED_CONTINUE_RUNNING, 1/*tsc*/, sizeof(d), &d);
150 }
151
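/*
 * A vCPU blocked in an event-channel poll (SCHEDOP_poll) is accounted as
 * "urgent" on its pCPU; the per-CPU urgent_count lets other components
 * (e.g. the cpuidle code) avoid deep sleep states while such a
 * latency-sensitive wakeup is pending.
 */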
152 static inline void vcpu_urgent_count_update(struct vcpu *v)
153 {
154 if ( is_idle_vcpu(v) )
155 return;
156
157 if ( unlikely(v->is_urgent) )
158 {
159 if ( !(v->pause_flags & VPF_blocked) ||
160 !test_bit(v->vcpu_id, v->domain->poll_mask) )
161 {
162 v->is_urgent = 0;
163 atomic_dec(&per_cpu(schedule_data,v->processor).urgent_count);
164 }
165 }
166 else
167 {
168 if ( unlikely(v->pause_flags & VPF_blocked) &&
169 unlikely(test_bit(v->vcpu_id, v->domain->poll_mask)) )
170 {
171 v->is_urgent = 1;
172 atomic_inc(&per_cpu(schedule_data,v->processor).urgent_count);
173 }
174 }
175 }
176
177 static inline void vcpu_runstate_change(
178 struct vcpu *v, int new_state, s_time_t new_entry_time)
179 {
180 s_time_t delta;
181
182 ASSERT(v->runstate.state != new_state);
183 ASSERT(spin_is_locked(per_cpu(schedule_data,v->processor).schedule_lock));
184
185 vcpu_urgent_count_update(v);
186
187 trace_runstate_change(v, new_state);
188
189 delta = new_entry_time - v->runstate.state_entry_time;
190 if ( delta > 0 )
191 {
192 v->runstate.time[v->runstate.state] += delta;
193 v->runstate.state_entry_time = new_entry_time;
194 }
195
196 v->runstate.state = new_state;
197 }
198
199 void vcpu_runstate_get(struct vcpu *v, struct vcpu_runstate_info *runstate)
200 {
201 spinlock_t *lock = likely(v == current) ? NULL : vcpu_schedule_lock_irq(v);
202 s_time_t delta;
203
204 memcpy(runstate, &v->runstate, sizeof(*runstate));
205 delta = NOW() - runstate->state_entry_time;
206 if ( delta > 0 )
207 runstate->time[runstate->state] += delta;
208
209 if ( unlikely(lock != NULL) )
210 vcpu_schedule_unlock_irq(lock, v);
211 }
212
213 uint64_t get_cpu_idle_time(unsigned int cpu)
214 {
215 struct vcpu_runstate_info state = { 0 };
216 struct vcpu *v = idle_vcpu[cpu];
217
218 if ( cpu_online(cpu) && v )
219 vcpu_runstate_get(v, &state);
220
221 return state.time[RUNSTATE_running];
222 }
223
224 /*
225 * If locks are different, take the one with the lower address first.
226 * This avoids dead- or live-locks when this code is running on both
227 * cpus at the same time.
228 */
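/*
 * Illustrative scenario: if one cpu did lock(L1); lock(L2) while another did
 * lock(L2); lock(L1), each could end up holding one lock while spinning on
 * the other forever. Taking the lower-addressed lock first means both cpus
 * contend on the same lock first, so one of them always makes progress.
 */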
229 static void sched_spin_lock_double(spinlock_t *lock1, spinlock_t *lock2,
230 unsigned long *flags)
231 {
232 if ( lock1 == lock2 )
233 {
234 spin_lock_irqsave(lock1, *flags);
235 }
236 else if ( lock1 < lock2 )
237 {
238 spin_lock_irqsave(lock1, *flags);
239 spin_lock(lock2);
240 }
241 else
242 {
243 spin_lock_irqsave(lock2, *flags);
244 spin_lock(lock1);
245 }
246 }
247
248 static void sched_spin_unlock_double(spinlock_t *lock1, spinlock_t *lock2,
249 unsigned long flags)
250 {
251 if ( lock1 != lock2 )
252 spin_unlock(lock2);
253 spin_unlock_irqrestore(lock1, flags);
254 }
255
256 int sched_init_vcpu(struct vcpu *v, unsigned int processor)
257 {
258 struct domain *d = v->domain;
259
260 /*
261 * Initialize processor and affinity settings. The idler, and potentially
262 * domain-0 VCPUs, are pinned onto their respective physical CPUs.
263 */
264 v->processor = processor;
265 if ( is_idle_domain(d) || d->is_pinned )
266 cpumask_copy(v->cpu_hard_affinity, cpumask_of(processor));
267 else
268 cpumask_setall(v->cpu_hard_affinity);
269
270 cpumask_setall(v->cpu_soft_affinity);
271
272 /* Initialise the per-vcpu timers. */
273 init_timer(&v->periodic_timer, vcpu_periodic_timer_fn,
274 v, v->processor);
275 init_timer(&v->singleshot_timer, vcpu_singleshot_timer_fn,
276 v, v->processor);
277 init_timer(&v->poll_timer, poll_timer_fn,
278 v, v->processor);
279
280 v->sched_priv = SCHED_OP(dom_scheduler(d), alloc_vdata, v,
281 d->sched_priv);
282 if ( v->sched_priv == NULL )
283 return 1;
284
285 /* Idle VCPUs are scheduled immediately, so don't put them in runqueue. */
286 if ( is_idle_domain(d) )
287 {
288 per_cpu(schedule_data, v->processor).curr = v;
289 v->is_running = 1;
290 }
291 else
292 {
293 SCHED_OP(dom_scheduler(d), insert_vcpu, v);
294 }
295
296 return 0;
297 }
298
299 static void sched_move_irqs(struct vcpu *v)
300 {
301 arch_move_irqs(v);
302 evtchn_move_pirqs(v);
303 }
304
305 int sched_move_domain(struct domain *d, struct cpupool *c)
306 {
307 struct vcpu *v;
308 unsigned int new_p;
309 void **vcpu_priv;
310 void *domdata;
311 void *vcpudata;
312 struct scheduler *old_ops;
313 void *old_domdata;
314
315 for_each_vcpu ( d, v )
316 {
317 if ( v->affinity_broken )
318 return -EBUSY;
319 }
320
321 domdata = SCHED_OP(c->sched, alloc_domdata, d);
322 if ( domdata == NULL )
323 return -ENOMEM;
324
325 vcpu_priv = xzalloc_array(void *, d->max_vcpus);
326 if ( vcpu_priv == NULL )
327 {
328 SCHED_OP(c->sched, free_domdata, domdata);
329 return -ENOMEM;
330 }
331
332 for_each_vcpu ( d, v )
333 {
334 vcpu_priv[v->vcpu_id] = SCHED_OP(c->sched, alloc_vdata, v, domdata);
335 if ( vcpu_priv[v->vcpu_id] == NULL )
336 {
337 for_each_vcpu ( d, v )
338 xfree(vcpu_priv[v->vcpu_id]);
339 xfree(vcpu_priv);
340 SCHED_OP(c->sched, free_domdata, domdata);
341 return -ENOMEM;
342 }
343 }
344
345 domain_pause(d);
346
347 old_ops = dom_scheduler(d);
348 old_domdata = d->sched_priv;
349
350 for_each_vcpu ( d, v )
351 {
352 SCHED_OP(old_ops, remove_vcpu, v);
353 }
354
355 d->cpupool = c;
356 d->sched_priv = domdata;
357
358 new_p = cpumask_first(c->cpu_valid);
359 for_each_vcpu ( d, v )
360 {
361 spinlock_t *lock;
362
363 vcpudata = v->sched_priv;
364
365 migrate_timer(&v->periodic_timer, new_p);
366 migrate_timer(&v->singleshot_timer, new_p);
367 migrate_timer(&v->poll_timer, new_p);
368
369 cpumask_setall(v->cpu_hard_affinity);
370 cpumask_setall(v->cpu_soft_affinity);
371
372 lock = vcpu_schedule_lock_irq(v);
373 v->processor = new_p;
374 /*
375 * With v->processor modified we must not
376 * - make any further changes assuming we hold the scheduler lock,
377 * - use vcpu_schedule_unlock_irq().
378 */
379 spin_unlock_irq(lock);
380
381 v->sched_priv = vcpu_priv[v->vcpu_id];
382 if ( !d->is_dying )
383 sched_move_irqs(v);
384
385 new_p = cpumask_cycle(new_p, c->cpu_valid);
386
387 SCHED_OP(c->sched, insert_vcpu, v);
388
389 SCHED_OP(old_ops, free_vdata, vcpudata);
390 }
391
392 domain_update_node_affinity(d);
393
394 domain_unpause(d);
395
396 SCHED_OP(old_ops, free_domdata, old_domdata);
397
398 xfree(vcpu_priv);
399
400 return 0;
401 }
402
403 void sched_destroy_vcpu(struct vcpu *v)
404 {
405 kill_timer(&v->periodic_timer);
406 kill_timer(&v->singleshot_timer);
407 kill_timer(&v->poll_timer);
408 if ( test_and_clear_bool(v->is_urgent) )
409 atomic_dec(&per_cpu(schedule_data, v->processor).urgent_count);
410 SCHED_OP(vcpu_scheduler(v), remove_vcpu, v);
411 SCHED_OP(vcpu_scheduler(v), free_vdata, v->sched_priv);
412 }
413
414 int sched_init_domain(struct domain *d, int poolid)
415 {
416 int ret;
417
418 ASSERT(d->cpupool == NULL);
419
420 if ( (ret = cpupool_add_domain(d, poolid)) )
421 return ret;
422
423 SCHED_STAT_CRANK(dom_init);
424 TRACE_1D(TRC_SCHED_DOM_ADD, d->domain_id);
425 return SCHED_OP(dom_scheduler(d), init_domain, d);
426 }
427
428 void sched_destroy_domain(struct domain *d)
429 {
430 ASSERT(d->cpupool != NULL || is_idle_domain(d));
431
432 SCHED_STAT_CRANK(dom_destroy);
433 TRACE_1D(TRC_SCHED_DOM_REM, d->domain_id);
434 SCHED_OP(dom_scheduler(d), destroy_domain, d);
435
436 cpupool_rm_domain(d);
437 }
438
439 void vcpu_sleep_nosync(struct vcpu *v)
440 {
441 unsigned long flags;
442 spinlock_t *lock;
443
444 TRACE_2D(TRC_SCHED_SLEEP, v->domain->domain_id, v->vcpu_id);
445
446 lock = vcpu_schedule_lock_irqsave(v, &flags);
447
448 if ( likely(!vcpu_runnable(v)) )
449 {
450 if ( v->runstate.state == RUNSTATE_runnable )
451 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
452
453 SCHED_OP(vcpu_scheduler(v), sleep, v);
454 }
455
456 vcpu_schedule_unlock_irqrestore(lock, flags, v);
457 }
458
459 void vcpu_sleep_sync(struct vcpu *v)
460 {
461 vcpu_sleep_nosync(v);
462
463 while ( !vcpu_runnable(v) && v->is_running )
464 cpu_relax();
465
466 sync_vcpu_execstate(v);
467 }
468
469 void vcpu_wake(struct vcpu *v)
470 {
471 unsigned long flags;
472 spinlock_t *lock;
473
474 TRACE_2D(TRC_SCHED_WAKE, v->domain->domain_id, v->vcpu_id);
475
476 lock = vcpu_schedule_lock_irqsave(v, &flags);
477
478 if ( likely(vcpu_runnable(v)) )
479 {
480 if ( v->runstate.state >= RUNSTATE_blocked )
481 vcpu_runstate_change(v, RUNSTATE_runnable, NOW());
482 SCHED_OP(vcpu_scheduler(v), wake, v);
483 }
484 else if ( !(v->pause_flags & VPF_blocked) )
485 {
486 if ( v->runstate.state == RUNSTATE_blocked )
487 vcpu_runstate_change(v, RUNSTATE_offline, NOW());
488 }
489
490 vcpu_schedule_unlock_irqrestore(lock, flags, v);
491 }
492
493 void vcpu_unblock(struct vcpu *v)
494 {
495 if ( !test_and_clear_bit(_VPF_blocked, &v->pause_flags) )
496 return;
497
498 /* Polling period ends when a VCPU is unblocked. */
499 if ( unlikely(v->poll_evtchn != 0) )
500 {
501 v->poll_evtchn = 0;
502 /*
503 * We *must* re-clear _VPF_blocked to avoid racing other wakeups of
504 * this VCPU (and it then going back to sleep on poll_mask).
505 * Test-and-clear is idiomatic and ensures the clear_bit is not reordered.
506 */
507 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
508 clear_bit(_VPF_blocked, &v->pause_flags);
509 }
510
511 vcpu_wake(v);
512 }
513
514 /*
515 * Do the actual movement of a vcpu from old to new CPU. Locks for *both*
516 * CPUs need to have been taken already when calling this!
517 */
518 static void vcpu_move_locked(struct vcpu *v, unsigned int new_cpu)
519 {
520 unsigned int old_cpu = v->processor;
521
522 /*
523 * Transfer urgency status to new CPU before switching CPUs, as
524 * once the switch occurs, v->is_urgent is no longer protected by
525 * the per-CPU scheduler lock we are holding.
526 */
527 if ( unlikely(v->is_urgent) && (old_cpu != new_cpu) )
528 {
529 atomic_inc(&per_cpu(schedule_data, new_cpu).urgent_count);
530 atomic_dec(&per_cpu(schedule_data, old_cpu).urgent_count);
531 }
532
533 /*
534 * Actual CPU switch to new CPU. This is safe because the lock
535 * pointer can't change while the current lock is held.
536 */
537 if ( vcpu_scheduler(v)->migrate )
538 SCHED_OP(vcpu_scheduler(v), migrate, v, new_cpu);
539 else
540 v->processor = new_cpu;
541 }
542
543 /*
544 * Move a vcpu from its current processor to a target new processor,
545 * without asking the scheduler to do any placement. This is intended
546 * to be called from special contexts, where things are quiet
547 * enough that no contention is supposed to happen (i.e., during
548 * shutdown or software suspend, like ACPI S3).
549 */
550 static void vcpu_move_nosched(struct vcpu *v, unsigned int new_cpu)
551 {
552 unsigned long flags;
553 spinlock_t *lock, *new_lock;
554
555 ASSERT(system_state == SYS_STATE_suspend);
556 ASSERT(!vcpu_runnable(v) && (atomic_read(&v->pause_count) ||
557 atomic_read(&v->domain->pause_count)));
558
559 lock = per_cpu(schedule_data, v->processor).schedule_lock;
560 new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
561
562 sched_spin_lock_double(lock, new_lock, &flags);
563 ASSERT(new_cpu != v->processor);
564 vcpu_move_locked(v, new_cpu);
565 sched_spin_unlock_double(lock, new_lock, flags);
566
567 sched_move_irqs(v);
568 }
569
570 static void vcpu_migrate(struct vcpu *v)
571 {
572 unsigned long flags;
573 unsigned int old_cpu, new_cpu;
574 spinlock_t *old_lock, *new_lock;
575 bool_t pick_called = 0;
576
577 old_cpu = new_cpu = v->processor;
578 for ( ; ; )
579 {
580 /*
581 * We need another iteration if the pre-calculated lock addresses
582 * turn out to be no longer correct once we re-evaluate the old and
583 * new cpu while holding the locks.
584 */
585 old_lock = per_cpu(schedule_data, old_cpu).schedule_lock;
586 new_lock = per_cpu(schedule_data, new_cpu).schedule_lock;
587
588 sched_spin_lock_double(old_lock, new_lock, &flags);
589
590 old_cpu = v->processor;
591 if ( old_lock == per_cpu(schedule_data, old_cpu).schedule_lock )
592 {
593 /*
594 * If we selected a CPU on the previous iteration, check if it
595 * remains suitable for running this vCPU.
596 */
597 if ( pick_called &&
598 (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) &&
599 cpumask_test_cpu(new_cpu, v->cpu_hard_affinity) &&
600 cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) )
601 break;
602
603 /* Select a new CPU. */
604 new_cpu = SCHED_OP(vcpu_scheduler(v), pick_cpu, v);
605 if ( (new_lock == per_cpu(schedule_data, new_cpu).schedule_lock) &&
606 cpumask_test_cpu(new_cpu, v->domain->cpupool->cpu_valid) )
607 break;
608 pick_called = 1;
609 }
610 else
611 {
612 /*
613 * We do not hold the scheduler lock appropriate for this vCPU.
614 * Thus we cannot select a new CPU on this iteration. Try again.
615 */
616 pick_called = 0;
617 }
618
619 sched_spin_unlock_double(old_lock, new_lock, flags);
620 }
621
622 /*
623 * NB. The check of v->is_running happens /after/ setting the migration flag
624 * because they both happen in (different) spinlock regions, and those
625 * regions are strictly serialised.
626 */
627 if ( v->is_running ||
628 !test_and_clear_bit(_VPF_migrating, &v->pause_flags) )
629 {
630 sched_spin_unlock_double(old_lock, new_lock, flags);
631 return;
632 }
633
634 vcpu_move_locked(v, new_cpu);
635
636 sched_spin_unlock_double(old_lock, new_lock, flags);
637
638 if ( old_cpu != new_cpu )
639 sched_move_irqs(v);
640
641 /* Wake on new CPU. */
642 vcpu_wake(v);
643 }
644
645 /*
646 * Force a VCPU through a deschedule/reschedule path.
647 * For example, using this when setting the periodic timer period means that
648 * most periodic-timer state need only be touched from within the scheduler
649 * which can thus be done without need for synchronisation.
650 */
651 void vcpu_force_reschedule(struct vcpu *v)
652 {
653 spinlock_t *lock = vcpu_schedule_lock_irq(v);
654
655 if ( v->is_running )
656 set_bit(_VPF_migrating, &v->pause_flags);
657 vcpu_schedule_unlock_irq(lock, v);
658
659 if ( v->pause_flags & VPF_migrating )
660 {
661 vcpu_sleep_nosync(v);
662 vcpu_migrate(v);
663 }
664 }
665
666 void restore_vcpu_affinity(struct domain *d)
667 {
668 unsigned int cpu = smp_processor_id();
669 struct vcpu *v;
670
671 ASSERT(system_state == SYS_STATE_resume);
672
673 for_each_vcpu ( d, v )
674 {
675 spinlock_t *lock;
676
677 ASSERT(!vcpu_runnable(v));
678
679 lock = vcpu_schedule_lock_irq(v);
680
681 if ( v->affinity_broken )
682 {
683 cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
684 v->affinity_broken = 0;
685
686 }
687
688 /*
689 * During suspend (in cpu_disable_scheduler()), we moved every vCPU
690 * to BSP (which, as of now, is pCPU 0), as a temporary measure to
691 * allow the non-boot processors to have their data structures freed
692 * and go to sleep. But nothing guarantees that the BSP is a valid
693 * pCPU for a particular domain.
694 *
695 * Therefore, here, before actually unpausing the domains, we should
696 * set v->processor of each of their vCPUs to something that will
697 * make sense for the scheduler of the cpupool they are in.
698 */
699 cpumask_and(cpumask_scratch_cpu(cpu), v->cpu_hard_affinity,
700 cpupool_domain_cpumask(v->domain));
701 v->processor = cpumask_any(cpumask_scratch_cpu(cpu));
702
703 spin_unlock_irq(lock);
704
705 lock = vcpu_schedule_lock_irq(v);
706 v->processor = SCHED_OP(vcpu_scheduler(v), pick_cpu, v);
707 spin_unlock_irq(lock);
708 }
709
710 domain_update_node_affinity(d);
711 }
712
713 /*
714 * This function is used by cpu_hotplug code from stop_machine context
715 * and from cpupools to switch schedulers on a cpu.
716 */
717 int cpu_disable_scheduler(unsigned int cpu)
718 {
719 struct domain *d;
720 struct vcpu *v;
721 struct cpupool *c;
722 cpumask_t online_affinity;
723 unsigned int new_cpu;
724 int ret = 0;
725
726 c = per_cpu(cpupool, cpu);
727 if ( c == NULL )
728 return ret;
729
730 /*
731 * We'd need the domain RCU lock, but:
732 * - when we are called from cpupool code, it's acquired there already;
733 * - when we are called for CPU teardown, we're in stop-machine context,
734 * so that's not a problem.
735 */
736 for_each_domain_in_cpupool ( d, c )
737 {
738 for_each_vcpu ( d, v )
739 {
740 unsigned long flags;
741 spinlock_t *lock = vcpu_schedule_lock_irqsave(v, &flags);
742
743 cpumask_and(&online_affinity, v->cpu_hard_affinity, c->cpu_valid);
744 if ( cpumask_empty(&online_affinity) &&
745 cpumask_test_cpu(cpu, v->cpu_hard_affinity) )
746 {
747 if ( v->affinity_broken )
748 {
749 /* The vcpu is temporarily pinned, can't move it. */
750 vcpu_schedule_unlock_irqrestore(lock, flags, v);
751 ret = -EADDRINUSE;
752 break;
753 }
754
755 if ( system_state == SYS_STATE_suspend )
756 {
757 cpumask_copy(v->cpu_hard_affinity_saved,
758 v->cpu_hard_affinity);
759 v->affinity_broken = 1;
760 }
761 else
762 printk(XENLOG_DEBUG "Breaking affinity for %pv\n", v);
763
764 cpumask_setall(v->cpu_hard_affinity);
765 }
766
767 if ( v->processor != cpu )
768 {
769 /* The vcpu is not on this cpu, so we can move on. */
770 vcpu_schedule_unlock_irqrestore(lock, flags, v);
771 continue;
772 }
773
774 /* If it is on this cpu, we must send it away. */
775 if ( unlikely(system_state == SYS_STATE_suspend) )
776 {
777 vcpu_schedule_unlock_irqrestore(lock, flags, v);
778
779 /*
780 * If we are doing a shutdown/suspend, it is not necessary to
781 * ask the scheduler to chime in. In fact:
782 * * there is no reason for it: the end result we are after
783 * is just 'all the vcpus on the boot pcpu, and no vcpu
784 * anywhere else', so let's just go for it;
785 * * it's wrong, for cpupools with only non-boot pcpus, as
786 * the scheduler would always fail to send the vcpus away
787 * from the last online (non boot) pcpu!
788 *
789 * Therefore, in the shutdown/suspend case, we just pick up
790 * one (still) online pcpu. Note that, at this stage, all
791 * domains (including dom0) have been paused already, so we
792 * do not expect any vcpu activity at all.
793 */
794 cpumask_andnot(&online_affinity, &cpu_online_map,
795 cpumask_of(cpu));
796 BUG_ON(cpumask_empty(&online_affinity));
797 /*
798 * As boot cpu is, usually, pcpu #0, using cpumask_first()
799 * will make us converge quicker.
800 */
801 new_cpu = cpumask_first(&online_affinity);
802 vcpu_move_nosched(v, new_cpu);
803 }
804 else
805 {
806 /*
807 * OTOH, if the system is still live, and we are here because
808 * we are doing some cpupool manipulations:
809 * * we want to call the scheduler, and let it re-evaluate
810 * the placement of the vcpu, taking into account the new
811 * cpupool configuration;
812 * * the scheduler will always find a suitable solution, or
813 * things would have failed before getting in here.
814 */
815 set_bit(_VPF_migrating, &v->pause_flags);
816 vcpu_schedule_unlock_irqrestore(lock, flags, v);
817 vcpu_sleep_nosync(v);
818 vcpu_migrate(v);
819
820 /*
821 * The only caveat, in this case, is that a vcpu active in
822 * the hypervisor may not be migratable. If so, the caller
823 * should try again after releasing and reacquiring all locks.
824 */
825 if ( v->processor == cpu )
826 ret = -EAGAIN;
827 }
828 }
829 }
830
831 return ret;
832 }
833
834 static int vcpu_set_affinity(
835 struct vcpu *v, const cpumask_t *affinity, cpumask_t *which)
836 {
837 spinlock_t *lock;
838 int ret = 0;
839
840 lock = vcpu_schedule_lock_irq(v);
841
842 if ( v->affinity_broken )
843 ret = -EBUSY;
844 else
845 {
846 cpumask_copy(which, affinity);
847
848 /*
849 * Always ask the scheduler to re-evaluate placement
850 * when changing the affinity.
851 */
852 set_bit(_VPF_migrating, &v->pause_flags);
853 }
854
855 vcpu_schedule_unlock_irq(lock, v);
856
857 domain_update_node_affinity(v->domain);
858
859 if ( v->pause_flags & VPF_migrating )
860 {
861 vcpu_sleep_nosync(v);
862 vcpu_migrate(v);
863 }
864
865 return ret;
866 }
867
868 int vcpu_set_hard_affinity(struct vcpu *v, const cpumask_t *affinity)
869 {
870 cpumask_t online_affinity;
871 cpumask_t *online;
872
873 if ( v->domain->is_pinned )
874 return -EINVAL;
875
876 online = VCPU2ONLINE(v);
877 cpumask_and(&online_affinity, affinity, online);
878 if ( cpumask_empty(&online_affinity) )
879 return -EINVAL;
880
881 return vcpu_set_affinity(v, affinity, v->cpu_hard_affinity);
882 }
883
884 int vcpu_set_soft_affinity(struct vcpu *v, const cpumask_t *affinity)
885 {
886 return vcpu_set_affinity(v, affinity, v->cpu_soft_affinity);
887 }
888
889 /* Block the currently-executing domain until a pertinent event occurs. */
890 void vcpu_block(void)
891 {
892 struct vcpu *v = current;
893
894 set_bit(_VPF_blocked, &v->pause_flags);
895
896 arch_vcpu_block(v);
897
898 /* Check for events /after/ blocking: avoids wakeup waiting race. */
899 if ( local_events_need_delivery() )
900 {
901 clear_bit(_VPF_blocked, &v->pause_flags);
902 }
903 else
904 {
905 TRACE_2D(TRC_SCHED_BLOCK, v->domain->domain_id, v->vcpu_id);
906 raise_softirq(SCHEDULE_SOFTIRQ);
907 }
908 }
909
910 static void vcpu_block_enable_events(void)
911 {
912 local_event_delivery_enable();
913 vcpu_block();
914 }
915
916 static long do_poll(struct sched_poll *sched_poll)
917 {
918 struct vcpu *v = current;
919 struct domain *d = v->domain;
920 evtchn_port_t port;
921 long rc;
922 unsigned int i;
923
924 /* Fairly arbitrary limit. */
925 if ( sched_poll->nr_ports > 128 )
926 return -EINVAL;
927
928 if ( !guest_handle_okay(sched_poll->ports, sched_poll->nr_ports) )
929 return -EFAULT;
930
931 set_bit(_VPF_blocked, &v->pause_flags);
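    /* -1 means "polling a set of ports"; narrowed to the single port further
     * down if exactly one port is being polled. */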
932 v->poll_evtchn = -1;
933 set_bit(v->vcpu_id, d->poll_mask);
934
935 arch_vcpu_block(v);
936
937 #ifndef CONFIG_X86 /* set_bit() implies mb() on x86 */
938 /* Check for events /after/ setting flags: avoids wakeup waiting race. */
939 smp_mb();
940
941 /*
942 * Someone may have seen we are blocked but not that we are polling, or
943 * vice versa. We are certainly being woken, so clean up and bail. Beyond
944 * this point others can be guaranteed to clean up for us if they wake us.
945 */
946 rc = 0;
947 if ( (v->poll_evtchn == 0) ||
948 !test_bit(_VPF_blocked, &v->pause_flags) ||
949 !test_bit(v->vcpu_id, d->poll_mask) )
950 goto out;
951 #endif
952
953 rc = 0;
954 if ( local_events_need_delivery() )
955 goto out;
956
957 for ( i = 0; i < sched_poll->nr_ports; i++ )
958 {
959 rc = -EFAULT;
960 if ( __copy_from_guest_offset(&port, sched_poll->ports, i, 1) )
961 goto out;
962
963 rc = -EINVAL;
964 if ( port >= d->max_evtchns )
965 goto out;
966
967 rc = 0;
968 if ( evtchn_port_is_pending(d, port) )
969 goto out;
970 }
971
972 if ( sched_poll->nr_ports == 1 )
973 v->poll_evtchn = port;
974
975 if ( sched_poll->timeout != 0 )
976 set_timer(&v->poll_timer, sched_poll->timeout);
977
978 TRACE_2D(TRC_SCHED_BLOCK, d->domain_id, v->vcpu_id);
979 raise_softirq(SCHEDULE_SOFTIRQ);
980
981 return 0;
982
983 out:
984 v->poll_evtchn = 0;
985 clear_bit(v->vcpu_id, d->poll_mask);
986 clear_bit(_VPF_blocked, &v->pause_flags);
987 return rc;
988 }
989
990 /* Voluntarily yield the processor for this allocation. */
991 long vcpu_yield(void)
992 {
993 struct vcpu *v = current;
994 spinlock_t *lock = vcpu_schedule_lock_irq(v);
995
996 SCHED_OP(vcpu_scheduler(v), yield, v);
997 vcpu_schedule_unlock_irq(lock, v);
998
999 SCHED_STAT_CRANK(vcpu_yield);
1000
1001 TRACE_2D(TRC_SCHED_YIELD, current->domain->domain_id, current->vcpu_id);
1002 raise_softirq(SCHEDULE_SOFTIRQ);
1003 return 0;
1004 }
1005
1006 static void domain_watchdog_timeout(void *data)
1007 {
1008 struct domain *d = data;
1009
1010 if ( d->is_shutting_down || d->is_dying )
1011 return;
1012
1013 printk("Watchdog timer fired for domain %u\n", d->domain_id);
1014 domain_shutdown(d, SHUTDOWN_watchdog);
1015 }
1016
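/*
 * SCHEDOP_watchdog backend: id == 0 allocates and arms a fresh watchdog slot
 * and returns its 1-based handle; a non-zero id re-arms the corresponding
 * slot, or disarms and frees it when timeout is 0.
 */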
1017 static long domain_watchdog(struct domain *d, uint32_t id, uint32_t timeout)
1018 {
1019 if ( id > NR_DOMAIN_WATCHDOG_TIMERS )
1020 return -EINVAL;
1021
1022 spin_lock(&d->watchdog_lock);
1023
1024 if ( id == 0 )
1025 {
1026 for ( id = 0; id < NR_DOMAIN_WATCHDOG_TIMERS; id++ )
1027 {
1028 if ( test_and_set_bit(id, &d->watchdog_inuse_map) )
1029 continue;
1030 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1031 break;
1032 }
1033 spin_unlock(&d->watchdog_lock);
1034 return id == NR_DOMAIN_WATCHDOG_TIMERS ? -ENOSPC : id + 1;
1035 }
1036
1037 id -= 1;
1038 if ( !test_bit(id, &d->watchdog_inuse_map) )
1039 {
1040 spin_unlock(&d->watchdog_lock);
1041 return -EINVAL;
1042 }
1043
1044 if ( timeout == 0 )
1045 {
1046 stop_timer(&d->watchdog_timer[id]);
1047 clear_bit(id, &d->watchdog_inuse_map);
1048 }
1049 else
1050 {
1051 set_timer(&d->watchdog_timer[id], NOW() + SECONDS(timeout));
1052 }
1053
1054 spin_unlock(&d->watchdog_lock);
1055 return 0;
1056 }
1057
1058 void watchdog_domain_init(struct domain *d)
1059 {
1060 unsigned int i;
1061
1062 spin_lock_init(&d->watchdog_lock);
1063
1064 d->watchdog_inuse_map = 0;
1065
1066 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1067 init_timer(&d->watchdog_timer[i], domain_watchdog_timeout, d, 0);
1068 }
1069
1070 void watchdog_domain_destroy(struct domain *d)
1071 {
1072 unsigned int i;
1073
1074 for ( i = 0; i < NR_DOMAIN_WATCHDOG_TIMERS; i++ )
1075 kill_timer(&d->watchdog_timer[i]);
1076 }
1077
1078 int vcpu_pin_override(struct vcpu *v, int cpu)
1079 {
1080 spinlock_t *lock;
1081 int ret = -EINVAL;
1082
1083 lock = vcpu_schedule_lock_irq(v);
1084
1085 if ( cpu < 0 )
1086 {
1087 if ( v->affinity_broken )
1088 {
1089 cpumask_copy(v->cpu_hard_affinity, v->cpu_hard_affinity_saved);
1090 v->affinity_broken = 0;
1091 set_bit(_VPF_migrating, &v->pause_flags);
1092 ret = 0;
1093 }
1094 }
1095 else if ( cpu < nr_cpu_ids )
1096 {
1097 if ( v->affinity_broken )
1098 ret = -EBUSY;
1099 else if ( cpumask_test_cpu(cpu, VCPU2ONLINE(v)) )
1100 {
1101 cpumask_copy(v->cpu_hard_affinity_saved, v->cpu_hard_affinity);
1102 v->affinity_broken = 1;
1103 cpumask_copy(v->cpu_hard_affinity, cpumask_of(cpu));
1104 set_bit(_VPF_migrating, &v->pause_flags);
1105 ret = 0;
1106 }
1107 }
1108
1109 vcpu_schedule_unlock_irq(lock, v);
1110
1111 domain_update_node_affinity(v->domain);
1112
1113 if ( v->pause_flags & VPF_migrating )
1114 {
1115 vcpu_sleep_nosync(v);
1116 vcpu_migrate(v);
1117 }
1118
1119 return ret;
1120 }
1121
1122 typedef long ret_t;
1123
1124 #endif /* !COMPAT */
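/*
 * The code from here down to the next #ifndef COMPAT is compiled a second
 * time via compat/schedule.c (included near the bottom of this file), which
 * is expected to define COMPAT and substitute compat-width types (e.g. its
 * own ret_t) before re-including this source.
 */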
1125
1126 ret_t do_sched_op(int cmd, XEN_GUEST_HANDLE_PARAM(void) arg)
1127 {
1128 ret_t ret = 0;
1129
1130 switch ( cmd )
1131 {
1132 case SCHEDOP_yield:
1133 {
1134 ret = vcpu_yield();
1135 break;
1136 }
1137
1138 case SCHEDOP_block:
1139 {
1140 vcpu_block_enable_events();
1141 break;
1142 }
1143
1144 case SCHEDOP_shutdown:
1145 {
1146 struct sched_shutdown sched_shutdown;
1147
1148 ret = -EFAULT;
1149 if ( copy_from_guest(&sched_shutdown, arg, 1) )
1150 break;
1151
1152 TRACE_3D(TRC_SCHED_SHUTDOWN,
1153 current->domain->domain_id, current->vcpu_id,
1154 sched_shutdown.reason);
1155 ret = domain_shutdown(current->domain, (u8)sched_shutdown.reason);
1156
1157 break;
1158 }
1159
1160 case SCHEDOP_shutdown_code:
1161 {
1162 struct sched_shutdown sched_shutdown;
1163 struct domain *d = current->domain;
1164
1165 ret = -EFAULT;
1166 if ( copy_from_guest(&sched_shutdown, arg, 1) )
1167 break;
1168
1169 TRACE_3D(TRC_SCHED_SHUTDOWN_CODE,
1170 d->domain_id, current->vcpu_id, sched_shutdown.reason);
1171
1172 spin_lock(&d->shutdown_lock);
1173 if ( d->shutdown_code == SHUTDOWN_CODE_INVALID )
1174 d->shutdown_code = (u8)sched_shutdown.reason;
1175 spin_unlock(&d->shutdown_lock);
1176
1177 ret = 0;
1178 break;
1179 }
1180
1181 case SCHEDOP_poll:
1182 {
1183 struct sched_poll sched_poll;
1184
1185 ret = -EFAULT;
1186 if ( copy_from_guest(&sched_poll, arg, 1) )
1187 break;
1188
1189 ret = do_poll(&sched_poll);
1190
1191 break;
1192 }
1193
1194 case SCHEDOP_remote_shutdown:
1195 {
1196 struct domain *d;
1197 struct sched_remote_shutdown sched_remote_shutdown;
1198
1199 ret = -EFAULT;
1200 if ( copy_from_guest(&sched_remote_shutdown, arg, 1) )
1201 break;
1202
1203 ret = -ESRCH;
1204 d = rcu_lock_domain_by_id(sched_remote_shutdown.domain_id);
1205 if ( d == NULL )
1206 break;
1207
1208 ret = xsm_schedop_shutdown(XSM_DM_PRIV, current->domain, d);
1209 if ( likely(!ret) )
1210 domain_shutdown(d, sched_remote_shutdown.reason);
1211
1212 rcu_unlock_domain(d);
1213
1214 break;
1215 }
1216
1217 case SCHEDOP_watchdog:
1218 {
1219 struct sched_watchdog sched_watchdog;
1220
1221 ret = -EFAULT;
1222 if ( copy_from_guest(&sched_watchdog, arg, 1) )
1223 break;
1224
1225 ret = domain_watchdog(
1226 current->domain, sched_watchdog.id, sched_watchdog.timeout);
1227 break;
1228 }
1229
1230 case SCHEDOP_pin_override:
1231 {
1232 struct sched_pin_override sched_pin_override;
1233
1234 ret = -EPERM;
1235 if ( !is_hardware_domain(current->domain) )
1236 break;
1237
1238 ret = -EFAULT;
1239 if ( copy_from_guest(&sched_pin_override, arg, 1) )
1240 break;
1241
1242 ret = vcpu_pin_override(current, sched_pin_override.pcpu);
1243
1244 break;
1245 }
1246
1247 default:
1248 ret = -ENOSYS;
1249 }
1250
1251 return ret;
1252 }
1253
1254 #ifndef COMPAT
1255
1256 /* Per-vcpu oneshot-timer hypercall. */
1257 long do_set_timer_op(s_time_t timeout)
1258 {
1259 struct vcpu *v = current;
1260 s_time_t offset = timeout - NOW();
1261
1262 if ( timeout == 0 )
1263 {
1264 stop_timer(&v->singleshot_timer);
1265 }
1266 else if ( unlikely(timeout < 0) || /* overflow into 64th bit? */
1267 unlikely((offset > 0) && ((uint32_t)(offset >> 50) != 0)) )
1268 {
1269 /*
1270 * Linux workaround: occasionally we will see timeouts a long way in
1271 * the future due to wrapping in Linux's jiffy time handling. We check
1272 * for timeouts wrapped negative, and for positive timeouts more than
1273 * about 13 days in the future (2^50ns). The correct fix is to trigger
1274 * an interrupt immediately (since Linux in fact has pending work to
1275 * do in this situation). However, older guests also set a long timeout
1276 * when they have *no* pending timers at all: setting an immediate
1277 * timeout in this case can burn a lot of CPU. We therefore go for a
1278 * reasonable middleground of triggering a timer event in 100ms.
1279 */
1280 gdprintk(XENLOG_INFO, "Warning: huge timeout set: %"PRIx64"\n",
1281 timeout);
1282 set_timer(&v->singleshot_timer, NOW() + MILLISECS(100));
1283 }
1284 else
1285 {
1286 migrate_timer(&v->singleshot_timer, smp_processor_id());
1287 set_timer(&v->singleshot_timer, timeout);
1288 }
1289
1290 return 0;
1291 }
1292
1293 /* sched_id - fetch ID of current scheduler */
1294 int sched_id(void)
1295 {
1296 return ops.sched_id;
1297 }
1298
1299 /* Adjust scheduling parameter for a given domain. */
1300 long sched_adjust(struct domain *d, struct xen_domctl_scheduler_op *op)
1301 {
1302 long ret;
1303
1304 ret = xsm_domctl_scheduler_op(XSM_HOOK, d, op->cmd);
1305 if ( ret )
1306 return ret;
1307
1308 if ( op->sched_id != dom_scheduler(d)->sched_id )
1309 return -EINVAL;
1310
1311 switch ( op->cmd )
1312 {
1313 case XEN_DOMCTL_SCHEDOP_putinfo:
1314 case XEN_DOMCTL_SCHEDOP_getinfo:
1315 case XEN_DOMCTL_SCHEDOP_putvcpuinfo:
1316 case XEN_DOMCTL_SCHEDOP_getvcpuinfo:
1317 break;
1318 default:
1319 return -EINVAL;
1320 }
1321
1322 /* NB: the pluggable scheduler code needs to take care
1323 * of locking by itself. */
1324 if ( (ret = SCHED_OP(dom_scheduler(d), adjust, d, op)) == 0 )
1325 TRACE_1D(TRC_SCHED_ADJDOM, d->domain_id);
1326
1327 return ret;
1328 }
1329
1330 long sched_adjust_global(struct xen_sysctl_scheduler_op *op)
1331 {
1332 struct cpupool *pool;
1333 int rc;
1334
1335 rc = xsm_sysctl_scheduler_op(XSM_HOOK, op->cmd);
1336 if ( rc )
1337 return rc;
1338
1339 if ( (op->cmd != XEN_SYSCTL_SCHEDOP_putinfo) &&
1340 (op->cmd != XEN_SYSCTL_SCHEDOP_getinfo) )
1341 return -EINVAL;
1342
1343 pool = cpupool_get_by_id(op->cpupool_id);
1344 if ( pool == NULL )
1345 return -ESRCH;
1346
1347 rc = ((op->sched_id == pool->sched->sched_id)
1348 ? SCHED_OP(pool->sched, adjust_global, op) : -EINVAL);
1349
1350 cpupool_put(pool);
1351
1352 return rc;
1353 }
1354
1355 static void vcpu_periodic_timer_work(struct vcpu *v)
1356 {
1357 s_time_t now = NOW();
1358 s_time_t periodic_next_event;
1359
1360 if ( v->periodic_period == 0 )
1361 return;
1362
1363 periodic_next_event = v->periodic_last_event + v->periodic_period;
1364
1365 if ( now >= periodic_next_event )
1366 {
1367 send_timer_event(v);
1368 v->periodic_last_event = now;
1369 periodic_next_event = now + v->periodic_period;
1370 }
1371
1372 migrate_timer(&v->periodic_timer, smp_processor_id());
1373 set_timer(&v->periodic_timer, periodic_next_event);
1374 }
1375
1376 /*
1377 * The main function
1378 * - deschedule the current domain (scheduler independent).
1379 * - pick a new domain (scheduler dependent).
1380 */
1381 static void schedule(void)
1382 {
1383 struct vcpu *prev = current, *next = NULL;
1384 s_time_t now;
1385 struct scheduler *sched;
1386 unsigned long *tasklet_work = &this_cpu(tasklet_work_to_do);
1387 bool_t tasklet_work_scheduled = 0;
1388 struct schedule_data *sd;
1389 spinlock_t *lock;
1390 struct task_slice next_slice;
1391 int cpu = smp_processor_id();
1392
1393 ASSERT_NOT_IN_ATOMIC();
1394
1395 SCHED_STAT_CRANK(sched_run);
1396
1397 sd = &this_cpu(schedule_data);
1398
1399 /* Update tasklet scheduling status. */
1400 switch ( *tasklet_work )
1401 {
1402 case TASKLET_enqueued:
1403 set_bit(_TASKLET_scheduled, tasklet_work);
1404 /* fallthrough */
1405 case TASKLET_enqueued|TASKLET_scheduled:
1406 tasklet_work_scheduled = 1;
1407 break;
1408 case TASKLET_scheduled:
1409 clear_bit(_TASKLET_scheduled, tasklet_work);
1410 case 0:
1411 /*tasklet_work_scheduled = 0;*/
1412 break;
1413 default:
1414 BUG();
1415 }
1416
1417 lock = pcpu_schedule_lock_irq(cpu);
1418
1419 now = NOW();
1420
1421 stop_timer(&sd->s_timer);
1422
1423 /* get policy-specific decision on scheduling... */
1424 sched = this_cpu(scheduler);
1425 next_slice = sched->do_schedule(sched, now, tasklet_work_scheduled);
1426
1427 next = next_slice.task;
1428
1429 sd->curr = next;
1430
1431 if ( next_slice.time >= 0 ) /* -ve means no limit */
1432 set_timer(&sd->s_timer, now + next_slice.time);
1433
1434 if ( unlikely(prev == next) )
1435 {
1436 pcpu_schedule_unlock_irq(lock, cpu);
1437 TRACE_4D(TRC_SCHED_SWITCH_INFCONT,
1438 next->domain->domain_id, next->vcpu_id,
1439 now - prev->runstate.state_entry_time,
1440 next_slice.time);
1441 trace_continue_running(next);
1442 return continue_running(prev);
1443 }
1444
1445 TRACE_3D(TRC_SCHED_SWITCH_INFPREV,
1446 prev->domain->domain_id, prev->vcpu_id,
1447 now - prev->runstate.state_entry_time);
1448 TRACE_4D(TRC_SCHED_SWITCH_INFNEXT,
1449 next->domain->domain_id, next->vcpu_id,
1450 (next->runstate.state == RUNSTATE_runnable) ?
1451 (now - next->runstate.state_entry_time) : 0,
1452 next_slice.time);
1453
1454 ASSERT(prev->runstate.state == RUNSTATE_running);
1455
1456 TRACE_4D(TRC_SCHED_SWITCH,
1457 prev->domain->domain_id, prev->vcpu_id,
1458 next->domain->domain_id, next->vcpu_id);
1459
1460 vcpu_runstate_change(
1461 prev,
1462 ((prev->pause_flags & VPF_blocked) ? RUNSTATE_blocked :
1463 (vcpu_runnable(prev) ? RUNSTATE_runnable : RUNSTATE_offline)),
1464 now);
1465 prev->last_run_time = now;
1466
1467 ASSERT(next->runstate.state != RUNSTATE_running);
1468 vcpu_runstate_change(next, RUNSTATE_running, now);
1469
1470 /*
1471 * NB. Don't add any trace records from here until the actual context
1472 * switch, else lost_records resume will not work properly.
1473 */
1474
1475 ASSERT(!next->is_running);
1476 next->is_running = 1;
1477
1478 pcpu_schedule_unlock_irq(lock, cpu);
1479
1480 SCHED_STAT_CRANK(sched_ctx);
1481
1482 stop_timer(&prev->periodic_timer);
1483
1484 if ( next_slice.migrated )
1485 sched_move_irqs(next);
1486
1487 vcpu_periodic_timer_work(next);
1488
1489 context_switch(prev, next);
1490 }
1491
1492 void context_saved(struct vcpu *prev)
1493 {
1494 /* Clear running flag /after/ writing context to memory. */
1495 smp_wmb();
1496
1497 prev->is_running = 0;
1498
1499 /* Check for migration request /after/ clearing running flag. */
1500 smp_mb();
1501
1502 SCHED_OP(vcpu_scheduler(prev), context_saved, prev);
1503
1504 if ( unlikely(prev->pause_flags & VPF_migrating) )
1505 vcpu_migrate(prev);
1506 }
1507
1508 /* The scheduler timer: force a run through the scheduler */
1509 static void s_timer_fn(void *unused)
1510 {
1511 raise_softirq(SCHEDULE_SOFTIRQ);
1512 SCHED_STAT_CRANK(sched_irq);
1513 }
1514
1515 /* Per-VCPU periodic timer function: sends a virtual timer interrupt. */
1516 static void vcpu_periodic_timer_fn(void *data)
1517 {
1518 struct vcpu *v = data;
1519 vcpu_periodic_timer_work(v);
1520 }
1521
1522 /* Per-VCPU single-shot timer function: sends a virtual timer interrupt. */
1523 static void vcpu_singleshot_timer_fn(void *data)
1524 {
1525 struct vcpu *v = data;
1526 send_timer_event(v);
1527 }
1528
1529 /* SCHEDOP_poll timeout callback. */
1530 static void poll_timer_fn(void *data)
1531 {
1532 struct vcpu *v = data;
1533
1534 if ( test_and_clear_bit(v->vcpu_id, v->domain->poll_mask) )
1535 vcpu_unblock(v);
1536 }
1537
1538 static int cpu_schedule_up(unsigned int cpu)
1539 {
1540 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1541 void *sched_priv;
1542
1543 per_cpu(scheduler, cpu) = &ops;
1544 spin_lock_init(&sd->_lock);
1545 sd->schedule_lock = &sd->_lock;
1546 sd->curr = idle_vcpu[cpu];
1547 init_timer(&sd->s_timer, s_timer_fn, NULL, cpu);
1548 atomic_set(&sd->urgent_count, 0);
1549
1550 /* Boot CPU is dealt with later in scheduler_init(). */
1551 if ( cpu == 0 )
1552 return 0;
1553
1554 if ( idle_vcpu[cpu] == NULL )
1555 alloc_vcpu(idle_vcpu[0]->domain, cpu, cpu);
1556 else
1557 {
1558 struct vcpu *idle = idle_vcpu[cpu];
1559
1560 /*
1561 * During (ACPI?) suspend the idle vCPU for this pCPU is not freed,
1562 * while its scheduler specific data (what is pointed by sched_priv)
1563 * is. Also, at this stage of the resume path, we attach the pCPU
1564 * to the default scheduler, no matter in what cpupool it was before
1565 * suspend. To avoid inconsistency, let's allocate default scheduler
1566 * data for the idle vCPU here. If the pCPU was in a different pool
1567 * with a different scheduler, it is schedule_cpu_switch(), invoked
1568 * later, that will set things up as appropriate.
1569 */
1570 ASSERT(idle->sched_priv == NULL);
1571
1572 idle->sched_priv = SCHED_OP(&ops, alloc_vdata, idle,
1573 idle->domain->sched_priv);
1574 if ( idle->sched_priv == NULL )
1575 return -ENOMEM;
1576 }
1577 if ( idle_vcpu[cpu] == NULL )
1578 return -ENOMEM;
1579
1580 /*
1581 * We don't want to risk calling xfree() on an sd->sched_priv
1582 * (e.g., inside free_pdata, from cpu_schedule_down() called
1583 * during CPU_UP_CANCELLED) that contains an IS_ERR value.
1584 */
1585 sched_priv = SCHED_OP(&ops, alloc_pdata, cpu);
1586 if ( IS_ERR(sched_priv) )
1587 return PTR_ERR(sched_priv);
1588
1589 sd->sched_priv = sched_priv;
1590
1591 return 0;
1592 }
1593
1594 static void cpu_schedule_down(unsigned int cpu)
1595 {
1596 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1597 struct scheduler *sched = per_cpu(scheduler, cpu);
1598
1599 SCHED_OP(sched, free_pdata, sd->sched_priv, cpu);
1600 SCHED_OP(sched, free_vdata, idle_vcpu[cpu]->sched_priv);
1601
1602 idle_vcpu[cpu]->sched_priv = NULL;
1603 sd->sched_priv = NULL;
1604
1605 kill_timer(&sd->s_timer);
1606 }
1607
1608 static int cpu_schedule_callback(
1609 struct notifier_block *nfb, unsigned long action, void *hcpu)
1610 {
1611 unsigned int cpu = (unsigned long)hcpu;
1612 struct scheduler *sched = per_cpu(scheduler, cpu);
1613 struct schedule_data *sd = &per_cpu(schedule_data, cpu);
1614 int rc = 0;
1615
1616 /*
1617 * From the scheduler perspective, bringing up a pCPU requires
1618 * allocating and initializing the per-pCPU scheduler specific data,
1619 * as well as "registering" this pCPU to the scheduler (which may
1620 * involve modifying some scheduler wide data structures).
1621 * This happens by calling the alloc_pdata and init_pdata hooks, in
1622 * this order. A scheduler that does not need to allocate any per-pCPU
1623 * data can avoid implementing alloc_pdata. init_pdata may, however, be
1624 * necessary/useful in this case too (e.g., it can contain the "register
1625 * the pCPU to the scheduler" part). alloc_pdata (if present) is called
1626 * during CPU_UP_PREPARE. init_pdata (if present) is called during
1627 * CPU_STARTING.
1628 *
1629 * On the other hand, at teardown, we need to reverse what has been done
1630 * during initialization, and then free the per-pCPU specific data. This
1631 * happens by calling the deinit_pdata and free_pdata hooks, in this
1632 * order. If no per-pCPU memory was allocated, there is no need to
1633 * provide an implementation of free_pdata. deinit_pdata may, however,
1634 * be necessary/useful in this case too (e.g., it can undo something done
1635 * on scheduler wide data structure during init_pdata). Both deinit_pdata
1636 * and free_pdata are called during CPU_DEAD.
1637 *
1638 * If something goes wrong during bringup, we go to CPU_UP_CANCELLED
1639 * *before* having called init_pdata. In this case, as there is no
1640 * initialization needing undoing, only free_pdata should be called.
1641 * This means it is possible to call free_pdata just after alloc_pdata,
1642 * without an init_pdata/deinit_pdata "cycle" in between the two.
1643 *
1644 * So, in summary, the usage pattern should look either
1645 * - alloc_pdata-->init_pdata-->deinit_pdata-->free_pdata, or
1646 * - alloc_pdata-->free_pdata.
1647 */
1648 switch ( action )
1649 {
1650 case CPU_STARTING:
1651 SCHED_OP(sched, init_pdata, sd->sched_priv, cpu);
1652 break;
1653 case CPU_UP_PREPARE:
1654 rc = cpu_schedule_up(cpu);
1655 break;
1656 case CPU_DEAD:
1657 SCHED_OP(sched, deinit_pdata, sd->sched_priv, cpu);
1658 /* Fallthrough */
1659 case CPU_UP_CANCELED:
1660 cpu_schedule_down(cpu);
1661 break;
1662 default:
1663 break;
1664 }
1665
1666 return !rc ? NOTIFY_DONE : notifier_from_errno(rc);
1667 }
1668
1669 static struct notifier_block cpu_schedule_nfb = {
1670 .notifier_call = cpu_schedule_callback
1671 };
1672
1673 /* Initialise the data structures. */
1674 void __init scheduler_init(void)
1675 {
1676 struct domain *idle_domain;
1677 int i;
1678
1679 open_softirq(SCHEDULE_SOFTIRQ, schedule);
1680
1681 for ( i = 0; i < NUM_SCHEDULERS; i++)
1682 {
1683 if ( schedulers[i]->global_init && schedulers[i]->global_init() < 0 )
1684 schedulers[i] = NULL;
1685 else if ( !ops.name && !strcmp(schedulers[i]->opt_name, opt_sched) )
1686 ops = *schedulers[i];
1687 }
1688
1689 if ( !ops.name )
1690 {
1691 printk("Could not find scheduler: %s\n", opt_sched);
1692 for ( i = 0; i < NUM_SCHEDULERS; i++ )
1693 if ( schedulers[i] &&
1694 !strcmp(schedulers[i]->opt_name, CONFIG_SCHED_DEFAULT) )
1695 {
1696 ops = *schedulers[i];
1697 break;
1698 }
1699 BUG_ON(!ops.name);
1700 printk("Using '%s' (%s)\n", ops.name, ops.opt_name);
1701 }
1702
1703 if ( cpu_schedule_up(0) )
1704 BUG();
1705 register_cpu_notifier(&cpu_schedule_nfb);
1706
1707 printk("Using scheduler: %s (%s)\n", ops.name, ops.opt_name);
1708 if ( SCHED_OP(&ops, init) )
1709 panic("scheduler returned error on init");
1710
1711 if ( sched_ratelimit_us &&
1712 (sched_ratelimit_us > XEN_SYSCTL_SCHED_RATELIMIT_MAX
1713 || sched_ratelimit_us < XEN_SYSCTL_SCHED_RATELIMIT_MIN) )
1714 {
1715 printk("WARNING: sched_ratelimit_us outside of valid range [%d,%d].\n"
1716 " Resetting to default %u\n",
1717 XEN_SYSCTL_SCHED_RATELIMIT_MIN,
1718 XEN_SYSCTL_SCHED_RATELIMIT_MAX,
1719 SCHED_DEFAULT_RATELIMIT_US);
1720 sched_ratelimit_us = SCHED_DEFAULT_RATELIMIT_US;
1721 }
1722
1723 idle_domain = domain_create(DOMID_IDLE, 0, 0, NULL);
1724 BUG_ON(IS_ERR(idle_domain));
1725 idle_domain->vcpu = idle_vcpu;
1726 idle_domain->max_vcpus = nr_cpu_ids;
1727 if ( alloc_vcpu(idle_domain, 0, 0) == NULL )
1728 BUG();
1729 this_cpu(schedule_data).sched_priv = SCHED_OP(&ops, alloc_pdata, 0);
1730 BUG_ON(IS_ERR(this_cpu(schedule_data).sched_priv));
1731 SCHED_OP(&ops, init_pdata, this_cpu(schedule_data).sched_priv, 0);
1732 }
1733
1734 /*
1735 * Move a pCPU outside of the influence of the scheduler of its current
1736 * cpupool, or subject it to the scheduler of a new cpupool.
1737 *
1738 * For the pCPUs that are removed from their cpupool, their scheduler becomes
1739 * &ops (the default scheduler, selected at boot, which also services the
1740 * default cpupool). However, as these pCPUs are not really part of any pool,
1741 * there won't be any scheduling event on them, not even from the default
1742 * scheduler. Basically, they will just sit idle until they are explicitly
1743 * added back to a cpupool.
1744 */
1745 int schedule_cpu_switch(unsigned int cpu, struct cpupool *c)
1746 {
1747 struct vcpu *idle;
1748 void *ppriv, *ppriv_old, *vpriv, *vpriv_old;
1749 struct scheduler *old_ops = per_cpu(scheduler, cpu);
1750 struct scheduler *new_ops = (c == NULL) ? &ops : c->sched;
1751 struct cpupool *old_pool = per_cpu(cpupool, cpu);
1752 spinlock_t * old_lock;
1753
1754 /*
1755 * pCPUs only move from a valid cpupool to free (i.e., out of any pool),
1756 * or from free to a valid cpupool. In the former case (which happens when
1757 * c is NULL), we want the CPU to have been marked as free already, as
1758 * well as to not be valid for the source pool any longer, when we get to
1759 * here. In the latter case (which happens when c is a valid cpupool), we
1760 * want the CPU to still be marked as free, as well as to not yet be valid
1761 * for the destination pool.
1762 */
1763 ASSERT(c != old_pool && (c != NULL || old_pool != NULL));
1764 ASSERT(cpumask_test_cpu(cpu, &cpupool_free_cpus));
1765 ASSERT((c == NULL && !cpumask_test_cpu(cpu, old_pool->cpu_valid)) ||
1766 (c != NULL && !cpumask_test_cpu(cpu, c->cpu_valid)));
1767
1768 if ( old_ops == new_ops )
1769 goto out;
1770
1771 /*
1772 * To setup the cpu for the new scheduler we need:
1773 * - a valid instance of per-CPU scheduler specific data, as it is
1774 * allocated by SCHED_OP(alloc_pdata). Note that we do not want to
1775 * initialize it yet (i.e., we are not calling SCHED_OP(init_pdata)).
1776 * That will be done by the target scheduler, in SCHED_OP(switch_sched),
1777 * in proper ordering and with locking.
1778 * - a valid instance of per-vCPU scheduler specific data, for the idle
1779 * vCPU of cpu. That is what the target scheduler will use for the
1780 * sched_priv field of the per-vCPU info of the idle domain.
1781 */
1782 idle = idle_vcpu[cpu];
1783 ppriv = SCHED_OP(new_ops, alloc_pdata, cpu);
1784 if ( IS_ERR(ppriv) )
1785 return PTR_ERR(ppriv);
1786 vpriv = SCHED_OP(new_ops, alloc_vdata, idle, idle->domain->sched_priv);
1787 if ( vpriv == NULL )
1788 {
1789 SCHED_OP(new_ops, free_pdata, ppriv, cpu);
1790 return -ENOMEM;
1791 }
1792
1793 SCHED_OP(old_ops, tick_suspend, cpu);
1794
1795 /*
1796 * The actual switch, including (if necessary) the rerouting of the
1797 * scheduler lock to whatever new_ops prefers, needs to happen in one
1798 * critical section, protected by old_ops' lock, or races are possible.
1799 * It is, in fact, the lock of another scheduler that we are taking (the
1800 * scheduler of the cpupool that cpu still belongs to). But that is ok,
1801 * as anyone trying to schedule on this cpu will spin until we
1802 * release that lock (at the bottom of this function). Once they get the
1803 * lock --thanks to the loop inside *_schedule_lock() functions-- they
1804 * will notice that the lock itself changed, and retry acquiring the new
1805 * one (which will be the correct, remapped one, at that point).
1806 */
1807 old_lock = pcpu_schedule_lock_irq(cpu);
1808
1809 vpriv_old = idle->sched_priv;
1810 ppriv_old = per_cpu(schedule_data, cpu).sched_priv;
1811 SCHED_OP(new_ops, switch_sched, cpu, ppriv, vpriv);
1812
1813 /* _Not_ pcpu_schedule_unlock(): schedule_lock may have changed! */
1814 spin_unlock_irq(old_lock);
1815
1816 SCHED_OP(new_ops, tick_resume, cpu);
1817
1818 SCHED_OP(old_ops, deinit_pdata, ppriv_old, cpu);
1819
1820 SCHED_OP(old_ops, free_vdata, vpriv_old);
1821 SCHED_OP(old_ops, free_pdata, ppriv_old, cpu);
1822
1823 out:
1824 per_cpu(cpupool, cpu) = c;
1825 /* When a cpu is added to a pool, trigger it to go pick up some work */
1826 if ( c != NULL )
1827 cpu_raise_softirq(cpu, SCHEDULE_SOFTIRQ);
1828
1829 return 0;
1830 }
1831
1832 struct scheduler *scheduler_get_default(void)
1833 {
1834 return &ops;
1835 }
1836
1837 struct scheduler *scheduler_alloc(unsigned int sched_id, int *perr)
1838 {
1839 int i;
1840 struct scheduler *sched;
1841
1842 for ( i = 0; i < NUM_SCHEDULERS; i++ )
1843 if ( schedulers[i] && schedulers[i]->sched_id == sched_id )
1844 goto found;
1845 *perr = -ENOENT;
1846 return NULL;
1847
1848 found:
1849 *perr = -ENOMEM;
1850 if ( (sched = xmalloc(struct scheduler)) == NULL )
1851 return NULL;
1852 memcpy(sched, schedulers[i], sizeof(*sched));
1853 if ( (*perr = SCHED_OP(sched, init)) != 0 )
1854 {
1855 xfree(sched);
1856 sched = NULL;
1857 }
1858
1859 return sched;
1860 }
1861
1862 void scheduler_free(struct scheduler *sched)
1863 {
1864 BUG_ON(sched == &ops);
1865 SCHED_OP(sched, deinit);
1866 xfree(sched);
1867 }
1868
1869 void schedule_dump(struct cpupool *c)
1870 {
1871 unsigned int i;
1872 struct scheduler *sched;
1873 cpumask_t *cpus;
1874
1875 /* Locking, if necessary, must be handled within each scheduler. */
1876
1877 if ( c != NULL )
1878 {
1879 sched = c->sched;
1880 cpus = c->cpu_valid;
1881 printk("Scheduler: %s (%s)\n", sched->name, sched->opt_name);
1882 SCHED_OP(sched, dump_settings);
1883 }
1884 else
1885 {
1886 sched = &ops;
1887 cpus = &cpupool_free_cpus;
1888 }
1889
1890 if ( sched->dump_cpu_state != NULL )
1891 {
1892 printk("CPUs info:\n");
1893 for_each_cpu (i, cpus)
1894 SCHED_OP(sched, dump_cpu_state, i);
1895 }
1896 }
1897
1898 void sched_tick_suspend(void)
1899 {
1900 struct scheduler *sched;
1901 unsigned int cpu = smp_processor_id();
1902
1903 sched = per_cpu(scheduler, cpu);
1904 SCHED_OP(sched, tick_suspend, cpu);
1905 rcu_idle_enter(cpu);
1906 rcu_idle_timer_start();
1907 }
1908
1909 void sched_tick_resume(void)
1910 {
1911 struct scheduler *sched;
1912 unsigned int cpu = smp_processor_id();
1913
1914 rcu_idle_timer_stop();
1915 rcu_idle_exit(cpu);
1916 sched = per_cpu(scheduler, cpu);
1917 SCHED_OP(sched, tick_resume, cpu);
1918 }
1919
1920 void wait(void)
1921 {
1922 schedule();
1923 }
1924
1925 #ifdef CONFIG_COMPAT
1926 #include "compat/schedule.c"
1927 #endif
1928
1929 #endif /* !COMPAT */
1930
1931 /*
1932 * Local variables:
1933 * mode: C
1934 * c-file-style: "BSD"
1935 * c-basic-offset: 4
1936 * tab-width: 4
1937 * indent-tabs-mode: nil
1938 * End:
1939 */
1940