/******************************************************************************
 * Additional declarations for the generic scheduler interface.  This should
 * only be included by files that implement conforming schedulers.
 *
 * Portions by Mark Williamson are (C) 2004 Intel Research Cambridge
 */

#ifndef __XEN_SCHED_IF_H__
#define __XEN_SCHED_IF_H__

#include <xen/percpu.h>

/* A global pointer to the initial cpupool (POOL0). */
extern struct cpupool *cpupool0;

/* cpus currently in no cpupool */
extern cpumask_t cpupool_free_cpus;

/* Scheduler generic parameters. */
#define SCHED_DEFAULT_RATELIMIT_US 1000
extern int sched_ratelimit_us;

/*
 * In order to allow a scheduler to remap the cpu -> lock mapping,
 * we have a per-cpu pointer, along with a pre-allocated set of
 * locks.  The generic schedule init code will point each schedule lock
 * pointer to the schedule lock; if the scheduler wants to remap them,
 * it can simply modify the schedule locks.
 *
 * For cache friendliness, keep the actual lock in the same cache area
 * as the rest of the struct.  Just have the scheduler point to the
 * one it wants (this may be the one right in front of it).
 */
struct schedule_data {
    spinlock_t         *schedule_lock,
                       _lock;
    struct vcpu        *curr;           /* current task                    */
    void               *sched_priv;
    struct timer        s_timer;        /* scheduling timer                */
    atomic_t            urgent_count;   /* how many urgent vcpus           */
};

#define curr_on_cpu(c)    (per_cpu(schedule_data, c).curr)

DECLARE_PER_CPU(struct schedule_data, schedule_data);
DECLARE_PER_CPU(struct scheduler *, scheduler);
DECLARE_PER_CPU(struct cpupool *, cpupool);

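/*
 * Illustrative sketch (not part of the interface): a scheduler that keeps
 * its runqueue lock inside its own per-CPU data can repoint schedule_lock
 * from one of its pdata hooks.  The my_sched_* / struct my_pcpu names below
 * are hypothetical:
 *
 *     static void my_sched_init_pdata(const struct scheduler *ops,
 *                                     void *pdata, int cpu)
 *     {
 *         struct my_pcpu *ppriv = pdata;
 *
 *         spin_lock_init(&ppriv->lock);
 *         per_cpu(schedule_data, cpu).schedule_lock = &ppriv->lock;
 *     }
 *
 * In practice the repointing must be serialized against concurrent lockers
 * of the old lock, e.g. by doing it while the old lock is held (cf. the
 * retry loop in sched_lock below).  Schedulers happy with a plain per-CPU
 * lock simply leave schedule_lock pointing at _lock, as set up by the
 * generic init code.
 */
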
50 /*
51  * Scratch space, for avoiding having too many cpumask_t on the stack.
52  * Within each scheduler, when using the scratch mask of one pCPU:
53  * - the pCPU must belong to the scheduler,
54  * - the caller must own the per-pCPU scheduler lock (a.k.a. runqueue
55  *   lock).
56  */
57 DECLARE_PER_CPU(cpumask_t, cpumask_scratch);
58 #define cpumask_scratch        (&this_cpu(cpumask_scratch))
59 #define cpumask_scratch_cpu(c) (&per_cpu(cpumask_scratch, c))
60 
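/*
 * Illustrative use of the scratch mask (a sketch, not code from any
 * particular scheduler): with cpu's scheduler lock held, a temporary mask
 * can be built without placing a cpumask_t on the stack, e.g.:
 *
 *     cpumask_t *mask = cpumask_scratch_cpu(cpu);
 *
 *     cpumask_and(mask, v->cpu_hard_affinity,
 *                 cpupool_domain_cpumask(v->domain));
 *     if ( cpumask_test_cpu(cpu, mask) )
 *         ...
 *
 * The contents are only meaningful until the lock is dropped, since other
 * code operating on the same pCPU may reuse the scratch space.
 */
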
#define sched_lock(kind, param, cpu, irq, arg...) \
static inline spinlock_t *kind##_schedule_lock##irq(param EXTRA_TYPE(arg)) \
{ \
    for ( ; ; ) \
    { \
        spinlock_t *lock = per_cpu(schedule_data, cpu).schedule_lock; \
        /* \
         * v->processor may change while grabbing the lock; but \
         * per_cpu(schedule_data, v->processor).schedule_lock may also \
         * change, if changing cpu pool also changes the scheduler lock. \
         * Retry until they match. \
         * \
         * It may also be the case that v->processor changes but the \
         * lock stays the same; this will succeed in that case. \
         */ \
        spin_lock##irq(lock, ## arg); \
        if ( likely(lock == per_cpu(schedule_data, cpu).schedule_lock) ) \
            return lock; \
        spin_unlock##irq(lock, ## arg); \
    } \
}

#define sched_unlock(kind, param, cpu, irq, arg...) \
static inline void kind##_schedule_unlock##irq(spinlock_t *lock \
                                               EXTRA_TYPE(arg), param) \
{ \
    ASSERT(lock == per_cpu(schedule_data, cpu).schedule_lock); \
    spin_unlock##irq(lock, ## arg); \
}

#define EXTRA_TYPE(arg)
sched_lock(pcpu, unsigned int cpu,     cpu, )
sched_lock(vcpu, const struct vcpu *v, v->processor, )
sched_lock(pcpu, unsigned int cpu,     cpu,          _irq)
sched_lock(vcpu, const struct vcpu *v, v->processor, _irq)
sched_unlock(pcpu, unsigned int cpu,     cpu, )
sched_unlock(vcpu, const struct vcpu *v, v->processor, )
sched_unlock(pcpu, unsigned int cpu,     cpu,          _irq)
sched_unlock(vcpu, const struct vcpu *v, v->processor, _irq)
#undef EXTRA_TYPE

#define EXTRA_TYPE(arg) , unsigned long arg
#define spin_unlock_irqsave spin_unlock_irqrestore
sched_lock(pcpu, unsigned int cpu,     cpu,          _irqsave, *flags)
sched_lock(vcpu, const struct vcpu *v, v->processor, _irqsave, *flags)
#undef spin_unlock_irqsave
sched_unlock(pcpu, unsigned int cpu,     cpu,          _irqrestore, flags)
sched_unlock(vcpu, const struct vcpu *v, v->processor, _irqrestore, flags)
#undef EXTRA_TYPE

#undef sched_unlock
#undef sched_lock

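/*
 * The wrappers generated above hide the lock-pointer race from callers.
 * A typical (illustrative) usage pattern, assuming interrupts are enabled
 * on entry:
 *
 *     spinlock_t *lock = vcpu_schedule_lock_irq(v);
 *
 *     ... operate on v's scheduling state ...
 *
 *     vcpu_schedule_unlock_irq(lock, v);
 *
 * or, when the interrupt state must be preserved:
 *
 *     unsigned long flags;
 *     spinlock_t *lock = pcpu_schedule_lock_irqsave(cpu, &flags);
 *
 *     ...
 *
 *     pcpu_schedule_unlock_irqrestore(lock, flags, cpu);
 */
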
static inline spinlock_t *pcpu_schedule_trylock(unsigned int cpu)
{
    spinlock_t *lock = per_cpu(schedule_data, cpu).schedule_lock;

    if ( !spin_trylock(lock) )
        return NULL;
    if ( lock == per_cpu(schedule_data, cpu).schedule_lock )
        return lock;
    spin_unlock(lock);
    return NULL;
}

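/*
 * Callers of the trylock variant must cope with failure (NULL return),
 * typically by retrying or backing off.  An illustrative sketch, not taken
 * from any particular caller:
 *
 *     spinlock_t *lock = pcpu_schedule_trylock(cpu);
 *
 *     if ( lock == NULL )
 *         return -EAGAIN;
 *     ...
 *     pcpu_schedule_unlock(lock, cpu);
 */
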
struct task_slice {
    struct vcpu *task;
    s_time_t     time;
    bool_t       migrated;
};

struct scheduler {
    char *name;             /* full name for this scheduler      */
    char *opt_name;         /* option name for this scheduler    */
    unsigned int sched_id;  /* ID for this scheduler             */
    void *sched_data;       /* global data pointer               */

    int          (*global_init)    (void);

    int          (*init)           (struct scheduler *);
    void         (*deinit)         (struct scheduler *);

    void         (*free_vdata)     (const struct scheduler *, void *);
    void *       (*alloc_vdata)    (const struct scheduler *, struct vcpu *,
                                    void *);
    void         (*free_pdata)     (const struct scheduler *, void *, int);
    void *       (*alloc_pdata)    (const struct scheduler *, int);
    void         (*init_pdata)     (const struct scheduler *, void *, int);
    void         (*deinit_pdata)   (const struct scheduler *, void *, int);
    void         (*free_domdata)   (const struct scheduler *, void *);
    void *       (*alloc_domdata)  (const struct scheduler *, struct domain *);

    void         (*switch_sched)   (struct scheduler *, unsigned int,
                                    void *, void *);

    int          (*init_domain)    (const struct scheduler *, struct domain *);
    void         (*destroy_domain) (const struct scheduler *, struct domain *);

    /* Activate / deactivate vcpus in a cpu pool */
    void         (*insert_vcpu)    (const struct scheduler *, struct vcpu *);
    void         (*remove_vcpu)    (const struct scheduler *, struct vcpu *);

    void         (*sleep)          (const struct scheduler *, struct vcpu *);
    void         (*wake)           (const struct scheduler *, struct vcpu *);
    void         (*yield)          (const struct scheduler *, struct vcpu *);
    void         (*context_saved)  (const struct scheduler *, struct vcpu *);

    struct task_slice (*do_schedule) (const struct scheduler *, s_time_t,
                                      bool_t tasklet_work_scheduled);

    int          (*pick_cpu)       (const struct scheduler *, struct vcpu *);
    void         (*migrate)        (const struct scheduler *, struct vcpu *,
                                    unsigned int);
    int          (*adjust)         (const struct scheduler *, struct domain *,
                                    struct xen_domctl_scheduler_op *);
    int          (*adjust_global)  (const struct scheduler *,
                                    struct xen_sysctl_scheduler_op *);
    void         (*dump_settings)  (const struct scheduler *);
    void         (*dump_cpu_state) (const struct scheduler *, int);

    void         (*tick_suspend)    (const struct scheduler *, unsigned int);
    void         (*tick_resume)     (const struct scheduler *, unsigned int);
};

#define REGISTER_SCHEDULER(x) static const struct scheduler *x##_entry \
  __used_section(".data.schedulers") = &x;

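/*
 * Illustrative registration sketch (all sched_example_* / example_* names
 * are hypothetical): a conforming scheduler fills in the hooks it
 * implements and registers a static instance, e.g.:
 *
 *     static const struct scheduler sched_example_def = {
 *         .name        = "Example Scheduler",
 *         .opt_name    = "example",
 *         .do_schedule = example_schedule,
 *         .pick_cpu    = example_pick_cpu,
 *         .wake        = example_wake,
 *         .sleep       = example_sleep,
 *     };
 *     REGISTER_SCHEDULER(sched_example_def);
 *
 * A scheduler also sets sched_id to one of the public XEN_SCHEDULER_*
 * constants.  Most hooks are invoked by the generic code only after a
 * NULL check, so optional ones can be left unset; do_schedule is called
 * unconditionally and must be provided.
 */
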
struct cpupool
{
    int              cpupool_id;
    cpumask_var_t    cpu_valid;      /* all cpus assigned to pool */
    cpumask_var_t    cpu_suspended;  /* cpus in S3 that should be in this pool */
    struct cpupool   *next;
    unsigned int     n_dom;
    struct scheduler *sched;
    atomic_t         refcnt;
};

#define cpupool_online_cpumask(_pool) \
    (((_pool) == NULL) ? &cpu_online_map : (_pool)->cpu_valid)

static inline cpumask_t *cpupool_domain_cpumask(struct domain *d)
{
    /*
     * d->cpupool is NULL only for the idle domain, and no one should
     * be interested in calling this for the idle domain.
     */
    ASSERT(d->cpupool != NULL);
    return d->cpupool->cpu_valid;
}

/*
 * Hard and soft affinity load balancing.
 *
 * The idea is that each vcpu has some pcpus that it prefers, some that it
 * does not prefer but is OK with, and some that it cannot run on at all.
 * The first set of pcpus are the ones that are both in the soft affinity
 * *and* in the hard affinity; the second set of pcpus are the ones that are
 * in the hard affinity but *not* in the soft affinity; the third set of
 * pcpus are the ones that are not in the hard affinity.
 *
 * We implement a two step balancing logic. Basically, every time there is
 * the need to decide where to run a vcpu, we first check the soft affinity
 * (well, actually, the && between soft and hard affinity), to see if we can
 * send it where it prefers (and is able) to run. If that first step does
 * not find any suitable and free pcpu, we fall back to checking the hard
 * affinity.
 */
#define BALANCE_SOFT_AFFINITY    0
#define BALANCE_HARD_AFFINITY    1

#define for_each_affinity_balance_step(step) \
    for ( (step) = 0; (step) <= BALANCE_HARD_AFFINITY; (step)++ )

/*
 * Hard affinity balancing is always necessary and must never be skipped.
 * But soft affinity need only be considered when it has a functionally
 * different effect than other constraints (such as hard affinity, cpus
 * online, or cpupools).
 *
 * Soft affinity only needs to be considered if:
 * * The cpus in the cpupool are not a subset of soft affinity
 * * The hard affinity is not a subset of soft affinity
 * * There is an overlap between the soft affinity and the mask which is
 *   currently being considered.
 */
static inline int has_soft_affinity(const struct vcpu *v,
                                    const cpumask_t *mask)
{
    return !cpumask_subset(cpupool_domain_cpumask(v->domain),
                           v->cpu_soft_affinity) &&
           !cpumask_subset(v->cpu_hard_affinity, v->cpu_soft_affinity) &&
           cpumask_intersects(v->cpu_soft_affinity, mask);
}

/*
 * This function copies into mask the cpumask that should be used for a
 * particular affinity balancing step. For the soft affinity step, the pcpus
 * that are not part of v's hard affinity are filtered out of the result,
 * to avoid running a vcpu where it would like, but is not allowed, to run.
 */
static inline void
affinity_balance_cpumask(const struct vcpu *v, int step, cpumask_t *mask)
{
    if ( step == BALANCE_SOFT_AFFINITY )
    {
        cpumask_and(mask, v->cpu_soft_affinity, v->cpu_hard_affinity);

        if ( unlikely(cpumask_empty(mask)) )
            cpumask_copy(mask, v->cpu_hard_affinity);
    }
    else /* step == BALANCE_HARD_AFFINITY */
        cpumask_copy(mask, v->cpu_hard_affinity);
}

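/*
 * Putting the pieces together: a scheduler's CPU-picking path typically
 * walks the two balancing steps, skipping the soft step when it cannot
 * make a difference.  An illustrative sketch (not code from any in-tree
 * scheduler), run with cpu's scheduler lock held so the scratch mask may
 * be used:
 *
 *     int step;
 *     cpumask_t *mask = cpumask_scratch_cpu(cpu);
 *
 *     for_each_affinity_balance_step( step )
 *     {
 *         if ( step == BALANCE_SOFT_AFFINITY &&
 *              !has_soft_affinity(v, v->cpu_hard_affinity) )
 *             continue;
 *
 *         affinity_balance_cpumask(v, step, mask);
 *         cpumask_and(mask, mask, cpupool_domain_cpumask(v->domain));
 *
 *         ... look for an idle or otherwise suitable pCPU in mask; stop
 *         ... if one is found, else move on to the hard affinity step
 *     }
 */
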
#endif /* __XEN_SCHED_IF_H__ */