/*
 * cpuidle_menu - menu governor for cpu idle; the main idea comes from the
 * Linux driver drivers/cpuidle/governors/menu.c.
 *
 * Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 * Copyright (C) 2007, 2008 Intel Corporation
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or (at
 * your option) any later version.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License along
 * with this program; If not, see <http://www.gnu.org/licenses/>.
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
#include <xen/errno.h>
#include <xen/lib.h>
#include <xen/types.h>
#include <xen/acpi.h>
#include <xen/timer.h>
#include <xen/cpuidle.h>
#include <asm/irq.h>

#define BUCKETS 6
#define RESOLUTION 1024
#define DECAY 4
#define MAX_INTERESTING 50000
#define LATENCY_MULTIPLIER 10

/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 *  1) Energy break even point
 *  2) Performance impact
 *  3) Latency tolerance (TBD: from guest virtual C state)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * that is based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, however it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50 micro
 * seconds of idle time.
 * For this reason we keep an array of 6 independent factors, that gets
 * indexed based on the magnitude of the expected duration.
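 *
 * As a rough illustration with the constants defined above (RESOLUTION 1024,
 * DECAY 4): a CPU that historically sleeps for about half of the predicted
 * interval converges on a stored factor of about RESOLUTION * DECAY / 2
 * (2048), so a 1000us "next timer event" distance yields a predicted idle
 * time of roughly 500us.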
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a real
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less impact of C states is acceptable
 *
 * This rule-of-thumb is implemented using the average interrupt interval:
 * if the exit latency times a multiplier is longer than the average
 * interrupt interval, the C state is not considered a candidate for
 * selection because its performance impact would be too high. So the smaller
 * the average interrupt interval is, the smaller the C state latency should
 * be, and thus the less likely a busy CPU will hit such a deep C state.
 *
 * As an additional rule to reduce the performance impact, menu tries to
 * limit the exit latency duration to be no more than 10% of the decaying
 * measured idle time.
 */

struct perf_factor {
    s_time_t time_stamp;
    s_time_t duration;
    unsigned int irq_count_stamp;
    unsigned int irq_sum;
};

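/* Per-CPU governor state: prediction inputs, history and correction data. */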
struct menu_device
{
    int last_state_idx;
    unsigned int expected_us;
    u64 predicted_us;
    u64 latency_factor;
    unsigned int measured_us;
    unsigned int exit_us;
    unsigned int bucket;
    u64 correction_factor[BUCKETS];
    struct perf_factor pf;
};

static DEFINE_PER_CPU(struct menu_device, menu_devices);

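/*
 * Map an expected idle duration (in microseconds) to one of the BUCKETS
 * correction-factor slots, one per order of magnitude.
 */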
static inline int which_bucket(unsigned int duration)
{
    int bucket = 0;

    if (duration < 10)
        return bucket;
    if (duration < 100)
        return bucket + 1;
    if (duration < 1000)
        return bucket + 2;
    if (duration < 10000)
        return bucket + 3;
    if (duration < 100000)
        return bucket + 4;
    return bucket + 5;
}

/*
 * Return the average interrupt interval to take I/O performance
 * requirements into account. The smaller the average interrupt
 * interval, the busier the I/O activity, and thus the higher
 * the barrier to entering an expensive C state.
 */

/* 5 millisecond sampling period */
#define SAMPLING_PERIOD 5000000

/* for I/O interrupts, we give an 8x multiplier to the C state latency */
#define IO_MULTIPLIER 8
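
/*
 * For example, with IO_MULTIPLIER 8 a C state with a 100us exit latency is
 * skipped whenever the recent average interrupt interval drops below 800us.
 */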

static inline s_time_t avg_intr_interval_us(void)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    s_time_t duration, now;
    s_time_t avg_interval;
    unsigned int irq_sum;

    now = NOW();
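    /*
     * Blend the previously committed window with what has elapsed since:
     * the new time and interrupt counts are weighted by (DECAY - 1)/DECAY,
     * the committed baseline by 1/DECAY.
     */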
    duration = (data->pf.duration + (now - data->pf.time_stamp)
                * (DECAY - 1)) / DECAY;

    irq_sum = (data->pf.irq_sum + (this_cpu(irq_count) - data->pf.irq_count_stamp)
               * (DECAY - 1)) / DECAY;

    if (irq_sum == 0)
        /* no irq recently, so return a big enough interval: 1 sec */
        avg_interval = 1000000;
    else
        avg_interval = duration / irq_sum / 1000; /* in us */

    if ( duration >= SAMPLING_PERIOD )
    {
        data->pf.time_stamp = now;
        data->pf.duration = duration;
        data->pf.irq_count_stamp = this_cpu(irq_count);
        data->pf.irq_sum = irq_sum;
    }

    return avg_interval;
}

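/* Microseconds until this CPU's next timer deadline, clamped to fit in 32 bits. */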
static unsigned int get_sleep_length_us(void)
{
    s_time_t us = (this_cpu(timer_deadline) - NOW()) / 1000;
    /*
     * When us < 0 or us > (u32)-1, return a large u32;
     * (unsigned int)-2000 is chosen to avoid wrapping when the exit
     * latency is added, since that latency should not exceed 2ms.
     */
    return (us >> 32) ? (unsigned int)-2000 : (unsigned int)us;
}

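/*
 * Pick the deepest C state whose target residency fits the predicted idle
 * time and whose exit latency respects both the average interrupt interval
 * and the 10% rule against the decaying measured idle time.
 */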
static int menu_select(struct acpi_processor_power *power)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    int i;
    s_time_t io_interval;

    /* TBD: Change to 0 if C0 (polling mode) support is added later */
    data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
    data->exit_us = 0;

    /* determine the expected residency time, round up */
    data->expected_us = get_sleep_length_us();

    data->bucket = which_bucket(data->expected_us);

    io_interval = avg_intr_interval_us();

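    /*
     * Decaying average of the measured idle time; LATENCY_MULTIPLIER below
     * caps the exit latency at roughly 10% of this value.
     */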
    data->latency_factor = DIV_ROUND(
            data->latency_factor * (DECAY - 1) + data->measured_us,
            DECAY);

    /*
     * if the correction factor is 0 (e.g. first time init or cpu hotplug
     * etc), we actually want to start out with a unity factor.
     */
    if (data->correction_factor[data->bucket] == 0)
        data->correction_factor[data->bucket] = RESOLUTION * DECAY;

    /* Make sure to round up for half microseconds */
    data->predicted_us = DIV_ROUND(
            data->expected_us * data->correction_factor[data->bucket],
            RESOLUTION * DECAY);

    /* find the deepest idle state that satisfies our constraints */
    for ( i = CPUIDLE_DRIVER_STATE_START + 1; i < power->count; i++ )
    {
        struct acpi_processor_cx *s = &power->states[i];

        if (s->target_residency > data->predicted_us)
            break;
        if (s->latency * IO_MULTIPLIER > io_interval)
            break;
        if (s->latency * LATENCY_MULTIPLIER > data->latency_factor)
            break;
        /* TBD: we need to check the QoS requirement in the future */
        data->exit_us = s->latency;
        data->last_state_idx = i;
    }

    return data->last_state_idx;
}

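/*
 * Called after wakeup: feed the measured residency back into the
 * correction factor for the bucket used when the prediction was made.
 */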
static void menu_reflect(struct acpi_processor_power *power)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    u64 new_factor;

    data->measured_us = power->last_residency;

    /*
     * We correct for the exit latency; we are assuming here that the
     * exit latency happens after the event that we're interested in.
     */
    if (data->measured_us > data->exit_us)
        data->measured_us -= data->exit_us;

    /* update our correction ratio */

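    /*
     * Exponentially weighted update: keep (DECAY - 1)/DECAY of the old
     * factor and add RESOLUTION * measured_us / expected_us for this
     * wakeup, so a run of perfect predictions settles at RESOLUTION * DECAY.
     */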
    new_factor = data->correction_factor[data->bucket]
                 * (DECAY - 1) / DECAY;

    if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING)
        new_factor += RESOLUTION * data->measured_us / data->expected_us;
    else
        /*
         * we were idle so long that we count it as a perfect
         * prediction
         */
        new_factor += RESOLUTION;

    /*
     * We don't want 0 as factor; we always want at least
     * a tiny bit of estimated time.
     */
    if (new_factor == 0)
        new_factor = 1;

    data->correction_factor[data->bucket] = new_factor;
}

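/* Clear the per-CPU menu state for power->cpu when the governor is enabled. */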
static int menu_enable_device(struct acpi_processor_power *power)
{
    if (!cpu_online(power->cpu))
        return -1;

    memset(&per_cpu(menu_devices, power->cpu), 0, sizeof(struct menu_device));

    return 0;
}

static struct cpuidle_governor menu_governor =
{
    .name =    "menu",
    .rating =  20,
    .enable =  menu_enable_device,
    .select =  menu_select,
    .reflect = menu_reflect,
};

struct cpuidle_governor *cpuidle_current_governor = &menu_governor;
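
/* Export the most recent expected/predicted idle times for tracing. */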
void menu_get_trace_data(u32 *expected, u32 *pred)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    *expected = data->expected_us;
    *pred = data->predicted_us;
}
