/*
 * cpuidle_menu - menu governor for cpu idle, main idea comes from Linux
 *                drivers/cpuidle/governors/menu.c
 *
 *  Copyright (C) 2006-2007 Adam Belay <abelay@novell.com>
 *  Copyright (C) 2007, 2008 Intel Corporation
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or (at
 *  your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; if not, see <http://www.gnu.org/licenses/>.
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */
#include <xen/errno.h>
#include <xen/lib.h>
#include <xen/types.h>
#include <xen/acpi.h>
#include <xen/timer.h>
#include <xen/cpuidle.h>
#include <asm/irq.h>

#define BUCKETS 6
#define RESOLUTION 1024
#define DECAY 4
#define MAX_INTERESTING 50000
#define LATENCY_MULTIPLIER 10
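
/*
 * How the tuning constants above are used (see the code below):
 * - BUCKETS:    one correction factor per order of magnitude of the
 *               expected idle duration (cf. which_bucket()).
 * - RESOLUTION, DECAY: fixed-point scale and decay rate of the running
 *               averages; RESOLUTION * DECAY is the "unity" correction
 *               factor, i.e. "idle time equals the next timer event".
 * - MAX_INTERESTING: measured residencies (in us) at or above this are
 *               counted as a perfect prediction in menu_reflect().
 * - LATENCY_MULTIPLIER: a C state is skipped when its exit latency times
 *               this multiplier exceeds the decaying measured idle time.
 */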

/*
 * Concepts and ideas behind the menu governor
 *
 * For the menu governor, there are 3 decision factors for picking a C
 * state:
 * 1) Energy break even point
 * 2) Performance impact
 * 3) Latency tolerance (TBD: from guest virtual C state)
 * These three factors are treated independently.
 *
 * Energy break even point
 * -----------------------
 * C state entry and exit have an energy cost, and a certain amount of time in
 * the C state is required to actually break even on this cost. CPUIDLE
 * provides us this duration in the "target_residency" field. So all that we
 * need is a good prediction of how long we'll be idle. Like the traditional
 * menu governor, we start with the actual known "next timer event" time.
 *
 * Since there are other sources of wakeups (interrupts for example) than
 * the next timer event, this estimation is rather optimistic. To get a
 * more realistic estimate, a correction factor is applied to the estimate,
 * that is based on historic behavior. For example, if in the past the actual
 * duration always was 50% of the next timer tick, the correction factor will
 * be 0.5.
 *
 * menu uses a running average for this correction factor, however it uses a
 * set of factors, not just a single factor. This stems from the realization
 * that the ratio is dependent on the order of magnitude of the expected
 * duration; if we expect 500 milliseconds of idle time the likelihood of
 * getting an interrupt very early is much higher than if we expect 50
 * microseconds of idle time.
 * For this reason we keep an array of 6 independent factors, that gets
 * indexed based on the magnitude of the expected duration.
 *
 * Limiting Performance Impact
 * ---------------------------
 * C states, especially those with large exit latencies, can have a real
 * noticeable impact on workloads, which is not acceptable for most sysadmins,
 * and in addition, less performance has a power price of its own.
 *
 * As a general rule of thumb, menu assumes that the following heuristic
 * holds:
 *     The busier the system, the less impact of C states is acceptable
 *
 * This rule-of-thumb is implemented using the average interrupt interval:
 * if the exit latency times a multiplier is longer than the average
 * interrupt interval, the C state is not considered a candidate
 * for selection due to a too high performance impact. So the smaller
 * the average interrupt interval is, the smaller the C state latency should
 * be, and thus the less likely a busy CPU will hit such a deep C state.
 *
 * As an additional rule to reduce the performance impact, menu tries to
 * limit the exit latency duration to be no more than 10% of the decaying
 * measured idle time.
 */
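
/*
 * Worked example of the fixed-point prediction used below, with the
 * constants defined above (RESOLUTION = 1024, DECAY = 4): the unity
 * correction factor is RESOLUTION * DECAY = 4096.  If past idle periods
 * in a bucket averaged half of the predicted length, the factor decays
 * toward ~2048, so an expected_us of 500 yields
 * predicted_us = DIV_ROUND(500 * 2048, 4096) = 250.
 */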

struct perf_factor {
    s_time_t    time_stamp;       /* start of the current sampling period */
    s_time_t    duration;         /* decaying sum of sampled time */
    unsigned int irq_count_stamp; /* irq_count at the last sampling point */
    unsigned int irq_sum;         /* decaying sum of IRQs seen */
};

struct menu_device
{
    int             last_state_idx;  /* index of the last chosen C state */
    unsigned int    expected_us;     /* time to the next timer event */
    u64             predicted_us;    /* corrected idle time prediction */
    u64             latency_factor;  /* decaying measured idle time */
    unsigned int    measured_us;     /* last observed residency */
    unsigned int    exit_us;         /* exit latency of the chosen state */
    unsigned int    bucket;          /* correction factor bucket in use */
    u64             correction_factor[BUCKETS];
    struct perf_factor pf;
};

static DEFINE_PER_CPU(struct menu_device, menu_devices);
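
/*
 * Map an expected idle duration (in microseconds) to one of the BUCKETS
 * correction factor slots, one per order of magnitude:
 * <10us, <100us, <1ms, <10ms, <100ms, and everything longer.
 */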
static inline int which_bucket(unsigned int duration)
{
    int bucket = 0;

    if (duration < 10)
        return bucket;
    if (duration < 100)
        return bucket + 1;
    if (duration < 1000)
        return bucket + 2;
    if (duration < 10000)
        return bucket + 3;
    if (duration < 100000)
        return bucket + 4;
    return bucket + 5;
}

/*
 * Return the average interrupt interval to take I/O performance
 * requirements into account. The smaller the average interrupt
 * interval is, the busier the I/O activity, and thus the higher
 * the barrier to go to an expensive C state.
 */

/* 5 millisecond sampling period */
#define SAMPLING_PERIOD     5000000

/* for I/O interrupts, we give an 8x multiplier compared to C state latency */
#define IO_MULTIPLIER       8
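
/*
 * The interval is tracked as two decaying sums: the time elapsed since the
 * last sampling point and the IRQs seen since then are each blended in
 * with a weight of (DECAY - 1) / DECAY, and the stored values are only
 * refreshed once a full SAMPLING_PERIOD has accumulated.
 */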
static inline s_time_t avg_intr_interval_us(void)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    s_time_t    duration, now;
    s_time_t    avg_interval;
    unsigned int irq_sum;

    now = NOW();
    duration = (data->pf.duration + (now - data->pf.time_stamp)
            * (DECAY - 1)) / DECAY;

    irq_sum = (data->pf.irq_sum + (this_cpu(irq_count) - data->pf.irq_count_stamp)
            * (DECAY - 1)) / DECAY;

    if (irq_sum == 0)
        /* no irq recently, so return a big enough interval: 1 sec */
        avg_interval = 1000000;
    else
        avg_interval = duration / irq_sum / 1000; /* in us */

    if ( duration >= SAMPLING_PERIOD )
    {
        data->pf.time_stamp = now;
        data->pf.duration = duration;
        data->pf.irq_count_stamp = this_cpu(irq_count);
        data->pf.irq_sum = irq_sum;
    }

    return avg_interval;
}

static unsigned int get_sleep_length_us(void)
{
    s_time_t us = (this_cpu(timer_deadline) - NOW()) / 1000;
    /*
     * If us < 0 or us > (u32)-1, return a large u32; (unsigned int)-2000
     * is chosen so that adding the exit latency cannot wrap around, as
     * the latency should not be larger than 2ms.
     */
    return (us >> 32) ? (unsigned int)-2000 : (unsigned int)us;
}
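
/*
 * Pick the deepest C state whose target residency is covered by the
 * corrected idle time prediction and whose exit latency passes both the
 * I/O interval rule and the LATENCY_MULTIPLIER (10%) rule described in
 * the comment block at the top of this file.
 */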
static int menu_select(struct acpi_processor_power *power)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    int i;
    s_time_t    io_interval;

    /* TBD: Change to 0 if C0 (polling mode) support is added later */
    data->last_state_idx = CPUIDLE_DRIVER_STATE_START;
    data->exit_us = 0;

    /* determine the expected residency time, round up */
    data->expected_us = get_sleep_length_us();

    data->bucket = which_bucket(data->expected_us);

    io_interval = avg_intr_interval_us();

    data->latency_factor = DIV_ROUND(
            data->latency_factor * (DECAY - 1) + data->measured_us,
            DECAY);

    /*
     * If the correction factor is 0 (e.g. first time init or cpu hotplug
     * etc), we actually want to start out with a unity factor.
     */
    if (data->correction_factor[data->bucket] == 0)
        data->correction_factor[data->bucket] = RESOLUTION * DECAY;

    /* Make sure to round up for half microseconds */
    data->predicted_us = DIV_ROUND(
            data->expected_us * data->correction_factor[data->bucket],
            RESOLUTION * DECAY);

    /* find the deepest idle state that satisfies our constraints */
    for ( i = CPUIDLE_DRIVER_STATE_START + 1; i < power->count; i++ )
    {
        struct acpi_processor_cx *s = &power->states[i];

        if (s->target_residency > data->predicted_us)
            break;
        if (s->latency * IO_MULTIPLIER > io_interval)
            break;
        if (s->latency * LATENCY_MULTIPLIER > data->latency_factor)
            break;
        /* TBD: we need to check the QoS requirement in future */
        data->exit_us = s->latency;
        data->last_state_idx = i;
    }

    return data->last_state_idx;
}
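
/*
 * Called on wakeup: fold the residency just measured back into the
 * decaying correction factor for the bucket chosen in menu_select().
 */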
static void menu_reflect(struct acpi_processor_power *power)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);
    u64 new_factor;

    data->measured_us = power->last_residency;

    /*
     * We correct for the exit latency; we are assuming here that the
     * exit latency happens after the event that we're interested in.
     */
    if (data->measured_us > data->exit_us)
        data->measured_us -= data->exit_us;

    /* update our correction ratio */

    new_factor = data->correction_factor[data->bucket]
        * (DECAY - 1) / DECAY;

    if (data->expected_us > 0 && data->measured_us < MAX_INTERESTING)
        new_factor += RESOLUTION * data->measured_us / data->expected_us;
    else
        /*
         * we were idle so long that we count it as a perfect
         * prediction
         */
        new_factor += RESOLUTION;

    /*
     * We don't want 0 as factor; we always want at least
     * a tiny bit of estimated time.
     */
    if (new_factor == 0)
        new_factor = 1;

    data->correction_factor[data->bucket] = new_factor;
}

static int menu_enable_device(struct acpi_processor_power *power)
{
    if (!cpu_online(power->cpu))
        return -1;

    memset(&per_cpu(menu_devices, power->cpu), 0, sizeof(struct menu_device));

    return 0;
}

static struct cpuidle_governor menu_governor =
{
    .name =         "menu",
    .rating =       20,
    .enable =       menu_enable_device,
    .select =       menu_select,
    .reflect =      menu_reflect,
};

struct cpuidle_governor *cpuidle_current_governor = &menu_governor;
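
/*
 * Export the inputs of the most recent prediction (expected_us and
 * predicted_us) for this CPU, e.g. for tracing; note that predicted_us
 * is truncated to 32 bits here.
 */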
void menu_get_trace_data(u32 *expected, u32 *pred)
{
    struct menu_device *data = &__get_cpu_var(menu_devices);

    *expected = data->expected_us;
    *pred = data->predicted_us;
}