1 /*
2 * cpu_idle - xen idle state module derived from Linux
3 * drivers/acpi/processor_idle.c &
4 * arch/x86/kernel/acpi/cstate.c
5 *
6 * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
7 * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
8 * Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
9 * Copyright (C) 2004 Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
10 * - Added processor hotplug support
11 * Copyright (C) 2005 Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
12 * - Added support for C3 on SMP
13 * Copyright (C) 2007, 2008 Intel Corporation
14 *
15 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
16 *
17 * This program is free software; you can redistribute it and/or modify
18 * it under the terms of the GNU General Public License as published by
19 * the Free Software Foundation; either version 2 of the License, or (at
20 * your option) any later version.
21 *
22 * This program is distributed in the hope that it will be useful, but
23 * WITHOUT ANY WARRANTY; without even the implied warranty of
24 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
25 * General Public License for more details.
26 *
27 * You should have received a copy of the GNU General Public License along
28 * with this program; If not, see <http://www.gnu.org/licenses/>.
29 *
30 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
31 */
32
33 #include <xen/errno.h>
34 #include <xen/lib.h>
35 #include <xen/types.h>
36 #include <xen/acpi.h>
37 #include <xen/smp.h>
38 #include <xen/guest_access.h>
39 #include <xen/keyhandler.h>
40 #include <xen/trace.h>
41 #include <xen/sched-if.h>
42 #include <xen/irq.h>
43 #include <asm/cache.h>
44 #include <asm/io.h>
45 #include <asm/iocap.h>
46 #include <asm/hpet.h>
47 #include <asm/processor.h>
48 #include <xen/pmstat.h>
49 #include <xen/softirq.h>
50 #include <public/platform.h>
51 #include <public/sysctl.h>
52 #include <acpi/cpufreq/cpufreq.h>
53 #include <asm/apic.h>
54 #include <asm/cpuidle.h>
55 #include <asm/mwait.h>
56 #include <xen/notifier.h>
57 #include <xen/cpu.h>
58
59 /*#define DEBUG_PM_CX*/
60
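/*
 * Each GET_*_RES() helper below reads a hardware residency counter MSR and
 * converts the raw (TSC-rate) count into nanoseconds.  Which counters are
 * implemented is model specific; do_get_hw_residencies() picks the
 * applicable ones per CPU model.
 */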
61 #define GET_HW_RES_IN_NS(msr, val) \
62 do { rdmsrl(msr, val); val = tsc_ticks2ns(val); } while( 0 )
63 #define GET_MC6_RES(val) GET_HW_RES_IN_NS(0x664, val)
64 #define GET_PC2_RES(val) GET_HW_RES_IN_NS(0x60D, val) /* SNB onwards */
65 #define GET_PC3_RES(val) GET_HW_RES_IN_NS(0x3F8, val)
66 #define GET_PC6_RES(val) GET_HW_RES_IN_NS(0x3F9, val)
67 #define GET_PC7_RES(val) GET_HW_RES_IN_NS(0x3FA, val)
68 #define GET_PC8_RES(val) GET_HW_RES_IN_NS(0x630, val) /* some Haswells only */
69 #define GET_PC9_RES(val) GET_HW_RES_IN_NS(0x631, val) /* some Haswells only */
70 #define GET_PC10_RES(val) GET_HW_RES_IN_NS(0x632, val) /* some Haswells only */
71 #define GET_CC1_RES(val) GET_HW_RES_IN_NS(0x660, val) /* Silvermont only */
72 #define GET_CC3_RES(val) GET_HW_RES_IN_NS(0x3FC, val)
73 #define GET_CC6_RES(val) GET_HW_RES_IN_NS(0x3FD, val)
74 #define GET_CC7_RES(val) GET_HW_RES_IN_NS(0x3FE, val) /* SNB onwards */
75 #define PHI_CC6_RES(val) GET_HW_RES_IN_NS(0x3FF, val) /* Xeon Phi only */
76
77 static void lapic_timer_nop(void) { }
78 void (*__read_mostly lapic_timer_off)(void);
79 void (*__read_mostly lapic_timer_on)(void);
80
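/*
 * Pick how timer interrupts survive deep C-states: with ARAT the local APIC
 * timer keeps running, so no broadcast is needed; otherwise fall back to
 * HPET or PIT broadcast.  Returns false if no mechanism is available, in
 * which case deep C-states cannot safely be used.
 */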
81 bool lapic_timer_init(void)
82 {
83 if ( boot_cpu_has(X86_FEATURE_ARAT) )
84 {
85 lapic_timer_off = lapic_timer_nop;
86 lapic_timer_on = lapic_timer_nop;
87 }
88 else if ( hpet_broadcast_is_available() )
89 {
90 lapic_timer_off = hpet_broadcast_enter;
91 lapic_timer_on = hpet_broadcast_exit;
92 }
93 else if ( pit_broadcast_is_available() )
94 {
95 lapic_timer_off = pit_broadcast_enter;
96 lapic_timer_on = pit_broadcast_exit;
97 }
98 else
99 return false;
100
101 return true;
102 }
103
104 static uint64_t (*__read_mostly tick_to_ns)(uint64_t) = acpi_pm_tick_to_ns;
105
106 void (*__read_mostly pm_idle_save)(void);
107 unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1;
108 integer_param("max_cstate", max_cstate);
109 static bool __read_mostly local_apic_timer_c2_ok;
110 boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);
111
112 struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];
113
114 struct hw_residencies
115 {
116 uint64_t mc0;
117 uint64_t mc6;
118 uint64_t pc2;
119 uint64_t pc3;
120 uint64_t pc4;
121 uint64_t pc6;
122 uint64_t pc7;
123 uint64_t pc8;
124 uint64_t pc9;
125 uint64_t pc10;
126 uint64_t cc1;
127 uint64_t cc3;
128 uint64_t cc6;
129 uint64_t cc7;
130 };
131
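/*
 * Read the residency MSRs of the CPU this runs on.  Callers wanting another
 * CPU's values must run this there via on_selected_cpus() (see
 * get_hw_residencies()).
 */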
132 static void do_get_hw_residencies(void *arg)
133 {
134 struct cpuinfo_x86 *c = &current_cpu_data;
135 struct hw_residencies *hw_res = arg;
136
137 if ( c->x86_vendor != X86_VENDOR_INTEL || c->x86 != 6 )
138 return;
139
140 switch ( c->x86_model )
141 {
142 /* 4th generation Intel Core (Haswell) */
143 case 0x45:
144 GET_PC8_RES(hw_res->pc8);
145 GET_PC9_RES(hw_res->pc9);
146 GET_PC10_RES(hw_res->pc10);
147 /* fall through */
148 /* Sandy bridge */
149 case 0x2A:
150 case 0x2D:
151 /* Ivy bridge */
152 case 0x3A:
153 case 0x3E:
154 /* Haswell */
155 case 0x3C:
156 case 0x3F:
157 case 0x46:
158 /* Broadwell */
159 case 0x3D:
160 case 0x47:
161 case 0x4F:
162 case 0x56:
163 /* Skylake */
164 case 0x4E:
165 case 0x55:
166 case 0x5E:
167 /* Cannon Lake */
168 case 0x66:
169 /* Kaby Lake */
170 case 0x8E:
171 case 0x9E:
172 GET_PC2_RES(hw_res->pc2);
173 GET_CC7_RES(hw_res->cc7);
174 /* fall through */
175 /* Nehalem */
176 case 0x1A:
177 case 0x1E:
178 case 0x1F:
179 case 0x2E:
180 /* Westmere */
181 case 0x25:
182 case 0x2C:
183 case 0x2F:
184 GET_PC3_RES(hw_res->pc3);
185 GET_PC6_RES(hw_res->pc6);
186 GET_PC7_RES(hw_res->pc7);
187 GET_CC3_RES(hw_res->cc3);
188 GET_CC6_RES(hw_res->cc6);
189 break;
190 /* Xeon Phi Knights Landing */
191 case 0x57:
192 /* Xeon Phi Knights Mill */
193 case 0x85:
194 GET_CC3_RES(hw_res->mc0); /* abusing GET_CC3_RES */
195 GET_CC6_RES(hw_res->mc6); /* abusing GET_CC6_RES */
196 GET_PC2_RES(hw_res->pc2);
197 GET_PC3_RES(hw_res->pc3);
198 GET_PC6_RES(hw_res->pc6);
199 GET_PC7_RES(hw_res->pc7);
200 PHI_CC6_RES(hw_res->cc6);
201 break;
202 /* various Atoms */
203 case 0x27:
204 GET_PC3_RES(hw_res->pc2); /* abusing GET_PC3_RES */
205 GET_PC6_RES(hw_res->pc4); /* abusing GET_PC6_RES */
206 GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
207 break;
208 /* Silvermont */
209 case 0x37:
210 case 0x4A:
211 case 0x4D:
212 case 0x5A:
213 case 0x5D:
214 /* Airmont */
215 case 0x4C:
216 GET_MC6_RES(hw_res->mc6);
217 GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
218 GET_CC1_RES(hw_res->cc1);
219 GET_CC6_RES(hw_res->cc6);
220 break;
221 /* Goldmont */
222 case 0x5C:
223 case 0x5F:
224 /* Goldmont Plus */
225 case 0x7A:
226 GET_PC2_RES(hw_res->pc2);
227 GET_PC3_RES(hw_res->pc3);
228 GET_PC6_RES(hw_res->pc6);
229 GET_PC10_RES(hw_res->pc10);
230 GET_CC1_RES(hw_res->cc1);
231 GET_CC3_RES(hw_res->cc3);
232 GET_CC6_RES(hw_res->cc6);
233 break;
234 }
235 }
236
237 static void get_hw_residencies(uint32_t cpu, struct hw_residencies *hw_res)
238 {
239 memset(hw_res, 0, sizeof(*hw_res));
240
241 if ( smp_processor_id() == cpu )
242 do_get_hw_residencies(hw_res);
243 else
244 on_selected_cpus(cpumask_of(cpu), do_get_hw_residencies, hw_res, 1);
245 }
246
247 static void print_hw_residencies(uint32_t cpu)
248 {
249 struct hw_residencies hw_res;
250
251 get_hw_residencies(cpu, &hw_res);
252
253 if ( hw_res.mc0 | hw_res.mc6 )
254 printk("MC0[%"PRIu64"] MC6[%"PRIu64"]\n",
255 hw_res.mc0, hw_res.mc6);
256 printk("PC2[%"PRIu64"] PC%d[%"PRIu64"] PC6[%"PRIu64"] PC7[%"PRIu64"]\n",
257 hw_res.pc2,
258 hw_res.pc4 ? 4 : 3, hw_res.pc4 ?: hw_res.pc3,
259 hw_res.pc6, hw_res.pc7);
260 if ( hw_res.pc8 | hw_res.pc9 | hw_res.pc10 )
261 printk("PC8[%"PRIu64"] PC9[%"PRIu64"] PC10[%"PRIu64"]\n",
262 hw_res.pc8, hw_res.pc9, hw_res.pc10);
263 printk("CC%d[%"PRIu64"] CC6[%"PRIu64"] CC7[%"PRIu64"]\n",
264 hw_res.cc1 ? 1 : 3, hw_res.cc1 ?: hw_res.cc3,
265 hw_res.cc6, hw_res.cc7);
266 }
267
268 static char* acpi_cstate_method_name[] =
269 {
270 "NONE",
271 "SYSIO",
272 "FFH",
273 "HALT"
274 };
275
276 static uint64_t get_stime_tick(void) { return (uint64_t)NOW(); }
277 static uint64_t stime_ticks_elapsed(uint64_t t1, uint64_t t2) { return t2 - t1; }
278 static uint64_t stime_tick_to_ns(uint64_t ticks) { return ticks; }
279
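/*
 * The ACPI PM timer is a free-running counter, 24 or 32 bits wide depending
 * on the FADT flag, so elapsed-tick calculations must allow for a single
 * wrap-around.
 */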
280 static uint64_t get_acpi_pm_tick(void) { return (uint64_t)inl(pmtmr_ioport); }
281 static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2)
282 {
283 if ( t2 >= t1 )
284 return (t2 - t1);
285 else if ( !(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER) )
286 return (((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF);
287 else
288 return ((0xFFFFFFFF - t1) + t2 + 1);
289 }
290
291 uint64_t (*__read_mostly cpuidle_get_tick)(void) = get_acpi_pm_tick;
292 static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t)
293 = acpi_pm_ticks_elapsed;
294
295 static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
296 {
297 uint64_t idle_res = 0, idle_usage = 0;
298 uint64_t last_state_update_tick, current_tick, current_stime;
299 uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
300 uint64_t res_tick[ACPI_PROCESSOR_MAX_POWER] = { 0 };
301 unsigned int i;
302 signed int last_state_idx;
303
304 printk("==cpu%d==\n", cpu);
305 last_state_idx = power->last_state ? power->last_state->idx : -1;
306 printk("active state:\t\tC%d\n", last_state_idx);
307 printk("max_cstate:\t\tC%d\n", max_cstate);
308 printk("states:\n");
309
310 spin_lock_irq(&power->stat_lock);
311 current_tick = cpuidle_get_tick();
312 current_stime = NOW();
313 for ( i = 1; i < power->count; i++ )
314 {
315 res_tick[i] = power->states[i].time;
316 usage[i] = power->states[i].usage;
317 }
318 last_state_update_tick = power->last_state_update_tick;
319 spin_unlock_irq(&power->stat_lock);
320
321 if ( last_state_idx >= 0 )
322 {
323 res_tick[last_state_idx] += ticks_elapsed(last_state_update_tick,
324 current_tick);
325 usage[last_state_idx]++;
326 }
327
328 for ( i = 1; i < power->count; i++ )
329 {
330 idle_usage += usage[i];
331 idle_res += tick_to_ns(res_tick[i]);
332
333 printk((last_state_idx == i) ? " *" : " ");
334 printk("C%d:\t", i);
335 printk("type[C%d] ", power->states[i].type);
336 printk("latency[%03d] ", power->states[i].latency);
337 printk("usage[%08"PRIu64"] ", usage[i]);
338 printk("method[%5s] ", acpi_cstate_method_name[power->states[i].entry_method]);
339 printk("duration[%"PRIu64"]\n", tick_to_ns(res_tick[i]));
340 }
341 printk((last_state_idx == 0) ? " *" : " ");
342 printk("C0:\tusage[%08"PRIu64"] duration[%"PRIu64"]\n",
343 usage[0] + idle_usage, current_stime - idle_res);
344
345 print_hw_residencies(cpu);
346 }
347
348 static void dump_cx(unsigned char key)
349 {
350 unsigned int cpu;
351
352 printk("'%c' pressed -> printing ACPI Cx structures\n", key);
353 for_each_online_cpu ( cpu )
354 if (processor_powers[cpu])
355 print_acpi_power(cpu, processor_powers[cpu]);
356 }
357
358 static int __init cpu_idle_key_init(void)
359 {
360 register_keyhandler('c', dump_cx, "dump ACPI Cx structures", 1);
361 return 0;
362 }
363 __initcall(cpu_idle_key_init);
364
365 /*
366 * A CPU's bit is set iff it used monitor/mwait to enter a C-state.
367 * With this flag set, the CPU can be woken from the C-state by writing
368 * to a specific memory address instead of being sent an IPI.
369 */
370 static cpumask_t cpuidle_mwait_flags;
371
372 void cpuidle_wakeup_mwait(cpumask_t *mask)
373 {
374 cpumask_t target;
375 unsigned int cpu;
376
377 cpumask_and(&target, mask, &cpuidle_mwait_flags);
378
379 /* CPU is MWAITing on the cpuidle_mwait_wakeup flag. */
380 for_each_cpu(cpu, &target)
381 mwait_wakeup(cpu) = 0;
382
383 cpumask_andnot(mask, mask, &target);
384 }
385
386 bool arch_skip_send_event_check(unsigned int cpu)
387 {
388 /*
389 * This relies on softirq_pending() and mwait_wakeup() to access data
390 * on the same cache line.
391 */
392 smp_mb();
393 return !!cpumask_test_cpu(cpu, &cpuidle_mwait_flags);
394 }
395
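/*
 * Enter an MWAIT-based C-state: arm MONITOR on this CPU's mwait_wakeup()
 * flag, then MWAIT with the hardware hint in eax and the extension bits in
 * ecx (callers typically pass MWAIT_ECX_INTERRUPT_BREAK).  The CPU is woken
 * either by an interrupt or by cpuidle_wakeup_mwait() writing the monitored
 * flag, which avoids sending an IPI.
 */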
396 void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
397 {
398 unsigned int cpu = smp_processor_id();
399 s_time_t expires = per_cpu(timer_deadline, cpu);
400
401 if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
402 {
403 mb();
404 clflush((void *)&mwait_wakeup(cpu));
405 mb();
406 }
407
408 __monitor((void *)&mwait_wakeup(cpu), 0, 0);
409 smp_mb();
410
411 /*
412 * Timer deadline passing is the event on which we will be woken via
413 * cpuidle_mwait_wakeup. So check it now that the location is armed.
414 */
415 if ( (expires > NOW() || expires == 0) && !softirq_pending(cpu) )
416 {
417 cpumask_set_cpu(cpu, &cpuidle_mwait_flags);
418 __mwait(eax, ecx);
419 cpumask_clear_cpu(cpu, &cpuidle_mwait_flags);
420 }
421
422 if ( expires <= NOW() && expires > 0 )
423 raise_softirq(TIMER_SOFTIRQ);
424 }
425
426 static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
427 {
428 mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
429 }
430
431 static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
432 {
433 switch ( cx->entry_method )
434 {
435 case ACPI_CSTATE_EM_FFH:
436 /* Call into architectural FFH based C-state */
437 acpi_processor_ffh_cstate_enter(cx);
438 return;
439 case ACPI_CSTATE_EM_SYSIO:
440 /* IO port based C-state */
441 inb(cx->address);
442 /* Dummy wait op - must do something useless after P_LVL2 read
443 because chipsets cannot guarantee that STPCLK# signal
444 gets asserted in time to freeze execution properly. */
445 inl(pmtmr_ioport);
446 return;
447 case ACPI_CSTATE_EM_HALT:
448 safe_halt();
449 local_irq_disable();
450 return;
451 }
452 }
453
454 static int acpi_idle_bm_check(void)
455 {
456 u32 bm_status = 0;
457
458 acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
459 if ( bm_status )
460 acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
461 /*
462 * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
463 * the true state of bus mastering activity; forcing us to
464 * manually check the BMIDEA bit of each IDE channel.
465 */
466 return bm_status;
467 }
468
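/*
 * Tracks how many CPUs are currently inside C3 so that bus master
 * arbitration is only disabled once every online CPU has entered C3, and
 * re-enabled as soon as the first one leaves (see acpi_processor_idle()).
 */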
469 static struct {
470 spinlock_t lock;
471 unsigned int count;
472 } c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };
473
474 void trace_exit_reason(u32 *irq_traced)
475 {
476 if ( unlikely(tb_init_done) )
477 {
478 int i, curbit;
479 u32 irr_status[8] = { 0 };
480
481 /* Get local apic IRR register */
482 for ( i = 0; i < 8; i++ )
483 irr_status[i] = apic_read(APIC_IRR + (i << 4));
484 i = 0;
485 curbit = find_first_bit((const unsigned long *)irr_status, 256);
486 while ( i < 4 && curbit < 256 )
487 {
488 irq_traced[i++] = curbit;
489 curbit = find_next_bit((const unsigned long *)irr_status, 256, curbit + 1);
490 }
491 }
492 }
493
494 /*
495 * "AAJ72. EOI Transaction May Not be Sent if Software Enters Core C6 During
496 * an Interrupt Service Routine"
497 *
498 * On some Core i7 processors an EOI transaction may not be sent if
499 * software enters core C6 during an interrupt service routine. So we
500 * don't enter a deep Cx state if there is an EOI pending.
501 */
502 static bool errata_c6_eoi_workaround(void)
503 {
504 static int8_t fix_needed = -1;
505
506 if ( unlikely(fix_needed == -1) )
507 {
508 int model = boot_cpu_data.x86_model;
509 fix_needed = (cpu_has_apic && !directed_eoi_enabled &&
510 (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
511 (boot_cpu_data.x86 == 6) &&
512 ((model == 0x1a) || (model == 0x1e) || (model == 0x1f) ||
513 (model == 0x25) || (model == 0x2c) || (model == 0x2f)));
514 }
515
516 return (fix_needed && cpu_has_pending_apic_eoi());
517 }
518
519 void update_last_cx_stat(struct acpi_processor_power *power,
520 struct acpi_processor_cx *cx, uint64_t ticks)
521 {
522 ASSERT(!local_irq_is_enabled());
523
524 spin_lock(&power->stat_lock);
525 power->last_state = cx;
526 power->last_state_update_tick = ticks;
527 spin_unlock(&power->stat_lock);
528 }
529
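/*
 * Account a completed idle period: 'before'/'after' are cpuidle_get_tick()
 * values.  last_residency is stored in microseconds (tick_to_ns() / 1000)
 * for use by the idle governor.
 */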
530 void update_idle_stats(struct acpi_processor_power *power,
531 struct acpi_processor_cx *cx,
532 uint64_t before, uint64_t after)
533 {
534 int64_t sleep_ticks = ticks_elapsed(before, after);
535 /* Interrupts are disabled */
536
537 spin_lock(&power->stat_lock);
538
539 cx->usage++;
540 if ( sleep_ticks > 0 )
541 {
542 power->last_residency = tick_to_ns(sleep_ticks) / 1000UL;
543 cx->time += sleep_ticks;
544 }
545 power->last_state = &power->states[0];
546 power->last_state_update_tick = after;
547
548 spin_unlock(&power->stat_lock);
549 }
550
551 static void acpi_processor_idle(void)
552 {
553 struct acpi_processor_power *power = processor_powers[smp_processor_id()];
554 struct acpi_processor_cx *cx = NULL;
555 int next_state;
556 uint64_t t1, t2 = 0;
557 u32 exp = 0, pred = 0;
558 u32 irq_traced[4] = { 0 };
559
560 if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
561 (next_state = cpuidle_current_governor->select(power)) > 0 )
562 {
563 cx = &power->states[next_state];
564 if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
565 acpi_idle_bm_check() )
566 cx = power->safe_state;
567 if ( cx->idx > max_cstate )
568 cx = &power->states[max_cstate];
569 menu_get_trace_data(&exp, &pred);
570 }
571 if ( !cx )
572 {
573 if ( pm_idle_save )
574 pm_idle_save();
575 else
576 safe_halt();
577 return;
578 }
579
580 cpufreq_dbs_timer_suspend();
581
582 sched_tick_suspend();
583 /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
584 process_pending_softirqs();
585
586 /*
587 * Interrupts must be disabled during bus mastering calculations and
588 * for C2/C3 transitions.
589 */
590 local_irq_disable();
591
592 if ( !cpu_is_haltable(smp_processor_id()) )
593 {
594 local_irq_enable();
595 sched_tick_resume();
596 cpufreq_dbs_timer_resume();
597 return;
598 }
599
600 if ( (cx->type == ACPI_STATE_C3) && errata_c6_eoi_workaround() )
601 cx = power->safe_state;
602
603
604 /*
605 * Sleep:
606 * ------
607 * Invoke the current Cx state to put the processor to sleep.
608 */
609 switch ( cx->type )
610 {
611 case ACPI_STATE_C1:
612 case ACPI_STATE_C2:
613 if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
614 {
615 /* Get start time (ticks) */
616 t1 = cpuidle_get_tick();
617 /* Trace cpu idle entry */
618 TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
619
620 update_last_cx_stat(power, cx, t1);
621
622 /* Invoke C2 */
623 acpi_idle_do_entry(cx);
624 /* Get end time (ticks) */
625 t2 = cpuidle_get_tick();
626 trace_exit_reason(irq_traced);
627 /* Trace cpu idle exit */
628 TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
629 irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
630 /* Update statistics */
631 update_idle_stats(power, cx, t1, t2);
632 /* Re-enable interrupts */
633 local_irq_enable();
634 break;
635 }
636
637 case ACPI_STATE_C3:
638 /*
639 * Before invoking C3, be aware that the TSC/APIC timer may be
640 * stopped by H/W. Without careful handling of TSC/APIC stop issues,
641 * deep C-states can't work correctly.
642 */
643 /* preparing APIC stop */
644 lapic_timer_off();
645
646 /* Get start time (ticks) */
647 t1 = cpuidle_get_tick();
648 /* Trace cpu idle entry */
649 TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);
650
651 update_last_cx_stat(power, cx, t1);
652
653 /*
654 * disable bus master
655 * bm_check implies we need ARB_DIS
656 * !bm_check implies we need cache flush
657 * bm_control determines whether we can do ARB_DIS
658 *
659 * That leaves a case where bm_check is set and bm_control is
660 * not set. In that case we cannot do much, we enter C3
661 * without doing anything.
662 */
663 if ( cx->type != ACPI_STATE_C3 )
664 /* nothing to be done here */;
665 else if ( power->flags.bm_check && power->flags.bm_control )
666 {
667 spin_lock(&c3_cpu_status.lock);
668 if ( ++c3_cpu_status.count == num_online_cpus() )
669 {
670 /*
671 * All CPUs are trying to go to C3
672 * Disable bus master arbitration
673 */
674 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
675 }
676 spin_unlock(&c3_cpu_status.lock);
677 }
678 else if ( !power->flags.bm_check )
679 {
680 /* SMP with no shared cache... Invalidate cache */
681 ACPI_FLUSH_CPU_CACHE();
682 }
683
684 /* Invoke C3 */
685 acpi_idle_do_entry(cx);
686
687 if ( (cx->type == ACPI_STATE_C3) &&
688 power->flags.bm_check && power->flags.bm_control )
689 {
690 /* Enable bus master arbitration */
691 spin_lock(&c3_cpu_status.lock);
692 if ( c3_cpu_status.count-- == num_online_cpus() )
693 acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
694 spin_unlock(&c3_cpu_status.lock);
695 }
696
697 /* Get end time (ticks) */
698 t2 = cpuidle_get_tick();
699
700 /* recovering TSC */
701 cstate_restore_tsc();
702 trace_exit_reason(irq_traced);
703 /* Trace cpu idle exit */
704 TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
705 irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
706
707 /* Update statistics */
708 update_idle_stats(power, cx, t1, t2);
709 /* Re-enable interrupts */
710 local_irq_enable();
711 /* recovering APIC */
712 lapic_timer_on();
713
714 break;
715
716 default:
717 /* Now in C0 */
718 power->last_state = &power->states[0];
719 local_irq_enable();
720 sched_tick_resume();
721 cpufreq_dbs_timer_resume();
722 return;
723 }
724
725 /* Now in C0 */
726 power->last_state = &power->states[0];
727
728 sched_tick_resume();
729 cpufreq_dbs_timer_resume();
730
731 if ( cpuidle_current_governor->reflect )
732 cpuidle_current_governor->reflect(power);
733 }
734
735 void acpi_dead_idle(void)
736 {
737 struct acpi_processor_power *power;
738 struct acpi_processor_cx *cx;
739
740 if ( (power = processor_powers[smp_processor_id()]) == NULL )
741 goto default_halt;
742
743 if ( (cx = &power->states[power->count-1]) == NULL )
744 goto default_halt;
745
746 if ( cx->entry_method == ACPI_CSTATE_EM_FFH )
747 {
748 void *mwait_ptr = &mwait_wakeup(smp_processor_id());
749
750 /*
751 * Cache must be flushed as the last operation before sleeping.
752 * Otherwise, CPU may still hold dirty data, breaking cache coherency,
753 * leading to strange errors.
754 */
755 wbinvd();
756
757 while ( 1 )
758 {
759 /*
760 * 1. The CLFLUSH is a workaround for erratum AAI65 for
761 * the Xeon 7400 series.
762 * 2. The WBINVD is insufficient due to the spurious-wakeup
763 * case where we return around the loop.
764 * 3. Unlike wbinvd, clflush is a lightweight but non-serializing
765 * instruction, hence a memory fence is necessary to make sure all
766 * loads/stores are visible before flushing the cache line.
767 */
768 mb();
769 clflush(mwait_ptr);
770 __monitor(mwait_ptr, 0, 0);
771 mb();
772 __mwait(cx->address, 0);
773 }
774 }
775 else if ( current_cpu_data.x86_vendor == X86_VENDOR_AMD &&
776 cx->entry_method == ACPI_CSTATE_EM_SYSIO )
777 {
778 /* Intel prefers not to use SYSIO */
779
780 /* Avoid references to shared data after the cache flush */
781 u32 address = cx->address;
782 u32 pmtmr_ioport_local = pmtmr_ioport;
783
784 wbinvd();
785
786 while ( 1 )
787 {
788 inb(address);
789 inl(pmtmr_ioport_local);
790 }
791 }
792
793 default_halt:
794 default_dead_idle();
795 }
796
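/*
 * Allocate and minimally initialise a CPU's power structure: statistics use
 * the ACPI PM timer unless the TSC is invariant (decided once, on CPU0), and
 * the state table starts out with just C1/HALT until fuller ACPI Cx data is
 * installed via set_cx_pminfo().
 */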
797 int cpuidle_init_cpu(unsigned int cpu)
798 {
799 struct acpi_processor_power *acpi_power;
800
801 acpi_power = processor_powers[cpu];
802 if ( !acpi_power )
803 {
804 unsigned int i;
805
806 if ( cpu == 0 && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
807 {
808 cpuidle_get_tick = get_stime_tick;
809 ticks_elapsed = stime_ticks_elapsed;
810 tick_to_ns = stime_tick_to_ns;
811 }
812
813 acpi_power = xzalloc(struct acpi_processor_power);
814 if ( !acpi_power )
815 return -ENOMEM;
816
817 for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
818 acpi_power->states[i].idx = i;
819
820 acpi_power->cpu = cpu;
821 processor_powers[cpu] = acpi_power;
822 }
823
824 acpi_power->count = 2;
825 acpi_power->states[1].type = ACPI_STATE_C1;
826 acpi_power->states[1].entry_method = ACPI_CSTATE_EM_HALT;
827 acpi_power->safe_state = &acpi_power->states[1];
828 spin_lock_init(&acpi_power->stat_lock);
829
830 return 0;
831 }
832
833 static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
834 {
835 struct cpuinfo_x86 *c = &current_cpu_data;
836 unsigned int eax, ebx, ecx, edx;
837 unsigned int edx_part;
838 unsigned int cstate_type; /* C-state type and not ACPI C-state type */
839 unsigned int num_cstate_subtype;
840 int ret = 0;
841 static unsigned long printed;
842
843 if ( c->cpuid_level < CPUID_MWAIT_LEAF )
844 {
845 printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
846 return -EFAULT;
847 }
848
849 cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
850 if ( opt_cpu_info )
851 printk(XENLOG_DEBUG "cpuid.MWAIT[eax=%x ebx=%x ecx=%x edx=%x]\n",
852 eax, ebx, ecx, edx);
853
854 /* Check whether this particular cx_type (in CST) is supported or not */
855 cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
856 edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
857 num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;
858
859 if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
860 ret = -ERANGE;
861 /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
862 else if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
863 !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
864 ret = -ENODEV;
865 else if ( opt_cpu_info || cx->type >= BITS_PER_LONG ||
866 !test_and_set_bit(cx->type, &printed) )
867 printk(XENLOG_INFO "Monitor-Mwait will be used to enter C%d state\n",
868 cx->type);
869 return ret;
870 }
871
872 /*
873 * Initialize bm_flags based on the CPU cache properties
874 * On SMP it depends on cache configuration
875 * - When cache is not shared among all CPUs, we flush cache
876 * before entering C3.
877 * - When cache is shared among all CPUs, we use bm_check
878 * mechanism as in UP case
879 *
880 * This routine is called only after all the CPUs are online
881 */
882 static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags)
883 {
884 struct cpuinfo_x86 *c = &current_cpu_data;
885
886 flags->bm_check = 0;
887 if ( num_online_cpus() == 1 )
888 flags->bm_check = 1;
889 else if ( (c->x86_vendor == X86_VENDOR_INTEL) ||
890 ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 0x15)) )
891 {
892 /*
893 * Today all MP CPUs that support C3 share cache.
894 * And caches should not be flushed by software while
895 * entering C3 type state.
896 */
897 flags->bm_check = 1;
898 }
899
900 /*
901 * On all recent platforms, ARB_DISABLE is a nop.
902 * So, set bm_control to zero to indicate that ARB_DISABLE
903 * is not required while entering C3 type state on
904 * P4, Core and beyond CPUs
905 */
906 if ( c->x86_vendor == X86_VENDOR_INTEL &&
907 (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)) )
908 flags->bm_control = 0;
909 }
910
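/*
 * For a FIXED_HARDWARE (FFH) _CST entry, bit_width identifies the vendor and
 * bit_offset the class of native C-state; the values below are the ones
 * check_cx()/set_cx() accept for Intel MWAIT-based states beyond plain HLT.
 */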
911 #define VENDOR_INTEL (1)
912 #define NATIVE_CSTATE_BEYOND_HALT (2)
913
914 static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
915 {
916 static int bm_check_flag = -1;
917 static int bm_control_flag = -1;
918
919 switch ( cx->reg.space_id )
920 {
921 case ACPI_ADR_SPACE_SYSTEM_IO:
922 if ( cx->reg.address == 0 )
923 return -EINVAL;
924 break;
925
926 case ACPI_ADR_SPACE_FIXED_HARDWARE:
927 if ( cx->reg.bit_width != VENDOR_INTEL ||
928 cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
929 return -EINVAL;
930
931 /* assume all logical CPUs have the same support for mwait */
932 if ( acpi_processor_ffh_cstate_probe(cx) )
933 return -EINVAL;
934 break;
935
936 default:
937 return -ENODEV;
938 }
939
940 switch ( cx->type )
941 {
942 case ACPI_STATE_C2:
943 if ( local_apic_timer_c2_ok )
944 break;
945 case ACPI_STATE_C3:
946 if ( !lapic_timer_init() )
947 return -EINVAL;
948
949 /* All the logic here assumes flags.bm_check is same across all CPUs */
950 if ( bm_check_flag < 0 )
951 {
952 /* Determine whether bm_check is needed based on CPU */
953 acpi_processor_power_init_bm_check(&(power->flags));
954 }
955 else
956 {
957 power->flags.bm_check = bm_check_flag;
958 power->flags.bm_control = bm_control_flag;
959 }
960
961 if ( power->flags.bm_check )
962 {
963 if ( !power->flags.bm_control )
964 {
965 if ( power->flags.has_cst != 1 )
966 {
967 /* bus mastering control is necessary */
968 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
969 "C3 support requires BM control\n"));
970 return -EINVAL;
971 }
972 else
973 {
974 /* Here we enter C3 without bus mastering */
975 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
976 "C3 support without BM control\n"));
977 }
978 }
979 /*
980 * On older chipsets, BM_RLD needs to be set in order for Bus
981 * Master activity to wake the system from C3, hence
982 * acpi_set_register() is always being called once below. Newer
983 * chipsets handle DMA during C3 automatically and BM_RLD is a
984 * NOP. In either case, the proper way to handle BM_RLD is to
985 * set it and leave it set.
986 */
987 }
988 else
989 {
990 /*
991 * WBINVD should be set in the FADT for C3 state to be
992 * supported when bm_check is not required.
993 */
994 if ( !(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD) )
995 {
996 ACPI_DEBUG_PRINT((ACPI_DB_INFO,
997 "Cache invalidation should work properly"
998 " for C3 to be enabled on SMP systems\n"));
999 return -EINVAL;
1000 }
1001 }
1002
1003 if ( bm_check_flag < 0 )
1004 {
1005 bm_check_flag = power->flags.bm_check;
1006 bm_control_flag = power->flags.bm_control;
1007 acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, bm_check_flag);
1008 }
1009
1010 break;
1011 }
1012
1013 return 0;
1014 }
1015
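/*
 * set_cx() derives each state's target_residency from its exit latency,
 * scaled by this factor (tunable via the "idle_latency_factor" boot
 * parameter); the idle governor uses it as the minimum predicted idle time
 * for which the state is worth entering.
 */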
1016 static unsigned int latency_factor = 2;
1017 integer_param("idle_latency_factor", latency_factor);
1018
1019 static void set_cx(
1020 struct acpi_processor_power *acpi_power,
1021 xen_processor_cx_t *xen_cx)
1022 {
1023 struct acpi_processor_cx *cx;
1024
1025 if ( check_cx(acpi_power, xen_cx) != 0 )
1026 return;
1027
1028 switch ( xen_cx->type )
1029 {
1030 case ACPI_STATE_C1:
1031 cx = &acpi_power->states[1];
1032 break;
1033 default:
1034 if ( acpi_power->count >= ACPI_PROCESSOR_MAX_POWER )
1035 {
1036 case ACPI_STATE_C0:
1037 printk(XENLOG_WARNING "CPU%u: C%d data ignored\n",
1038 acpi_power->cpu, xen_cx->type);
1039 return;
1040 }
1041 cx = &acpi_power->states[acpi_power->count];
1042 cx->type = xen_cx->type;
1043 break;
1044 }
1045
1046 cx->address = xen_cx->reg.address;
1047
1048 switch ( xen_cx->reg.space_id )
1049 {
1050 case ACPI_ADR_SPACE_FIXED_HARDWARE:
1051 if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
1052 xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT &&
1053 boot_cpu_has(X86_FEATURE_MONITOR) )
1054 cx->entry_method = ACPI_CSTATE_EM_FFH;
1055 else
1056 cx->entry_method = ACPI_CSTATE_EM_HALT;
1057 break;
1058 case ACPI_ADR_SPACE_SYSTEM_IO:
1059 if ( ioports_deny_access(hardware_domain, cx->address, cx->address) )
1060 printk(XENLOG_WARNING "Could not deny access to port %04x\n",
1061 cx->address);
1062 cx->entry_method = ACPI_CSTATE_EM_SYSIO;
1063 break;
1064 default:
1065 cx->entry_method = ACPI_CSTATE_EM_NONE;
1066 break;
1067 }
1068
1069 cx->latency = xen_cx->latency;
1070 cx->target_residency = cx->latency * latency_factor;
1071
1072 smp_wmb();
1073 acpi_power->count += (cx->type != ACPI_STATE_C1);
1074 if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
1075 acpi_power->safe_state = cx;
1076 }
1077
1078 int get_cpu_id(u32 acpi_id)
1079 {
1080 int i;
1081 u32 apic_id;
1082
1083 if ( acpi_id >= MAX_MADT_ENTRIES )
1084 return -1;
1085
1086 apic_id = x86_acpiid_to_apicid[acpi_id];
1087 if ( apic_id == BAD_APICID )
1088 return -1;
1089
1090 for ( i = 0; i < nr_cpu_ids; i++ )
1091 {
1092 if ( apic_id == x86_cpu_to_apicid[i] )
1093 return i;
1094 }
1095
1096 return -1;
1097 }
1098
1099 #ifdef DEBUG_PM_CX
1100 static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
1101 {
1102 XEN_GUEST_HANDLE(xen_processor_cx_t) states;
1103 xen_processor_cx_t state;
1104 XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
1105 xen_processor_csd_t dp;
1106 uint32_t i;
1107
1108 printk("cpu%d cx acpi info:\n", cpu);
1109 printk("\tcount = %d\n", power->count);
1110 printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
1111 "\t pwr_setup_done[%d], bm_rld_set[%d]\n",
1112 power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
1113 power->flags.power_setup_done, power->flags.bm_rld_set);
1114
1115 states = power->states;
1116
1117 for ( i = 0; i < power->count; i++ )
1118 {
1119 if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) )
1120 return;
1121
1122 printk("\tstates[%d]:\n", i);
1123 printk("\t\treg.space_id = %#x\n", state.reg.space_id);
1124 printk("\t\treg.bit_width = %#x\n", state.reg.bit_width);
1125 printk("\t\treg.bit_offset = %#x\n", state.reg.bit_offset);
1126 printk("\t\treg.access_size = %#x\n", state.reg.access_size);
1127 printk("\t\treg.address = %#"PRIx64"\n", state.reg.address);
1128 printk("\t\ttype = %d\n", state.type);
1129 printk("\t\tlatency = %d\n", state.latency);
1130 printk("\t\tpower = %d\n", state.power);
1131
1132 csd = state.dp;
1133 printk("\t\tdp(@0x%p)\n", csd.p);
1134
1135 if ( csd.p != NULL )
1136 {
1137 if ( unlikely(copy_from_guest(&dp, csd, 1)) )
1138 return;
1139 printk("\t\t\tdomain = %d\n", dp.domain);
1140 printk("\t\t\tcoord_type = %d\n", dp.coord_type);
1141 printk("\t\t\tnum = %d\n", dp.num);
1142 }
1143 }
1144 }
1145 #else
1146 #define print_cx_pminfo(c, p)
1147 #endif
1148
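/*
 * Install the ACPI Cx data uploaded for one processor (identified by its
 * ACPI ID, mapped to a Xen CPU number via get_cpu_id()).  Once CPU0's data
 * is in place, the idle loop is switched to acpi_processor_idle() and the
 * dead-idle handler to acpi_dead_idle().
 */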
1149 long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
1150 {
1151 XEN_GUEST_HANDLE(xen_processor_cx_t) states;
1152 xen_processor_cx_t xen_cx;
1153 struct acpi_processor_power *acpi_power;
1154 int cpu_id, i, ret;
1155
1156 if ( unlikely(!guest_handle_okay(power->states, power->count)) )
1157 return -EFAULT;
1158
1159 if ( pm_idle_save && pm_idle != acpi_processor_idle )
1160 return 0;
1161
1162 print_cx_pminfo(cpu, power);
1163
1164 /* map from acpi_id to cpu_id */
1165 cpu_id = get_cpu_id(cpu);
1166 if ( cpu_id == -1 )
1167 {
1168 static bool warn_once = true;
1169
1170 if ( warn_once || opt_cpu_info )
1171 printk(XENLOG_WARNING "No CPU ID for APIC ID %#x\n", cpu);
1172 warn_once = false;
1173 return -EINVAL;
1174 }
1175
1176 ret = cpuidle_init_cpu(cpu_id);
1177 if ( ret < 0 )
1178 return ret;
1179
1180 acpi_power = processor_powers[cpu_id];
1181 acpi_power->flags.bm_check = power->flags.bm_check;
1182 acpi_power->flags.bm_control = power->flags.bm_control;
1183 acpi_power->flags.has_cst = power->flags.has_cst;
1184
1185 states = power->states;
1186 for ( i = 0; i < power->count; i++ )
1187 {
1188 if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) )
1189 return -EFAULT;
1190
1191 set_cx(acpi_power, &xen_cx);
1192 }
1193
1194 if ( cpuidle_current_governor->enable &&
1195 cpuidle_current_governor->enable(acpi_power) )
1196 return -EFAULT;
1197
1198 /* FIXME: C-state dependency is not supported so far */
1199
1200 if ( cpu_id == 0 )
1201 {
1202 if ( pm_idle_save == NULL )
1203 {
1204 pm_idle_save = pm_idle;
1205 pm_idle = acpi_processor_idle;
1206 }
1207
1208 dead_idle = acpi_dead_idle;
1209 }
1210
1211 return 0;
1212 }
1213
1214 uint32_t pmstat_get_cx_nr(uint32_t cpuid)
1215 {
1216 return processor_powers[cpuid] ? processor_powers[cpuid]->count : 0;
1217 }
1218
1219 int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
1220 {
1221 struct acpi_processor_power *power = processor_powers[cpuid];
1222 uint64_t idle_usage = 0, idle_res = 0;
1223 uint64_t last_state_update_tick, current_stime, current_tick;
1224 uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
1225 uint64_t res[ACPI_PROCESSOR_MAX_POWER] = { 0 };
1226 unsigned int i, nr, nr_pc = 0, nr_cc = 0;
1227
1228 if ( power == NULL )
1229 {
1230 stat->last = 0;
1231 stat->nr = 0;
1232 stat->idle_time = 0;
1233 stat->nr_pc = 0;
1234 stat->nr_cc = 0;
1235 return 0;
1236 }
1237
1238 stat->idle_time = get_cpu_idle_time(cpuid);
1239 nr = min(stat->nr, power->count);
1240
1241 /* mimic the stat when detail info hasn't been registered by dom0 */
1242 if ( pm_idle_save == NULL )
1243 {
1244 stat->nr = 2;
1245 stat->last = power->last_state ? power->last_state->idx : 0;
1246
1247 usage[1] = idle_usage = 1;
1248 res[1] = idle_res = stat->idle_time;
1249
1250 current_stime = NOW();
1251 }
1252 else
1253 {
1254 struct hw_residencies hw_res;
1255 signed int last_state_idx;
1256
1257 stat->nr = power->count;
1258
1259 spin_lock_irq(&power->stat_lock);
1260 current_tick = cpuidle_get_tick();
1261 current_stime = NOW();
1262 for ( i = 1; i < nr; i++ )
1263 {
1264 usage[i] = power->states[i].usage;
1265 res[i] = power->states[i].time;
1266 }
1267 last_state_update_tick = power->last_state_update_tick;
1268 last_state_idx = power->last_state ? power->last_state->idx : -1;
1269 spin_unlock_irq(&power->stat_lock);
1270
1271 if ( last_state_idx >= 0 )
1272 {
1273 usage[last_state_idx]++;
1274 res[last_state_idx] += ticks_elapsed(last_state_update_tick,
1275 current_tick);
1276 stat->last = last_state_idx;
1277 }
1278 else
1279 stat->last = 0;
1280
1281 for ( i = 1; i < nr; i++ )
1282 {
1283 res[i] = tick_to_ns(res[i]);
1284 idle_usage += usage[i];
1285 idle_res += res[i];
1286 }
1287
1288 get_hw_residencies(cpuid, &hw_res);
1289
1290 #define PUT_xC(what, n) do { \
1291 if ( stat->nr_##what >= n && \
1292 copy_to_guest_offset(stat->what, n - 1, &hw_res.what##n, 1) ) \
1293 return -EFAULT; \
1294 if ( hw_res.what##n ) \
1295 nr_##what = n; \
1296 } while ( 0 )
1297 #define PUT_PC(n) PUT_xC(pc, n)
1298 PUT_PC(2);
1299 PUT_PC(3);
1300 PUT_PC(4);
1301 PUT_PC(6);
1302 PUT_PC(7);
1303 PUT_PC(8);
1304 PUT_PC(9);
1305 PUT_PC(10);
1306 #undef PUT_PC
1307 #define PUT_CC(n) PUT_xC(cc, n)
1308 PUT_CC(1);
1309 PUT_CC(3);
1310 PUT_CC(6);
1311 PUT_CC(7);
1312 #undef PUT_CC
1313 #undef PUT_xC
1314 }
1315
1316 usage[0] += idle_usage;
1317 res[0] = current_stime - idle_res;
1318
1319 if ( copy_to_guest(stat->triggers, usage, nr) ||
1320 copy_to_guest(stat->residencies, res, nr) )
1321 return -EFAULT;
1322
1323 stat->nr_pc = nr_pc;
1324 stat->nr_cc = nr_cc;
1325
1326 return 0;
1327 }
1328
1329 int pmstat_reset_cx_stat(uint32_t cpuid)
1330 {
1331 return 0;
1332 }
1333
1334 void cpuidle_disable_deep_cstate(void)
1335 {
1336 if ( max_cstate > 1 )
1337 {
1338 if ( local_apic_timer_c2_ok )
1339 max_cstate = 2;
1340 else
1341 max_cstate = 1;
1342 }
1343
1344 mb();
1345
1346 hpet_disable_legacy_broadcast();
1347 }
1348
1349 bool cpuidle_using_deep_cstate(void)
1350 {
1351 return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? 2 : 1);
1352 }
1353
1354 static int cpu_callback(
1355 struct notifier_block *nfb, unsigned long action, void *hcpu)
1356 {
1357 unsigned int cpu = (unsigned long)hcpu;
1358
1359 /* Only hook on CPU_ONLINE because a dead cpu may utilize the info
1360 * to enter deep C-state */
1361 switch ( action )
1362 {
1363 case CPU_ONLINE:
1364 (void)cpuidle_init_cpu(cpu);
1365 break;
1366 default:
1367 break;
1368 }
1369
1370 return NOTIFY_DONE;
1371 }
1372
1373 static struct notifier_block cpu_nfb = {
1374 .notifier_call = cpu_callback
1375 };
1376
1377 static int __init cpuidle_presmp_init(void)
1378 {
1379 void *cpu = (void *)(long)smp_processor_id();
1380
1381 if ( !xen_cpuidle )
1382 return 0;
1383
1384 mwait_idle_init(&cpu_nfb);
1385 cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu);
1386 register_cpu_notifier(&cpu_nfb);
1387 return 0;
1388 }
1389 presmp_initcall(cpuidle_presmp_init);
1390
1391