/*
 * cpu_idle - xen idle state module derived from Linux
 *            drivers/acpi/processor_idle.c &
 *            arch/x86/kernel/acpi/cstate.c
 *
 *  Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com>
 *  Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com>
 *  Copyright (C) 2004, 2005 Dominik Brodowski <linux@brodo.de>
 *  Copyright (C) 2004  Anil S Keshavamurthy <anil.s.keshavamurthy@intel.com>
 *                      - Added processor hotplug support
 *  Copyright (C) 2005  Venkatesh Pallipadi <venkatesh.pallipadi@intel.com>
 *                      - Added support for C3 on SMP
 *  Copyright (C) 2007, 2008 Intel Corporation
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *  This program is free software; you can redistribute it and/or modify
 *  it under the terms of the GNU General Public License as published by
 *  the Free Software Foundation; either version 2 of the License, or (at
 *  your option) any later version.
 *
 *  This program is distributed in the hope that it will be useful, but
 *  WITHOUT ANY WARRANTY; without even the implied warranty of
 *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 *  General Public License for more details.
 *
 *  You should have received a copy of the GNU General Public License along
 *  with this program; If not, see <http://www.gnu.org/licenses/>.
 *
 * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 */

#include <xen/errno.h>
#include <xen/lib.h>
#include <xen/types.h>
#include <xen/acpi.h>
#include <xen/smp.h>
#include <xen/guest_access.h>
#include <xen/keyhandler.h>
#include <xen/trace.h>
#include <xen/sched-if.h>
#include <xen/irq.h>
#include <asm/cache.h>
#include <asm/io.h>
#include <asm/iocap.h>
#include <asm/hpet.h>
#include <asm/processor.h>
#include <xen/pmstat.h>
#include <xen/softirq.h>
#include <public/platform.h>
#include <public/sysctl.h>
#include <acpi/cpufreq/cpufreq.h>
#include <asm/apic.h>
#include <asm/cpuidle.h>
#include <asm/mwait.h>
#include <xen/notifier.h>
#include <xen/cpu.h>

/*#define DEBUG_PM_CX*/

#define GET_HW_RES_IN_NS(msr, val) \
    do { rdmsrl(msr, val); val = tsc_ticks2ns(val); } while( 0 )
#define GET_MC6_RES(val)  GET_HW_RES_IN_NS(0x664, val)
#define GET_PC2_RES(val)  GET_HW_RES_IN_NS(0x60D, val) /* SNB onwards */
#define GET_PC3_RES(val)  GET_HW_RES_IN_NS(0x3F8, val)
#define GET_PC6_RES(val)  GET_HW_RES_IN_NS(0x3F9, val)
#define GET_PC7_RES(val)  GET_HW_RES_IN_NS(0x3FA, val)
#define GET_PC8_RES(val)  GET_HW_RES_IN_NS(0x630, val) /* some Haswells only */
#define GET_PC9_RES(val)  GET_HW_RES_IN_NS(0x631, val) /* some Haswells only */
#define GET_PC10_RES(val) GET_HW_RES_IN_NS(0x632, val) /* some Haswells only */
#define GET_CC1_RES(val)  GET_HW_RES_IN_NS(0x660, val) /* Silvermont only */
#define GET_CC3_RES(val)  GET_HW_RES_IN_NS(0x3FC, val)
#define GET_CC6_RES(val)  GET_HW_RES_IN_NS(0x3FD, val)
#define GET_CC7_RES(val)  GET_HW_RES_IN_NS(0x3FE, val) /* SNB onwards */
#define PHI_CC6_RES(val)  GET_HW_RES_IN_NS(0x3FF, val) /* Xeon Phi only */

static void lapic_timer_nop(void) { }
void (*__read_mostly lapic_timer_off)(void);
void (*__read_mostly lapic_timer_on)(void);

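/*
 * Select how the local APIC timer is covered while in deep C-states:
 * nothing when the APIC timer never stops (ARAT), otherwise HPET or PIT
 * broadcast.  Returns false if no suitable mechanism is available.
 */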
bool lapic_timer_init(void)
{
    if ( boot_cpu_has(X86_FEATURE_ARAT) )
    {
        lapic_timer_off = lapic_timer_nop;
        lapic_timer_on = lapic_timer_nop;
    }
    else if ( hpet_broadcast_is_available() )
    {
        lapic_timer_off = hpet_broadcast_enter;
        lapic_timer_on = hpet_broadcast_exit;
    }
    else if ( pit_broadcast_is_available() )
    {
        lapic_timer_off = pit_broadcast_enter;
        lapic_timer_on = pit_broadcast_exit;
    }
    else
        return false;

    return true;
}

static uint64_t (*__read_mostly tick_to_ns)(uint64_t) = acpi_pm_tick_to_ns;

void (*__read_mostly pm_idle_save)(void);
unsigned int max_cstate __read_mostly = ACPI_PROCESSOR_MAX_POWER - 1;
integer_param("max_cstate", max_cstate);
static bool __read_mostly local_apic_timer_c2_ok;
boolean_param("lapic_timer_c2_ok", local_apic_timer_c2_ok);

struct acpi_processor_power *__read_mostly processor_powers[NR_CPUS];

struct hw_residencies
{
    uint64_t mc0;
    uint64_t mc6;
    uint64_t pc2;
    uint64_t pc3;
    uint64_t pc4;
    uint64_t pc6;
    uint64_t pc7;
    uint64_t pc8;
    uint64_t pc9;
    uint64_t pc10;
    uint64_t cc1;
    uint64_t cc3;
    uint64_t cc6;
    uint64_t cc7;
};

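/*
 * Read the residency MSRs available on the current CPU model.  Must run
 * on the CPU whose counters are wanted; counters a model does not
 * implement are left at zero.
 */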
static void do_get_hw_residencies(void *arg)
{
    struct cpuinfo_x86 *c = &current_cpu_data;
    struct hw_residencies *hw_res = arg;

    if ( c->x86_vendor != X86_VENDOR_INTEL || c->x86 != 6 )
        return;

    switch ( c->x86_model )
    {
    /* 4th generation Intel Core (Haswell) */
    case 0x45:
        GET_PC8_RES(hw_res->pc8);
        GET_PC9_RES(hw_res->pc9);
        GET_PC10_RES(hw_res->pc10);
        /* fall through */
    /* Sandy Bridge */
    case 0x2A:
    case 0x2D:
    /* Ivy Bridge */
    case 0x3A:
    case 0x3E:
    /* Haswell */
    case 0x3C:
    case 0x3F:
    case 0x46:
    /* Broadwell */
    case 0x3D:
    case 0x47:
    case 0x4F:
    case 0x56:
    /* Skylake */
    case 0x4E:
    case 0x55:
    case 0x5E:
    /* Cannon Lake */
    case 0x66:
    /* Kaby Lake */
    case 0x8E:
    case 0x9E:
        GET_PC2_RES(hw_res->pc2);
        GET_CC7_RES(hw_res->cc7);
        /* fall through */
    /* Nehalem */
    case 0x1A:
    case 0x1E:
    case 0x1F:
    case 0x2E:
    /* Westmere */
    case 0x25:
    case 0x2C:
    case 0x2F:
        GET_PC3_RES(hw_res->pc3);
        GET_PC6_RES(hw_res->pc6);
        GET_PC7_RES(hw_res->pc7);
        GET_CC3_RES(hw_res->cc3);
        GET_CC6_RES(hw_res->cc6);
        break;
    /* Xeon Phi Knights Landing */
    case 0x57:
    /* Xeon Phi Knights Mill */
    case 0x85:
        GET_CC3_RES(hw_res->mc0); /* abusing GET_CC3_RES */
        GET_CC6_RES(hw_res->mc6); /* abusing GET_CC6_RES */
        GET_PC2_RES(hw_res->pc2);
        GET_PC3_RES(hw_res->pc3);
        GET_PC6_RES(hw_res->pc6);
        GET_PC7_RES(hw_res->pc7);
        PHI_CC6_RES(hw_res->cc6);
        break;
    /* various Atoms */
    case 0x27:
        GET_PC3_RES(hw_res->pc2); /* abusing GET_PC3_RES */
        GET_PC6_RES(hw_res->pc4); /* abusing GET_PC6_RES */
        GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
        break;
    /* Silvermont */
    case 0x37:
    case 0x4A:
    case 0x4D:
    case 0x5A:
    case 0x5D:
    /* Airmont */
    case 0x4C:
        GET_MC6_RES(hw_res->mc6);
        GET_PC7_RES(hw_res->pc6); /* abusing GET_PC7_RES */
        GET_CC1_RES(hw_res->cc1);
        GET_CC6_RES(hw_res->cc6);
        break;
    /* Goldmont */
    case 0x5C:
    case 0x5F:
    /* Goldmont Plus */
    case 0x7A:
        GET_PC2_RES(hw_res->pc2);
        GET_PC3_RES(hw_res->pc3);
        GET_PC6_RES(hw_res->pc6);
        GET_PC10_RES(hw_res->pc10);
        GET_CC1_RES(hw_res->cc1);
        GET_CC3_RES(hw_res->cc3);
        GET_CC6_RES(hw_res->cc6);
        break;
    }
}

static void get_hw_residencies(uint32_t cpu, struct hw_residencies *hw_res)
{
    memset(hw_res, 0, sizeof(*hw_res));

    if ( smp_processor_id() == cpu )
        do_get_hw_residencies(hw_res);
    else
        on_selected_cpus(cpumask_of(cpu), do_get_hw_residencies, hw_res, 1);
}

static void print_hw_residencies(uint32_t cpu)
{
    struct hw_residencies hw_res;

    get_hw_residencies(cpu, &hw_res);

    if ( hw_res.mc0 | hw_res.mc6 )
        printk("MC0[%"PRIu64"] MC6[%"PRIu64"]\n",
               hw_res.mc0, hw_res.mc6);
    printk("PC2[%"PRIu64"] PC%d[%"PRIu64"] PC6[%"PRIu64"] PC7[%"PRIu64"]\n",
           hw_res.pc2,
           hw_res.pc4 ? 4 : 3, hw_res.pc4 ?: hw_res.pc3,
           hw_res.pc6, hw_res.pc7);
    if ( hw_res.pc8 | hw_res.pc9 | hw_res.pc10 )
        printk("PC8[%"PRIu64"] PC9[%"PRIu64"] PC10[%"PRIu64"]\n",
               hw_res.pc8, hw_res.pc9, hw_res.pc10);
    printk("CC%d[%"PRIu64"] CC6[%"PRIu64"] CC7[%"PRIu64"]\n",
           hw_res.cc1 ? 1 : 3, hw_res.cc1 ?: hw_res.cc3,
           hw_res.cc6, hw_res.cc7);
}

static char* acpi_cstate_method_name[] =
{
    "NONE",
    "SYSIO",
    "FFH",
    "HALT"
};

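/*
 * Two tick sources are available for idle accounting: Xen system time
 * (usable when the TSC does not stop in deep C-states) and the ACPI PM
 * timer.  cpuidle_init_cpu() selects which set of helpers is used.
 */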
static uint64_t get_stime_tick(void) { return (uint64_t)NOW(); }
static uint64_t stime_ticks_elapsed(uint64_t t1, uint64_t t2) { return t2 - t1; }
static uint64_t stime_tick_to_ns(uint64_t ticks) { return ticks; }

static uint64_t get_acpi_pm_tick(void) { return (uint64_t)inl(pmtmr_ioport); }
static uint64_t acpi_pm_ticks_elapsed(uint64_t t1, uint64_t t2)
{
    if ( t2 >= t1 )
        return (t2 - t1);
    else if ( !(acpi_gbl_FADT.flags & ACPI_FADT_32BIT_TIMER) )
        return (((0x00FFFFFF - t1) + t2 + 1) & 0x00FFFFFF);
    else
        return ((0xFFFFFFFF - t1) + t2 + 1);
}

uint64_t (*__read_mostly cpuidle_get_tick)(void) = get_acpi_pm_tick;
static uint64_t (*__read_mostly ticks_elapsed)(uint64_t, uint64_t)
    = acpi_pm_ticks_elapsed;

static void print_acpi_power(uint32_t cpu, struct acpi_processor_power *power)
{
    uint64_t idle_res = 0, idle_usage = 0;
    uint64_t last_state_update_tick, current_tick, current_stime;
    uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
    uint64_t res_tick[ACPI_PROCESSOR_MAX_POWER] = { 0 };
    unsigned int i;
    signed int last_state_idx;

    printk("==cpu%d==\n", cpu);
    last_state_idx = power->last_state ? power->last_state->idx : -1;
    printk("active state:\t\tC%d\n", last_state_idx);
    printk("max_cstate:\t\tC%d\n", max_cstate);
    printk("states:\n");

    spin_lock_irq(&power->stat_lock);
    current_tick = cpuidle_get_tick();
    current_stime = NOW();
    for ( i = 1; i < power->count; i++ )
    {
        res_tick[i] = power->states[i].time;
        usage[i] = power->states[i].usage;
    }
    last_state_update_tick = power->last_state_update_tick;
    spin_unlock_irq(&power->stat_lock);

    if ( last_state_idx >= 0 )
    {
        res_tick[last_state_idx] += ticks_elapsed(last_state_update_tick,
                                                  current_tick);
        usage[last_state_idx]++;
    }

    for ( i = 1; i < power->count; i++ )
    {
        idle_usage += usage[i];
        idle_res += tick_to_ns(res_tick[i]);

        printk((last_state_idx == i) ? "   *" : "    ");
        printk("C%d:\t", i);
        printk("type[C%d] ", power->states[i].type);
        printk("latency[%03d] ", power->states[i].latency);
        printk("usage[%08"PRIu64"] ", usage[i]);
        printk("method[%5s] ", acpi_cstate_method_name[power->states[i].entry_method]);
        printk("duration[%"PRIu64"]\n", tick_to_ns(res_tick[i]));
    }
    printk((last_state_idx == 0) ? "   *" : "    ");
    printk("C0:\tusage[%08"PRIu64"] duration[%"PRIu64"]\n",
           usage[0] + idle_usage, current_stime - idle_res);

    print_hw_residencies(cpu);
}

static void dump_cx(unsigned char key)
{
    unsigned int cpu;

    printk("'%c' pressed -> printing ACPI Cx structures\n", key);
    for_each_online_cpu ( cpu )
        if ( processor_powers[cpu] )
            print_acpi_power(cpu, processor_powers[cpu]);
}

static int __init cpu_idle_key_init(void)
{
    register_keyhandler('c', dump_cx, "dump ACPI Cx structures", 1);
    return 0;
}
__initcall(cpu_idle_key_init);

/*
 * A CPU's bit is set iff it uses MONITOR/MWAIT to enter a C-state.  With
 * the bit set, the CPU can be woken from the C-state by a write to a
 * specific memory location, instead of by sending an IPI.
 */
static cpumask_t cpuidle_mwait_flags;

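/*
 * Wake those CPUs in @mask that are MWAITing on their wakeup flag, and
 * remove them from @mask so that the caller only sends IPIs to the rest.
 */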
void cpuidle_wakeup_mwait(cpumask_t *mask)
{
    cpumask_t target;
    unsigned int cpu;

    cpumask_and(&target, mask, &cpuidle_mwait_flags);

    /* CPU is MWAITing on the cpuidle_mwait_wakeup flag. */
    for_each_cpu(cpu, &target)
        mwait_wakeup(cpu) = 0;

    cpumask_andnot(mask, mask, &target);
}

bool arch_skip_send_event_check(unsigned int cpu)
{
    /*
     * This relies on softirq_pending() and mwait_wakeup() to access data
     * on the same cache line.
     */
    smp_mb();
    return !!cpumask_test_cpu(cpu, &cpuidle_mwait_flags);
}

void mwait_idle_with_hints(unsigned int eax, unsigned int ecx)
{
    unsigned int cpu = smp_processor_id();
    s_time_t expires = per_cpu(timer_deadline, cpu);

    if ( boot_cpu_has(X86_FEATURE_CLFLUSH_MONITOR) )
    {
        mb();
        clflush((void *)&mwait_wakeup(cpu));
        mb();
    }

    __monitor((void *)&mwait_wakeup(cpu), 0, 0);
    smp_mb();

    /*
     * Timer deadline passing is the event on which we will be woken via
     * cpuidle_mwait_wakeup. So check it now that the location is armed.
     */
    if ( (expires > NOW() || expires == 0) && !softirq_pending(cpu) )
    {
        cpumask_set_cpu(cpu, &cpuidle_mwait_flags);
        __mwait(eax, ecx);
        cpumask_clear_cpu(cpu, &cpuidle_mwait_flags);
    }

    if ( expires <= NOW() && expires > 0 )
        raise_softirq(TIMER_SOFTIRQ);
}

static void acpi_processor_ffh_cstate_enter(struct acpi_processor_cx *cx)
{
    mwait_idle_with_hints(cx->address, MWAIT_ECX_INTERRUPT_BREAK);
}

static void acpi_idle_do_entry(struct acpi_processor_cx *cx)
{
    switch ( cx->entry_method )
    {
    case ACPI_CSTATE_EM_FFH:
        /* Call into architectural FFH based C-state */
        acpi_processor_ffh_cstate_enter(cx);
        return;
    case ACPI_CSTATE_EM_SYSIO:
        /* IO port based C-state */
        inb(cx->address);
        /*
         * Dummy wait op - must do something useless after P_LVL2 read
         * because chipsets cannot guarantee that STPCLK# signal gets
         * asserted in time to freeze execution properly.
         */
        inl(pmtmr_ioport);
        return;
    case ACPI_CSTATE_EM_HALT:
        safe_halt();
        local_irq_disable();
        return;
    }
}

static int acpi_idle_bm_check(void)
{
    u32 bm_status = 0;

    acpi_get_register(ACPI_BITREG_BUS_MASTER_STATUS, &bm_status);
    if ( bm_status )
        acpi_set_register(ACPI_BITREG_BUS_MASTER_STATUS, 1);
    /*
     * TBD: PIIX4 Erratum #18: Note that BM_STS doesn't always reflect
     * the true state of bus mastering activity; forcing us to
     * manually check the BMIDEA bit of each IDE channel.
     */
    return bm_status;
}

static struct {
    spinlock_t lock;
    unsigned int count;
} c3_cpu_status = { .lock = SPIN_LOCK_UNLOCKED };

void trace_exit_reason(u32 *irq_traced)
{
    if ( unlikely(tb_init_done) )
    {
        int i, curbit;
        u32 irr_status[8] = { 0 };

        /* Get local apic IRR register */
        for ( i = 0; i < 8; i++ )
            irr_status[i] = apic_read(APIC_IRR + (i << 4));
        i = 0;
        curbit = find_first_bit((const unsigned long *)irr_status, 256);
        while ( i < 4 && curbit < 256 )
        {
            irq_traced[i++] = curbit;
            curbit = find_next_bit((const unsigned long *)irr_status, 256, curbit + 1);
        }
    }
}

/*
 * "AAJ72. EOI Transaction May Not be Sent if Software Enters Core C6 During
 * an Interrupt Service Routine"
 *
 * There is an erratum on some Core i7 processors whereby an EOI transaction
 * may not be sent if software enters core C6 during an interrupt service
 * routine.  So don't enter a deep Cx state if there is an EOI pending.
 */
static bool errata_c6_eoi_workaround(void)
{
    static int8_t fix_needed = -1;

    if ( unlikely(fix_needed == -1) )
    {
        int model = boot_cpu_data.x86_model;

        fix_needed = (cpu_has_apic && !directed_eoi_enabled &&
                      (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) &&
                      (boot_cpu_data.x86 == 6) &&
                      ((model == 0x1a) || (model == 0x1e) || (model == 0x1f) ||
                       (model == 0x25) || (model == 0x2c) || (model == 0x2f)));
    }

    return (fix_needed && cpu_has_pending_apic_eoi());
}

void update_last_cx_stat(struct acpi_processor_power *power,
                         struct acpi_processor_cx *cx, uint64_t ticks)
{
    ASSERT(!local_irq_is_enabled());

    spin_lock(&power->stat_lock);
    power->last_state = cx;
    power->last_state_update_tick = ticks;
    spin_unlock(&power->stat_lock);
}

void update_idle_stats(struct acpi_processor_power *power,
                       struct acpi_processor_cx *cx,
                       uint64_t before, uint64_t after)
{
    int64_t sleep_ticks = ticks_elapsed(before, after);
    /* Interrupts are disabled */

    spin_lock(&power->stat_lock);

    cx->usage++;
    if ( sleep_ticks > 0 )
    {
        power->last_residency = tick_to_ns(sleep_ticks) / 1000UL;
        cx->time += sleep_ticks;
    }
    power->last_state = &power->states[0];
    power->last_state_update_tick = after;

    spin_unlock(&power->stat_lock);
}

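/*
 * Main C-state idle entry point, installed as pm_idle once dom0 has
 * uploaded ACPI C-state data: ask the governor for a target state, apply
 * bus-master and errata constraints, enter the state, and account the
 * residency on exit.
 */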
static void acpi_processor_idle(void)
{
    struct acpi_processor_power *power = processor_powers[smp_processor_id()];
    struct acpi_processor_cx *cx = NULL;
    int next_state;
    uint64_t t1, t2 = 0;
    u32 exp = 0, pred = 0;
    u32 irq_traced[4] = { 0 };

    if ( max_cstate > 0 && power && !sched_has_urgent_vcpu() &&
         (next_state = cpuidle_current_governor->select(power)) > 0 )
    {
        cx = &power->states[next_state];
        if ( cx->type == ACPI_STATE_C3 && power->flags.bm_check &&
             acpi_idle_bm_check() )
            cx = power->safe_state;
        if ( cx->idx > max_cstate )
            cx = &power->states[max_cstate];
        menu_get_trace_data(&exp, &pred);
    }
    if ( !cx )
    {
        if ( pm_idle_save )
            pm_idle_save();
        else
            safe_halt();
        return;
    }

    cpufreq_dbs_timer_suspend();

    sched_tick_suspend();
    /* sched_tick_suspend() can raise TIMER_SOFTIRQ. Process it now. */
    process_pending_softirqs();

    /*
     * Interrupts must be disabled during bus mastering calculations and
     * for C2/C3 transitions.
     */
    local_irq_disable();

    if ( !cpu_is_haltable(smp_processor_id()) )
    {
        local_irq_enable();
        sched_tick_resume();
        cpufreq_dbs_timer_resume();
        return;
    }

    if ( (cx->type == ACPI_STATE_C3) && errata_c6_eoi_workaround() )
        cx = power->safe_state;

    /*
     * Sleep:
     * ------
     * Invoke the current Cx state to put the processor to sleep.
     */
    switch ( cx->type )
    {
    case ACPI_STATE_C1:
    case ACPI_STATE_C2:
        if ( cx->type == ACPI_STATE_C1 || local_apic_timer_c2_ok )
        {
            /* Get start time (ticks) */
            t1 = cpuidle_get_tick();
            /* Trace cpu idle entry */
            TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);

            update_last_cx_stat(power, cx, t1);

            /* Invoke C2 */
            acpi_idle_do_entry(cx);
            /* Get end time (ticks) */
            t2 = cpuidle_get_tick();
            trace_exit_reason(irq_traced);
            /* Trace cpu idle exit */
            TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
                     irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);
            /* Update statistics */
            update_idle_stats(power, cx, t1, t2);
            /* Re-enable interrupts */
            local_irq_enable();
            break;
        }
        /* fall through: C2 with a stopping LAPIC timer is handled like C3 */

    case ACPI_STATE_C3:
        /*
         * Before invoking C3, be aware that the TSC/APIC timer may be
         * stopped by H/W.  Without careful handling of the TSC/APIC-stop
         * issues, deep C-states can't work correctly.
         */
        /* preparing APIC stop */
        lapic_timer_off();

        /* Get start time (ticks) */
        t1 = cpuidle_get_tick();
        /* Trace cpu idle entry */
        TRACE_4D(TRC_PM_IDLE_ENTRY, cx->idx, t1, exp, pred);

        update_last_cx_stat(power, cx, t1);

        /*
         * disable bus master
         * bm_check implies we need ARB_DIS
         * !bm_check implies we need cache flush
         * bm_control implies whether we can do ARB_DIS
         *
         * That leaves a case where bm_check is set and bm_control is
         * not set. In that case we cannot do much, we enter C3
         * without doing anything.
         */
        if ( cx->type != ACPI_STATE_C3 )
            /* nothing to be done here */;
        else if ( power->flags.bm_check && power->flags.bm_control )
        {
            spin_lock(&c3_cpu_status.lock);
            if ( ++c3_cpu_status.count == num_online_cpus() )
            {
                /*
                 * All CPUs are trying to go to C3
                 * Disable bus master arbitration
                 */
                acpi_set_register(ACPI_BITREG_ARB_DISABLE, 1);
            }
            spin_unlock(&c3_cpu_status.lock);
        }
        else if ( !power->flags.bm_check )
        {
            /* SMP with no shared cache... Invalidate cache  */
            ACPI_FLUSH_CPU_CACHE();
        }

        /* Invoke C3 */
        acpi_idle_do_entry(cx);

        if ( (cx->type == ACPI_STATE_C3) &&
             power->flags.bm_check && power->flags.bm_control )
        {
            /* Enable bus master arbitration */
            spin_lock(&c3_cpu_status.lock);
            if ( c3_cpu_status.count-- == num_online_cpus() )
                acpi_set_register(ACPI_BITREG_ARB_DISABLE, 0);
            spin_unlock(&c3_cpu_status.lock);
        }

        /* Get end time (ticks) */
        t2 = cpuidle_get_tick();

        /* recovering TSC */
        cstate_restore_tsc();
        trace_exit_reason(irq_traced);
        /* Trace cpu idle exit */
        TRACE_6D(TRC_PM_IDLE_EXIT, cx->idx, t2,
                 irq_traced[0], irq_traced[1], irq_traced[2], irq_traced[3]);

        /* Update statistics */
        update_idle_stats(power, cx, t1, t2);
        /* Re-enable interrupts */
        local_irq_enable();
        /* recovering APIC */
        lapic_timer_on();

        break;

    default:
        /* Now in C0 */
        power->last_state = &power->states[0];
        local_irq_enable();
        sched_tick_resume();
        cpufreq_dbs_timer_resume();
        return;
    }

    /* Now in C0 */
    power->last_state = &power->states[0];

    sched_tick_resume();
    cpufreq_dbs_timer_resume();

    if ( cpuidle_current_governor->reflect )
        cpuidle_current_governor->reflect(power);
}

void acpi_dead_idle(void)
{
    struct acpi_processor_power *power;
    struct acpi_processor_cx *cx;

    if ( (power = processor_powers[smp_processor_id()]) == NULL )
        goto default_halt;

    if ( (cx = &power->states[power->count-1]) == NULL )
        goto default_halt;

    if ( cx->entry_method == ACPI_CSTATE_EM_FFH )
    {
        void *mwait_ptr = &mwait_wakeup(smp_processor_id());

        /*
         * Cache must be flushed as the last operation before sleeping.
         * Otherwise, the CPU may still hold dirty data, breaking cache
         * coherency and leading to strange errors.
         */
        wbinvd();

        while ( 1 )
        {
            /*
             * 1. The CLFLUSH is a workaround for erratum AAI65 for
             * the Xeon 7400 series.
             * 2. The WBINVD is insufficient due to the spurious-wakeup
             * case where we return around the loop.
             * 3. Unlike wbinvd, clflush is a lightweight but non-serializing
             * instruction, hence a memory fence is necessary to make sure
             * all loads/stores are visible before flushing the cache line.
             */
            mb();
            clflush(mwait_ptr);
            __monitor(mwait_ptr, 0, 0);
            mb();
            __mwait(cx->address, 0);
        }
    }
    else if ( current_cpu_data.x86_vendor == X86_VENDOR_AMD &&
              cx->entry_method == ACPI_CSTATE_EM_SYSIO )
    {
        /* Intel prefers not to use SYSIO */

        /* Avoid references to shared data after the cache flush */
        u32 address = cx->address;
        u32 pmtmr_ioport_local = pmtmr_ioport;

        wbinvd();

        while ( 1 )
        {
            inb(address);
            inl(pmtmr_ioport_local);
        }
    }

default_halt:
    default_dead_idle();
}

int cpuidle_init_cpu(unsigned int cpu)
{
    struct acpi_processor_power *acpi_power;

    acpi_power = processor_powers[cpu];
    if ( !acpi_power )
    {
        unsigned int i;

        if ( cpu == 0 && boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
        {
            cpuidle_get_tick = get_stime_tick;
            ticks_elapsed = stime_ticks_elapsed;
            tick_to_ns = stime_tick_to_ns;
        }

        acpi_power = xzalloc(struct acpi_processor_power);
        if ( !acpi_power )
            return -ENOMEM;

        for ( i = 0; i < ACPI_PROCESSOR_MAX_POWER; i++ )
            acpi_power->states[i].idx = i;

        acpi_power->cpu = cpu;
        processor_powers[cpu] = acpi_power;
    }

    acpi_power->count = 2;
    acpi_power->states[1].type = ACPI_STATE_C1;
    acpi_power->states[1].entry_method = ACPI_CSTATE_EM_HALT;
    acpi_power->safe_state = &acpi_power->states[1];
    spin_lock_init(&acpi_power->stat_lock);

    return 0;
}

static int acpi_processor_ffh_cstate_probe(xen_processor_cx_t *cx)
{
    struct cpuinfo_x86 *c = &current_cpu_data;
    unsigned int eax, ebx, ecx, edx;
    unsigned int edx_part;
    unsigned int cstate_type; /* C-state type and not ACPI C-state type */
    unsigned int num_cstate_subtype;
    int ret = 0;
    static unsigned long printed;

    if ( c->cpuid_level < CPUID_MWAIT_LEAF )
    {
        printk(XENLOG_INFO "MWAIT leaf not supported by cpuid\n");
        return -EFAULT;
    }

    cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx);
    if ( opt_cpu_info )
        printk(XENLOG_DEBUG "cpuid.MWAIT[eax=%x ebx=%x ecx=%x edx=%x]\n",
               eax, ebx, ecx, edx);

    /* Check whether this particular cx_type (in CST) is supported or not */
    cstate_type = (cx->reg.address >> MWAIT_SUBSTATE_SIZE) + 1;
    edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE);
    num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK;

    if ( num_cstate_subtype < (cx->reg.address & MWAIT_SUBSTATE_MASK) )
        ret = -ERANGE;
    /* mwait ecx extensions INTERRUPT_BREAK should be supported for C2/C3 */
    else if ( !(ecx & CPUID5_ECX_EXTENSIONS_SUPPORTED) ||
              !(ecx & CPUID5_ECX_INTERRUPT_BREAK) )
        ret = -ENODEV;
    else if ( opt_cpu_info || cx->type >= BITS_PER_LONG ||
              !test_and_set_bit(cx->type, &printed) )
        printk(XENLOG_INFO "Monitor-Mwait will be used to enter C%d state\n",
               cx->type);
    return ret;
}

/*
 * Initialize bm_flags based on the CPU cache properties.
 * On SMP it depends on the cache configuration:
 * - When the cache is not shared among all CPUs, we flush the cache
 *   before entering C3.
 * - When the cache is shared among all CPUs, we use the bm_check
 *   mechanism as in the UP case.
 *
 * This routine is called only after all the CPUs are online.
 */
static void acpi_processor_power_init_bm_check(struct acpi_processor_flags *flags)
{
    struct cpuinfo_x86 *c = &current_cpu_data;

    flags->bm_check = 0;
    if ( num_online_cpus() == 1 )
        flags->bm_check = 1;
    else if ( (c->x86_vendor == X86_VENDOR_INTEL) ||
              ((c->x86_vendor == X86_VENDOR_AMD) && (c->x86 == 0x15)) )
    {
        /*
         * Today all MP CPUs that support C3 share cache.
         * And caches should not be flushed by software while
         * entering C3 type state.
         */
        flags->bm_check = 1;
    }

    /*
     * On all recent platforms, ARB_DISABLE is a nop.
     * So, set bm_control to zero to indicate that ARB_DISABLE
     * is not required while entering C3 type state on
     * P4, Core and beyond CPUs
     */
    if ( c->x86_vendor == X86_VENDOR_INTEL &&
         (c->x86 > 0x6 || (c->x86 == 6 && c->x86_model >= 14)) )
        flags->bm_control = 0;
}

#define VENDOR_INTEL                   (1)
#define NATIVE_CSTATE_BEYOND_HALT      (2)

static int check_cx(struct acpi_processor_power *power, xen_processor_cx_t *cx)
{
    static int bm_check_flag = -1;
    static int bm_control_flag = -1;

    switch ( cx->reg.space_id )
    {
    case ACPI_ADR_SPACE_SYSTEM_IO:
        if ( cx->reg.address == 0 )
            return -EINVAL;
        break;

    case ACPI_ADR_SPACE_FIXED_HARDWARE:
        if ( cx->reg.bit_width != VENDOR_INTEL ||
             cx->reg.bit_offset != NATIVE_CSTATE_BEYOND_HALT )
            return -EINVAL;

        /* Assume all logical CPUs have the same support for MWAIT. */
        if ( acpi_processor_ffh_cstate_probe(cx) )
            return -EINVAL;
        break;

    default:
        return -ENODEV;
    }

    switch ( cx->type )
    {
    case ACPI_STATE_C2:
        if ( local_apic_timer_c2_ok )
            break;
        /* fall through */
    case ACPI_STATE_C3:
        if ( !lapic_timer_init() )
            return -EINVAL;

        /* All the logic here assumes flags.bm_check is the same across all CPUs */
        if ( bm_check_flag < 0 )
        {
            /* Determine whether bm_check is needed based on CPU  */
            acpi_processor_power_init_bm_check(&(power->flags));
        }
        else
        {
            power->flags.bm_check = bm_check_flag;
            power->flags.bm_control = bm_control_flag;
        }

        if ( power->flags.bm_check )
        {
            if ( !power->flags.bm_control )
            {
                if ( power->flags.has_cst != 1 )
                {
                    /* bus mastering control is necessary */
                    ACPI_DEBUG_PRINT((ACPI_DB_INFO,
                        "C3 support requires BM control\n"));
                    return -EINVAL;
                }
                else
                {
                    /* Here we enter C3 without bus mastering */
                    ACPI_DEBUG_PRINT((ACPI_DB_INFO,
                        "C3 support without BM control\n"));
                }
            }
            /*
             * On older chipsets, BM_RLD needs to be set in order for Bus
             * Master activity to wake the system from C3, hence
             * acpi_set_register() is always being called once below.  Newer
             * chipsets handle DMA during C3 automatically and BM_RLD is a
             * NOP.  In either case, the proper way to handle BM_RLD is to
             * set it and leave it set.
             */
        }
        else
        {
            /*
             * The WBINVD flag should be set in the FADT for C3 to be
             * supported when bm_check is not required.
             */
            if ( !(acpi_gbl_FADT.flags & ACPI_FADT_WBINVD) )
            {
                ACPI_DEBUG_PRINT((ACPI_DB_INFO,
                          "Cache invalidation should work properly"
                          " for C3 to be enabled on SMP systems\n"));
                return -EINVAL;
            }
        }

        if ( bm_check_flag < 0 )
        {
            bm_check_flag = power->flags.bm_check;
            bm_control_flag = power->flags.bm_control;
            acpi_set_register(ACPI_BITREG_BUS_MASTER_RLD, bm_check_flag);
        }

        break;
    }

    return 0;
}

static unsigned int latency_factor = 2;
integer_param("idle_latency_factor", latency_factor);

static void set_cx(
    struct acpi_processor_power *acpi_power,
    xen_processor_cx_t *xen_cx)
{
    struct acpi_processor_cx *cx;

    if ( check_cx(acpi_power, xen_cx) != 0 )
        return;

    switch ( xen_cx->type )
    {
    case ACPI_STATE_C1:
        cx = &acpi_power->states[1];
        break;
    default:
        if ( acpi_power->count >= ACPI_PROCESSOR_MAX_POWER )
        {
    case ACPI_STATE_C0:
            printk(XENLOG_WARNING "CPU%u: C%d data ignored\n",
                   acpi_power->cpu, xen_cx->type);
            return;
        }
        cx = &acpi_power->states[acpi_power->count];
        cx->type = xen_cx->type;
        break;
    }

    cx->address = xen_cx->reg.address;

    switch ( xen_cx->reg.space_id )
    {
    case ACPI_ADR_SPACE_FIXED_HARDWARE:
        if ( xen_cx->reg.bit_width == VENDOR_INTEL &&
             xen_cx->reg.bit_offset == NATIVE_CSTATE_BEYOND_HALT &&
             boot_cpu_has(X86_FEATURE_MONITOR) )
            cx->entry_method = ACPI_CSTATE_EM_FFH;
        else
            cx->entry_method = ACPI_CSTATE_EM_HALT;
        break;
    case ACPI_ADR_SPACE_SYSTEM_IO:
        if ( ioports_deny_access(hardware_domain, cx->address, cx->address) )
            printk(XENLOG_WARNING "Could not deny access to port %04x\n",
                   cx->address);
        cx->entry_method = ACPI_CSTATE_EM_SYSIO;
        break;
    default:
        cx->entry_method = ACPI_CSTATE_EM_NONE;
        break;
    }

    cx->latency = xen_cx->latency;
    cx->target_residency = cx->latency * latency_factor;

    smp_wmb();
    acpi_power->count += (cx->type != ACPI_STATE_C1);
    if ( cx->type == ACPI_STATE_C1 || cx->type == ACPI_STATE_C2 )
        acpi_power->safe_state = cx;
}

int get_cpu_id(u32 acpi_id)
{
    int i;
    u32 apic_id;

    if ( acpi_id >= MAX_MADT_ENTRIES )
        return -1;

    apic_id = x86_acpiid_to_apicid[acpi_id];
    if ( apic_id == BAD_APICID )
        return -1;

    for ( i = 0; i < nr_cpu_ids; i++ )
    {
        if ( apic_id == x86_cpu_to_apicid[i] )
            return i;
    }

    return -1;
}

#ifdef DEBUG_PM_CX
static void print_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
{
    XEN_GUEST_HANDLE(xen_processor_cx_t) states;
    xen_processor_cx_t  state;
    XEN_GUEST_HANDLE(xen_processor_csd_t) csd;
    xen_processor_csd_t dp;
    uint32_t i;

    printk("cpu%d cx acpi info:\n", cpu);
    printk("\tcount = %d\n", power->count);
    printk("\tflags: bm_cntl[%d], bm_chk[%d], has_cst[%d],\n"
           "\t       pwr_setup_done[%d], bm_rld_set[%d]\n",
           power->flags.bm_control, power->flags.bm_check, power->flags.has_cst,
           power->flags.power_setup_done, power->flags.bm_rld_set);

    states = power->states;

    for ( i = 0; i < power->count; i++ )
    {
        if ( unlikely(copy_from_guest_offset(&state, states, i, 1)) )
            return;

        printk("\tstates[%d]:\n", i);
        printk("\t\treg.space_id = %#x\n", state.reg.space_id);
        printk("\t\treg.bit_width = %#x\n", state.reg.bit_width);
        printk("\t\treg.bit_offset = %#x\n", state.reg.bit_offset);
        printk("\t\treg.access_size = %#x\n", state.reg.access_size);
        printk("\t\treg.address = %#"PRIx64"\n", state.reg.address);
        printk("\t\ttype    = %d\n", state.type);
        printk("\t\tlatency = %d\n", state.latency);
        printk("\t\tpower   = %d\n", state.power);

        csd = state.dp;
        printk("\t\tdp(@0x%p)\n", csd.p);

        if ( csd.p != NULL )
        {
            if ( unlikely(copy_from_guest(&dp, csd, 1)) )
                return;
            printk("\t\t\tdomain = %d\n", dp.domain);
            printk("\t\t\tcoord_type   = %d\n", dp.coord_type);
            printk("\t\t\tnum = %d\n", dp.num);
        }
    }
}
#else
#define print_cx_pminfo(c, p)
#endif

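/*
 * Accept C-state information for one (ACPI) processor from dom0.  Once the
 * boot CPU's data has been registered, switch the idle loops over to
 * acpi_processor_idle() and acpi_dead_idle().
 */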
long set_cx_pminfo(uint32_t cpu, struct xen_processor_power *power)
{
    XEN_GUEST_HANDLE(xen_processor_cx_t) states;
    xen_processor_cx_t xen_cx;
    struct acpi_processor_power *acpi_power;
    int cpu_id, i, ret;

    if ( unlikely(!guest_handle_okay(power->states, power->count)) )
        return -EFAULT;

    if ( pm_idle_save && pm_idle != acpi_processor_idle )
        return 0;

    print_cx_pminfo(cpu, power);

    /* map from acpi_id to cpu_id */
    cpu_id = get_cpu_id(cpu);
    if ( cpu_id == -1 )
    {
        static bool warn_once = true;

        if ( warn_once || opt_cpu_info )
            printk(XENLOG_WARNING "No CPU ID for APIC ID %#x\n", cpu);
        warn_once = false;
        return -EINVAL;
    }

    ret = cpuidle_init_cpu(cpu_id);
    if ( ret < 0 )
        return ret;

    acpi_power = processor_powers[cpu_id];
    acpi_power->flags.bm_check = power->flags.bm_check;
    acpi_power->flags.bm_control = power->flags.bm_control;
    acpi_power->flags.has_cst = power->flags.has_cst;

    states = power->states;
    for ( i = 0; i < power->count; i++ )
    {
        if ( unlikely(copy_from_guest_offset(&xen_cx, states, i, 1)) )
            return -EFAULT;

        set_cx(acpi_power, &xen_cx);
    }

    if ( cpuidle_current_governor->enable &&
         cpuidle_current_governor->enable(acpi_power) )
        return -EFAULT;

    /* FIXME: C-state dependency is not supported so far */

    if ( cpu_id == 0 )
    {
        if ( pm_idle_save == NULL )
        {
            pm_idle_save = pm_idle;
            pm_idle = acpi_processor_idle;
        }

        dead_idle = acpi_dead_idle;
    }

    return 0;
}

uint32_t pmstat_get_cx_nr(uint32_t cpuid)
{
    return processor_powers[cpuid] ? processor_powers[cpuid]->count : 0;
}

int pmstat_get_cx_stat(uint32_t cpuid, struct pm_cx_stat *stat)
{
    struct acpi_processor_power *power = processor_powers[cpuid];
    uint64_t idle_usage = 0, idle_res = 0;
    uint64_t last_state_update_tick, current_stime, current_tick;
    uint64_t usage[ACPI_PROCESSOR_MAX_POWER] = { 0 };
    uint64_t res[ACPI_PROCESSOR_MAX_POWER] = { 0 };
    unsigned int i, nr, nr_pc = 0, nr_cc = 0;

    if ( power == NULL )
    {
        stat->last = 0;
        stat->nr = 0;
        stat->idle_time = 0;
        stat->nr_pc = 0;
        stat->nr_cc = 0;
        return 0;
    }

    stat->idle_time = get_cpu_idle_time(cpuid);
    nr = min(stat->nr, power->count);

    /* Mimic the stats when detailed info hasn't been registered by dom0. */
    if ( pm_idle_save == NULL )
    {
        stat->nr = 2;
        stat->last = power->last_state ? power->last_state->idx : 0;

        usage[1] = idle_usage = 1;
        res[1] = idle_res = stat->idle_time;

        current_stime = NOW();
    }
    else
    {
        struct hw_residencies hw_res;
        signed int last_state_idx;

        stat->nr = power->count;

        spin_lock_irq(&power->stat_lock);
        current_tick = cpuidle_get_tick();
        current_stime = NOW();
        for ( i = 1; i < nr; i++ )
        {
            usage[i] = power->states[i].usage;
            res[i] = power->states[i].time;
        }
        last_state_update_tick = power->last_state_update_tick;
        last_state_idx = power->last_state ? power->last_state->idx : -1;
        spin_unlock_irq(&power->stat_lock);

        if ( last_state_idx >= 0 )
        {
            usage[last_state_idx]++;
            res[last_state_idx] += ticks_elapsed(last_state_update_tick,
                                                 current_tick);
            stat->last = last_state_idx;
        }
        else
            stat->last = 0;

        for ( i = 1; i < nr; i++ )
        {
            res[i] = tick_to_ns(res[i]);
            idle_usage += usage[i];
            idle_res += res[i];
        }

        get_hw_residencies(cpuid, &hw_res);

#define PUT_xC(what, n) do { \
        if ( stat->nr_##what >= n && \
             copy_to_guest_offset(stat->what, n - 1, &hw_res.what##n, 1) ) \
            return -EFAULT; \
        if ( hw_res.what##n ) \
            nr_##what = n; \
    } while ( 0 )
#define PUT_PC(n) PUT_xC(pc, n)
        PUT_PC(2);
        PUT_PC(3);
        PUT_PC(4);
        PUT_PC(6);
        PUT_PC(7);
        PUT_PC(8);
        PUT_PC(9);
        PUT_PC(10);
#undef PUT_PC
#define PUT_CC(n) PUT_xC(cc, n)
        PUT_CC(1);
        PUT_CC(3);
        PUT_CC(6);
        PUT_CC(7);
#undef PUT_CC
#undef PUT_xC
    }

    usage[0] += idle_usage;
    res[0] = current_stime - idle_res;

    if ( copy_to_guest(stat->triggers, usage, nr) ||
         copy_to_guest(stat->residencies, res, nr) )
        return -EFAULT;

    stat->nr_pc = nr_pc;
    stat->nr_cc = nr_cc;

    return 0;
}

int pmstat_reset_cx_stat(uint32_t cpuid)
{
    return 0;
}

void cpuidle_disable_deep_cstate(void)
{
    if ( max_cstate > 1 )
    {
        if ( local_apic_timer_c2_ok )
            max_cstate = 2;
        else
            max_cstate = 1;
    }

    mb();

    hpet_disable_legacy_broadcast();
}

bool cpuidle_using_deep_cstate(void)
{
    return xen_cpuidle && max_cstate > (local_apic_timer_c2_ok ? 2 : 1);
}

static int cpu_callback(
    struct notifier_block *nfb, unsigned long action, void *hcpu)
{
    unsigned int cpu = (unsigned long)hcpu;

    /*
     * Only hook on CPU_ONLINE because a dead cpu may utilize the info
     * to enter a deep C-state.
     */
    switch ( action )
    {
    case CPU_ONLINE:
        (void)cpuidle_init_cpu(cpu);
        break;
    default:
        break;
    }

    return NOTIFY_DONE;
}

static struct notifier_block cpu_nfb = {
    .notifier_call = cpu_callback
};

static int __init cpuidle_presmp_init(void)
{
    void *cpu = (void *)(long)smp_processor_id();

    if ( !xen_cpuidle )
        return 0;

    mwait_idle_init(&cpu_nfb);
    cpu_nfb.notifier_call(&cpu_nfb, CPU_ONLINE, cpu);
    register_cpu_notifier(&cpu_nfb);
    return 0;
}
presmp_initcall(cpuidle_presmp_init);