1 /******************************************************************************
2 * arch/x86/time.c
3 *
4 * Per-CPU time calibration and management.
5 *
6 * Copyright (c) 2002-2005, K A Fraser
7 *
8 * Portions from Linux are:
9 * Copyright (c) 1991, 1992, 1995 Linus Torvalds
10 */
11
12 #include <xen/errno.h>
13 #include <xen/event.h>
14 #include <xen/sched.h>
15 #include <xen/lib.h>
16 #include <xen/init.h>
17 #include <xen/time.h>
18 #include <xen/timer.h>
19 #include <xen/smp.h>
20 #include <xen/irq.h>
21 #include <xen/softirq.h>
22 #include <xen/efi.h>
23 #include <xen/cpuidle.h>
24 #include <xen/symbols.h>
25 #include <xen/keyhandler.h>
26 #include <xen/guest_access.h>
27 #include <asm/io.h>
28 #include <asm/msr.h>
29 #include <asm/mpspec.h>
30 #include <asm/processor.h>
31 #include <asm/fixmap.h>
32 #include <asm/guest.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/div64.h>
35 #include <asm/acpi.h>
36 #include <asm/hpet.h>
37 #include <io_ports.h>
38 #include <asm/setup.h> /* for early_time_init */
39 #include <public/arch-x86/cpuid.h>
40
41 /* opt_clocksource: Force clocksource to one of: pit, hpet, acpi, tsc. */
42 static char __initdata opt_clocksource[10];
43 string_param("clocksource", opt_clocksource);
44
45 unsigned long __read_mostly cpu_khz; /* CPU clock frequency in kHz. */
46 DEFINE_SPINLOCK(rtc_lock);
47 unsigned long pit0_ticks;
48
49 struct cpu_time_stamp {
50 u64 local_tsc;
51 s_time_t local_stime;
52 s_time_t master_stime;
53 };
54
55 struct cpu_time {
56 struct cpu_time_stamp stamp;
57 struct time_scale tsc_scale;
58 };
59
60 struct platform_timesource {
61 char *id;
62 char *name;
63 u64 frequency;
64 u64 (*read_counter)(void);
65 s64 (*init)(struct platform_timesource *);
66 void (*resume)(struct platform_timesource *);
67 int counter_bits;
68 };
69
70 static DEFINE_PER_CPU(struct cpu_time, cpu_time);
71
72 /* Calibrate all CPUs to platform timer every EPOCH. */
73 #define EPOCH MILLISECS(1000)
74 static struct timer calibration_timer;
75
76 /*
77 * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
78 * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
79 * softirq handling will happen in time.
80 *
81 * The pit_lock protects the 16- and 32-bit stamp fields and the PIT ch2
 * counter accesses used to update them.
82 */
83 static DEFINE_SPINLOCK(pit_lock);
84 static u16 pit_stamp16;
85 static u32 pit_stamp32;
86 static bool __read_mostly using_pit;
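/*
 * Editor's note -- worked example of the 16-bit wraparound handling used by
 * timer_interrupt() and read_pit_count() below (hypothetical values): if
 * pit_stamp16 == 0x0010 and the down-counting PIT ch2 has since wrapped to
 * 0xfff0, then (u16)(pit_stamp16 - count) == 0x0020, so 32 elapsed ticks are
 * accumulated into pit_stamp32 despite the wrap.
 */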
87
88 /* Boot timestamp, filled in head.S */
89 u64 __initdata boot_tsc_stamp;
90
91 /*
92 * 32-bit division of integer dividend and integer divisor yielding
93 * 32-bit fractional quotient.
94 */
95 static inline u32 div_frac(u32 dividend, u32 divisor)
96 {
97 u32 quotient, remainder;
98 ASSERT(dividend < divisor);
99 asm (
100 "divl %4"
101 : "=a" (quotient), "=d" (remainder)
102 : "0" (0), "1" (dividend), "r" (divisor) );
103 return quotient;
104 }
105
106 /*
107 * 32-bit multiplication of multiplicand and fractional multiplier
108 * yielding 32-bit product (radix point at same position as in multiplicand).
109 */
110 static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
111 {
112 u32 product_int, product_frac;
113 asm (
114 "mul %3"
115 : "=a" (product_frac), "=d" (product_int)
116 : "0" (multiplicand), "r" (multiplier) );
117 return product_int;
118 }
119
120 /*
121 * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
122 * yielding a 64-bit result.
123 */
124 u64 scale_delta(u64 delta, const struct time_scale *scale)
125 {
126 u64 product;
127
128 if ( scale->shift < 0 )
129 delta >>= -scale->shift;
130 else
131 delta <<= scale->shift;
132
133 asm (
134 "mulq %2 ; shrd $32,%1,%0"
135 : "=a" (product), "=d" (delta)
136 : "rm" (delta), "0" ((u64)scale->mul_frac) );
137
138 return product;
139 }
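/*
 * Editor's illustrative sketch (not part of the original source): how a
 * time_scale is applied.  The values are hypothetical -- a 2 GHz counter
 * would get shift = 0 and mul_frac = 0x80000000 (a fraction of 0.5) from
 * set_time_scale() below.
 */
#if 0
static void __init example_scale_delta(void)
{
    struct time_scale s = { .shift = 0, .mul_frac = 0x80000000 };

    /* Two billion ticks of a 2 GHz counter map to one second (1e9 ns). */
    ASSERT(scale_delta(2000000000ULL, &s) == 1000000000ULL);
}
#endif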
140
141 #define _TS_MUL_FRAC_IDENTITY 0x80000000UL
142
143 /* Compute the reciprocal of the given time_scale. */
144 static inline struct time_scale scale_reciprocal(struct time_scale scale)
145 {
146 struct time_scale reciprocal;
147 u32 dividend;
148
149 ASSERT(scale.mul_frac != 0);
150 dividend = _TS_MUL_FRAC_IDENTITY;
151 reciprocal.shift = 1 - scale.shift;
152 while ( unlikely(dividend >= scale.mul_frac) )
153 {
154 dividend >>= 1;
155 reciprocal.shift++;
156 }
157
158 asm (
159 "divl %4"
160 : "=a" (reciprocal.mul_frac), "=d" (dividend)
161 : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
162
163 return reciprocal;
164 }
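/*
 * Editor's worked example (hypothetical values): the reciprocal of
 * { .shift = 0, .mul_frac = 0x80000000 } (multiply by 0.5) comes out as
 * { .shift = 2, .mul_frac = 0x80000000 }, i.e. shift left by two then halve,
 * which multiplies by 2.  A ticks->ns scale is thus turned into the matching
 * ns->ticks scale, as consumed by stime2tsc() and ns_to_acpi_pm_tick().
 */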
165
166 /*
167 * cpu_mask that denotes the CPUs that need the timer interrupt delivered as
168 * IPIs in place of their local APIC timers.
169 */
170 static cpumask_t pit_broadcast_mask;
171
172 static void smp_send_timer_broadcast_ipi(void)
173 {
174 int cpu = smp_processor_id();
175 cpumask_t mask;
176
177 cpumask_and(&mask, &cpu_online_map, &pit_broadcast_mask);
178
179 if ( cpumask_test_cpu(cpu, &mask) )
180 {
181 __cpumask_clear_cpu(cpu, &mask);
182 raise_softirq(TIMER_SOFTIRQ);
183 }
184
185 if ( !cpumask_empty(&mask) )
186 {
187 cpumask_raise_softirq(&mask, TIMER_SOFTIRQ);
188 }
189 }
190
191 static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
192 {
193 ASSERT(local_irq_is_enabled());
194
195 if ( hpet_legacy_irq_tick() )
196 return;
197
198 /* Only for start-of-day interrupt tests in io_apic.c. */
199 pit0_ticks++;
200
201 /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
202 if ( !cpu_has_apic )
203 raise_softirq(TIMER_SOFTIRQ);
204
205 if ( xen_cpuidle )
206 smp_send_timer_broadcast_ipi();
207
208 /* Emulate a 32-bit PIT counter. */
209 if ( using_pit )
210 {
211 u16 count;
212
213 spin_lock_irq(&pit_lock);
214
215 outb(0x80, PIT_MODE);
216 count = inb(PIT_CH2);
217 count |= inb(PIT_CH2) << 8;
218
219 pit_stamp32 += (u16)(pit_stamp16 - count);
220 pit_stamp16 = count;
221
222 spin_unlock_irq(&pit_lock);
223 }
224 }
225
226 static struct irqaction __read_mostly irq0 = {
227 timer_interrupt, "timer", NULL
228 };
229
230 #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
231 #define CALIBRATE_FRAC 20 /* calibrate over 50ms */
232 #define CALIBRATE_VALUE(freq) (((freq) + CALIBRATE_FRAC / 2) / CALIBRATE_FRAC)
233
234 static void preinit_pit(void)
235 {
236 /* Set PIT channel 0 to HZ Hz. */
237 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
238 outb_p(0x34, PIT_MODE); /* binary, mode 2, LSB/MSB, ch 0 */
239 outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
240 outb(LATCH >> 8, PIT_CH0); /* MSB */
241 #undef LATCH
242 }
243
244 void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
245 {
246 u64 tps64 = ticks_per_sec;
247 u32 tps32;
248 int shift = 0;
249
250 ASSERT(tps64 != 0);
251
252 while ( tps64 > (MILLISECS(1000)*2) )
253 {
254 tps64 >>= 1;
255 shift--;
256 }
257
258 tps32 = (u32)tps64;
259 while ( tps32 <= (u32)MILLISECS(1000) )
260 {
261 tps32 <<= 1;
262 shift++;
263 }
264
265 ts->mul_frac = div_frac(MILLISECS(1000), tps32);
266 ts->shift = shift;
267 }
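/*
 * Editor's worked example (hypothetical 2.4 GHz TSC): ticks_per_sec is
 * halved once to 1.2e9 (shift = -1) so that it fits the bounds used above,
 * then mul_frac = 2^32 * 1e9 / 1.2e9 ~= 0xd5555555.  With that scale,
 * scale_delta(2400000000, ts) yields ~1e9 ns, i.e. one second.
 */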
268
269 static char *freq_string(u64 freq)
270 {
271 static char s[20];
272 unsigned int x, y;
273 y = (unsigned int)do_div(freq, 1000000) / 1000;
274 x = (unsigned int)freq;
275 snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
276 return s;
277 }
278
279 /************************************************************
280 * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
281 */
282
283 static u64 read_pit_count(void)
284 {
285 u16 count16;
286 u32 count32;
287 unsigned long flags;
288
289 spin_lock_irqsave(&pit_lock, flags);
290
291 outb(0x80, PIT_MODE);
292 count16 = inb(PIT_CH2);
293 count16 |= inb(PIT_CH2) << 8;
294
295 count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
296
297 spin_unlock_irqrestore(&pit_lock, flags);
298
299 return count32;
300 }
301
302 static s64 __init init_pit(struct platform_timesource *pts)
303 {
304 u8 portb = inb(0x61);
305 u64 start, end;
306 unsigned long count;
307
308 using_pit = true;
309
310 /* Set the Gate high, disable speaker. */
311 outb((portb & ~0x02) | 0x01, 0x61);
312
313 /*
314 * Now let's take care of CTC channel 2: mode 0 (interrupt on
315 * terminal count), binary count, load CALIBRATE_LATCH count
316 * (LSB then MSB) to begin countdown.
317 */
318 #define CALIBRATE_LATCH CALIBRATE_VALUE(CLOCK_TICK_RATE)
319 outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
320 outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
321 outb(CALIBRATE_LATCH >> 8, PIT_CH2); /* MSB of count */
322 #undef CALIBRATE_LATCH
323
324 start = rdtsc_ordered();
325 for ( count = 0; !(inb(0x61) & 0x20); ++count )
326 continue;
327 end = rdtsc_ordered();
328
329 /* Set the Gate low, disable speaker. */
330 outb(portb & ~0x03, 0x61);
331
332 /* Error if the CTC doesn't behave itself. */
333 if ( count == 0 )
334 return 0;
335
336 return (end - start) * CALIBRATE_FRAC;
337 }
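/*
 * Editor's note on the calibration arithmetic above (hypothetical numbers):
 * the CTC counts down CALIBRATE_VALUE(1193182) ~= 59659 ticks, i.e. roughly
 * 1/CALIBRATE_FRAC = 50 ms.  If the TSC advanced by 120,000,000 cycles over
 * that window, the function returns 120000000 * 20 = 2,400,000,000 -- an
 * estimated 2.4 GHz TSC frequency.
 */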
338
339 static void resume_pit(struct platform_timesource *pts)
340 {
341 /* Set CTC channel 2 to mode 0 again; initial value does not matter. */
342 outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
343 outb(0, PIT_CH2); /* LSB of count */
344 outb(0, PIT_CH2); /* MSB of count */
345 }
346
347 static struct platform_timesource __initdata plt_pit =
348 {
349 .id = "pit",
350 .name = "PIT",
351 .frequency = CLOCK_TICK_RATE,
352 .read_counter = read_pit_count,
353 .counter_bits = 32,
354 .init = init_pit,
355 .resume = resume_pit,
356 };
357
358 /************************************************************
359 * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
360 */
361
362 static u64 read_hpet_count(void)
363 {
364 return hpet_read32(HPET_COUNTER);
365 }
366
367 static s64 __init init_hpet(struct platform_timesource *pts)
368 {
369 u64 hpet_rate = hpet_setup(), start;
370 u32 count, target;
371
372 if ( hpet_rate == 0 )
373 return 0;
374
375 pts->frequency = hpet_rate;
376
377 count = hpet_read32(HPET_COUNTER);
378 start = rdtsc_ordered();
379 target = count + CALIBRATE_VALUE(hpet_rate);
380 if ( target < count )
381 while ( hpet_read32(HPET_COUNTER) >= count )
382 continue;
383 while ( hpet_read32(HPET_COUNTER) < target )
384 continue;
385
386 return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
387 }
388
389 static void resume_hpet(struct platform_timesource *pts)
390 {
391 hpet_resume(NULL);
392 }
393
394 static struct platform_timesource __initdata plt_hpet =
395 {
396 .id = "hpet",
397 .name = "HPET",
398 .read_counter = read_hpet_count,
399 .counter_bits = 32,
400 .init = init_hpet,
401 .resume = resume_hpet
402 };
403
404 /************************************************************
405 * PLATFORM TIMER 3: ACPI PM TIMER
406 */
407
408 u32 __read_mostly pmtmr_ioport;
409 unsigned int __initdata pmtmr_width;
410
411 /* ACPI PM timer ticks at 3.579545 MHz. */
412 #define ACPI_PM_FREQUENCY 3579545
413
414 static u64 read_pmtimer_count(void)
415 {
416 return inl(pmtmr_ioport);
417 }
418
419 static s64 __init init_pmtimer(struct platform_timesource *pts)
420 {
421 u64 start;
422 u32 count, target, mask = 0xffffff;
423
424 if ( !pmtmr_ioport || !pmtmr_width )
425 return 0;
426
427 if ( pmtmr_width == 32 )
428 {
429 pts->counter_bits = 32;
430 mask = 0xffffffff;
431 }
432
433 count = inl(pmtmr_ioport) & mask;
434 start = rdtsc_ordered();
435 target = count + CALIBRATE_VALUE(ACPI_PM_FREQUENCY);
436 if ( target < count )
437 while ( (inl(pmtmr_ioport) & mask) >= count )
438 continue;
439 while ( (inl(pmtmr_ioport) & mask) < target )
440 continue;
441
442 return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
443 }
444
445 static struct platform_timesource __initdata plt_pmtimer =
446 {
447 .id = "acpi",
448 .name = "ACPI PM Timer",
449 .frequency = ACPI_PM_FREQUENCY,
450 .read_counter = read_pmtimer_count,
451 .counter_bits = 24,
452 .init = init_pmtimer
453 };
454
455 static struct time_scale __read_mostly pmt_scale;
456 static struct time_scale __read_mostly pmt_scale_r;
457
458 static __init int init_pmtmr_scale(void)
459 {
460 set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
461 pmt_scale_r = scale_reciprocal(pmt_scale);
462 return 0;
463 }
464 __initcall(init_pmtmr_scale);
465
466 uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
467 {
468 return scale_delta(ticks, &pmt_scale);
469 }
470
471 uint64_t ns_to_acpi_pm_tick(uint64_t ns)
472 {
473 return scale_delta(ns, &pmt_scale_r);
474 }
475
476 /************************************************************
477 * PLATFORM TIMER 4: TSC
478 */
479 static unsigned int __initdata tsc_flags;
480
481 /* TSC is reliable across sockets */
482 #define TSC_RELIABLE_SOCKET (1 << 0)
483
484 /*
485 * Called in verify_tsc_reliability() under reliable TSC conditions,
486 * thus reusing all the checks already performed there.
487 */
488 static s64 __init init_tsc(struct platform_timesource *pts)
489 {
490 u64 ret = pts->frequency;
491
492 if ( nr_cpu_ids != num_present_cpus() )
493 {
494 printk(XENLOG_WARNING "TSC: CPU Hotplug intended\n");
495 ret = 0;
496 }
497
498 if ( nr_sockets > 1 && !(tsc_flags & TSC_RELIABLE_SOCKET) )
499 {
500 printk(XENLOG_WARNING "TSC: Not invariant across sockets\n");
501 ret = 0;
502 }
503
504 if ( !ret )
505 printk(XENLOG_DEBUG "TSC: Not setting it as clocksource\n");
506
507 return ret;
508 }
509
510 static u64 read_tsc(void)
511 {
512 return rdtsc_ordered();
513 }
514
515 static struct platform_timesource __initdata plt_tsc =
516 {
517 .id = "tsc",
518 .name = "TSC",
519 .read_counter = read_tsc,
520 /*
521 * Calculations for platform timer overflow assume u64 boundary.
522 * Hence we set to less than 64, such that the TSC wraparound is
523 * correctly checked and handled.
524 */
525 .counter_bits = 63,
526 .init = init_tsc,
527 };
528
529 #ifdef CONFIG_XEN_GUEST
530 /************************************************************
531 * PLATFORM TIMER 5: XEN PV CLOCK SOURCE
532 *
533 * Xen clock source is a variant of TSC source.
534 */
535
536 static uint64_t xen_timer_cpu_frequency(void)
537 {
538 struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
539 uint64_t freq;
540
541 freq = 1000000000ULL << 32;
542 do_div(freq, info->tsc_to_system_mul);
543 if ( info->tsc_shift < 0 )
544 freq <<= -info->tsc_shift;
545 else
546 freq >>= info->tsc_shift;
547
548 return freq;
549 }
550
551 static int64_t __init init_xen_timer(struct platform_timesource *pts)
552 {
553 if ( !xen_guest )
554 return 0;
555
556 pts->frequency = xen_timer_cpu_frequency();
557
558 return pts->frequency;
559 }
560
561 static always_inline uint64_t read_cycle(const struct vcpu_time_info *info,
562 uint64_t tsc)
563 {
564 uint64_t delta = tsc - info->tsc_timestamp;
565 struct time_scale ts = {
566 .shift = info->tsc_shift,
567 .mul_frac = info->tsc_to_system_mul,
568 };
569 uint64_t offset = scale_delta(delta, &ts);
570
571 return info->system_time + offset;
572 }
573
574 static uint64_t read_xen_timer(void)
575 {
576 struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
577 uint32_t version;
578 uint64_t ret;
579 uint64_t last;
580 static uint64_t last_value;
581
582 do {
583 version = info->version & ~1;
584 /* Make sure version is read before the data */
585 smp_rmb();
586
587 ret = read_cycle(info, rdtsc_ordered());
588 /* Ignore fancy flags for now */
589
590 /* Make sure version is reread after the data */
591 smp_rmb();
592 } while ( unlikely(version != info->version) );
593
594 /* Maintain a monotonic global value */
595 do {
596 last = read_atomic(&last_value);
597 if ( ret < last )
598 return last;
599 } while ( unlikely(cmpxchg(&last_value, last, ret) != last) );
600
601 return ret;
602 }
603
604 static struct platform_timesource __initdata plt_xen_timer =
605 {
606 .id = "xen",
607 .name = "XEN PV CLOCK",
608 .read_counter = read_xen_timer,
609 .init = init_xen_timer,
610 .counter_bits = 63,
611 };
612 #endif
613
614 /************************************************************
615 * GENERIC PLATFORM TIMER INFRASTRUCTURE
616 */
617
618 /* details of chosen timesource */
619 static struct platform_timesource __read_mostly plt_src;
620 /* hardware-width mask */
621 static u64 __read_mostly plt_mask;
622 /* ns between calls to plt_overflow() */
623 static u64 __read_mostly plt_overflow_period;
624 /* scale: platform counter -> nanosecs */
625 static struct time_scale __read_mostly plt_scale;
626
627 /* Protected by platform_timer_lock. */
628 static DEFINE_SPINLOCK(platform_timer_lock);
629 static s_time_t stime_platform_stamp; /* System time at below platform time */
630 static u64 platform_timer_stamp; /* Platform time at above system time */
631 static u64 plt_stamp64; /* 64-bit platform counter stamp */
632 static u64 plt_stamp; /* hardware-width platform counter stamp */
633 static struct timer plt_overflow_timer;
634
635 static s_time_t __read_platform_stime(u64 platform_time)
636 {
637 u64 diff = platform_time - platform_timer_stamp;
638 ASSERT(spin_is_locked(&platform_timer_lock));
639 return (stime_platform_stamp + scale_delta(diff, &plt_scale));
640 }
641
642 static void plt_overflow(void *unused)
643 {
644 int i;
645 u64 count;
646 s_time_t now, plt_now, plt_wrap;
647
648 spin_lock_irq(&platform_timer_lock);
649
650 count = plt_src.read_counter();
651 plt_stamp64 += (count - plt_stamp) & plt_mask;
652 plt_stamp = count;
653
654 now = NOW();
655 plt_wrap = __read_platform_stime(plt_stamp64);
656 for ( i = 0; i < 10; i++ )
657 {
658 plt_now = plt_wrap;
659 plt_wrap = __read_platform_stime(plt_stamp64 + plt_mask + 1);
660 if ( ABS(plt_wrap - now) > ABS(plt_now - now) )
661 break;
662 plt_stamp64 += plt_mask + 1;
663 }
664 if ( i != 0 )
665 {
666 static bool warned_once;
667
668 if ( !test_and_set_bool(warned_once) )
669 printk("Platform timer appears to have unexpectedly wrapped "
670 "%u%s times.\n", i, (i == 10) ? " or more" : "");
671 }
672
673 spin_unlock_irq(&platform_timer_lock);
674
675 set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
676 }
677
678 static s_time_t read_platform_stime(u64 *stamp)
679 {
680 u64 plt_counter, count;
681 s_time_t stime;
682
683 ASSERT(!local_irq_is_enabled());
684
685 spin_lock(&platform_timer_lock);
686 plt_counter = plt_src.read_counter();
687 count = plt_stamp64 + ((plt_counter - plt_stamp) & plt_mask);
688 stime = __read_platform_stime(count);
689 spin_unlock(&platform_timer_lock);
690
691 if ( unlikely(stamp) )
692 *stamp = plt_counter;
693
694 return stime;
695 }
696
697 static void platform_time_calibration(void)
698 {
699 u64 count;
700 s_time_t stamp;
701 unsigned long flags;
702
703 spin_lock_irqsave(&platform_timer_lock, flags);
704 count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
705 stamp = __read_platform_stime(count);
706 stime_platform_stamp = stamp;
707 platform_timer_stamp = count;
708 spin_unlock_irqrestore(&platform_timer_lock, flags);
709 }
710
711 static void resume_platform_timer(void)
712 {
713 /* Timer source can be reset when coming back from S3 to S0. */
714 if ( plt_src.resume )
715 plt_src.resume(&plt_src);
716
717 plt_stamp64 = platform_timer_stamp;
718 plt_stamp = plt_src.read_counter();
719 }
720
721 static void __init reset_platform_timer(void)
722 {
723 /* Deactivate any timers running */
724 kill_timer(&plt_overflow_timer);
725 kill_timer(&calibration_timer);
726
727 /* Reset counters and stamps */
728 spin_lock_irq(&platform_timer_lock);
729 plt_stamp = 0;
730 plt_stamp64 = 0;
731 platform_timer_stamp = 0;
732 stime_platform_stamp = 0;
733 spin_unlock_irq(&platform_timer_lock);
734 }
735
736 static s64 __init try_platform_timer(struct platform_timesource *pts)
737 {
738 s64 rc = pts->init(pts);
739
740 if ( rc <= 0 )
741 return rc;
742
743 /* We have a platform timesource already so reset it */
744 if ( plt_src.counter_bits != 0 )
745 reset_platform_timer();
746
747 plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
748
749 set_time_scale(&plt_scale, pts->frequency);
750
751 plt_overflow_period = scale_delta(
752 1ull << (pts->counter_bits - 1), &plt_scale);
753 plt_src = *pts;
754
755 return rc;
756 }
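/*
 * Editor's worked example (hypothetical choice of timer): for the 24-bit
 * ACPI PM timer at 3.579545 MHz, plt_overflow_period covers half the
 * counter range, scale_delta(1 << 23, &plt_scale) ~= 2.34 s, so
 * plt_overflow() must fold the hardware counter into plt_stamp64 at least
 * that often to avoid missing wraps.
 */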
757
758 static u64 __init init_platform_timer(void)
759 {
760 static struct platform_timesource * __initdata plt_timers[] = {
761 #ifdef CONFIG_XEN_GUEST
762 &plt_xen_timer,
763 #endif
764 &plt_hpet, &plt_pmtimer, &plt_pit
765 };
766
767 struct platform_timesource *pts = NULL;
768 unsigned int i;
769 s64 rc = -1;
770
771 /* clocksource=tsc is initialized via __initcalls (when CPUs are up). */
772 if ( (opt_clocksource[0] != '\0') && strcmp(opt_clocksource, "tsc") )
773 {
774 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
775 {
776 pts = plt_timers[i];
777 if ( !strcmp(opt_clocksource, pts->id) )
778 {
779 rc = try_platform_timer(pts);
780 break;
781 }
782 }
783
784 if ( rc <= 0 )
785 printk("WARNING: %s clocksource '%s'.\n",
786 (rc == 0) ? "Could not initialise" : "Unrecognised",
787 opt_clocksource);
788 }
789
790 if ( rc <= 0 )
791 {
792 for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
793 {
794 pts = plt_timers[i];
795 if ( (rc = try_platform_timer(pts)) > 0 )
796 break;
797 }
798 }
799
800 if ( rc <= 0 )
801 panic("Unable to find usable platform timer");
802
803 printk("Platform timer is %s %s\n",
804 freq_string(pts->frequency), pts->name);
805
806 return rc;
807 }
808
809 u64 stime2tsc(s_time_t stime)
810 {
811 struct cpu_time *t;
812 struct time_scale sys_to_tsc;
813 s_time_t stime_delta;
814
815 t = &this_cpu(cpu_time);
816 sys_to_tsc = scale_reciprocal(t->tsc_scale);
817
818 stime_delta = stime - t->stamp.local_stime;
819 if ( stime_delta < 0 )
820 stime_delta = 0;
821
822 return t->stamp.local_tsc + scale_delta(stime_delta, &sys_to_tsc);
823 }
824
825 void cstate_restore_tsc(void)
826 {
827 if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
828 return;
829
830 write_tsc(stime2tsc(read_platform_stime(NULL)));
831 }
832
833 /***************************************************************************
834 * CMOS Timer functions
835 ***************************************************************************/
836
837 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
838 * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
839 * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
840 *
841 * [For the Julian calendar (which was used in Russia before 1917,
842 * Britain & colonies before 1752, anywhere else before 1582,
843 * and is still in use by some communities) leave out the
844 * -year/100+year/400 terms, and add 10.]
845 *
846 * This algorithm was first published by Gauss (I think).
847 *
848 * WARNING: this function will overflow on 2106-02-07 06:28:16 on
849 * machines where long is 32-bit! (However, as time_t is signed, we
850 * will already get problems at other places on 2038-01-19 03:14:08)
851 */
852 unsigned long
853 mktime (unsigned int year, unsigned int mon,
854 unsigned int day, unsigned int hour,
855 unsigned int min, unsigned int sec)
856 {
857 /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
858 if ( 0 >= (int) (mon -= 2) )
859 {
860 mon += 12;
861 year -= 1;
862 }
863
864 return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
865 year*365 - 719499
866 )*24 + hour /* now have hours */
867 )*60 + min /* now have minutes */
868 )*60 + sec; /* finally seconds */
869 }
870
871 struct rtc_time {
872 unsigned int year, mon, day, hour, min, sec;
873 };
874
875 static void __get_cmos_time(struct rtc_time *rtc)
876 {
877 rtc->sec = CMOS_READ(RTC_SECONDS);
878 rtc->min = CMOS_READ(RTC_MINUTES);
879 rtc->hour = CMOS_READ(RTC_HOURS);
880 rtc->day = CMOS_READ(RTC_DAY_OF_MONTH);
881 rtc->mon = CMOS_READ(RTC_MONTH);
882 rtc->year = CMOS_READ(RTC_YEAR);
883
884 if ( RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) )
885 {
886 BCD_TO_BIN(rtc->sec);
887 BCD_TO_BIN(rtc->min);
888 BCD_TO_BIN(rtc->hour);
889 BCD_TO_BIN(rtc->day);
890 BCD_TO_BIN(rtc->mon);
891 BCD_TO_BIN(rtc->year);
892 }
893
894 if ( (rtc->year += 1900) < 1970 )
895 rtc->year += 100;
896 }
897
898 static unsigned long get_cmos_time(void)
899 {
900 unsigned long res, flags;
901 struct rtc_time rtc;
902 unsigned int seconds = 60;
903 static bool __read_mostly cmos_rtc_probe;
904 boolean_param("cmos-rtc-probe", cmos_rtc_probe);
905
906 if ( efi_enabled(EFI_RS) )
907 {
908 res = efi_get_time();
909 if ( res )
910 return res;
911 }
912
913 if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) )
914 cmos_rtc_probe = false;
915 else if ( system_state < SYS_STATE_smp_boot && !cmos_rtc_probe )
916 panic("System with no CMOS RTC advertised must be booted from EFI"
917 " (or with command line option \"cmos-rtc-probe\")");
918
919 for ( ; ; )
920 {
921 s_time_t start, t1, t2;
922
923 spin_lock_irqsave(&rtc_lock, flags);
924
925 /* read RTC exactly on falling edge of update flag */
926 start = NOW();
927 do { /* may take up to 1 second... */
928 t1 = NOW() - start;
929 } while ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
930 t1 <= SECONDS(1) );
931
932 start = NOW();
933 do { /* must try at least 2.228 ms */
934 t2 = NOW() - start;
935 } while ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
936 t2 < MILLISECS(3) );
937
938 __get_cmos_time(&rtc);
939
940 spin_unlock_irqrestore(&rtc_lock, flags);
941
942 if ( likely(!cmos_rtc_probe) ||
943 t1 > SECONDS(1) || t2 >= MILLISECS(3) ||
944 rtc.sec >= 60 || rtc.min >= 60 || rtc.hour >= 24 ||
945 !rtc.day || rtc.day > 31 ||
946 !rtc.mon || rtc.mon > 12 )
947 break;
948
949 if ( seconds < 60 )
950 {
951 if ( rtc.sec != seconds )
952 cmos_rtc_probe = false;
953 break;
954 }
955
956 process_pending_softirqs();
957
958 seconds = rtc.sec;
959 }
960
961 if ( unlikely(cmos_rtc_probe) )
962 panic("No CMOS RTC found - system must be booted from EFI");
963
964 return mktime(rtc.year, rtc.mon, rtc.day, rtc.hour, rtc.min, rtc.sec);
965 }
966
967 static unsigned long get_wallclock_time(void)
968 {
969 #ifdef CONFIG_XEN_GUEST
970 if ( xen_guest )
971 {
972 struct shared_info *sh_info = XEN_shared_info;
973 uint32_t wc_version;
974 uint64_t wc_sec;
975
976 do {
977 wc_version = sh_info->wc_version & ~1;
978 smp_rmb();
979
980 wc_sec = sh_info->wc_sec;
981 smp_rmb();
982 } while ( wc_version != sh_info->wc_version );
983
984 return wc_sec + read_xen_timer() / 1000000000;
985 }
986 #endif
987
988 return get_cmos_time();
989 }
990
991 /***************************************************************************
992 * System Time
993 ***************************************************************************/
994
995 s_time_t get_s_time_fixed(u64 at_tsc)
996 {
997 const struct cpu_time *t = &this_cpu(cpu_time);
998 u64 tsc, delta;
999 s_time_t now;
1000
1001 if ( at_tsc )
1002 tsc = at_tsc;
1003 else
1004 tsc = rdtsc_ordered();
1005 delta = tsc - t->stamp.local_tsc;
1006 now = t->stamp.local_stime + scale_delta(delta, &t->tsc_scale);
1007
1008 return now;
1009 }
1010
1011 s_time_t get_s_time()
1012 {
1013 return get_s_time_fixed(0);
1014 }
1015
1016 uint64_t tsc_ticks2ns(uint64_t ticks)
1017 {
1018 struct cpu_time *t = &this_cpu(cpu_time);
1019
1020 return scale_delta(ticks, &t->tsc_scale);
1021 }
1022
1023 static void __update_vcpu_system_time(struct vcpu *v, int force)
1024 {
1025 const struct cpu_time *t;
1026 struct vcpu_time_info *u, _u = {};
1027 struct domain *d = v->domain;
1028 s_time_t tsc_stamp;
1029
1030 if ( v->vcpu_info == NULL )
1031 return;
1032
1033 t = &this_cpu(cpu_time);
1034 u = &vcpu_info(v, time);
1035
1036 if ( d->arch.vtsc )
1037 {
1038 s_time_t stime = t->stamp.local_stime;
1039
1040 if ( is_hvm_domain(d) )
1041 {
1042 struct pl_time *pl = v->domain->arch.hvm_domain.pl_time;
1043
1044 stime += pl->stime_offset + v->arch.hvm_vcpu.stime_offset;
1045 if ( stime >= 0 )
1046 tsc_stamp = gtime_to_gtsc(d, stime);
1047 else
1048 tsc_stamp = -gtime_to_gtsc(d, -stime);
1049 }
1050 else
1051 tsc_stamp = gtime_to_gtsc(d, stime);
1052
1053 _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1054 _u.tsc_shift = d->arch.vtsc_to_ns.shift;
1055 }
1056 else
1057 {
1058 if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )
1059 {
1060 tsc_stamp = hvm_scale_tsc(d, t->stamp.local_tsc);
1061 _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1062 _u.tsc_shift = d->arch.vtsc_to_ns.shift;
1063 }
1064 else
1065 {
1066 tsc_stamp = t->stamp.local_tsc;
1067 _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
1068 _u.tsc_shift = t->tsc_scale.shift;
1069 }
1070 }
1071
1072 _u.tsc_timestamp = tsc_stamp;
1073 _u.system_time = t->stamp.local_stime;
1074
1075 /*
1076 * It's expected that domains cope with this bit changing on every
1077 * pvclock read, using it to check whether they can rely solely on this
1078 * tuple or whether further monotonicity checks with other vcpus are needed.
1079 */
1080 if ( clocksource_is_tsc() )
1081 _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
1082
1083 if ( is_hvm_domain(d) )
1084 _u.tsc_timestamp += v->arch.hvm_vcpu.cache_tsc_offset;
1085
1086 /* Don't bother unless timestamp record has changed or we are forced. */
1087 _u.version = u->version; /* make versions match for memcmp test */
1088 if ( !force && !memcmp(u, &_u, sizeof(_u)) )
1089 return;
1090
1091 /* 1. Update guest kernel version. */
1092 _u.version = u->version = version_update_begin(u->version);
1093 wmb();
1094 /* 2. Update all other guest kernel fields. */
1095 *u = _u;
1096 wmb();
1097 /* 3. Update guest kernel version. */
1098 u->version = version_update_end(u->version);
1099
1100 if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) &&
1101 !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) )
1102 v->arch.pv_vcpu.pending_system_time = _u;
1103 }
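/*
 * Editor's note on the three-step version protocol above: a guest reading
 * the shared vcpu_time_info must treat an odd version as "update in
 * progress" and retry until two reads of the version match, much like
 * read_xen_timer() earlier in this file does when Xen itself runs as a
 * guest.  A guest-side sketch (hypothetical, for illustration only):
 *
 *     do {
 *         ver = u->version;
 *         smp_rmb();
 *         snapshot = *u;
 *         smp_rmb();
 *     } while ( (ver & 1) || ver != u->version );
 */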
1104
1105 bool update_secondary_system_time(struct vcpu *v,
1106 struct vcpu_time_info *u)
1107 {
1108 XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
1109 struct guest_memory_policy policy =
1110 { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false };
1111
1112 if ( guest_handle_is_null(user_u) )
1113 return true;
1114
1115 update_guest_memory_policy(v, &policy);
1116
1117 /* 1. Update userspace version. */
1118 if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) )
1119 {
1120 update_guest_memory_policy(v, &policy);
1121 return false;
1122 }
1123 wmb();
1124 /* 2. Update all other userspace fields. */
1125 __copy_to_guest(user_u, u, 1);
1126 wmb();
1127 /* 3. Update userspace version. */
1128 u->version = version_update_end(u->version);
1129 __copy_field_to_guest(user_u, u, version);
1130
1131 update_guest_memory_policy(v, &policy);
1132
1133 return true;
1134 }
1135
1136 void update_vcpu_system_time(struct vcpu *v)
1137 {
1138 __update_vcpu_system_time(v, 0);
1139 }
1140
1141 void force_update_vcpu_system_time(struct vcpu *v)
1142 {
1143 __update_vcpu_system_time(v, 1);
1144 }
1145
1146 static void update_domain_rtc(void)
1147 {
1148 struct domain *d;
1149
1150 rcu_read_lock(&domlist_read_lock);
1151
1152 for_each_domain ( d )
1153 if ( is_hvm_domain(d) )
1154 rtc_update_clock(d);
1155
1156 rcu_read_unlock(&domlist_read_lock);
1157 }
1158
1159 void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds)
1160 {
1161 d->time_offset_seconds = time_offset_seconds;
1162 if ( is_hvm_domain(d) )
1163 rtc_update_clock(d);
1164 update_domain_wallclock_time(d);
1165 }
1166
1167 int cpu_frequency_change(u64 freq)
1168 {
1169 struct cpu_time *t = &this_cpu(cpu_time);
1170 u64 curr_tsc;
1171
1172 /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
1173 if ( freq < 1000000u )
1174 {
1175 printk(XENLOG_WARNING "Rejecting CPU frequency change "
1176 "to %"PRIu64" Hz\n", freq);
1177 return -EINVAL;
1178 }
1179
1180 local_irq_disable();
1181 /* Platform time /first/, as we may be delayed by platform_timer_lock. */
1182 t->stamp.master_stime = read_platform_stime(NULL);
1183 curr_tsc = rdtsc_ordered();
1184 /* TSC-extrapolated time may be bogus after frequency change. */
1185 /*t->stamp.local_stime = get_s_time_fixed(curr_tsc);*/
1186 t->stamp.local_stime = t->stamp.master_stime;
1187 t->stamp.local_tsc = curr_tsc;
1188 set_time_scale(&t->tsc_scale, freq);
1189 local_irq_enable();
1190
1191 update_vcpu_system_time(current);
1192
1193 /* A full epoch should pass before we check for deviation. */
1194 if ( smp_processor_id() == 0 )
1195 {
1196 set_timer(&calibration_timer, NOW() + EPOCH);
1197 platform_time_calibration();
1198 }
1199
1200 return 0;
1201 }
1202
1203 /* Per-CPU communication between rendezvous IRQ and softirq handler. */
1204 static DEFINE_PER_CPU(struct cpu_time_stamp, cpu_calibration);
1205
1206 /* Softirq handler for per-CPU time calibration. */
1207 static void local_time_calibration(void)
1208 {
1209 struct cpu_time *t = &this_cpu(cpu_time);
1210 const struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1211
1212 /*
1213 * System (extrapolated from local and master oscillators) and TSC
1214 * timestamps, taken during this calibration and the previous one.
1215 */
1216 struct cpu_time_stamp prev, curr;
1217
1218 /*
1219 * System time and TSC ticks elapsed during the previous calibration
1220 * 'epoch'. These values are down-shifted to fit in 32 bits.
1221 */
1222 u64 stime_elapsed64, tsc_elapsed64;
1223 u32 stime_elapsed32, tsc_elapsed32;
1224
1225 /* Error correction to slow down a fast local clock. */
1226 u32 error_factor = 0;
1227
1228 /* Calculated TSC shift to ensure 32-bit scale multiplier. */
1229 int tsc_shift = 0;
1230
1231 /* The overall calibration scale multiplier. */
1232 u32 calibration_mul_frac;
1233
1234 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1235 {
1236 /* Atomically read cpu_calibration struct and write cpu_time struct. */
1237 local_irq_disable();
1238 t->stamp = *c;
1239 local_irq_enable();
1240 update_vcpu_system_time(current);
1241 goto out;
1242 }
1243
1244 prev = t->stamp;
1245
1246 /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
1247 local_irq_disable();
1248 curr = *c;
1249 local_irq_enable();
1250
1251 #if 0
1252 printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
1253 smp_processor_id(), prev.local_tsc, prev.local_stime, prev.master_stime);
1254 printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
1255 " -> %"PRId64"\n",
1256 smp_processor_id(), curr.local_tsc, curr.local_stime, curr.master_stime,
1257 curr.master_stime - curr.local_stime);
1258 #endif
1259
1260 /* Local time warps forward if it lags behind master time. */
1261 if ( curr.local_stime < curr.master_stime )
1262 curr.local_stime = curr.master_stime;
1263
1264 stime_elapsed64 = curr.master_stime - prev.master_stime;
1265 tsc_elapsed64 = curr.local_tsc - prev.local_tsc;
1266
1267 /*
1268 * Weirdness can happen if we lose sync with the platform timer.
1269 * We could be smarter here: resync platform timer with local timer?
1270 */
1271 if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
1272 goto out;
1273
1274 /*
1275 * Calculate error-correction factor. This only slows down a fast local
1276 * clock (slow clocks are warped forwards). The scale factor is clamped
1277 * to >= 0.5.
1278 */
1279 if ( curr.local_stime != curr.master_stime )
1280 {
1281 u64 local_stime_err = curr.local_stime - curr.master_stime;
1282
1283 if ( local_stime_err > EPOCH )
1284 local_stime_err = EPOCH;
1285 error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
1286 }
1287
1288 /*
1289 * We require 0 < stime_elapsed < 2^31.
1290 * This allows us to binary shift a 32-bit tsc_elapsed such that:
1291 * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
1292 */
1293 while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
1294 ((s32)stime_elapsed64 < 0) )
1295 {
1296 stime_elapsed64 >>= 1;
1297 tsc_elapsed64 >>= 1;
1298 }
1299
1300 /* stime_elapsed now fits in a 32-bit word. */
1301 stime_elapsed32 = (u32)stime_elapsed64;
1302
1303 /* tsc_elapsed <= 2*stime_elapsed */
1304 while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
1305 {
1306 tsc_elapsed64 >>= 1;
1307 tsc_shift--;
1308 }
1309
1310 /* Local difference must now fit in 32 bits. */
1311 ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
1312 tsc_elapsed32 = (u32)tsc_elapsed64;
1313
1314 /* tsc_elapsed > stime_elapsed */
1315 ASSERT(tsc_elapsed32 != 0);
1316 while ( tsc_elapsed32 <= stime_elapsed32 )
1317 {
1318 tsc_elapsed32 <<= 1;
1319 tsc_shift++;
1320 }
1321
1322 calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
1323 if ( error_factor != 0 )
1324 calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
1325
1326 #if 0
1327 printk("---%d: %08x %08x %d\n", smp_processor_id(),
1328 error_factor, calibration_mul_frac, tsc_shift);
1329 #endif
1330
1331 /* Record new timestamp information, atomically w.r.t. interrupts. */
1332 local_irq_disable();
1333 t->tsc_scale.mul_frac = calibration_mul_frac;
1334 t->tsc_scale.shift = tsc_shift;
1335 t->stamp = curr;
1336 local_irq_enable();
1337
1338 update_vcpu_system_time(current);
1339
1340 out:
1341 if ( smp_processor_id() == 0 )
1342 {
1343 set_timer(&calibration_timer, NOW() + EPOCH);
1344 platform_time_calibration();
1345 }
1346 }
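/*
 * Editor's worked example of the error correction above (hypothetical
 * numbers): if the local clock gained 1 ms over a 1 s epoch,
 * error_factor = EPOCH / (EPOCH + 1ms) ~= 0.999, so the new TSC scale is
 * trimmed by about 0.1% and the local clock converges back towards the
 * platform timer instead of drifting further ahead.
 */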
1347
1348 /*
1349 * TSC Reliability check
1350 */
1351
1352 /*
1353 * The Linux original version of this function is
1354 * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar
1355 */
1356 static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
1357 {
1358 static DEFINE_SPINLOCK(sync_lock);
1359 static cycles_t last_tsc;
1360
1361 cycles_t start, now, prev, end;
1362 int i;
1363
1364 start = rdtsc_ordered();
1365
1366 /* The measurement runs for 20 msecs: */
1367 end = start + tsc_khz * 20ULL;
1368 now = start;
1369
1370 for ( i = 0; ; i++ )
1371 {
1372 /*
1373 * We take the global lock, measure TSC, save the
1374 * previous TSC that was measured (possibly on
1375 * another CPU) and update the previous TSC timestamp.
1376 */
1377 spin_lock(&sync_lock);
1378 prev = last_tsc;
1379 now = rdtsc_ordered();
1380 last_tsc = now;
1381 spin_unlock(&sync_lock);
1382
1383 /*
1384 * Be nice every now and then (and also check whether measurement is
1385 * done [we also insert a 10-million-loop safety exit, so we don't
1386 * lock up in case the TSC readout is totally broken]):
1387 */
1388 if ( unlikely(!(i & 7)) )
1389 {
1390 if ( (now > end) || (i > 10000000) )
1391 break;
1392 cpu_relax();
1393 /*touch_nmi_watchdog();*/
1394 }
1395
1396 /*
1397 * Outside the critical section we can now see whether we saw a
1398 * time-warp of the TSC going backwards:
1399 */
1400 if ( unlikely(prev > now) )
1401 {
1402 spin_lock(&sync_lock);
1403 if ( *max_warp < prev - now )
1404 *max_warp = prev - now;
1405 spin_unlock(&sync_lock);
1406 }
1407 }
1408 }
1409
1410 static unsigned long tsc_max_warp, tsc_check_count;
1411 static cpumask_t tsc_check_cpumask;
1412
1413 static void tsc_check_slave(void *unused)
1414 {
1415 unsigned int cpu = smp_processor_id();
1416 local_irq_disable();
1417 while ( !cpumask_test_cpu(cpu, &tsc_check_cpumask) )
1418 cpu_relax();
1419 check_tsc_warp(cpu_khz, &tsc_max_warp);
1420 cpumask_clear_cpu(cpu, &tsc_check_cpumask);
1421 local_irq_enable();
1422 }
1423
1424 static void tsc_check_reliability(void)
1425 {
1426 unsigned int cpu = smp_processor_id();
1427 static DEFINE_SPINLOCK(lock);
1428
1429 spin_lock(&lock);
1430
1431 tsc_check_count++;
1432 smp_call_function(tsc_check_slave, NULL, 0);
1433 cpumask_andnot(&tsc_check_cpumask, &cpu_online_map, cpumask_of(cpu));
1434 local_irq_disable();
1435 check_tsc_warp(cpu_khz, &tsc_max_warp);
1436 local_irq_enable();
1437 while ( !cpumask_empty(&tsc_check_cpumask) )
1438 cpu_relax();
1439
1440 spin_unlock(&lock);
1441 }
1442
1443 /*
1444 * Rendezvous for all CPUs in IRQ context.
1445 * Master CPU snapshots the platform timer.
1446 * All CPUS snapshot their local TSC and extrapolation of system time.
1447 */
1448 struct calibration_rendezvous {
1449 cpumask_t cpu_calibration_map;
1450 atomic_t semaphore;
1451 s_time_t master_stime;
1452 u64 master_tsc_stamp;
1453 };
1454
1455 static void
1456 time_calibration_rendezvous_tail(const struct calibration_rendezvous *r)
1457 {
1458 struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1459
1460 c->local_tsc = rdtsc_ordered();
1461 c->local_stime = get_s_time_fixed(c->local_tsc);
1462 c->master_stime = r->master_stime;
1463
1464 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1465 }
1466
1467 /*
1468 * Keep TSCs in sync when they run at the same rate, but may stop in
1469 * deep-sleep C states.
1470 */
1471 static void time_calibration_tsc_rendezvous(void *_r)
1472 {
1473 int i;
1474 struct calibration_rendezvous *r = _r;
1475 unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1476
1477 /* Loop to get rid of cache effects on TSC skew. */
1478 for ( i = 4; i >= 0; i-- )
1479 {
1480 if ( smp_processor_id() == 0 )
1481 {
1482 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1483 cpu_relax();
1484
1485 if ( r->master_stime == 0 )
1486 {
1487 r->master_stime = read_platform_stime(NULL);
1488 r->master_tsc_stamp = rdtsc_ordered();
1489 }
1490 atomic_inc(&r->semaphore);
1491
1492 if ( i == 0 )
1493 write_tsc(r->master_tsc_stamp);
1494
1495 while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
1496 cpu_relax();
1497 atomic_set(&r->semaphore, 0);
1498 }
1499 else
1500 {
1501 atomic_inc(&r->semaphore);
1502 while ( atomic_read(&r->semaphore) < total_cpus )
1503 cpu_relax();
1504
1505 if ( i == 0 )
1506 write_tsc(r->master_tsc_stamp);
1507
1508 atomic_inc(&r->semaphore);
1509 while ( atomic_read(&r->semaphore) > total_cpus )
1510 cpu_relax();
1511 }
1512 }
1513
1514 time_calibration_rendezvous_tail(r);
1515 }
1516
1517 /* Ordinary rendezvous function which does not modify TSC values. */
1518 static void time_calibration_std_rendezvous(void *_r)
1519 {
1520 struct calibration_rendezvous *r = _r;
1521 unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1522
1523 if ( smp_processor_id() == 0 )
1524 {
1525 while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1526 cpu_relax();
1527 r->master_stime = read_platform_stime(NULL);
1528 smp_wmb(); /* write r->master_stime /then/ signal */
1529 atomic_inc(&r->semaphore);
1530 }
1531 else
1532 {
1533 atomic_inc(&r->semaphore);
1534 while ( atomic_read(&r->semaphore) != total_cpus )
1535 cpu_relax();
1536 smp_rmb(); /* receive signal /then/ read r->master_stime */
1537 }
1538
1539 time_calibration_rendezvous_tail(r);
1540 }
1541
1542 /*
1543 * Rendezvous function used when clocksource is TSC and
1544 * no CPU hotplug will be performed.
1545 */
1546 static void time_calibration_nop_rendezvous(void *rv)
1547 {
1548 const struct calibration_rendezvous *r = rv;
1549 struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1550
1551 c->local_tsc = r->master_tsc_stamp;
1552 c->local_stime = r->master_stime;
1553 c->master_stime = r->master_stime;
1554
1555 raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1556 }
1557
1558 static void (*time_calibration_rendezvous_fn)(void *) =
1559 time_calibration_std_rendezvous;
1560
1561 static void time_calibration(void *unused)
1562 {
1563 struct calibration_rendezvous r = {
1564 .semaphore = ATOMIC_INIT(0)
1565 };
1566
1567 if ( clocksource_is_tsc() )
1568 {
1569 local_irq_disable();
1570 r.master_stime = read_platform_stime(&r.master_tsc_stamp);
1571 local_irq_enable();
1572 }
1573
1574 cpumask_copy(&r.cpu_calibration_map, &cpu_online_map);
1575
1576 /* @wait=1 because we must wait for all cpus before freeing @r. */
1577 on_selected_cpus(&r.cpu_calibration_map,
1578 time_calibration_rendezvous_fn,
1579 &r, 1);
1580 }
1581
1582 static struct cpu_time_stamp ap_bringup_ref;
1583
1584 void time_latch_stamps(void)
1585 {
1586 unsigned long flags;
1587
1588 local_irq_save(flags);
1589 ap_bringup_ref.master_stime = read_platform_stime(NULL);
1590 ap_bringup_ref.local_tsc = rdtsc_ordered();
1591 local_irq_restore(flags);
1592
1593 ap_bringup_ref.local_stime = get_s_time_fixed(ap_bringup_ref.local_tsc);
1594 }
1595
1596 void init_percpu_time(void)
1597 {
1598 struct cpu_time *t = &this_cpu(cpu_time);
1599 unsigned long flags;
1600 u64 tsc;
1601 s_time_t now;
1602
1603 /* Initial estimate for TSC rate. */
1604 t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
1605
1606 local_irq_save(flags);
1607 now = read_platform_stime(NULL);
1608 tsc = rdtsc_ordered();
1609 local_irq_restore(flags);
1610
1611 t->stamp.master_stime = now;
1612 /*
1613 * To avoid a discontinuity (TSC and platform clock can't be expected
1614 * to be in perfect sync), initialization here needs to match up with
1615 * local_time_calibration()'s decision whether to use its fast path.
1616 */
1617 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1618 {
1619 if ( system_state < SYS_STATE_smp_boot )
1620 now = get_s_time_fixed(tsc);
1621 else
1622 now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
1623 }
1624 t->stamp.local_tsc = tsc;
1625 t->stamp.local_stime = now;
1626 }
1627
1628 /*
1629 * On certain older Intel CPUs writing the TSC MSR clears the upper 32 bits.
1630 * Obviously we must not use write_tsc() on such CPUs.
1631 *
1632 * Additionally, AMD specifies that being able to write the TSC MSR is not an
1633 * architectural feature (but, contrary to what their manual says, also cannot be
1634 * determined from CPUID bits).
1635 */
1636 static void __init tsc_check_writability(void)
1637 {
1638 const char *what = NULL;
1639 uint64_t tsc;
1640
1641 /*
1642 * If all CPUs are reported as synchronised and in sync, we never write
1643 * the TSCs (except unavoidably, when a CPU is physically hot-plugged).
1644 * Hence testing for writability is pointless and even harmful.
1645 */
1646 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1647 return;
1648
1649 tsc = rdtsc();
1650 if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 )
1651 {
1652 uint64_t tmp, tmp2 = rdtsc();
1653
1654 write_tsc(tsc | (1ULL << 32));
1655 tmp = rdtsc();
1656 if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) )
1657 what = "only partially";
1658 }
1659 else
1660 {
1661 what = "not";
1662 }
1663
1664 /* Nothing to do if the TSC is fully writable. */
1665 if ( !what )
1666 {
1667 /*
1668 * Paranoia - write back original TSC value. However, APs get synced
1669 * with BSP as they are brought up, so this doesn't much matter.
1670 */
1671 write_tsc(tsc);
1672 return;
1673 }
1674
1675 printk(XENLOG_WARNING "TSC %s writable\n", what);
1676
1677 /* time_calibration_tsc_rendezvous() must not be used */
1678 setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
1679
1680 /* cstate_restore_tsc() must not be used (or do nothing) */
1681 if ( !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
1682 cpuidle_disable_deep_cstate();
1683
1684 /* synchronize_tsc_slave() must do nothing */
1685 disable_tsc_sync = true;
1686 }
1687
1688 static void __init reset_percpu_time(void *unused)
1689 {
1690 struct cpu_time *t = &this_cpu(cpu_time);
1691
1692 t->stamp.local_tsc = boot_tsc_stamp;
1693 t->stamp.local_stime = 0;
1694 t->stamp.local_stime = get_s_time_fixed(boot_tsc_stamp);
1695 t->stamp.master_stime = t->stamp.local_stime;
1696 }
1697
1698 static void __init try_platform_timer_tail(bool late)
1699 {
1700 init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
1701 plt_overflow(NULL);
1702
1703 platform_timer_stamp = plt_stamp64;
1704 stime_platform_stamp = NOW();
1705
1706 if ( !late )
1707 init_percpu_time();
1708
1709 init_timer(&calibration_timer, time_calibration, NULL, 0);
1710 set_timer(&calibration_timer, NOW() + EPOCH);
1711 }
1712
1713 /* Late init function, after all cpus have booted */
1714 static int __init verify_tsc_reliability(void)
1715 {
1716 if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1717 {
1718 /*
1719 * Sadly, despite processor vendors' best design guidance efforts, on
1720 * some systems, cpus may come out of reset improperly synchronized.
1721 * So we must verify there is no warp and we can't do that until all
1722 * CPUs are booted.
1723 */
1724 tsc_check_reliability();
1725 if ( tsc_max_warp )
1726 {
1727 printk("TSC warp detected, disabling TSC_RELIABLE\n");
1728 setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1729 }
1730 else if ( !strcmp(opt_clocksource, "tsc") &&
1731 (try_platform_timer(&plt_tsc) > 0) )
1732 {
1733 /*
1734 * Platform timer has changed and CPU time will only be updated
1735 * after we set again the calibration timer, which means we need to
1736 * seed again each local CPU time. At this stage TSC is known to be
1737 * reliable i.e. monotonically increasing across all CPUs so this
1738 * lets us remove the skew between platform timer and TSC, since
1739 * these are now effectively the same.
1740 */
1741 on_selected_cpus(&cpu_online_map, reset_percpu_time, NULL, 1);
1742
1743 /*
1744 * We won't do CPU Hotplug and TSC clocksource is being used which
1745 * means we have a reliable TSC, plus we don't sync with any other
1746 * clocksource so no need for rendezvous.
1747 */
1748 time_calibration_rendezvous_fn = time_calibration_nop_rendezvous;
1749
1750 /* Finish platform timer switch. */
1751 try_platform_timer_tail(true);
1752
1753 printk("Switched to Platform timer %s TSC\n",
1754 freq_string(plt_src.frequency));
1755 return 0;
1756 }
1757 }
1758
1759 /*
1760 * Re-run the TSC writability check if it didn't run to completion, as
1761 * X86_FEATURE_TSC_RELIABLE may have been cleared by now. This is needed
1762 * for determining which rendezvous function to use (below).
1763 */
1764 if ( !disable_tsc_sync )
1765 tsc_check_writability();
1766
1767 /*
1768 * While with constant-rate TSCs the scale factor can be shared, when TSCs
1769 * are not marked as 'reliable', re-sync during rendezvous.
1770 */
1771 if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
1772 !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1773 time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
1774
1775 return 0;
1776 }
1777 __initcall(verify_tsc_reliability);
1778
1779 /* Late init function (after interrupts are enabled). */
1780 int __init init_xen_time(void)
1781 {
1782 tsc_check_writability();
1783
1784 open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);
1785
1786 /* NB. get_wallclock_time() can take over one second to execute. */
1787 do_settime(get_wallclock_time(), 0, NOW());
1788
1789 /* Finish platform timer initialization. */
1790 try_platform_timer_tail(false);
1791
1792 return 0;
1793 }
1794
1795
1796 /* Early init function. */
1797 void __init early_time_init(void)
1798 {
1799 struct cpu_time *t = &this_cpu(cpu_time);
1800 u64 tmp;
1801
1802 preinit_pit();
1803 tmp = init_platform_timer();
1804 plt_tsc.frequency = tmp;
1805
1806 set_time_scale(&t->tsc_scale, tmp);
1807 t->stamp.local_tsc = boot_tsc_stamp;
1808
1809 do_div(tmp, 1000);
1810 cpu_khz = (unsigned long)tmp;
1811 printk("Detected %lu.%03lu MHz processor.\n",
1812 cpu_khz / 1000, cpu_khz % 1000);
1813
1814 setup_irq(0, 0, &irq0);
1815 }
1816
1817 /* keep pit enabled for pit_broadcast working while cpuidle enabled */
1818 static int _disable_pit_irq(void(*hpet_broadcast_setup)(void))
1819 {
1820 int ret = 1;
1821
1822 if ( using_pit || !cpu_has_apic )
1823 return -1;
1824
1825 /*
1826 * If we do not rely on PIT CH0 then we can use HPET for one-shot timer
1827 * emulation when entering deep C states.
1828 * XXX dom0 may rely on RTC interrupt delivery, so only enable
1829 * hpet_broadcast if FSB mode available or if force_hpet_broadcast.
1830 */
1831 if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) )
1832 {
1833 hpet_broadcast_setup();
1834 if ( !hpet_broadcast_is_available() )
1835 {
1836 if ( xen_cpuidle > 0 )
1837 {
1838 printk("%ps() failed, turning to PIT broadcast\n",
1839 hpet_broadcast_setup);
1840 return -1;
1841 }
1842 ret = 0;
1843 }
1844 }
1845
1846 /* Disable PIT CH0 timer interrupt. */
1847 outb_p(0x30, PIT_MODE);
1848 outb_p(0, PIT_CH0);
1849 outb_p(0, PIT_CH0);
1850
1851 return ret;
1852 }
1853
1854 static int __init disable_pit_irq(void)
1855 {
1856 if ( !_disable_pit_irq(hpet_broadcast_init) )
1857 {
1858 xen_cpuidle = 0;
1859 printk("CPUIDLE: disabled due to no HPET. "
1860 "Force enable with 'cpuidle'.\n");
1861 }
1862
1863 return 0;
1864 }
1865 __initcall(disable_pit_irq);
1866
1867 void pit_broadcast_enter(void)
1868 {
1869 cpumask_set_cpu(smp_processor_id(), &pit_broadcast_mask);
1870 }
1871
1872 void pit_broadcast_exit(void)
1873 {
1874 int cpu = smp_processor_id();
1875
1876 if ( cpumask_test_and_clear_cpu(cpu, &pit_broadcast_mask) )
1877 reprogram_timer(this_cpu(timer_deadline));
1878 }
1879
1880 int pit_broadcast_is_available(void)
1881 {
1882 return cpuidle_using_deep_cstate();
1883 }
1884
1885 void send_timer_event(struct vcpu *v)
1886 {
1887 send_guest_vcpu_virq(v, VIRQ_TIMER);
1888 }
1889
1890 /* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
1891 static long cmos_utc_offset; /* in seconds */
1892
1893 int time_suspend(void)
1894 {
1895 if ( smp_processor_id() == 0 )
1896 {
1897 cmos_utc_offset = -get_wallclock_time();
1898 cmos_utc_offset += get_sec();
1899 kill_timer(&calibration_timer);
1900
1901 /* Sync platform timer stamps. */
1902 platform_time_calibration();
1903 }
1904
1905 /* Better to cancel calibration timer for accuracy. */
1906 clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));
1907
1908 return 0;
1909 }
1910
1911 int time_resume(void)
1912 {
1913 preinit_pit();
1914
1915 resume_platform_timer();
1916
1917 if ( !_disable_pit_irq(hpet_broadcast_resume) )
1918 BUG();
1919
1920 init_percpu_time();
1921
1922 set_timer(&calibration_timer, NOW() + EPOCH);
1923
1924 do_settime(get_wallclock_time() + cmos_utc_offset, 0, NOW());
1925
1926 update_vcpu_system_time(current);
1927
1928 update_domain_rtc();
1929
1930 return 0;
1931 }
1932
1933 int hwdom_pit_access(struct ioreq *ioreq)
1934 {
1935 /* Is Xen using Channel 2? Then disallow direct dom0 access. */
1936 if ( using_pit )
1937 return 0;
1938
1939 switch ( ioreq->addr )
1940 {
1941 case PIT_CH2:
1942 if ( ioreq->dir == IOREQ_READ )
1943 ioreq->data = inb(PIT_CH2);
1944 else
1945 outb(ioreq->data, PIT_CH2);
1946 return 1;
1947
1948 case PIT_MODE:
1949 if ( ioreq->dir == IOREQ_READ )
1950 return 0; /* urk! */
1951 switch ( ioreq->data & 0xc0 )
1952 {
1953 case 0xc0: /* Read Back */
1954 if ( ioreq->data & 0x08 ) /* Select Channel 2? */
1955 outb(ioreq->data & 0xf8, PIT_MODE);
1956 if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
1957 return 1; /* no - we're done */
1958 /* Filter Channel 2 and reserved bit 0. */
1959 ioreq->data &= ~0x09;
1960 return 0; /* emulate ch0/1 readback */
1961 case 0x80: /* Select Counter 2 */
1962 outb(ioreq->data, PIT_MODE);
1963 return 1;
1964 }
1965 break;
1966
1967 case 0x61:
        if ( ioreq->dir == IOREQ_READ )
            ioreq->data = inb(0x61);
        else
            outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
        return 1;
    }

    return 0;
}

/*
 * PV SoftTSC Emulation.
 */

/*
 * tsc=unstable: Override all tests; assume TSC is unreliable.
 * tsc=skewed: Assume TSCs are individually reliable, but skewed across CPUs.
 * tsc=stable:socket: Assume TSCs are reliable across sockets.
 */
static int __init tsc_parse(const char *s)
{
    if ( !strcmp(s, "unstable") )
    {
        setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
        setup_clear_cpu_cap(X86_FEATURE_NONSTOP_TSC);
        setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
    }
    else if ( !strcmp(s, "skewed") )
        setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
    else if ( !strcmp(s, "stable:socket") )
        tsc_flags |= TSC_RELIABLE_SOCKET;
    else
        return -EINVAL;

    return 0;
}
custom_param("tsc", tsc_parse);

u64 gtime_to_gtsc(struct domain *d, u64 time)
{
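    /*
     * Convert guest time (in ns) to a guest TSC value.  For PV, guest time
     * is measured from vtsc_offset, so subtract that first; the result is
     * scaled by ns_to_vtsc, the reciprocal of vtsc_to_ns.  E.g. for a guest
     * with tsc_khz == 2000000 (2 GHz), 1000 ns of guest time corresponds to
     * roughly 2000 guest TSC ticks.
     */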
    if ( !is_hvm_domain(d) )
    {
        if ( time < d->arch.vtsc_offset )
            return -scale_delta(d->arch.vtsc_offset - time,
                                &d->arch.ns_to_vtsc);
        time -= d->arch.vtsc_offset;
    }
    return scale_delta(time, &d->arch.ns_to_vtsc);
}

u64 gtsc_to_gtime(struct domain *d, u64 tsc)
{
    u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns);

    if ( !is_hvm_domain(d) )
        time += d->arch.vtsc_offset;
    return time;
}

void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp)
{
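    /*
     * Emulate RDTSC(P) for a PV guest: convert the current system time to a
     * guest TSC value, keeping results strictly monotonic across vCPUs via
     * vtsc_last, and return it split across EDX:EAX as the hardware
     * instruction would.
     */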
    s_time_t now = get_s_time();
    struct domain *d = v->domain;

    spin_lock(&d->arch.vtsc_lock);

#if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS)
    if ( guest_kernel_mode(v, regs) )
        d->arch.vtsc_kerncount++;
    else
        d->arch.vtsc_usercount++;
#endif

    if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
        d->arch.vtsc_last = now;
    else
        now = ++d->arch.vtsc_last;

    spin_unlock(&d->arch.vtsc_lock);

    msr_split(regs, gtime_to_gtsc(d, now));

    if ( rdtscp )
        regs->rcx =
            (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) ? d->arch.incarnation : 0;
}

bool clocksource_is_tsc(void)
{
    return plt_src.read_counter == read_tsc;
}

int host_tsc_is_safe(void)
{
    return boot_cpu_has(X86_FEATURE_TSC_RELIABLE);
}

/*
 * Called to collect tsc-related data only for save file or live migrate;
 * called after the last rdtsc is done on this incarnation.
 */
void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
                  uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
                  uint32_t *incarnation)
{
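    /*
     * Note the case label nested inside the conditional below:
     * TSC_MODE_ALWAYS_EMULATE shares the emulated-TSC branch of
     * TSC_MODE_DEFAULT, so both report elapsed emulated nanoseconds and the
     * guest's tsc_khz.
     */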
    bool enable_tsc_scaling = is_hvm_domain(d) &&
                              hvm_tsc_scaling_supported && !d->arch.vtsc;

    *incarnation = d->arch.incarnation;
    *tsc_mode = d->arch.tsc_mode;

    switch ( *tsc_mode )
    {
        uint64_t tsc;

    case TSC_MODE_NEVER_EMULATE:
        *elapsed_nsec = *gtsc_khz = 0;
        break;
    case TSC_MODE_DEFAULT:
        if ( d->arch.vtsc )
        {
    case TSC_MODE_ALWAYS_EMULATE:
            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
            *gtsc_khz = d->arch.tsc_khz;
            break;
        }
        tsc = rdtsc();
        *elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns);
        *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz : cpu_khz;
        break;
    case TSC_MODE_PVRDTSCP:
        if ( d->arch.vtsc )
        {
            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
            *gtsc_khz = cpu_khz;
        }
        else
        {
            tsc = rdtsc();
            *elapsed_nsec = scale_delta(tsc, &this_cpu(cpu_time).tsc_scale) -
                            d->arch.vtsc_offset;
            *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz
                                           : 0 /* ignored by tsc_set_info */;
        }
        break;
    }

    if ( (int64_t)*elapsed_nsec < 0 )
        *elapsed_nsec = 0;
}

/*
 * This may be called as many as three times for a domain, once when the
 * hypervisor creates the domain, once when the toolstack creates the
 * domain and, if restoring/migrating, once when saved/migrated values
 * are restored.  Care must be taken that, if multiple calls occur, only
 * the last one sticks, and that all of them complete before the guest
 * executes an rdtsc instruction.
 */
void tsc_set_info(struct domain *d,
                  uint32_t tsc_mode, uint64_t elapsed_nsec,
                  uint32_t gtsc_khz, uint32_t incarnation)
{
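    /*
     * As in tsc_get_info(), a case label (TSC_MODE_NEVER_EMULATE) is nested
     * inside the conditional below: default-mode domains whose frequency can
     * be satisfied by the native (possibly scaled) TSC take that path and
     * run with vtsc disabled.
     */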
    if ( is_idle_domain(d) || is_hardware_domain(d) )
    {
        d->arch.vtsc = 0;
        return;
    }

    switch ( d->arch.tsc_mode = tsc_mode )
    {
        bool enable_tsc_scaling;

    case TSC_MODE_DEFAULT:
    case TSC_MODE_ALWAYS_EMULATE:
        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
        d->arch.tsc_khz = gtsc_khz ?: cpu_khz;
        set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);

        /*
         * In default mode use native TSC if the host has safe TSC and
         * host and guest frequencies are the same (either "naturally" or
         * - for HVM/PVH - via TSC scaling).
         * When a guest is created, gtsc_khz is passed in as zero, making
         * d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation.
         */
        if ( tsc_mode == TSC_MODE_DEFAULT && host_tsc_is_safe() &&
             (d->arch.tsc_khz == cpu_khz ||
              (is_hvm_domain(d) &&
               hvm_get_tsc_scaling_ratio(d->arch.tsc_khz))) )
        {
    case TSC_MODE_NEVER_EMULATE:
            d->arch.vtsc = 0;
            break;
        }
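        /*
         * Otherwise fall back to full emulation: guest RDTSC is intercepted
         * (pv_soft_rdtsc() for PV, RDTSC exiting for HVM below) and serviced
         * using the ns_to_vtsc scale computed here.
         */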
        d->arch.vtsc = 1;
        d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
        break;
    case TSC_MODE_PVRDTSCP:
        d->arch.vtsc = !boot_cpu_has(X86_FEATURE_RDTSCP) ||
                       !host_tsc_is_safe();
        enable_tsc_scaling = is_hvm_domain(d) && !d->arch.vtsc &&
                             hvm_get_tsc_scaling_ratio(gtsc_khz ?: cpu_khz);
        d->arch.tsc_khz = (enable_tsc_scaling && gtsc_khz) ? gtsc_khz : cpu_khz;
        set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);
        d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
        if ( d->arch.vtsc )
            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
        else
        {
            /*
             * When using the native TSC, the offset is in nanoseconds
             * relative to power-on of the physical machine.
             */
            d->arch.vtsc_offset = scale_delta(rdtsc(),
                                              &this_cpu(cpu_time).tsc_scale) -
                                  elapsed_nsec;
        }
        break;
    }
    d->arch.incarnation = incarnation + 1;
    if ( is_hvm_domain(d) )
    {
        if ( hvm_tsc_scaling_supported && !d->arch.vtsc )
            d->arch.hvm_domain.tsc_scaling_ratio =
                hvm_get_tsc_scaling_ratio(d->arch.tsc_khz);

        hvm_set_rdtsc_exiting(d, d->arch.vtsc);
        if ( d->vcpu && d->vcpu[0] && incarnation == 0 )
        {
            /*
             * set_tsc_offset() is called from hvm_vcpu_initialise() before
             * tsc_set_info(). New vtsc mode may require recomputing TSC
             * offset.
             * We only need to do this for BSP during initial boot. APs will
             * call set_tsc_offset() later from hvm_vcpu_reset_state() and they
             * will sync their TSC to BSP's sync_tsc.
             */
            d->arch.hvm_domain.sync_tsc = rdtsc();
            hvm_funcs.set_tsc_offset(d->vcpu[0],
                                     d->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset,
                                     d->arch.hvm_domain.sync_tsc);
        }
    }

    recalculate_cpuid_policy(d);
}

/* vtsc may incur measurable performance degradation, diagnose with this. */
static void dump_softtsc(unsigned char key)
{
    struct domain *d;
    int domcnt = 0;

    tsc_check_reliability();
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
        printk("TSC marked as reliable, "
               "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
    {
        printk("TSC has constant rate, ");
        if ( max_cstate <= 2 && tsc_max_warp == 0 )
            printk("no deep Cstates, passed warp test, deemed reliable, ");
        else
            printk("deep Cstates possible, so not reliable, ");
        printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
    }
    else
        printk("TSC not marked as either constant or reliable, "
               "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
    for_each_domain ( d )
    {
        if ( is_hardware_domain(d) && d->arch.tsc_mode == TSC_MODE_DEFAULT )
            continue;
        printk("dom%u%s: mode=%d", d->domain_id,
               is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
        if ( d->arch.vtsc_offset )
            printk(",ofs=%#"PRIx64, d->arch.vtsc_offset);
        if ( d->arch.tsc_khz )
            printk(",khz=%"PRIu32, d->arch.tsc_khz);
        if ( d->arch.incarnation )
            printk(",inc=%"PRIu32, d->arch.incarnation);
#if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS)
        if ( d->arch.vtsc_kerncount | d->arch.vtsc_usercount )
            printk(",vtsc count: %"PRIu64" kernel,%"PRIu64" user",
                   d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
#endif
        printk("\n");
        domcnt++;
    }

    if ( !domcnt )
        printk("No domains have emulated TSC\n");
}

static int __init setup_dump_softtsc(void)
{
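    /* Reachable at runtime via the debug-key interface, e.g. "xl debug-keys s". */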
    register_keyhandler('s', dump_softtsc, "dump softtsc stats", 1);
    return 0;
}
__initcall(setup_dump_softtsc);

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */