1 /******************************************************************************
2  * arch/x86/time.c
3  *
4  * Per-CPU time calibration and management.
5  *
6  * Copyright (c) 2002-2005, K A Fraser
7  *
8  * Portions from Linux are:
9  * Copyright (c) 1991, 1992, 1995  Linus Torvalds
10  */
11 
12 #include <xen/errno.h>
13 #include <xen/event.h>
14 #include <xen/sched.h>
15 #include <xen/lib.h>
16 #include <xen/init.h>
17 #include <xen/time.h>
18 #include <xen/timer.h>
19 #include <xen/smp.h>
20 #include <xen/irq.h>
21 #include <xen/softirq.h>
22 #include <xen/efi.h>
23 #include <xen/cpuidle.h>
24 #include <xen/symbols.h>
25 #include <xen/keyhandler.h>
26 #include <xen/guest_access.h>
27 #include <asm/io.h>
28 #include <asm/msr.h>
29 #include <asm/mpspec.h>
30 #include <asm/processor.h>
31 #include <asm/fixmap.h>
32 #include <asm/guest.h>
33 #include <asm/mc146818rtc.h>
34 #include <asm/div64.h>
35 #include <asm/acpi.h>
36 #include <asm/hpet.h>
37 #include <io_ports.h>
38 #include <asm/setup.h> /* for early_time_init */
39 #include <public/arch-x86/cpuid.h>
40 
41 /* opt_clocksource: Force clocksource to one of: pit, hpet, acpi, tsc. */
42 static char __initdata opt_clocksource[10];
43 string_param("clocksource", opt_clocksource);
44 
45 unsigned long __read_mostly cpu_khz;  /* CPU clock frequency in kHz. */
46 DEFINE_SPINLOCK(rtc_lock);
47 unsigned long pit0_ticks;
48 
49 struct cpu_time_stamp {
50     u64 local_tsc;
51     s_time_t local_stime;
52     s_time_t master_stime;
53 };
54 
55 struct cpu_time {
56     struct cpu_time_stamp stamp;
57     struct time_scale tsc_scale;
58 };
59 
60 struct platform_timesource {
61     char *id;
62     char *name;
63     u64 frequency;
64     u64 (*read_counter)(void);
65     s64 (*init)(struct platform_timesource *);
66     void (*resume)(struct platform_timesource *);
67     int counter_bits;
68 };
69 
70 static DEFINE_PER_CPU(struct cpu_time, cpu_time);
71 
72 /* Calibrate all CPUs to platform timer every EPOCH. */
73 #define EPOCH MILLISECS(1000)
74 static struct timer calibration_timer;
75 
76 /*
77  * We simulate a 32-bit platform timer from the 16-bit PIT ch2 counter.
78  * Otherwise overflow happens too quickly (~50ms) for us to guarantee that
79  * softirq handling will happen in time.
80  *
81  * The pit_lock protects the 16- and 32-bit stamp fields as well as the
82  * PIT hardware accesses used to update them.  */
83 static DEFINE_SPINLOCK(pit_lock);
84 static u16 pit_stamp16;
85 static u32 pit_stamp32;
86 static bool __read_mostly using_pit;
87 
88 /* Boot timestamp, filled in head.S */
89 u64 __initdata boot_tsc_stamp;
90 
91 /*
92  * 32-bit division of integer dividend and integer divisor yielding
93  * 32-bit fractional quotient.
94  */
95 static inline u32 div_frac(u32 dividend, u32 divisor)
96 {
97     u32 quotient, remainder;
98     ASSERT(dividend < divisor);
99     asm (
100         "divl %4"
101         : "=a" (quotient), "=d" (remainder)
102         : "0" (0), "1" (dividend), "r" (divisor) );
103     return quotient;
104 }
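
/*
 * Worked example: div_frac(1, 4) computes (1 << 32) / 4 = 0x40000000,
 * i.e. 0.25 expressed as a 0.32 fixed-point fraction.
 */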
105 
106 /*
107  * 32-bit multiplication of multiplicand and fractional multiplier
108  * yielding 32-bit product (radix point at same position as in multiplicand).
109  */
110 static inline u32 mul_frac(u32 multiplicand, u32 multiplier)
111 {
112     u32 product_int, product_frac;
113     asm (
114         "mul %3"
115         : "=a" (product_frac), "=d" (product_int)
116         : "0" (multiplicand), "r" (multiplier) );
117     return product_int;
118 }
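
/*
 * Worked example: mul_frac(1000, 0x80000000) multiplies 1000 by the
 * 0.32 fixed-point fraction 0.5 and returns the integer part, i.e. 500.
 */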
119 
120 /*
121  * Scale a 64-bit delta by shifting and multiplying by a 32-bit fraction,
122  * yielding a 64-bit result.
123  */
124 u64 scale_delta(u64 delta, const struct time_scale *scale)
125 {
126     u64 product;
127 
128     if ( scale->shift < 0 )
129         delta >>= -scale->shift;
130     else
131         delta <<= scale->shift;
132 
133     asm (
134         "mulq %2 ; shrd $32,%1,%0"
135         : "=a" (product), "=d" (delta)
136         : "rm" (delta), "0" ((u64)scale->mul_frac) );
137 
138     return product;
139 }
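
/*
 * Worked example: for a 2GHz TSC, set_time_scale() below produces
 * { .mul_frac = 0x80000000, .shift = 0 }, so scale_delta(delta, scale)
 * evaluates to delta / 2 -- 2,000,000,000 ticks map to 1,000,000,000 ns.
 */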
140 
141 #define _TS_MUL_FRAC_IDENTITY 0x80000000UL
142 
143 /* Compute the reciprocal of the given time_scale. */
144 static inline struct time_scale scale_reciprocal(struct time_scale scale)
145 {
146     struct time_scale reciprocal;
147     u32 dividend;
148 
149     ASSERT(scale.mul_frac != 0);
150     dividend = _TS_MUL_FRAC_IDENTITY;
151     reciprocal.shift = 1 - scale.shift;
152     while ( unlikely(dividend >= scale.mul_frac) )
153     {
154         dividend >>= 1;
155         reciprocal.shift++;
156     }
157 
158     asm (
159         "divl %4"
160         : "=a" (reciprocal.mul_frac), "=d" (dividend)
161         : "0" (0), "1" (dividend), "r" (scale.mul_frac) );
162 
163     return reciprocal;
164 }
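
/*
 * Worked example: the reciprocal of the 2GHz scale above
 * ({ .mul_frac = 0x80000000, .shift = 0 }) is
 * { .mul_frac = 0x80000000, .shift = 2 }, so scaling N ns by it yields
 * (N << 2) / 2 = 2*N TSC ticks, the inverse conversion.
 */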
165 
166 /*
167  * cpu_mask denoting the CPUs that need the timer interrupt delivered as
168  * IPIs in place of their local APIC timers
169  */
170 static cpumask_t pit_broadcast_mask;
171 
172 static void smp_send_timer_broadcast_ipi(void)
173 {
174     int cpu = smp_processor_id();
175     cpumask_t mask;
176 
177     cpumask_and(&mask, &cpu_online_map, &pit_broadcast_mask);
178 
179     if ( cpumask_test_cpu(cpu, &mask) )
180     {
181         __cpumask_clear_cpu(cpu, &mask);
182         raise_softirq(TIMER_SOFTIRQ);
183     }
184 
185     if ( !cpumask_empty(&mask) )
186     {
187         cpumask_raise_softirq(&mask, TIMER_SOFTIRQ);
188     }
189 }
190 
191 static void timer_interrupt(int irq, void *dev_id, struct cpu_user_regs *regs)
192 {
193     ASSERT(local_irq_is_enabled());
194 
195     if ( hpet_legacy_irq_tick() )
196         return;
197 
198     /* Only for start-of-day interrupt tests in io_apic.c. */
199     pit0_ticks++;
200 
201     /* Rough hack to allow accurate timers to sort-of-work with no APIC. */
202     if ( !cpu_has_apic )
203         raise_softirq(TIMER_SOFTIRQ);
204 
205     if ( xen_cpuidle )
206         smp_send_timer_broadcast_ipi();
207 
208     /* Emulate a 32-bit PIT counter. */
209     if ( using_pit )
210     {
211         u16 count;
212 
213         spin_lock_irq(&pit_lock);
214 
215         outb(0x80, PIT_MODE);
216         count  = inb(PIT_CH2);
217         count |= inb(PIT_CH2) << 8;
218 
219         pit_stamp32 += (u16)(pit_stamp16 - count);
220         pit_stamp16 = count;
221 
222         spin_unlock_irq(&pit_lock);
223     }
224 }
225 
226 static struct irqaction __read_mostly irq0 = {
227     timer_interrupt, "timer", NULL
228 };
229 
230 #define CLOCK_TICK_RATE 1193182 /* system crystal frequency (Hz) */
231 #define CALIBRATE_FRAC  20      /* calibrate over 50ms */
232 #define CALIBRATE_VALUE(freq) (((freq) + CALIBRATE_FRAC / 2) / CALIBRATE_FRAC)
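/*
 * e.g. CALIBRATE_VALUE(CLOCK_TICK_RATE) = (1193182 + 10) / 20 = 59659
 * PIT ticks, i.e. roughly 50ms of counting at the PIT crystal frequency.
 */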
233 
234 static void preinit_pit(void)
235 {
236     /* Set PIT channel 0 to HZ Hz. */
237 #define LATCH (((CLOCK_TICK_RATE)+(HZ/2))/HZ)
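    /*
     * e.g. assuming HZ == 100, LATCH = (1193182 + 50) / 100 = 11932, so
     * channel 0 reloads every ~10ms (1193182 / 11932 ~= 100Hz).
     */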
238     outb_p(0x34, PIT_MODE);        /* binary, mode 2, LSB/MSB, ch 0 */
239     outb_p(LATCH & 0xff, PIT_CH0); /* LSB */
240     outb(LATCH >> 8, PIT_CH0);     /* MSB */
241 #undef LATCH
242 }
243 
244 void set_time_scale(struct time_scale *ts, u64 ticks_per_sec)
245 {
246     u64 tps64 = ticks_per_sec;
247     u32 tps32;
248     int shift = 0;
249 
250     ASSERT(tps64 != 0);
251 
252     while ( tps64 > (MILLISECS(1000)*2) )
253     {
254         tps64 >>= 1;
255         shift--;
256     }
257 
258     tps32 = (u32)tps64;
259     while ( tps32 <= (u32)MILLISECS(1000) )
260     {
261         tps32 <<= 1;
262         shift++;
263     }
264 
265     ts->mul_frac = div_frac(MILLISECS(1000), tps32);
266     ts->shift    = shift;
267 }
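
/*
 * Worked example: set_time_scale(ts, 3000000000) (a 3GHz clock) shifts
 * tps down to 1.5e9 (shift = -1) and sets mul_frac = (1e9 << 32) / 1.5e9
 * ~= 0xAAAAAAAA, so scale_delta() then computes (ticks >> 1) * 2/3,
 * i.e. ticks / 3 -- nanoseconds at 3 ticks per ns.
 */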
268 
269 static char *freq_string(u64 freq)
270 {
271     static char s[20];
272     unsigned int x, y;
273     y = (unsigned int)do_div(freq, 1000000) / 1000;
274     x = (unsigned int)freq;
275     snprintf(s, sizeof(s), "%u.%03uMHz", x, y);
276     return s;
277 }
278 
279 /************************************************************
280  * PLATFORM TIMER 1: PROGRAMMABLE INTERVAL TIMER (LEGACY PIT)
281  */
282 
283 static u64 read_pit_count(void)
284 {
285     u16 count16;
286     u32 count32;
287     unsigned long flags;
288 
289     spin_lock_irqsave(&pit_lock, flags);
290 
291     outb(0x80, PIT_MODE);
292     count16  = inb(PIT_CH2);
293     count16 |= inb(PIT_CH2) << 8;
294 
295     count32 = pit_stamp32 + (u16)(pit_stamp16 - count16);
296 
297     spin_unlock_irqrestore(&pit_lock, flags);
298 
299     return count32;
300 }
301 
302 static s64 __init init_pit(struct platform_timesource *pts)
303 {
304     u8 portb = inb(0x61);
305     u64 start, end;
306     unsigned long count;
307 
308     using_pit = true;
309 
310     /* Set the Gate high, disable speaker. */
311     outb((portb & ~0x02) | 0x01, 0x61);
312 
313     /*
314      * Now let's take care of CTC channel 2: mode 0, (interrupt on
315      * terminal count mode), binary count, load CALIBRATE_LATCH count,
316      * (LSB and MSB) to begin countdown.
317      */
318 #define CALIBRATE_LATCH CALIBRATE_VALUE(CLOCK_TICK_RATE)
319     outb(0xb0, PIT_MODE);                  /* binary, mode 0, LSB/MSB, Ch 2 */
320     outb(CALIBRATE_LATCH & 0xff, PIT_CH2); /* LSB of count */
321     outb(CALIBRATE_LATCH >> 8, PIT_CH2);   /* MSB of count */
322 #undef CALIBRATE_LATCH
323 
324     start = rdtsc_ordered();
325     for ( count = 0; !(inb(0x61) & 0x20); ++count )
326         continue;
327     end = rdtsc_ordered();
328 
329     /* Set the Gate low, disable speaker. */
330     outb(portb & ~0x03, 0x61);
331 
332     /* Error if the CTC doesn't behave itself. */
333     if ( count == 0 )
334         return 0;
335 
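    /*
     * Channel 2 counted down CALIBRATE_LATCH PIT ticks above, i.e. roughly
     * 1/CALIBRATE_FRAC of a second, so scaling the measured TSC delta by
     * CALIBRATE_FRAC estimates the TSC frequency in Hz.
     */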
336     return (end - start) * CALIBRATE_FRAC;
337 }
338 
339 static void resume_pit(struct platform_timesource *pts)
340 {
341     /* Set CTC channel 2 to mode 0 again; initial value does not matter. */
342     outb(0xb0, PIT_MODE); /* binary, mode 0, LSB/MSB, Ch 2 */
343     outb(0, PIT_CH2);     /* LSB of count */
344     outb(0, PIT_CH2);     /* MSB of count */
345 }
346 
347 static struct platform_timesource __initdata plt_pit =
348 {
349     .id = "pit",
350     .name = "PIT",
351     .frequency = CLOCK_TICK_RATE,
352     .read_counter = read_pit_count,
353     .counter_bits = 32,
354     .init = init_pit,
355     .resume = resume_pit,
356 };
357 
358 /************************************************************
359  * PLATFORM TIMER 2: HIGH PRECISION EVENT TIMER (HPET)
360  */
361 
362 static u64 read_hpet_count(void)
363 {
364     return hpet_read32(HPET_COUNTER);
365 }
366 
367 static s64 __init init_hpet(struct platform_timesource *pts)
368 {
369     u64 hpet_rate = hpet_setup(), start;
370     u32 count, target;
371 
372     if ( hpet_rate == 0 )
373         return 0;
374 
375     pts->frequency = hpet_rate;
376 
377     count = hpet_read32(HPET_COUNTER);
378     start = rdtsc_ordered();
379     target = count + CALIBRATE_VALUE(hpet_rate);
380     if ( target < count )
381         while ( hpet_read32(HPET_COUNTER) >= count )
382             continue;
383     while ( hpet_read32(HPET_COUNTER) < target )
384         continue;
385 
386     return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
387 }
388 
389 static void resume_hpet(struct platform_timesource *pts)
390 {
391     hpet_resume(NULL);
392 }
393 
394 static struct platform_timesource __initdata plt_hpet =
395 {
396     .id = "hpet",
397     .name = "HPET",
398     .read_counter = read_hpet_count,
399     .counter_bits = 32,
400     .init = init_hpet,
401     .resume = resume_hpet
402 };
403 
404 /************************************************************
405  * PLATFORM TIMER 3: ACPI PM TIMER
406  */
407 
408 u32 __read_mostly pmtmr_ioport;
409 unsigned int __initdata pmtmr_width;
410 
411 /* ACPI PM timer ticks at 3.579545 MHz. */
412 #define ACPI_PM_FREQUENCY 3579545
413 
414 static u64 read_pmtimer_count(void)
415 {
416     return inl(pmtmr_ioport);
417 }
418 
419 static s64 __init init_pmtimer(struct platform_timesource *pts)
420 {
421     u64 start;
422     u32 count, target, mask = 0xffffff;
423 
424     if ( !pmtmr_ioport || !pmtmr_width )
425         return 0;
426 
427     if ( pmtmr_width == 32 )
428     {
429         pts->counter_bits = 32;
430         mask = 0xffffffff;
431     }
432 
433     count = inl(pmtmr_ioport) & mask;
434     start = rdtsc_ordered();
435     target = count + CALIBRATE_VALUE(ACPI_PM_FREQUENCY);
436     if ( target < count )
437         while ( (inl(pmtmr_ioport) & mask) >= count )
438             continue;
439     while ( (inl(pmtmr_ioport) & mask) < target )
440         continue;
441 
442     return (rdtsc_ordered() - start) * CALIBRATE_FRAC;
443 }
444 
445 static struct platform_timesource __initdata plt_pmtimer =
446 {
447     .id = "acpi",
448     .name = "ACPI PM Timer",
449     .frequency = ACPI_PM_FREQUENCY,
450     .read_counter = read_pmtimer_count,
451     .counter_bits = 24,
452     .init = init_pmtimer
453 };
454 
455 static struct time_scale __read_mostly pmt_scale;
456 static struct time_scale __read_mostly pmt_scale_r;
457 
458 static __init int init_pmtmr_scale(void)
459 {
460     set_time_scale(&pmt_scale, ACPI_PM_FREQUENCY);
461     pmt_scale_r = scale_reciprocal(pmt_scale);
462     return 0;
463 }
464 __initcall(init_pmtmr_scale);
465 
466 uint64_t acpi_pm_tick_to_ns(uint64_t ticks)
467 {
468     return scale_delta(ticks, &pmt_scale);
469 }
470 
471 uint64_t ns_to_acpi_pm_tick(uint64_t ns)
472 {
473     return scale_delta(ns, &pmt_scale_r);
474 }
475 
476 /************************************************************
477  * PLATFORM TIMER 4: TSC
478  */
479 static unsigned int __initdata tsc_flags;
480 
481 /* TSC is reliable across sockets */
482 #define TSC_RELIABLE_SOCKET (1 << 0)
483 
484 /*
485  * Called in verify_tsc_reliability() under reliable TSC conditions
486  * thus reusing all the checks already performed there.
487  */
488 static s64 __init init_tsc(struct platform_timesource *pts)
489 {
490     u64 ret = pts->frequency;
491 
492     if ( nr_cpu_ids != num_present_cpus() )
493     {
494         printk(XENLOG_WARNING "TSC: CPU Hotplug intended\n");
495         ret = 0;
496     }
497 
498     if ( nr_sockets > 1 && !(tsc_flags & TSC_RELIABLE_SOCKET) )
499     {
500         printk(XENLOG_WARNING "TSC: Not invariant across sockets\n");
501         ret = 0;
502     }
503 
504     if ( !ret )
505         printk(XENLOG_DEBUG "TSC: Not setting it as clocksource\n");
506 
507     return ret;
508 }
509 
510 static u64 read_tsc(void)
511 {
512     return rdtsc_ordered();
513 }
514 
515 static struct platform_timesource __initdata plt_tsc =
516 {
517     .id = "tsc",
518     .name = "TSC",
519     .read_counter = read_tsc,
520     /*
521      * Calculations for platform timer overflow assume u64 boundary.
522      * Hence we set to less than 64, such that the TSC wraparound is
523      * correctly checked and handled.
524      */
525     .counter_bits = 63,
526     .init = init_tsc,
527 };
528 
529 #ifdef CONFIG_XEN_GUEST
530 /************************************************************
531  * PLATFORM TIMER 5: XEN PV CLOCK SOURCE
532  *
533  * Xen clock source is a variant of TSC source.
534  */
535 
536 static uint64_t xen_timer_cpu_frequency(void)
537 {
538     struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
539     uint64_t freq;
540 
541     freq = 1000000000ULL << 32;
542     do_div(freq, info->tsc_to_system_mul);
543     if ( info->tsc_shift < 0 )
544         freq <<= -info->tsc_shift;
545     else
546         freq >>= info->tsc_shift;
547 
548     return freq;
549 }
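
/*
 * Worked example: with tsc_to_system_mul = 0x80000000 and tsc_shift = 0
 * (i.e. the host scales ns = tsc / 2), the computation above yields
 * (1e9 << 32) / 0x80000000 = 2,000,000,000 Hz, i.e. a 2GHz TSC.
 */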
550 
551 static int64_t __init init_xen_timer(struct platform_timesource *pts)
552 {
553     if ( !xen_guest )
554         return 0;
555 
556     pts->frequency = xen_timer_cpu_frequency();
557 
558     return pts->frequency;
559 }
560 
561 static always_inline uint64_t read_cycle(const struct vcpu_time_info *info,
562                                          uint64_t tsc)
563 {
564     uint64_t delta = tsc - info->tsc_timestamp;
565     struct time_scale ts = {
566         .shift    = info->tsc_shift,
567         .mul_frac = info->tsc_to_system_mul,
568     };
569     uint64_t offset = scale_delta(delta, &ts);
570 
571     return info->system_time + offset;
572 }
573 
574 static uint64_t read_xen_timer(void)
575 {
576     struct vcpu_time_info *info = &this_cpu(vcpu_info)->time;
577     uint32_t version;
578     uint64_t ret;
579     uint64_t last;
580     static uint64_t last_value;
581 
582     do {
583         version = info->version & ~1;
584         /* Make sure version is read before the data */
585         smp_rmb();
586 
587         ret = read_cycle(info, rdtsc_ordered());
588         /* Ignore fancy flags for now */
589 
590         /* Make sure version is reread after the data */
591         smp_rmb();
592     } while ( unlikely(version != info->version) );
593 
594     /* Maintain a monotonic global value */
595     do {
596         last = read_atomic(&last_value);
597         if ( ret < last )
598             return last;
599     } while ( unlikely(cmpxchg(&last_value, last, ret) != last) );
600 
601     return ret;
602 }
603 
604 static struct platform_timesource __initdata plt_xen_timer =
605 {
606     .id = "xen",
607     .name = "XEN PV CLOCK",
608     .read_counter = read_xen_timer,
609     .init = init_xen_timer,
610     .counter_bits = 63,
611 };
612 #endif
613 
614 /************************************************************
615  * GENERIC PLATFORM TIMER INFRASTRUCTURE
616  */
617 
618 /* details of chosen timesource */
619 static struct platform_timesource __read_mostly plt_src;
620 /* hardware-width mask */
621 static u64 __read_mostly plt_mask;
622 /* ns between calls to plt_overflow() */
623 static u64 __read_mostly plt_overflow_period;
624 /* scale: platform counter -> nanosecs */
625 static struct time_scale __read_mostly plt_scale;
626 
627 /* Protected by platform_timer_lock. */
628 static DEFINE_SPINLOCK(platform_timer_lock);
629 static s_time_t stime_platform_stamp; /* System time at below platform time */
630 static u64 platform_timer_stamp;      /* Platform time at above system time */
631 static u64 plt_stamp64;          /* 64-bit platform counter stamp           */
632 static u64 plt_stamp;            /* hardware-width platform counter stamp   */
633 static struct timer plt_overflow_timer;
634 
635 static s_time_t __read_platform_stime(u64 platform_time)
636 {
637     u64 diff = platform_time - platform_timer_stamp;
638     ASSERT(spin_is_locked(&platform_timer_lock));
639     return (stime_platform_stamp + scale_delta(diff, &plt_scale));
640 }
641 
642 static void plt_overflow(void *unused)
643 {
644     int i;
645     u64 count;
646     s_time_t now, plt_now, plt_wrap;
647 
648     spin_lock_irq(&platform_timer_lock);
649 
650     count = plt_src.read_counter();
651     plt_stamp64 += (count - plt_stamp) & plt_mask;
652     plt_stamp = count;
653 
654     now = NOW();
655     plt_wrap = __read_platform_stime(plt_stamp64);
656     for ( i = 0; i < 10; i++ )
657     {
658         plt_now = plt_wrap;
659         plt_wrap = __read_platform_stime(plt_stamp64 + plt_mask + 1);
660         if ( ABS(plt_wrap - now) > ABS(plt_now - now) )
661             break;
662         plt_stamp64 += plt_mask + 1;
663     }
664     if ( i != 0 )
665     {
666         static bool warned_once;
667 
668         if ( !test_and_set_bool(warned_once) )
669             printk("Platform timer appears to have unexpectedly wrapped "
670                    "%u%s times.\n", i, (i == 10) ? " or more" : "");
671     }
672 
673     spin_unlock_irq(&platform_timer_lock);
674 
675     set_timer(&plt_overflow_timer, NOW() + plt_overflow_period);
676 }
677 
678 static s_time_t read_platform_stime(u64 *stamp)
679 {
680     u64 plt_counter, count;
681     s_time_t stime;
682 
683     ASSERT(!local_irq_is_enabled());
684 
685     spin_lock(&platform_timer_lock);
686     plt_counter = plt_src.read_counter();
687     count = plt_stamp64 + ((plt_counter - plt_stamp) & plt_mask);
688     stime = __read_platform_stime(count);
689     spin_unlock(&platform_timer_lock);
690 
691     if ( unlikely(stamp) )
692         *stamp = plt_counter;
693 
694     return stime;
695 }
696 
697 static void platform_time_calibration(void)
698 {
699     u64 count;
700     s_time_t stamp;
701     unsigned long flags;
702 
703     spin_lock_irqsave(&platform_timer_lock, flags);
704     count = plt_stamp64 + ((plt_src.read_counter() - plt_stamp) & plt_mask);
705     stamp = __read_platform_stime(count);
706     stime_platform_stamp = stamp;
707     platform_timer_stamp = count;
708     spin_unlock_irqrestore(&platform_timer_lock, flags);
709 }
710 
711 static void resume_platform_timer(void)
712 {
713     /* Timer source can be reset when coming back from S3 to S0 */
714     if ( plt_src.resume )
715         plt_src.resume(&plt_src);
716 
717     plt_stamp64 = platform_timer_stamp;
718     plt_stamp = plt_src.read_counter();
719 }
720 
721 static void __init reset_platform_timer(void)
722 {
723     /* Deactivate any timers running */
724     kill_timer(&plt_overflow_timer);
725     kill_timer(&calibration_timer);
726 
727     /* Reset counters and stamps */
728     spin_lock_irq(&platform_timer_lock);
729     plt_stamp = 0;
730     plt_stamp64 = 0;
731     platform_timer_stamp = 0;
732     stime_platform_stamp = 0;
733     spin_unlock_irq(&platform_timer_lock);
734 }
735 
736 static s64 __init try_platform_timer(struct platform_timesource *pts)
737 {
738     s64 rc = pts->init(pts);
739 
740     if ( rc <= 0 )
741         return rc;
742 
743     /* We have a platform timesource already so reset it */
744     if ( plt_src.counter_bits != 0 )
745         reset_platform_timer();
746 
747     plt_mask = (u64)~0ull >> (64 - pts->counter_bits);
748 
749     set_time_scale(&plt_scale, pts->frequency);
750 
751     plt_overflow_period = scale_delta(
752         1ull << (pts->counter_bits - 1), &plt_scale);
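    /*
     * i.e. half the counter range expressed in ns -- roughly 2.3s for the
     * 24-bit ACPI PM timer at 3.579545MHz -- so plt_overflow() runs well
     * before the hardware counter can wrap unnoticed.
     */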
753     plt_src = *pts;
754 
755     return rc;
756 }
757 
758 static u64 __init init_platform_timer(void)
759 {
760     static struct platform_timesource * __initdata plt_timers[] = {
761 #ifdef CONFIG_XEN_GUEST
762         &plt_xen_timer,
763 #endif
764         &plt_hpet, &plt_pmtimer, &plt_pit
765     };
766 
767     struct platform_timesource *pts = NULL;
768     unsigned int i;
769     s64 rc = -1;
770 
771     /* clocksource=tsc is initialized via __initcalls (when CPUs are up). */
772     if ( (opt_clocksource[0] != '\0') && strcmp(opt_clocksource, "tsc") )
773     {
774         for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
775         {
776             pts = plt_timers[i];
777             if ( !strcmp(opt_clocksource, pts->id) )
778             {
779                 rc = try_platform_timer(pts);
780                 break;
781             }
782         }
783 
784         if ( rc <= 0 )
785             printk("WARNING: %s clocksource '%s'.\n",
786                    (rc == 0) ? "Could not initialise" : "Unrecognised",
787                    opt_clocksource);
788     }
789 
790     if ( rc <= 0 )
791     {
792         for ( i = 0; i < ARRAY_SIZE(plt_timers); i++ )
793         {
794             pts = plt_timers[i];
795             if ( (rc = try_platform_timer(pts)) > 0 )
796                 break;
797         }
798     }
799 
800     if ( rc <= 0 )
801         panic("Unable to find usable platform timer");
802 
803     printk("Platform timer is %s %s\n",
804            freq_string(pts->frequency), pts->name);
805 
806     return rc;
807 }
808 
809 u64 stime2tsc(s_time_t stime)
810 {
811     struct cpu_time *t;
812     struct time_scale sys_to_tsc;
813     s_time_t stime_delta;
814 
815     t = &this_cpu(cpu_time);
816     sys_to_tsc = scale_reciprocal(t->tsc_scale);
817 
818     stime_delta = stime - t->stamp.local_stime;
819     if ( stime_delta < 0 )
820         stime_delta = 0;
821 
822     return t->stamp.local_tsc + scale_delta(stime_delta, &sys_to_tsc);
823 }
824 
825 void cstate_restore_tsc(void)
826 {
827     if ( boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
828         return;
829 
830     write_tsc(stime2tsc(read_platform_stime(NULL)));
831 }
832 
833 /***************************************************************************
834  * CMOS Timer functions
835  ***************************************************************************/
836 
837 /* Converts Gregorian date to seconds since 1970-01-01 00:00:00.
838  * Assumes input in normal date format, i.e. 1980-12-31 23:59:59
839  * => year=1980, mon=12, day=31, hour=23, min=59, sec=59.
840  *
841  * [For the Julian calendar (which was used in Russia before 1917,
842  * Britain & colonies before 1752, anywhere else before 1582,
843  * and is still in use by some communities) leave out the
844  * -year/100+year/400 terms, and add 10.]
845  *
846  * This algorithm was first published by Gauss (I think).
847  *
848  * WARNING: this function will overflow on 2106-02-07 06:28:16 on
849  * machines where long is 32-bit! (However, as time_t is signed, we
850  * will already get problems at other places on 2038-01-19 03:14:08)
851  */
852 unsigned long
853 mktime (unsigned int year, unsigned int mon,
854         unsigned int day, unsigned int hour,
855         unsigned int min, unsigned int sec)
856 {
857     /* 1..12 -> 11,12,1..10: put Feb last since it has a leap day. */
858     if ( 0 >= (int) (mon -= 2) )
859     {
860         mon += 12;
861         year -= 1;
862     }
863 
864     return ((((unsigned long)(year/4 - year/100 + year/400 + 367*mon/12 + day)+
865               year*365 - 719499
866         )*24 + hour /* now have hours */
867         )*60 + min  /* now have minutes */
868         )*60 + sec; /* finally seconds */
869 }
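
/*
 * Worked examples: mktime(1970, 1, 1, 0, 0, 0) == 0 and
 * mktime(2000, 1, 1, 0, 0, 0) == 946684800, the usual Unix epoch offsets.
 */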
870 
871 struct rtc_time {
872     unsigned int year, mon, day, hour, min, sec;
873 };
874 
875 static void __get_cmos_time(struct rtc_time *rtc)
876 {
877     rtc->sec  = CMOS_READ(RTC_SECONDS);
878     rtc->min  = CMOS_READ(RTC_MINUTES);
879     rtc->hour = CMOS_READ(RTC_HOURS);
880     rtc->day  = CMOS_READ(RTC_DAY_OF_MONTH);
881     rtc->mon  = CMOS_READ(RTC_MONTH);
882     rtc->year = CMOS_READ(RTC_YEAR);
883 
884     if ( RTC_ALWAYS_BCD || !(CMOS_READ(RTC_CONTROL) & RTC_DM_BINARY) )
885     {
886         BCD_TO_BIN(rtc->sec);
887         BCD_TO_BIN(rtc->min);
888         BCD_TO_BIN(rtc->hour);
889         BCD_TO_BIN(rtc->day);
890         BCD_TO_BIN(rtc->mon);
891         BCD_TO_BIN(rtc->year);
892     }
893 
894     if ( (rtc->year += 1900) < 1970 )
895         rtc->year += 100;
896 }
897 
898 static unsigned long get_cmos_time(void)
899 {
900     unsigned long res, flags;
901     struct rtc_time rtc;
902     unsigned int seconds = 60;
903     static bool __read_mostly cmos_rtc_probe;
904     boolean_param("cmos-rtc-probe", cmos_rtc_probe);
905 
906     if ( efi_enabled(EFI_RS) )
907     {
908         res = efi_get_time();
909         if ( res )
910             return res;
911     }
912 
913     if ( likely(!(acpi_gbl_FADT.boot_flags & ACPI_FADT_NO_CMOS_RTC)) )
914         cmos_rtc_probe = false;
915     else if ( system_state < SYS_STATE_smp_boot && !cmos_rtc_probe )
916         panic("System with no CMOS RTC advertised must be booted from EFI"
917               " (or with command line option \"cmos-rtc-probe\")");
918 
919     for ( ; ; )
920     {
921         s_time_t start, t1, t2;
922 
923         spin_lock_irqsave(&rtc_lock, flags);
924 
925         /* read RTC exactly on falling edge of update flag */
926         start = NOW();
927         do { /* may take up to 1 second... */
928             t1 = NOW() - start;
929         } while ( !(CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
930                   t1 <= SECONDS(1) );
931 
932         start = NOW();
933         do { /* must try at least 2.228 ms */
934             t2 = NOW() - start;
935         } while ( (CMOS_READ(RTC_FREQ_SELECT) & RTC_UIP) &&
936                   t2 < MILLISECS(3) );
937 
938         __get_cmos_time(&rtc);
939 
940         spin_unlock_irqrestore(&rtc_lock, flags);
941 
942         if ( likely(!cmos_rtc_probe) ||
943              t1 > SECONDS(1) || t2 >= MILLISECS(3) ||
944              rtc.sec >= 60 || rtc.min >= 60 || rtc.hour >= 24 ||
945              !rtc.day || rtc.day > 31 ||
946              !rtc.mon || rtc.mon > 12 )
947             break;
948 
949         if ( seconds < 60 )
950         {
951             if ( rtc.sec != seconds )
952                 cmos_rtc_probe = false;
953             break;
954         }
955 
956         process_pending_softirqs();
957 
958         seconds = rtc.sec;
959     }
960 
961     if ( unlikely(cmos_rtc_probe) )
962         panic("No CMOS RTC found - system must be booted from EFI");
963 
964     return mktime(rtc.year, rtc.mon, rtc.day, rtc.hour, rtc.min, rtc.sec);
965 }
966 
967 static unsigned long get_wallclock_time(void)
968 {
969 #ifdef CONFIG_XEN_GUEST
970     if ( xen_guest )
971     {
972         struct shared_info *sh_info = XEN_shared_info;
973         uint32_t wc_version;
974         uint64_t wc_sec;
975 
976         do {
977             wc_version = sh_info->wc_version & ~1;
978             smp_rmb();
979 
980             wc_sec  = sh_info->wc_sec;
981             smp_rmb();
982         } while ( wc_version != sh_info->wc_version );
983 
984         return wc_sec + read_xen_timer() / 1000000000;
985     }
986 #endif
987 
988     return get_cmos_time();
989 }
990 
991 /***************************************************************************
992  * System Time
993  ***************************************************************************/
994 
995 s_time_t get_s_time_fixed(u64 at_tsc)
996 {
997     const struct cpu_time *t = &this_cpu(cpu_time);
998     u64 tsc, delta;
999     s_time_t now;
1000 
1001     if ( at_tsc )
1002         tsc = at_tsc;
1003     else
1004         tsc = rdtsc_ordered();
1005     delta = tsc - t->stamp.local_tsc;
1006     now = t->stamp.local_stime + scale_delta(delta, &t->tsc_scale);
1007 
1008     return now;
1009 }
1010 
1011 s_time_t get_s_time()
1012 {
1013     return get_s_time_fixed(0);
1014 }
1015 
1016 uint64_t tsc_ticks2ns(uint64_t ticks)
1017 {
1018     struct cpu_time *t = &this_cpu(cpu_time);
1019 
1020     return scale_delta(ticks, &t->tsc_scale);
1021 }
1022 
1023 static void __update_vcpu_system_time(struct vcpu *v, int force)
1024 {
1025     const struct cpu_time *t;
1026     struct vcpu_time_info *u, _u = {};
1027     struct domain *d = v->domain;
1028     s_time_t tsc_stamp;
1029 
1030     if ( v->vcpu_info == NULL )
1031         return;
1032 
1033     t = &this_cpu(cpu_time);
1034     u = &vcpu_info(v, time);
1035 
1036     if ( d->arch.vtsc )
1037     {
1038         s_time_t stime = t->stamp.local_stime;
1039 
1040         if ( is_hvm_domain(d) )
1041         {
1042             struct pl_time *pl = v->domain->arch.hvm_domain.pl_time;
1043 
1044             stime += pl->stime_offset + v->arch.hvm_vcpu.stime_offset;
1045             if ( stime >= 0 )
1046                 tsc_stamp = gtime_to_gtsc(d, stime);
1047             else
1048                 tsc_stamp = -gtime_to_gtsc(d, -stime);
1049         }
1050         else
1051             tsc_stamp = gtime_to_gtsc(d, stime);
1052 
1053         _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1054         _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
1055     }
1056     else
1057     {
1058         if ( is_hvm_domain(d) && hvm_tsc_scaling_supported )
1059         {
1060             tsc_stamp            = hvm_scale_tsc(d, t->stamp.local_tsc);
1061             _u.tsc_to_system_mul = d->arch.vtsc_to_ns.mul_frac;
1062             _u.tsc_shift         = d->arch.vtsc_to_ns.shift;
1063         }
1064         else
1065         {
1066             tsc_stamp            = t->stamp.local_tsc;
1067             _u.tsc_to_system_mul = t->tsc_scale.mul_frac;
1068             _u.tsc_shift         = t->tsc_scale.shift;
1069         }
1070     }
1071 
1072     _u.tsc_timestamp = tsc_stamp;
1073     _u.system_time   = t->stamp.local_stime;
1074 
1075     /*
1076      * Domains are expected to cope with this bit changing on every
1077      * pvclock read, using it to decide whether they can rely solely on
1078      * this tuple or also need monotonicity checks against other vcpus.
1079      */
1080     if ( clocksource_is_tsc() )
1081         _u.flags |= XEN_PVCLOCK_TSC_STABLE_BIT;
1082 
1083     if ( is_hvm_domain(d) )
1084         _u.tsc_timestamp += v->arch.hvm_vcpu.cache_tsc_offset;
1085 
1086     /* Don't bother unless timestamp record has changed or we are forced. */
1087     _u.version = u->version; /* make versions match for memcmp test */
1088     if ( !force && !memcmp(u, &_u, sizeof(_u)) )
1089         return;
1090 
1091     /* 1. Update guest kernel version. */
1092     _u.version = u->version = version_update_begin(u->version);
1093     wmb();
1094     /* 2. Update all other guest kernel fields. */
1095     *u = _u;
1096     wmb();
1097     /* 3. Update guest kernel version. */
1098     u->version = version_update_end(u->version);
1099 
1100     if ( !update_secondary_system_time(v, &_u) && is_pv_domain(d) &&
1101          !is_pv_32bit_domain(d) && !(v->arch.flags & TF_kernel_mode) )
1102         v->arch.pv_vcpu.pending_system_time = _u;
1103 }
1104 
1105 bool update_secondary_system_time(struct vcpu *v,
1106                                   struct vcpu_time_info *u)
1107 {
1108     XEN_GUEST_HANDLE(vcpu_time_info_t) user_u = v->arch.time_info_guest;
1109     struct guest_memory_policy policy =
1110         { .smap_policy = SMAP_CHECK_ENABLED, .nested_guest_mode = false };
1111 
1112     if ( guest_handle_is_null(user_u) )
1113         return true;
1114 
1115     update_guest_memory_policy(v, &policy);
1116 
1117     /* 1. Update userspace version. */
1118     if ( __copy_field_to_guest(user_u, u, version) == sizeof(u->version) )
1119     {
1120         update_guest_memory_policy(v, &policy);
1121         return false;
1122     }
1123     wmb();
1124     /* 2. Update all other userspace fields. */
1125     __copy_to_guest(user_u, u, 1);
1126     wmb();
1127     /* 3. Update userspace version. */
1128     u->version = version_update_end(u->version);
1129     __copy_field_to_guest(user_u, u, version);
1130 
1131     update_guest_memory_policy(v, &policy);
1132 
1133     return true;
1134 }
1135 
1136 void update_vcpu_system_time(struct vcpu *v)
1137 {
1138     __update_vcpu_system_time(v, 0);
1139 }
1140 
1141 void force_update_vcpu_system_time(struct vcpu *v)
1142 {
1143     __update_vcpu_system_time(v, 1);
1144 }
1145 
1146 static void update_domain_rtc(void)
1147 {
1148     struct domain *d;
1149 
1150     rcu_read_lock(&domlist_read_lock);
1151 
1152     for_each_domain ( d )
1153         if ( is_hvm_domain(d) )
1154             rtc_update_clock(d);
1155 
1156     rcu_read_unlock(&domlist_read_lock);
1157 }
1158 
1159 void domain_set_time_offset(struct domain *d, int64_t time_offset_seconds)
1160 {
1161     d->time_offset_seconds = time_offset_seconds;
1162     if ( is_hvm_domain(d) )
1163         rtc_update_clock(d);
1164     update_domain_wallclock_time(d);
1165 }
1166 
1167 int cpu_frequency_change(u64 freq)
1168 {
1169     struct cpu_time *t = &this_cpu(cpu_time);
1170     u64 curr_tsc;
1171 
1172     /* Sanity check: CPU frequency allegedly dropping below 1MHz? */
1173     if ( freq < 1000000u )
1174     {
1175         printk(XENLOG_WARNING "Rejecting CPU frequency change "
1176                "to %"PRIu64" Hz\n", freq);
1177         return -EINVAL;
1178     }
1179 
1180     local_irq_disable();
1181     /* Platform time /first/, as we may be delayed by platform_timer_lock. */
1182     t->stamp.master_stime = read_platform_stime(NULL);
1183     curr_tsc = rdtsc_ordered();
1184     /* TSC-extrapolated time may be bogus after frequency change. */
1185     /*t->stamp.local_stime = get_s_time_fixed(curr_tsc);*/
1186     t->stamp.local_stime = t->stamp.master_stime;
1187     t->stamp.local_tsc = curr_tsc;
1188     set_time_scale(&t->tsc_scale, freq);
1189     local_irq_enable();
1190 
1191     update_vcpu_system_time(current);
1192 
1193     /* A full epoch should pass before we check for deviation. */
1194     if ( smp_processor_id() == 0 )
1195     {
1196         set_timer(&calibration_timer, NOW() + EPOCH);
1197         platform_time_calibration();
1198     }
1199 
1200     return 0;
1201 }
1202 
1203 /* Per-CPU communication between rendezvous IRQ and softirq handler. */
1204 static DEFINE_PER_CPU(struct cpu_time_stamp, cpu_calibration);
1205 
1206 /* Softirq handler for per-CPU time calibration. */
1207 static void local_time_calibration(void)
1208 {
1209     struct cpu_time *t = &this_cpu(cpu_time);
1210     const struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1211 
1212     /*
1213      * System (extrapolated from local and master oscillators) and TSC
1214      * timestamps, taken during this calibration and the previous one.
1215      */
1216     struct cpu_time_stamp prev, curr;
1217 
1218     /*
1219      * System time and TSC ticks elapsed during the previous calibration
1220      * 'epoch'. These values are down-shifted to fit in 32 bits.
1221      */
1222     u64 stime_elapsed64, tsc_elapsed64;
1223     u32 stime_elapsed32, tsc_elapsed32;
1224 
1225     /* Error correction to slow down a fast local clock. */
1226     u32 error_factor = 0;
1227 
1228     /* Calculated TSC shift to ensure 32-bit scale multiplier. */
1229     int tsc_shift = 0;
1230 
1231     /* The overall calibration scale multiplier. */
1232     u32 calibration_mul_frac;
1233 
1234     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1235     {
1236         /* Atomically read cpu_calibration struct and write cpu_time struct. */
1237         local_irq_disable();
1238         t->stamp = *c;
1239         local_irq_enable();
1240         update_vcpu_system_time(current);
1241         goto out;
1242     }
1243 
1244     prev = t->stamp;
1245 
1246     /* Disabling IRQs ensures we atomically read cpu_calibration struct. */
1247     local_irq_disable();
1248     curr = *c;
1249     local_irq_enable();
1250 
1251 #if 0
1252     printk("PRE%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64"\n",
1253            smp_processor_id(), prev.local_tsc, prev.local_stime, prev.master_stime);
1254     printk("CUR%d: tsc=%"PRIu64" stime=%"PRIu64" master=%"PRIu64
1255            " -> %"PRId64"\n",
1256            smp_processor_id(), curr.local_tsc, curr.local_stime, curr.master_stime,
1257            curr.master_stime - curr.local_stime);
1258 #endif
1259 
1260     /* Local time warps forward if it lags behind master time. */
1261     if ( curr.local_stime < curr.master_stime )
1262         curr.local_stime = curr.master_stime;
1263 
1264     stime_elapsed64 = curr.master_stime - prev.master_stime;
1265     tsc_elapsed64   = curr.local_tsc - prev.local_tsc;
1266 
1267     /*
1268      * Weirdness can happen if we lose sync with the platform timer.
1269      * We could be smarter here: resync platform timer with local timer?
1270      */
1271     if ( ((s64)stime_elapsed64 < (EPOCH / 2)) )
1272         goto out;
1273 
1274     /*
1275      * Calculate error-correction factor. This only slows down a fast local
1276      * clock (slow clocks are warped forwards). The scale factor is clamped
1277      * to >= 0.5.
1278      */
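    /*
     * e.g. if the local clock gained 1ms on the master over the ~1s epoch,
     * error_factor = div_frac(EPOCH, EPOCH + 1ms) ~= 0.999 in 0.32 fixed
     * point, slowing the new scale multiplier by about 0.1%.
     */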
1279     if ( curr.local_stime != curr.master_stime )
1280     {
1281         u64 local_stime_err = curr.local_stime - curr.master_stime;
1282 
1283         if ( local_stime_err > EPOCH )
1284             local_stime_err = EPOCH;
1285         error_factor = div_frac(EPOCH, EPOCH + (u32)local_stime_err);
1286     }
1287 
1288     /*
1289      * We require 0 < stime_elapsed < 2^31.
1290      * This allows us to binary shift a 32-bit tsc_elapsed such that:
1291      * stime_elapsed < tsc_elapsed <= 2*stime_elapsed
1292      */
1293     while ( ((u32)stime_elapsed64 != stime_elapsed64) ||
1294             ((s32)stime_elapsed64 < 0) )
1295     {
1296         stime_elapsed64 >>= 1;
1297         tsc_elapsed64   >>= 1;
1298     }
1299 
1300     /* stime_elapsed now fits in a 32-bit word. */
1301     stime_elapsed32 = (u32)stime_elapsed64;
1302 
1303     /* tsc_elapsed <= 2*stime_elapsed */
1304     while ( tsc_elapsed64 > (stime_elapsed32 * 2) )
1305     {
1306         tsc_elapsed64 >>= 1;
1307         tsc_shift--;
1308     }
1309 
1310     /* Local difference must now fit in 32 bits. */
1311     ASSERT((u32)tsc_elapsed64 == tsc_elapsed64);
1312     tsc_elapsed32 = (u32)tsc_elapsed64;
1313 
1314     /* tsc_elapsed > stime_elapsed */
1315     ASSERT(tsc_elapsed32 != 0);
1316     while ( tsc_elapsed32 <= stime_elapsed32 )
1317     {
1318         tsc_elapsed32 <<= 1;
1319         tsc_shift++;
1320     }
1321 
1322     calibration_mul_frac = div_frac(stime_elapsed32, tsc_elapsed32);
1323     if ( error_factor != 0 )
1324         calibration_mul_frac = mul_frac(calibration_mul_frac, error_factor);
1325 
1326 #if 0
1327     printk("---%d: %08x %08x %d\n", smp_processor_id(),
1328            error_factor, calibration_mul_frac, tsc_shift);
1329 #endif
1330 
1331     /* Record new timestamp information, atomically w.r.t. interrupts. */
1332     local_irq_disable();
1333     t->tsc_scale.mul_frac = calibration_mul_frac;
1334     t->tsc_scale.shift    = tsc_shift;
1335     t->stamp              = curr;
1336     local_irq_enable();
1337 
1338     update_vcpu_system_time(current);
1339 
1340  out:
1341     if ( smp_processor_id() == 0 )
1342     {
1343         set_timer(&calibration_timer, NOW() + EPOCH);
1344         platform_time_calibration();
1345     }
1346 }
1347 
1348 /*
1349  * TSC Reliability check
1350  */
1351 
1352 /*
1353  * The Linux original version of this function is
1354  * Copyright (c) 2006, Red Hat, Inc., Ingo Molnar
1355  */
1356 static void check_tsc_warp(unsigned long tsc_khz, unsigned long *max_warp)
1357 {
1358     static DEFINE_SPINLOCK(sync_lock);
1359     static cycles_t last_tsc;
1360 
1361     cycles_t start, now, prev, end;
1362     int i;
1363 
1364     start = rdtsc_ordered();
1365 
1366     /* The measurement runs for 20 msecs: */
1367     end = start + tsc_khz * 20ULL;
1368     now = start;
1369 
1370     for ( i = 0; ; i++ )
1371     {
1372         /*
1373          * We take the global lock, measure TSC, save the
1374          * previous TSC that was measured (possibly on
1375          * another CPU) and update the previous TSC timestamp.
1376          */
1377         spin_lock(&sync_lock);
1378         prev = last_tsc;
1379         now = rdtsc_ordered();
1380         last_tsc = now;
1381         spin_unlock(&sync_lock);
1382 
1383         /*
1384          * Be nice every now and then (and also check whether measurement is
1385          * done [we also insert a 10 million loops safety exit, so we don't
1386          * lock up in case the TSC readout is totally broken]):
1387          */
1388         if ( unlikely(!(i & 7)) )
1389         {
1390             if ( (now > end) || (i > 10000000) )
1391                 break;
1392             cpu_relax();
1393             /*touch_nmi_watchdog();*/
1394         }
1395 
1396         /*
1397          * Outside the critical section we can now see whether we saw a
1398          * time-warp of the TSC going backwards:
1399          */
1400         if ( unlikely(prev > now) )
1401         {
1402             spin_lock(&sync_lock);
1403             if ( *max_warp < prev - now )
1404                 *max_warp = prev - now;
1405             spin_unlock(&sync_lock);
1406         }
1407     }
1408 }
1409 
1410 static unsigned long tsc_max_warp, tsc_check_count;
1411 static cpumask_t tsc_check_cpumask;
1412 
1413 static void tsc_check_slave(void *unused)
1414 {
1415     unsigned int cpu = smp_processor_id();
1416     local_irq_disable();
1417     while ( !cpumask_test_cpu(cpu, &tsc_check_cpumask) )
1418         cpu_relax();
1419     check_tsc_warp(cpu_khz, &tsc_max_warp);
1420     cpumask_clear_cpu(cpu, &tsc_check_cpumask);
1421     local_irq_enable();
1422 }
1423 
1424 static void tsc_check_reliability(void)
1425 {
1426     unsigned int cpu = smp_processor_id();
1427     static DEFINE_SPINLOCK(lock);
1428 
1429     spin_lock(&lock);
1430 
1431     tsc_check_count++;
1432     smp_call_function(tsc_check_slave, NULL, 0);
1433     cpumask_andnot(&tsc_check_cpumask, &cpu_online_map, cpumask_of(cpu));
1434     local_irq_disable();
1435     check_tsc_warp(cpu_khz, &tsc_max_warp);
1436     local_irq_enable();
1437     while ( !cpumask_empty(&tsc_check_cpumask) )
1438         cpu_relax();
1439 
1440     spin_unlock(&lock);
1441 }
1442 
1443 /*
1444  * Rendezvous for all CPUs in IRQ context.
1445  * Master CPU snapshots the platform timer.
1446  * All CPUs snapshot their local TSC and their extrapolation of system time.
1447  */
1448 struct calibration_rendezvous {
1449     cpumask_t cpu_calibration_map;
1450     atomic_t semaphore;
1451     s_time_t master_stime;
1452     u64 master_tsc_stamp;
1453 };
1454 
1455 static void
1456 time_calibration_rendezvous_tail(const struct calibration_rendezvous *r)
1457 {
1458     struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1459 
1460     c->local_tsc    = rdtsc_ordered();
1461     c->local_stime  = get_s_time_fixed(c->local_tsc);
1462     c->master_stime = r->master_stime;
1463 
1464     raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1465 }
1466 
1467 /*
1468  * Keep TSCs in sync when they run at the same rate, but may stop in
1469  * deep-sleep C states.
1470  */
1471 static void time_calibration_tsc_rendezvous(void *_r)
1472 {
1473     int i;
1474     struct calibration_rendezvous *r = _r;
1475     unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1476 
1477     /* Loop to get rid of cache effects on TSC skew. */
1478     for ( i = 4; i >= 0; i-- )
1479     {
1480         if ( smp_processor_id() == 0 )
1481         {
1482             while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1483                 cpu_relax();
1484 
1485             if ( r->master_stime == 0 )
1486             {
1487                 r->master_stime = read_platform_stime(NULL);
1488                 r->master_tsc_stamp = rdtsc_ordered();
1489             }
1490             atomic_inc(&r->semaphore);
1491 
1492             if ( i == 0 )
1493                 write_tsc(r->master_tsc_stamp);
1494 
1495             while ( atomic_read(&r->semaphore) != (2*total_cpus - 1) )
1496                 cpu_relax();
1497             atomic_set(&r->semaphore, 0);
1498         }
1499         else
1500         {
1501             atomic_inc(&r->semaphore);
1502             while ( atomic_read(&r->semaphore) < total_cpus )
1503                 cpu_relax();
1504 
1505             if ( i == 0 )
1506                 write_tsc(r->master_tsc_stamp);
1507 
1508             atomic_inc(&r->semaphore);
1509             while ( atomic_read(&r->semaphore) > total_cpus )
1510                 cpu_relax();
1511         }
1512     }
1513 
1514     time_calibration_rendezvous_tail(r);
1515 }
1516 
1517 /* Ordinary rendezvous function which does not modify TSC values. */
1518 static void time_calibration_std_rendezvous(void *_r)
1519 {
1520     struct calibration_rendezvous *r = _r;
1521     unsigned int total_cpus = cpumask_weight(&r->cpu_calibration_map);
1522 
1523     if ( smp_processor_id() == 0 )
1524     {
1525         while ( atomic_read(&r->semaphore) != (total_cpus - 1) )
1526             cpu_relax();
1527         r->master_stime = read_platform_stime(NULL);
1528         smp_wmb(); /* write r->master_stime /then/ signal */
1529         atomic_inc(&r->semaphore);
1530     }
1531     else
1532     {
1533         atomic_inc(&r->semaphore);
1534         while ( atomic_read(&r->semaphore) != total_cpus )
1535             cpu_relax();
1536         smp_rmb(); /* receive signal /then/ read r->master_stime */
1537     }
1538 
1539     time_calibration_rendezvous_tail(r);
1540 }
1541 
1542 /*
1543  * Rendezvous function used when clocksource is TSC and
1544  * no CPU hotplug will be performed.
1545  */
1546 static void time_calibration_nop_rendezvous(void *rv)
1547 {
1548     const struct calibration_rendezvous *r = rv;
1549     struct cpu_time_stamp *c = &this_cpu(cpu_calibration);
1550 
1551     c->local_tsc    = r->master_tsc_stamp;
1552     c->local_stime  = r->master_stime;
1553     c->master_stime = r->master_stime;
1554 
1555     raise_softirq(TIME_CALIBRATE_SOFTIRQ);
1556 }
1557 
1558 static void (*time_calibration_rendezvous_fn)(void *) =
1559     time_calibration_std_rendezvous;
1560 
1561 static void time_calibration(void *unused)
1562 {
1563     struct calibration_rendezvous r = {
1564         .semaphore = ATOMIC_INIT(0)
1565     };
1566 
1567     if ( clocksource_is_tsc() )
1568     {
1569         local_irq_disable();
1570         r.master_stime = read_platform_stime(&r.master_tsc_stamp);
1571         local_irq_enable();
1572     }
1573 
1574     cpumask_copy(&r.cpu_calibration_map, &cpu_online_map);
1575 
1576     /* @wait=1 because we must wait for all cpus before freeing @r. */
1577     on_selected_cpus(&r.cpu_calibration_map,
1578                      time_calibration_rendezvous_fn,
1579                      &r, 1);
1580 }
1581 
1582 static struct cpu_time_stamp ap_bringup_ref;
1583 
1584 void time_latch_stamps(void)
1585 {
1586     unsigned long flags;
1587 
1588     local_irq_save(flags);
1589     ap_bringup_ref.master_stime = read_platform_stime(NULL);
1590     ap_bringup_ref.local_tsc = rdtsc_ordered();
1591     local_irq_restore(flags);
1592 
1593     ap_bringup_ref.local_stime = get_s_time_fixed(ap_bringup_ref.local_tsc);
1594 }
1595 
1596 void init_percpu_time(void)
1597 {
1598     struct cpu_time *t = &this_cpu(cpu_time);
1599     unsigned long flags;
1600     u64 tsc;
1601     s_time_t now;
1602 
1603     /* Initial estimate for TSC rate. */
1604     t->tsc_scale = per_cpu(cpu_time, 0).tsc_scale;
1605 
1606     local_irq_save(flags);
1607     now = read_platform_stime(NULL);
1608     tsc = rdtsc_ordered();
1609     local_irq_restore(flags);
1610 
1611     t->stamp.master_stime = now;
1612     /*
1613      * To avoid a discontinuity (TSC and platform clock can't be expected
1614      * to be in perfect sync), initialization here needs to match up with
1615      * local_time_calibration()'s decision whether to use its fast path.
1616      */
1617     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
1618     {
1619         if ( system_state < SYS_STATE_smp_boot )
1620             now = get_s_time_fixed(tsc);
1621         else
1622             now += ap_bringup_ref.local_stime - ap_bringup_ref.master_stime;
1623     }
1624     t->stamp.local_tsc   = tsc;
1625     t->stamp.local_stime = now;
1626 }
1627 
1628 /*
1629  * On certain older Intel CPUs writing the TSC MSR clears the upper 32 bits.
1630  * Obviously we must not use write_tsc() on such CPUs.
1631  *
1632  * Additionally, AMD specifies that being able to write the TSC MSR is not an
1633  * architectural feature (and, contrary to what their manual says, it also
1634  * cannot be determined from CPUID bits).
1635  */
1636 static void __init tsc_check_writability(void)
1637 {
1638     const char *what = NULL;
1639     uint64_t tsc;
1640 
1641     /*
1642      * If all CPUs are reported as synchronised and in sync, we never write
1643      * the TSCs (except unavoidably, when a CPU is physically hot-plugged).
1644      * Hence testing for writability is pointless and even harmful.
1645      */
1646     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1647         return;
1648 
1649     tsc = rdtsc();
1650     if ( wrmsr_safe(MSR_IA32_TSC, 0) == 0 )
1651     {
1652         uint64_t tmp, tmp2 = rdtsc();
1653 
1654         write_tsc(tsc | (1ULL << 32));
1655         tmp = rdtsc();
1656         if ( ABS((s64)tmp - (s64)tmp2) < (1LL << 31) )
1657             what = "only partially";
1658     }
1659     else
1660     {
1661         what = "not";
1662     }
1663 
1664     /* Nothing to do if the TSC is fully writable. */
1665     if ( !what )
1666     {
1667         /*
1668          * Paranoia - write back original TSC value. However, APs get synced
1669          * with BSP as they are brought up, so this doesn't much matter.
1670          */
1671         write_tsc(tsc);
1672         return;
1673     }
1674 
1675     printk(XENLOG_WARNING "TSC %s writable\n", what);
1676 
1677     /* time_calibration_tsc_rendezvous() must not be used */
1678     setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
1679 
1680     /* cstate_restore_tsc() must not be used (or do nothing) */
1681     if ( !boot_cpu_has(X86_FEATURE_NONSTOP_TSC) )
1682         cpuidle_disable_deep_cstate();
1683 
1684     /* synchronize_tsc_slave() must do nothing */
1685     disable_tsc_sync = true;
1686 }
1687 
1688 static void __init reset_percpu_time(void *unused)
1689 {
1690     struct cpu_time *t = &this_cpu(cpu_time);
1691 
1692     t->stamp.local_tsc = boot_tsc_stamp;
1693     t->stamp.local_stime = 0;
1694     t->stamp.local_stime = get_s_time_fixed(boot_tsc_stamp);
1695     t->stamp.master_stime = t->stamp.local_stime;
1696 }
1697 
1698 static void __init try_platform_timer_tail(bool late)
1699 {
1700     init_timer(&plt_overflow_timer, plt_overflow, NULL, 0);
1701     plt_overflow(NULL);
1702 
1703     platform_timer_stamp = plt_stamp64;
1704     stime_platform_stamp = NOW();
1705 
1706     if ( !late )
1707         init_percpu_time();
1708 
1709     init_timer(&calibration_timer, time_calibration, NULL, 0);
1710     set_timer(&calibration_timer, NOW() + EPOCH);
1711 }
1712 
1713 /* Late init function, after all cpus have booted */
1714 static int __init verify_tsc_reliability(void)
1715 {
1716     if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1717     {
1718         /*
1719          * Sadly, despite processor vendors' best design guidance efforts, on
1720          * some systems, cpus may come out of reset improperly synchronized.
1721          * So we must verify there is no warp and we can't do that until all
1722          * CPUs are booted.
1723          */
1724         tsc_check_reliability();
1725         if ( tsc_max_warp )
1726         {
1727             printk("TSC warp detected, disabling TSC_RELIABLE\n");
1728             setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
1729         }
1730         else if ( !strcmp(opt_clocksource, "tsc") &&
1731                   (try_platform_timer(&plt_tsc) > 0) )
1732         {
1733             /*
1734              * The platform timer has changed, and CPU time is only updated
1735              * after the calibration timer is set again, so each local CPU's
1736              * time must be re-seeded. At this stage the TSC is known to be
1737              * reliable, i.e. monotonically increasing across all CPUs, which
1738              * lets us remove the skew between the platform timer and the TSC,
1739              * since these are now effectively the same.
1740              */
1741             on_selected_cpus(&cpu_online_map, reset_percpu_time, NULL, 1);
1742 
1743             /*
1744              * We won't do CPU Hotplug and TSC clocksource is being used which
1745              * means we have a reliable TSC, plus we don't sync with any other
1746              * clocksource so no need for rendezvous.
1747              */
1748             time_calibration_rendezvous_fn = time_calibration_nop_rendezvous;
1749 
1750             /* Finish platform timer switch. */
1751             try_platform_timer_tail(true);
1752 
1753             printk("Switched to Platform timer %s TSC\n",
1754                    freq_string(plt_src.frequency));
1755             return 0;
1756         }
1757     }
1758 
1759     /*
1760      * Re-run the TSC writability check if it didn't run to completion, as
1761      * X86_FEATURE_TSC_RELIABLE may have been cleared by now. This is needed
1762      * for determining which rendezvous function to use (below).
1763      */
1764     if ( !disable_tsc_sync )
1765         tsc_check_writability();
1766 
1767     /*
1768      * While with constant-rate TSCs the scale factor can be shared, when TSCs
1769      * are not marked as 'reliable', re-sync during rendezvous.
1770      */
1771     if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) &&
1772          !boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
1773         time_calibration_rendezvous_fn = time_calibration_tsc_rendezvous;
1774 
1775     return 0;
1776 }
1777 __initcall(verify_tsc_reliability);

/* Late init function (after interrupts are enabled). */
int __init init_xen_time(void)
{
    tsc_check_writability();

    open_softirq(TIME_CALIBRATE_SOFTIRQ, local_time_calibration);

    /* NB. get_wallclock_time() can take over one second to execute. */
    do_settime(get_wallclock_time(), 0, NOW());

    /* Finish platform timer initialization. */
    try_platform_timer_tail(false);

    return 0;
}


/* Early init function. */
void __init early_time_init(void)
{
    struct cpu_time *t = &this_cpu(cpu_time);
    u64 tmp;

    preinit_pit();
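    /*
     * init_platform_timer() returns the TSC frequency (in Hz), calibrated
     * against the chosen platform timer.
     */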
    tmp = init_platform_timer();
    plt_tsc.frequency = tmp;

    set_time_scale(&t->tsc_scale, tmp);
    t->stamp.local_tsc = boot_tsc_stamp;

    do_div(tmp, 1000);
    cpu_khz = (unsigned long)tmp;
    printk("Detected %lu.%03lu MHz processor.\n",
           cpu_khz / 1000, cpu_khz % 1000);

    setup_irq(0, 0, &irq0);
}

/* Keep the PIT enabled so pit_broadcast keeps working while cpuidle is enabled. */
static int _disable_pit_irq(void (*hpet_broadcast_setup)(void))
{
    int ret = 1;

    if ( using_pit || !cpu_has_apic )
        return -1;

    /*
     * If we do not rely on PIT CH0 then we can use HPET for one-shot timer
     * emulation when entering deep C states.
     * XXX dom0 may rely on RTC interrupt delivery, so only enable
     * hpet_broadcast if FSB mode is available or force_hpet_broadcast is set.
     */
    if ( cpuidle_using_deep_cstate() && !boot_cpu_has(X86_FEATURE_ARAT) )
    {
        hpet_broadcast_setup();
        if ( !hpet_broadcast_is_available() )
        {
            if ( xen_cpuidle > 0 )
            {
                printk("%ps() failed, turning to PIT broadcast\n",
                       hpet_broadcast_setup);
                return -1;
            }
            ret = 0;
        }
    }

    /* Disable PIT CH0 timer interrupt. */
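    /*
     * 0x30 = channel 0, lo/hi byte access, mode 0 (one-shot): once the zero
     * (i.e. maximal) count expires, no further interrupts are generated.
     */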
    outb_p(0x30, PIT_MODE);
    outb_p(0, PIT_CH0);
    outb_p(0, PIT_CH0);

    return ret;
}

static int __init disable_pit_irq(void)
{
    if ( !_disable_pit_irq(hpet_broadcast_init) )
    {
        xen_cpuidle = 0;
        printk("CPUIDLE: disabled due to no HPET. "
               "Force enable with 'cpuidle'.\n");
    }

    return 0;
}
__initcall(disable_pit_irq);

void pit_broadcast_enter(void)
{
    cpumask_set_cpu(smp_processor_id(), &pit_broadcast_mask);
}

void pit_broadcast_exit(void)
{
    int cpu = smp_processor_id();

    if ( cpumask_test_and_clear_cpu(cpu, &pit_broadcast_mask) )
        reprogram_timer(this_cpu(timer_deadline));
}

int pit_broadcast_is_available(void)
{
    return cpuidle_using_deep_cstate();
}

void send_timer_event(struct vcpu *v)
{
    send_guest_vcpu_virq(v, VIRQ_TIMER);
}

/* "cmos_utc_offset" is the difference between UTC time and CMOS time. */
static long cmos_utc_offset; /* in seconds */

int time_suspend(void)
{
    if ( smp_processor_id() == 0 )
    {
        cmos_utc_offset = -get_wallclock_time();
        cmos_utc_offset += get_sec();
        kill_timer(&calibration_timer);

        /* Sync platform timer stamps. */
        platform_time_calibration();
    }

    /* Better to cancel calibration timer for accuracy. */
    clear_bit(TIME_CALIBRATE_SOFTIRQ, &softirq_pending(smp_processor_id()));

    return 0;
}

int time_resume(void)
{
    preinit_pit();

    resume_platform_timer();

    if ( !_disable_pit_irq(hpet_broadcast_resume) )
        BUG();

    init_percpu_time();

    set_timer(&calibration_timer, NOW() + EPOCH);

    do_settime(get_wallclock_time() + cmos_utc_offset, 0, NOW());

    update_vcpu_system_time(current);

    update_domain_rtc();

    return 0;
}

int hwdom_pit_access(struct ioreq *ioreq)
{
    /* Is Xen using Channel 2? Then disallow direct dom0 access. */
    if ( using_pit )
        return 0;

    switch ( ioreq->addr )
    {
    case PIT_CH2:
        if ( ioreq->dir == IOREQ_READ )
            ioreq->data = inb(PIT_CH2);
        else
            outb(ioreq->data, PIT_CH2);
        return 1;

    case PIT_MODE:
        if ( ioreq->dir == IOREQ_READ )
            return 0; /* urk! */
        switch ( ioreq->data & 0xc0 )
        {
        case 0xc0: /* Read Back */
            if ( ioreq->data & 0x08 )    /* Select Channel 2? */
                outb(ioreq->data & 0xf8, PIT_MODE);
            if ( !(ioreq->data & 0x06) ) /* Select Channel 0/1? */
                return 1; /* no - we're done */
            /* Filter Channel 2 and reserved bit 0. */
            ioreq->data &= ~0x09;
            return 0; /* emulate ch0/1 readback */
        case 0x80: /* Select Counter 2 */
            outb(ioreq->data, PIT_MODE);
            return 1;
        }
        break;

    case 0x61:
        if ( ioreq->dir == IOREQ_READ )
            ioreq->data = inb(0x61);
        else
            outb((inb(0x61) & ~3) | (ioreq->data & 3), 0x61);
        return 1;
    }

    return 0;
}

/*
 * PV SoftTSC Emulation.
 */

/*
 * tsc=unstable: Override all tests; assume TSC is unreliable.
 * tsc=skewed: Assume TSCs are individually reliable, but skewed across CPUs.
 * tsc=stable:socket: Assume TSCs are reliable across sockets.
 */
static int __init tsc_parse(const char *s)
{
    if ( !strcmp(s, "unstable") )
    {
        setup_clear_cpu_cap(X86_FEATURE_CONSTANT_TSC);
        setup_clear_cpu_cap(X86_FEATURE_NONSTOP_TSC);
        setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
    }
    else if ( !strcmp(s, "skewed") )
        setup_clear_cpu_cap(X86_FEATURE_TSC_RELIABLE);
    else if ( !strcmp(s, "stable:socket") )
        tsc_flags |= TSC_RELIABLE_SOCKET;
    else
        return -EINVAL;

    return 0;
}
custom_param("tsc", tsc_parse);

u64 gtime_to_gtsc(struct domain *d, u64 time)
{
    if ( !is_hvm_domain(d) )
    {
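        /*
         * Guest time earlier than the vTSC origin maps to a (wrapped)
         * negative guest TSC value.
         */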
        if ( time < d->arch.vtsc_offset )
            return -scale_delta(d->arch.vtsc_offset - time,
                                &d->arch.ns_to_vtsc);
        time -= d->arch.vtsc_offset;
    }
    return scale_delta(time, &d->arch.ns_to_vtsc);
}

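/* Inverse of gtime_to_gtsc(): convert guest TSC back to guest time (ns). */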
u64 gtsc_to_gtime(struct domain *d, u64 tsc)
{
    u64 time = scale_delta(tsc, &d->arch.vtsc_to_ns);

    if ( !is_hvm_domain(d) )
        time += d->arch.vtsc_offset;
    return time;
}

void pv_soft_rdtsc(struct vcpu *v, struct cpu_user_regs *regs, int rdtscp)
{
    s_time_t now = get_s_time();
    struct domain *d = v->domain;

    spin_lock(&d->arch.vtsc_lock);

#if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS)
    if ( guest_kernel_mode(v, regs) )
        d->arch.vtsc_kerncount++;
    else
        d->arch.vtsc_usercount++;
#endif

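    /*
     * Keep the emulated TSC strictly monotonic: never return a value at or
     * below the last one handed out.
     */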
    if ( (int64_t)(now - d->arch.vtsc_last) > 0 )
        d->arch.vtsc_last = now;
    else
        now = ++d->arch.vtsc_last;

    spin_unlock(&d->arch.vtsc_lock);

    msr_split(regs, gtime_to_gtsc(d, now));

    if ( rdtscp )
        regs->rcx =
            (d->arch.tsc_mode == TSC_MODE_PVRDTSCP) ? d->arch.incarnation : 0;
}

bool clocksource_is_tsc(void)
{
    return plt_src.read_counter == read_tsc;
}

int host_tsc_is_safe(void)
{
    return boot_cpu_has(X86_FEATURE_TSC_RELIABLE);
}

/*
 * Called to collect TSC-related data only for a save file or live
 * migration; called after the last rdtsc is done on this incarnation.
 */
void tsc_get_info(struct domain *d, uint32_t *tsc_mode,
                  uint64_t *elapsed_nsec, uint32_t *gtsc_khz,
                  uint32_t *incarnation)
{
    bool enable_tsc_scaling = is_hvm_domain(d) &&
                              hvm_tsc_scaling_supported && !d->arch.vtsc;

    *incarnation = d->arch.incarnation;
    *tsc_mode = d->arch.tsc_mode;

    switch ( *tsc_mode )
    {
        uint64_t tsc;

    case TSC_MODE_NEVER_EMULATE:
        *elapsed_nsec = *gtsc_khz = 0;
        break;
    case TSC_MODE_DEFAULT:
        if ( d->arch.vtsc )
        {
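    /* ALWAYS_EMULATE deliberately jumps into the vtsc branch of DEFAULT. */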
    case TSC_MODE_ALWAYS_EMULATE:
            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
            *gtsc_khz = d->arch.tsc_khz;
            break;
        }
        tsc = rdtsc();
        *elapsed_nsec = scale_delta(tsc, &d->arch.vtsc_to_ns);
        *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz : cpu_khz;
        break;
    case TSC_MODE_PVRDTSCP:
        if ( d->arch.vtsc )
        {
            *elapsed_nsec = get_s_time() - d->arch.vtsc_offset;
            *gtsc_khz = cpu_khz;
        }
        else
        {
            tsc = rdtsc();
            *elapsed_nsec = scale_delta(tsc, &this_cpu(cpu_time).tsc_scale) -
                            d->arch.vtsc_offset;
            *gtsc_khz = enable_tsc_scaling ? d->arch.tsc_khz
                                           : 0 /* ignored by tsc_set_info */;
        }
        break;
    }

    if ( (int64_t)*elapsed_nsec < 0 )
        *elapsed_nsec = 0;
}

/*
 * This may be called as many as three times for a domain: once when the
 * hypervisor creates the domain, once when the toolstack creates the
 * domain, and, if restoring/migrating, once when saved/migrated values
 * are restored.  Care must be taken that, if multiple calls occur, only
 * the last "sticks" and that all are completed before the guest executes
 * an rdtsc instruction.
 */
void tsc_set_info(struct domain *d,
                  uint32_t tsc_mode, uint64_t elapsed_nsec,
                  uint32_t gtsc_khz, uint32_t incarnation)
{
    if ( is_idle_domain(d) || is_hardware_domain(d) )
    {
        d->arch.vtsc = 0;
        return;
    }

    switch ( d->arch.tsc_mode = tsc_mode )
    {
        bool enable_tsc_scaling;

    case TSC_MODE_DEFAULT:
    case TSC_MODE_ALWAYS_EMULATE:
        d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
        d->arch.tsc_khz = gtsc_khz ?: cpu_khz;
        set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);

        /*
         * In default mode use native TSC if the host has safe TSC and
         * host and guest frequencies are the same (either "naturally" or
         * - for HVM/PVH - via TSC scaling).
         * When a guest is created, gtsc_khz is passed in as zero, making
         * d->arch.tsc_khz == cpu_khz. Thus no need to check incarnation.
         */
        if ( tsc_mode == TSC_MODE_DEFAULT && host_tsc_is_safe() &&
             (d->arch.tsc_khz == cpu_khz ||
              (is_hvm_domain(d) &&
               hvm_get_tsc_scaling_ratio(d->arch.tsc_khz))) )
        {
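    /* NEVER_EMULATE jumps straight to the native-TSC path selected above. */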
    case TSC_MODE_NEVER_EMULATE:
            d->arch.vtsc = 0;
            break;
        }
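        /*
         * Otherwise emulate: precompute the ns -> vTSC reciprocal used by
         * gtime_to_gtsc().
         */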
        d->arch.vtsc = 1;
        d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
        break;
    case TSC_MODE_PVRDTSCP:
        d->arch.vtsc = !boot_cpu_has(X86_FEATURE_RDTSCP) ||
                       !host_tsc_is_safe();
        enable_tsc_scaling = is_hvm_domain(d) && !d->arch.vtsc &&
                             hvm_get_tsc_scaling_ratio(gtsc_khz ?: cpu_khz);
        d->arch.tsc_khz = (enable_tsc_scaling && gtsc_khz) ? gtsc_khz : cpu_khz;
        set_time_scale(&d->arch.vtsc_to_ns, d->arch.tsc_khz * 1000);
        d->arch.ns_to_vtsc = scale_reciprocal(d->arch.vtsc_to_ns);
        if ( d->arch.vtsc )
            d->arch.vtsc_offset = get_s_time() - elapsed_nsec;
        else
        {
            /*
             * When using the native TSC, the offset is in nanoseconds
             * relative to power-on of the physical machine.
             */
            d->arch.vtsc_offset = scale_delta(rdtsc(),
                                              &this_cpu(cpu_time).tsc_scale) -
                                  elapsed_nsec;
        }
        break;
    }
    d->arch.incarnation = incarnation + 1;
    if ( is_hvm_domain(d) )
    {
        if ( hvm_tsc_scaling_supported && !d->arch.vtsc )
            d->arch.hvm_domain.tsc_scaling_ratio =
                hvm_get_tsc_scaling_ratio(d->arch.tsc_khz);

        hvm_set_rdtsc_exiting(d, d->arch.vtsc);
        if ( d->vcpu && d->vcpu[0] && incarnation == 0 )
        {
            /*
             * set_tsc_offset() is called from hvm_vcpu_initialise() before
             * tsc_set_info(). New vtsc mode may require recomputing TSC
             * offset.
             * We only need to do this for the BSP during initial boot. APs
             * will call set_tsc_offset() later from hvm_vcpu_reset_state()
             * and will sync their TSC to the BSP's sync_tsc.
             */
            d->arch.hvm_domain.sync_tsc = rdtsc();
            hvm_funcs.set_tsc_offset(d->vcpu[0],
                                     d->vcpu[0]->arch.hvm_vcpu.cache_tsc_offset,
                                     d->arch.hvm_domain.sync_tsc);
        }
    }

    recalculate_cpuid_policy(d);
}

/* vTSC may incur measurable performance degradation; diagnose with this. */
static void dump_softtsc(unsigned char key)
{
    struct domain *d;
    int domcnt = 0;

    tsc_check_reliability();
    if ( boot_cpu_has(X86_FEATURE_TSC_RELIABLE) )
        printk("TSC marked as reliable, "
               "warp = %lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
    else if ( boot_cpu_has(X86_FEATURE_CONSTANT_TSC) )
    {
        printk("TSC has constant rate, ");
        if ( max_cstate <= 2 && tsc_max_warp == 0 )
            printk("no deep Cstates, passed warp test, deemed reliable, ");
        else
            printk("deep Cstates possible, so not reliable, ");
        printk("warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
    }
    else
        printk("TSC not marked as either constant or reliable, "
               "warp=%lu (count=%lu)\n", tsc_max_warp, tsc_check_count);
    for_each_domain ( d )
    {
        if ( is_hardware_domain(d) && d->arch.tsc_mode == TSC_MODE_DEFAULT )
            continue;
        printk("dom%u%s: mode=%d", d->domain_id,
               is_hvm_domain(d) ? "(hvm)" : "", d->arch.tsc_mode);
        if ( d->arch.vtsc_offset )
            printk(",ofs=%#"PRIx64, d->arch.vtsc_offset);
        if ( d->arch.tsc_khz )
            printk(",khz=%"PRIu32, d->arch.tsc_khz);
        if ( d->arch.incarnation )
            printk(",inc=%"PRIu32, d->arch.incarnation);
#if !defined(NDEBUG) || defined(CONFIG_PERF_COUNTERS)
        if ( d->arch.vtsc_kerncount | d->arch.vtsc_usercount )
            printk(",vtsc count: %"PRIu64" kernel,%"PRIu64" user",
                   d->arch.vtsc_kerncount, d->arch.vtsc_usercount);
#endif
        printk("\n");
        domcnt++;
    }

    if ( !domcnt )
        printk("No domains have emulated TSC\n");
}

static int __init setup_dump_softtsc(void)
{
    register_keyhandler('s', dump_softtsc, "dump softtsc stats", 1);
    return 0;
}
__initcall(setup_dump_softtsc);

/*
 * Local variables:
 * mode: C
 * c-file-style: "BSD"
 * c-basic-offset: 4
 * tab-width: 4
 * indent-tabs-mode: nil
 * End:
 */