1 /* pvrdtscp algorithm
2 *
3 * This sample code demonstrates the use of the paravirtualized rdtscp
4 * algorithm. Using this algorithm, an application may communicate with
5 * the Xen hypervisor (version 4.0+) to obtain timestamp information which
6 * is both monotonically increasing and has a fixed 1 GHz rate, even across
7 * migrations between machines with different TSC rates and offsets.
8 * Further, the algorithm provides performance near the performance of a
9 * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED
10 * the application is running on a machine on which the rdtscp instruction
11 * is supported and TSC is "safe". The application must also be running in a
12 * PV domain. (HVM domains may be supported at a later time.) On machines
13 * where TSC is unsafe or the rdtscp instruction is not supported, Xen
14 * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp
15 * algorithm, thus providing support for the algorithm for live migration
16 * across all machines.
17 *
18 * More information can be found within the Xen (4.0+) source tree at
19 * docs/misc/tscmode.txt
20 *
21 * Copyright (c) 2009 Oracle Corporation and/or its affiliates.
22 * All rights reserved
23 * Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
24 *
25 * This code is derived from code licensed under the GNU
26 * General Public License ("GPL") version 2 and is therefore itself
27 * also licensed under the GPL version 2.
28 *
29 * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2
30 * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp
31 * algorithm and its ABI to Xen version 4.0+
32 */
33
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>
38
/*
 * Fixed-width integer aliases and the nanoseconds-per-second constant.
 *
 * The widths are chosen per data model: __LP64__ selects the 64-bit layout
 * (long is 64 bits), anything else is treated as 32-bit x86 (long is 32
 * bits, long long is 64 bits).  The branch taken also defines __X86_64__
 * or __X86_32__, which scale_delta() tests to pick its asm sequence.
 *
 * u16 is "unsigned short" in both branches so that it is really 16 bits
 * wide on 32-bit builds as well.
 */
#ifdef __LP64__
#define __X86_64__
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long u64;
typedef int i32;
typedef long i64;
#define NSEC_PER_SEC 1000000000
#else
#define __X86_32__
typedef unsigned short u16;
typedef unsigned long u32;
typedef unsigned long long u64;
typedef long i32;
typedef long long i64;
#define NSEC_PER_SEC 1000000000L
#endif
56
/*
 * Execute a plain CPUID instruction.
 *
 * idx is loaded into EAX and sub into ECX before the instruction; the
 * resulting EAX/EBX/ECX/EDX are stored through the four output pointers.
 *
 * The inputs are passed to the asm directly from idx/sub instead of being
 * staged through *eax and *ecx, so the query stays correct even when a
 * caller passes the same pointer for several outputs.  The asm is volatile
 * (like pv_cpuid) so the compiler never merges or drops repeated queries.
 */
static inline void hvm_cpuid(u32 idx, u32 sub,
                             u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
    u32 a, b, c, d;

    asm volatile ( "cpuid"
                   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                   : "0" (idx), "2" (sub) );

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
}
64
/*
 * Execute CPUID preceded by the byte sequence "ud2a ; .ascii "xen"".
 *
 * idx is loaded into EAX and sub into ECX; the resulting EAX/EBX/ECX/EDX
 * are stored through the four output pointers.
 *
 * NOTE(review): the prefix is presumably the marker a Xen PV guest uses to
 * have the hypervisor handle CPUID; main() probes this in a forked child
 * first because the child may not exit normally when the sequence is not
 * handled.
 *
 * The inputs are passed to the asm directly from idx/sub instead of being
 * staged through *eax and *ecx.  Staging broke when a caller passed the
 * same pointer for eax and ecx (as main() does): the store of sub
 * overwrote idx and the wrong leaf was queried.
 */
static inline void pv_cpuid(u32 idx, u32 sub,
                            u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
    u32 a, b, c, d;

    asm volatile ( "ud2a ; .ascii \"xen\"; cpuid"
                   : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
                   : "0" (idx), "2" (sub) );

    *eax = a;
    *ebx = b;
    *ecx = c;
    *edx = d;
}
72
do_rdtscp(u32 * aux)73 static inline u64 do_rdtscp(u32 *aux)
74 {
75 static u64 last = 0;
76 u32 lo32, hi32;
77 u64 val;
78
79 asm volatile(".byte 0x0f,0x01,0xf9":"=a"(lo32),"=d"(hi32),"=c" (*aux));
80 val = lo32 | ((u64)hi32 << 32);
81 return val;
82 }
83
get_xen_tsc_mode(void)84 static inline int get_xen_tsc_mode(void)
85 {
86 u32 val, dummy1, dummy2, dummy3;
87 pv_cpuid(0x40000003,0,&dummy1,&val,&dummy2,&dummy3);
88 return val;
89 }
90
get_xen_vtsc(void)91 static inline int get_xen_vtsc(void)
92 {
93 u32 val, dummy1, dummy2, dummy3;
94 pv_cpuid(0x40000003,0,&val,&dummy1,&dummy2,&dummy3);
95 return val & 1;
96 }
97
get_xen_vtsc_khz(void)98 static inline int get_xen_vtsc_khz(void)
99 {
100 u32 val, dummy1, dummy2, dummy3;
101 pv_cpuid(0x40000003,0,&dummy1,&dummy2,&val,&dummy3);
102 return val;
103 }
104
get_xen_cpu_khz(void)105 static inline u32 get_xen_cpu_khz(void)
106 {
107 u32 cpu_khz, dummy1, dummy2, dummy3;
108 pv_cpuid(0x40000003,2,&cpu_khz,&dummy1,&dummy2,&dummy3);
109 return cpu_khz;
110 }
111
get_xen_incarnation(void)112 static inline u32 get_xen_incarnation(void)
113 {
114 u32 incarn, dummy1, dummy2, dummy3;
115 pv_cpuid(0x40000003,0,&dummy1,&dummy2,&dummy3,&incarn);
116 return incarn;
117 }
118
/*
 * Fetch the TSC-to-nanosecond conversion parameters from leaf 0x40000003,
 * subleaf 1 (via pv_cpuid()).
 *
 *   *offset   <- 64-bit value assembled from EAX (low) and EBX (high)
 *   *mul_frac <- ECX
 *   *shift    <- EDX
 *
 * get_pvrdtscp_timestamp() feeds mul_frac/shift to scale_delta() and
 * subtracts offset from the result.
 */
static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift)
{
    u32 off_lo, off_hi;

    pv_cpuid(0x40000003, 1, &off_lo, &off_hi, mul_frac, shift);
    *offset = off_lo | ((u64)off_hi << 32);
}
126
/*
 * Scale a 64-bit delta by a 32.32 fixed-point factor.
 *
 * delta is first shifted by tsc_shift (left when non-negative, right by
 * -tsc_shift when negative), then multiplied by tsc_mul_frac, and the
 * product is shifted right by 32 bits.  The low 64 bits of that result
 * are returned.
 *
 * The 64-bit asm overwrites RDX as a side effect of "mul", so RDX is
 * declared as an output (tied to the multiplier input) rather than as an
 * input only; otherwise the compiler could assume RDX still holds
 * tsc_mul_frac after the asm.
 *
 * NOTE(review): |tsc_shift| is assumed to be smaller than 64; larger
 * shift counts are undefined in C.
 */
static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift)
{
    u64 product;
#ifdef __X86_32__
    u32 tmp1, tmp2;
#else
    u64 rdx_scratch;
#endif

    if ( tsc_shift < 0 )
        delta >>= -tsc_shift;
    else
        delta <<= tsc_shift;

#ifdef __X86_32__
    /* 64x32 multiply built from two 32x32 "mul" steps; result in edx:eax */
    asm (
        "mul  %5       ; "
        "mov  %4,%%eax ; "
        "mov  %%edx,%4 ; "
        "mul  %5       ; "
        "xor  %5,%5    ; "
        "add  %4,%%eax ; "
        "adc  %5,%%edx ; "
        : "=A" (product), "=r" (tmp1), "=r" (tmp2)
        : "a" ((u32)delta), "1" ((u32)(delta >> 32)), "2" (tsc_mul_frac) );
#else
    /* rdx:rax = rax * rdx, then shift the 128-bit product right by 32 */
    asm (
        "mul %%rdx ; shrd $32,%%rdx,%%rax"
        : "=a" (product), "=d" (rdx_scratch)
        : "0" (delta), "1" ((u64)tsc_mul_frac) );
    (void)rdx_scratch;
#endif

    return product;
}
158
get_pvrdtscp_timestamp(int * discontinuity)159 static inline u64 get_pvrdtscp_timestamp(int *discontinuity)
160 {
161 static int firsttime = 1;
162 static u64 last_pvrdtscp_timestamp = 0;
163 static u32 last_tsc_aux;
164 static u64 xen_ns_offset;
165 static u32 xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift;
166 u32 this_tsc_aux;
167 u64 timestamp, cur_tsc, cur_ns;
168
169 if (firsttime) {
170 cur_tsc = do_rdtscp(&last_tsc_aux);
171 get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
172 &xen_tsc_to_ns_shift);
173 cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
174 xen_tsc_to_ns_shift);
175 timestamp = cur_ns - xen_ns_offset;
176 last_pvrdtscp_timestamp = timestamp;
177 firsttime = 0;
178 }
179 cur_tsc = do_rdtscp(&this_tsc_aux);
180 *discontinuity = 0;
181 while (this_tsc_aux != last_tsc_aux) {
182 /* if tsc_aux changed, try again */
183 last_tsc_aux = this_tsc_aux;
184 get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
185 &xen_tsc_to_ns_shift);
186 cur_tsc = do_rdtscp(&this_tsc_aux);
187 *discontinuity = 1;
188 }
189
190 /* compute nsec from TSC and Xen time values */
191 cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
192 xen_tsc_to_ns_shift);
193 timestamp = cur_ns - xen_ns_offset;
194
195 /* enforce monotonicity just in case */
196 if ((i64)(timestamp - last_pvrdtscp_timestamp) > 0)
197 last_pvrdtscp_timestamp = timestamp;
198 else {
199 /* this should never happen but we'll check it anyway in
200 * case of some strange combination of scaling errors
201 * occurs across a very fast migration */
202 printf("Time went backwards by %lluns\n",
203 (unsigned long long)(last_pvrdtscp_timestamp-timestamp));
204 timestamp = ++last_pvrdtscp_timestamp;
205 }
206 return timestamp;
207 }
208
209 #define HVM 1
210 #define PVM 0
211
running_on_xen(int hvm,u16 * version_major,u16 * version_minor)212 static int running_on_xen(int hvm, u16 *version_major, u16 *version_minor)
213 {
214 u32 eax, ebx, ecx, edx, base;
215 union { char csig[16]; u32 u[4]; } sig;
216
217 for (base=0x40000000; base < 0x40010000; base += 0x100) {
218 if (hvm==HVM)
219 hvm_cpuid(base,0,&eax,&ebx,&ecx,&edx);
220 else
221 pv_cpuid(base,0,&eax,&ebx,&ecx,&edx);
222 sig.u[0] = ebx; sig.u[1] = ecx; sig.u[2] = edx;
223 sig.csig[12] = '\0';
224 if (!strcmp("XenVMMXenVMM",&sig.csig[0]) && (eax >= (base+2))) {
225 if (hvm==HVM)
226 hvm_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
227 else
228 pv_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
229 *version_major = (eax >> 16) & 0xffff;
230 *version_minor = eax & 0xffff;
231 return 1;
232 }
233 }
234 return 0;
235 }
236
main(int ac,char ** av)237 main(int ac, char **av)
238 {
239 u32 dummy;
240 u16 version_hi, version_lo;
241 u64 ts, last_ts;
242 int status, discontinuity = 0;
243 pid_t pid;
244
245 if (running_on_xen(HVM,&version_hi,&version_lo)) {
246 printf("running on Xen v%d.%d as an HVM domain, "
247 "pvrdtsc not supported, exiting\n",
248 (int)version_hi, (int)version_lo);
249 exit(0);
250 }
251 pid = fork();
252 if (pid == -1) {
253 fprintf(stderr,"Huh? Fork failed\n");
254 return 0;
255 }
256 else if (pid == 0) { /* child */
257 pv_cpuid(0x40000000,0,&dummy,&dummy,&dummy,&dummy);
258 exit(0);
259 }
260 waitpid(pid,&status,0);
261 if (!WIFEXITED(status))
262 exit(0);
263 if (!running_on_xen(PVM,&version_hi,&version_lo)) {
264 printf("not running on Xen, exiting\n");
265 exit(0);
266 }
267 printf("running on Xen v%d.%d as a PV domain\n",
268 (int)version_hi, (int)version_lo);
269 if ( version_hi <= 3 ) {
270 printf("pvrdtscp requires Xen version 4.0 or greater\n");
271 /* exit(0); FIXME after xen-unstable is officially v4.0 */
272 }
273 if ( get_xen_tsc_mode() != 3 )
274 printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n");
275
276 /* OK, we are on Xen, now loop forever checking timestamps */
277 ts = get_pvrdtscp_timestamp(&discontinuity);
278 printf("Starting with ts=%lluns 0x%llx (%llusec)\n",ts,ts,ts/NSEC_PER_SEC);
279 printf("incarn=%d: vtsc=%d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
280 (unsigned long)get_xen_incarnation(),
281 (unsigned long)get_xen_vtsc(),
282 (unsigned long)get_xen_vtsc_khz(),
283 (unsigned long)get_xen_cpu_khz());
284 ts = get_pvrdtscp_timestamp(&discontinuity);
285 last_ts = ts;
286 while (1) {
287 ts = get_pvrdtscp_timestamp(&discontinuity);
288 if (discontinuity)
289 printf("migrated/restored, incarn=%d: "
290 "vtsc now %d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
291 (unsigned long)get_xen_incarnation(),
292 (unsigned long)get_xen_vtsc(),
293 (unsigned long)get_xen_vtsc_khz(),
294 (unsigned long)get_xen_cpu_khz());
295 if (ts < last_ts)
296 /* this should NEVER happen, especially since there
297 * is a check for it in get_pvrdtscp_timestamp() */
298 printf("Time went backwards: %lluns (%llusec)\n",
299 last_ts-ts,(last_ts-ts)/NSEC_PER_SEC);
300 if (ts > last_ts + 200000000LL)
301 /* this is OK, usually about 2sec for save/restore
302 * and a fraction of a second for live migrate */
303 printf("Time jumped forward %lluns (%llusec)\n",
304 ts-last_ts,(ts-last_ts)/NSEC_PER_SEC);
305 last_ts = ts;
306 }
307 }
308