1 /* pvrdtscp algorithm
2  *
3  * This sample code demonstrates the use of the paravirtualized rdtscp
4  * algorithm.  Using this algorithm, an application may communicate with
5  * the Xen hypervisor (version 4.0+) to obtain timestamp information which
6  * is both monotonically increasing and has a fixed 1 GHz rate, even across
7  * migrations between machines with different TSC rates and offsets.
 * Further, the algorithm provides performance near the performance of a
9  * native rdtsc/rdtscp instruction -- much faster than emulation PROVIDED
10  * the application is running on a machine on which the rdtscp instruction
11  * is supported and TSC is "safe". The application must also be running in a
12  * PV domain.  (HVM domains may be supported at a later time.) On machines
13  * where TSC is unsafe or the rdtscp instruction is not supported, Xen
14  * (v4.0+) provides emulation which is slower but consistent with the pvrdtscp
15  * algorithm, thus providing support for the algorithm for live migration
16  * across all machines.
17  *
18  * More information can be found within the Xen (4.0+) source tree at
19  *  docs/misc/tscmode.txt
20  *
21  * Copyright (c) 2009 Oracle Corporation and/or its affiliates.
22  * All rights reserved
23  * Written by: Dan Magenheimer <dan.magenheimer@oracle.com>
24  *
25  * This code is derived from code licensed under the GNU
26  * General Public License ("GPL") version 2 and is therefore itself
27  * also licensed under the GPL version 2.
28  *
29  * This code is known to compile and run on Oracle Enterprise Linux 5 Update 2
30  * using gcc version 4.1.2, but its purpose is to describe the pvrdtscp
31  * algorithm and its ABI to Xen version 4.0+
32  */
33 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/wait.h>
#include <unistd.h>
38 
/*
 * Fixed-width integer aliases and the nanoseconds-per-second constant.
 *
 * The widths are selected by hand from __LP64__: when it is defined, long is
 * taken to be 64 bits; otherwise long is taken to be 32 bits and long long
 * supplies the 64-bit types.  The __X86_64__ / __X86_32__ macros defined here
 * are this file's own build-flavour markers.
 */
#ifdef __LP64__
#define __X86_64__
typedef unsigned short u16;
typedef unsigned int u32;
typedef unsigned long u64;
typedef int i32;
typedef long i64;
#define NSEC_PER_SEC 1000000000
#else
#define __X86_32__
typedef unsigned short u16;	/* 16 bits, same as in the 64-bit branch */
typedef unsigned long u32;
typedef unsigned long long u64;
typedef long i32;
typedef long long i64;
#define NSEC_PER_SEC 1000000000L
#endif
56 
/*
 * Execute the native cpuid instruction.
 *
 * idx is loaded into eax (the leaf) and sub into ecx (the sub-leaf); the four
 * result registers are stored through eax, ebx, ecx and edx.
 *
 * The inputs are staged in locals rather than through the output pointers,
 * so the requested leaf/sub-leaf are used even if the caller passes the same
 * address for more than one output.  The asm is volatile, matching
 * pv_cpuid(), so repeated queries are not merged by the compiler.
 */
static inline void hvm_cpuid(u32 idx, u32 sub,
				u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
	u32 a = idx, b, c = sub, d;

	asm volatile("cpuid"
	    : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
	    : "0" (a), "2" (c));

	*eax = a;
	*ebx = b;
	*ecx = c;
	*edx = d;
}
64 
/*
 * Execute cpuid preceded by the byte sequence "ud2a" + the ASCII string
 * "xen".  (Presumably this is the prefix a Xen PV guest uses to ask the
 * hypervisor to emulate the following cpuid; outside of Xen the ud2a is
 * expected to fault -- main() probes for that in a child process.)
 *
 * idx is loaded into eax (the leaf) and sub into ecx (the sub-leaf); the four
 * result registers are stored through eax, ebx, ecx and edx.
 *
 * The inputs are staged in locals rather than through the output pointers,
 * so the requested leaf/sub-leaf are used even if the caller passes the same
 * address for more than one output.
 */
static inline void pv_cpuid(u32 idx, u32 sub,
				u32 *eax, u32 *ebx, u32 *ecx, u32 *edx)
{
	u32 a = idx, b, c = sub, d;

	asm volatile ( "ud2a ; .ascii \"xen\"; cpuid"
	    : "=a" (a), "=b" (b), "=c" (c), "=d" (d)
	    : "0" (a), "2" (c));

	*eax = a;
	*ebx = b;
	*ecx = c;
	*edx = d;
}
72 
do_rdtscp(u32 * aux)73 static inline u64 do_rdtscp(u32 *aux)
74 {
75 static u64 last = 0;
76 	u32 lo32, hi32;
77 	u64 val;
78 
79 	asm volatile(".byte 0x0f,0x01,0xf9":"=a"(lo32),"=d"(hi32),"=c" (*aux));
80 	val = lo32 | ((u64)hi32 << 32);
81 	return val;
82 }
83 
get_xen_tsc_mode(void)84 static inline int get_xen_tsc_mode(void)
85 {
86 	u32 val, dummy1, dummy2, dummy3;
87 	pv_cpuid(0x40000003,0,&dummy1,&val,&dummy2,&dummy3);
88 	return val;
89 }
90 
get_xen_vtsc(void)91 static inline int get_xen_vtsc(void)
92 {
93 	u32 val, dummy1, dummy2, dummy3;
94 	pv_cpuid(0x40000003,0,&val,&dummy1,&dummy2,&dummy3);
95 	return val & 1;
96 }
97 
get_xen_vtsc_khz(void)98 static inline int get_xen_vtsc_khz(void)
99 {
100 	u32 val, dummy1, dummy2, dummy3;
101 	pv_cpuid(0x40000003,0,&dummy1,&dummy2,&val,&dummy3);
102 	return val;
103 }
104 
get_xen_cpu_khz(void)105 static inline u32 get_xen_cpu_khz(void)
106 {
107 	u32 cpu_khz, dummy1, dummy2, dummy3;
108 	pv_cpuid(0x40000003,2,&cpu_khz,&dummy1,&dummy2,&dummy3);
109 	return cpu_khz;
110 }
111 
get_xen_incarnation(void)112 static inline u32 get_xen_incarnation(void)
113 {
114 	u32 incarn, dummy1, dummy2, dummy3;
115 	pv_cpuid(0x40000003,0,&dummy1,&dummy2,&dummy3,&incarn);
116 	return incarn;
117 }
118 
/*
 * Fetch the TSC-to-nanosecond conversion parameters from PV cpuid leaf
 * 0x40000003, sub-leaf 1.
 *
 *   offset   - receives the 64-bit offset assembled from eax (low half) and
 *              ebx (high half)
 *   mul_frac - receives ecx
 *   shift    - receives edx
 */
static inline void get_xen_time_values(u64 *offset, u32 *mul_frac, u32 *shift)
{
	u32 off_lo, off_hi;

	pv_cpuid(0x40000003,1,&off_lo,&off_hi,mul_frac,shift);
	*offset = off_lo | ((u64)off_hi << 32);
}
126 
/*
 * Scale a 64-bit count by a 32.32 fixed-point factor.
 *
 * delta is first shifted left by tsc_shift (or right by -tsc_shift when
 * tsc_shift is negative), then multiplied by tsc_mul_frac; the result is
 * bits 32..95 of that product, i.e. (delta * tsc_mul_frac) >> 32 truncated
 * to 64 bits.
 *
 * The computation is done in portable C instead of per-architecture inline
 * assembly.  Splitting delta into 32-bit halves gives
 *     delta * mul = (hi * mul) << 32  +  lo * mul
 * so the wanted value is hi * mul + ((lo * mul) >> 32), where each partial
 * product fits in 64 bits and the final sum wraps modulo 2^64.
 *
 * The caller must keep |tsc_shift| below 64; larger shift counts are
 * undefined.
 */
static inline u64 scale_delta(u64 delta, u32 tsc_mul_frac, i32 tsc_shift)
{
    u64 lo_product, hi_product;

    if ( tsc_shift < 0 )
        delta >>= -tsc_shift;
    else
        delta <<= tsc_shift;

    lo_product = (u64)(u32)delta * tsc_mul_frac;
    hi_product = (delta >> 32) * tsc_mul_frac;

    return hi_product + (lo_product >> 32);
}
158 
get_pvrdtscp_timestamp(int * discontinuity)159 static inline u64 get_pvrdtscp_timestamp(int *discontinuity)
160 {
161 	static int firsttime = 1;
162 	static u64 last_pvrdtscp_timestamp = 0;
163 	static u32 last_tsc_aux;
164 	static u64 xen_ns_offset;
165 	static u32 xen_tsc_to_ns_mul_frac, xen_tsc_to_ns_shift;
166 	u32 this_tsc_aux;
167 	u64 timestamp, cur_tsc, cur_ns;
168 
169 	if (firsttime) {
170 		cur_tsc = do_rdtscp(&last_tsc_aux);
171 		get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
172 					&xen_tsc_to_ns_shift);
173 		cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
174 					xen_tsc_to_ns_shift);
175 		timestamp = cur_ns - xen_ns_offset;
176 		last_pvrdtscp_timestamp = timestamp;
177 		firsttime = 0;
178 	}
179 	cur_tsc = do_rdtscp(&this_tsc_aux);
180 	*discontinuity = 0;
181 	while (this_tsc_aux != last_tsc_aux) {
182 		/* if tsc_aux changed, try again */
183 		last_tsc_aux = this_tsc_aux;
184 		get_xen_time_values(&xen_ns_offset, &xen_tsc_to_ns_mul_frac,
185 					&xen_tsc_to_ns_shift);
186 		cur_tsc = do_rdtscp(&this_tsc_aux);
187 		*discontinuity = 1;
188 	}
189 
190 	/* compute nsec from TSC and Xen time values */
191 	cur_ns = scale_delta(cur_tsc, xen_tsc_to_ns_mul_frac,
192 					xen_tsc_to_ns_shift);
193 	timestamp = cur_ns - xen_ns_offset;
194 
195 	/* enforce monotonicity just in case */
196 	if ((i64)(timestamp - last_pvrdtscp_timestamp) > 0)
197 		last_pvrdtscp_timestamp = timestamp;
198 	else {
199 		/* this should never happen but we'll check it anyway in
200 		 * case of some strange combination of scaling errors
201 		 * occurs across a very fast migration */
202 		printf("Time went backwards by %lluns\n",
203 		    (unsigned long long)(last_pvrdtscp_timestamp-timestamp));
204 		timestamp = ++last_pvrdtscp_timestamp;
205 	}
206 	return timestamp;
207 }
208 
209 #define HVM 1
210 #define PVM 0
211 
running_on_xen(int hvm,u16 * version_major,u16 * version_minor)212 static int running_on_xen(int hvm, u16 *version_major, u16 *version_minor)
213 {
214 	u32 eax, ebx, ecx, edx, base;
215 	union { char csig[16]; u32 u[4]; } sig;
216 
217 	for (base=0x40000000; base < 0x40010000; base += 0x100) {
218 		if (hvm==HVM)
219 			hvm_cpuid(base,0,&eax,&ebx,&ecx,&edx);
220 		else
221 			pv_cpuid(base,0,&eax,&ebx,&ecx,&edx);
222 		sig.u[0] = ebx; sig.u[1] = ecx; sig.u[2] = edx;
223 		sig.csig[12] = '\0';
224 		if (!strcmp("XenVMMXenVMM",&sig.csig[0]) && (eax >= (base+2))) {
225 				if (hvm==HVM)
226 					hvm_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
227 				else
228 					pv_cpuid(base+1,0,&eax,&ebx,&ecx,&edx);
229 				*version_major = (eax >> 16) & 0xffff;
230 				*version_minor = eax & 0xffff;
231 				return 1;
232 		}
233 	}
234 	return 0;
235 }
236 
main(int ac,char ** av)237 main(int ac, char **av)
238 {
239 	u32 dummy;
240 	u16 version_hi, version_lo;
241 	u64 ts, last_ts;
242 	int status, discontinuity = 0;
243 	pid_t pid;
244 
245 	if (running_on_xen(HVM,&version_hi,&version_lo)) {
246 		printf("running on Xen v%d.%d as an HVM domain, "
247 			"pvrdtsc not supported, exiting\n",
248 			(int)version_hi, (int)version_lo);
249 		exit(0);
250 	}
251 	pid = fork();
252 	if (pid == -1) {
253 		fprintf(stderr,"Huh? Fork failed\n");
254 		return 0;
255 	}
256 	else if (pid == 0) { /* child */
257 		pv_cpuid(0x40000000,0,&dummy,&dummy,&dummy,&dummy);
258 		exit(0);
259 	}
260 	waitpid(pid,&status,0);
261 	if (!WIFEXITED(status))
262 		exit(0);
263 	if (!running_on_xen(PVM,&version_hi,&version_lo)) {
264 		printf("not running on Xen, exiting\n");
265 		exit(0);
266 	}
267 	printf("running on Xen v%d.%d as a PV domain\n",
268 		(int)version_hi, (int)version_lo);
269 	if ( version_hi <= 3 ) {
270 		printf("pvrdtscp requires Xen version 4.0 or greater\n");
271 		/* exit(0); FIXME after xen-unstable is officially v4.0 */
272 	}
273 	if ( get_xen_tsc_mode() != 3 )
274 		printf("tsc_mode not pvrdtscp, set tsc_mode=3, exiting\n");
275 
276 	/* OK, we are on Xen, now loop forever checking timestamps */
277 	ts = get_pvrdtscp_timestamp(&discontinuity);
278 	printf("Starting with ts=%lluns 0x%llx (%llusec)\n",ts,ts,ts/NSEC_PER_SEC);
279 	printf("incarn=%d: vtsc=%d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
280 				(unsigned long)get_xen_incarnation(),
281 				(unsigned long)get_xen_vtsc(),
282 				(unsigned long)get_xen_vtsc_khz(),
283 				(unsigned long)get_xen_cpu_khz());
284 	ts = get_pvrdtscp_timestamp(&discontinuity);
285 	last_ts = ts;
286 	while (1) {
287 		ts = get_pvrdtscp_timestamp(&discontinuity);
288 		if (discontinuity)
289 			printf("migrated/restored, incarn=%d: "
290                                "vtsc now %d, vtsc_khz=%lu, phys cpu_khz=%lu\n",
291 				(unsigned long)get_xen_incarnation(),
292 				(unsigned long)get_xen_vtsc(),
293 				(unsigned long)get_xen_vtsc_khz(),
294 				(unsigned long)get_xen_cpu_khz());
295 		if (ts < last_ts)
296 			/* this should NEVER happen, especially since there
297 			 * is a check for it in get_pvrdtscp_timestamp() */
298 			printf("Time went backwards: %lluns (%llusec)\n",
299 				last_ts-ts,(last_ts-ts)/NSEC_PER_SEC);
300 		if (ts > last_ts + 200000000LL)
301 			/* this is OK, usually about 2sec for save/restore
302 			 * and a fraction of a second for live migrate */
303 			printf("Time jumped forward %lluns (%llusec)\n",
304 				ts-last_ts,(ts-last_ts)/NSEC_PER_SEC);
305 		last_ts = ts;
306 	}
307 }
308