/* --------------------------------------------------------------------------
 * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
 * This implementation is hereby placed in the public domain.
 * The authors offer no warranty. Use at your own risk.
 * Please send bug reports to the authors.
 * Last modified: 17 APR 08, 1700 PDT
 * ----------------------------------------------------------------------- */

/* start for Xen */
#include <xen/init.h>
#include <xen/byteswap.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <crypto/vmac.h>
#define UINT64_C(x)  x##ULL
/* end for Xen */

/* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
#ifndef VMAC_ARCH_64
#define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
#endif

/* Enable code tuned for Intel SSE2 instruction set                   */
#if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
#define VMAC_USE_SSE2    1
#include <emmintrin.h>
#endif

/* Native word reads. Update (or define via compiler) if incorrect */
#ifndef VMAC_ARCH_BIG_ENDIAN       /* Assume big-endian unless on the list */
#define VMAC_ARCH_BIG_ENDIAN \
    (!(__x86_64__ || __i386__ || _M_IX86 || \
       _M_X64 || __ARMEL__ || __MIPSEL__))
#endif

/* ----------------------------------------------------------------------- */
/* Constants and masks                                                     */

const uint64_t p64   = UINT64_C(0xfffffffffffffeff);  /* 2^64 - 257 prime  */
const uint64_t m62   = UINT64_C(0x3fffffffffffffff);  /* 62-bit mask       */
const uint64_t m63   = UINT64_C(0x7fffffffffffffff);  /* 63-bit mask       */
const uint64_t m64   = UINT64_C(0xffffffffffffffff);  /* 64-bit mask       */
const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff);  /* Poly key mask     */
/* ----------------------------------------------------------------------- *
 * The following routines are used in this implementation. They are
 * written via macros to simulate zero-overhead call-by-reference.
 * All have default implementations for when they are not defined in an
 * architecture-specific manner.
 *
 * MUL64: 64x64->128-bit multiplication
 * PMUL64: assumes top bits cleared on inputs
 * ADD128: 128x128->128-bit addition
 * ----------------------------------------------------------------------- */
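
/* Illustrative sketch (not used by the code below): accumulating two
 * 64x64-bit products into a 128-bit sum, where a0,b0,a1,b1 stand for
 * arbitrary 64-bit operands.
 *
 *     uint64_t rh, rl, th, tl;
 *     MUL64(rh, rl, a0, b0);      // rh:rl  = a0 * b0
 *     MUL64(th, tl, a1, b1);      // th:tl  = a1 * b1
 *     ADD128(rh, rl, th, tl);     // rh:rl += th:tl (with carry)
 */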

/* ----------------------------------------------------------------------- */
#if (__GNUC__ && (__x86_64__ || __amd64__))
/* ----------------------------------------------------------------------- */

#define ADD128(rh,rl,ih,il)                                               \
    asm ("addq %3, %1 \n\t"                                               \
         "adcq %2, %0"                                                    \
    : "+r"(rh),"+r"(rl)                                                   \
    : "r"(ih),"r"(il) : "cc");

#define MUL64(rh,rl,i1,i2)                                                \
    asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")

#define PMUL64 MUL64


/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && __ppc64__)
/* ----------------------------------------------------------------------- */

#define ADD128(rh,rl,ih,il)                                               \
    asm volatile (  "addc %1, %1, %3 \n\t"                                \
                    "adde %0, %0, %2"                                     \
    : "+r"(rh),"+r"(rl)                                                   \
    : "r"(ih),"r"(il));

#define MUL64(rh,rl,i1,i2)                                                \
{ uint64_t _i1 = (i1), _i2 = (i2);                                        \
    rl = _i1 * _i2;                                                       \
    asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
}

#define PMUL64 MUL64

/* ----------------------------------------------------------------------- */
#elif _MSC_VER
/* ----------------------------------------------------------------------- */

#include <intrin.h>

#if (_M_IA64 || _M_X64) && \
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
#define MUL64(rh,rl,i1,i2)   (rl) = _umul128(i1,i2,&(rh));
#pragma intrinsic(_umul128)
#define PMUL64 MUL64
#endif

/* MSVC uses add, adc in this version */
#define ADD128(rh,rl,ih,il)                                          \
    {   uint64_t _il = (il);                                         \
        (rl) += (_il);                                               \
        (rh) += (ih) + ((rl) < (_il));                               \
    }

#if _MSC_VER >= 1400 && \
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
#define MUL32(i1,i2)    (__emulu((uint32_t)(i1),(uint32_t)(i2)))
#pragma intrinsic(__emulu)
#endif

/* ----------------------------------------------------------------------- */
#endif
/* ----------------------------------------------------------------------- */

#if __GNUC__
#define ALIGN(n)      __attribute__ ((aligned(n)))
#define NOINLINE      __attribute__ ((noinline))
#elif _MSC_VER
#define ALIGN(n)      __declspec(align(n))
#define NOINLINE      __declspec(noinline)
#else
#define ALIGN(n)
#define NOINLINE
#endif

/* ----------------------------------------------------------------------- */
/* Default implementations, if not defined above                           */
/* ----------------------------------------------------------------------- */
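
/* The portable fallbacks below build a 64x64->128-bit product from four
 * 32x32->64-bit partial products (schoolbook multiplication), propagating
 * carries with ADD128. */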

#ifndef ADD128
#define ADD128(rh,rl,ih,il)                                              \
    {   uint64_t _il = (il);                                             \
        (rl) += (_il);                                                   \
        if ((rl) < (_il)) (rh)++;                                        \
        (rh) += (ih);                                                    \
    }
#endif

#ifndef MUL32
#define MUL32(i1,i2)    ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
#endif

#ifndef PMUL64              /* rh may not be same as i1 or i2 */
#define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow     */         \
    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
        uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2);            \
        rh         = MUL32(_i1>>32,_i2>>32);                             \
        rl         = MUL32(_i1,_i2);                                     \
        ADD128(rh,rl,(m >> 32),(m << 32));                               \
    }
#endif

#ifndef MUL64
#define MUL64(rh,rl,i1,i2)                                               \
    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
        uint64_t m1= MUL32(_i1,_i2>>32);                                 \
        uint64_t m2= MUL32(_i1>>32,_i2);                                 \
        rh         = MUL32(_i1>>32,_i2>>32);                             \
        rl         = MUL32(_i1,_i2);                                     \
        ADD128(rh,rl,(m1 >> 32),(m1 << 32));                             \
        ADD128(rh,rl,(m2 >> 32),(m2 << 32));                             \
    }
#endif

/* ----------------------------------------------------------------------- */

#if (VMAC_PREFER_BIG_ENDIAN)
#  define get64PE get64BE
#else
#  define get64PE get64LE
#endif

#if (VMAC_ARCH_BIG_ENDIAN)
#  define get64BE(ptr) (*(uint64_t *)(ptr))
#  define get64LE(ptr) bswap64(*(uint64_t *)(ptr))
#else /* assume little-endian */
#  define get64BE(ptr) bswap64(*(uint64_t *)(ptr))
#  define get64LE(ptr) (*(uint64_t *)(ptr))
#endif
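
/* Note: get64BE/get64LE load a 64-bit word and interpret its bytes as
 * big-/little-endian respectively, byte-swapping when the host order
 * differs.  get64PE ("preferred endian") is the order used when reading
 * message words; it defaults to little-endian unless
 * VMAC_PREFER_BIG_ENDIAN is defined. */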


/* --------------------------------------------------------------------- *
 * For highest performance the L1 NH and L2 polynomial hashes should be
 * carefully implemented to take advantage of one's target architecture.
 * Here these two hash functions are defined multiple times; once for
 * 64-bit architectures, once for 32-bit SSE2 architectures, and once
 * for the remaining (32-bit) architectures.
 * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
 * Optionally, nh_vmac_nhbytes can be defined (for multiples of
 * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
 * NH computations at once).
 * --------------------------------------------------------------------- */
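
/* In all variants, nh_16() computes the NH compression of nw 64-bit words:
 * the 128-bit sum (mod 2^128), returned in rh:rl, of the products
 * (m[i] + k[i]) * (m[i+1] + k[i+1]) over even i, where message words are
 * read with get64PE() and the per-word additions wrap mod 2^64. */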

/* ----------------------------------------------------------------------- */
#if VMAC_ARCH_64
/* ----------------------------------------------------------------------- */

#define nh_16(mp, kp, nw, rh, rl)                                            \
{   int i; uint64_t th, tl;                                                  \
    rh = rl = 0;                                                             \
    for (i = 0; i < nw; i+= 2) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
    }                                                                        \
}
#define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1)                                \
{   int i; uint64_t th, tl;                                                  \
    rh1 = rl1 = rh = rl = 0;                                                 \
    for (i = 0; i < nw; i+= 2) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
        ADD128(rh1,rl1,th,tl);                                               \
    }                                                                        \
}

#if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                                  \
{   int i; uint64_t th, tl;                                                  \
    rh = rl = 0;                                                             \
    for (i = 0; i < nw; i+= 8) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
        ADD128(rh,rl,th,tl);                                                 \
    }                                                                        \
}
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1)                      \
{   int i; uint64_t th, tl;                                                  \
    rh1 = rl1 = rh = rl = 0;                                                 \
    for (i = 0; i < nw; i+= 8) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
        ADD128(rh1,rl1,th,tl);                                               \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
        ADD128(rh1,rl1,th,tl);                                               \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
        ADD128(rh1,rl1,th,tl);                                               \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
        ADD128(rh1,rl1,th,tl);                                               \
    }                                                                        \
}
#endif

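/* poly_step() performs one step of the L2 polynomial hash: the accumulator
 * in (ah,al) is multiplied by the key (kh,kl) and the NH output (mh,ml) is
 * added, modulo 2^127 - 1.  The reduction here is only partial; the final
 * reduction happens in l3hash(). */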
#define poly_step(ah, al, kh, kl, mh, ml)                   \
{   uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0;             \
    /* compute ab*cd, put bd into result registers */       \
    PMUL64(t3h,t3l,al,kh);                                  \
    PMUL64(t2h,t2l,ah,kl);                                  \
    PMUL64(t1h,t1l,ah,2*kh);                                \
    PMUL64(ah,al,al,kl);                                    \
    /* add 2 * ac to result */                              \
    ADD128(ah,al,t1h,t1l);                                  \
    /* add together ad + bc */                              \
    ADD128(t2h,t2l,t3h,t3l);                                \
    /* now (ah,al), (t2l,2*t2h) need summing */             \
    /* first add the high registers, carrying into t2h */   \
    ADD128(t2h,ah,z,t2l);                                   \
    /* double t2h and add top bit of ah */                  \
    t2h = 2 * t2h + (ah >> 63);                             \
    ah &= m63;                                              \
    /* now add the low registers */                         \
    ADD128(ah,al,mh,ml);                                    \
    ADD128(ah,al,z,t2h);                                    \
}

/* ----------------------------------------------------------------------- */
#elif VMAC_USE_SSE2
/* ----------------------------------------------------------------------- */

// macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
#if defined(__GNUC__)
	// define these in two steps to allow arguments to be expanded
	#define GNU_AS2(x, y) #x ", " #y ";"
	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
	#define GNU_ASL(x) "\n" #x ":"
	#define GNU_ASJ(x, y, z) #x " " #y #z ";"
	#define AS2(x, y) GNU_AS2(x, y)
	#define AS3(x, y, z) GNU_AS3(x, y, z)
	#define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
	#define ASL(x) GNU_ASL(x)
	#define ASJ(x, y, z) GNU_ASJ(x, y, z)
#else
	#define AS2(x, y) __asm {x, y}
	#define AS3(x, y, z) __asm {x, y, z}
	#define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
	#define ASL(x) __asm {label##x:}
	#define ASJ(x, y, z) __asm {x label##y}
#endif

static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
{
	// This assembly version, using MMX registers, is just as fast as the
	// intrinsics version (which uses XMM registers) on the Intel Core 2,
	// but is much faster on the Pentium 4. In order to schedule multiplies
	// as early as possible, the loop interleaves operations for the current
	// block and the next block. To mask out high 32-bits, we use "movd"
	// to move the lower 32-bits to the stack and then back. Surprisingly,
	// this is faster than any other method.
#ifdef __GNUC__
	__asm__ __volatile__
	(
		".intel_syntax noprefix;"
#else
		AS2(	mov		esi, mp)
		AS2(	mov		edi, kp)
		AS2(	mov		ecx, nw)
		AS2(	mov		eax, rl)
		AS2(	mov		edx, rh)
#endif
		AS2(	sub		esp, 12)
		AS2(	movq	mm6, [esi])
		AS2(	paddq	mm6, [edi])
		AS2(	movq	mm5, [esi+8])
		AS2(	paddq	mm5, [edi+8])
		AS2(	add		esi, 16)
		AS2(	add		edi, 16)
		AS2(	movq	mm4, mm6)
		ASS(	pshufw	mm2, mm6, 1, 0, 3, 2)
		AS2(	pmuludq	mm6, mm5)
		ASS(	pshufw	mm3, mm5, 1, 0, 3, 2)
		AS2(	pmuludq	mm5, mm2)
		AS2(	pmuludq	mm2, mm3)
		AS2(	pmuludq	mm3, mm4)
		AS2(	pxor	mm7, mm7)
		AS2(	movd	[esp], mm6)
		AS2(	psrlq	mm6, 32)
		AS2(	movd	[esp+4], mm5)
		AS2(	psrlq	mm5, 32)
		AS2(	sub		ecx, 2)
		ASJ(	jz,		1, f)
		ASL(0)
		AS2(	movq	mm0, [esi])
		AS2(	paddq	mm0, [edi])
		AS2(	movq	mm1, [esi+8])
		AS2(	paddq	mm1, [edi+8])
		AS2(	add		esi, 16)
		AS2(	add		edi, 16)
		AS2(	movq	mm4, mm0)
		AS2(	paddq	mm5, mm2)
		ASS(	pshufw	mm2, mm0, 1, 0, 3, 2)
		AS2(	pmuludq	mm0, mm1)
		AS2(	movd	[esp+8], mm3)
		AS2(	psrlq	mm3, 32)
		AS2(	paddq	mm5, mm3)
		ASS(	pshufw	mm3, mm1, 1, 0, 3, 2)
		AS2(	pmuludq	mm1, mm2)
		AS2(	pmuludq	mm2, mm3)
		AS2(	pmuludq	mm3, mm4)
		AS2(	movd	mm4, [esp])
		AS2(	paddq	mm7, mm4)
		AS2(	movd	mm4, [esp+4])
		AS2(	paddq	mm6, mm4)
		AS2(	movd	mm4, [esp+8])
		AS2(	paddq	mm6, mm4)
		AS2(	movd	[esp], mm0)
		AS2(	psrlq	mm0, 32)
		AS2(	paddq	mm6, mm0)
		AS2(	movd	[esp+4], mm1)
		AS2(	psrlq	mm1, 32)
		AS2(	paddq	mm5, mm1)
		AS2(	sub		ecx, 2)
		ASJ(	jnz,	0, b)
		ASL(1)
		AS2(	paddq	mm5, mm2)
		AS2(	movd	[esp+8], mm3)
		AS2(	psrlq	mm3, 32)
		AS2(	paddq	mm5, mm3)
		AS2(	movd	mm4, [esp])
		AS2(	paddq	mm7, mm4)
		AS2(	movd	mm4, [esp+4])
		AS2(	paddq	mm6, mm4)
		AS2(	movd	mm4, [esp+8])
		AS2(	paddq	mm6, mm4)

		ASS(	pshufw	mm0, mm7, 3, 2, 1, 0)
		AS2(	psrlq	mm7, 32)
		AS2(	paddq	mm6, mm7)
		AS2(	punpckldq	mm0, mm6)
		AS2(	psrlq	mm6, 32)
		AS2(	paddq	mm5, mm6)
		AS2(	movq	[eax], mm0)
		AS2(	movq	[edx], mm5)
		AS2(	add		esp, 12)
#ifdef __GNUC__
		".att_syntax prefix;"
		:
		: "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
		: "memory", "cc"
	);
#endif
}
#define nh_16(mp, kp, nw, rh, rl)   nh_16_func(mp, kp, nw, &(rh), &(rl));

static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
{
	// This code tries to schedule the multiplies as early as possible to overcome
	// the long latencies on the Pentium 4. It also minimizes "movq" instructions
	// which are very expensive on the P4.

#define a0 [eax+0]
#define a1 [eax+4]
#define a2 [ebx+0]
#define a3 [ebx+4]
#define k0 [ecx+0]
#define k1 [ecx+4]
#define k2 [edx+0]
#define k3 [edx+4]

#ifdef __GNUC__
	uint32_t temp;
	__asm__ __volatile__
	(
		"mov %%ebx, %0;"
		"mov %1, %%ebx;"
		".intel_syntax noprefix;"
#else
		AS2(	mov		ebx, ahi)
		AS2(	mov		edx, kh)
		AS2(	mov		eax, alo)
		AS2(	mov		ecx, kl)
		AS2(	mov		esi, mh)
		AS2(	mov		edi, ml)
#endif

		AS2(	movd	mm0, a3)
		AS2(	movq	mm4, mm0)
		AS2(	pmuludq	mm0, k3)		// a3*k3
		AS2(	movd	mm1, a0)
		AS2(	pmuludq	mm1, k2)		// a0*k2
		AS2(	movd	mm2, a1)
		AS2(	movd	mm6, k1)
		AS2(	pmuludq	mm2, mm6)		// a1*k1
		AS2(	movd	mm3, a2)
		AS2(	movq	mm5, mm3)
		AS2(	movd	mm7, k0)
		AS2(	pmuludq	mm3, mm7)		// a2*k0
		AS2(	pmuludq	mm4, mm7)		// a3*k0
		AS2(	pmuludq	mm5, mm6)		// a2*k1
		AS2(	psllq	mm0, 1)
		AS2(	paddq	mm0, [esi])
		AS2(	paddq	mm0, mm1)
		AS2(	movd	mm1, a1)
		AS2(	paddq	mm4, mm5)
		AS2(	movq	mm5, mm1)
		AS2(	pmuludq	mm1, k2)		// a1*k2
		AS2(	paddq	mm0, mm2)
		AS2(	movd	mm2, a0)
		AS2(	paddq	mm0, mm3)
		AS2(	movq	mm3, mm2)
		AS2(	pmuludq	mm2, k3)		// a0*k3
		AS2(	pmuludq	mm3, mm7)		// a0*k0
		AS2(	movd	esi, mm0)
		AS2(	psrlq	mm0, 32)
		AS2(	pmuludq	mm7, mm5)		// a1*k0
		AS2(	pmuludq	mm5, k3)		// a1*k3
		AS2(	paddq	mm0, mm1)
		AS2(	movd	mm1, a2)
		AS2(	pmuludq	mm1, k2)		// a2*k2
		AS2(	paddq	mm0, mm2)
		AS2(	paddq	mm0, mm4)
		AS2(	movq	mm4, mm0)
		AS2(	movd	mm2, a3)
		AS2(	pmuludq	mm2, mm6)		// a3*k1
		AS2(	pmuludq	mm6, a0)		// a0*k1
		AS2(	psrlq	mm0, 31)
		AS2(	paddq	mm0, mm3)
		AS2(	movd	mm3, [edi])
		AS2(	paddq	mm0, mm3)
		AS2(	movd	mm3, a2)
		AS2(	pmuludq	mm3, k3)		// a2*k3
		AS2(	paddq	mm5, mm1)
		AS2(	movd	mm1, a3)
		AS2(	pmuludq	mm1, k2)		// a3*k2
		AS2(	paddq	mm5, mm2)
		AS2(	movd	mm2, [edi+4])
		AS2(	psllq	mm5, 1)
		AS2(	paddq	mm0, mm5)
		AS2(	movq	mm5, mm0)
		AS2(	psllq	mm4, 33)
		AS2(	psrlq	mm0, 32)
		AS2(	paddq	mm6, mm7)
		AS2(	movd	mm7, esi)
		AS2(	paddq	mm0, mm6)
		AS2(	paddq	mm0, mm2)
		AS2(	paddq	mm3, mm1)
		AS2(	psllq	mm3, 1)
		AS2(	paddq	mm0, mm3)
		AS2(	psrlq	mm4, 1)
		AS2(	punpckldq	mm5, mm0)
		AS2(	psrlq	mm0, 32)
		AS2(	por		mm4, mm7)
		AS2(	paddq	mm0, mm4)
		AS2(	movq	a0, mm5)
		AS2(	movq	a2, mm0)
#ifdef __GNUC__
		".att_syntax prefix;"
		"mov %0, %%ebx;"
		: "=m" (temp)
		: "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
		: "memory", "cc"
	);
#endif


#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3
}

#define poly_step(ah, al, kh, kl, mh, ml)   \
        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))

/* ----------------------------------------------------------------------- */
#else /* not VMAC_ARCH_64 and not SSE2 */
/* ----------------------------------------------------------------------- */

#ifndef nh_16
#define nh_16(mp, kp, nw, rh, rl)                                       \
{   uint64_t t1,t2,m1,m2,t;                                             \
    int i;                                                              \
    rh = rl = t = 0;                                                    \
    for (i = 0; i < nw; i+=2)  {                                        \
        t1  = get64PE(mp+i) + kp[i];                                    \
        t2  = get64PE(mp+i+1) + kp[i+1];                                \
        m2  = MUL32(t1 >> 32, t2);                                      \
        m1  = MUL32(t1, t2 >> 32);                                      \
        ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2));            \
        rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32);    \
        t  += (uint64_t)(uint32_t)m1 + (uint32_t)m2;                    \
    }                                                                   \
    ADD128(rh,rl,(t >> 32),(t << 32));                                  \
}
#endif

static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
{

#if VMAC_ARCH_BIG_ENDIAN
#define INDEX_HIGH 0
#define INDEX_LOW 1
#else
#define INDEX_HIGH 1
#define INDEX_LOW 0
#endif

#define a0 *(((uint32_t*)alo)+INDEX_LOW)
#define a1 *(((uint32_t*)alo)+INDEX_HIGH)
#define a2 *(((uint32_t*)ahi)+INDEX_LOW)
#define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
#define k0 *(((uint32_t*)kl)+INDEX_LOW)
#define k1 *(((uint32_t*)kl)+INDEX_HIGH)
#define k2 *(((uint32_t*)kh)+INDEX_LOW)
#define k3 *(((uint32_t*)kh)+INDEX_HIGH)

    uint64_t p, q, t;
    uint32_t t2;

    p = MUL32(a3, k3);
    p += p;
    p += *(uint64_t *)mh;
    p += MUL32(a0, k2);
    p += MUL32(a1, k1);
    p += MUL32(a2, k0);
    t = (uint32_t)(p);
    p >>= 32;
    p += MUL32(a0, k3);
    p += MUL32(a1, k2);
    p += MUL32(a2, k1);
    p += MUL32(a3, k0);
    t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
    p >>= 31;
    p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
    p += MUL32(a0, k0);
    q =  MUL32(a1, k3);
    q += MUL32(a2, k2);
    q += MUL32(a3, k1);
    q += q;
    p += q;
    t2 = (uint32_t)(p);
    p >>= 32;
    p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
    p += MUL32(a0, k1);
    p += MUL32(a1, k0);
    q =  MUL32(a2, k3);
    q += MUL32(a3, k2);
    q += q;
    p += q;
    *(uint64_t *)(alo) = (p << 32) | t2;
    p >>= 32;
    *(uint64_t *)(ahi) = p + t;

#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3
}

#define poly_step(ah, al, kh, kl, mh, ml)   \
        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))

/* ----------------------------------------------------------------------- */
#endif  /* end of specialized NH and poly definitions */
/* ----------------------------------------------------------------------- */

/* At least nh_16 is defined. Define others as needed here.                */
#ifndef nh_16_2
#define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2)                           \
    nh_16(mp, kp, nw, rh, rl);                                          \
    nh_16(mp, ((kp)+2), nw, rh2, rl2);
#endif
#ifndef nh_vmac_nhbytes
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                             \
    nh_16(mp, kp, nw, rh, rl)
#endif
#ifndef nh_vmac_nhbytes_2
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2)                 \
    nh_vmac_nhbytes(mp, kp, nw, rh, rl);                                \
    nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
#endif

/* ----------------------------------------------------------------------- */

static void vhash_abort(vmac_ctx_t *ctx)
{
    ctx->polytmp[0] = ctx->polykey[0] ;
    ctx->polytmp[1] = ctx->polykey[1] ;
    #if (VMAC_TAG_LEN == 128)
    ctx->polytmp[2] = ctx->polykey[2] ;
    ctx->polytmp[3] = ctx->polykey[3] ;
    #endif
    ctx->first_block_processed = 0;
}

/* ----------------------------------------------------------------------- */
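/* l3hash(): the L3 finishing hash.  It reduces the 128-bit L2 accumulator
 * (p1,p2), plus the length parameter, modulo 2^127 - 1, re-expresses the
 * result modulo 2^64 - 2^32, adds the key words k1,k2 modulo the prime
 * p64 = 2^64 - 257, and returns the product (p1+k1)*(p2+k2) mod p64. */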
static uint64_t l3hash(uint64_t p1, uint64_t p2,
                       uint64_t k1, uint64_t k2, uint64_t len)
{
    uint64_t rh, rl, t, z=0;

    /* fully reduce (p1,p2)+(len,0) mod p127 */
    t = p1 >> 63;
    p1 &= m63;
    ADD128(p1, p2, len, t);
    /* At this point, (p1,p2) is at most 2^127+(len<<64) */
    t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
    ADD128(p1, p2, z, t);
    p1 &= m63;

    /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
    t = p1 + (p2 >> 32);
    t += (t >> 32);
    t += (uint32_t)t > 0xfffffffeu;
    p1 += (t >> 32);
    p2 += (p1 << 32);

    /* compute (p1+k1)%p64 and (p2+k2)%p64 */
    p1 += k1;
    p1 += (0 - (p1 < k1)) & 257;
    p2 += k2;
    p2 += (0 - (p2 < k2)) & 257;

    /* compute (p1+k1)*(p2+k2)%p64 */
    MUL64(rh, rl, p1, p2);
    t = rh >> 56;
    ADD128(t, rl, z, rh);
    rh <<= 8;
    ADD128(t, rl, z, rh);
    t += t << 8;
    rl += t;
    rl += (0 - (rl < t)) & 257;
    rl += (0 - (rl > p64-1)) & 257;
    return rl;
}

/* ----------------------------------------------------------------------- */

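/* vhash_update(): incremental interface.  Consumes mbytes (a positive
 * multiple of VMAC_NHBYTES) of message, compressing each block with NH and
 * folding the masked result into the running polynomial accumulator kept in
 * ctx->polytmp.  The very first block is added to the accumulator directly
 * instead of going through poly_step(). */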
void vhash_update(unsigned char *m,
                  unsigned int   mbytes, /* Positive multiple of VMAC_NHBYTES */
                  vmac_ctx_t    *ctx)
{
    uint64_t rh, rl, *mptr;
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
    int i;
    uint64_t ch, cl;
    uint64_t pkh = ctx->polykey[0];
    uint64_t pkl = ctx->polykey[1];
    #if (VMAC_TAG_LEN == 128)
    uint64_t ch2, cl2, rh2, rl2;
    uint64_t pkh2 = ctx->polykey[2];
    uint64_t pkl2 = ctx->polykey[3];
    #endif

    mptr = (uint64_t *)m;
    i = mbytes / VMAC_NHBYTES;  /* Must be non-zero */

    ch = ctx->polytmp[0];
    cl = ctx->polytmp[1];
    #if (VMAC_TAG_LEN == 128)
    ch2 = ctx->polytmp[2];
    cl2 = ctx->polytmp[3];
    #endif

    if ( ! ctx->first_block_processed) {
        ctx->first_block_processed = 1;
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        ADD128(ch2,cl2,rh2,rl2);
        #endif
        rh &= m62;
        ADD128(ch,cl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        i--;
    }

    while (i--) {
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
        #endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
    }

    ctx->polytmp[0] = ch;
    ctx->polytmp[1] = cl;
    #if (VMAC_TAG_LEN == 128)
    ctx->polytmp[2] = ch2;
    ctx->polytmp[3] = cl2;
    #endif
    #if VMAC_USE_SSE2
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
    #endif
}

/* ----------------------------------------------------------------------- */

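/* vhash(): all-in-one/finishing hash.  Folds in any state left by
 * vhash_update(), compresses the remaining whole blocks and the trailing
 * partial block (the buffer is read up to the next 16-byte boundary, so it
 * should be zero-padded), then maps the accumulator(s) through l3hash()
 * and resets the context. */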
uint64_t vhash(unsigned char m[],
          unsigned int mbytes,
          uint64_t *tagl,
          vmac_ctx_t *ctx)
{
    uint64_t rh, rl, *mptr;
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
    int i, remaining;
    uint64_t ch, cl;
    uint64_t pkh = ctx->polykey[0];
    uint64_t pkl = ctx->polykey[1];
    #if (VMAC_TAG_LEN == 128)
        uint64_t ch2, cl2, rh2, rl2;
        uint64_t pkh2 = ctx->polykey[2];
        uint64_t pkl2 = ctx->polykey[3];
    #endif

    mptr = (uint64_t *)m;
    i = mbytes / VMAC_NHBYTES;
    remaining = mbytes % VMAC_NHBYTES;

    if (ctx->first_block_processed)
    {
        ch = ctx->polytmp[0];
        cl = ctx->polytmp[1];
        #if (VMAC_TAG_LEN == 128)
        ch2 = ctx->polytmp[2];
        cl2 = ctx->polytmp[3];
        #endif
    }
    else if (i)
    {
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
        ch2 &= m62;
        ADD128(ch2,cl2,pkh2,pkl2);
        #endif
        ch &= m62;
        ADD128(ch,cl,pkh,pkl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        i--;
    }
    else if (remaining)
    {
        #if (VMAC_TAG_LEN == 64)
        nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
        #else
        nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
        ch2 &= m62;
        ADD128(ch2,cl2,pkh2,pkl2);
        #endif
        ch &= m62;
        ADD128(ch,cl,pkh,pkl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        goto do_l3;
    }
    else /* Empty String */
    {
        ch = pkh; cl = pkl;
        #if (VMAC_TAG_LEN == 128)
        ch2 = pkh2; cl2 = pkl2;
        #endif
        goto do_l3;
    }

    while (i--) {
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
        #endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
    }
    if (remaining) {
        #if (VMAC_TAG_LEN == 64)
        nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
        #else
        nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
        #endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
    }

do_l3:
    #if VMAC_USE_SSE2
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
    #endif
    vhash_abort(ctx);
    remaining *= 8;
#if (VMAC_TAG_LEN == 128)
    *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
#endif
    return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
}

/* ----------------------------------------------------------------------- */

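/* vmac(): compute the tag for message m under the keyed context and the
 * 16-byte nonce n.  The tag is the VHASH output plus (mod 2^64) the AES
 * encryption of the nonce: for 64-bit tags, half of the AES block is
 * selected by the nonce's low bit; for 128-bit tags, both halves are
 * added to the two hash words. */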
uint64_t vmac(unsigned char m[],
         unsigned int mbytes,
         unsigned char n[16],
         uint64_t *tagl,
         vmac_ctx_t *ctx)
{
#if (VMAC_TAG_LEN == 64)
    uint64_t *in_n, *out_p;
    uint64_t p, h;
    int i;

    #if VMAC_CACHE_NONCES
    in_n = ctx->cached_nonce;
    out_p = ctx->cached_aes;
    #else
    uint64_t tmp[2];
    in_n = out_p = tmp;
    #endif

    i = n[15] & 1;
    #if VMAC_CACHE_NONCES
    if ((*(uint64_t *)(n+8) != in_n[1]) ||
        (*(uint64_t *)(n  ) != in_n[0])) {
    #endif

        in_n[0] = *(uint64_t *)(n  );
        in_n[1] = *(uint64_t *)(n+8);
        ((unsigned char *)in_n)[15] &= 0xFE;
        aes_encryption(in_n, out_p, &ctx->cipher_key);

    #if VMAC_CACHE_NONCES
        ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
    }
    #endif
    p = get64BE(out_p + i);
    h = vhash(m, mbytes, (uint64_t *)0, ctx);
    return p + h;
#else
    uint64_t tmp[2];
    uint64_t th,tl;
    aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
    th = vhash(m, mbytes, &tl, ctx);
    th += get64BE(tmp);
    *tagl = tl + get64BE(tmp+1);
    return th;
#endif
}

/* ----------------------------------------------------------------------- */

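/* vmac_set_key(): expand the 16-byte user key.  The AES key schedule is set
 * up first; then the NH key, the polynomial key (masked with mpoly, which
 * keeps the key words small enough for the unreduced arithmetic in
 * poly_step) and the L3 key words (regenerated until below p64) are derived
 * by encrypting tagged counter blocks whose first byte is 0x80, 0xC0 and
 * 0xE0 respectively. */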
void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
{
    uint64_t in[2] = {0}, out[2];
    unsigned i;
    aes_key_setup(user_key, &ctx->cipher_key);

    /* Fill nh key */
    ((unsigned char *)in)[0] = 0x80;
    for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
        aes_encryption((unsigned char *)in, (unsigned char *)out,
                                                         &ctx->cipher_key);
        ctx->nhkey[i  ] = get64BE(out);
        ctx->nhkey[i+1] = get64BE(out+1);
        ((unsigned char *)in)[15] += 1;
    }

    /* Fill poly key */
    ((unsigned char *)in)[0] = 0xC0;
    in[1] = 0;
    for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
        aes_encryption((unsigned char *)in, (unsigned char *)out,
                                                         &ctx->cipher_key);
        ctx->polytmp[i  ] = ctx->polykey[i  ] = get64BE(out) & mpoly;
        ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
        ((unsigned char *)in)[15] += 1;
    }

    /* Fill ip key */
    ((unsigned char *)in)[0] = 0xE0;
    in[1] = 0;
    for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
        do {
            aes_encryption((unsigned char *)in, (unsigned char *)out,
                                                         &ctx->cipher_key);
            ctx->l3key[i  ] = get64BE(out);
            ctx->l3key[i+1] = get64BE(out+1);
            ((unsigned char *)in)[15] += 1;
        } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
    }

    /* Invalidate nonce/aes cache and reset other elements */
    #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
    ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
    ctx->cached_nonce[1] = (uint64_t)0;  /* Ensure illegal nonce */
    #endif
    ctx->first_block_processed = 0;
}

/* ----------------------------------------------------------------------- */


#if VMAC_RUN_TESTS

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

unsigned prime(void)  /* Wake variable speed cpu, get rough speed estimate */
{
    volatile uint64_t i;
    volatile uint64_t j=1;
    unsigned cnt=0;
    volatile clock_t ticks = clock();
    do {
        for (i = 0; i < 500000; i++) {
            uint64_t x = get64PE(&j);
            j = x * x + (uint64_t)ticks;
        }
        cnt++;
    } while (clock() - ticks < (CLOCKS_PER_SEC/2));
    return cnt;  /* cnt is millions of iterations per second */
}

int main(void)
{
    ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
    uint64_t res, tagl;
    void *p;
    unsigned char *m;
    ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
    ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
    unsigned int  vector_lengths[] = {0,3,48,300,3000000};
    #if (VMAC_TAG_LEN == 64)
    ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
                        "E8421F61D573D298","4492DF6C5CAC1BBE",
                        "09BA597DD7601113"};
    #else
    ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
                         "4EE815A06A1D71EDD36FC75D51188A42",
                         "09F2C80C8E1007A0C12FAE19FE4504AE",
                         "66438817154850C61D8A412164803BCB",
                         "2B6B02288FFC461B75485DE893C629DC"};
    #endif
    unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
    unsigned i, j, *speed_iters;
    clock_t ticks;
    double cpb;
    const unsigned int buf_len = 3 * (1 << 20);

    j = prime();
    i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
    speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
    speed_iters[i-1] = j * (1 << 12);
    while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);

    /* Initialize context and message buffer, all 16-byte aligned */
    p = malloc(buf_len + 32);
    m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
    memset(m, 0, buf_len + 16);
    vmac_set_key(key, &ctx);

    /* Test incremental and all-in-one interfaces for correctness */
    vmac_set_key(key, &ctx_aio);
    vmac_set_key(key, &ctx_inc1);
    vmac_set_key(key, &ctx_inc2);


    /*
    for (i = 0; i <= 512; i++) {
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
        tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
                                                      nonce, &tagl, &ctx);
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
        for (j = 0; j < vector_lengths[i]; j++)
            m[j] = (unsigned char)('a'+j%3);

    }
    */

    /* Generate vectors */
    for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
        for (j = 0; j < vector_lengths[i]; j++)
            m[j] = (unsigned char)('a'+j%3);
        res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
        #if (VMAC_TAG_LEN == 64)
        printf("\'abc\' * %7u: %016llX Should be: %s\n",
              vector_lengths[i]/3,res,should_be[i]);
        #else
        printf("\'abc\' * %7u: %016llX%016llX\nShould be      : %s\n",
              vector_lengths[i]/3,res,tagl,should_be[i]);
        #endif
    }

    /* Speed test */
    for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
        ticks = clock();
        for (j = 0; j < speed_iters[i]; j++) {
            #if HASH_ONLY
            res = vhash(m, speed_lengths[i], &tagl, &ctx);
            #else
            res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
            nonce[7]++;
            #endif
        }
        ticks = clock() - ticks;
        cpb = ((ticks*VMAC_HZ)/
              ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
        printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
    }
    return 1;
}

#endif