/* --------------------------------------------------------------------------
 * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
 * This implementation is hereby placed in the public domain.
 * The authors offer no warranty. Use at your own risk.
 * Please send bug reports to the authors.
 * Last modified: 17 APR 08, 1700 PDT
 * ----------------------------------------------------------------------- */

/* start for Xen */
#include <xen/init.h>
#include <xen/types.h>
#include <xen/lib.h>
#include <crypto/vmac.h>
#define UINT64_C(x)  x##ULL
/* end for Xen */

/* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
#ifndef VMAC_ARCH_64
#define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
#endif

/* Enable code tuned for Intel SSE2 instruction set                   */
#if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
#define VMAC_USE_SSE2    1
#include <emmintrin.h>
#endif

/* Native word reads. Update (or define via compiler) if incorrect */
#ifndef VMAC_ARCH_BIG_ENDIAN       /* Assume big-endian unless on the list */
#define VMAC_ARCH_BIG_ENDIAN \
    (!(__x86_64__ || __i386__ || _M_IX86 || \
       _M_X64 || __ARMEL__ || __MIPSEL__))
#endif

/* ----------------------------------------------------------------------- */
/* Constants and masks                                                     */

const uint64_t p64   = UINT64_C(0xfffffffffffffeff);  /* 2^64 - 257 prime  */
const uint64_t m62   = UINT64_C(0x3fffffffffffffff);  /* 62-bit mask       */
const uint64_t m63   = UINT64_C(0x7fffffffffffffff);  /* 63-bit mask       */
const uint64_t m64   = UINT64_C(0xffffffffffffffff);  /* 64-bit mask       */
const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff);  /* Poly key mask     */

/* ----------------------------------------------------------------------- *
 * The following routines are used in this implementation. They are
 * written via macros to simulate zero-overhead call-by-reference.
 * All have default implementations for when they are not defined in an
 * architecture-specific manner.
 *
 * MUL64: 64x64->128-bit multiplication
 * PMUL64: assumes top bits cleared on inputs
 * ADD128: 128x128->128-bit addition
 * GET_REVERSED_64: load and byte-reverse 64-bit word
 * ----------------------------------------------------------------------- */
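
/* Illustrative only (kept out of the build): a minimal sketch of how the
 * primitives above compose into the sum-of-products pattern used by the NH
 * hash further below.  It assumes MUL64/ADD128 behave as described above;
 * the helper name and its arguments are hypothetical and not part of VMAC. */
#if 0
static void sum_of_products_example(const uint64_t *a, const uint64_t *b,
                                    int n, uint64_t *hi, uint64_t *lo)
{
    uint64_t sh = 0, sl = 0, th, tl;
    int i;
    for (i = 0; i < n; i++) {
        MUL64(th, tl, a[i], b[i]);   /* th:tl  = 128-bit product of a[i]*b[i] */
        ADD128(sh, sl, th, tl);      /* sh:sl += th:tl with carry propagation */
    }
    *hi = sh;
    *lo = sl;
}
#endif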

/* ----------------------------------------------------------------------- */
#if (__GNUC__ && (__x86_64__ || __amd64__))
/* ----------------------------------------------------------------------- */

#define ADD128(rh,rl,ih,il)                                               \
    asm ("addq %3, %1 \n\t"                                               \
         "adcq %2, %0"                                                    \
    : "+r"(rh),"+r"(rl)                                                   \
    : "r"(ih),"r"(il) : "cc");

#define MUL64(rh,rl,i1,i2)                                                \
    asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")

#define PMUL64 MUL64

#define GET_REVERSED_64(p)                                                \
    ({uint64_t x;                                                         \
     asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;})

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && __i386__)
/* ----------------------------------------------------------------------- */

#define GET_REVERSED_64(p)                                                \
    ({ uint64_t x;                                                        \
    uint32_t *tp = (uint32_t *)(p);                                       \
    asm  ("bswap %%edx\n\t"                                               \
          "bswap %%eax"                                                   \
    : "=A"(x)                                                             \
    : "a"(tp[1]), "d"(tp[0]));                                            \
    x; })

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && __ppc64__)
/* ----------------------------------------------------------------------- */

#define ADD128(rh,rl,ih,il)                                               \
    asm volatile (  "addc %1, %1, %3 \n\t"                                \
                    "adde %0, %0, %2"                                     \
    : "+r"(rh),"+r"(rl)                                                   \
    : "r"(ih),"r"(il));

#define MUL64(rh,rl,i1,i2)                                                \
{ uint64_t _i1 = (i1), _i2 = (i2);                                        \
    rl = _i1 * _i2;                                                       \
    asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
}

#define PMUL64 MUL64

#define GET_REVERSED_64(p)                                                \
    ({ uint32_t hi, lo, *_p = (uint32_t *)(p);                            \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) );  \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) );  \
       ((uint64_t)hi << 32) | (uint64_t)lo; } )

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && (__ppc__ || __PPC__))
/* ----------------------------------------------------------------------- */

#define GET_REVERSED_64(p)                                                \
    ({ uint32_t hi, lo, *_p = (uint32_t *)(p);                            \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) );  \
       asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) );  \
       ((uint64_t)hi << 32) | (uint64_t)lo; } )

/* ----------------------------------------------------------------------- */
#elif (__GNUC__ && (__ARMEL__ || __ARM__))
/* ----------------------------------------------------------------------- */

#define bswap32(v)                                                        \
({ uint32_t tmp,out;                                                      \
    asm volatile(                                                         \
        "eor    %1, %2, %2, ror #16\n"                                    \
        "bic    %1, %1, #0x00ff0000\n"                                    \
        "mov    %0, %2, ror #8\n"                                         \
        "eor    %0, %0, %1, lsr #8"                                       \
    : "=r" (out), "=&r" (tmp)                                             \
    : "r" (v));                                                           \
    out;})

/* ----------------------------------------------------------------------- */
#elif _MSC_VER
/* ----------------------------------------------------------------------- */

#include <intrin.h>

#if (_M_IA64 || _M_X64) && \
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
#define MUL64(rh,rl,i1,i2)   (rl) = _umul128(i1,i2,&(rh));
#pragma intrinsic(_umul128)
#define PMUL64 MUL64
#endif

/* MSVC uses add, adc in this version */
#define ADD128(rh,rl,ih,il)                                          \
    {   uint64_t _il = (il);                                         \
        (rl) += (_il);                                               \
        (rh) += (ih) + ((rl) < (_il));                               \
    }

#if _MSC_VER >= 1300
#define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p))
#pragma intrinsic(_byteswap_uint64)
#endif

#if _MSC_VER >= 1400 && \
    (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
#define MUL32(i1,i2)    (__emulu((uint32_t)(i1),(uint32_t)(i2)))
#pragma intrinsic(__emulu)
#endif

/* ----------------------------------------------------------------------- */
#endif
/* ----------------------------------------------------------------------- */

#if __GNUC__
#define ALIGN(n)      __attribute__ ((aligned(n)))
#define NOINLINE      __attribute__ ((noinline))
#elif _MSC_VER
#define ALIGN(n)      __declspec(align(n))
#define NOINLINE      __declspec(noinline)
#else
#define ALIGN(n)
#define NOINLINE
#endif

/* ----------------------------------------------------------------------- */
/* Default implementations, if not defined above                           */
/* ----------------------------------------------------------------------- */

#ifndef ADD128
#define ADD128(rh,rl,ih,il)                                              \
    {   uint64_t _il = (il);                                             \
        (rl) += (_il);                                                   \
        if ((rl) < (_il)) (rh)++;                                        \
        (rh) += (ih);                                                    \
    }
#endif

#ifndef MUL32
#define MUL32(i1,i2)    ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
#endif

#ifndef PMUL64              /* rh may not be same as i1 or i2 */
#define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow     */         \
    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
        uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2);            \
        rh         = MUL32(_i1>>32,_i2>>32);                             \
        rl         = MUL32(_i1,_i2);                                     \
        ADD128(rh,rl,(m >> 32),(m << 32));                               \
    }
#endif
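
/* Note: in this file PMUL64 is only invoked by poly_step below, and always
 * with key material that was masked with mpoly at key-setup time; that
 * masking appears to be what keeps the cross-term sum 'm' above from
 * overflowing 64 bits, so the shortcut is safe here. */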

#ifndef MUL64
#define MUL64(rh,rl,i1,i2)                                               \
    {   uint64_t _i1 = (i1), _i2 = (i2);                                 \
        uint64_t m1= MUL32(_i1,_i2>>32);                                 \
        uint64_t m2= MUL32(_i1>>32,_i2);                                 \
        rh         = MUL32(_i1>>32,_i2>>32);                             \
        rl         = MUL32(_i1,_i2);                                     \
        ADD128(rh,rl,(m1 >> 32),(m1 << 32));                             \
        ADD128(rh,rl,(m2 >> 32),(m2 << 32));                             \
    }
#endif

#ifndef GET_REVERSED_64
#ifndef bswap64
#ifndef bswap32
#define bswap32(x)                                                        \
  ({ uint32_t bsx = (x);                                                  \
      ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >>  8) |    \
       (((bsx) & 0x0000ff00u) <<  8) | (((bsx) & 0x000000ffu) << 24)); })
#endif
#define bswap64(x)                                                        \
     ({ union { uint64_t ll; uint32_t l[2]; } w, r;                       \
         w.ll = (x);                                                      \
         r.l[0] = bswap32 (w.l[1]);                                       \
         r.l[1] = bswap32 (w.l[0]);                                       \
         r.ll; })
#endif
#define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p))
#endif

/* ----------------------------------------------------------------------- */

#if (VMAC_PREFER_BIG_ENDIAN)
#  define get64PE get64BE
#else
#  define get64PE get64LE
#endif

#if (VMAC_ARCH_BIG_ENDIAN)
#  define get64BE(ptr) (*(uint64_t *)(ptr))
#  define get64LE(ptr) GET_REVERSED_64(ptr)
#else /* assume little-endian */
#  define get64BE(ptr) GET_REVERSED_64(ptr)
#  define get64LE(ptr) (*(uint64_t *)(ptr))
#endif


/* --------------------------------------------------------------------- *
 * For highest performance the L1 NH and L2 polynomial hashes should be
 * carefully implemented to take advantage of one's target architecture.
 * Here these two hash functions are defined multiple times: once for
 * 64-bit architectures, once for 32-bit SSE2 architectures, and once
 * for the remaining (32-bit) architectures.
 * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
 * Optionally, nh_vmac_nhbytes can be defined (for multiples of
 * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
 * NH computations at once).
 * --------------------------------------------------------------------- */
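
/* Illustrative only (kept out of the build): a sketch of how one
 * VMAC_NHBYTES block flows through the L1/L2 stages defined below
 * (NH compress, mask to 126 bits, then one polynomial-hash step), mirroring
 * the inner loop of vhash_update.  Assumes a 64-bit tag; the helper name
 * hash_one_block_example is hypothetical. */
#if 0
static void hash_one_block_example(const uint64_t *mptr, const uint64_t *kptr,
                                   uint64_t pkh, uint64_t pkl,
                                   uint64_t *ch, uint64_t *cl)
{
    uint64_t rh, rl;
    nh_vmac_nhbytes(mptr, kptr, VMAC_NHBYTES/8, rh, rl); /* L1: NH compress  */
    rh &= m62;                                           /* keep 126 bits    */
    poly_step(*ch, *cl, pkh, pkl, rh, rl);               /* L2: poly hash    */
}
#endif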

/* ----------------------------------------------------------------------- */
#if VMAC_ARCH_64
/* ----------------------------------------------------------------------- */

#define nh_16(mp, kp, nw, rh, rl)                                            \
{   int i; uint64_t th, tl;                                                  \
    rh = rl = 0;                                                             \
    for (i = 0; i < nw; i+= 2) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
    }                                                                        \
}
#define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1)                                \
{   int i; uint64_t th, tl;                                                  \
    rh1 = rl1 = rh = rl = 0;                                                 \
    for (i = 0; i < nw; i+= 2) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
        ADD128(rh1,rl1,th,tl);                                               \
    }                                                                        \
}

#if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                                  \
{   int i; uint64_t th, tl;                                                  \
    rh = rl = 0;                                                             \
    for (i = 0; i < nw; i+= 8) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
        ADD128(rh,rl,th,tl);                                                 \
    }                                                                        \
}
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1)                      \
{   int i; uint64_t th, tl;                                                  \
    rh1 = rl1 = rh = rl = 0;                                                 \
    for (i = 0; i < nw; i+= 8) {                                             \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i  ],get64PE((mp)+i+1)+(kp)[i+1]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i  )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
        ADD128(rh1,rl1,th,tl);                                               \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
        ADD128(rh1,rl1,th,tl);                                               \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
        ADD128(rh1,rl1,th,tl);                                               \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
        ADD128(rh,rl,th,tl);                                                 \
        MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
        ADD128(rh1,rl1,th,tl);                                               \
    }                                                                        \
}
#endif

#define poly_step(ah, al, kh, kl, mh, ml)                   \
{   uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0;             \
    /* compute ab*cd, put bd into result registers */       \
    PMUL64(t3h,t3l,al,kh);                                  \
    PMUL64(t2h,t2l,ah,kl);                                  \
    PMUL64(t1h,t1l,ah,2*kh);                                \
    PMUL64(ah,al,al,kl);                                    \
    /* add 2 * ac to result */                              \
    ADD128(ah,al,t1h,t1l);                                  \
    /* add together ad + bc */                              \
    ADD128(t2h,t2l,t3h,t3l);                                \
    /* now (ah,al), (t2l,2*t2h) need summing */             \
    /* first add the high registers, carrying into t2h */   \
    ADD128(t2h,ah,z,t2l);                                   \
    /* double t2h and add top bit of ah */                  \
    t2h = 2 * t2h + (ah >> 63);                             \
    ah &= m63;                                              \
    /* now add the low registers */                         \
    ADD128(ah,al,mh,ml);                                    \
    ADD128(ah,al,z,t2h);                                    \
}
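
/* The step above performs one Horner iteration of the L2 hash over the prime
 * 2^127 - 1: (ah:al) <- (ah:al)*(kh:kl) + (mh:ml), with the result only
 * partially reduced (l3hash finishes the reduction).  The key halves are
 * assumed to have been masked with mpoly, so PMUL64 suffices. */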

/* ----------------------------------------------------------------------- */
#elif VMAC_USE_SSE2
/* ----------------------------------------------------------------------- */

// macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
#if defined(__GNUC__)
	// define these in two steps to allow arguments to be expanded
	#define GNU_AS2(x, y) #x ", " #y ";"
	#define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
	#define GNU_ASL(x) "\n" #x ":"
	#define GNU_ASJ(x, y, z) #x " " #y #z ";"
	#define AS2(x, y) GNU_AS2(x, y)
	#define AS3(x, y, z) GNU_AS3(x, y, z)
	#define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
	#define ASL(x) GNU_ASL(x)
	#define ASJ(x, y, z) GNU_ASJ(x, y, z)
#else
	#define AS2(x, y) __asm {x, y}
	#define AS3(x, y, z) __asm {x, y, z}
	#define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
	#define ASL(x) __asm {label##x:}
	#define ASJ(x, y, z) __asm {x label##y}
#endif

static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
{
	// This assembly version, using MMX registers, is just as fast as the
	// intrinsics version (which uses XMM registers) on the Intel Core 2,
	// but is much faster on the Pentium 4. In order to schedule multiplies
	// as early as possible, the loop interleaves operations for the current
	// block and the next block. To mask out high 32-bits, we use "movd"
	// to move the lower 32-bits to the stack and then back. Surprisingly,
	// this is faster than any other method.
#ifdef __GNUC__
	__asm__ __volatile__
	(
		".intel_syntax noprefix;"
#else
		AS2(	mov		esi, mp)
		AS2(	mov		edi, kp)
		AS2(	mov		ecx, nw)
		AS2(	mov		eax, rl)
		AS2(	mov		edx, rh)
#endif
		AS2(	sub		esp, 12)
		AS2(	movq	mm6, [esi])
		AS2(	paddq	mm6, [edi])
		AS2(	movq	mm5, [esi+8])
		AS2(	paddq	mm5, [edi+8])
		AS2(	add		esi, 16)
		AS2(	add		edi, 16)
		AS2(	movq	mm4, mm6)
		ASS(	pshufw	mm2, mm6, 1, 0, 3, 2)
		AS2(	pmuludq	mm6, mm5)
		ASS(	pshufw	mm3, mm5, 1, 0, 3, 2)
		AS2(	pmuludq	mm5, mm2)
		AS2(	pmuludq	mm2, mm3)
		AS2(	pmuludq	mm3, mm4)
		AS2(	pxor	mm7, mm7)
		AS2(	movd	[esp], mm6)
		AS2(	psrlq	mm6, 32)
		AS2(	movd	[esp+4], mm5)
		AS2(	psrlq	mm5, 32)
		AS2(	sub		ecx, 2)
		ASJ(	jz,		1, f)
		ASL(0)
		AS2(	movq	mm0, [esi])
		AS2(	paddq	mm0, [edi])
		AS2(	movq	mm1, [esi+8])
		AS2(	paddq	mm1, [edi+8])
		AS2(	add		esi, 16)
		AS2(	add		edi, 16)
		AS2(	movq	mm4, mm0)
		AS2(	paddq	mm5, mm2)
		ASS(	pshufw	mm2, mm0, 1, 0, 3, 2)
		AS2(	pmuludq	mm0, mm1)
		AS2(	movd	[esp+8], mm3)
		AS2(	psrlq	mm3, 32)
		AS2(	paddq	mm5, mm3)
		ASS(	pshufw	mm3, mm1, 1, 0, 3, 2)
		AS2(	pmuludq	mm1, mm2)
		AS2(	pmuludq	mm2, mm3)
		AS2(	pmuludq	mm3, mm4)
		AS2(	movd	mm4, [esp])
		AS2(	paddq	mm7, mm4)
		AS2(	movd	mm4, [esp+4])
		AS2(	paddq	mm6, mm4)
		AS2(	movd	mm4, [esp+8])
		AS2(	paddq	mm6, mm4)
		AS2(	movd	[esp], mm0)
		AS2(	psrlq	mm0, 32)
		AS2(	paddq	mm6, mm0)
		AS2(	movd	[esp+4], mm1)
		AS2(	psrlq	mm1, 32)
		AS2(	paddq	mm5, mm1)
		AS2(	sub		ecx, 2)
		ASJ(	jnz,	0, b)
		ASL(1)
		AS2(	paddq	mm5, mm2)
		AS2(	movd	[esp+8], mm3)
		AS2(	psrlq	mm3, 32)
		AS2(	paddq	mm5, mm3)
		AS2(	movd	mm4, [esp])
		AS2(	paddq	mm7, mm4)
		AS2(	movd	mm4, [esp+4])
		AS2(	paddq	mm6, mm4)
		AS2(	movd	mm4, [esp+8])
		AS2(	paddq	mm6, mm4)

		ASS(	pshufw	mm0, mm7, 3, 2, 1, 0)
		AS2(	psrlq	mm7, 32)
		AS2(	paddq	mm6, mm7)
		AS2(	punpckldq	mm0, mm6)
		AS2(	psrlq	mm6, 32)
		AS2(	paddq	mm5, mm6)
		AS2(	movq	[eax], mm0)
		AS2(	movq	[edx], mm5)
		AS2(	add		esp, 12)
#ifdef __GNUC__
		".att_syntax prefix;"
		:
		: "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
		: "memory", "cc"
	);
#endif
}
#define nh_16(mp, kp, nw, rh, rl)   nh_16_func(mp, kp, nw, &(rh), &(rl));

static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
{
	// This code tries to schedule the multiplies as early as possible to overcome
	// the long latencies on the Pentium 4. It also minimizes "movq" instructions
	// which are very expensive on the P4.

#define a0 [eax+0]
#define a1 [eax+4]
#define a2 [ebx+0]
#define a3 [ebx+4]
#define k0 [ecx+0]
#define k1 [ecx+4]
#define k2 [edx+0]
#define k3 [edx+4]

#ifdef __GNUC__
	uint32_t temp;
	__asm__ __volatile__
	(
		"mov %%ebx, %0;"
		"mov %1, %%ebx;"
		".intel_syntax noprefix;"
#else
		AS2(	mov		ebx, ahi)
		AS2(	mov		edx, kh)
		AS2(	mov		eax, alo)
		AS2(	mov		ecx, kl)
		AS2(	mov		esi, mh)
		AS2(	mov		edi, ml)
#endif

		AS2(	movd	mm0, a3)
		AS2(	movq	mm4, mm0)
		AS2(	pmuludq	mm0, k3)		// a3*k3
		AS2(	movd	mm1, a0)
		AS2(	pmuludq	mm1, k2)		// a0*k2
		AS2(	movd	mm2, a1)
		AS2(	movd	mm6, k1)
		AS2(	pmuludq	mm2, mm6)		// a1*k1
		AS2(	movd	mm3, a2)
		AS2(	movq	mm5, mm3)
		AS2(	movd	mm7, k0)
		AS2(	pmuludq	mm3, mm7)		// a2*k0
		AS2(	pmuludq	mm4, mm7)		// a3*k0
		AS2(	pmuludq	mm5, mm6)		// a2*k1
		AS2(	psllq	mm0, 1)
		AS2(	paddq	mm0, [esi])
		AS2(	paddq	mm0, mm1)
		AS2(	movd	mm1, a1)
		AS2(	paddq	mm4, mm5)
		AS2(	movq	mm5, mm1)
		AS2(	pmuludq	mm1, k2)		// a1*k2
		AS2(	paddq	mm0, mm2)
		AS2(	movd	mm2, a0)
		AS2(	paddq	mm0, mm3)
		AS2(	movq	mm3, mm2)
		AS2(	pmuludq	mm2, k3)		// a0*k3
		AS2(	pmuludq	mm3, mm7)		// a0*k0
		AS2(	movd	esi, mm0)
		AS2(	psrlq	mm0, 32)
		AS2(	pmuludq	mm7, mm5)		// a1*k0
		AS2(	pmuludq	mm5, k3)		// a1*k3
		AS2(	paddq	mm0, mm1)
		AS2(	movd	mm1, a2)
		AS2(	pmuludq	mm1, k2)		// a2*k2
		AS2(	paddq	mm0, mm2)
		AS2(	paddq	mm0, mm4)
		AS2(	movq	mm4, mm0)
		AS2(	movd	mm2, a3)
		AS2(	pmuludq	mm2, mm6)		// a3*k1
		AS2(	pmuludq	mm6, a0)		// a0*k1
		AS2(	psrlq	mm0, 31)
		AS2(	paddq	mm0, mm3)
		AS2(	movd	mm3, [edi])
		AS2(	paddq	mm0, mm3)
		AS2(	movd	mm3, a2)
		AS2(	pmuludq	mm3, k3)		// a2*k3
		AS2(	paddq	mm5, mm1)
		AS2(	movd	mm1, a3)
		AS2(	pmuludq	mm1, k2)		// a3*k2
		AS2(	paddq	mm5, mm2)
		AS2(	movd	mm2, [edi+4])
		AS2(	psllq	mm5, 1)
		AS2(	paddq	mm0, mm5)
		AS2(	movq	mm5, mm0)
		AS2(	psllq	mm4, 33)
		AS2(	psrlq	mm0, 32)
		AS2(	paddq	mm6, mm7)
		AS2(	movd	mm7, esi)
		AS2(	paddq	mm0, mm6)
		AS2(	paddq	mm0, mm2)
		AS2(	paddq	mm3, mm1)
		AS2(	psllq	mm3, 1)
		AS2(	paddq	mm0, mm3)
		AS2(	psrlq	mm4, 1)
		AS2(	punpckldq	mm5, mm0)
		AS2(	psrlq	mm0, 32)
		AS2(	por		mm4, mm7)
		AS2(	paddq	mm0, mm4)
		AS2(	movq	a0, mm5)
		AS2(	movq	a2, mm0)
#ifdef __GNUC__
		".att_syntax prefix;"
		"mov %0, %%ebx;"
		: "=m" (temp)
		: "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
		: "memory", "cc"
	);
#endif


#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3
}

#define poly_step(ah, al, kh, kl, mh, ml)   \
        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))

/* ----------------------------------------------------------------------- */
#else /* not VMAC_ARCH_64 and not SSE2 */
/* ----------------------------------------------------------------------- */

#ifndef nh_16
#define nh_16(mp, kp, nw, rh, rl)                                       \
{   uint64_t t1,t2,m1,m2,t;                                             \
    int i;                                                              \
    rh = rl = t = 0;                                                    \
    for (i = 0; i < nw; i+=2)  {                                        \
        t1  = get64PE(mp+i) + kp[i];                                    \
        t2  = get64PE(mp+i+1) + kp[i+1];                                \
        m2  = MUL32(t1 >> 32, t2);                                      \
        m1  = MUL32(t1, t2 >> 32);                                      \
        ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2));            \
        rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32);    \
        t  += (uint64_t)(uint32_t)m1 + (uint32_t)m2;                    \
    }                                                                   \
    ADD128(rh,rl,(t >> 32),(t << 32));                                  \
}
#endif

static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
               const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
{

#if VMAC_ARCH_BIG_ENDIAN
#define INDEX_HIGH 0
#define INDEX_LOW 1
#else
#define INDEX_HIGH 1
#define INDEX_LOW 0
#endif

#define a0 *(((uint32_t*)alo)+INDEX_LOW)
#define a1 *(((uint32_t*)alo)+INDEX_HIGH)
#define a2 *(((uint32_t*)ahi)+INDEX_LOW)
#define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
#define k0 *(((uint32_t*)kl)+INDEX_LOW)
#define k1 *(((uint32_t*)kl)+INDEX_HIGH)
#define k2 *(((uint32_t*)kh)+INDEX_LOW)
#define k3 *(((uint32_t*)kh)+INDEX_HIGH)

    uint64_t p, q, t;
    uint32_t t2;

    p = MUL32(a3, k3);
    p += p;
    p += *(uint64_t *)mh;
    p += MUL32(a0, k2);
    p += MUL32(a1, k1);
    p += MUL32(a2, k0);
    t = (uint32_t)(p);
    p >>= 32;
    p += MUL32(a0, k3);
    p += MUL32(a1, k2);
    p += MUL32(a2, k1);
    p += MUL32(a3, k0);
    t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
    p >>= 31;
    p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
    p += MUL32(a0, k0);
    q =  MUL32(a1, k3);
    q += MUL32(a2, k2);
    q += MUL32(a3, k1);
    q += q;
    p += q;
    t2 = (uint32_t)(p);
    p >>= 32;
    p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
    p += MUL32(a0, k1);
    p += MUL32(a1, k0);
    q =  MUL32(a2, k3);
    q += MUL32(a3, k2);
    q += q;
    p += q;
    *(uint64_t *)(alo) = (p << 32) | t2;
    p >>= 32;
    *(uint64_t *)(ahi) = p + t;

#undef a0
#undef a1
#undef a2
#undef a3
#undef k0
#undef k1
#undef k2
#undef k3
}

#define poly_step(ah, al, kh, kl, mh, ml)   \
        poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))

/* ----------------------------------------------------------------------- */
#endif  /* end of specialized NH and poly definitions */
/* ----------------------------------------------------------------------- */

/* At least nh_16 is defined. Define others as needed here                 */
#ifndef nh_16_2
#define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2)                           \
    nh_16(mp, kp, nw, rh, rl);                                          \
    nh_16(mp, ((kp)+2), nw, rh2, rl2);
#endif
#ifndef nh_vmac_nhbytes
#define nh_vmac_nhbytes(mp, kp, nw, rh, rl)                             \
    nh_16(mp, kp, nw, rh, rl)
#endif
#ifndef nh_vmac_nhbytes_2
#define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2)                 \
    nh_vmac_nhbytes(mp, kp, nw, rh, rl);                                \
    nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
#endif

/* ----------------------------------------------------------------------- */

static void vhash_abort(vmac_ctx_t *ctx)
{
    ctx->polytmp[0] = ctx->polykey[0] ;
    ctx->polytmp[1] = ctx->polykey[1] ;
    #if (VMAC_TAG_LEN == 128)
    ctx->polytmp[2] = ctx->polykey[2] ;
    ctx->polytmp[3] = ctx->polykey[3] ;
    #endif
    ctx->first_block_processed = 0;
}

/* ----------------------------------------------------------------------- */
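/* l3hash: the L3 finishing stage.  It completes the reduction of the L2
 * accumulator (p1,p2) plus a length term modulo 2^127 - 1, then maps the
 * halves through the l3key pair (k1,k2) and reduces modulo p64 = 2^64 - 257
 * to produce one 64-bit tag half. */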
static uint64_t l3hash(uint64_t p1, uint64_t p2,
                       uint64_t k1, uint64_t k2, uint64_t len)
{
    uint64_t rh, rl, t, z=0;

    /* fully reduce (p1,p2)+(len,0) mod p127 */
    t = p1 >> 63;
    p1 &= m63;
    ADD128(p1, p2, len, t);
    /* At this point, (p1,p2) is at most 2^127+(len<<64) */
    t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
    ADD128(p1, p2, z, t);
    p1 &= m63;

    /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
    t = p1 + (p2 >> 32);
    t += (t >> 32);
    t += (uint32_t)t > 0xfffffffeu;
    p1 += (t >> 32);
    p2 += (p1 << 32);

    /* compute (p1+k1)%p64 and (p2+k2)%p64 */
    p1 += k1;
    p1 += (0 - (p1 < k1)) & 257;
    p2 += k2;
    p2 += (0 - (p2 < k2)) & 257;

    /* compute (p1+k1)*(p2+k2)%p64 */
    MUL64(rh, rl, p1, p2);
    t = rh >> 56;
    ADD128(t, rl, z, rh);
    rh <<= 8;
    ADD128(t, rl, z, rh);
    t += t << 8;
    rl += t;
    rl += (0 - (rl < t)) & 257;
    rl += (0 - (rl > p64-1)) & 257;
    return rl;
}

/* ----------------------------------------------------------------------- */

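/* vhash_update: incremental interface.  Absorbs mbytes of message (a
 * positive multiple of VMAC_NHBYTES) into the running L2 accumulator kept
 * in ctx->polytmp; a later vhash() call hashes any trailing partial block
 * and applies the L3 stage. */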
void vhash_update(unsigned char *m,
                  unsigned int   mbytes, /* Positive multiple of VMAC_NHBYTES */
                  vmac_ctx_t    *ctx)
{
    uint64_t rh, rl, *mptr;
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
    int i;
    uint64_t ch, cl;
    uint64_t pkh = ctx->polykey[0];
    uint64_t pkl = ctx->polykey[1];
    #if (VMAC_TAG_LEN == 128)
    uint64_t ch2, cl2, rh2, rl2;
    uint64_t pkh2 = ctx->polykey[2];
    uint64_t pkl2 = ctx->polykey[3];
    #endif

    mptr = (uint64_t *)m;
    i = mbytes / VMAC_NHBYTES;  /* Must be non-zero */

    ch = ctx->polytmp[0];
    cl = ctx->polytmp[1];
    #if (VMAC_TAG_LEN == 128)
    ch2 = ctx->polytmp[2];
    cl2 = ctx->polytmp[3];
    #endif

    if ( ! ctx->first_block_processed) {
        ctx->first_block_processed = 1;
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        ADD128(ch2,cl2,rh2,rl2);
        #endif
        rh &= m62;
        ADD128(ch,cl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        i--;
    }

    while (i--) {
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
        #endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
    }

    ctx->polytmp[0] = ch;
    ctx->polytmp[1] = cl;
    #if (VMAC_TAG_LEN == 128)
    ctx->polytmp[2] = ch2;
    ctx->polytmp[3] = cl2;
    #endif
    #if VMAC_USE_SSE2
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
    #endif
}

/* ----------------------------------------------------------------------- */

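/* vhash: hash m (all of it, or whatever follows earlier vhash_update calls).
 * Whole VMAC_NHBYTES blocks go through NH and poly_step, a short trailing
 * block goes through nh_16, and the result (plus *tagl for 128-bit tags) is
 * produced by l3hash; the incremental state is then reset. */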
uint64_t vhash(unsigned char m[],
          unsigned int mbytes,
          uint64_t *tagl,
          vmac_ctx_t *ctx)
{
    uint64_t rh, rl, *mptr;
    const uint64_t *kptr = (uint64_t *)ctx->nhkey;
    int i, remaining;
    uint64_t ch, cl;
    uint64_t pkh = ctx->polykey[0];
    uint64_t pkl = ctx->polykey[1];
    #if (VMAC_TAG_LEN == 128)
        uint64_t ch2, cl2, rh2, rl2;
        uint64_t pkh2 = ctx->polykey[2];
        uint64_t pkl2 = ctx->polykey[3];
    #endif

    mptr = (uint64_t *)m;
    i = mbytes / VMAC_NHBYTES;
    remaining = mbytes % VMAC_NHBYTES;

    if (ctx->first_block_processed)
    {
        ch = ctx->polytmp[0];
        cl = ctx->polytmp[1];
        #if (VMAC_TAG_LEN == 128)
        ch2 = ctx->polytmp[2];
        cl2 = ctx->polytmp[3];
        #endif
    }
    else if (i)
    {
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
        ch2 &= m62;
        ADD128(ch2,cl2,pkh2,pkl2);
        #endif
        ch &= m62;
        ADD128(ch,cl,pkh,pkl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        i--;
    }
    else if (remaining)
    {
        #if (VMAC_TAG_LEN == 64)
        nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
        #else
        nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
        ch2 &= m62;
        ADD128(ch2,cl2,pkh2,pkl2);
        #endif
        ch &= m62;
        ADD128(ch,cl,pkh,pkl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
        goto do_l3;
    }
    else /* Empty String */
    {
        ch = pkh; cl = pkl;
        #if (VMAC_TAG_LEN == 128)
        ch2 = pkh2; cl2 = pkl2;
        #endif
        goto do_l3;
    }

    while (i--) {
        #if (VMAC_TAG_LEN == 64)
        nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
        #else
        nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
        #endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
        mptr += (VMAC_NHBYTES/sizeof(uint64_t));
    }
    if (remaining) {
        #if (VMAC_TAG_LEN == 64)
        nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
        #else
        nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
        rh2 &= m62;
        poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
        #endif
        rh &= m62;
        poly_step(ch,cl,pkh,pkl,rh,rl);
    }

do_l3:
    #if VMAC_USE_SSE2
    _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
    #endif
    vhash_abort(ctx);
    remaining *= 8;
#if (VMAC_TAG_LEN == 128)
    *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
#endif
    return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
}

/* ----------------------------------------------------------------------- */

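/* vmac: compute the VMAC tag of m under the 16-byte nonce n.  The tag is the
 * VHASH output plus an AES-generated pad: for 64-bit tags the low bit of
 * n[15] selects which half of the encrypted (low-bit-cleared) nonce is used,
 * and the encryption may be cached across calls when VMAC_CACHE_NONCES is
 * set; for 128-bit tags the whole AES block is added. */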
uint64_t vmac(unsigned char m[],
         unsigned int mbytes,
         unsigned char n[16],
         uint64_t *tagl,
         vmac_ctx_t *ctx)
{
#if (VMAC_TAG_LEN == 64)
    uint64_t *in_n, *out_p;
    uint64_t p, h;
    int i;

    #if VMAC_CACHE_NONCES
    in_n = ctx->cached_nonce;
    out_p = ctx->cached_aes;
    #else
    uint64_t tmp[2];
    in_n = out_p = tmp;
    #endif

    i = n[15] & 1;
    #if VMAC_CACHE_NONCES
    if ((*(uint64_t *)(n+8) != in_n[1]) ||
        (*(uint64_t *)(n  ) != in_n[0])) {
    #endif

        in_n[0] = *(uint64_t *)(n  );
        in_n[1] = *(uint64_t *)(n+8);
        ((unsigned char *)in_n)[15] &= 0xFE;
        aes_encryption(in_n, out_p, &ctx->cipher_key);

    #if VMAC_CACHE_NONCES
        ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
    }
    #endif
    p = get64BE(out_p + i);
    h = vhash(m, mbytes, (uint64_t *)0, ctx);
    return p + h;
#else
    uint64_t tmp[2];
    uint64_t th,tl;
    aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
    th = vhash(m, mbytes, &tl, ctx);
    th += get64BE(tmp);
    *tagl = tl + get64BE(tmp+1);
    return th;
#endif
}

/* ----------------------------------------------------------------------- */

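/* vmac_set_key: expand user_key.  AES is keyed with it and then used to
 * derive, from distinct counter blocks, the NH key, the poly key (masked
 * with mpoly) and the L3 key pair (re-drawn until both words are below p64);
 * the nonce cache and incremental state are reset. */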
void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
{
    uint64_t in[2] = {0}, out[2];
    unsigned i;
    aes_key_setup(user_key, &ctx->cipher_key);

    /* Fill nh key */
    ((unsigned char *)in)[0] = 0x80;
    for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
        aes_encryption((unsigned char *)in, (unsigned char *)out,
                                                         &ctx->cipher_key);
        ctx->nhkey[i  ] = get64BE(out);
        ctx->nhkey[i+1] = get64BE(out+1);
        ((unsigned char *)in)[15] += 1;
    }

    /* Fill poly key */
    ((unsigned char *)in)[0] = 0xC0;
    in[1] = 0;
    for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
        aes_encryption((unsigned char *)in, (unsigned char *)out,
                                                         &ctx->cipher_key);
        ctx->polytmp[i  ] = ctx->polykey[i  ] = get64BE(out) & mpoly;
        ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
        ((unsigned char *)in)[15] += 1;
    }

    /* Fill ip key */
    ((unsigned char *)in)[0] = 0xE0;
    in[1] = 0;
    for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
        do {
            aes_encryption((unsigned char *)in, (unsigned char *)out,
                                                         &ctx->cipher_key);
            ctx->l3key[i  ] = get64BE(out);
            ctx->l3key[i+1] = get64BE(out+1);
            ((unsigned char *)in)[15] += 1;
        } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
    }

    /* Invalidate nonce/aes cache and reset other elements */
    #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
    ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
    ctx->cached_nonce[1] = (uint64_t)0;  /* Ensure illegal nonce */
    #endif
    ctx->first_block_processed = 0;
}

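/* Illustrative only (kept out of the build): a sketch of typical use of the
 * API above, assuming a 64-bit tag, a 16-byte key and a 16-byte nonce whose
 * final byte's low bit is left for VMAC to use; the names key/nonce/msg and
 * tag_message_example are hypothetical. */
#if 0
static uint64_t tag_message_example(vmac_ctx_t *ctx)
{
    unsigned char key[16]   = "0123456789abcde";  /* 15 chars + NUL = 16 B  */
    unsigned char nonce[16] = {0};                /* must never repeat/key  */
    unsigned char msg[64]   = {0};                /* message to authenticate*/
    uint64_t tagl;                                /* only used for 128-bit  */

    vmac_set_key(key, ctx);                       /* one-time key expansion */
    return vmac(msg, sizeof(msg), nonce, &tagl, ctx);
}
#endif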
/* ----------------------------------------------------------------------- */


#if VMAC_RUN_TESTS

#include <stdlib.h>
#include <stdio.h>
#include <time.h>
#include <string.h>

unsigned prime(void)  /* Wake variable speed cpu, get rough speed estimate */
{
    volatile uint64_t i;
    volatile uint64_t j=1;
    unsigned cnt=0;
    volatile clock_t ticks = clock();
    do {
        for (i = 0; i < 500000; i++) {
            uint64_t x = get64PE(&j);
            j = x * x + (uint64_t)ticks;
        }
        cnt++;
    } while (clock() - ticks < (CLOCKS_PER_SEC/2));
    return cnt;  /* cnt is millions of iterations per second */
}

int main(void)
{
    ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
    uint64_t res, tagl;
    void *p;
    unsigned char *m;
    ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
    ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
    unsigned int  vector_lengths[] = {0,3,48,300,3000000};
    #if (VMAC_TAG_LEN == 64)
    ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
                        "E8421F61D573D298","4492DF6C5CAC1BBE",
                        "09BA597DD7601113"};
    #else
    ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
                         "4EE815A06A1D71EDD36FC75D51188A42",
                         "09F2C80C8E1007A0C12FAE19FE4504AE",
                         "66438817154850C61D8A412164803BCB",
                         "2B6B02288FFC461B75485DE893C629DC"};
    #endif
    unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
    unsigned i, j, *speed_iters;
    clock_t ticks;
    double cpb;
    const unsigned int buf_len = 3 * (1 << 20);

    j = prime();
    i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
    speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
    speed_iters[i-1] = j * (1 << 12);
    while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);

    /* Initialize context and message buffer, all 16-byte aligned */
    p = malloc(buf_len + 32);
    m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
    memset(m, 0, buf_len + 16);
    vmac_set_key(key, &ctx);

    /* Test incremental and all-in-one interfaces for correctness */
    vmac_set_key(key, &ctx_aio);
    vmac_set_key(key, &ctx_inc1);
    vmac_set_key(key, &ctx_inc2);


    /*
    for (i = 0; i <= 512; i++) {
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
        tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
                                                      nonce, &tagl, &ctx);
        vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
        for (j = 0; j < vector_lengths[i]; j++)
            m[j] = (unsigned char)('a'+j%3);

    }
    */

    /* Generate vectors */
    for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
        for (j = 0; j < vector_lengths[i]; j++)
            m[j] = (unsigned char)('a'+j%3);
        res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
        #if (VMAC_TAG_LEN == 64)
        printf("\'abc\' * %7u: %016llX Should be: %s\n",
              vector_lengths[i]/3,res,should_be[i]);
        #else
        printf("\'abc\' * %7u: %016llX%016llX\nShould be      : %s\n",
              vector_lengths[i]/3,res,tagl,should_be[i]);
        #endif
    }

    /* Speed test */
    for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
        ticks = clock();
        for (j = 0; j < speed_iters[i]; j++) {
            #if HASH_ONLY
            res = vhash(m, speed_lengths[i], &tagl, &ctx);
            #else
            res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
            nonce[7]++;
            #endif
        }
        ticks = clock() - ticks;
        cpb = ((ticks*VMAC_HZ)/
              ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
        printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
    }
    return 1;
}

#endif