1 /* --------------------------------------------------------------------------
2 * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
3  * This implementation is hereby placed in the public domain.
4  * The authors offer no warranty. Use at your own risk.
5 * Please send bug reports to the authors.
6 * Last modified: 17 APR 08, 1700 PDT
7 * ----------------------------------------------------------------------- */
8
9 /* start for Xen */
10 #include <xen/init.h>
11 #include <xen/types.h>
12 #include <xen/lib.h>
13 #include <crypto/vmac.h>
14 #define UINT64_C(x) x##ULL
15 /* end for Xen */
16
17 /* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
18 #ifndef VMAC_ARCH_64
19 #define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
20 #endif
21
22 /* Enable code tuned for Intel SSE2 instruction set */
23 #if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
24 #define VMAC_USE_SSE2 1
25 #include <emmintrin.h>
26 #endif
27
28 /* Native word reads. Update (or define via compiler) if incorrect */
29 #ifndef VMAC_ARCH_BIG_ENDIAN /* Assume big-endian unless on the list */
30 #define VMAC_ARCH_BIG_ENDIAN \
31 (!(__x86_64__ || __i386__ || _M_IX86 || \
32 _M_X64 || __ARMEL__ || __MIPSEL__))
33 #endif
34
35 /* ----------------------------------------------------------------------- */
36 /* Constants and masks */
37
38 const uint64_t p64 = UINT64_C(0xfffffffffffffeff); /* 2^64 - 257 prime */
39 const uint64_t m62 = UINT64_C(0x3fffffffffffffff); /* 62-bit mask */
40 const uint64_t m63 = UINT64_C(0x7fffffffffffffff); /* 63-bit mask */
41 const uint64_t m64 = UINT64_C(0xffffffffffffffff); /* 64-bit mask */
42 const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */
43
44 /* ----------------------------------------------------------------------- *
45 * The following routines are used in this implementation. They are
46 * written via macros to simulate zero-overhead call-by-reference.
47  * All have default implementations for when they are not defined in an
48 * architecture-specific manner.
49 *
50 * MUL64: 64x64->128-bit multiplication
51 * PMUL64: assumes top bits cleared on inputs
52 * ADD128: 128x128->128-bit addition
53 * GET_REVERSED_64: load and byte-reverse 64-bit word
54 * ----------------------------------------------------------------------- */
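
/*
 * Illustrative use of the primitives above (sketch only, not part of the
 * build): accumulating 64x64-bit products into a 128-bit sum held in two
 * 64-bit halves, for some 64-bit operands a and b:
 *
 *     uint64_t sh = 0, sl = 0, th, tl;
 *     MUL64(th, tl, a, b);      // (th:tl)  = a * b, full 128-bit product
 *     ADD128(sh, sl, th, tl);   // (sh:sl) += (th:tl), with carry
 */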
55
56 /* ----------------------------------------------------------------------- */
57 #if (__GNUC__ && (__x86_64__ || __amd64__))
58 /* ----------------------------------------------------------------------- */
59
60 #define ADD128(rh,rl,ih,il) \
61 asm ("addq %3, %1 \n\t" \
62 "adcq %2, %0" \
63 : "+r"(rh),"+r"(rl) \
64 : "r"(ih),"r"(il) : "cc");
65
66 #define MUL64(rh,rl,i1,i2) \
67 asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")
68
69 #define PMUL64 MUL64
70
71 #define GET_REVERSED_64(p) \
72 ({uint64_t x; \
73 asm ("bswapq %0" : "=r" (x) : "0"(*(uint64_t *)(p))); x;})
74
75 /* ----------------------------------------------------------------------- */
76 #elif (__GNUC__ && __i386__)
77 /* ----------------------------------------------------------------------- */
78
79 #define GET_REVERSED_64(p) \
80 ({ uint64_t x; \
81 uint32_t *tp = (uint32_t *)(p); \
82 asm ("bswap %%edx\n\t" \
83 "bswap %%eax" \
84 : "=A"(x) \
85 : "a"(tp[1]), "d"(tp[0])); \
86 x; })
87
88 /* ----------------------------------------------------------------------- */
89 #elif (__GNUC__ && __ppc64__)
90 /* ----------------------------------------------------------------------- */
91
92 #define ADD128(rh,rl,ih,il) \
93 asm volatile ( "addc %1, %1, %3 \n\t" \
94 "adde %0, %0, %2" \
95 : "+r"(rh),"+r"(rl) \
96 : "r"(ih),"r"(il));
97
98 #define MUL64(rh,rl,i1,i2) \
99 { uint64_t _i1 = (i1), _i2 = (i2); \
100 rl = _i1 * _i2; \
101 asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
102 }
103
104 #define PMUL64 MUL64
105
106 #define GET_REVERSED_64(p) \
107 ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \
108 asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \
109 asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \
110 ((uint64_t)hi << 32) | (uint64_t)lo; } )
111
112 /* ----------------------------------------------------------------------- */
113 #elif (__GNUC__ && (__ppc__ || __PPC__))
114 /* ----------------------------------------------------------------------- */
115
116 #define GET_REVERSED_64(p) \
117 ({ uint32_t hi, lo, *_p = (uint32_t *)(p); \
118 asm volatile ("lwbrx %0, %1, %2" : "=r"(lo) : "b%"(0), "r"(_p) ); \
119 asm volatile ("lwbrx %0, %1, %2" : "=r"(hi) : "b%"(4), "r"(_p) ); \
120 ((uint64_t)hi << 32) | (uint64_t)lo; } )
121
122 /* ----------------------------------------------------------------------- */
123 #elif (__GNUC__ && (__ARMEL__ || __ARM__))
124 /* ----------------------------------------------------------------------- */
125
126 #define bswap32(v) \
127 ({ uint32_t tmp,out; \
128 asm volatile( \
129 "eor %1, %2, %2, ror #16\n" \
130 "bic %1, %1, #0x00ff0000\n" \
131 "mov %0, %2, ror #8\n" \
132 "eor %0, %0, %1, lsr #8" \
133 : "=r" (out), "=&r" (tmp) \
134 : "r" (v)); \
135 out;})
136
137 /* ----------------------------------------------------------------------- */
138 #elif _MSC_VER
139 /* ----------------------------------------------------------------------- */
140
141 #include <intrin.h>
142
143 #if (_M_IA64 || _M_X64) && \
144 (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
145 #define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
146 #pragma intrinsic(_umul128)
147 #define PMUL64 MUL64
148 #endif
149
150 /* MSVC uses add, adc in this version */
151 #define ADD128(rh,rl,ih,il) \
152 { uint64_t _il = (il); \
153 (rl) += (_il); \
154 (rh) += (ih) + ((rl) < (_il)); \
155 }
156
157 #if _MSC_VER >= 1300
158 #define GET_REVERSED_64(p) _byteswap_uint64(*(uint64_t *)(p))
159 #pragma intrinsic(_byteswap_uint64)
160 #endif
161
162 #if _MSC_VER >= 1400 && \
163 (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
164 #define MUL32(i1,i2) (__emulu((uint32_t)(i1),(uint32_t)(i2)))
165 #pragma intrinsic(__emulu)
166 #endif
167
168 /* ----------------------------------------------------------------------- */
169 #endif
170 /* ----------------------------------------------------------------------- */
171
172 #if __GNUC__
173 #define ALIGN(n) __attribute__ ((aligned(n)))
174 #define NOINLINE __attribute__ ((noinline))
175 #elif _MSC_VER
176 #define ALIGN(n) __declspec(align(n))
177 #define NOINLINE __declspec(noinline)
178 #else
179 #define ALIGN(n)
180 #define NOINLINE
181 #endif
182
183 /* ----------------------------------------------------------------------- */
184 /* Default implementations, if not defined above */
185 /* ----------------------------------------------------------------------- */
186
187 #ifndef ADD128
188 #define ADD128(rh,rl,ih,il) \
189 { uint64_t _il = (il); \
190 (rl) += (_il); \
191 if ((rl) < (_il)) (rh)++; \
192 (rh) += (ih); \
193 }
194 #endif
195
196 #ifndef MUL32
197 #define MUL32(i1,i2) ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
198 #endif
199
200 #ifndef PMUL64 /* rh may not be same as i1 or i2 */
201 #define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow */ \
202 { uint64_t _i1 = (i1), _i2 = (i2); \
203 uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2); \
204 rh = MUL32(_i1>>32,_i2>>32); \
205 rl = MUL32(_i1,_i2); \
206 ADD128(rh,rl,(m >> 32),(m << 32)); \
207 }
208 #endif
209
210 #ifndef MUL64
211 #define MUL64(rh,rl,i1,i2) \
212 { uint64_t _i1 = (i1), _i2 = (i2); \
213 uint64_t m1= MUL32(_i1,_i2>>32); \
214 uint64_t m2= MUL32(_i1>>32,_i2); \
215 rh = MUL32(_i1>>32,_i2>>32); \
216 rl = MUL32(_i1,_i2); \
217 ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
218 ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
219 }
220 #endif
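
/*
 * The portable MUL64 above is the schoolbook decomposition
 *
 *     (a_hi*2^32 + a_lo)*(b_hi*2^32 + b_lo)
 *         = a_hi*b_hi*2^64 + (a_hi*b_lo + a_lo*b_hi)*2^32 + a_lo*b_lo,
 *
 * with the two cross terms folded into (rh,rl) via ADD128.  The portable
 * PMUL64 sums both cross terms in a single 64-bit variable, which is safe
 * only because its callers clear the top bits of the inputs (e.g. poly
 * keys masked with mpoly), as noted above.
 */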
221
222 #ifndef GET_REVERSED_64
223 #ifndef bswap64
224 #ifndef bswap32
225 #define bswap32(x) \
226 ({ uint32_t bsx = (x); \
227 ((((bsx) & 0xff000000u) >> 24) | (((bsx) & 0x00ff0000u) >> 8) | \
228 (((bsx) & 0x0000ff00u) << 8) | (((bsx) & 0x000000ffu) << 24)); })
229 #endif
230 #define bswap64(x) \
231 ({ union { uint64_t ll; uint32_t l[2]; } w, r; \
232 w.ll = (x); \
233 r.l[0] = bswap32 (w.l[1]); \
234 r.l[1] = bswap32 (w.l[0]); \
235 r.ll; })
236 #endif
237 #define GET_REVERSED_64(p) bswap64(*(uint64_t *)(p))
238 #endif
239
240 /* ----------------------------------------------------------------------- */
241
242 #if (VMAC_PREFER_BIG_ENDIAN)
243 # define get64PE get64BE
244 #else
245 # define get64PE get64LE
246 #endif
247
248 #if (VMAC_ARCH_BIG_ENDIAN)
249 # define get64BE(ptr) (*(uint64_t *)(ptr))
250 # define get64LE(ptr) GET_REVERSED_64(ptr)
251 #else /* assume little-endian */
252 # define get64BE(ptr) GET_REVERSED_64(ptr)
253 # define get64LE(ptr) (*(uint64_t *)(ptr))
254 #endif
255
256
257 /* --------------------------------------------------------------------- *
258 * For highest performance the L1 NH and L2 polynomial hashes should be
259  * carefully implemented to take advantage of one's target architecture.
260  * Here these two hash functions are defined multiple times: once for
261  * 64-bit architectures, once for 32-bit SSE2 architectures, and once
262  * for all remaining (32-bit) architectures.
263 * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
264 * Optionally, nh_vmac_nhbytes can be defined (for multiples of
265 * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
266 * NH computations at once).
267 * --------------------------------------------------------------------- */
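
/*
 * For reference, on 64-bit words the nh_16 routines below compute
 *
 *     NH(M) = sum over even i of
 *             ((m_i + k_i) mod 2^64) * ((m_{i+1} + k_{i+1}) mod 2^64),
 *
 * carried in a full 128-bit accumulator (rh:rl).  The "_2" variants
 * evaluate the same sum a second time under the key shifted by two words,
 * so both halves of a 128-bit tag can be produced in one pass.
 */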
268
269 /* ----------------------------------------------------------------------- */
270 #if VMAC_ARCH_64
271 /* ----------------------------------------------------------------------- */
272
273 #define nh_16(mp, kp, nw, rh, rl) \
274 { int i; uint64_t th, tl; \
275 rh = rl = 0; \
276 for (i = 0; i < nw; i+= 2) { \
277 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
278 ADD128(rh,rl,th,tl); \
279 } \
280 }
281 #define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1) \
282 { int i; uint64_t th, tl; \
283 rh1 = rl1 = rh = rl = 0; \
284 for (i = 0; i < nw; i+= 2) { \
285 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
286 ADD128(rh,rl,th,tl); \
287 MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
288 ADD128(rh1,rl1,th,tl); \
289 } \
290 }
291
292 #if (VMAC_NHBYTES >= 64) /* These versions do 64 bytes of message at a time */
293 #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
294 { int i; uint64_t th, tl; \
295 rh = rl = 0; \
296 for (i = 0; i < nw; i+= 8) { \
297 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
298 ADD128(rh,rl,th,tl); \
299 MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
300 ADD128(rh,rl,th,tl); \
301 MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
302 ADD128(rh,rl,th,tl); \
303 MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
304 ADD128(rh,rl,th,tl); \
305 } \
306 }
307 #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1) \
308 { int i; uint64_t th, tl; \
309 rh1 = rl1 = rh = rl = 0; \
310 for (i = 0; i < nw; i+= 8) { \
311 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
312 ADD128(rh,rl,th,tl); \
313 MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
314 ADD128(rh1,rl1,th,tl); \
315 MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
316 ADD128(rh,rl,th,tl); \
317 MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
318 ADD128(rh1,rl1,th,tl); \
319 MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
320 ADD128(rh,rl,th,tl); \
321 MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
322 ADD128(rh1,rl1,th,tl); \
323 MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
324 ADD128(rh,rl,th,tl); \
325 MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
326 ADD128(rh1,rl1,th,tl); \
327 } \
328 }
329 #endif
330
331 #define poly_step(ah, al, kh, kl, mh, ml) \
332 { uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
333 /* compute ab*cd, put bd into result registers */ \
334 PMUL64(t3h,t3l,al,kh); \
335 PMUL64(t2h,t2l,ah,kl); \
336 PMUL64(t1h,t1l,ah,2*kh); \
337 PMUL64(ah,al,al,kl); \
338 /* add 2 * ac to result */ \
339 ADD128(ah,al,t1h,t1l); \
340 /* add together ad + bc */ \
341 ADD128(t2h,t2l,t3h,t3l); \
342 /* now (ah,al), (t2l,2*t2h) need summing */ \
343 /* first add the high registers, carrying into t2h */ \
344 ADD128(t2h,ah,z,t2l); \
345 /* double t2h and add top bit of ah */ \
346 t2h = 2 * t2h + (ah >> 63); \
347 ah &= m63; \
348 /* now add the low registers */ \
349 ADD128(ah,al,mh,ml); \
350 ADD128(ah,al,z,t2h); \
351 }
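
/*
 * poly_step performs one Horner step of the L2 polynomial hash,
 *
 *     (ah:al) <- (ah:al)*(kh:kl) + (mh:ml)  (mod 2^127 - 1, lazily reduced),
 *
 * where the key halves are pre-masked with mpoly, so 2*kh cannot overflow
 * and the PMUL64 precondition (top bits clear) is met.
 */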
352
353 /* ----------------------------------------------------------------------- */
354 #elif VMAC_USE_SSE2
355 /* ----------------------------------------------------------------------- */
356
357 // macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
358 #if defined(__GNUC__)
359 // define these in two steps to allow arguments to be expanded
360 #define GNU_AS2(x, y) #x ", " #y ";"
361 #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
362 #define GNU_ASL(x) "\n" #x ":"
363 #define GNU_ASJ(x, y, z) #x " " #y #z ";"
364 #define AS2(x, y) GNU_AS2(x, y)
365 #define AS3(x, y, z) GNU_AS3(x, y, z)
366 #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
367 #define ASL(x) GNU_ASL(x)
368 #define ASJ(x, y, z) GNU_ASJ(x, y, z)
369 #else
370 #define AS2(x, y) __asm {x, y}
371 #define AS3(x, y, z) __asm {x, y, z}
372 #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
373 #define ASL(x) __asm {label##x:}
374 #define ASJ(x, y, z) __asm {x label##y}
375 #endif
376
377 static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
378 {
379 // This assembly version, using MMX registers, is just as fast as the
380 // intrinsics version (which uses XMM registers) on the Intel Core 2,
381 // but is much faster on the Pentium 4. In order to schedule multiplies
382 // as early as possible, the loop interleaves operations for the current
383 // block and the next block. To mask out high 32-bits, we use "movd"
384 // to move the lower 32-bits to the stack and then back. Surprisingly,
385 // this is faster than any other method.
386 #ifdef __GNUC__
387 __asm__ __volatile__
388 (
389 ".intel_syntax noprefix;"
390 #else
391 AS2( mov esi, mp)
392 AS2( mov edi, kp)
393 AS2( mov ecx, nw)
394 AS2( mov eax, rl)
395 AS2( mov edx, rh)
396 #endif
397 AS2( sub esp, 12)
398 AS2( movq mm6, [esi])
399 AS2( paddq mm6, [edi])
400 AS2( movq mm5, [esi+8])
401 AS2( paddq mm5, [edi+8])
402 AS2( add esi, 16)
403 AS2( add edi, 16)
404 AS2( movq mm4, mm6)
405 ASS( pshufw mm2, mm6, 1, 0, 3, 2)
406 AS2( pmuludq mm6, mm5)
407 ASS( pshufw mm3, mm5, 1, 0, 3, 2)
408 AS2( pmuludq mm5, mm2)
409 AS2( pmuludq mm2, mm3)
410 AS2( pmuludq mm3, mm4)
411 AS2( pxor mm7, mm7)
412 AS2( movd [esp], mm6)
413 AS2( psrlq mm6, 32)
414 AS2( movd [esp+4], mm5)
415 AS2( psrlq mm5, 32)
416 AS2( sub ecx, 2)
417 ASJ( jz, 1, f)
418 ASL(0)
419 AS2( movq mm0, [esi])
420 AS2( paddq mm0, [edi])
421 AS2( movq mm1, [esi+8])
422 AS2( paddq mm1, [edi+8])
423 AS2( add esi, 16)
424 AS2( add edi, 16)
425 AS2( movq mm4, mm0)
426 AS2( paddq mm5, mm2)
427 ASS( pshufw mm2, mm0, 1, 0, 3, 2)
428 AS2( pmuludq mm0, mm1)
429 AS2( movd [esp+8], mm3)
430 AS2( psrlq mm3, 32)
431 AS2( paddq mm5, mm3)
432 ASS( pshufw mm3, mm1, 1, 0, 3, 2)
433 AS2( pmuludq mm1, mm2)
434 AS2( pmuludq mm2, mm3)
435 AS2( pmuludq mm3, mm4)
436 AS2( movd mm4, [esp])
437 AS2( paddq mm7, mm4)
438 AS2( movd mm4, [esp+4])
439 AS2( paddq mm6, mm4)
440 AS2( movd mm4, [esp+8])
441 AS2( paddq mm6, mm4)
442 AS2( movd [esp], mm0)
443 AS2( psrlq mm0, 32)
444 AS2( paddq mm6, mm0)
445 AS2( movd [esp+4], mm1)
446 AS2( psrlq mm1, 32)
447 AS2( paddq mm5, mm1)
448 AS2( sub ecx, 2)
449 ASJ( jnz, 0, b)
450 ASL(1)
451 AS2( paddq mm5, mm2)
452 AS2( movd [esp+8], mm3)
453 AS2( psrlq mm3, 32)
454 AS2( paddq mm5, mm3)
455 AS2( movd mm4, [esp])
456 AS2( paddq mm7, mm4)
457 AS2( movd mm4, [esp+4])
458 AS2( paddq mm6, mm4)
459 AS2( movd mm4, [esp+8])
460 AS2( paddq mm6, mm4)
461
462 ASS( pshufw mm0, mm7, 3, 2, 1, 0)
463 AS2( psrlq mm7, 32)
464 AS2( paddq mm6, mm7)
465 AS2( punpckldq mm0, mm6)
466 AS2( psrlq mm6, 32)
467 AS2( paddq mm5, mm6)
468 AS2( movq [eax], mm0)
469 AS2( movq [edx], mm5)
470 AS2( add esp, 12)
471 #ifdef __GNUC__
472 ".att_syntax prefix;"
473 :
474 : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
475 : "memory", "cc"
476 );
477 #endif
478 }
479 #define nh_16(mp, kp, nw, rh, rl) nh_16_func(mp, kp, nw, &(rh), &(rl));
480
481 static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
482 const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
483 {
484 // This code tries to schedule the multiplies as early as possible to overcome
485 // the long latencies on the Pentium 4. It also minimizes "movq" instructions
486 // which are very expensive on the P4.
487
488 #define a0 [eax+0]
489 #define a1 [eax+4]
490 #define a2 [ebx+0]
491 #define a3 [ebx+4]
492 #define k0 [ecx+0]
493 #define k1 [ecx+4]
494 #define k2 [edx+0]
495 #define k3 [edx+4]
496
497 #ifdef __GNUC__
498 uint32_t temp;
499 __asm__ __volatile__
500 (
501 "mov %%ebx, %0;"
502 "mov %1, %%ebx;"
503 ".intel_syntax noprefix;"
504 #else
505 AS2( mov ebx, ahi)
506 AS2( mov edx, kh)
507 AS2( mov eax, alo)
508 AS2( mov ecx, kl)
509 AS2( mov esi, mh)
510 AS2( mov edi, ml)
511 #endif
512
513 AS2( movd mm0, a3)
514 AS2( movq mm4, mm0)
515 AS2( pmuludq mm0, k3) // a3*k3
516 AS2( movd mm1, a0)
517 AS2( pmuludq mm1, k2) // a0*k2
518 AS2( movd mm2, a1)
519 AS2( movd mm6, k1)
520 AS2( pmuludq mm2, mm6) // a1*k1
521 AS2( movd mm3, a2)
522 AS2( movq mm5, mm3)
523 AS2( movd mm7, k0)
524 AS2( pmuludq mm3, mm7) // a2*k0
525 AS2( pmuludq mm4, mm7) // a3*k0
526 AS2( pmuludq mm5, mm6) // a2*k1
527 AS2( psllq mm0, 1)
528 AS2( paddq mm0, [esi])
529 AS2( paddq mm0, mm1)
530 AS2( movd mm1, a1)
531 AS2( paddq mm4, mm5)
532 AS2( movq mm5, mm1)
533 AS2( pmuludq mm1, k2) // a1*k2
534 AS2( paddq mm0, mm2)
535 AS2( movd mm2, a0)
536 AS2( paddq mm0, mm3)
537 AS2( movq mm3, mm2)
538 AS2( pmuludq mm2, k3) // a0*k3
539 AS2( pmuludq mm3, mm7) // a0*k0
540 AS2( movd esi, mm0)
541 AS2( psrlq mm0, 32)
542 AS2( pmuludq mm7, mm5) // a1*k0
543 AS2( pmuludq mm5, k3) // a1*k3
544 AS2( paddq mm0, mm1)
545 AS2( movd mm1, a2)
546 AS2( pmuludq mm1, k2) // a2*k2
547 AS2( paddq mm0, mm2)
548 AS2( paddq mm0, mm4)
549 AS2( movq mm4, mm0)
550 AS2( movd mm2, a3)
551 AS2( pmuludq mm2, mm6) // a3*k1
552 AS2( pmuludq mm6, a0) // a0*k1
553 AS2( psrlq mm0, 31)
554 AS2( paddq mm0, mm3)
555 AS2( movd mm3, [edi])
556 AS2( paddq mm0, mm3)
557 AS2( movd mm3, a2)
558 AS2( pmuludq mm3, k3) // a2*k3
559 AS2( paddq mm5, mm1)
560 AS2( movd mm1, a3)
561 AS2( pmuludq mm1, k2) // a3*k2
562 AS2( paddq mm5, mm2)
563 AS2( movd mm2, [edi+4])
564 AS2( psllq mm5, 1)
565 AS2( paddq mm0, mm5)
566 AS2( movq mm5, mm0)
567 AS2( psllq mm4, 33)
568 AS2( psrlq mm0, 32)
569 AS2( paddq mm6, mm7)
570 AS2( movd mm7, esi)
571 AS2( paddq mm0, mm6)
572 AS2( paddq mm0, mm2)
573 AS2( paddq mm3, mm1)
574 AS2( psllq mm3, 1)
575 AS2( paddq mm0, mm3)
576 AS2( psrlq mm4, 1)
577 AS2( punpckldq mm5, mm0)
578 AS2( psrlq mm0, 32)
579 AS2( por mm4, mm7)
580 AS2( paddq mm0, mm4)
581 AS2( movq a0, mm5)
582 AS2( movq a2, mm0)
583 #ifdef __GNUC__
584 ".att_syntax prefix;"
585 "mov %0, %%ebx;"
586 : "=m" (temp)
587 : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
588 : "memory", "cc"
589 );
590 #endif
591
592
593 #undef a0
594 #undef a1
595 #undef a2
596 #undef a3
597 #undef k0
598 #undef k1
599 #undef k2
600 #undef k3
601 }
602
603 #define poly_step(ah, al, kh, kl, mh, ml) \
604 poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
605
606 /* ----------------------------------------------------------------------- */
607 #else /* not VMAC_ARCH_64 and not SSE2 */
608 /* ----------------------------------------------------------------------- */
609
610 #ifndef nh_16
611 #define nh_16(mp, kp, nw, rh, rl) \
612 { uint64_t t1,t2,m1,m2,t; \
613 int i; \
614 rh = rl = t = 0; \
615 for (i = 0; i < nw; i+=2) { \
616 t1 = get64PE(mp+i) + kp[i]; \
617 t2 = get64PE(mp+i+1) + kp[i+1]; \
618 m2 = MUL32(t1 >> 32, t2); \
619 m1 = MUL32(t1, t2 >> 32); \
620 ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2)); \
621 rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32); \
622 t += (uint64_t)(uint32_t)m1 + (uint32_t)m2; \
623 } \
624 ADD128(rh,rl,(t >> 32),(t << 32)); \
625 }
626 #endif
627
628 static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
629 const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
630 {
631
632 #if VMAC_ARCH_BIG_ENDIAN
633 #define INDEX_HIGH 0
634 #define INDEX_LOW 1
635 #else
636 #define INDEX_HIGH 1
637 #define INDEX_LOW 0
638 #endif
639
640 #define a0 *(((uint32_t*)alo)+INDEX_LOW)
641 #define a1 *(((uint32_t*)alo)+INDEX_HIGH)
642 #define a2 *(((uint32_t*)ahi)+INDEX_LOW)
643 #define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
644 #define k0 *(((uint32_t*)kl)+INDEX_LOW)
645 #define k1 *(((uint32_t*)kl)+INDEX_HIGH)
646 #define k2 *(((uint32_t*)kh)+INDEX_LOW)
647 #define k3 *(((uint32_t*)kh)+INDEX_HIGH)
648
649 uint64_t p, q, t;
650 uint32_t t2;
651
652 p = MUL32(a3, k3);
653 p += p;
654 p += *(uint64_t *)mh;
655 p += MUL32(a0, k2);
656 p += MUL32(a1, k1);
657 p += MUL32(a2, k0);
658 t = (uint32_t)(p);
659 p >>= 32;
660 p += MUL32(a0, k3);
661 p += MUL32(a1, k2);
662 p += MUL32(a2, k1);
663 p += MUL32(a3, k0);
664 t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
665 p >>= 31;
666 p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
667 p += MUL32(a0, k0);
668 q = MUL32(a1, k3);
669 q += MUL32(a2, k2);
670 q += MUL32(a3, k1);
671 q += q;
672 p += q;
673 t2 = (uint32_t)(p);
674 p >>= 32;
675 p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
676 p += MUL32(a0, k1);
677 p += MUL32(a1, k0);
678 q = MUL32(a2, k3);
679 q += MUL32(a3, k2);
680 q += q;
681 p += q;
682 *(uint64_t *)(alo) = (p << 32) | t2;
683 p >>= 32;
684 *(uint64_t *)(ahi) = p + t;
685
686 #undef a0
687 #undef a1
688 #undef a2
689 #undef a3
690 #undef k0
691 #undef k1
692 #undef k2
693 #undef k3
694 }
695
696 #define poly_step(ah, al, kh, kl, mh, ml) \
697 poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
698
699 /* ----------------------------------------------------------------------- */
700 #endif /* end of specialized NH and poly definitions */
701 /* ----------------------------------------------------------------------- */
702
703 /* At least nh_16 is defined. Define others as needed here */
704 #ifndef nh_16_2
705 #define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2) \
706 nh_16(mp, kp, nw, rh, rl); \
707 nh_16(mp, ((kp)+2), nw, rh2, rl2);
708 #endif
709 #ifndef nh_vmac_nhbytes
710 #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
711 nh_16(mp, kp, nw, rh, rl)
712 #endif
713 #ifndef nh_vmac_nhbytes_2
714 #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2) \
715 nh_vmac_nhbytes(mp, kp, nw, rh, rl); \
716 nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
717 #endif
718
719 /* ----------------------------------------------------------------------- */
720
721 static void vhash_abort(vmac_ctx_t *ctx)
722 {
723 ctx->polytmp[0] = ctx->polykey[0] ;
724 ctx->polytmp[1] = ctx->polykey[1] ;
725 #if (VMAC_TAG_LEN == 128)
726 ctx->polytmp[2] = ctx->polykey[2] ;
727 ctx->polytmp[3] = ctx->polykey[3] ;
728 #endif
729 ctx->first_block_processed = 0;
730 }
731
732 /* ----------------------------------------------------------------------- */
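
/* Output stage of VHASH: the (lazily reduced) polynomial accumulator,
 * together with the bit length of the final message fragment, is fully
 * reduced mod 2^127-1, rewritten as a*(2^64-2^32) + b, and mapped to
 * ((a+k1)*(b+k2)) mod p64, following the comments in the body below. */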
733 static uint64_t l3hash(uint64_t p1, uint64_t p2,
734 uint64_t k1, uint64_t k2, uint64_t len)
735 {
736 uint64_t rh, rl, t, z=0;
737
738 /* fully reduce (p1,p2)+(len,0) mod p127 */
739 t = p1 >> 63;
740 p1 &= m63;
741 ADD128(p1, p2, len, t);
742 /* At this point, (p1,p2) is at most 2^127+(len<<64) */
743 t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
744 ADD128(p1, p2, z, t);
745 p1 &= m63;
746
747 /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
748 t = p1 + (p2 >> 32);
749 t += (t >> 32);
750 t += (uint32_t)t > 0xfffffffeu;
751 p1 += (t >> 32);
752 p2 += (p1 << 32);
753
754 /* compute (p1+k1)%p64 and (p2+k2)%p64 */
755 p1 += k1;
756 p1 += (0 - (p1 < k1)) & 257;
757 p2 += k2;
758 p2 += (0 - (p2 < k2)) & 257;
759
760 /* compute (p1+k1)*(p2+k2)%p64 */
761 MUL64(rh, rl, p1, p2);
762 t = rh >> 56;
763 ADD128(t, rl, z, rh);
764 rh <<= 8;
765 ADD128(t, rl, z, rh);
766 t += t << 8;
767 rl += t;
768 rl += (0 - (rl < t)) & 257;
769 rl += (0 - (rl > p64-1)) & 257;
770 return rl;
771 }
772
773 /* ----------------------------------------------------------------------- */
774
775 void vhash_update(unsigned char *m,
776                   unsigned int mbytes, /* Positive multiple of VMAC_NHBYTES */
777 vmac_ctx_t *ctx)
778 {
779 uint64_t rh, rl, *mptr;
780 const uint64_t *kptr = (uint64_t *)ctx->nhkey;
781 int i;
782 uint64_t ch, cl;
783 uint64_t pkh = ctx->polykey[0];
784 uint64_t pkl = ctx->polykey[1];
785 #if (VMAC_TAG_LEN == 128)
786 uint64_t ch2, cl2, rh2, rl2;
787 uint64_t pkh2 = ctx->polykey[2];
788 uint64_t pkl2 = ctx->polykey[3];
789 #endif
790
791 mptr = (uint64_t *)m;
792 i = mbytes / VMAC_NHBYTES; /* Must be non-zero */
793
794 ch = ctx->polytmp[0];
795 cl = ctx->polytmp[1];
796 #if (VMAC_TAG_LEN == 128)
797 ch2 = ctx->polytmp[2];
798 cl2 = ctx->polytmp[3];
799 #endif
800
801 if ( ! ctx->first_block_processed) {
802 ctx->first_block_processed = 1;
803 #if (VMAC_TAG_LEN == 64)
804 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
805 #else
806 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
807 rh2 &= m62;
808 ADD128(ch2,cl2,rh2,rl2);
809 #endif
810 rh &= m62;
811 ADD128(ch,cl,rh,rl);
812 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
813 i--;
814 }
815
816 while (i--) {
817 #if (VMAC_TAG_LEN == 64)
818 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
819 #else
820 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
821 rh2 &= m62;
822 poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
823 #endif
824 rh &= m62;
825 poly_step(ch,cl,pkh,pkl,rh,rl);
826 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
827 }
828
829 ctx->polytmp[0] = ch;
830 ctx->polytmp[1] = cl;
831 #if (VMAC_TAG_LEN == 128)
832 ctx->polytmp[2] = ch2;
833 ctx->polytmp[3] = cl2;
834 #endif
835 #if VMAC_USE_SSE2
836 _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
837 #endif
838 }
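
/*
 * Incremental hashing sketch (caller buffers the data; names illustrative):
 *
 *     vhash_update(chunk, k * VMAC_NHBYTES, &ctx);  // whole blocks, repeatable
 *     tag = vhash(tail, tail_bytes, &tagl, &ctx);   // final (partial) block
 *
 * vhash_update only accepts positive multiples of VMAC_NHBYTES; the final
 * vhash()/vmac() call absorbs any remainder and resets the running state.
 */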
839
840 /* ----------------------------------------------------------------------- */
841
842 uint64_t vhash(unsigned char m[],
843 unsigned int mbytes,
844 uint64_t *tagl,
845 vmac_ctx_t *ctx)
846 {
847 uint64_t rh, rl, *mptr;
848 const uint64_t *kptr = (uint64_t *)ctx->nhkey;
849 int i, remaining;
850 uint64_t ch, cl;
851 uint64_t pkh = ctx->polykey[0];
852 uint64_t pkl = ctx->polykey[1];
853 #if (VMAC_TAG_LEN == 128)
854 uint64_t ch2, cl2, rh2, rl2;
855 uint64_t pkh2 = ctx->polykey[2];
856 uint64_t pkl2 = ctx->polykey[3];
857 #endif
858
859 mptr = (uint64_t *)m;
860 i = mbytes / VMAC_NHBYTES;
861 remaining = mbytes % VMAC_NHBYTES;
862
863 if (ctx->first_block_processed)
864 {
865 ch = ctx->polytmp[0];
866 cl = ctx->polytmp[1];
867 #if (VMAC_TAG_LEN == 128)
868 ch2 = ctx->polytmp[2];
869 cl2 = ctx->polytmp[3];
870 #endif
871 }
872 else if (i)
873 {
874 #if (VMAC_TAG_LEN == 64)
875 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
876 #else
877 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
878 ch2 &= m62;
879 ADD128(ch2,cl2,pkh2,pkl2);
880 #endif
881 ch &= m62;
882 ADD128(ch,cl,pkh,pkl);
883 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
884 i--;
885 }
886 else if (remaining)
887 {
888 #if (VMAC_TAG_LEN == 64)
889 nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
890 #else
891 nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
892 ch2 &= m62;
893 ADD128(ch2,cl2,pkh2,pkl2);
894 #endif
895 ch &= m62;
896 ADD128(ch,cl,pkh,pkl);
897 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
898 goto do_l3;
899 }
900 else /* Empty String */
901 {
902 ch = pkh; cl = pkl;
903 #if (VMAC_TAG_LEN == 128)
904 ch2 = pkh2; cl2 = pkl2;
905 #endif
906 goto do_l3;
907 }
908
909 while (i--) {
910 #if (VMAC_TAG_LEN == 64)
911 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
912 #else
913 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
914 rh2 &= m62;
915 poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
916 #endif
917 rh &= m62;
918 poly_step(ch,cl,pkh,pkl,rh,rl);
919 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
920 }
921 if (remaining) {
922 #if (VMAC_TAG_LEN == 64)
923 nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
924 #else
925 nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
926 rh2 &= m62;
927 poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
928 #endif
929 rh &= m62;
930 poly_step(ch,cl,pkh,pkl,rh,rl);
931 }
932
933 do_l3:
934 #if VMAC_USE_SSE2
935 _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
936 #endif
937 vhash_abort(ctx);
938 remaining *= 8;
939 #if (VMAC_TAG_LEN == 128)
940 *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
941 #endif
942 return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
943 }
944
945 /* ----------------------------------------------------------------------- */
946
947 uint64_t vmac(unsigned char m[],
948 unsigned int mbytes,
949 unsigned char n[16],
950 uint64_t *tagl,
951 vmac_ctx_t *ctx)
952 {
953 #if (VMAC_TAG_LEN == 64)
954 uint64_t *in_n, *out_p;
955 uint64_t p, h;
956 int i;
957
958 #if VMAC_CACHE_NONCES
959 in_n = ctx->cached_nonce;
960 out_p = ctx->cached_aes;
961 #else
962 uint64_t tmp[2];
963 in_n = out_p = tmp;
964 #endif
965
966 i = n[15] & 1;
967 #if VMAC_CACHE_NONCES
968 if ((*(uint64_t *)(n+8) != in_n[1]) ||
969 (*(uint64_t *)(n ) != in_n[0])) {
970 #endif
971
972 in_n[0] = *(uint64_t *)(n );
973 in_n[1] = *(uint64_t *)(n+8);
974 ((unsigned char *)in_n)[15] &= 0xFE;
975 aes_encryption(in_n, out_p, &ctx->cipher_key);
976
977 #if VMAC_CACHE_NONCES
978 ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
979 }
980 #endif
981 p = get64BE(out_p + i);
982 h = vhash(m, mbytes, (uint64_t *)0, ctx);
983 return p + h;
984 #else
985 uint64_t tmp[2];
986 uint64_t th,tl;
987 aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
988 th = vhash(m, mbytes, &tl, ctx);
989 th += get64BE(tmp);
990 *tagl = tl + get64BE(tmp+1);
991 return th;
992 #endif
993 }
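
/*
 * Typical all-in-one usage (illustrative sketch; names are hypothetical and
 * the nonce must not repeat under a given key):
 *
 *     static vmac_ctx_t ctx;
 *     uint64_t tagl, tag;
 *     vmac_set_key(key16, &ctx);                      // 16-byte AES key
 *     tag = vmac(msg, msglen, nonce16, &tagl, &ctx);  // 16-byte nonce
 *     // With VMAC_TAG_LEN == 128, the second tag half is written to tagl.
 */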
994
995 /* ----------------------------------------------------------------------- */
996
997 void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
998 {
999 uint64_t in[2] = {0}, out[2];
1000 unsigned i;
1001 aes_key_setup(user_key, &ctx->cipher_key);
1002
1003 /* Fill nh key */
1004 ((unsigned char *)in)[0] = 0x80;
1005 for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
1006 aes_encryption((unsigned char *)in, (unsigned char *)out,
1007 &ctx->cipher_key);
1008 ctx->nhkey[i ] = get64BE(out);
1009 ctx->nhkey[i+1] = get64BE(out+1);
1010 ((unsigned char *)in)[15] += 1;
1011 }
1012
1013 /* Fill poly key */
1014 ((unsigned char *)in)[0] = 0xC0;
1015 in[1] = 0;
1016 for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
1017 aes_encryption((unsigned char *)in, (unsigned char *)out,
1018 &ctx->cipher_key);
1019 ctx->polytmp[i ] = ctx->polykey[i ] = get64BE(out) & mpoly;
1020 ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
1021 ((unsigned char *)in)[15] += 1;
1022 }
1023
1024 /* Fill ip key */
1025 ((unsigned char *)in)[0] = 0xE0;
1026 in[1] = 0;
1027 for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
1028 do {
1029 aes_encryption((unsigned char *)in, (unsigned char *)out,
1030 &ctx->cipher_key);
1031 ctx->l3key[i ] = get64BE(out);
1032 ctx->l3key[i+1] = get64BE(out+1);
1033 ((unsigned char *)in)[15] += 1;
1034 } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
1035 }
1036
1037 /* Invalidate nonce/aes cache and reset other elements */
1038 #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
1039 ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
1040 ctx->cached_nonce[1] = (uint64_t)0; /* Ensure illegal nonce */
1041 #endif
1042 ctx->first_block_processed = 0;
1043 }
1044
1045 /* ----------------------------------------------------------------------- */
1046
1047
1048 #if VMAC_RUN_TESTS
1049
1050 #include <stdlib.h>
1051 #include <stdio.h>
1052 #include <time.h>
1053 #include <string.h>
1054
1055 unsigned prime(void) /* Wake variable speed cpu, get rough speed estimate */
1056 {
1057 volatile uint64_t i;
1058 volatile uint64_t j=1;
1059 unsigned cnt=0;
1060 volatile clock_t ticks = clock();
1061 do {
1062 for (i = 0; i < 500000; i++) {
1063 uint64_t x = get64PE(&j);
1064 j = x * x + (uint64_t)ticks;
1065 }
1066 cnt++;
1067 } while (clock() - ticks < (CLOCKS_PER_SEC/2));
1068 return cnt; /* cnt is millions of iterations per second */
1069 }
1070
1071 int main(void)
1072 {
1073 ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
1074 uint64_t res, tagl;
1075 void *p;
1076 unsigned char *m;
1077 ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
1078 ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
1079 unsigned int vector_lengths[] = {0,3,48,300,3000000};
1080 #if (VMAC_TAG_LEN == 64)
1081 ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
1082 "E8421F61D573D298","4492DF6C5CAC1BBE",
1083 "09BA597DD7601113"};
1084 #else
1085 ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
1086 "4EE815A06A1D71EDD36FC75D51188A42",
1087 "09F2C80C8E1007A0C12FAE19FE4504AE",
1088 "66438817154850C61D8A412164803BCB",
1089 "2B6B02288FFC461B75485DE893C629DC"};
1090 #endif
1091 unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
1092 unsigned i, j, *speed_iters;
1093 clock_t ticks;
1094 double cpb;
1095 const unsigned int buf_len = 3 * (1 << 20);
1096
1097 j = prime();
1098 i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
1099 speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
1100 speed_iters[i-1] = j * (1 << 12);
1101 while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);
1102
1103 /* Initialize context and message buffer, all 16-byte aligned */
1104 p = malloc(buf_len + 32);
1105 m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
1106 memset(m, 0, buf_len + 16);
1107 vmac_set_key(key, &ctx);
1108
1109 /* Test incremental and all-in-one interfaces for correctness */
1110 vmac_set_key(key, &ctx_aio);
1111 vmac_set_key(key, &ctx_inc1);
1112 vmac_set_key(key, &ctx_inc2);
1113
1114
1115 /*
1116 for (i = 0; i <= 512; i++) {
1117 vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
1118 tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
1119 nonce, &tagl, &ctx);
1120 vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
1121 for (j = 0; j < vector_lengths[i]; j++)
1122 m[j] = (unsigned char)('a'+j%3);
1123
1124 }
1125 */
1126
1127 /* Generate vectors */
1128 for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
1129 for (j = 0; j < vector_lengths[i]; j++)
1130 m[j] = (unsigned char)('a'+j%3);
1131 res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
1132 #if (VMAC_TAG_LEN == 64)
1133 printf("\'abc\' * %7u: %016llX Should be: %s\n",
1134 vector_lengths[i]/3,res,should_be[i]);
1135 #else
1136 printf("\'abc\' * %7u: %016llX%016llX\nShould be : %s\n",
1137 vector_lengths[i]/3,res,tagl,should_be[i]);
1138 #endif
1139 }
1140
1141 /* Speed test */
1142 for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
1143 ticks = clock();
1144 for (j = 0; j < speed_iters[i]; j++) {
1145 #if HASH_ONLY
1146 res = vhash(m, speed_lengths[i], &tagl, &ctx);
1147 #else
1148 res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
1149 nonce[7]++;
1150 #endif
1151 }
1152 ticks = clock() - ticks;
1153 cpb = ((ticks*VMAC_HZ)/
1154 ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
1155 printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
1156 }
1157 return 1;
1158 }
1159
1160 #endif
1161