1 /* --------------------------------------------------------------------------
2 * VMAC and VHASH Implementation by Ted Krovetz (tdk@acm.org) and Wei Dai.
3  * This implementation is hereby placed in the public domain.
4  * The authors offer no warranty. Use at your own risk.
5 * Please send bug reports to the authors.
6 * Last modified: 17 APR 08, 1700 PDT
7 * ----------------------------------------------------------------------- */
8
9 /* start for Xen */
10 #include <xen/init.h>
11 #include <xen/byteswap.h>
12 #include <xen/types.h>
13 #include <xen/lib.h>
14 #include <crypto/vmac.h>
15 #define UINT64_C(x) x##ULL
16 /* end for Xen */
17
18 /* Enable code tuned for 64-bit registers; otherwise tuned for 32-bit */
19 #ifndef VMAC_ARCH_64
20 #define VMAC_ARCH_64 (__x86_64__ || __ppc64__ || _M_X64)
21 #endif
22
23 /* Enable code tuned for Intel SSE2 instruction set */
24 #if ((__SSE2__ || (_M_IX86_FP >= 2)) && ( ! VMAC_ARCH_64))
25 #define VMAC_USE_SSE2 1
26 #include <emmintrin.h>
27 #endif
28
29 /* Native word reads. Update (or define via compiler) if incorrect */
30 #ifndef VMAC_ARCH_BIG_ENDIAN /* Assume big-endian unless on the list */
31 #define VMAC_ARCH_BIG_ENDIAN \
32 (!(__x86_64__ || __i386__ || _M_IX86 || \
33 _M_X64 || __ARMEL__ || __MIPSEL__))
34 #endif
35
36 /* ----------------------------------------------------------------------- */
37 /* Constants and masks */
38
39 const uint64_t p64 = UINT64_C(0xfffffffffffffeff); /* 2^64 - 257 prime */
40 const uint64_t m62 = UINT64_C(0x3fffffffffffffff); /* 62-bit mask */
41 const uint64_t m63 = UINT64_C(0x7fffffffffffffff); /* 63-bit mask */
42 const uint64_t m64 = UINT64_C(0xffffffffffffffff); /* 64-bit mask */
43 const uint64_t mpoly = UINT64_C(0x1fffffff1fffffff); /* Poly key mask */
44
45 /* ----------------------------------------------------------------------- *
46 * The following routines are used in this implementation. They are
47 * written via macros to simulate zero-overhead call-by-reference.
48  * All have default implementations for when they are not defined in an
49  * architecture-specific manner.
50 *
51 * MUL64: 64x64->128-bit multiplication
52 * PMUL64: assumes top bits cleared on inputs
53 * ADD128: 128x128->128-bit addition
54 * ----------------------------------------------------------------------- */
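/* Illustration only (not part of the build): on compilers that provide the
 * unsigned __int128 extension, the three primitives above are equivalent to
 * the following plain-C reference, which may help when porting to a new
 * architecture:
 *
 *   static void mul64_ref(uint64_t *rh, uint64_t *rl, uint64_t a, uint64_t b)
 *   {
 *       unsigned __int128 p = (unsigned __int128)a * b;  // full 128-bit product
 *       *rh = (uint64_t)(p >> 64);
 *       *rl = (uint64_t)p;
 *   }
 *
 *   static void add128_ref(uint64_t *rh, uint64_t *rl, uint64_t ih, uint64_t il)
 *   {
 *       unsigned __int128 s = (((unsigned __int128)*rh << 64) | *rl)
 *                           + (((unsigned __int128)ih << 64) | il);
 *       *rh = (uint64_t)(s >> 64);                       // wraps mod 2^128
 *       *rl = (uint64_t)s;
 *   }
 */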
55
56 /* ----------------------------------------------------------------------- */
57 #if (__GNUC__ && (__x86_64__ || __amd64__))
58 /* ----------------------------------------------------------------------- */
59
60 #define ADD128(rh,rl,ih,il) \
61 asm ("addq %3, %1 \n\t" \
62 "adcq %2, %0" \
63 : "+r"(rh),"+r"(rl) \
64 : "r"(ih),"r"(il) : "cc");
65
66 #define MUL64(rh,rl,i1,i2) \
67 asm ("mulq %3" : "=a"(rl), "=d"(rh) : "a"(i1), "r"(i2) : "cc")
68
69 #define PMUL64 MUL64
70
71
72 /* ----------------------------------------------------------------------- */
73 #elif (__GNUC__ && __ppc64__)
74 /* ----------------------------------------------------------------------- */
75
76 #define ADD128(rh,rl,ih,il) \
77 asm volatile ( "addc %1, %1, %3 \n\t" \
78 "adde %0, %0, %2" \
79 : "+r"(rh),"+r"(rl) \
80 : "r"(ih),"r"(il));
81
82 #define MUL64(rh,rl,i1,i2) \
83 { uint64_t _i1 = (i1), _i2 = (i2); \
84 rl = _i1 * _i2; \
85 asm volatile ("mulhdu %0, %1, %2" : "=r" (rh) : "r" (_i1), "r" (_i2));\
86 }
87
88 #define PMUL64 MUL64
89
90 /* ----------------------------------------------------------------------- */
91 #elif _MSC_VER
92 /* ----------------------------------------------------------------------- */
93
94 #include <intrin.h>
95
96 #if (_M_IA64 || _M_X64) && \
97 (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
98 #define MUL64(rh,rl,i1,i2) (rl) = _umul128(i1,i2,&(rh));
99 #pragma intrinsic(_umul128)
100 #define PMUL64 MUL64
101 #endif
102
103 /* MSVC uses add, adc in this version */
104 #define ADD128(rh,rl,ih,il) \
105 { uint64_t _il = (il); \
106 (rl) += (_il); \
107 (rh) += (ih) + ((rl) < (_il)); \
108 }
109
110 #if _MSC_VER >= 1400 && \
111 (!defined(__INTEL_COMPILER) || __INTEL_COMPILER >= 1000)
112 #define MUL32(i1,i2) (__emulu((uint32_t)(i1),(uint32_t)(i2)))
113 #pragma intrinsic(__emulu)
114 #endif
115
116 /* ----------------------------------------------------------------------- */
117 #endif
118 /* ----------------------------------------------------------------------- */
119
120 #if __GNUC__
121 #define ALIGN(n) __attribute__ ((aligned(n)))
122 #define NOINLINE __attribute__ ((noinline))
123 #elif _MSC_VER
124 #define ALIGN(n) __declspec(align(n))
125 #define NOINLINE __declspec(noinline)
126 #else
127 #define ALIGN(n)
128 #define NOINLINE
129 #endif
130
131 /* ----------------------------------------------------------------------- */
132 /* Default implementations, if not defined above */
133 /* ----------------------------------------------------------------------- */
134
135 #ifndef ADD128
136 #define ADD128(rh,rl,ih,il) \
137 { uint64_t _il = (il); \
138 (rl) += (_il); \
139 if ((rl) < (_il)) (rh)++; \
140 (rh) += (ih); \
141 }
142 #endif
143
144 #ifndef MUL32
145 #define MUL32(i1,i2) ((uint64_t)(uint32_t)(i1)*(uint32_t)(i2))
146 #endif
147
148 #ifndef PMUL64 /* rh may not be same as i1 or i2 */
149 #define PMUL64(rh,rl,i1,i2) /* Assumes m doesn't overflow */ \
150 { uint64_t _i1 = (i1), _i2 = (i2); \
151 uint64_t m = MUL32(_i1,_i2>>32) + MUL32(_i1>>32,_i2); \
152 rh = MUL32(_i1>>32,_i2>>32); \
153 rl = MUL32(_i1,_i2); \
154 ADD128(rh,rl,(m >> 32),(m << 32)); \
155 }
156 #endif
157
158 #ifndef MUL64
159 #define MUL64(rh,rl,i1,i2) \
160 { uint64_t _i1 = (i1), _i2 = (i2); \
161 uint64_t m1= MUL32(_i1,_i2>>32); \
162 uint64_t m2= MUL32(_i1>>32,_i2); \
163 rh = MUL32(_i1>>32,_i2>>32); \
164 rl = MUL32(_i1,_i2); \
165 ADD128(rh,rl,(m1 >> 32),(m1 << 32)); \
166 ADD128(rh,rl,(m2 >> 32),(m2 << 32)); \
167 }
168 #endif
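/* The portable MUL64 above is schoolbook multiplication on 32-bit halves:
 *   a*b = (a1*2^32 + a0)*(b1*2^32 + b0)
 *       = a1*b1*2^64 + (a1*b0 + a0*b1)*2^32 + a0*b0,
 * with the two middle products folded in separately so that their carries
 * are captured by ADD128.  PMUL64 adds the middle products together first,
 * which is only safe when that sum cannot overflow 64 bits; the mpoly mask
 * applied to the polynomial key words keeps the poly_step callers within
 * that bound.
 */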
169
170 /* ----------------------------------------------------------------------- */
171
172 #if (VMAC_PREFER_BIG_ENDIAN)
173 # define get64PE get64BE
174 #else
175 # define get64PE get64LE
176 #endif
177
178 #if (VMAC_ARCH_BIG_ENDIAN)
179 # define get64BE(ptr) (*(uint64_t *)(ptr))
180 # define get64LE(ptr) bswap64(*(uint64_t *)(ptr))
181 #else /* assume little-endian */
182 # define get64BE(ptr) bswap64(*(uint64_t *)(ptr))
183 # define get64LE(ptr) (*(uint64_t *)(ptr))
184 #endif
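/* get64BE/get64LE read an aligned 64-bit word as big- or little-endian;
 * get64PE ("preferred endian") is the byte order in which the NH stage reads
 * message words, little-endian by default unless VMAC_PREFER_BIG_ENDIAN is
 * defined. */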
185
186
187 /* --------------------------------------------------------------------- *
188 * For highest performance the L1 NH and L2 polynomial hashes should be
189  * carefully implemented to take advantage of one's target architecture.
190  * Here these two hash functions are defined multiple times; once for
191  * 64-bit architectures, once for 32-bit SSE2 architectures, and once
192  * for all remaining (32-bit) architectures.
193 * For each, nh_16 *must* be defined (works on multiples of 16 bytes).
194 * Optionally, nh_vmac_nhbytes can be defined (for multiples of
195 * VMAC_NHBYTES), and nh_16_2 and nh_vmac_nhbytes_2 (versions that do two
196 * NH computations at once).
197 * --------------------------------------------------------------------- */
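/* Illustration only: mathematically, nh_16 computes
 *   NH(M,K) = sum over word pairs of
 *             ((M[i] + K[i]) mod 2^64) * ((M[i+1] + K[i+1]) mod 2^64)
 * accumulated mod 2^128.  Assuming the unsigned __int128 extension, a
 * straightforward reference would be:
 *
 *   static void nh_16_ref(const uint64_t *mp, const uint64_t *kp, int nw,
 *                         uint64_t *rh, uint64_t *rl)
 *   {
 *       unsigned __int128 acc = 0;
 *       int i;
 *       for (i = 0; i < nw; i += 2)
 *           acc += (unsigned __int128)(get64PE(mp + i) + kp[i]) *
 *                  (get64PE(mp + i + 1) + kp[i + 1]);
 *       *rh = (uint64_t)(acc >> 64);
 *       *rl = (uint64_t)acc;
 *   }
 *
 * nh_vmac_nhbytes is the same computation unrolled; the *_2 variants run a
 * second copy keyed with the key shifted by two words, as needed for
 * 128-bit tags.
 */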
198
199 /* ----------------------------------------------------------------------- */
200 #if VMAC_ARCH_64
201 /* ----------------------------------------------------------------------- */
202
203 #define nh_16(mp, kp, nw, rh, rl) \
204 { int i; uint64_t th, tl; \
205 rh = rl = 0; \
206 for (i = 0; i < nw; i+= 2) { \
207 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
208 ADD128(rh,rl,th,tl); \
209 } \
210 }
211 #define nh_16_2(mp, kp, nw, rh, rl, rh1, rl1) \
212 { int i; uint64_t th, tl; \
213 rh1 = rl1 = rh = rl = 0; \
214 for (i = 0; i < nw; i+= 2) { \
215 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
216 ADD128(rh,rl,th,tl); \
217 MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
218 ADD128(rh1,rl1,th,tl); \
219 } \
220 }
221
222 #if (VMAC_NHBYTES >= 64) /* These versions do 64-bytes of message at a time */
223 #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
224 { int i; uint64_t th, tl; \
225 rh = rl = 0; \
226 for (i = 0; i < nw; i+= 8) { \
227 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
228 ADD128(rh,rl,th,tl); \
229 MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
230 ADD128(rh,rl,th,tl); \
231 MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
232 ADD128(rh,rl,th,tl); \
233 MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
234 ADD128(rh,rl,th,tl); \
235 } \
236 }
237 #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh1, rl1) \
238 { int i; uint64_t th, tl; \
239 rh1 = rl1 = rh = rl = 0; \
240 for (i = 0; i < nw; i+= 8) { \
241 MUL64(th,tl,get64PE((mp)+i )+(kp)[i ],get64PE((mp)+i+1)+(kp)[i+1]);\
242 ADD128(rh,rl,th,tl); \
243 MUL64(th,tl,get64PE((mp)+i )+(kp)[i+2],get64PE((mp)+i+1)+(kp)[i+3]);\
244 ADD128(rh1,rl1,th,tl); \
245 MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+2],get64PE((mp)+i+3)+(kp)[i+3]);\
246 ADD128(rh,rl,th,tl); \
247 MUL64(th,tl,get64PE((mp)+i+2)+(kp)[i+4],get64PE((mp)+i+3)+(kp)[i+5]);\
248 ADD128(rh1,rl1,th,tl); \
249 MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+4],get64PE((mp)+i+5)+(kp)[i+5]);\
250 ADD128(rh,rl,th,tl); \
251 MUL64(th,tl,get64PE((mp)+i+4)+(kp)[i+6],get64PE((mp)+i+5)+(kp)[i+7]);\
252 ADD128(rh1,rl1,th,tl); \
253 MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+6],get64PE((mp)+i+7)+(kp)[i+7]);\
254 ADD128(rh,rl,th,tl); \
255 MUL64(th,tl,get64PE((mp)+i+6)+(kp)[i+8],get64PE((mp)+i+7)+(kp)[i+9]);\
256 ADD128(rh1,rl1,th,tl); \
257 } \
258 }
259 #endif
260
261 #define poly_step(ah, al, kh, kl, mh, ml) \
262 { uint64_t t1h, t1l, t2h, t2l, t3h, t3l, z=0; \
263 /* compute ab*cd, put bd into result registers */ \
264 PMUL64(t3h,t3l,al,kh); \
265 PMUL64(t2h,t2l,ah,kl); \
266 PMUL64(t1h,t1l,ah,2*kh); \
267 PMUL64(ah,al,al,kl); \
268 /* add 2 * ac to result */ \
269 ADD128(ah,al,t1h,t1l); \
270 /* add together ad + bc */ \
271 ADD128(t2h,t2l,t3h,t3l); \
272 /* now (ah,al), (t2l,2*t2h) need summing */ \
273 /* first add the high registers, carrying into t2h */ \
274 ADD128(t2h,ah,z,t2l); \
275 /* double t2h and add top bit of ah */ \
276 t2h = 2 * t2h + (ah >> 63); \
277 ah &= m63; \
278 /* now add the low registers */ \
279 ADD128(ah,al,mh,ml); \
280 ADD128(ah,al,z,t2h); \
281 }
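/* What poly_step computes, in conventional notation (illustration only):
 * with a = ah*2^64 + al, k = kh*2^64 + kl and m = mh*2^64 + ml, one step is
 *     a  <-  a*k + m   (mod 2^127 - 1, reduced only lazily).
 * The partial product ah*kh carries weight 2^128, and since
 * 2^128 = 2 (mod 2^127 - 1) it folds back in doubled -- hence the 2*kh in
 * the t1 product and the doubling of t2h, the high word of ah*kl + al*kh.
 * Bit 127 of the accumulator folds back with weight 1 (2^127 = 1), which is
 * the (ah >> 63) term.  Full reduction is deferred to l3hash(). */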
282
283 /* ----------------------------------------------------------------------- */
284 #elif VMAC_USE_SSE2
285 /* ----------------------------------------------------------------------- */
286
287 // macros from Crypto++ for sharing inline assembly code between MSVC and GNU C
288 #if defined(__GNUC__)
289 // define these in two steps to allow arguments to be expanded
290 #define GNU_AS2(x, y) #x ", " #y ";"
291 #define GNU_AS3(x, y, z) #x ", " #y ", " #z ";"
292 #define GNU_ASL(x) "\n" #x ":"
293 #define GNU_ASJ(x, y, z) #x " " #y #z ";"
294 #define AS2(x, y) GNU_AS2(x, y)
295 #define AS3(x, y, z) GNU_AS3(x, y, z)
296 #define ASS(x, y, a, b, c, d) #x ", " #y ", " #a "*64+" #b "*16+" #c "*4+" #d ";"
297 #define ASL(x) GNU_ASL(x)
298 #define ASJ(x, y, z) GNU_ASJ(x, y, z)
299 #else
300 #define AS2(x, y) __asm {x, y}
301 #define AS3(x, y, z) __asm {x, y, z}
302 #define ASS(x, y, a, b, c, d) __asm {x, y, _MM_SHUFFLE(a, b, c, d)}
303 #define ASL(x) __asm {label##x:}
304 #define ASJ(x, y, z) __asm {x label##y}
305 #endif
306
307 static void NOINLINE nh_16_func(const uint64_t *mp, const uint64_t *kp, size_t nw, uint64_t *rh, uint64_t *rl)
308 {
309 // This assembly version, using MMX registers, is just as fast as the
310 // intrinsics version (which uses XMM registers) on the Intel Core 2,
311 // but is much faster on the Pentium 4. In order to schedule multiplies
312 // as early as possible, the loop interleaves operations for the current
313 // block and the next block. To mask out high 32-bits, we use "movd"
314 // to move the lower 32-bits to the stack and then back. Surprisingly,
315 // this is faster than any other method.
316 #ifdef __GNUC__
317 __asm__ __volatile__
318 (
319 ".intel_syntax noprefix;"
320 #else
321 AS2( mov esi, mp)
322 AS2( mov edi, kp)
323 AS2( mov ecx, nw)
324 AS2( mov eax, rl)
325 AS2( mov edx, rh)
326 #endif
327 AS2( sub esp, 12)
328 AS2( movq mm6, [esi])
329 AS2( paddq mm6, [edi])
330 AS2( movq mm5, [esi+8])
331 AS2( paddq mm5, [edi+8])
332 AS2( add esi, 16)
333 AS2( add edi, 16)
334 AS2( movq mm4, mm6)
335 ASS( pshufw mm2, mm6, 1, 0, 3, 2)
336 AS2( pmuludq mm6, mm5)
337 ASS( pshufw mm3, mm5, 1, 0, 3, 2)
338 AS2( pmuludq mm5, mm2)
339 AS2( pmuludq mm2, mm3)
340 AS2( pmuludq mm3, mm4)
341 AS2( pxor mm7, mm7)
342 AS2( movd [esp], mm6)
343 AS2( psrlq mm6, 32)
344 AS2( movd [esp+4], mm5)
345 AS2( psrlq mm5, 32)
346 AS2( sub ecx, 2)
347 ASJ( jz, 1, f)
348 ASL(0)
349 AS2( movq mm0, [esi])
350 AS2( paddq mm0, [edi])
351 AS2( movq mm1, [esi+8])
352 AS2( paddq mm1, [edi+8])
353 AS2( add esi, 16)
354 AS2( add edi, 16)
355 AS2( movq mm4, mm0)
356 AS2( paddq mm5, mm2)
357 ASS( pshufw mm2, mm0, 1, 0, 3, 2)
358 AS2( pmuludq mm0, mm1)
359 AS2( movd [esp+8], mm3)
360 AS2( psrlq mm3, 32)
361 AS2( paddq mm5, mm3)
362 ASS( pshufw mm3, mm1, 1, 0, 3, 2)
363 AS2( pmuludq mm1, mm2)
364 AS2( pmuludq mm2, mm3)
365 AS2( pmuludq mm3, mm4)
366 AS2( movd mm4, [esp])
367 AS2( paddq mm7, mm4)
368 AS2( movd mm4, [esp+4])
369 AS2( paddq mm6, mm4)
370 AS2( movd mm4, [esp+8])
371 AS2( paddq mm6, mm4)
372 AS2( movd [esp], mm0)
373 AS2( psrlq mm0, 32)
374 AS2( paddq mm6, mm0)
375 AS2( movd [esp+4], mm1)
376 AS2( psrlq mm1, 32)
377 AS2( paddq mm5, mm1)
378 AS2( sub ecx, 2)
379 ASJ( jnz, 0, b)
380 ASL(1)
381 AS2( paddq mm5, mm2)
382 AS2( movd [esp+8], mm3)
383 AS2( psrlq mm3, 32)
384 AS2( paddq mm5, mm3)
385 AS2( movd mm4, [esp])
386 AS2( paddq mm7, mm4)
387 AS2( movd mm4, [esp+4])
388 AS2( paddq mm6, mm4)
389 AS2( movd mm4, [esp+8])
390 AS2( paddq mm6, mm4)
391
392 ASS( pshufw mm0, mm7, 3, 2, 1, 0)
393 AS2( psrlq mm7, 32)
394 AS2( paddq mm6, mm7)
395 AS2( punpckldq mm0, mm6)
396 AS2( psrlq mm6, 32)
397 AS2( paddq mm5, mm6)
398 AS2( movq [eax], mm0)
399 AS2( movq [edx], mm5)
400 AS2( add esp, 12)
401 #ifdef __GNUC__
402 ".att_syntax prefix;"
403 :
404 : "S" (mp), "D" (kp), "c" (nw), "a" (rl), "d" (rh)
405 : "memory", "cc"
406 );
407 #endif
408 }
409 #define nh_16(mp, kp, nw, rh, rl) nh_16_func(mp, kp, nw, &(rh), &(rl));
410
411 static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
412 const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
413 {
414 // This code tries to schedule the multiplies as early as possible to overcome
415 // the long latencies on the Pentium 4. It also minimizes "movq" instructions
416 // which are very expensive on the P4.
417
418 #define a0 [eax+0]
419 #define a1 [eax+4]
420 #define a2 [ebx+0]
421 #define a3 [ebx+4]
422 #define k0 [ecx+0]
423 #define k1 [ecx+4]
424 #define k2 [edx+0]
425 #define k3 [edx+4]
426
427 #ifdef __GNUC__
428 uint32_t temp;
429 __asm__ __volatile__
430 (
431 "mov %%ebx, %0;"
432 "mov %1, %%ebx;"
433 ".intel_syntax noprefix;"
434 #else
435 AS2( mov ebx, ahi)
436 AS2( mov edx, kh)
437 AS2( mov eax, alo)
438 AS2( mov ecx, kl)
439 AS2( mov esi, mh)
440 AS2( mov edi, ml)
441 #endif
442
443 AS2( movd mm0, a3)
444 AS2( movq mm4, mm0)
445 AS2( pmuludq mm0, k3) // a3*k3
446 AS2( movd mm1, a0)
447 AS2( pmuludq mm1, k2) // a0*k2
448 AS2( movd mm2, a1)
449 AS2( movd mm6, k1)
450 AS2( pmuludq mm2, mm6) // a1*k1
451 AS2( movd mm3, a2)
452 AS2( movq mm5, mm3)
453 AS2( movd mm7, k0)
454 AS2( pmuludq mm3, mm7) // a2*k0
455 AS2( pmuludq mm4, mm7) // a3*k0
456 AS2( pmuludq mm5, mm6) // a2*k1
457 AS2( psllq mm0, 1)
458 AS2( paddq mm0, [esi])
459 AS2( paddq mm0, mm1)
460 AS2( movd mm1, a1)
461 AS2( paddq mm4, mm5)
462 AS2( movq mm5, mm1)
463 AS2( pmuludq mm1, k2) // a1*k2
464 AS2( paddq mm0, mm2)
465 AS2( movd mm2, a0)
466 AS2( paddq mm0, mm3)
467 AS2( movq mm3, mm2)
468 AS2( pmuludq mm2, k3) // a0*k3
469 AS2( pmuludq mm3, mm7) // a0*k0
470 AS2( movd esi, mm0)
471 AS2( psrlq mm0, 32)
472 AS2( pmuludq mm7, mm5) // a1*k0
473 AS2( pmuludq mm5, k3) // a1*k3
474 AS2( paddq mm0, mm1)
475 AS2( movd mm1, a2)
476 AS2( pmuludq mm1, k2) // a2*k2
477 AS2( paddq mm0, mm2)
478 AS2( paddq mm0, mm4)
479 AS2( movq mm4, mm0)
480 AS2( movd mm2, a3)
481 AS2( pmuludq mm2, mm6) // a3*k1
482 AS2( pmuludq mm6, a0) // a0*k1
483 AS2( psrlq mm0, 31)
484 AS2( paddq mm0, mm3)
485 AS2( movd mm3, [edi])
486 AS2( paddq mm0, mm3)
487 AS2( movd mm3, a2)
488 AS2( pmuludq mm3, k3) // a2*k3
489 AS2( paddq mm5, mm1)
490 AS2( movd mm1, a3)
491 AS2( pmuludq mm1, k2) // a3*k2
492 AS2( paddq mm5, mm2)
493 AS2( movd mm2, [edi+4])
494 AS2( psllq mm5, 1)
495 AS2( paddq mm0, mm5)
496 AS2( movq mm5, mm0)
497 AS2( psllq mm4, 33)
498 AS2( psrlq mm0, 32)
499 AS2( paddq mm6, mm7)
500 AS2( movd mm7, esi)
501 AS2( paddq mm0, mm6)
502 AS2( paddq mm0, mm2)
503 AS2( paddq mm3, mm1)
504 AS2( psllq mm3, 1)
505 AS2( paddq mm0, mm3)
506 AS2( psrlq mm4, 1)
507 AS2( punpckldq mm5, mm0)
508 AS2( psrlq mm0, 32)
509 AS2( por mm4, mm7)
510 AS2( paddq mm0, mm4)
511 AS2( movq a0, mm5)
512 AS2( movq a2, mm0)
513 #ifdef __GNUC__
514 ".att_syntax prefix;"
515 "mov %0, %%ebx;"
516 : "=m" (temp)
517 : "m" (ahi), "D" (ml), "d" (kh), "a" (alo), "S" (mh), "c" (kl)
518 : "memory", "cc"
519 );
520 #endif
521
522
523 #undef a0
524 #undef a1
525 #undef a2
526 #undef a3
527 #undef k0
528 #undef k1
529 #undef k2
530 #undef k3
531 }
532
533 #define poly_step(ah, al, kh, kl, mh, ml) \
534 poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
535
536 /* ----------------------------------------------------------------------- */
537 #else /* not VMAC_ARCH_64 and not SSE2 */
538 /* ----------------------------------------------------------------------- */
539
540 #ifndef nh_16
541 #define nh_16(mp, kp, nw, rh, rl) \
542 { uint64_t t1,t2,m1,m2,t; \
543 int i; \
544 rh = rl = t = 0; \
545 for (i = 0; i < nw; i+=2) { \
546 t1 = get64PE(mp+i) + kp[i]; \
547 t2 = get64PE(mp+i+1) + kp[i+1]; \
548 m2 = MUL32(t1 >> 32, t2); \
549 m1 = MUL32(t1, t2 >> 32); \
550 ADD128(rh,rl,MUL32(t1 >> 32,t2 >> 32),MUL32(t1,t2)); \
551 rh += (uint64_t)(uint32_t)(m1 >> 32) + (uint32_t)(m2 >> 32); \
552 t += (uint64_t)(uint32_t)m1 + (uint32_t)m2; \
553 } \
554 ADD128(rh,rl,(t >> 32),(t << 32)); \
555 }
556 #endif
557
558 static void poly_step_func(uint64_t *ahi, uint64_t *alo, const uint64_t *kh,
559 const uint64_t *kl, const uint64_t *mh, const uint64_t *ml)
560 {
561
562 #if VMAC_ARCH_BIG_ENDIAN
563 #define INDEX_HIGH 0
564 #define INDEX_LOW 1
565 #else
566 #define INDEX_HIGH 1
567 #define INDEX_LOW 0
568 #endif
569
570 #define a0 *(((uint32_t*)alo)+INDEX_LOW)
571 #define a1 *(((uint32_t*)alo)+INDEX_HIGH)
572 #define a2 *(((uint32_t*)ahi)+INDEX_LOW)
573 #define a3 *(((uint32_t*)ahi)+INDEX_HIGH)
574 #define k0 *(((uint32_t*)kl)+INDEX_LOW)
575 #define k1 *(((uint32_t*)kl)+INDEX_HIGH)
576 #define k2 *(((uint32_t*)kh)+INDEX_LOW)
577 #define k3 *(((uint32_t*)kh)+INDEX_HIGH)
578
579 uint64_t p, q, t;
580 uint32_t t2;
581
582 p = MUL32(a3, k3);
583 p += p;
584 p += *(uint64_t *)mh;
585 p += MUL32(a0, k2);
586 p += MUL32(a1, k1);
587 p += MUL32(a2, k0);
588 t = (uint32_t)(p);
589 p >>= 32;
590 p += MUL32(a0, k3);
591 p += MUL32(a1, k2);
592 p += MUL32(a2, k1);
593 p += MUL32(a3, k0);
594 t |= ((uint64_t)((uint32_t)p & 0x7fffffff)) << 32;
595 p >>= 31;
596 p += (uint64_t)(((uint32_t*)ml)[INDEX_LOW]);
597 p += MUL32(a0, k0);
598 q = MUL32(a1, k3);
599 q += MUL32(a2, k2);
600 q += MUL32(a3, k1);
601 q += q;
602 p += q;
603 t2 = (uint32_t)(p);
604 p >>= 32;
605 p += (uint64_t)(((uint32_t*)ml)[INDEX_HIGH]);
606 p += MUL32(a0, k1);
607 p += MUL32(a1, k0);
608 q = MUL32(a2, k3);
609 q += MUL32(a3, k2);
610 q += q;
611 p += q;
612 *(uint64_t *)(alo) = (p << 32) | t2;
613 p >>= 32;
614 *(uint64_t *)(ahi) = p + t;
615
616 #undef a0
617 #undef a1
618 #undef a2
619 #undef a3
620 #undef k0
621 #undef k1
622 #undef k2
623 #undef k3
624 }
625
626 #define poly_step(ah, al, kh, kl, mh, ml) \
627 poly_step_func(&(ah), &(al), &(kh), &(kl), &(mh), &(ml))
628
629 /* ----------------------------------------------------------------------- */
630 #endif /* end of specialized NH and poly definitions */
631 /* ----------------------------------------------------------------------- */
632
633 /* At least nh_16 is defined. Define others as needed here */
634 #ifndef nh_16_2
635 #define nh_16_2(mp, kp, nw, rh, rl, rh2, rl2) \
636 nh_16(mp, kp, nw, rh, rl); \
637 nh_16(mp, ((kp)+2), nw, rh2, rl2);
638 #endif
639 #ifndef nh_vmac_nhbytes
640 #define nh_vmac_nhbytes(mp, kp, nw, rh, rl) \
641 nh_16(mp, kp, nw, rh, rl)
642 #endif
643 #ifndef nh_vmac_nhbytes_2
644 #define nh_vmac_nhbytes_2(mp, kp, nw, rh, rl, rh2, rl2) \
645 nh_vmac_nhbytes(mp, kp, nw, rh, rl); \
646 nh_vmac_nhbytes(mp, ((kp)+2), nw, rh2, rl2);
647 #endif
648
649 /* ----------------------------------------------------------------------- */
650
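/* Reset the running polynomial accumulator back to the polynomial key so
 * that the next message starts from a clean state; vhash() calls this once
 * the accumulated state has been consumed. */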
651 static void vhash_abort(vmac_ctx_t *ctx)
652 {
653 ctx->polytmp[0] = ctx->polykey[0] ;
654 ctx->polytmp[1] = ctx->polykey[1] ;
655 #if (VMAC_TAG_LEN == 128)
656 ctx->polytmp[2] = ctx->polykey[2] ;
657 ctx->polytmp[3] = ctx->polykey[3] ;
658 #endif
659 ctx->first_block_processed = 0;
660 }
661
662 /* ----------------------------------------------------------------------- */
663 static uint64_t l3hash(uint64_t p1, uint64_t p2,
664 uint64_t k1, uint64_t k2, uint64_t len)
665 {
666 uint64_t rh, rl, t, z=0;
667
668 /* fully reduce (p1,p2)+(len,0) mod p127 */
669 t = p1 >> 63;
670 p1 &= m63;
671 ADD128(p1, p2, len, t);
672 /* At this point, (p1,p2) is at most 2^127+(len<<64) */
673 t = (p1 > m63) + ((p1 == m63) && (p2 == m64));
674 ADD128(p1, p2, z, t);
675 p1 &= m63;
676
677 /* compute (p1,p2)/(2^64-2^32) and (p1,p2)%(2^64-2^32) */
678 t = p1 + (p2 >> 32);
679 t += (t >> 32);
680 t += (uint32_t)t > 0xfffffffeu;
681 p1 += (t >> 32);
682 p2 += (p1 << 32);
683
684 /* compute (p1+k1)%p64 and (p2+k2)%p64 */
685 p1 += k1;
686 p1 += (0 - (p1 < k1)) & 257;
687 p2 += k2;
688 p2 += (0 - (p2 < k2)) & 257;
689
690 /* compute (p1+k1)*(p2+k2)%p64 */
691 MUL64(rh, rl, p1, p2);
692 t = rh >> 56;
693 ADD128(t, rl, z, rh);
694 rh <<= 8;
695 ADD128(t, rl, z, rh);
696 t += t << 8;
697 rl += t;
698 rl += (0 - (rl < t)) & 257;
699 rl += (0 - (rl > p64-1)) & 257;
700 return rl;
701 }
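/* Illustration only, up to the lazy modular corrections above: l3hash is
 * the VHASH output stage.  After adding the encoded length of the final
 * partial block and fully reducing modulo p127 = 2^127 - 1, the result is
 * written in radix 2^64 - 2^32 as  p = d1*(2^64 - 2^32) + d0,  and the
 * value returned is
 *     ((d1 + k1) mod p64) * ((d0 + k2) mod p64)  mod p64,
 * which is strictly below p64 = 2^64 - 257 and so fits the 64-bit tag word.
 */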
702
703 /* ----------------------------------------------------------------------- */
704
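/* Hash mbytes of message (a positive multiple of VMAC_NHBYTES) into the
 * running polynomial accumulator in ctx->polytmp.  The trailing partial
 * block, the L3 output stage and the state reset are handled later by
 * vhash()/vmac(). */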
705 void vhash_update(unsigned char *m,
706 unsigned int mbytes, /* Positive multiple of VMAC_NHBYTES */
707 vmac_ctx_t *ctx)
708 {
709 uint64_t rh, rl, *mptr;
710 const uint64_t *kptr = (uint64_t *)ctx->nhkey;
711 int i;
712 uint64_t ch, cl;
713 uint64_t pkh = ctx->polykey[0];
714 uint64_t pkl = ctx->polykey[1];
715 #if (VMAC_TAG_LEN == 128)
716 uint64_t ch2, cl2, rh2, rl2;
717 uint64_t pkh2 = ctx->polykey[2];
718 uint64_t pkl2 = ctx->polykey[3];
719 #endif
720
721 mptr = (uint64_t *)m;
722 i = mbytes / VMAC_NHBYTES; /* Must be non-zero */
723
724 ch = ctx->polytmp[0];
725 cl = ctx->polytmp[1];
726 #if (VMAC_TAG_LEN == 128)
727 ch2 = ctx->polytmp[2];
728 cl2 = ctx->polytmp[3];
729 #endif
730
731 if ( ! ctx->first_block_processed) {
732 ctx->first_block_processed = 1;
733 #if (VMAC_TAG_LEN == 64)
734 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
735 #else
736 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
737 rh2 &= m62;
738 ADD128(ch2,cl2,rh2,rl2);
739 #endif
740 rh &= m62;
741 ADD128(ch,cl,rh,rl);
742 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
743 i--;
744 }
745
746 while (i--) {
747 #if (VMAC_TAG_LEN == 64)
748 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
749 #else
750 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
751 rh2 &= m62;
752 poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
753 #endif
754 rh &= m62;
755 poly_step(ch,cl,pkh,pkl,rh,rl);
756 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
757 }
758
759 ctx->polytmp[0] = ch;
760 ctx->polytmp[1] = cl;
761 #if (VMAC_TAG_LEN == 128)
762 ctx->polytmp[2] = ch2;
763 ctx->polytmp[3] = cl2;
764 #endif
765 #if VMAC_USE_SSE2
766 _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
767 #endif
768 }
769
770 /* ----------------------------------------------------------------------- */
771
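/* Hash the remaining message (any length, including zero), folding in any
 * state accumulated by earlier vhash_update() calls, apply the L3 output
 * stage, and reset the context for the next message.  With 128-bit tags the
 * second hash word is returned through *tagl. */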
772 uint64_t vhash(unsigned char m[],
773 unsigned int mbytes,
774 uint64_t *tagl,
775 vmac_ctx_t *ctx)
776 {
777 uint64_t rh, rl, *mptr;
778 const uint64_t *kptr = (uint64_t *)ctx->nhkey;
779 int i, remaining;
780 uint64_t ch, cl;
781 uint64_t pkh = ctx->polykey[0];
782 uint64_t pkl = ctx->polykey[1];
783 #if (VMAC_TAG_LEN == 128)
784 uint64_t ch2, cl2, rh2, rl2;
785 uint64_t pkh2 = ctx->polykey[2];
786 uint64_t pkl2 = ctx->polykey[3];
787 #endif
788
789 mptr = (uint64_t *)m;
790 i = mbytes / VMAC_NHBYTES;
791 remaining = mbytes % VMAC_NHBYTES;
792
793 if (ctx->first_block_processed)
794 {
795 ch = ctx->polytmp[0];
796 cl = ctx->polytmp[1];
797 #if (VMAC_TAG_LEN == 128)
798 ch2 = ctx->polytmp[2];
799 cl2 = ctx->polytmp[3];
800 #endif
801 }
802 else if (i)
803 {
804 #if (VMAC_TAG_LEN == 64)
805 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,ch,cl);
806 #else
807 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,ch,cl,ch2,cl2);
808 ch2 &= m62;
809 ADD128(ch2,cl2,pkh2,pkl2);
810 #endif
811 ch &= m62;
812 ADD128(ch,cl,pkh,pkl);
813 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
814 i--;
815 }
816 else if (remaining)
817 {
818 #if (VMAC_TAG_LEN == 64)
819 nh_16(mptr,kptr,2*((remaining+15)/16),ch,cl);
820 #else
821 nh_16_2(mptr,kptr,2*((remaining+15)/16),ch,cl,ch2,cl2);
822 ch2 &= m62;
823 ADD128(ch2,cl2,pkh2,pkl2);
824 #endif
825 ch &= m62;
826 ADD128(ch,cl,pkh,pkl);
827 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
828 goto do_l3;
829 }
830 else /* Empty String */
831 {
832 ch = pkh; cl = pkl;
833 #if (VMAC_TAG_LEN == 128)
834 ch2 = pkh2; cl2 = pkl2;
835 #endif
836 goto do_l3;
837 }
838
839 while (i--) {
840 #if (VMAC_TAG_LEN == 64)
841 nh_vmac_nhbytes(mptr,kptr,VMAC_NHBYTES/8,rh,rl);
842 #else
843 nh_vmac_nhbytes_2(mptr,kptr,VMAC_NHBYTES/8,rh,rl,rh2,rl2);
844 rh2 &= m62;
845 poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
846 #endif
847 rh &= m62;
848 poly_step(ch,cl,pkh,pkl,rh,rl);
849 mptr += (VMAC_NHBYTES/sizeof(uint64_t));
850 }
851 if (remaining) {
852 #if (VMAC_TAG_LEN == 64)
853 nh_16(mptr,kptr,2*((remaining+15)/16),rh,rl);
854 #else
855 nh_16_2(mptr,kptr,2*((remaining+15)/16),rh,rl,rh2,rl2);
856 rh2 &= m62;
857 poly_step(ch2,cl2,pkh2,pkl2,rh2,rl2);
858 #endif
859 rh &= m62;
860 poly_step(ch,cl,pkh,pkl,rh,rl);
861 }
862
863 do_l3:
864 #if VMAC_USE_SSE2
865 _mm_empty(); /* SSE2 version of poly_step uses mmx instructions */
866 #endif
867 vhash_abort(ctx);
868 remaining *= 8;
869 #if (VMAC_TAG_LEN == 128)
870 *tagl = l3hash(ch2, cl2, ctx->l3key[2], ctx->l3key[3],remaining);
871 #endif
872 return l3hash(ch, cl, ctx->l3key[0], ctx->l3key[1],remaining);
873 }
874
875 /* ----------------------------------------------------------------------- */
876
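/* Compute the VMAC tag for message m under nonce n: the VHASH output is
 * added word-wise (mod 2^64) to a pad obtained by AES-encrypting the
 * 16-byte nonce.  With 64-bit tags the low bit of the nonce selects which
 * half of the AES block serves as the pad, and the encryption may be
 * reused across a pair of nonces when VMAC_CACHE_NONCES is set. */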
877 uint64_t vmac(unsigned char m[],
878 unsigned int mbytes,
879 unsigned char n[16],
880 uint64_t *tagl,
881 vmac_ctx_t *ctx)
882 {
883 #if (VMAC_TAG_LEN == 64)
884 uint64_t *in_n, *out_p;
885 uint64_t p, h;
886 int i;
887
888 #if VMAC_CACHE_NONCES
889 in_n = ctx->cached_nonce;
890 out_p = ctx->cached_aes;
891 #else
892 uint64_t tmp[2];
893 in_n = out_p = tmp;
894 #endif
895
896 i = n[15] & 1;
897 #if VMAC_CACHE_NONCES
898 if ((*(uint64_t *)(n+8) != in_n[1]) ||
899 (*(uint64_t *)(n ) != in_n[0])) {
900 #endif
901
902 in_n[0] = *(uint64_t *)(n );
903 in_n[1] = *(uint64_t *)(n+8);
904 ((unsigned char *)in_n)[15] &= 0xFE;
905 aes_encryption(in_n, out_p, &ctx->cipher_key);
906
907 #if VMAC_CACHE_NONCES
908 ((unsigned char *)in_n)[15] |= (unsigned char)(1-i);
909 }
910 #endif
911 p = get64BE(out_p + i);
912 h = vhash(m, mbytes, (uint64_t *)0, ctx);
913 return p + h;
914 #else
915 uint64_t tmp[2];
916 uint64_t th,tl;
917 aes_encryption(n, (unsigned char *)tmp, &ctx->cipher_key);
918 th = vhash(m, mbytes, &tl, ctx);
919 th += get64BE(tmp);
920 *tagl = tl + get64BE(tmp+1);
921 return th;
922 #endif
923 }
924
925 /* ----------------------------------------------------------------------- */
926
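/* Expand the user key: the NH, polynomial and L3 subkeys are derived by
 * AES-encrypting fixed counter blocks (first byte 0x80, 0xC0 and 0xE0
 * respectively), the polynomial words are masked with mpoly, the L3 words
 * are rejection-sampled to lie below p64, and all running state is reset. */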
927 void vmac_set_key(unsigned char user_key[], vmac_ctx_t *ctx)
928 {
929 uint64_t in[2] = {0}, out[2];
930 unsigned i;
931 aes_key_setup(user_key, &ctx->cipher_key);
932
933 /* Fill nh key */
934 ((unsigned char *)in)[0] = 0x80;
935 for (i = 0; i < sizeof(ctx->nhkey)/8; i+=2) {
936 aes_encryption((unsigned char *)in, (unsigned char *)out,
937 &ctx->cipher_key);
938 ctx->nhkey[i ] = get64BE(out);
939 ctx->nhkey[i+1] = get64BE(out+1);
940 ((unsigned char *)in)[15] += 1;
941 }
942
943 /* Fill poly key */
944 ((unsigned char *)in)[0] = 0xC0;
945 in[1] = 0;
946 for (i = 0; i < sizeof(ctx->polykey)/8; i+=2) {
947 aes_encryption((unsigned char *)in, (unsigned char *)out,
948 &ctx->cipher_key);
949 ctx->polytmp[i ] = ctx->polykey[i ] = get64BE(out) & mpoly;
950 ctx->polytmp[i+1] = ctx->polykey[i+1] = get64BE(out+1) & mpoly;
951 ((unsigned char *)in)[15] += 1;
952 }
953
954 /* Fill ip key */
955 ((unsigned char *)in)[0] = 0xE0;
956 in[1] = 0;
957 for (i = 0; i < sizeof(ctx->l3key)/8; i+=2) {
958 do {
959 aes_encryption((unsigned char *)in, (unsigned char *)out,
960 &ctx->cipher_key);
961 ctx->l3key[i ] = get64BE(out);
962 ctx->l3key[i+1] = get64BE(out+1);
963 ((unsigned char *)in)[15] += 1;
964 } while (ctx->l3key[i] >= p64 || ctx->l3key[i+1] >= p64);
965 }
966
967 /* Invalidate nonce/aes cache and reset other elements */
968 #if (VMAC_TAG_LEN == 64) && (VMAC_CACHE_NONCES)
969 ctx->cached_nonce[0] = (uint64_t)-1; /* Ensure illegal nonce */
970 ctx->cached_nonce[1] = (uint64_t)0; /* Ensure illegal nonce */
971 #endif
972 ctx->first_block_processed = 0;
973 }
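/* Typical usage (illustration only; `msg' and `msg_len' stand for a
 * caller-supplied buffer and its length, and a nonce must never be reused
 * with the same key):
 *
 *   ALIGN(16) static vmac_ctx_t ctx;
 *   unsigned char key[] = "abcdefghijklmnop";   // 16-byte key, as in the
 *   unsigned char nonce[16] = {0};              // self-test below
 *   uint64_t tagl, tagh;
 *
 *   vmac_set_key(key, &ctx);
 *
 *   // All-in-one:
 *   tagh = vmac(msg, msg_len, nonce, &tagl, &ctx);
 *
 *   // Or incrementally: feed whole multiples of VMAC_NHBYTES (if any)
 *   // through vhash_update(), then hand the remainder to vmac():
 *   if (msg_len >= VMAC_NHBYTES)
 *       vhash_update(msg, (msg_len / VMAC_NHBYTES) * VMAC_NHBYTES, &ctx);
 *   tagh = vmac(msg + (msg_len / VMAC_NHBYTES) * VMAC_NHBYTES,
 *               msg_len % VMAC_NHBYTES, nonce, &tagl, &ctx);
 *
 * With VMAC_TAG_LEN == 64 the tag is the return value and tagl is unused;
 * with VMAC_TAG_LEN == 128 the tag is the pair (return value, *tagl). */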
974
975 /* ----------------------------------------------------------------------- */
976
977
978 #if VMAC_RUN_TESTS
979
980 #include <stdlib.h>
981 #include <stdio.h>
982 #include <time.h>
983 #include <string.h>
984
985 unsigned prime(void)  /* Wake variable speed cpu, get rough speed estimate */
986 {
987 volatile uint64_t i;
988 volatile uint64_t j=1;
989 unsigned cnt=0;
990 volatile clock_t ticks = clock();
991 do {
992 for (i = 0; i < 500000; i++) {
993 uint64_t x = get64PE(&j);
994 j = x * x + (uint64_t)ticks;
995 }
996 cnt++;
997 } while (clock() - ticks < (CLOCKS_PER_SEC/2));
998 return cnt; /* cnt is millions of iterations per second */
999 }
1000
1001 int main(void)
1002 {
1003 ALIGN(16) vmac_ctx_t ctx, ctx_aio, ctx_inc1, ctx_inc2;
1004 uint64_t res, tagl;
1005 void *p;
1006 unsigned char *m;
1007 ALIGN(4) unsigned char key[] = "abcdefghijklmnop";
1008 ALIGN(4) unsigned char nonce[] = "\0\0\0\0\0\0\0\0bcdefghi";
1009 unsigned int vector_lengths[] = {0,3,48,300,3000000};
1010 #if (VMAC_TAG_LEN == 64)
1011 ALIGN(4) char *should_be[] = {"2576BE1C56D8B81B","2D376CF5B1813CE5",
1012 "E8421F61D573D298","4492DF6C5CAC1BBE",
1013 "09BA597DD7601113"};
1014 #else
1015 ALIGN(4) char *should_be[] = {"472766C70F74ED23481D6D7DE4E80DAC",
1016 "4EE815A06A1D71EDD36FC75D51188A42",
1017 "09F2C80C8E1007A0C12FAE19FE4504AE",
1018 "66438817154850C61D8A412164803BCB",
1019 "2B6B02288FFC461B75485DE893C629DC"};
1020 #endif
1021 unsigned speed_lengths[] = {16, 32, 64, 128, 256, 512, 1024, 2048, 4096};
1022 unsigned i, j, *speed_iters;
1023 clock_t ticks;
1024 double cpb;
1025 const unsigned int buf_len = 3 * (1 << 20);
1026
1027 j = prime();
1028 i = sizeof(speed_lengths)/sizeof(speed_lengths[0]);
1029 speed_iters = (unsigned *)malloc(i*sizeof(speed_iters[0]));
1030 speed_iters[i-1] = j * (1 << 12);
1031 while (--i) speed_iters[i-1] = (unsigned)(1.3 * speed_iters[i]);
1032
1033 /* Initialize context and message buffer, all 16-byte aligned */
1034 p = malloc(buf_len + 32);
1035 m = (unsigned char *)(((size_t)p + 16) & ~((size_t)15));
1036 memset(m, 0, buf_len + 16);
1037 vmac_set_key(key, &ctx);
1038
1039 /* Test incremental and all-in-one interfaces for correctness */
1040 vmac_set_key(key, &ctx_aio);
1041 vmac_set_key(key, &ctx_inc1);
1042 vmac_set_key(key, &ctx_inc2);
1043
1044
1045 /*
1046 for (i = 0; i <= 512; i++) {
1047 vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
1048 tagh = vmac(m+(i/VMAC_NHBYTES)*VMAC_NHBYTES, i%VMAC_NHBYTES,
1049 nonce, &tagl, &ctx);
1050 vhash_update(m,(i/VMAC_NHBYTES)*VMAC_NHBYTES,&ctx_inc1);
1051 for (j = 0; j < vector_lengths[i]; j++)
1052 m[j] = (unsigned char)('a'+j%3);
1053
1054 }
1055 */
1056
1057 /* Generate vectors */
1058 for (i = 0; i < sizeof(vector_lengths)/sizeof(unsigned int); i++) {
1059 for (j = 0; j < vector_lengths[i]; j++)
1060 m[j] = (unsigned char)('a'+j%3);
1061 res = vmac(m, vector_lengths[i], nonce, &tagl, &ctx);
1062 #if (VMAC_TAG_LEN == 64)
1063 printf("\'abc\' * %7u: %016llX Should be: %s\n",
1064 vector_lengths[i]/3,res,should_be[i]);
1065 #else
1066 printf("\'abc\' * %7u: %016llX%016llX\nShould be : %s\n",
1067 vector_lengths[i]/3,res,tagl,should_be[i]);
1068 #endif
1069 }
1070
1071 /* Speed test */
1072 for (i = 0; i < sizeof(speed_lengths)/sizeof(unsigned int); i++) {
1073 ticks = clock();
1074 for (j = 0; j < speed_iters[i]; j++) {
1075 #if HASH_ONLY
1076 res = vhash(m, speed_lengths[i], &tagl, &ctx);
1077 #else
1078 res = vmac(m, speed_lengths[i], nonce, &tagl, &ctx);
1079 nonce[7]++;
1080 #endif
1081 }
1082 ticks = clock() - ticks;
1083 cpb = ((ticks*VMAC_HZ)/
1084 ((double)CLOCKS_PER_SEC*speed_lengths[i]*speed_iters[i]));
1085 printf("%4u bytes, %2.2f cpb\n", speed_lengths[i], cpb);
1086 }
1087 return 1;
1088 }
1089
1090 #endif
1091