// Copyright 2002-2016 The OpenSSL Project Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     https://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/* x86_64 BIGNUM accelerator version 0.1, December 2002.
 *
 * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
 * project.
 *
 * Rights for redistribution and usage in source and binary forms are
 * granted according to the License. Warranty of any kind is disclaimed.
 *
 * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
 *    versions, like 1.0...
 * A. Well, that's because this code is basically a quick-n-dirty
 *    proof-of-concept hack. As you can see, it's implemented with
 *    inline assembler, which means that you're bound to GCC and that
 *    there might be enough room for further improvement.
 *
 * Q. Why inline assembler?
 * A. x86_64 features its own ABI, which I'm not familiar with. This is
 *    why I decided to let the compiler take care of the subroutine
 *    prologue/epilogue as well as register allocation. For reference,
 *    Win64 implements a different ABI for AMD64 than Linux does.
 *
 * Q. How much faster does it get?
 * A. 'apps/openssl speed rsa dsa' output with no-asm:
 *
 *                          sign    verify    sign/s verify/s
 *    rsa  512 bits   0.0006s   0.0001s   1683.8  18456.2
 *    rsa 1024 bits   0.0028s   0.0002s    356.0   6407.0
 *    rsa 2048 bits   0.0172s   0.0005s     58.0   1957.8
 *    rsa 4096 bits   0.1155s   0.0018s      8.7    555.6
 *                          sign    verify    sign/s verify/s
 *    dsa  512 bits   0.0005s   0.0006s   2100.8   1768.3
 *    dsa 1024 bits   0.0014s   0.0018s    692.3    559.2
 *    dsa 2048 bits   0.0049s   0.0061s    204.7    165.0
 *
 *    'apps/openssl speed rsa dsa' output with this module:
 *
 *                          sign    verify    sign/s verify/s
 *    rsa  512 bits   0.0004s   0.0000s   2767.1  33297.9
 *    rsa 1024 bits   0.0012s   0.0001s    867.4  14674.7
 *    rsa 2048 bits   0.0061s   0.0002s    164.0   5270.0
 *    rsa 4096 bits   0.0384s   0.0006s     26.1   1650.8
 *                          sign    verify    sign/s verify/s
 *    dsa  512 bits   0.0002s   0.0003s   4442.2   3786.3
 *    dsa 1024 bits   0.0005s   0.0007s   1835.1   1497.4
 *    dsa 2048 bits   0.0016s   0.0020s    620.4    504.6
 *
 *    For reference, the IA-32 assembler implementation performs
 *    very much like 64-bit code compiled with no-asm on the same
 *    machine.
 */

#include <openssl/bn.h>

// TODO(davidben): Get this file working on MSVC x64.
#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
    (defined(__GNUC__) || defined(__clang__))

#include "../internal.h"


#undef mul
#undef mul_add

// "m"(a), "+m"(r) is the way to favor DirectPath µ-code.
#define mul_add(r, a, word, carry)                                         \
  do {                                                                     \
    BN_ULONG high, low;                                                    \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
    __asm__("addq %2,%0; adcq $0,%1"                                       \
            : "+r"(carry), "+d"(high)                                      \
            : "a"(low)                                                     \
            : "cc");                                                       \
    __asm__("addq %2,%0; adcq $0,%1"                                       \
            : "+m"(r), "+d"(high)                                          \
            : "r"(carry)                                                   \
            : "cc");                                                       \
    (carry) = high;                                                        \
  } while (0)

#define mul(r, a, word, carry)                                             \
  do {                                                                     \
    BN_ULONG high, low;                                                    \
    __asm__("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
    __asm__("addq %2,%0; adcq $0,%1"                                       \
            : "+r"(carry), "+d"(high)                                      \
            : "a"(low)                                                     \
            : "cc");                                                       \
    (r) = (carry);                                                         \
    (carry) = high;                                                        \
  } while (0)

// r0:r1:carry = r0:r1 + a^2 + carry:0
#define sqr_add(r0, r1, a, carry)                                 \
  do {                                                            \
    BN_ULONG high, low;                                           \
    /* lo:hi = a^2 */                                             \
    __asm__("mulq %2" : "=a"(low), "=d"(high) : "a"(a) : "cc");   \
    /* carry:hi = lo:hi + carry:0 = a^2 + carry */                \
    __asm__("addq %2,%0; adcq $0,%1"                              \
            : "+r"(carry), "+d"(high)                             \
            : "a"(low)                                            \
            : "cc");                                              \
    /* r0:r1:carry = carry:hi + r0:r1 */                          \
    __asm__("addq %2,%0; adcq %3,%1; movq $0, %2; adcq $0, %2"    \
            : "+m"(r0), "+m"(r1), "+r"(carry)                     \
            : "d"(high)                                           \
            : "cc");                                              \
  } while (0)
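// The macros above each pack several asm statements, which makes them hard
// to audit. The sketch below restates mul_add's contract portably. It is an
// editorial cross-check, not part of the original file: the ref_* name is
// illustrative, and it assumes the unsigned __int128 extension that GCC and
// Clang provide on x86_64 (the only compilers this file builds under).
static inline BN_ULONG ref_mul_add_sketch(BN_ULONG *r, BN_ULONG a, BN_ULONG w,
                                          BN_ULONG carry) {
  // a*w + carry + *r is at most (2^64-1)^2 + 2*(2^64-1) = 2^128 - 1, so the
  // 128-bit sum cannot overflow.
  unsigned __int128 t = (unsigned __int128)a * w + carry + *r;
  *r = (BN_ULONG)t;            // low word stored back, as mul_add does
  return (BN_ULONG)(t >> 64);  // high word becomes the carry out
}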
BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num == 0) {
    return c1;
  }

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul_add(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul_add(rp[2], ap[2], w, c1);
  }

  return c1;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
                      BN_ULONG w) {
  BN_ULONG c1 = 0;

  if (num == 0) {
    return c1;
  }

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }
  if (num) {
    mul(rp[0], ap[0], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[1], ap[1], w, c1);
    if (--num == 0) {
      return c1;
    }
    mul(rp[2], ap[2], w, c1);
  }
  return c1;
}

void bn_sqr_add_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
  if (n == 0) {
    return;
  }

  BN_ULONG carry = 0;
  while (n & ~3) {
    sqr_add(r[0], r[1], a[0], carry);
    sqr_add(r[2], r[3], a[1], carry);
    sqr_add(r[4], r[5], a[2], carry);
    sqr_add(r[6], r[7], a[3], carry);
    a += 4;
    r += 8;
    n -= 4;
  }
  if (n) {
    sqr_add(r[0], r[1], a[0], carry);
    if (--n == 0) {
      return;
    }
    sqr_add(r[2], r[3], a[1], carry);
    if (--n == 0) {
      return;
    }
    sqr_add(r[4], r[5], a[2], carry);
  }
}

BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n == 0) {
    return 0;
  }

  __asm__ volatile(
      "  subq  %0,%0         \n"  // clear carry
      "  jmp   1f            \n"
      ".p2align 4            \n"
      "1:"
      "  movq  (%4,%2,8),%0  \n"
      "  adcq  (%5,%2,8),%0  \n"
      "  movq  %0,(%3,%2,8)  \n"
      "  lea   1(%2),%2      \n"
      "  dec   %1            \n"
      "  jnz   1b            \n"
      "  sbbq  %0,%0         \n"
      : "=&r"(ret), "+&c"(n), "+&r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}

BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      size_t n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n == 0) {
    return 0;
  }

  __asm__ volatile(
      "  subq  %0,%0         \n"  // clear borrow
      "  jmp   1f            \n"
      ".p2align 4            \n"
      "1:"
      "  movq  (%4,%2,8),%0  \n"
      "  sbbq  (%5,%2,8),%0  \n"
      "  movq  %0,(%3,%2,8)  \n"
      "  lea   1(%2),%2      \n"
      "  dec   %1            \n"
      "  jnz   1b            \n"
      "  sbbq  %0,%0         \n"
      : "=&r"(ret), "+&c"(n), "+&r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}

// mul_add_c(a,b,c0,c1,c2)    -- c += a*b for the three-word number
//                               c=(c2,c1,c0)
// mul_add_c2(a,b,c0,c1,c2)   -- c += 2*a*b for the three-word number
//                               c=(c2,c1,c0)
// sqr_add_c(a,i,c0,c1,c2)    -- c += a[i]^2 for the three-word number
//                               c=(c2,c1,c0)
// sqr_add_c2(a,i,j,c0,c1,c2) -- c += 2*a[i]*a[j] for the three-word number
//                               c=(c2,c1,c0)

// Keep in mind that carrying into the high part of a multiplication result
// cannot overflow: the high word of a 64x64-bit product is at most 2^64 - 2
// (it is never all-ones), so it can always absorb the carry bit.
#define mul_add_c(a, b, c0, c1, c2)                                      \
  do {                                                                   \
    BN_ULONG t1, t2;                                                     \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");     \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2"                         \
            : "+&r"(c0), "+r"(c1), "+r"(c2)                              \
            : "r"(t1), "r"(t2)                                           \
            : "cc");                                                     \
  } while (0)

#define sqr_add_c(a, i, c0, c1, c2)                                      \
  do {                                                                   \
    BN_ULONG t1, t2;                                                     \
    __asm__("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc");        \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2"                         \
            : "+&r"(c0), "+r"(c1), "+r"(c2)                              \
            : "r"(t1), "r"(t2)                                           \
            : "cc");                                                     \
  } while (0)

#define mul_add_c2(a, b, c0, c1, c2)                                     \
  do {                                                                   \
    BN_ULONG t1, t2;                                                     \
    __asm__("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");     \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2"                         \
            : "+&r"(c0), "+r"(c1), "+r"(c2)                              \
            : "r"(t1), "r"(t2)                                           \
            : "cc");                                                     \
    __asm__("addq %3,%0; adcq %4,%1; adcq $0,%2"                         \
            : "+&r"(c0), "+r"(c1), "+r"(c2)                              \
            : "r"(t1), "r"(t2)                                           \
            : "cc");                                                     \
  } while (0)

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
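// As with mul_add, a portable restatement of mul_add_c for auditing (an
// editorial sketch, not part of the original file; the ref_* name is
// illustrative and unsigned __int128 is assumed as above). The triple
// (c2,c1,c0) acts as a 192-bit accumulator.
static inline void ref_mul_add_c_sketch(BN_ULONG a, BN_ULONG b, BN_ULONG *c0,
                                        BN_ULONG *c1, BN_ULONG *c2) {
  unsigned __int128 t = (unsigned __int128)a * b + *c0;
  *c0 = (BN_ULONG)t;
  t = (t >> 64) + *c1;  // product's high word plus the carry out of c0
  *c1 = (BN_ULONG)t;
  *c2 += (BN_ULONG)(t >> 64);  // at most 1, per the comment above
}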
void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
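// The comba routines above compute the product column by column: r[k]
// collects every a[i]*b[j] with i + j == k, in a three-word accumulator
// whose roles rotate among c1, c2 and c3. A row-by-row schoolbook
// equivalent, built from the word loops earlier in this file, makes a handy
// cross-check (an editorial sketch; the ref_* name is illustrative):
static inline void ref_mul_4x4_sketch(BN_ULONG r[8], const BN_ULONG a[4],
                                      const BN_ULONG b[4]) {
  // First row initializes r[0..3]; the returned carry word is r[4].
  r[4] = bn_mul_words(r, a, 4, b[0]);
  for (size_t i = 1; i < 4; i++) {
    // Each later row adds a*b[i] into r[i..i+3]; the returned carry word
    // becomes the next result word.
    r[4 + i] = bn_mul_add_words(r + i, a, 4, b[i]);
  }
}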
void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}

void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}

#undef mul_add
#undef mul
#undef sqr_add
#undef mul_add_c
#undef sqr_add_c
#undef mul_add_c2
#undef sqr_add_c2

#endif  // !NO_ASM && X86_64 && (__GNUC__ || __clang__)