1// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved. 2// 3// Licensed under the Apache License, Version 2.0 (the "License"); 4// you may not use this file except in compliance with the License. 5// You may obtain a copy of the License at 6// 7// https://www.apache.org/licenses/LICENSE-2.0 8// 9// Unless required by applicable law or agreed to in writing, software 10// distributed under the License is distributed on an "AS IS" BASIS, 11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12// See the License for the specific language governing permissions and 13// limitations under the License. 14 15#include <openssl/bn.h> 16 17#include <assert.h> 18 19#include "internal.h" 20 21 22#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86) 23// See asm/bn-586.pl. 24#define BN_ADD_ASM 25#define BN_MUL_ASM 26#endif 27 28#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \ 29 (defined(__GNUC__) || defined(__clang__)) 30// See asm/x86_64-gcc.c 31#define BN_ADD_ASM 32#define BN_MUL_ASM 33#endif 34 35#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64) 36// See asm/bn-armv8.pl. 37#define BN_ADD_ASM 38#endif 39 40#if !defined(BN_MUL_ASM) 41 42#ifdef BN_ULLONG 43#define mul_add(r, a, w, c) \ 44 do { \ 45 BN_ULLONG t; \ 46 t = (BN_ULLONG)(w) * (a) + (r) + (c); \ 47 (r) = (BN_ULONG)(t); \ 48 (c) = (BN_ULONG)((t) >> BN_BITS2); \ 49 } while (0) 50 51#define mul(r, a, w, c) \ 52 do { \ 53 BN_ULLONG t; \ 54 t = (BN_ULLONG)(w) * (a) + (c); \ 55 (r) = (BN_ULONG)(t); \ 56 (c) = (BN_ULONG)((t) >> BN_BITS2); \ 57 } while (0) 58 59#define sqr(r0, r1, a) \ 60 do { \ 61 BN_ULLONG t; \ 62 t = (BN_ULLONG)(a) * (a); \ 63 (r0) = (BN_ULONG)(t); \ 64 (r1) = (BN_ULONG)((t) >> BN_BITS2); \ 65 } while (0) 66 67#else 68 69#define mul_add(r, a, w, c) \ 70 do { \ 71 BN_ULONG high, low, ret, tmp = (a); \ 72 ret = (r); \ 73 BN_UMULT_LOHI(low, high, w, tmp); \ 74 ret += (c); \ 75 (c) = (ret < (c)) ? 1 : 0; \ 76 (c) += high; \ 77 ret += low; \ 78 (c) += (ret < low) ? 1 : 0; \ 79 (r) = ret; \ 80 } while (0) 81 82#define mul(r, a, w, c) \ 83 do { \ 84 BN_ULONG high, low, ret, ta = (a); \ 85 BN_UMULT_LOHI(low, high, w, ta); \ 86 ret = low + (c); \ 87 (c) = high; \ 88 (c) += (ret < low) ? 1 : 0; \ 89 (r) = ret; \ 90 } while (0) 91 92#define sqr(r0, r1, a) \ 93 do { \ 94 BN_ULONG tmp = (a); \ 95 BN_UMULT_LOHI(r0, r1, tmp, tmp); \ 96 } while (0) 97 98#endif // !BN_ULLONG 99 100BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, 101 BN_ULONG w) { 102 BN_ULONG c1 = 0; 103 104 if (num == 0) { 105 return c1; 106 } 107 108 while (num & ~3) { 109 mul_add(rp[0], ap[0], w, c1); 110 mul_add(rp[1], ap[1], w, c1); 111 mul_add(rp[2], ap[2], w, c1); 112 mul_add(rp[3], ap[3], w, c1); 113 ap += 4; 114 rp += 4; 115 num -= 4; 116 } 117 118 while (num) { 119 mul_add(rp[0], ap[0], w, c1); 120 ap++; 121 rp++; 122 num--; 123 } 124 125 return c1; 126} 127 128BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num, 129 BN_ULONG w) { 130 BN_ULONG c1 = 0; 131 132 if (num == 0) { 133 return c1; 134 } 135 136 while (num & ~3) { 137 mul(rp[0], ap[0], w, c1); 138 mul(rp[1], ap[1], w, c1); 139 mul(rp[2], ap[2], w, c1); 140 mul(rp[3], ap[3], w, c1); 141 ap += 4; 142 rp += 4; 143 num -= 4; 144 } 145 while (num) { 146 mul(rp[0], ap[0], w, c1); 147 ap++; 148 rp++; 149 num--; 150 } 151 return c1; 152} 153 154void bn_sqr_add_words(BN_ULONG *r, const BN_ULONG *a, size_t n) { 155 if (n == 0) { 156 return; 157 } 158 159 BN_ULONG carry = 0, lo, hi; 160 while (n & ~3) { 161 sqr(lo, hi, a[0]); 162 r[0] = CRYPTO_addc_w(r[0], lo, carry, &carry); 163 r[1] = CRYPTO_addc_w(r[1], hi, carry, &carry); 164 sqr(lo, hi, a[1]); 165 r[2] = CRYPTO_addc_w(r[2], lo, carry, &carry); 166 r[3] = CRYPTO_addc_w(r[3], hi, carry, &carry); 167 sqr(lo, hi, a[2]); 168 r[4] = CRYPTO_addc_w(r[4], lo, carry, &carry); 169 r[5] = CRYPTO_addc_w(r[5], hi, carry, &carry); 170 sqr(lo, hi, a[3]); 171 r[6] = CRYPTO_addc_w(r[6], lo, carry, &carry); 172 r[7] = CRYPTO_addc_w(r[7], hi, carry, &carry); 173 a += 4; 174 r += 8; 175 n -= 4; 176 } 177 while (n) { 178 sqr(lo, hi, a[0]); 179 r[0] = CRYPTO_addc_w(r[0], lo, carry, &carry); 180 r[1] = CRYPTO_addc_w(r[1], hi, carry, &carry); 181 a++; 182 r += 2; 183 n--; 184 } 185} 186 187// mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) 188// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) 189// sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) 190// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0) 191 192#ifdef BN_ULLONG 193 194// Keep in mind that additions to multiplication result can not overflow, 195// because its high half cannot be all-ones. 196#define mul_add_c(a, b, c0, c1, c2) \ 197 do { \ 198 BN_ULONG hi; \ 199 BN_ULLONG t = (BN_ULLONG)(a) * (b); \ 200 t += (c0); /* no carry */ \ 201 (c0) = (BN_ULONG)(t); \ 202 hi = (BN_ULONG)((t) >> BN_BITS2); \ 203 (c1) += (hi); \ 204 (c2) += (c1) < hi; \ 205 } while (0) 206 207#define mul_add_c2(a, b, c0, c1, c2) \ 208 do { \ 209 BN_ULONG hi; \ 210 BN_ULLONG t = (BN_ULLONG)(a) * (b); \ 211 BN_ULLONG tt = t + (c0); /* no carry */ \ 212 (c0) = (BN_ULONG)(tt); \ 213 hi = (BN_ULONG)((tt) >> BN_BITS2); \ 214 (c1) += hi; \ 215 (c2) += (c1) < hi; \ 216 t += (c0); /* no carry */ \ 217 (c0) = (BN_ULONG)(t); \ 218 hi = (BN_ULONG)((t) >> BN_BITS2); \ 219 (c1) += hi; \ 220 (c2) += (c1) < hi; \ 221 } while (0) 222 223#define sqr_add_c(a, i, c0, c1, c2) \ 224 do { \ 225 BN_ULONG hi; \ 226 BN_ULLONG t = (BN_ULLONG)(a)[i] * (a)[i]; \ 227 t += (c0); /* no carry */ \ 228 (c0) = (BN_ULONG)(t); \ 229 hi = (BN_ULONG)((t) >> BN_BITS2); \ 230 (c1) += hi; \ 231 (c2) += (c1) < hi; \ 232 } while (0) 233 234#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) 235 236#else 237 238// Keep in mind that additions to hi can not overflow, because the high word of 239// a multiplication result cannot be all-ones. 240#define mul_add_c(a, b, c0, c1, c2) \ 241 do { \ 242 BN_ULONG ta = (a), tb = (b); \ 243 BN_ULONG lo, hi; \ 244 BN_UMULT_LOHI(lo, hi, ta, tb); \ 245 (c0) += lo; \ 246 hi += ((c0) < lo) ? 1 : 0; \ 247 (c1) += hi; \ 248 (c2) += ((c1) < hi) ? 1 : 0; \ 249 } while (0) 250 251#define mul_add_c2(a, b, c0, c1, c2) \ 252 do { \ 253 BN_ULONG ta = (a), tb = (b); \ 254 BN_ULONG lo, hi, tt; \ 255 BN_UMULT_LOHI(lo, hi, ta, tb); \ 256 (c0) += lo; \ 257 tt = hi + (((c0) < lo) ? 1 : 0); \ 258 (c1) += tt; \ 259 (c2) += ((c1) < tt) ? 1 : 0; \ 260 (c0) += lo; \ 261 hi += (c0 < lo) ? 1 : 0; \ 262 (c1) += hi; \ 263 (c2) += ((c1) < hi) ? 1 : 0; \ 264 } while (0) 265 266#define sqr_add_c(a, i, c0, c1, c2) \ 267 do { \ 268 BN_ULONG ta = (a)[i]; \ 269 BN_ULONG lo, hi; \ 270 BN_UMULT_LOHI(lo, hi, ta, ta); \ 271 (c0) += lo; \ 272 hi += (c0 < lo) ? 1 : 0; \ 273 (c1) += hi; \ 274 (c2) += ((c1) < hi) ? 1 : 0; \ 275 } while (0) 276 277#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2) 278 279#endif // !BN_ULLONG 280 281void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) { 282 BN_ULONG c1, c2, c3; 283 284 c1 = 0; 285 c2 = 0; 286 c3 = 0; 287 mul_add_c(a[0], b[0], c1, c2, c3); 288 r[0] = c1; 289 c1 = 0; 290 mul_add_c(a[0], b[1], c2, c3, c1); 291 mul_add_c(a[1], b[0], c2, c3, c1); 292 r[1] = c2; 293 c2 = 0; 294 mul_add_c(a[2], b[0], c3, c1, c2); 295 mul_add_c(a[1], b[1], c3, c1, c2); 296 mul_add_c(a[0], b[2], c3, c1, c2); 297 r[2] = c3; 298 c3 = 0; 299 mul_add_c(a[0], b[3], c1, c2, c3); 300 mul_add_c(a[1], b[2], c1, c2, c3); 301 mul_add_c(a[2], b[1], c1, c2, c3); 302 mul_add_c(a[3], b[0], c1, c2, c3); 303 r[3] = c1; 304 c1 = 0; 305 mul_add_c(a[4], b[0], c2, c3, c1); 306 mul_add_c(a[3], b[1], c2, c3, c1); 307 mul_add_c(a[2], b[2], c2, c3, c1); 308 mul_add_c(a[1], b[3], c2, c3, c1); 309 mul_add_c(a[0], b[4], c2, c3, c1); 310 r[4] = c2; 311 c2 = 0; 312 mul_add_c(a[0], b[5], c3, c1, c2); 313 mul_add_c(a[1], b[4], c3, c1, c2); 314 mul_add_c(a[2], b[3], c3, c1, c2); 315 mul_add_c(a[3], b[2], c3, c1, c2); 316 mul_add_c(a[4], b[1], c3, c1, c2); 317 mul_add_c(a[5], b[0], c3, c1, c2); 318 r[5] = c3; 319 c3 = 0; 320 mul_add_c(a[6], b[0], c1, c2, c3); 321 mul_add_c(a[5], b[1], c1, c2, c3); 322 mul_add_c(a[4], b[2], c1, c2, c3); 323 mul_add_c(a[3], b[3], c1, c2, c3); 324 mul_add_c(a[2], b[4], c1, c2, c3); 325 mul_add_c(a[1], b[5], c1, c2, c3); 326 mul_add_c(a[0], b[6], c1, c2, c3); 327 r[6] = c1; 328 c1 = 0; 329 mul_add_c(a[0], b[7], c2, c3, c1); 330 mul_add_c(a[1], b[6], c2, c3, c1); 331 mul_add_c(a[2], b[5], c2, c3, c1); 332 mul_add_c(a[3], b[4], c2, c3, c1); 333 mul_add_c(a[4], b[3], c2, c3, c1); 334 mul_add_c(a[5], b[2], c2, c3, c1); 335 mul_add_c(a[6], b[1], c2, c3, c1); 336 mul_add_c(a[7], b[0], c2, c3, c1); 337 r[7] = c2; 338 c2 = 0; 339 mul_add_c(a[7], b[1], c3, c1, c2); 340 mul_add_c(a[6], b[2], c3, c1, c2); 341 mul_add_c(a[5], b[3], c3, c1, c2); 342 mul_add_c(a[4], b[4], c3, c1, c2); 343 mul_add_c(a[3], b[5], c3, c1, c2); 344 mul_add_c(a[2], b[6], c3, c1, c2); 345 mul_add_c(a[1], b[7], c3, c1, c2); 346 r[8] = c3; 347 c3 = 0; 348 mul_add_c(a[2], b[7], c1, c2, c3); 349 mul_add_c(a[3], b[6], c1, c2, c3); 350 mul_add_c(a[4], b[5], c1, c2, c3); 351 mul_add_c(a[5], b[4], c1, c2, c3); 352 mul_add_c(a[6], b[3], c1, c2, c3); 353 mul_add_c(a[7], b[2], c1, c2, c3); 354 r[9] = c1; 355 c1 = 0; 356 mul_add_c(a[7], b[3], c2, c3, c1); 357 mul_add_c(a[6], b[4], c2, c3, c1); 358 mul_add_c(a[5], b[5], c2, c3, c1); 359 mul_add_c(a[4], b[6], c2, c3, c1); 360 mul_add_c(a[3], b[7], c2, c3, c1); 361 r[10] = c2; 362 c2 = 0; 363 mul_add_c(a[4], b[7], c3, c1, c2); 364 mul_add_c(a[5], b[6], c3, c1, c2); 365 mul_add_c(a[6], b[5], c3, c1, c2); 366 mul_add_c(a[7], b[4], c3, c1, c2); 367 r[11] = c3; 368 c3 = 0; 369 mul_add_c(a[7], b[5], c1, c2, c3); 370 mul_add_c(a[6], b[6], c1, c2, c3); 371 mul_add_c(a[5], b[7], c1, c2, c3); 372 r[12] = c1; 373 c1 = 0; 374 mul_add_c(a[6], b[7], c2, c3, c1); 375 mul_add_c(a[7], b[6], c2, c3, c1); 376 r[13] = c2; 377 c2 = 0; 378 mul_add_c(a[7], b[7], c3, c1, c2); 379 r[14] = c3; 380 r[15] = c1; 381} 382 383void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) { 384 BN_ULONG c1, c2, c3; 385 386 c1 = 0; 387 c2 = 0; 388 c3 = 0; 389 mul_add_c(a[0], b[0], c1, c2, c3); 390 r[0] = c1; 391 c1 = 0; 392 mul_add_c(a[0], b[1], c2, c3, c1); 393 mul_add_c(a[1], b[0], c2, c3, c1); 394 r[1] = c2; 395 c2 = 0; 396 mul_add_c(a[2], b[0], c3, c1, c2); 397 mul_add_c(a[1], b[1], c3, c1, c2); 398 mul_add_c(a[0], b[2], c3, c1, c2); 399 r[2] = c3; 400 c3 = 0; 401 mul_add_c(a[0], b[3], c1, c2, c3); 402 mul_add_c(a[1], b[2], c1, c2, c3); 403 mul_add_c(a[2], b[1], c1, c2, c3); 404 mul_add_c(a[3], b[0], c1, c2, c3); 405 r[3] = c1; 406 c1 = 0; 407 mul_add_c(a[3], b[1], c2, c3, c1); 408 mul_add_c(a[2], b[2], c2, c3, c1); 409 mul_add_c(a[1], b[3], c2, c3, c1); 410 r[4] = c2; 411 c2 = 0; 412 mul_add_c(a[2], b[3], c3, c1, c2); 413 mul_add_c(a[3], b[2], c3, c1, c2); 414 r[5] = c3; 415 c3 = 0; 416 mul_add_c(a[3], b[3], c1, c2, c3); 417 r[6] = c1; 418 r[7] = c2; 419} 420 421void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) { 422 BN_ULONG c1, c2, c3; 423 424 c1 = 0; 425 c2 = 0; 426 c3 = 0; 427 sqr_add_c(a, 0, c1, c2, c3); 428 r[0] = c1; 429 c1 = 0; 430 sqr_add_c2(a, 1, 0, c2, c3, c1); 431 r[1] = c2; 432 c2 = 0; 433 sqr_add_c(a, 1, c3, c1, c2); 434 sqr_add_c2(a, 2, 0, c3, c1, c2); 435 r[2] = c3; 436 c3 = 0; 437 sqr_add_c2(a, 3, 0, c1, c2, c3); 438 sqr_add_c2(a, 2, 1, c1, c2, c3); 439 r[3] = c1; 440 c1 = 0; 441 sqr_add_c(a, 2, c2, c3, c1); 442 sqr_add_c2(a, 3, 1, c2, c3, c1); 443 sqr_add_c2(a, 4, 0, c2, c3, c1); 444 r[4] = c2; 445 c2 = 0; 446 sqr_add_c2(a, 5, 0, c3, c1, c2); 447 sqr_add_c2(a, 4, 1, c3, c1, c2); 448 sqr_add_c2(a, 3, 2, c3, c1, c2); 449 r[5] = c3; 450 c3 = 0; 451 sqr_add_c(a, 3, c1, c2, c3); 452 sqr_add_c2(a, 4, 2, c1, c2, c3); 453 sqr_add_c2(a, 5, 1, c1, c2, c3); 454 sqr_add_c2(a, 6, 0, c1, c2, c3); 455 r[6] = c1; 456 c1 = 0; 457 sqr_add_c2(a, 7, 0, c2, c3, c1); 458 sqr_add_c2(a, 6, 1, c2, c3, c1); 459 sqr_add_c2(a, 5, 2, c2, c3, c1); 460 sqr_add_c2(a, 4, 3, c2, c3, c1); 461 r[7] = c2; 462 c2 = 0; 463 sqr_add_c(a, 4, c3, c1, c2); 464 sqr_add_c2(a, 5, 3, c3, c1, c2); 465 sqr_add_c2(a, 6, 2, c3, c1, c2); 466 sqr_add_c2(a, 7, 1, c3, c1, c2); 467 r[8] = c3; 468 c3 = 0; 469 sqr_add_c2(a, 7, 2, c1, c2, c3); 470 sqr_add_c2(a, 6, 3, c1, c2, c3); 471 sqr_add_c2(a, 5, 4, c1, c2, c3); 472 r[9] = c1; 473 c1 = 0; 474 sqr_add_c(a, 5, c2, c3, c1); 475 sqr_add_c2(a, 6, 4, c2, c3, c1); 476 sqr_add_c2(a, 7, 3, c2, c3, c1); 477 r[10] = c2; 478 c2 = 0; 479 sqr_add_c2(a, 7, 4, c3, c1, c2); 480 sqr_add_c2(a, 6, 5, c3, c1, c2); 481 r[11] = c3; 482 c3 = 0; 483 sqr_add_c(a, 6, c1, c2, c3); 484 sqr_add_c2(a, 7, 5, c1, c2, c3); 485 r[12] = c1; 486 c1 = 0; 487 sqr_add_c2(a, 7, 6, c2, c3, c1); 488 r[13] = c2; 489 c2 = 0; 490 sqr_add_c(a, 7, c3, c1, c2); 491 r[14] = c3; 492 r[15] = c1; 493} 494 495void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) { 496 BN_ULONG c1, c2, c3; 497 498 c1 = 0; 499 c2 = 0; 500 c3 = 0; 501 sqr_add_c(a, 0, c1, c2, c3); 502 r[0] = c1; 503 c1 = 0; 504 sqr_add_c2(a, 1, 0, c2, c3, c1); 505 r[1] = c2; 506 c2 = 0; 507 sqr_add_c(a, 1, c3, c1, c2); 508 sqr_add_c2(a, 2, 0, c3, c1, c2); 509 r[2] = c3; 510 c3 = 0; 511 sqr_add_c2(a, 3, 0, c1, c2, c3); 512 sqr_add_c2(a, 2, 1, c1, c2, c3); 513 r[3] = c1; 514 c1 = 0; 515 sqr_add_c(a, 2, c2, c3, c1); 516 sqr_add_c2(a, 3, 1, c2, c3, c1); 517 r[4] = c2; 518 c2 = 0; 519 sqr_add_c2(a, 3, 2, c3, c1, c2); 520 r[5] = c3; 521 c3 = 0; 522 sqr_add_c(a, 3, c1, c2, c3); 523 r[6] = c1; 524 r[7] = c2; 525} 526 527#undef mul_add 528#undef mul 529#undef sqr 530#undef mul_add_c 531#undef mul_add_c2 532#undef sqr_add_c 533#undef sqr_add_c2 534 535#endif // !BN_MUL_ASM 536 537#if !defined(BN_ADD_ASM) 538 539BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 540 size_t n) { 541 if (n == 0) { 542 return 0; 543 } 544 545 BN_ULONG carry = 0; 546 while (n & ~3) { 547 r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry); 548 r[1] = CRYPTO_addc_w(a[1], b[1], carry, &carry); 549 r[2] = CRYPTO_addc_w(a[2], b[2], carry, &carry); 550 r[3] = CRYPTO_addc_w(a[3], b[3], carry, &carry); 551 a += 4; 552 b += 4; 553 r += 4; 554 n -= 4; 555 } 556 while (n) { 557 r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry); 558 a++; 559 b++; 560 r++; 561 n--; 562 } 563 return carry; 564} 565 566BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b, 567 size_t n) { 568 if (n == 0) { 569 return (BN_ULONG)0; 570 } 571 572 BN_ULONG borrow = 0; 573 while (n & ~3) { 574 r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow); 575 r[1] = CRYPTO_subc_w(a[1], b[1], borrow, &borrow); 576 r[2] = CRYPTO_subc_w(a[2], b[2], borrow, &borrow); 577 r[3] = CRYPTO_subc_w(a[3], b[3], borrow, &borrow); 578 a += 4; 579 b += 4; 580 r += 4; 581 n -= 4; 582 } 583 while (n) { 584 r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow); 585 a++; 586 b++; 587 r++; 588 n--; 589 } 590 return borrow; 591} 592 593#endif // !BN_ADD_ASM 594