1// Copyright 1995-2016 The OpenSSL Project Authors. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     https://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <openssl/bn.h>
16
17#include <assert.h>
18
19#include "internal.h"
20
21
22#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86)
23// See asm/bn-586.pl.
24#define BN_ADD_ASM
25#define BN_MUL_ASM
26#endif
27
28#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && \
29    (defined(__GNUC__) || defined(__clang__))
30// See asm/x86_64-gcc.c
31#define BN_ADD_ASM
32#define BN_MUL_ASM
33#endif
34
35#if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_AARCH64)
36// See asm/bn-armv8.pl.
37#define BN_ADD_ASM
38#endif
39
40#if !defined(BN_MUL_ASM)
41
42#ifdef BN_ULLONG
43#define mul_add(r, a, w, c)               \
44  do {                                    \
45    BN_ULLONG t;                          \
46    t = (BN_ULLONG)(w) * (a) + (r) + (c); \
47    (r) = (BN_ULONG)(t);                  \
48    (c) = (BN_ULONG)((t) >> BN_BITS2);    \
49  } while (0)
50
51#define mul(r, a, w, c)                \
52  do {                                 \
53    BN_ULLONG t;                       \
54    t = (BN_ULLONG)(w) * (a) + (c);    \
55    (r) = (BN_ULONG)(t);               \
56    (c) = (BN_ULONG)((t) >> BN_BITS2); \
57  } while (0)
58
59#define sqr(r0, r1, a)                  \
60  do {                                  \
61    BN_ULLONG t;                        \
62    t = (BN_ULLONG)(a) * (a);           \
63    (r0) = (BN_ULONG)(t);               \
64    (r1) = (BN_ULONG)((t) >> BN_BITS2); \
65  } while (0)
66
67#else
68
69#define mul_add(r, a, w, c)             \
70  do {                                  \
71    BN_ULONG high, low, ret, tmp = (a); \
72    ret = (r);                          \
73    BN_UMULT_LOHI(low, high, w, tmp);   \
74    ret += (c);                         \
75    (c) = (ret < (c)) ? 1 : 0;          \
76    (c) += high;                        \
77    ret += low;                         \
78    (c) += (ret < low) ? 1 : 0;         \
79    (r) = ret;                          \
80  } while (0)
81
82#define mul(r, a, w, c)                \
83  do {                                 \
84    BN_ULONG high, low, ret, ta = (a); \
85    BN_UMULT_LOHI(low, high, w, ta);   \
86    ret = low + (c);                   \
87    (c) = high;                        \
88    (c) += (ret < low) ? 1 : 0;        \
89    (r) = ret;                         \
90  } while (0)
91
92#define sqr(r0, r1, a)               \
93  do {                               \
94    BN_ULONG tmp = (a);              \
95    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
96  } while (0)
97
98#endif  // !BN_ULLONG
99
100BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
101                          BN_ULONG w) {
102  BN_ULONG c1 = 0;
103
104  if (num == 0) {
105    return c1;
106  }
107
108  while (num & ~3) {
109    mul_add(rp[0], ap[0], w, c1);
110    mul_add(rp[1], ap[1], w, c1);
111    mul_add(rp[2], ap[2], w, c1);
112    mul_add(rp[3], ap[3], w, c1);
113    ap += 4;
114    rp += 4;
115    num -= 4;
116  }
117
118  while (num) {
119    mul_add(rp[0], ap[0], w, c1);
120    ap++;
121    rp++;
122    num--;
123  }
124
125  return c1;
126}
127
128BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, size_t num,
129                      BN_ULONG w) {
130  BN_ULONG c1 = 0;
131
132  if (num == 0) {
133    return c1;
134  }
135
136  while (num & ~3) {
137    mul(rp[0], ap[0], w, c1);
138    mul(rp[1], ap[1], w, c1);
139    mul(rp[2], ap[2], w, c1);
140    mul(rp[3], ap[3], w, c1);
141    ap += 4;
142    rp += 4;
143    num -= 4;
144  }
145  while (num) {
146    mul(rp[0], ap[0], w, c1);
147    ap++;
148    rp++;
149    num--;
150  }
151  return c1;
152}
153
154void bn_sqr_add_words(BN_ULONG *r, const BN_ULONG *a, size_t n) {
155  if (n == 0) {
156    return;
157  }
158
159  BN_ULONG carry = 0, lo, hi;
160  while (n & ~3) {
161    sqr(lo, hi, a[0]);
162    r[0] = CRYPTO_addc_w(r[0], lo, carry, &carry);
163    r[1] = CRYPTO_addc_w(r[1], hi, carry, &carry);
164    sqr(lo, hi, a[1]);
165    r[2] = CRYPTO_addc_w(r[2], lo, carry, &carry);
166    r[3] = CRYPTO_addc_w(r[3], hi, carry, &carry);
167    sqr(lo, hi, a[2]);
168    r[4] = CRYPTO_addc_w(r[4], lo, carry, &carry);
169    r[5] = CRYPTO_addc_w(r[5], hi, carry, &carry);
170    sqr(lo, hi, a[3]);
171    r[6] = CRYPTO_addc_w(r[6], lo, carry, &carry);
172    r[7] = CRYPTO_addc_w(r[7], hi, carry, &carry);
173    a += 4;
174    r += 8;
175    n -= 4;
176  }
177  while (n) {
178    sqr(lo, hi, a[0]);
179    r[0] = CRYPTO_addc_w(r[0], lo, carry, &carry);
180    r[1] = CRYPTO_addc_w(r[1], hi, carry, &carry);
181    a++;
182    r += 2;
183    n--;
184  }
185}
186
187// mul_add_c(a,b,c0,c1,c2)  -- c+=a*b for three word number c=(c2,c1,c0)
188// mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0)
189// sqr_add_c(a,i,c0,c1,c2)  -- c+=a[i]^2 for three word number c=(c2,c1,c0)
190// sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
191
192#ifdef BN_ULLONG
193
194// Keep in mind that additions to multiplication result can not overflow,
195// because its high half cannot be all-ones.
196#define mul_add_c(a, b, c0, c1, c2)     \
197  do {                                  \
198    BN_ULONG hi;                        \
199    BN_ULLONG t = (BN_ULLONG)(a) * (b); \
200    t += (c0); /* no carry */           \
201    (c0) = (BN_ULONG)(t);               \
202    hi = (BN_ULONG)((t) >> BN_BITS2);   \
203    (c1) += (hi);                       \
204    (c2) += (c1) < hi;                  \
205  } while (0)
206
207#define mul_add_c2(a, b, c0, c1, c2)        \
208  do {                                      \
209    BN_ULONG hi;                            \
210    BN_ULLONG t = (BN_ULLONG)(a) * (b);     \
211    BN_ULLONG tt = t + (c0); /* no carry */ \
212    (c0) = (BN_ULONG)(tt);                  \
213    hi = (BN_ULONG)((tt) >> BN_BITS2);      \
214    (c1) += hi;                             \
215    (c2) += (c1) < hi;                      \
216    t += (c0); /* no carry */               \
217    (c0) = (BN_ULONG)(t);                   \
218    hi = (BN_ULONG)((t) >> BN_BITS2);       \
219    (c1) += hi;                             \
220    (c2) += (c1) < hi;                      \
221  } while (0)
222
223#define sqr_add_c(a, i, c0, c1, c2)           \
224  do {                                        \
225    BN_ULONG hi;                              \
226    BN_ULLONG t = (BN_ULLONG)(a)[i] * (a)[i]; \
227    t += (c0); /* no carry */                 \
228    (c0) = (BN_ULONG)(t);                     \
229    hi = (BN_ULONG)((t) >> BN_BITS2);         \
230    (c1) += hi;                               \
231    (c2) += (c1) < hi;                        \
232  } while (0)
233
234#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
235
236#else
237
238// Keep in mind that additions to hi can not overflow, because the high word of
239// a multiplication result cannot be all-ones.
240#define mul_add_c(a, b, c0, c1, c2) \
241  do {                              \
242    BN_ULONG ta = (a), tb = (b);    \
243    BN_ULONG lo, hi;                \
244    BN_UMULT_LOHI(lo, hi, ta, tb);  \
245    (c0) += lo;                     \
246    hi += ((c0) < lo) ? 1 : 0;      \
247    (c1) += hi;                     \
248    (c2) += ((c1) < hi) ? 1 : 0;    \
249  } while (0)
250
251#define mul_add_c2(a, b, c0, c1, c2) \
252  do {                               \
253    BN_ULONG ta = (a), tb = (b);     \
254    BN_ULONG lo, hi, tt;             \
255    BN_UMULT_LOHI(lo, hi, ta, tb);   \
256    (c0) += lo;                      \
257    tt = hi + (((c0) < lo) ? 1 : 0); \
258    (c1) += tt;                      \
259    (c2) += ((c1) < tt) ? 1 : 0;     \
260    (c0) += lo;                      \
261    hi += (c0 < lo) ? 1 : 0;         \
262    (c1) += hi;                      \
263    (c2) += ((c1) < hi) ? 1 : 0;     \
264  } while (0)
265
266#define sqr_add_c(a, i, c0, c1, c2) \
267  do {                              \
268    BN_ULONG ta = (a)[i];           \
269    BN_ULONG lo, hi;                \
270    BN_UMULT_LOHI(lo, hi, ta, ta);  \
271    (c0) += lo;                     \
272    hi += (c0 < lo) ? 1 : 0;        \
273    (c1) += hi;                     \
274    (c2) += ((c1) < hi) ? 1 : 0;    \
275  } while (0)
276
277#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
278
279#endif  // !BN_ULLONG
280
281void bn_mul_comba8(BN_ULONG r[16], const BN_ULONG a[8], const BN_ULONG b[8]) {
282  BN_ULONG c1, c2, c3;
283
284  c1 = 0;
285  c2 = 0;
286  c3 = 0;
287  mul_add_c(a[0], b[0], c1, c2, c3);
288  r[0] = c1;
289  c1 = 0;
290  mul_add_c(a[0], b[1], c2, c3, c1);
291  mul_add_c(a[1], b[0], c2, c3, c1);
292  r[1] = c2;
293  c2 = 0;
294  mul_add_c(a[2], b[0], c3, c1, c2);
295  mul_add_c(a[1], b[1], c3, c1, c2);
296  mul_add_c(a[0], b[2], c3, c1, c2);
297  r[2] = c3;
298  c3 = 0;
299  mul_add_c(a[0], b[3], c1, c2, c3);
300  mul_add_c(a[1], b[2], c1, c2, c3);
301  mul_add_c(a[2], b[1], c1, c2, c3);
302  mul_add_c(a[3], b[0], c1, c2, c3);
303  r[3] = c1;
304  c1 = 0;
305  mul_add_c(a[4], b[0], c2, c3, c1);
306  mul_add_c(a[3], b[1], c2, c3, c1);
307  mul_add_c(a[2], b[2], c2, c3, c1);
308  mul_add_c(a[1], b[3], c2, c3, c1);
309  mul_add_c(a[0], b[4], c2, c3, c1);
310  r[4] = c2;
311  c2 = 0;
312  mul_add_c(a[0], b[5], c3, c1, c2);
313  mul_add_c(a[1], b[4], c3, c1, c2);
314  mul_add_c(a[2], b[3], c3, c1, c2);
315  mul_add_c(a[3], b[2], c3, c1, c2);
316  mul_add_c(a[4], b[1], c3, c1, c2);
317  mul_add_c(a[5], b[0], c3, c1, c2);
318  r[5] = c3;
319  c3 = 0;
320  mul_add_c(a[6], b[0], c1, c2, c3);
321  mul_add_c(a[5], b[1], c1, c2, c3);
322  mul_add_c(a[4], b[2], c1, c2, c3);
323  mul_add_c(a[3], b[3], c1, c2, c3);
324  mul_add_c(a[2], b[4], c1, c2, c3);
325  mul_add_c(a[1], b[5], c1, c2, c3);
326  mul_add_c(a[0], b[6], c1, c2, c3);
327  r[6] = c1;
328  c1 = 0;
329  mul_add_c(a[0], b[7], c2, c3, c1);
330  mul_add_c(a[1], b[6], c2, c3, c1);
331  mul_add_c(a[2], b[5], c2, c3, c1);
332  mul_add_c(a[3], b[4], c2, c3, c1);
333  mul_add_c(a[4], b[3], c2, c3, c1);
334  mul_add_c(a[5], b[2], c2, c3, c1);
335  mul_add_c(a[6], b[1], c2, c3, c1);
336  mul_add_c(a[7], b[0], c2, c3, c1);
337  r[7] = c2;
338  c2 = 0;
339  mul_add_c(a[7], b[1], c3, c1, c2);
340  mul_add_c(a[6], b[2], c3, c1, c2);
341  mul_add_c(a[5], b[3], c3, c1, c2);
342  mul_add_c(a[4], b[4], c3, c1, c2);
343  mul_add_c(a[3], b[5], c3, c1, c2);
344  mul_add_c(a[2], b[6], c3, c1, c2);
345  mul_add_c(a[1], b[7], c3, c1, c2);
346  r[8] = c3;
347  c3 = 0;
348  mul_add_c(a[2], b[7], c1, c2, c3);
349  mul_add_c(a[3], b[6], c1, c2, c3);
350  mul_add_c(a[4], b[5], c1, c2, c3);
351  mul_add_c(a[5], b[4], c1, c2, c3);
352  mul_add_c(a[6], b[3], c1, c2, c3);
353  mul_add_c(a[7], b[2], c1, c2, c3);
354  r[9] = c1;
355  c1 = 0;
356  mul_add_c(a[7], b[3], c2, c3, c1);
357  mul_add_c(a[6], b[4], c2, c3, c1);
358  mul_add_c(a[5], b[5], c2, c3, c1);
359  mul_add_c(a[4], b[6], c2, c3, c1);
360  mul_add_c(a[3], b[7], c2, c3, c1);
361  r[10] = c2;
362  c2 = 0;
363  mul_add_c(a[4], b[7], c3, c1, c2);
364  mul_add_c(a[5], b[6], c3, c1, c2);
365  mul_add_c(a[6], b[5], c3, c1, c2);
366  mul_add_c(a[7], b[4], c3, c1, c2);
367  r[11] = c3;
368  c3 = 0;
369  mul_add_c(a[7], b[5], c1, c2, c3);
370  mul_add_c(a[6], b[6], c1, c2, c3);
371  mul_add_c(a[5], b[7], c1, c2, c3);
372  r[12] = c1;
373  c1 = 0;
374  mul_add_c(a[6], b[7], c2, c3, c1);
375  mul_add_c(a[7], b[6], c2, c3, c1);
376  r[13] = c2;
377  c2 = 0;
378  mul_add_c(a[7], b[7], c3, c1, c2);
379  r[14] = c3;
380  r[15] = c1;
381}
382
383void bn_mul_comba4(BN_ULONG r[8], const BN_ULONG a[4], const BN_ULONG b[4]) {
384  BN_ULONG c1, c2, c3;
385
386  c1 = 0;
387  c2 = 0;
388  c3 = 0;
389  mul_add_c(a[0], b[0], c1, c2, c3);
390  r[0] = c1;
391  c1 = 0;
392  mul_add_c(a[0], b[1], c2, c3, c1);
393  mul_add_c(a[1], b[0], c2, c3, c1);
394  r[1] = c2;
395  c2 = 0;
396  mul_add_c(a[2], b[0], c3, c1, c2);
397  mul_add_c(a[1], b[1], c3, c1, c2);
398  mul_add_c(a[0], b[2], c3, c1, c2);
399  r[2] = c3;
400  c3 = 0;
401  mul_add_c(a[0], b[3], c1, c2, c3);
402  mul_add_c(a[1], b[2], c1, c2, c3);
403  mul_add_c(a[2], b[1], c1, c2, c3);
404  mul_add_c(a[3], b[0], c1, c2, c3);
405  r[3] = c1;
406  c1 = 0;
407  mul_add_c(a[3], b[1], c2, c3, c1);
408  mul_add_c(a[2], b[2], c2, c3, c1);
409  mul_add_c(a[1], b[3], c2, c3, c1);
410  r[4] = c2;
411  c2 = 0;
412  mul_add_c(a[2], b[3], c3, c1, c2);
413  mul_add_c(a[3], b[2], c3, c1, c2);
414  r[5] = c3;
415  c3 = 0;
416  mul_add_c(a[3], b[3], c1, c2, c3);
417  r[6] = c1;
418  r[7] = c2;
419}
420
421void bn_sqr_comba8(BN_ULONG r[16], const BN_ULONG a[8]) {
422  BN_ULONG c1, c2, c3;
423
424  c1 = 0;
425  c2 = 0;
426  c3 = 0;
427  sqr_add_c(a, 0, c1, c2, c3);
428  r[0] = c1;
429  c1 = 0;
430  sqr_add_c2(a, 1, 0, c2, c3, c1);
431  r[1] = c2;
432  c2 = 0;
433  sqr_add_c(a, 1, c3, c1, c2);
434  sqr_add_c2(a, 2, 0, c3, c1, c2);
435  r[2] = c3;
436  c3 = 0;
437  sqr_add_c2(a, 3, 0, c1, c2, c3);
438  sqr_add_c2(a, 2, 1, c1, c2, c3);
439  r[3] = c1;
440  c1 = 0;
441  sqr_add_c(a, 2, c2, c3, c1);
442  sqr_add_c2(a, 3, 1, c2, c3, c1);
443  sqr_add_c2(a, 4, 0, c2, c3, c1);
444  r[4] = c2;
445  c2 = 0;
446  sqr_add_c2(a, 5, 0, c3, c1, c2);
447  sqr_add_c2(a, 4, 1, c3, c1, c2);
448  sqr_add_c2(a, 3, 2, c3, c1, c2);
449  r[5] = c3;
450  c3 = 0;
451  sqr_add_c(a, 3, c1, c2, c3);
452  sqr_add_c2(a, 4, 2, c1, c2, c3);
453  sqr_add_c2(a, 5, 1, c1, c2, c3);
454  sqr_add_c2(a, 6, 0, c1, c2, c3);
455  r[6] = c1;
456  c1 = 0;
457  sqr_add_c2(a, 7, 0, c2, c3, c1);
458  sqr_add_c2(a, 6, 1, c2, c3, c1);
459  sqr_add_c2(a, 5, 2, c2, c3, c1);
460  sqr_add_c2(a, 4, 3, c2, c3, c1);
461  r[7] = c2;
462  c2 = 0;
463  sqr_add_c(a, 4, c3, c1, c2);
464  sqr_add_c2(a, 5, 3, c3, c1, c2);
465  sqr_add_c2(a, 6, 2, c3, c1, c2);
466  sqr_add_c2(a, 7, 1, c3, c1, c2);
467  r[8] = c3;
468  c3 = 0;
469  sqr_add_c2(a, 7, 2, c1, c2, c3);
470  sqr_add_c2(a, 6, 3, c1, c2, c3);
471  sqr_add_c2(a, 5, 4, c1, c2, c3);
472  r[9] = c1;
473  c1 = 0;
474  sqr_add_c(a, 5, c2, c3, c1);
475  sqr_add_c2(a, 6, 4, c2, c3, c1);
476  sqr_add_c2(a, 7, 3, c2, c3, c1);
477  r[10] = c2;
478  c2 = 0;
479  sqr_add_c2(a, 7, 4, c3, c1, c2);
480  sqr_add_c2(a, 6, 5, c3, c1, c2);
481  r[11] = c3;
482  c3 = 0;
483  sqr_add_c(a, 6, c1, c2, c3);
484  sqr_add_c2(a, 7, 5, c1, c2, c3);
485  r[12] = c1;
486  c1 = 0;
487  sqr_add_c2(a, 7, 6, c2, c3, c1);
488  r[13] = c2;
489  c2 = 0;
490  sqr_add_c(a, 7, c3, c1, c2);
491  r[14] = c3;
492  r[15] = c1;
493}
494
495void bn_sqr_comba4(BN_ULONG r[8], const BN_ULONG a[4]) {
496  BN_ULONG c1, c2, c3;
497
498  c1 = 0;
499  c2 = 0;
500  c3 = 0;
501  sqr_add_c(a, 0, c1, c2, c3);
502  r[0] = c1;
503  c1 = 0;
504  sqr_add_c2(a, 1, 0, c2, c3, c1);
505  r[1] = c2;
506  c2 = 0;
507  sqr_add_c(a, 1, c3, c1, c2);
508  sqr_add_c2(a, 2, 0, c3, c1, c2);
509  r[2] = c3;
510  c3 = 0;
511  sqr_add_c2(a, 3, 0, c1, c2, c3);
512  sqr_add_c2(a, 2, 1, c1, c2, c3);
513  r[3] = c1;
514  c1 = 0;
515  sqr_add_c(a, 2, c2, c3, c1);
516  sqr_add_c2(a, 3, 1, c2, c3, c1);
517  r[4] = c2;
518  c2 = 0;
519  sqr_add_c2(a, 3, 2, c3, c1, c2);
520  r[5] = c3;
521  c3 = 0;
522  sqr_add_c(a, 3, c1, c2, c3);
523  r[6] = c1;
524  r[7] = c2;
525}
526
527#undef mul_add
528#undef mul
529#undef sqr
530#undef mul_add_c
531#undef mul_add_c2
532#undef sqr_add_c
533#undef sqr_add_c2
534
535#endif  // !BN_MUL_ASM
536
537#if !defined(BN_ADD_ASM)
538
539BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
540                      size_t n) {
541  if (n == 0) {
542    return 0;
543  }
544
545  BN_ULONG carry = 0;
546  while (n & ~3) {
547    r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
548    r[1] = CRYPTO_addc_w(a[1], b[1], carry, &carry);
549    r[2] = CRYPTO_addc_w(a[2], b[2], carry, &carry);
550    r[3] = CRYPTO_addc_w(a[3], b[3], carry, &carry);
551    a += 4;
552    b += 4;
553    r += 4;
554    n -= 4;
555  }
556  while (n) {
557    r[0] = CRYPTO_addc_w(a[0], b[0], carry, &carry);
558    a++;
559    b++;
560    r++;
561    n--;
562  }
563  return carry;
564}
565
566BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
567                      size_t n) {
568  if (n == 0) {
569    return (BN_ULONG)0;
570  }
571
572  BN_ULONG borrow = 0;
573  while (n & ~3) {
574    r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
575    r[1] = CRYPTO_subc_w(a[1], b[1], borrow, &borrow);
576    r[2] = CRYPTO_subc_w(a[2], b[2], borrow, &borrow);
577    r[3] = CRYPTO_subc_w(a[3], b[3], borrow, &borrow);
578    a += 4;
579    b += 4;
580    r += 4;
581    n -= 4;
582  }
583  while (n) {
584    r[0] = CRYPTO_subc_w(a[0], b[0], borrow, &borrow);
585    a++;
586    b++;
587    r++;
588    n--;
589  }
590  return borrow;
591}
592
593#endif  // !BN_ADD_ASM
594