/* Copyright (C) 1995-1998 Eric Young (eay@cryptsoft.com)
 * All rights reserved.
 *
 * This package is an SSL implementation written
 * by Eric Young (eay@cryptsoft.com).
 * The implementation was written so as to conform with Netscapes SSL.
 *
 * This library is free for commercial and non-commercial use as long as
 * the following conditions are aheared to. The following conditions
 * apply to all code found in this distribution, be it the RC4, RSA,
 * lhash, DES, etc., code; not just the SSL code. The SSL documentation
 * included with this distribution is covered by the same copyright terms
 * except that the holder is Tim Hudson (tjh@cryptsoft.com).
 *
 * Copyright remains Eric Young's, and as such any Copyright notices in
 * the code are not to be removed.
 * If this package is used in a product, Eric Young should be given attribution
 * as the author of the parts of the library used.
 * This can be in the form of a textual message at program startup or
 * in documentation (online or textual) provided with the package.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *    "This product includes cryptographic software written by
 *     Eric Young (eay@cryptsoft.com)"
 *    The word 'cryptographic' can be left out if the rouines from the library
 *    being used are not cryptographic related :-).
 * 4. If you include any Windows specific code (or a derivative thereof) from
 *    the apps directory (application code) you must include an acknowledgement:
 *    "This product includes software written by Tim Hudson (tjh@cryptsoft.com)"
 *
 * THIS SOFTWARE IS PROVIDED BY ERIC YOUNG ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * The licence and distribution terms for any publically available version or
 * derivative of this code cannot be changed. i.e. this code cannot simply be
 * copied and put under another distribution licence
 * [including the GNU Public Licence.] */
#include <openssl/bn.h>

#include <assert.h>

#include "internal.h"


#if defined(OPENSSL_WINDOWS) || defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_X86_64) && !defined(OPENSSL_X86))

#if defined(OPENSSL_WINDOWS)
#define alloca _alloca
#else
#include <alloca.h>
#endif
#ifdef BN_LLONG
#define mul_add(r, a, w, c) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)w * (a) + (r) + (c); \
    (r) = Lw(t); \
    (c) = Hw(t); \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)w * (a) + (c); \
    (r) = Lw(t); \
    (c) = Hw(t); \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULLONG t; \
    t = (BN_ULLONG)(a) * (a); \
    (r0) = Lw(t); \
    (r1) = Hw(t); \
  }

#elif defined(BN_UMULT_LOHI)
#define mul_add(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, tmp = (a); \
    ret = (r); \
    BN_UMULT_LOHI(low, high, w, tmp); \
    ret += (c); \
    (c) = (ret < (c)) ? 1 : 0; \
    (c) += high; \
    ret += low; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, ta = (a); \
    BN_UMULT_LOHI(low, high, w, ta); \
    ret = low + (c); \
    (c) = high; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULONG tmp = (a); \
    BN_UMULT_LOHI(r0, r1, tmp, tmp); \
  }

#elif defined(BN_UMULT_HIGH)
#define mul_add(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, tmp = (a); \
    ret = (r); \
    high = BN_UMULT_HIGH(w, tmp); \
    ret += (c); \
    low = (w) * tmp; \
    (c) = (ret < (c)) ? 1 : 0; \
    (c) += high; \
    ret += low; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define mul(r, a, w, c) \
  { \
    BN_ULONG high, low, ret, ta = (a); \
    low = (w) * ta; \
    high = BN_UMULT_HIGH(w, ta); \
    ret = low + (c); \
    (c) = high; \
    (c) += (ret < low) ? 1 : 0; \
    (r) = ret; \
  }

#define sqr(r0, r1, a) \
  { \
    BN_ULONG tmp = (a); \
    (r0) = tmp * tmp; \
    (r1) = BN_UMULT_HIGH(tmp, tmp); \
  }

#else
/*************************************************************
 * No long long type
 */
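
/* Without a double-width integer type, the macros below emulate word-by-word
 * multiplication by splitting each BN_ULONG into BN_BITS4-bit halves
 * (LBITS/HBITS), forming the four half-word partial products, and propagating
 * the carries explicitly. */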
#define LBITS(a) ((a) & BN_MASK2l)
#define HBITS(a) (((a) >> BN_BITS4) & BN_MASK2l)
#define L2HBITS(a) (((a) << BN_BITS4) & BN_MASK2)

#define LLBITS(a) ((a) & BN_MASKl)
#define LHBITS(a) (((a) >> BN_BITS2) & BN_MASKl)
#define LL2HBITS(a) ((BN_ULLONG)((a) & BN_MASKl) << BN_BITS2)

#define mul64(l, h, bl, bh) \
  { \
    BN_ULONG m, m1, lt, ht; \
    \
    lt = l; \
    ht = h; \
    m = (bh) * (lt); \
    lt = (bl) * (lt); \
    m1 = (bl) * (ht); \
    ht = (bh) * (ht); \
    m = (m + m1) & BN_MASK2; \
    if (m < m1) \
      ht += L2HBITS((BN_ULONG)1); \
    ht += HBITS(m); \
    m1 = L2HBITS(m); \
    lt = (lt + m1) & BN_MASK2; \
    if (lt < m1) \
      ht++; \
    (l) = lt; \
    (h) = ht; \
  }

#define sqr64(lo, ho, in) \
  { \
    BN_ULONG l, h, m; \
    \
    h = (in); \
    l = LBITS(h); \
    h = HBITS(h); \
    m = (l) * (h); \
    l *= l; \
    h *= h; \
    h += (m & BN_MASK2h1) >> (BN_BITS4 - 1); \
    m = (m & BN_MASK2l) << (BN_BITS4 + 1); \
    l = (l + m) & BN_MASK2; \
    if (l < m) \
      h++; \
    (lo) = l; \
    (ho) = h; \
  }

#define mul_add(r, a, bl, bh, c) \
  { \
    BN_ULONG l, h; \
    \
    h = (a); \
    l = LBITS(h); \
    h = HBITS(h); \
    mul64(l, h, (bl), (bh)); \
    \
    /* non-multiply part */ \
    l = (l + (c)) & BN_MASK2; \
    if (l < (c)) \
      h++; \
    (c) = (r); \
    l = (l + (c)) & BN_MASK2; \
    if (l < (c)) \
      h++; \
    (c) = h & BN_MASK2; \
    (r) = l; \
  }

#define mul(r, a, bl, bh, c) \
  { \
    BN_ULONG l, h; \
    \
    h = (a); \
    l = LBITS(h); \
    h = HBITS(h); \
    mul64(l, h, (bl), (bh)); \
    \
    /* non-multiply part */ \
    l += (c); \
    if ((l & BN_MASK2) < (c)) \
      h++; \
    (c) = h & BN_MASK2; \
    (r) = l & BN_MASK2; \
  }
#endif /* !BN_LLONG */
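
/* The word-vector primitives below implement:
 *   bn_mul_add_words: rp[i] += ap[i] * w for i in [0, num), returning the
 *   final carry word;
 *   bn_mul_words: rp[i] = ap[i] * w with carry propagation, returning the
 *   final carry word;
 *   bn_sqr_words: (r[2*i], r[2*i+1]) = low and high words of a[i]^2. */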
#if defined(BN_LLONG) || defined(BN_UMULT_HIGH)

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul_add(rp[0], ap[0], w, c1);
    mul_add(rp[1], ap[1], w, c1);
    mul_add(rp[2], ap[2], w, c1);
    mul_add(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  while (num) {
    mul_add(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }

  return c1;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG c1 = 0;

  assert(num >= 0);
  if (num <= 0) {
    return c1;
  }

  while (num & ~3) {
    mul(rp[0], ap[0], w, c1);
    mul(rp[1], ap[1], w, c1);
    mul(rp[2], ap[2], w, c1);
    mul(rp[3], ap[3], w, c1);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  while (num) {
    mul(rp[0], ap[0], w, c1);
    ap++;
    rp++;
    num--;
  }

  return c1;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr(r[0], r[1], a[0]);
    sqr(r[2], r[3], a[1]);
    sqr(r[4], r[5], a[2]);
    sqr(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }

  while (n) {
    sqr(r[0], r[1], a[0]);
    a++;
    r += 2;
    n--;
  }
}

#else /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */

BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
                          BN_ULONG w) {
  BN_ULONG c = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  while (num & ~3) {
    mul_add(rp[0], ap[0], bl, bh, c);
    mul_add(rp[1], ap[1], bl, bh, c);
    mul_add(rp[2], ap[2], bl, bh, c);
    mul_add(rp[3], ap[3], bl, bh, c);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  while (num) {
    mul_add(rp[0], ap[0], bl, bh, c);
    ap++;
    rp++;
    num--;
  }

  return c;
}

BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  BN_ULONG carry = 0;
  BN_ULONG bl, bh;

  assert(num >= 0);
  if (num <= 0) {
    return (BN_ULONG)0;
  }

  bl = LBITS(w);
  bh = HBITS(w);

  while (num & ~3) {
    mul(rp[0], ap[0], bl, bh, carry);
    mul(rp[1], ap[1], bl, bh, carry);
    mul(rp[2], ap[2], bl, bh, carry);
    mul(rp[3], ap[3], bl, bh, carry);
    ap += 4;
    rp += 4;
    num -= 4;
  }

  while (num) {
    mul(rp[0], ap[0], bl, bh, carry);
    ap++;
    rp++;
    num--;
  }

  return carry;
}

void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  assert(n >= 0);
  if (n <= 0) {
    return;
  }

  while (n & ~3) {
    sqr64(r[0], r[1], a[0]);
    sqr64(r[2], r[3], a[1]);
    sqr64(r[4], r[5], a[2]);
    sqr64(r[6], r[7], a[3]);
    a += 4;
    r += 8;
    n -= 4;
  }

  while (n) {
    sqr64(r[0], r[1], a[0]);
    a++;
    r += 2;
    n--;
  }
}

#endif /* !(defined(BN_LLONG) || defined(BN_UMULT_HIGH)) */
#if defined(BN_LLONG) && defined(BN_DIV2W)

BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  return (BN_ULONG)(((((BN_ULLONG)h) << BN_BITS2) | l) / (BN_ULLONG)d);
}

#else

/* Divide h,l by d and return the result. */
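/* For h < d this yields floor(((h << BN_BITS2) | l) / d), the same quotient
 * as the BN_LLONG variant above, computed here by schoolbook long division in
 * base 2^BN_BITS4 on the half-words of the (normalized) divisor. */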
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG dh, dl, q, ret = 0, th, tl, t;
  int i, count = 2;

  if (d == 0) {
    return BN_MASK2;
  }

  i = BN_num_bits_word(d);
  assert((i == BN_BITS2) || (h <= (BN_ULONG)1 << i));

  i = BN_BITS2 - i;
  if (h >= d) {
    h -= d;
  }

  if (i) {
    d <<= i;
    h = (h << i) | (l >> (BN_BITS2 - i));
    l <<= i;
  }
  dh = (d & BN_MASK2h) >> BN_BITS4;
  dl = (d & BN_MASK2l);
  for (;;) {
    if ((h >> BN_BITS4) == dh) {
      q = BN_MASK2l;
    } else {
      q = h / dh;
    }

    th = q * dh;
    tl = dl * q;

    for (;;) {
      t = h - th;
      if ((t & BN_MASK2h) ||
          ((tl) <= ((t << BN_BITS4) | ((l & BN_MASK2h) >> BN_BITS4)))) {
        break;
      }
      q--;
      th -= dh;
      tl -= dl;
    }

    t = (tl >> BN_BITS4);
    tl = (tl << BN_BITS4) & BN_MASK2h;
    th += t;

    if (l < tl) {
      th++;
    }
    l -= tl;
    if (h < th) {
      h += d;
      q--;
    }
    h -= th;

    if (--count == 0) {
      break;
    }

    ret = q << BN_BITS4;
    h = ((h << BN_BITS4) | (l >> BN_BITS4)) & BN_MASK2;
    l = (l & BN_MASK2l) << BN_BITS4;
  }

  ret |= q;
  return ret;
}
#endif /* !(defined(BN_LLONG) && defined(BN_DIV2W)) */
#ifdef BN_LLONG

BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULLONG ll = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  while (n & ~3) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[1] + b[1];
    r[1] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[2] + b[2];
    r[2] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    ll += (BN_ULLONG)a[3] + b[3];
    r[3] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }

  while (n) {
    ll += (BN_ULLONG)a[0] + b[0];
    r[0] = (BN_ULONG)ll & BN_MASK2;
    ll >>= BN_BITS2;
    a++;
    b++;
    r++;
    n--;
  }

  return (BN_ULONG)ll;
}

#else /* !BN_LLONG */

BN_ULONG bn_add_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG c, l, t;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  c = 0;
  while (n & ~3) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    t = a[1];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[1]) & BN_MASK2;
    c += (l < t);
    r[1] = l;
    t = a[2];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[2]) & BN_MASK2;
    c += (l < t);
    r[2] = l;
    t = a[3];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[3]) & BN_MASK2;
    c += (l < t);
    r[3] = l;
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }

  while (n) {
    t = a[0];
    t = (t + c) & BN_MASK2;
    c = (t < c);
    l = (t + b[0]) & BN_MASK2;
    c += (l < t);
    r[0] = l;
    a++;
    b++;
    r++;
    n--;
  }

  return (BN_ULONG)c;
}

#endif /* !BN_LLONG */
BN_ULONG bn_sub_words(BN_ULONG *r, const BN_ULONG *a, const BN_ULONG *b,
                      int n) {
  BN_ULONG t1, t2;
  int c = 0;

  assert(n >= 0);
  if (n <= 0) {
    return (BN_ULONG)0;
  }

  while (n & ~3) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    t1 = a[1];
    t2 = b[1];
    r[1] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    t1 = a[2];
    t2 = b[2];
    r[2] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    t1 = a[3];
    t2 = b[3];
    r[3] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    a += 4;
    b += 4;
    r += 4;
    n -= 4;
  }

  while (n) {
    t1 = a[0];
    t2 = b[0];
    r[0] = (t1 - t2 - c) & BN_MASK2;
    if (t1 != t2)
      c = (t1 < t2);
    a++;
    b++;
    r++;
    n--;
  }

  return c;
}
/* mul_add_c(a,b,c0,c1,c2)      -- c += a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2)     -- c += 2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2)      -- c += a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,j,c0,c1,c2)   -- c += 2*a[i]*a[j] for three word number
 *                                 c=(c2,c1,c0) */
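/* In each of these, (c2,c1,c0) is a three-word accumulator for one column of
 * partial products: c0 takes the low word of the product, c1 takes the high
 * word plus any carry out of c0, and c2 absorbs the carry out of c1. The
 * comba routines below rotate the roles of c1, c2 and c3 so the finished low
 * word of each column can be stored directly into r[]. */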
#ifdef BN_LLONG

#define mul_add_c(a, b, c0, c1, c2) \
  t = (BN_ULLONG)a * b; \
  t1 = (BN_ULONG)Lw(t); \
  t2 = (BN_ULONG)Hw(t); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define mul_add_c2(a, b, c0, c1, c2) \
  t = (BN_ULLONG)a * b; \
  tt = (t + t) & BN_MASK; \
  if (tt < t) \
    c2++; \
  t1 = (BN_ULONG)Lw(tt); \
  t2 = (BN_ULONG)Hw(tt); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
    c2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c(a, i, c0, c1, c2) \
  t = (BN_ULLONG)a[i] * a[i]; \
  t1 = (BN_ULONG)Lw(t); \
  t2 = (BN_ULONG)Hw(t); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#elif defined(BN_UMULT_LOHI)

#define mul_add_c(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b); \
    BN_UMULT_LOHI(t1, t2, ta, tb); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define mul_add_c2(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b), t0; \
    BN_UMULT_LOHI(t0, t1, ta, tb); \
    t2 = t1 + t1; \
    c2 += (t2 < t1) ? 1 : 0; \
    t1 = t0 + t0; \
    t2 += (t1 < t0) ? 1 : 0; \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c(a, i, c0, c1, c2) \
  { \
    BN_ULONG ta = (a)[i]; \
    BN_UMULT_LOHI(t1, t2, ta, ta); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#elif defined(BN_UMULT_HIGH)

#define mul_add_c(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b); \
    t1 = ta * tb; \
    t2 = BN_UMULT_HIGH(ta, tb); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define mul_add_c2(a, b, c0, c1, c2) \
  { \
    BN_ULONG ta = (a), tb = (b), t0; \
    t1 = BN_UMULT_HIGH(ta, tb); \
    t0 = ta * tb; \
    t2 = t1 + t1; \
    c2 += (t2 < t1) ? 1 : 0; \
    t1 = t0 + t0; \
    t2 += (t1 < t0) ? 1 : 0; \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c(a, i, c0, c1, c2) \
  { \
    BN_ULONG ta = (a)[i]; \
    t1 = ta * ta; \
    t2 = BN_UMULT_HIGH(ta, ta); \
    c0 += t1; \
    t2 += (c0 < t1) ? 1 : 0; \
    c1 += t2; \
    c2 += (c1 < t2) ? 1 : 0; \
  }

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#else /* !BN_LLONG */

#define mul_add_c(a, b, c0, c1, c2) \
  t1 = LBITS(a); \
  t2 = HBITS(a); \
  bl = LBITS(b); \
  bh = HBITS(b); \
  mul64(t1, t2, bl, bh); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define mul_add_c2(a, b, c0, c1, c2) \
  t1 = LBITS(a); \
  t2 = HBITS(a); \
  bl = LBITS(b); \
  bh = HBITS(b); \
  mul64(t1, t2, bl, bh); \
  if (t2 & BN_TBIT) \
    c2++; \
  t2 = (t2 + t2) & BN_MASK2; \
  if (t1 & BN_TBIT) \
    t2++; \
  t1 = (t1 + t1) & BN_MASK2; \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0 < t1) && (((++t2) & BN_MASK2) == 0)) \
    c2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c(a, i, c0, c1, c2) \
  sqr64(t1, t2, (a)[i]); \
  c0 = (c0 + t1) & BN_MASK2; \
  if ((c0) < t1) \
    t2++; \
  c1 = (c1 + t2) & BN_MASK2; \
  if ((c1) < t2) \
    c2++;

#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)

#endif /* !BN_LLONG */
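
/* bn_mul_comba8 computes the 16-word product r = a * b of two 8-word
 * operands, and bn_mul_comba4 the 8-word product of two 4-word operands,
 * column by column: each column k sums every a[i]*b[j] with i + j == k into
 * the rotating three-word accumulator before storing the finished word in
 * r[k]. bn_sqr_comba8 and bn_sqr_comba4 do the same for r = a * a, using
 * sqr_add_c2 to count each cross product a[i]*a[j] (i != j) twice. */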
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
#ifdef BN_LLONG
  BN_ULLONG t;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
#ifdef BN_LLONG
  BN_ULLONG t;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
#ifdef BN_LLONG
  BN_ULLONG t, tt;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
#ifdef BN_LLONG
  BN_ULLONG t, tt;
#else
  BN_ULONG bl, bh;
#endif
  BN_ULONG t1, t2;
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
#if defined(OPENSSL_NO_ASM) || \
    (!defined(OPENSSL_ARM) && !defined(OPENSSL_X86_64))

/* This is essentially a reference implementation, which may or may not
 * result in a performance improvement. E.g. on IA-32 this routine was
 * observed to give 40% faster rsa1024 private key operations and 10%
 * faster rsa4096 ones, while on AMD64 it improves rsa1024 sign only
 * by 10% and *worsens* rsa4096 sign by 15%. Once again, it's a
 * reference implementation, one to be used as a starting point for
 * platform-specific assembler. The numbers mentioned apply to
 * compiler-generated code compiled with and without
 * -DOPENSSL_BN_ASM_MONT and can vary not only from platform to
 * platform, but even across compiler versions. Assembler vs. assembler
 * improvement coefficients can [and are known to] differ and are to be
 * documented elsewhere. */
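/* bn_mul_mont computes the Montgomery product rp = ap * bp * R^-1 mod np,
 * where R = 2^(num * BN_BITS2) and *n0p holds the precomputed constant
 * -np^-1 mod 2^BN_BITS2, by interleaving word-by-word Montgomery reduction
 * with the multiplication. */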
int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                const BN_ULONG *np, const BN_ULONG *n0p, int num) {
  BN_ULONG c0, c1, ml, *tp, n0;
#ifdef mul64
  BN_ULONG mh;
#endif
  volatile BN_ULONG *vp;
  int i = 0, j;

#if 0 /* template for platform-specific implementation */
  if (ap==bp) return bn_sqr_mont(rp,ap,np,n0p,num);
#endif
  vp = tp = alloca((num + 2) * sizeof(BN_ULONG));

  n0 = *n0p;

  c0 = 0;
  ml = bp[0];
#ifdef mul64
  mh = HBITS(ml);
  ml = LBITS(ml);
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, mh, c0);
#else
  for (j = 0; j < num; ++j)
    mul(tp[j], ap[j], ml, c0);
#endif

  tp[num] = c0;
  tp[num + 1] = 0;
  goto enter;

  for (i = 0; i < num; i++) {
    c0 = 0;
    ml = bp[i];
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, mh, c0);
#else
    for (j = 0; j < num; ++j)
      mul_add(tp[j], ap[j], ml, c0);
#endif
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num] = c1;
    tp[num + 1] = (c1 < c0 ? 1 : 0);
  enter:
    c1 = tp[0];
    ml = (c1 * n0) & BN_MASK2;
    c0 = 0;
#ifdef mul64
    mh = HBITS(ml);
    ml = LBITS(ml);
    mul_add(c1, np[0], ml, mh, c0);
#else
    mul_add(c1, ml, np[0], c0);
#endif
    for (j = 1; j < num; j++) {
      c1 = tp[j];
#ifdef mul64
      mul_add(c1, np[j], ml, mh, c0);
#else
      mul_add(c1, ml, np[j], c0);
#endif
      tp[j - 1] = c1 & BN_MASK2;
    }
    c1 = (tp[num] + c0) & BN_MASK2;
    tp[num - 1] = c1;
    tp[num] = tp[num + 1] + (c1 < c0 ? 1 : 0);
  }

  if (tp[num] != 0 || tp[num - 1] >= np[num - 1]) {
    c0 = bn_sub_words(rp, tp, np, num);
    if (tp[num] != 0 || c0 == 0) {
      for (i = 0; i < num + 2; i++)
        vp[i] = 0;
      return 1;
    }
  }

  for (i = 0; i < num; i++)
    rp[i] = tp[i], vp[i] = 0;
  vp[num] = 0;
  vp[num + 1] = 0;

  return 1;
}
#endif

#endif