Nevar pievienot vairāk kā 25 tēmas. Tēmai ir jāsākas ar burtu vai ciparu, tā var saturēt domu zīmes ('-') un var būt līdz 35 simboliem gara.
 
 
 
 
 
 

616 rindas
18 KiB

  1. #include <openssl/bn.h>
  2. #if defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)
  3. #include "../internal.h"
  4. /* x86_64 BIGNUM accelerator version 0.1, December 2002.
  5. *
  6. * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  7. * project.
  8. *
  9. * Rights for redistribution and usage in source and binary forms are
  10. * granted according to the OpenSSL license. Warranty of any kind is
  11. * disclaimed.
  12. *
  13. * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
  14. * versions, like 1.0...
  15. * A. Well, that's because this code is basically a quick-n-dirty
  16. * proof-of-concept hack. As you can see it's implemented with
  17. * inline assembler, which means that you're bound to GCC and that
  18. * there might be enough room for further improvement.
  19. *
  20. * Q. Why inline assembler?
  21. * A. x86_64 features its own ABI, which I'm not familiar with. This is
  22. * why I decided to let the compiler take care of subroutine
  23. * prologue/epilogue as well as register allocation. For reference,
  24. * Win64 implements a different ABI for AMD64 than Linux does.
  25. *
  26. * Q. How much faster does it get?
  27. * A. 'apps/openssl speed rsa dsa' output with no-asm:
  28. *
  29. * sign verify sign/s verify/s
  30. * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
  31. * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
  32. * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
  33. * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
  34. * sign verify sign/s verify/s
  35. * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
  36. * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
  37. * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
  38. *
  39. * 'apps/openssl speed rsa dsa' output with this module:
  40. *
  41. * sign verify sign/s verify/s
  42. * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
  43. * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
  44. * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
  45. * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
  46. * sign verify sign/s verify/s
  47. * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
  48. * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
  49. * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
  50. *
  51. * For the reference. IA-32 assembler implementation performs
  52. * very much like 64-bit code compiled with no-asm on the same
  53. * machine.
  54. */
#undef mul
#undef mul_add

/* Use the GCC spelling of inline assembly. */
#define asm __asm__

/*
 * The "m"(a) and "+m"(r) constraints are the way to favor (AMD) DirectPath
 * µ-code; "g"(0) lets the compiler decide where it wants to keep the
 * value of zero.
 */
/*
 * mul_add(r, a, word, carry):
 *   {carry, r} = r + a * word + carry  (128-bit accumulate)
 * MULQ leaves the full product in %rdx:%rax (high:low).  The first
 * addq/adcq folds the incoming carry into the low half; the second folds
 * that sum into r; each overflow is propagated into the high half, which
 * becomes the outgoing carry.
 */
#define mul_add(r, a, word, carry)                                     \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+m"(r), "+d"(high)                                          \
        : "r"(carry), "g"(0)                                           \
        : "cc");                                                       \
    carry = high;                                                      \
  } while (0)
/*
 * mul(r, a, word, carry):
 *   {carry, r} = a * word + carry
 * Like mul_add() but r is overwritten rather than accumulated into, so
 * only one carry-propagation step is needed.
 */
#define mul(r, a, word, carry)                                         \
  do {                                                                 \
    register BN_ULONG high, low;                                       \
    asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "g"(a) : "cc"); \
    asm("addq %2,%0; adcq %3,%1"                                       \
        : "+r"(carry), "+d"(high)                                      \
        : "a"(low), "g"(0)                                             \
        : "cc");                                                       \
    (r) = carry, carry = high;                                         \
  } while (0)
#undef sqr
/*
 * sqr(r0, r1, a): {r1, r0} = a * a; r1 receives the high 64 bits.
 * NOTE(review): the expansion already ends in ';', so call sites'
 * trailing semicolons form harmless empty statements.
 */
#define sqr(r0, r1, a) asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");
  89. BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  90. BN_ULONG w) {
  91. BN_ULONG c1 = 0;
  92. if (num <= 0)
  93. return (c1);
  94. while (num & ~3) {
  95. mul_add(rp[0], ap[0], w, c1);
  96. mul_add(rp[1], ap[1], w, c1);
  97. mul_add(rp[2], ap[2], w, c1);
  98. mul_add(rp[3], ap[3], w, c1);
  99. ap += 4;
  100. rp += 4;
  101. num -= 4;
  102. }
  103. if (num) {
  104. mul_add(rp[0], ap[0], w, c1);
  105. if (--num == 0)
  106. return c1;
  107. mul_add(rp[1], ap[1], w, c1);
  108. if (--num == 0)
  109. return c1;
  110. mul_add(rp[2], ap[2], w, c1);
  111. return c1;
  112. }
  113. return (c1);
  114. }
  115. BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  116. BN_ULONG c1 = 0;
  117. if (num <= 0)
  118. return (c1);
  119. while (num & ~3) {
  120. mul(rp[0], ap[0], w, c1);
  121. mul(rp[1], ap[1], w, c1);
  122. mul(rp[2], ap[2], w, c1);
  123. mul(rp[3], ap[3], w, c1);
  124. ap += 4;
  125. rp += 4;
  126. num -= 4;
  127. }
  128. if (num) {
  129. mul(rp[0], ap[0], w, c1);
  130. if (--num == 0)
  131. return c1;
  132. mul(rp[1], ap[1], w, c1);
  133. if (--num == 0)
  134. return c1;
  135. mul(rp[2], ap[2], w, c1);
  136. }
  137. return (c1);
  138. }
  139. void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  140. if (n <= 0)
  141. return;
  142. while (n & ~3) {
  143. sqr(r[0], r[1], a[0]);
  144. sqr(r[2], r[3], a[1]);
  145. sqr(r[4], r[5], a[2]);
  146. sqr(r[6], r[7], a[3]);
  147. a += 4;
  148. r += 8;
  149. n -= 4;
  150. }
  151. if (n) {
  152. sqr(r[0], r[1], a[0]);
  153. if (--n == 0)
  154. return;
  155. sqr(r[2], r[3], a[1]);
  156. if (--n == 0)
  157. return;
  158. sqr(r[4], r[5], a[2]);
  159. }
  160. }
/*
 * bn_div_words: divides the 128-bit value h:l (h is the high word) by d
 * using the hardware DIVQ instruction and returns the 64-bit quotient.
 * The remainder DIVQ leaves in %rdx is discarded via |waste|.
 *
 * NOTE(review): DIVQ raises #DE when d == 0 or when the quotient does not
 * fit in 64 bits (h >= d); callers are presumably expected to guarantee
 * h < d — confirm against call sites.
 */
BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  BN_ULONG ret, waste;

  asm("divq %4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");

  return ret;
}
/*
 * bn_add_words: rp[i] = ap[i] + bp[i] for i in [0, n) with carry
 * propagation between words.  Returns the final carry (0 or 1).
 * n <= 0 is a no-op returning 0.
 *
 * The loop counter n is pinned to %rcx ("+c") because the LOOP
 * instruction decrements it implicitly; i is the word index scaled by 8
 * in the addressing modes.
 */
BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0)
    return 0;

  asm volatile (
      " subq %0,%0 \n" /* clear carry flag (and zero ret) */
      " jmp 1f \n"
      ".p2align 4 \n" /* align the loop head */
      "1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */
      " adcq (%5,%2,8),%0 \n"   /* ret += bp[i] + CF */
      " movq %0,(%3,%2,8) \n"   /* rp[i] = ret */
      " lea 1(%2),%2 \n"        /* i++ via LEA, which preserves CF */
      " loop 1b \n"             /* --n; branch while n != 0 (CF untouched) */
      " sbbq %0,%0 \n"          /* ret = CF ? ~0 : 0 */
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
  187. #ifndef SIMICS
/*
 * bn_sub_words: rp[i] = ap[i] - bp[i] for i in [0, n) with borrow
 * propagation between words.  Returns the final borrow (0 or 1).
 * n <= 0 is a no-op returning 0.  Mirrors bn_add_words with SBBQ in
 * place of ADCQ.
 */
BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
                      int n) {
  BN_ULONG ret;
  size_t i = 0;

  if (n <= 0)
    return 0;

  asm volatile (
      " subq %0,%0 \n" /* clear borrow (CF) and zero ret */
      " jmp 1f \n"
      ".p2align 4 \n" /* align the loop head */
      "1: movq (%4,%2,8),%0 \n" /* ret = ap[i] */
      " sbbq (%5,%2,8),%0 \n"   /* ret -= bp[i] + CF */
      " movq %0,(%3,%2,8) \n"   /* rp[i] = ret */
      " lea 1(%2),%2 \n"        /* i++ via LEA, which preserves CF */
      " loop 1b \n"             /* --n; branch while n != 0 (CF untouched) */
      " sbbq %0,%0 \n"          /* ret = CF ? ~0 : 0 */
      : "=&r"(ret), "+c"(n), "+r"(i)
      : "r"(rp), "r"(ap), "r"(bp)
      : "cc", "memory");

  return ret & 1;
}
  209. #else
  210. /* Simics 1.4<7 has buggy sbbq:-( */
  211. #define BN_MASK2 0xffffffffffffffffL
  212. BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
  213. BN_ULONG t1, t2;
  214. int c = 0;
  215. if (n <= 0)
  216. return ((BN_ULONG)0);
  217. for (;;) {
  218. t1 = a[0];
  219. t2 = b[0];
  220. r[0] = (t1 - t2 - c) & BN_MASK2;
  221. if (t1 != t2)
  222. c = (t1 < t2);
  223. if (--n <= 0)
  224. break;
  225. t1 = a[1];
  226. t2 = b[1];
  227. r[1] = (t1 - t2 - c) & BN_MASK2;
  228. if (t1 != t2)
  229. c = (t1 < t2);
  230. if (--n <= 0)
  231. break;
  232. t1 = a[2];
  233. t2 = b[2];
  234. r[2] = (t1 - t2 - c) & BN_MASK2;
  235. if (t1 != t2)
  236. c = (t1 < t2);
  237. if (--n <= 0)
  238. break;
  239. t1 = a[3];
  240. t2 = b[3];
  241. r[3] = (t1 - t2 - c) & BN_MASK2;
  242. if (t1 != t2)
  243. c = (t1 < t2);
  244. if (--n <= 0)
  245. break;
  246. a += 4;
  247. b += 4;
  248. r += 4;
  249. }
  250. return (c);
  251. }
  252. #endif
/* mul_add_c(a,b,c0,c1,c2) -- c+=a*b for three word number c=(c2,c1,c0) */
/* mul_add_c2(a,b,c0,c1,c2) -- c+=2*a*b for three word number c=(c2,c1,c0) */
/* sqr_add_c(a,i,c0,c1,c2) -- c+=a[i]^2 for three word number c=(c2,c1,c0) */
/* sqr_add_c2(a,i,c0,c1,c2) -- c+=2*a[i]*a[j] for three word number c=(c2,c1,c0)
 */
/*
 * All variants below require BN_ULONG temporaries t1 and t2 declared in
 * the enclosing scope (see the comba routines).
 */
#if 0
/* original macros are kept for reference purposes */
#define mul_add_c(a, b, c0, c1, c2) \
  {                                 \
    BN_ULONG ta = (a), tb = (b);    \
    t1 = ta * tb;                   \
    t2 = BN_UMULT_HIGH(ta, tb);     \
    c0 += t1;                       \
    t2 += (c0 < t1) ? 1 : 0;        \
    c1 += t2;                       \
    c2 += (c1 < t2) ? 1 : 0;        \
  }
#define mul_add_c2(a, b, c0, c1, c2)  \
  {                                   \
    BN_ULONG ta = (a), tb = (b), t0;  \
    t1 = BN_UMULT_HIGH(ta, tb);       \
    t0 = ta * tb;                     \
    t2 = t1 + t1;                     \
    c2 += (t2 < t1) ? 1 : 0;          \
    t1 = t0 + t0;                     \
    t2 += (t1 < t0) ? 1 : 0;          \
    c0 += t1;                         \
    t2 += (c0 < t1) ? 1 : 0;          \
    c1 += t2;                         \
    c2 += (c1 < t2) ? 1 : 0;          \
  }
#else
/*
 * mul_add_c: {t2, t1} = a*b, then c0 += t1 and the high half plus any
 * carries are propagated into c1 and c2.
 */
#define mul_add_c(a, b, c0, c1, c2)                                  \
  do {                                                               \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");     \
    asm("addq %2,%0; adcq %3,%1"                                     \
        : "+r"(c0), "+d"(t2)                                         \
        : "a"(t1), "g"(0)                                            \
        : "cc");                                                     \
    asm("addq %2,%0; adcq %3,%1"                                     \
        : "+r"(c1), "+r"(c2)                                         \
        : "d"(t2), "g"(0)                                            \
        : "cc");                                                     \
  } while (0)
/*
 * sqr_add_c: same accumulation as mul_add_c but with the square a[i]^2.
 */
#define sqr_add_c(a, i, c0, c1, c2)                              \
  do {                                                           \
    asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"(a[i]) : "cc");      \
    asm("addq %2,%0; adcq %3,%1"                                 \
        : "+r"(c0), "+d"(t2)                                     \
        : "a"(t1), "g"(0)                                        \
        : "cc");                                                 \
    asm("addq %2,%0; adcq %3,%1"                                 \
        : "+r"(c1), "+r"(c2)                                     \
        : "d"(t2), "g"(0)                                        \
        : "cc");                                                 \
  } while (0)
/*
 * mul_add_c2: like mul_add_c but the product is doubled first.  The two
 * addq-to-self instructions shift t2 then t1 left by one, spilling the
 * top bits into c2 and t2 respectively, before the usual accumulation.
 */
#define mul_add_c2(a, b, c0, c1, c2)                                       \
  do {                                                                     \
    asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");           \
    asm("addq %0,%0; adcq %2,%1" : "+d"(t2), "+r"(c2) : "g"(0) : "cc");    \
    asm("addq %0,%0; adcq %2,%1" : "+a"(t1), "+d"(t2) : "g"(0) : "cc");    \
    asm("addq %2,%0; adcq %3,%1"                                           \
        : "+r"(c0), "+d"(t2)                                               \
        : "a"(t1), "g"(0)                                                  \
        : "cc");                                                           \
    asm("addq %2,%0; adcq %3,%1"                                           \
        : "+r"(c1), "+r"(c2)                                               \
        : "d"(t2), "g"(0)                                                  \
        : "cc");                                                           \
  } while (0)
#endif
/* sqr_add_c2 is mul_add_c2 applied to a pair of words of the same array. */
#define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
/*
 * bn_mul_comba8: 8x8-word Comba multiplication, r[0..15] = a[0..7]*b[0..7].
 * The product is built column by column: result word k is the sum of all
 * partial products a[i]*b[j] with i + j == k, gathered into the rotating
 * three-word accumulator (c1, c2, c3) by mul_add_c().  After each column
 * the lowest accumulator word is stored and reset, and the roles rotate.
 */
void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  /* Temporaries required by the mul_add_c() asm macro. */
  BN_ULONG t1, t2;
  /* Three-word column accumulator. */
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[4], b[0], c2, c3, c1);
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  mul_add_c(a[0], b[4], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[0], b[5], c3, c1, c2);
  mul_add_c(a[1], b[4], c3, c1, c2);
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  mul_add_c(a[4], b[1], c3, c1, c2);
  mul_add_c(a[5], b[0], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  mul_add_c(a[6], b[0], c1, c2, c3);
  mul_add_c(a[5], b[1], c1, c2, c3);
  mul_add_c(a[4], b[2], c1, c2, c3);
  mul_add_c(a[3], b[3], c1, c2, c3);
  mul_add_c(a[2], b[4], c1, c2, c3);
  mul_add_c(a[1], b[5], c1, c2, c3);
  mul_add_c(a[0], b[6], c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 */
  mul_add_c(a[0], b[7], c2, c3, c1);
  mul_add_c(a[1], b[6], c2, c3, c1);
  mul_add_c(a[2], b[5], c2, c3, c1);
  mul_add_c(a[3], b[4], c2, c3, c1);
  mul_add_c(a[4], b[3], c2, c3, c1);
  mul_add_c(a[5], b[2], c2, c3, c1);
  mul_add_c(a[6], b[1], c2, c3, c1);
  mul_add_c(a[7], b[0], c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  mul_add_c(a[7], b[1], c3, c1, c2);
  mul_add_c(a[6], b[2], c3, c1, c2);
  mul_add_c(a[5], b[3], c3, c1, c2);
  mul_add_c(a[4], b[4], c3, c1, c2);
  mul_add_c(a[3], b[5], c3, c1, c2);
  mul_add_c(a[2], b[6], c3, c1, c2);
  mul_add_c(a[1], b[7], c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  mul_add_c(a[2], b[7], c1, c2, c3);
  mul_add_c(a[3], b[6], c1, c2, c3);
  mul_add_c(a[4], b[5], c1, c2, c3);
  mul_add_c(a[5], b[4], c1, c2, c3);
  mul_add_c(a[6], b[3], c1, c2, c3);
  mul_add_c(a[7], b[2], c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  mul_add_c(a[7], b[3], c2, c3, c1);
  mul_add_c(a[6], b[4], c2, c3, c1);
  mul_add_c(a[5], b[5], c2, c3, c1);
  mul_add_c(a[4], b[6], c2, c3, c1);
  mul_add_c(a[3], b[7], c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  mul_add_c(a[4], b[7], c3, c1, c2);
  mul_add_c(a[5], b[6], c3, c1, c2);
  mul_add_c(a[6], b[5], c3, c1, c2);
  mul_add_c(a[7], b[4], c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  mul_add_c(a[7], b[5], c1, c2, c3);
  mul_add_c(a[6], b[6], c1, c2, c3);
  mul_add_c(a[5], b[7], c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  mul_add_c(a[6], b[7], c2, c3, c1);
  mul_add_c(a[7], b[6], c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* column 14 (and the final carry into r[15]) */
  mul_add_c(a[7], b[7], c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
/*
 * bn_mul_comba4: 4x4-word Comba multiplication, r[0..7] = a[0..3]*b[0..3].
 * Same column-by-column scheme as bn_mul_comba8: result word k sums all
 * a[i]*b[j] with i + j == k through the rotating accumulator (c1, c2, c3).
 */
void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  /* Temporaries required by the mul_add_c() asm macro. */
  BN_ULONG t1, t2;
  /* Three-word column accumulator. */
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  mul_add_c(a[0], b[0], c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  mul_add_c(a[0], b[1], c2, c3, c1);
  mul_add_c(a[1], b[0], c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  mul_add_c(a[2], b[0], c3, c1, c2);
  mul_add_c(a[1], b[1], c3, c1, c2);
  mul_add_c(a[0], b[2], c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  mul_add_c(a[0], b[3], c1, c2, c3);
  mul_add_c(a[1], b[2], c1, c2, c3);
  mul_add_c(a[2], b[1], c1, c2, c3);
  mul_add_c(a[3], b[0], c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  mul_add_c(a[3], b[1], c2, c3, c1);
  mul_add_c(a[2], b[2], c2, c3, c1);
  mul_add_c(a[1], b[3], c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  mul_add_c(a[2], b[3], c3, c1, c2);
  mul_add_c(a[3], b[2], c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 (and the final carry into r[7]) */
  mul_add_c(a[3], b[3], c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
/*
 * bn_sqr_comba8: 8-word Comba squaring, r[0..15] = a[0..7]^2.
 * Column k gathers all a[i]*a[j] with i + j == k: diagonal terms (i == j)
 * via sqr_add_c(), and each off-diagonal pair once but doubled via
 * sqr_add_c2(), halving the number of multiplications versus a general
 * multiply.
 */
void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  /* Temporaries required by the sqr_add_c*() asm macros. */
  BN_ULONG t1, t2;
  /* Three-word column accumulator. */
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  sqr_add_c2(a, 4, 0, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 5, 0, c3, c1, c2);
  sqr_add_c2(a, 4, 1, c3, c1, c2);
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 */
  sqr_add_c(a, 3, c1, c2, c3);
  sqr_add_c2(a, 4, 2, c1, c2, c3);
  sqr_add_c2(a, 5, 1, c1, c2, c3);
  sqr_add_c2(a, 6, 0, c1, c2, c3);
  r[6] = c1;
  c1 = 0;
  /* column 7 */
  sqr_add_c2(a, 7, 0, c2, c3, c1);
  sqr_add_c2(a, 6, 1, c2, c3, c1);
  sqr_add_c2(a, 5, 2, c2, c3, c1);
  sqr_add_c2(a, 4, 3, c2, c3, c1);
  r[7] = c2;
  c2 = 0;
  /* column 8 */
  sqr_add_c(a, 4, c3, c1, c2);
  sqr_add_c2(a, 5, 3, c3, c1, c2);
  sqr_add_c2(a, 6, 2, c3, c1, c2);
  sqr_add_c2(a, 7, 1, c3, c1, c2);
  r[8] = c3;
  c3 = 0;
  /* column 9 */
  sqr_add_c2(a, 7, 2, c1, c2, c3);
  sqr_add_c2(a, 6, 3, c1, c2, c3);
  sqr_add_c2(a, 5, 4, c1, c2, c3);
  r[9] = c1;
  c1 = 0;
  /* column 10 */
  sqr_add_c(a, 5, c2, c3, c1);
  sqr_add_c2(a, 6, 4, c2, c3, c1);
  sqr_add_c2(a, 7, 3, c2, c3, c1);
  r[10] = c2;
  c2 = 0;
  /* column 11 */
  sqr_add_c2(a, 7, 4, c3, c1, c2);
  sqr_add_c2(a, 6, 5, c3, c1, c2);
  r[11] = c3;
  c3 = 0;
  /* column 12 */
  sqr_add_c(a, 6, c1, c2, c3);
  sqr_add_c2(a, 7, 5, c1, c2, c3);
  r[12] = c1;
  c1 = 0;
  /* column 13 */
  sqr_add_c2(a, 7, 6, c2, c3, c1);
  r[13] = c2;
  c2 = 0;
  /* column 14 (and the final carry into r[15]) */
  sqr_add_c(a, 7, c3, c1, c2);
  r[14] = c3;
  r[15] = c1;
}
/*
 * bn_sqr_comba4: 4-word Comba squaring, r[0..7] = a[0..3]^2.
 * Same column scheme as bn_sqr_comba8: diagonal terms via sqr_add_c(),
 * off-diagonal pairs doubled via sqr_add_c2().
 */
void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  /* Temporaries required by the sqr_add_c*() asm macros. */
  BN_ULONG t1, t2;
  /* Three-word column accumulator. */
  BN_ULONG c1, c2, c3;

  c1 = 0;
  c2 = 0;
  c3 = 0;
  /* column 0 */
  sqr_add_c(a, 0, c1, c2, c3);
  r[0] = c1;
  c1 = 0;
  /* column 1 */
  sqr_add_c2(a, 1, 0, c2, c3, c1);
  r[1] = c2;
  c2 = 0;
  /* column 2 */
  sqr_add_c(a, 1, c3, c1, c2);
  sqr_add_c2(a, 2, 0, c3, c1, c2);
  r[2] = c3;
  c3 = 0;
  /* column 3 */
  sqr_add_c2(a, 3, 0, c1, c2, c3);
  sqr_add_c2(a, 2, 1, c1, c2, c3);
  r[3] = c1;
  c1 = 0;
  /* column 4 */
  sqr_add_c(a, 2, c2, c3, c1);
  sqr_add_c2(a, 3, 1, c2, c3, c1);
  r[4] = c2;
  c2 = 0;
  /* column 5 */
  sqr_add_c2(a, 3, 2, c3, c1, c2);
  r[5] = c3;
  c3 = 0;
  /* column 6 (and the final carry into r[7]) */
  sqr_add_c(a, 3, c1, c2, c3);
  r[6] = c1;
  r[7] = c2;
}
  567. #endif /* defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS) */