No puede seleccionar más de 25 temas Los temas deben comenzar con una letra o número, pueden incluir guiones ('-') y pueden tener hasta 35 caracteres de largo.

x86_64-gcc.c 16 KiB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599
  1. #include <openssl/bn.h>
  2. #if !defined(OPENSSL_NO_ASM) && defined(OPENSSL_X86_64) && !defined(OPENSSL_WINDOWS)
  3. #include "../internal.h"
  4. /* x86_64 BIGNUM accelerator version 0.1, December 2002.
  5. *
  6. * Implemented by Andy Polyakov <appro@fy.chalmers.se> for the OpenSSL
  7. * project.
  8. *
  9. * Rights for redistribution and usage in source and binary forms are
  10. * granted according to the OpenSSL license. Warranty of any kind is
  11. * disclaimed.
  12. *
  13. * Q. Version 0.1? It doesn't sound like Andy, he used to assign real
  14. * versions, like 1.0...
  15. * A. Well, that's because this code is basically a quick-n-dirty
  16. * proof-of-concept hack. As you can see it's implemented with
  17. * inline assembler, which means that you're bound to GCC and that
  18. * there might be enough room for further improvement.
  19. *
  20. * Q. Why inline assembler?
  21. * A. x86_64 features its own ABI, which I'm not familiar with. This is
  22. * why I decided to let the compiler take care of subroutine
  23. * prologue/epilogue as well as register allocation. For reference,
  24. * Win64 implements an ABI for AMD64 that is different from Linux's.
  25. *
  26. * Q. How much faster does it get?
  27. * A. 'apps/openssl speed rsa dsa' output with no-asm:
  28. *
  29. * sign verify sign/s verify/s
  30. * rsa 512 bits 0.0006s 0.0001s 1683.8 18456.2
  31. * rsa 1024 bits 0.0028s 0.0002s 356.0 6407.0
  32. * rsa 2048 bits 0.0172s 0.0005s 58.0 1957.8
  33. * rsa 4096 bits 0.1155s 0.0018s 8.7 555.6
  34. * sign verify sign/s verify/s
  35. * dsa 512 bits 0.0005s 0.0006s 2100.8 1768.3
  36. * dsa 1024 bits 0.0014s 0.0018s 692.3 559.2
  37. * dsa 2048 bits 0.0049s 0.0061s 204.7 165.0
  38. *
  39. * 'apps/openssl speed rsa dsa' output with this module:
  40. *
  41. * sign verify sign/s verify/s
  42. * rsa 512 bits 0.0004s 0.0000s 2767.1 33297.9
  43. * rsa 1024 bits 0.0012s 0.0001s 867.4 14674.7
  44. * rsa 2048 bits 0.0061s 0.0002s 164.0 5270.0
  45. * rsa 4096 bits 0.0384s 0.0006s 26.1 1650.8
  46. * sign verify sign/s verify/s
  47. * dsa 512 bits 0.0002s 0.0003s 4442.2 3786.3
  48. * dsa 1024 bits 0.0005s 0.0007s 1835.1 1497.4
  49. * dsa 2048 bits 0.0016s 0.0020s 620.4 504.6
  50. *
  51. * For reference, the IA-32 assembler implementation performs
  52. * very much like 64-bit code compiled with no-asm on the same
  53. * machine.
  54. */
  55. /* TODO(davidben): Get this file working on Windows x64. */
  #undef mul
  #undef mul_add

  #define asm __asm__

  /*
   * Notes on the operand constraints used below:
   *   - "m"(a) and "+m"(r) keep those operands in memory, which is the way to
   *     favor DirectPath micro-code;
   *   - "g"(0) lets the compiler decide where it wants to keep the value zero.
   */

  /*
   * mul_add(r, a, word, carry)
   *
   * Computes the two-word value r + a * word + carry, stores its low word
   * back into r and its high word into carry. r and a are bound with memory
   * constraints and therefore must be lvalues; carry must be a BN_ULONG
   * lvalue.
   */
  #define mul_add(r, a, word, carry)                                     \
    do {                                                                 \
      register BN_ULONG high, low;                                       \
      /* high:low = a * word */                                          \
      asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "m"(a) : "cc"); \
      /* carry += low, propagating the carry-out into high */            \
      asm("addq %2,%0; adcq %3,%1"                                       \
          : "+r"(carry), "+d"(high)                                      \
          : "a"(low), "g"(0)                                             \
          : "cc");                                                       \
      /* r += carry, propagating the carry-out into high */              \
      asm("addq %2,%0; adcq %3,%1"                                       \
          : "+m"(r), "+d"(high)                                          \
          : "r"(carry), "g"(0)                                           \
          : "cc");                                                       \
      carry = high;                                                      \
    } while (0)

  /*
   * mul(r, a, word, carry)
   *
   * Computes the two-word value a * word + carry, stores its low word into r
   * and its high word into carry.
   *
   * The multiplicand is bound with "rm" rather than "g": "g" would also allow
   * an immediate operand, which mulq cannot encode.
   */
  #define mul(r, a, word, carry)                                          \
    do {                                                                  \
      register BN_ULONG high, low;                                        \
      /* high:low = a * word */                                           \
      asm("mulq %3" : "=a"(low), "=d"(high) : "a"(word), "rm"(a) : "cc"); \
      /* carry += low, propagating the carry-out into high */             \
      asm("addq %2,%0; adcq %3,%1"                                        \
          : "+r"(carry), "+d"(high)                                       \
          : "a"(low), "g"(0)                                              \
          : "cc");                                                        \
      (r) = carry;                                                        \
      carry = high;                                                       \
    } while (0)

  #undef sqr

  /*
   * sqr(r0, r1, a)
   *
   * Stores the low word of a * a into r0 and the high word into r1.
   *
   * Wrapped in do/while(0) with no trailing semicolon so that `sqr(...);`
   * expands to exactly one statement and can be used in an if/else.
   */
  #define sqr(r0, r1, a)                                       \
    do {                                                       \
      asm("mulq %2" : "=a"(r0), "=d"(r1) : "a"(a) : "cc");     \
    } while (0)
  90. BN_ULONG bn_mul_add_words(BN_ULONG *rp, const BN_ULONG *ap, int num,
  91. BN_ULONG w) {
  92. BN_ULONG c1 = 0;
  93. if (num <= 0) {
  94. return (c1);
  95. }
  96. while (num & ~3) {
  97. mul_add(rp[0], ap[0], w, c1);
  98. mul_add(rp[1], ap[1], w, c1);
  99. mul_add(rp[2], ap[2], w, c1);
  100. mul_add(rp[3], ap[3], w, c1);
  101. ap += 4;
  102. rp += 4;
  103. num -= 4;
  104. }
  105. if (num) {
  106. mul_add(rp[0], ap[0], w, c1);
  107. if (--num == 0) {
  108. return c1;
  109. }
  110. mul_add(rp[1], ap[1], w, c1);
  111. if (--num == 0) {
  112. return c1;
  113. }
  114. mul_add(rp[2], ap[2], w, c1);
  115. return c1;
  116. }
  117. return c1;
  118. }
  119. BN_ULONG bn_mul_words(BN_ULONG *rp, const BN_ULONG *ap, int num, BN_ULONG w) {
  120. BN_ULONG c1 = 0;
  121. if (num <= 0) {
  122. return c1;
  123. }
  124. while (num & ~3) {
  125. mul(rp[0], ap[0], w, c1);
  126. mul(rp[1], ap[1], w, c1);
  127. mul(rp[2], ap[2], w, c1);
  128. mul(rp[3], ap[3], w, c1);
  129. ap += 4;
  130. rp += 4;
  131. num -= 4;
  132. }
  133. if (num) {
  134. mul(rp[0], ap[0], w, c1);
  135. if (--num == 0) {
  136. return c1;
  137. }
  138. mul(rp[1], ap[1], w, c1);
  139. if (--num == 0) {
  140. return c1;
  141. }
  142. mul(rp[2], ap[2], w, c1);
  143. }
  144. return c1;
  145. }
  146. void bn_sqr_words(BN_ULONG *r, const BN_ULONG *a, int n) {
  147. if (n <= 0) {
  148. return;
  149. }
  150. while (n & ~3) {
  151. sqr(r[0], r[1], a[0]);
  152. sqr(r[2], r[3], a[1]);
  153. sqr(r[4], r[5], a[2]);
  154. sqr(r[6], r[7], a[3]);
  155. a += 4;
  156. r += 8;
  157. n -= 4;
  158. }
  159. if (n) {
  160. sqr(r[0], r[1], a[0]);
  161. if (--n == 0) {
  162. return;
  163. }
  164. sqr(r[2], r[3], a[1]);
  165. if (--n == 0) {
  166. return;
  167. }
  168. sqr(r[4], r[5], a[2]);
  169. }
  170. }
  171. BN_ULONG bn_div_words(BN_ULONG h, BN_ULONG l, BN_ULONG d) {
  172. BN_ULONG ret, waste;
  173. asm("divq %4" : "=a"(ret), "=d"(waste) : "a"(l), "d"(h), "g"(d) : "cc");
  174. return ret;
  175. }
  176. BN_ULONG bn_add_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  177. int n) {
  178. BN_ULONG ret;
  179. size_t i = 0;
  180. if (n <= 0) {
  181. return 0;
  182. }
  183. asm volatile (
  184. " subq %0,%0 \n" /* clear carry */
  185. " jmp 1f \n"
  186. ".p2align 4 \n"
  187. "1: movq (%4,%2,8),%0 \n"
  188. " adcq (%5,%2,8),%0 \n"
  189. " movq %0,(%3,%2,8) \n"
  190. " lea 1(%2),%2 \n"
  191. " loop 1b \n"
  192. " sbbq %0,%0 \n"
  193. : "=&r"(ret), "+c"(n), "+r"(i)
  194. : "r"(rp), "r"(ap), "r"(bp)
  195. : "cc", "memory");
  196. return ret & 1;
  197. }
  198. #ifndef SIMICS
  199. BN_ULONG bn_sub_words(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
  200. int n) {
  201. BN_ULONG ret;
  202. size_t i = 0;
  203. if (n <= 0) {
  204. return 0;
  205. }
  206. asm volatile (
  207. " subq %0,%0 \n" /* clear borrow */
  208. " jmp 1f \n"
  209. ".p2align 4 \n"
  210. "1: movq (%4,%2,8),%0 \n"
  211. " sbbq (%5,%2,8),%0 \n"
  212. " movq %0,(%3,%2,8) \n"
  213. " lea 1(%2),%2 \n"
  214. " loop 1b \n"
  215. " sbbq %0,%0 \n"
  216. : "=&r"(ret), "+c"(n), "+r"(i)
  217. : "r"(rp), "r"(ap), "r"(bp)
  218. : "cc", "memory");
  219. return ret & 1;
  220. }
  221. #else
  222. /* Simics 1.4<7 has buggy sbbq:-( */
  223. #define BN_MASK2 0xffffffffffffffffL
  224. BN_ULONG bn_sub_words(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b, int n) {
  225. BN_ULONG t1, t2;
  226. int c = 0;
  227. if (n <= 0) {
  228. return (BN_ULONG)0;
  229. }
  230. for (;;) {
  231. t1 = a[0];
  232. t2 = b[0];
  233. r[0] = (t1 - t2 - c) & BN_MASK2;
  234. if (t1 != t2) {
  235. c = (t1 < t2);
  236. }
  237. if (--n <= 0) {
  238. break;
  239. }
  240. t1 = a[1];
  241. t2 = b[1];
  242. r[1] = (t1 - t2 - c) & BN_MASK2;
  243. if (t1 != t2) {
  244. c = (t1 < t2);
  245. }
  246. if (--n <= 0) {
  247. break;
  248. }
  249. t1 = a[2];
  250. t2 = b[2];
  251. r[2] = (t1 - t2 - c) & BN_MASK2;
  252. if (t1 != t2) {
  253. c = (t1 < t2);
  254. }
  255. if (--n <= 0) {
  256. break;
  257. }
  258. t1 = a[3];
  259. t2 = b[3];
  260. r[3] = (t1 - t2 - c) & BN_MASK2;
  261. if (t1 != t2) {
  262. c = (t1 < t2);
  263. }
  264. if (--n <= 0) {
  265. break;
  266. }
  267. a += 4;
  268. b += 4;
  269. r += 4;
  270. }
  271. return c;
  272. }
  273. #endif
  /*
   * Helpers for the comba (column-wise) routines below. In each of them
   * c = (c2, c1, c0) is a three-word accumulator, c0 being the least
   * significant word:
   *
   *   mul_add_c(a, b, c0, c1, c2)        c += a * b
   *   mul_add_c2(a, b, c0, c1, c2)       c += 2 * a * b
   *   sqr_add_c(a, i, c0, c1, c2)        c += a[i]^2
   *   sqr_add_c2(a, i, j, c0, c1, c2)    c += 2 * a[i] * a[j]
   *
   * Keep in mind that carrying into the high part of a multiplication result
   * cannot overflow, because it cannot be all-ones.
   *
   * The `b` argument of mul_add_c and mul_add_c2 is bound with an "m"
   * constraint, so it must be an lvalue.
   */
  #define mul_add_c(a, b, c0, c1, c2)                               \
    do {                                                            \
      BN_ULONG t1, t2;                                              \
      /* t2:t1 = a * b */                                           \
      asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");  \
      /* (c2, c1, c0) += (0, t2, t1) */                             \
      asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
          : "+r"(c0), "+r"(c1), "+r"(c2)                            \
          : "r"(t1), "r"(t2), "g"(0)                                \
          : "cc");                                                  \
    } while (0)

  /* The array argument is parenthesized so that a pointer expression such as
   * `p + 1` indexes correctly. */
  #define sqr_add_c(a, i, c0, c1, c2)                               \
    do {                                                            \
      BN_ULONG t1, t2;                                              \
      /* t2:t1 = a[i] * a[i] */                                     \
      asm("mulq %2" : "=a"(t1), "=d"(t2) : "a"((a)[i]) : "cc");     \
      /* (c2, c1, c0) += (0, t2, t1) */                             \
      asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
          : "+r"(c0), "+r"(c1), "+r"(c2)                            \
          : "r"(t1), "r"(t2), "g"(0)                                \
          : "cc");                                                  \
    } while (0)

  /* The product is computed once and then added to the accumulator twice. */
  #define mul_add_c2(a, b, c0, c1, c2)                              \
    do {                                                            \
      BN_ULONG t1, t2;                                              \
      /* t2:t1 = a * b */                                           \
      asm("mulq %3" : "=a"(t1), "=d"(t2) : "a"(a), "m"(b) : "cc");  \
      /* first addition of (0, t2, t1) */                           \
      asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
          : "+r"(c0), "+r"(c1), "+r"(c2)                            \
          : "r"(t1), "r"(t2), "g"(0)                                \
          : "cc");                                                  \
      /* second addition of (0, t2, t1) */                          \
      asm("addq %3,%0; adcq %4,%1; adcq %5,%2"                      \
          : "+r"(c0), "+r"(c1), "+r"(c2)                            \
          : "r"(t1), "r"(t2), "g"(0)                                \
          : "cc");                                                  \
    } while (0)

  #define sqr_add_c2(a, i, j, c0, c1, c2) mul_add_c2((a)[i], (a)[j], c0, c1, c2)
  313. void bn_mul_comba8(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  314. BN_ULONG c1, c2, c3;
  315. c1 = 0;
  316. c2 = 0;
  317. c3 = 0;
  318. mul_add_c(a[0], b[0], c1, c2, c3);
  319. r[0] = c1;
  320. c1 = 0;
  321. mul_add_c(a[0], b[1], c2, c3, c1);
  322. mul_add_c(a[1], b[0], c2, c3, c1);
  323. r[1] = c2;
  324. c2 = 0;
  325. mul_add_c(a[2], b[0], c3, c1, c2);
  326. mul_add_c(a[1], b[1], c3, c1, c2);
  327. mul_add_c(a[0], b[2], c3, c1, c2);
  328. r[2] = c3;
  329. c3 = 0;
  330. mul_add_c(a[0], b[3], c1, c2, c3);
  331. mul_add_c(a[1], b[2], c1, c2, c3);
  332. mul_add_c(a[2], b[1], c1, c2, c3);
  333. mul_add_c(a[3], b[0], c1, c2, c3);
  334. r[3] = c1;
  335. c1 = 0;
  336. mul_add_c(a[4], b[0], c2, c3, c1);
  337. mul_add_c(a[3], b[1], c2, c3, c1);
  338. mul_add_c(a[2], b[2], c2, c3, c1);
  339. mul_add_c(a[1], b[3], c2, c3, c1);
  340. mul_add_c(a[0], b[4], c2, c3, c1);
  341. r[4] = c2;
  342. c2 = 0;
  343. mul_add_c(a[0], b[5], c3, c1, c2);
  344. mul_add_c(a[1], b[4], c3, c1, c2);
  345. mul_add_c(a[2], b[3], c3, c1, c2);
  346. mul_add_c(a[3], b[2], c3, c1, c2);
  347. mul_add_c(a[4], b[1], c3, c1, c2);
  348. mul_add_c(a[5], b[0], c3, c1, c2);
  349. r[5] = c3;
  350. c3 = 0;
  351. mul_add_c(a[6], b[0], c1, c2, c3);
  352. mul_add_c(a[5], b[1], c1, c2, c3);
  353. mul_add_c(a[4], b[2], c1, c2, c3);
  354. mul_add_c(a[3], b[3], c1, c2, c3);
  355. mul_add_c(a[2], b[4], c1, c2, c3);
  356. mul_add_c(a[1], b[5], c1, c2, c3);
  357. mul_add_c(a[0], b[6], c1, c2, c3);
  358. r[6] = c1;
  359. c1 = 0;
  360. mul_add_c(a[0], b[7], c2, c3, c1);
  361. mul_add_c(a[1], b[6], c2, c3, c1);
  362. mul_add_c(a[2], b[5], c2, c3, c1);
  363. mul_add_c(a[3], b[4], c2, c3, c1);
  364. mul_add_c(a[4], b[3], c2, c3, c1);
  365. mul_add_c(a[5], b[2], c2, c3, c1);
  366. mul_add_c(a[6], b[1], c2, c3, c1);
  367. mul_add_c(a[7], b[0], c2, c3, c1);
  368. r[7] = c2;
  369. c2 = 0;
  370. mul_add_c(a[7], b[1], c3, c1, c2);
  371. mul_add_c(a[6], b[2], c3, c1, c2);
  372. mul_add_c(a[5], b[3], c3, c1, c2);
  373. mul_add_c(a[4], b[4], c3, c1, c2);
  374. mul_add_c(a[3], b[5], c3, c1, c2);
  375. mul_add_c(a[2], b[6], c3, c1, c2);
  376. mul_add_c(a[1], b[7], c3, c1, c2);
  377. r[8] = c3;
  378. c3 = 0;
  379. mul_add_c(a[2], b[7], c1, c2, c3);
  380. mul_add_c(a[3], b[6], c1, c2, c3);
  381. mul_add_c(a[4], b[5], c1, c2, c3);
  382. mul_add_c(a[5], b[4], c1, c2, c3);
  383. mul_add_c(a[6], b[3], c1, c2, c3);
  384. mul_add_c(a[7], b[2], c1, c2, c3);
  385. r[9] = c1;
  386. c1 = 0;
  387. mul_add_c(a[7], b[3], c2, c3, c1);
  388. mul_add_c(a[6], b[4], c2, c3, c1);
  389. mul_add_c(a[5], b[5], c2, c3, c1);
  390. mul_add_c(a[4], b[6], c2, c3, c1);
  391. mul_add_c(a[3], b[7], c2, c3, c1);
  392. r[10] = c2;
  393. c2 = 0;
  394. mul_add_c(a[4], b[7], c3, c1, c2);
  395. mul_add_c(a[5], b[6], c3, c1, c2);
  396. mul_add_c(a[6], b[5], c3, c1, c2);
  397. mul_add_c(a[7], b[4], c3, c1, c2);
  398. r[11] = c3;
  399. c3 = 0;
  400. mul_add_c(a[7], b[5], c1, c2, c3);
  401. mul_add_c(a[6], b[6], c1, c2, c3);
  402. mul_add_c(a[5], b[7], c1, c2, c3);
  403. r[12] = c1;
  404. c1 = 0;
  405. mul_add_c(a[6], b[7], c2, c3, c1);
  406. mul_add_c(a[7], b[6], c2, c3, c1);
  407. r[13] = c2;
  408. c2 = 0;
  409. mul_add_c(a[7], b[7], c3, c1, c2);
  410. r[14] = c3;
  411. r[15] = c1;
  412. }
  413. void bn_mul_comba4(BN_ULONG *r, BN_ULONG *a, BN_ULONG *b) {
  414. BN_ULONG c1, c2, c3;
  415. c1 = 0;
  416. c2 = 0;
  417. c3 = 0;
  418. mul_add_c(a[0], b[0], c1, c2, c3);
  419. r[0] = c1;
  420. c1 = 0;
  421. mul_add_c(a[0], b[1], c2, c3, c1);
  422. mul_add_c(a[1], b[0], c2, c3, c1);
  423. r[1] = c2;
  424. c2 = 0;
  425. mul_add_c(a[2], b[0], c3, c1, c2);
  426. mul_add_c(a[1], b[1], c3, c1, c2);
  427. mul_add_c(a[0], b[2], c3, c1, c2);
  428. r[2] = c3;
  429. c3 = 0;
  430. mul_add_c(a[0], b[3], c1, c2, c3);
  431. mul_add_c(a[1], b[2], c1, c2, c3);
  432. mul_add_c(a[2], b[1], c1, c2, c3);
  433. mul_add_c(a[3], b[0], c1, c2, c3);
  434. r[3] = c1;
  435. c1 = 0;
  436. mul_add_c(a[3], b[1], c2, c3, c1);
  437. mul_add_c(a[2], b[2], c2, c3, c1);
  438. mul_add_c(a[1], b[3], c2, c3, c1);
  439. r[4] = c2;
  440. c2 = 0;
  441. mul_add_c(a[2], b[3], c3, c1, c2);
  442. mul_add_c(a[3], b[2], c3, c1, c2);
  443. r[5] = c3;
  444. c3 = 0;
  445. mul_add_c(a[3], b[3], c1, c2, c3);
  446. r[6] = c1;
  447. r[7] = c2;
  448. }
  449. void bn_sqr_comba8(BN_ULONG *r, const BN_ULONG *a) {
  450. BN_ULONG c1, c2, c3;
  451. c1 = 0;
  452. c2 = 0;
  453. c3 = 0;
  454. sqr_add_c(a, 0, c1, c2, c3);
  455. r[0] = c1;
  456. c1 = 0;
  457. sqr_add_c2(a, 1, 0, c2, c3, c1);
  458. r[1] = c2;
  459. c2 = 0;
  460. sqr_add_c(a, 1, c3, c1, c2);
  461. sqr_add_c2(a, 2, 0, c3, c1, c2);
  462. r[2] = c3;
  463. c3 = 0;
  464. sqr_add_c2(a, 3, 0, c1, c2, c3);
  465. sqr_add_c2(a, 2, 1, c1, c2, c3);
  466. r[3] = c1;
  467. c1 = 0;
  468. sqr_add_c(a, 2, c2, c3, c1);
  469. sqr_add_c2(a, 3, 1, c2, c3, c1);
  470. sqr_add_c2(a, 4, 0, c2, c3, c1);
  471. r[4] = c2;
  472. c2 = 0;
  473. sqr_add_c2(a, 5, 0, c3, c1, c2);
  474. sqr_add_c2(a, 4, 1, c3, c1, c2);
  475. sqr_add_c2(a, 3, 2, c3, c1, c2);
  476. r[5] = c3;
  477. c3 = 0;
  478. sqr_add_c(a, 3, c1, c2, c3);
  479. sqr_add_c2(a, 4, 2, c1, c2, c3);
  480. sqr_add_c2(a, 5, 1, c1, c2, c3);
  481. sqr_add_c2(a, 6, 0, c1, c2, c3);
  482. r[6] = c1;
  483. c1 = 0;
  484. sqr_add_c2(a, 7, 0, c2, c3, c1);
  485. sqr_add_c2(a, 6, 1, c2, c3, c1);
  486. sqr_add_c2(a, 5, 2, c2, c3, c1);
  487. sqr_add_c2(a, 4, 3, c2, c3, c1);
  488. r[7] = c2;
  489. c2 = 0;
  490. sqr_add_c(a, 4, c3, c1, c2);
  491. sqr_add_c2(a, 5, 3, c3, c1, c2);
  492. sqr_add_c2(a, 6, 2, c3, c1, c2);
  493. sqr_add_c2(a, 7, 1, c3, c1, c2);
  494. r[8] = c3;
  495. c3 = 0;
  496. sqr_add_c2(a, 7, 2, c1, c2, c3);
  497. sqr_add_c2(a, 6, 3, c1, c2, c3);
  498. sqr_add_c2(a, 5, 4, c1, c2, c3);
  499. r[9] = c1;
  500. c1 = 0;
  501. sqr_add_c(a, 5, c2, c3, c1);
  502. sqr_add_c2(a, 6, 4, c2, c3, c1);
  503. sqr_add_c2(a, 7, 3, c2, c3, c1);
  504. r[10] = c2;
  505. c2 = 0;
  506. sqr_add_c2(a, 7, 4, c3, c1, c2);
  507. sqr_add_c2(a, 6, 5, c3, c1, c2);
  508. r[11] = c3;
  509. c3 = 0;
  510. sqr_add_c(a, 6, c1, c2, c3);
  511. sqr_add_c2(a, 7, 5, c1, c2, c3);
  512. r[12] = c1;
  513. c1 = 0;
  514. sqr_add_c2(a, 7, 6, c2, c3, c1);
  515. r[13] = c2;
  516. c2 = 0;
  517. sqr_add_c(a, 7, c3, c1, c2);
  518. r[14] = c3;
  519. r[15] = c1;
  520. }
  521. void bn_sqr_comba4(BN_ULONG *r, const BN_ULONG *a) {
  522. BN_ULONG c1, c2, c3;
  523. c1 = 0;
  524. c2 = 0;
  525. c3 = 0;
  526. sqr_add_c(a, 0, c1, c2, c3);
  527. r[0] = c1;
  528. c1 = 0;
  529. sqr_add_c2(a, 1, 0, c2, c3, c1);
  530. r[1] = c2;
  531. c2 = 0;
  532. sqr_add_c(a, 1, c3, c1, c2);
  533. sqr_add_c2(a, 2, 0, c3, c1, c2);
  534. r[2] = c3;
  535. c3 = 0;
  536. sqr_add_c2(a, 3, 0, c1, c2, c3);
  537. sqr_add_c2(a, 2, 1, c1, c2, c3);
  538. r[3] = c1;
  539. c1 = 0;
  540. sqr_add_c(a, 2, c2, c3, c1);
  541. sqr_add_c2(a, 3, 1, c2, c3, c1);
  542. r[4] = c2;
  543. c2 = 0;
  544. sqr_add_c2(a, 3, 2, c3, c1, c2);
  545. r[5] = c3;
  546. c3 = 0;
  547. sqr_add_c(a, 3, c1, c2, c3);
  548. r[6] = c1;
  549. r[7] = c2;
  550. }
  551. #endif /* !NO_ASM && X86_64 && !WINDOWS */