  1. #!/usr/bin/env perl
  2. ##############################################################################
  3. # #
  4. # Copyright (c) 2012, Intel Corporation #
  5. # #
  6. # All rights reserved. #
  7. # #
  8. # Redistribution and use in source and binary forms, with or without #
  9. # modification, are permitted provided that the following conditions are #
  10. # met: #
  11. # #
  12. # * Redistributions of source code must retain the above copyright #
  13. # notice, this list of conditions and the following disclaimer. #
  14. # #
  15. # * Redistributions in binary form must reproduce the above copyright #
  16. # notice, this list of conditions and the following disclaimer in the #
  17. # documentation and/or other materials provided with the #
  18. # distribution. #
  19. # #
  20. # * Neither the name of the Intel Corporation nor the names of its #
  21. # contributors may be used to endorse or promote products derived from #
  22. # this software without specific prior written permission. #
  23. # #
  24. # #
  25. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
  26. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
  27. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
  28. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
  29. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
  30. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
  31. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
  32. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
  33. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
  34. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
  35. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
  36. # #
  37. ##############################################################################
  38. # Developers and authors: #
  39. # Shay Gueron (1, 2), and Vlad Krasnov (1) #
  40. # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
  41. # Israel Development Center, Haifa, Israel #
  42. # (2) University of Haifa #
  43. ##############################################################################
  44. # Reference: #
  45. # [1] S. Gueron, "Efficient Software Implementations of Modular #
  46. # Exponentiation", http://eprint.iacr.org/2011/239 #
  47. # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
  48. # IEEE Proceedings of 9th International Conference on Information #
  49. # Technology: New Generations (ITNG 2012), 821-823 (2012). #
  50. # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
  51. # Journal of Cryptographic Engineering 2:31-43 (2012). #
  52. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
  53. # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
  54. # RSA1024 and RSA2048 on x86_64 platforms", #
  55. # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
  56. ##############################################################################
  57. # While original submission covers 512- and 1024-bit exponentiation,
  58. # this module is limited to 512-bit version only (and as such
  59. # accelerates RSA1024 sign). This is because improvement for longer
  60. # keys is not high enough to justify the effort, highest measured
  61. # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
  62. # for the moment of this writing!] Nor does this module implement
  63. # "monolithic" complete exponentiation jumbo-subroutine, but adheres
  64. # to more modular mixture of C and assembly. And it's optimized even
  65. # for processors other than Intel Core family (see table below for
  66. # improvement coefficients).
  67. # <appro@openssl.org>
  68. #
  69. # RSA1024 sign/sec     this/original   |this/rsax(*)   this/fips(*)
  70. # -------------------+-----------------+---------------+--------------
  71. # Opteron              +13%            |+5%            +20%
  72. # Bulldozer            -0%             |-1%            +10%
  73. # P4                   +11%            |+7%            +8%
  74. # Westmere             +5%             |+14%           +17%
  75. # Sandy Bridge         +2%             |+12%           +29%
  76. # Ivy Bridge           +1%             |+11%           +35%
  77. # Haswell(**)          -0%             |+12%           +39%
  78. # Atom                 +13%            |+11%           +4%
  79. # VIA Nano             +70%            |+9%            +25%
  80. #
  81. # (*) rsax engine and fips numbers are presented for reference
  82. # purposes;
  83. # (**) MULX was attempted, but found to give only marginal improvement;
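#
# The note above assumes a C-level driver that strings these primitives
# together ("modular mixture of C and assembly").  As a loose, hedged
# illustration only (the sub name and the 4-bit window layout are
# assumptions, not this module's API contract), the reference model below
# shows where a fixed-window exponentiation would use rsaz_512_scatter4
# (build the power table), rsaz_512_sqr, rsaz_512_mul_gather4 (constant-time
# table lookup) and rsaz_512_mul_by_one (leave Montgomery form).  It is
# never called by this script.
use Math::BigInt;
sub _ref_modexp_w4 {
	my ($base, $nibbles, $mod) = @_;   # Math::BigInt base/mod, 4-bit digits msb-first
	my @tbl = (Math::BigInt->new(1));  # tbl[i] = base^i mod mod, i = 0..15
	$tbl[$_] = $tbl[$_ - 1] * $base % $mod for 1 .. 15;
	my $r = Math::BigInt->new(1);
	for my $d (@$nibbles) {
		$r = $r * $r % $mod for 1 .. 4;    # four squarings per window
		$r = $r * $tbl[$d] % $mod;         # multiply by the gathered entry
	}
	return $r;
}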
  84. $flavour = shift;
  85. $output = shift;
  86. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  87. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  88. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  89. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  90. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  91. die "can't locate x86_64-xlate.pl";
  92. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  93. *STDOUT=*OUT;
  94. # In upstream, this is controlled by shelling out to the compiler to check
  95. # versions, but BoringSSL is intended to be used with pre-generated perlasm
  96. # output, so this isn't useful anyway.
  97. #
  98. # TODO(davidben): Enable this after testing. $addx goes up to 1.
  99. $addx = 0;
  100. ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
  101. {
  102. my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
  103. $code.=<<___;
  104. .text
  105. .extern OPENSSL_ia32cap_P
  106. .globl rsaz_512_sqr
  107. .type rsaz_512_sqr,\@function,5
  108. .align 32
  109. rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
  110. push %rbx
  111. push %rbp
  112. push %r12
  113. push %r13
  114. push %r14
  115. push %r15
  116. subq \$128+24, %rsp
  117. .Lsqr_body:
  118. movq $mod, %rbp # common argument
  119. movq ($inp), %rdx
  120. movq 8($inp), %rax
  121. movq $n0, 128(%rsp)
  122. ___
  123. $code.=<<___ if ($addx);
  124. movl \$0x80100,%r11d
  125. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  126. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  127. je .Loop_sqrx
  128. ___
  129. $code.=<<___;
  130. jmp .Loop_sqr
  131. .align 32
  132. .Loop_sqr:
  133. movl $times,128+8(%rsp)
  134. #first iteration
  135. movq %rdx, %rbx
  136. mulq %rdx
  137. movq %rax, %r8
  138. movq 16($inp), %rax
  139. movq %rdx, %r9
  140. mulq %rbx
  141. addq %rax, %r9
  142. movq 24($inp), %rax
  143. movq %rdx, %r10
  144. adcq \$0, %r10
  145. mulq %rbx
  146. addq %rax, %r10
  147. movq 32($inp), %rax
  148. movq %rdx, %r11
  149. adcq \$0, %r11
  150. mulq %rbx
  151. addq %rax, %r11
  152. movq 40($inp), %rax
  153. movq %rdx, %r12
  154. adcq \$0, %r12
  155. mulq %rbx
  156. addq %rax, %r12
  157. movq 48($inp), %rax
  158. movq %rdx, %r13
  159. adcq \$0, %r13
  160. mulq %rbx
  161. addq %rax, %r13
  162. movq 56($inp), %rax
  163. movq %rdx, %r14
  164. adcq \$0, %r14
  165. mulq %rbx
  166. addq %rax, %r14
  167. movq %rbx, %rax
  168. movq %rdx, %r15
  169. adcq \$0, %r15
  170. addq %r8, %r8 #shlq \$1, %r8
  171. movq %r9, %rcx
  172. adcq %r9, %r9 #shld \$1, %r8, %r9
  173. mulq %rax
  174. movq %rax, (%rsp)
  175. addq %rdx, %r8
  176. adcq \$0, %r9
  177. movq %r8, 8(%rsp)
  178. shrq \$63, %rcx
  179. #second iteration
  180. movq 8($inp), %r8
  181. movq 16($inp), %rax
  182. mulq %r8
  183. addq %rax, %r10
  184. movq 24($inp), %rax
  185. movq %rdx, %rbx
  186. adcq \$0, %rbx
  187. mulq %r8
  188. addq %rax, %r11
  189. movq 32($inp), %rax
  190. adcq \$0, %rdx
  191. addq %rbx, %r11
  192. movq %rdx, %rbx
  193. adcq \$0, %rbx
  194. mulq %r8
  195. addq %rax, %r12
  196. movq 40($inp), %rax
  197. adcq \$0, %rdx
  198. addq %rbx, %r12
  199. movq %rdx, %rbx
  200. adcq \$0, %rbx
  201. mulq %r8
  202. addq %rax, %r13
  203. movq 48($inp), %rax
  204. adcq \$0, %rdx
  205. addq %rbx, %r13
  206. movq %rdx, %rbx
  207. adcq \$0, %rbx
  208. mulq %r8
  209. addq %rax, %r14
  210. movq 56($inp), %rax
  211. adcq \$0, %rdx
  212. addq %rbx, %r14
  213. movq %rdx, %rbx
  214. adcq \$0, %rbx
  215. mulq %r8
  216. addq %rax, %r15
  217. movq %r8, %rax
  218. adcq \$0, %rdx
  219. addq %rbx, %r15
  220. movq %rdx, %r8
  221. movq %r10, %rdx
  222. adcq \$0, %r8
  223. add %rdx, %rdx
  224. lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
  225. movq %r11, %rbx
  226. adcq %r11, %r11 #shld \$1, %r10, %r11
  227. mulq %rax
  228. addq %rax, %r9
  229. adcq %rdx, %r10
  230. adcq \$0, %r11
  231. movq %r9, 16(%rsp)
  232. movq %r10, 24(%rsp)
  233. shrq \$63, %rbx
  234. #third iteration
  235. movq 16($inp), %r9
  236. movq 24($inp), %rax
  237. mulq %r9
  238. addq %rax, %r12
  239. movq 32($inp), %rax
  240. movq %rdx, %rcx
  241. adcq \$0, %rcx
  242. mulq %r9
  243. addq %rax, %r13
  244. movq 40($inp), %rax
  245. adcq \$0, %rdx
  246. addq %rcx, %r13
  247. movq %rdx, %rcx
  248. adcq \$0, %rcx
  249. mulq %r9
  250. addq %rax, %r14
  251. movq 48($inp), %rax
  252. adcq \$0, %rdx
  253. addq %rcx, %r14
  254. movq %rdx, %rcx
  255. adcq \$0, %rcx
  256. mulq %r9
  257. movq %r12, %r10
  258. lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
  259. addq %rax, %r15
  260. movq 56($inp), %rax
  261. adcq \$0, %rdx
  262. addq %rcx, %r15
  263. movq %rdx, %rcx
  264. adcq \$0, %rcx
  265. mulq %r9
  266. shrq \$63, %r10
  267. addq %rax, %r8
  268. movq %r9, %rax
  269. adcq \$0, %rdx
  270. addq %rcx, %r8
  271. movq %rdx, %r9
  272. adcq \$0, %r9
  273. movq %r13, %rcx
  274. leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
  275. mulq %rax
  276. addq %rax, %r11
  277. adcq %rdx, %r12
  278. adcq \$0, %r13
  279. movq %r11, 32(%rsp)
  280. movq %r12, 40(%rsp)
  281. shrq \$63, %rcx
  282. #fourth iteration
  283. movq 24($inp), %r10
  284. movq 32($inp), %rax
  285. mulq %r10
  286. addq %rax, %r14
  287. movq 40($inp), %rax
  288. movq %rdx, %rbx
  289. adcq \$0, %rbx
  290. mulq %r10
  291. addq %rax, %r15
  292. movq 48($inp), %rax
  293. adcq \$0, %rdx
  294. addq %rbx, %r15
  295. movq %rdx, %rbx
  296. adcq \$0, %rbx
  297. mulq %r10
  298. movq %r14, %r12
  299. leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
  300. addq %rax, %r8
  301. movq 56($inp), %rax
  302. adcq \$0, %rdx
  303. addq %rbx, %r8
  304. movq %rdx, %rbx
  305. adcq \$0, %rbx
  306. mulq %r10
  307. shrq \$63, %r12
  308. addq %rax, %r9
  309. movq %r10, %rax
  310. adcq \$0, %rdx
  311. addq %rbx, %r9
  312. movq %rdx, %r10
  313. adcq \$0, %r10
  314. movq %r15, %rbx
  315. leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
  316. mulq %rax
  317. addq %rax, %r13
  318. adcq %rdx, %r14
  319. adcq \$0, %r15
  320. movq %r13, 48(%rsp)
  321. movq %r14, 56(%rsp)
  322. shrq \$63, %rbx
  323. #fifth iteration
  324. movq 32($inp), %r11
  325. movq 40($inp), %rax
  326. mulq %r11
  327. addq %rax, %r8
  328. movq 48($inp), %rax
  329. movq %rdx, %rcx
  330. adcq \$0, %rcx
  331. mulq %r11
  332. addq %rax, %r9
  333. movq 56($inp), %rax
  334. adcq \$0, %rdx
  335. movq %r8, %r12
  336. leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
  337. addq %rcx, %r9
  338. movq %rdx, %rcx
  339. adcq \$0, %rcx
  340. mulq %r11
  341. shrq \$63, %r12
  342. addq %rax, %r10
  343. movq %r11, %rax
  344. adcq \$0, %rdx
  345. addq %rcx, %r10
  346. movq %rdx, %r11
  347. adcq \$0, %r11
  348. movq %r9, %rcx
  349. leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
  350. mulq %rax
  351. addq %rax, %r15
  352. adcq %rdx, %r8
  353. adcq \$0, %r9
  354. movq %r15, 64(%rsp)
  355. movq %r8, 72(%rsp)
  356. shrq \$63, %rcx
  357. #sixth iteration
  358. movq 40($inp), %r12
  359. movq 48($inp), %rax
  360. mulq %r12
  361. addq %rax, %r10
  362. movq 56($inp), %rax
  363. movq %rdx, %rbx
  364. adcq \$0, %rbx
  365. mulq %r12
  366. addq %rax, %r11
  367. movq %r12, %rax
  368. movq %r10, %r15
  369. leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
  370. adcq \$0, %rdx
  371. shrq \$63, %r15
  372. addq %rbx, %r11
  373. movq %rdx, %r12
  374. adcq \$0, %r12
  375. movq %r11, %rbx
  376. leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
  377. mulq %rax
  378. addq %rax, %r9
  379. adcq %rdx, %r10
  380. adcq \$0, %r11
  381. movq %r9, 80(%rsp)
  382. movq %r10, 88(%rsp)
  383. #seventh iteration
  384. movq 48($inp), %r13
  385. movq 56($inp), %rax
  386. mulq %r13
  387. addq %rax, %r12
  388. movq %r13, %rax
  389. movq %rdx, %r13
  390. adcq \$0, %r13
  391. xorq %r14, %r14
  392. shlq \$1, %rbx
  393. adcq %r12, %r12 #shld \$1, %rbx, %r12
  394. adcq %r13, %r13 #shld \$1, %r12, %r13
  395. adcq %r14, %r14 #shld \$1, %r13, %r14
  396. mulq %rax
  397. addq %rax, %r11
  398. adcq %rdx, %r12
  399. adcq \$0, %r13
  400. movq %r11, 96(%rsp)
  401. movq %r12, 104(%rsp)
  402. #eighth iteration
  403. movq 56($inp), %rax
  404. mulq %rax
  405. addq %rax, %r13
  406. adcq \$0, %rdx
  407. addq %rdx, %r14
  408. movq %r13, 112(%rsp)
  409. movq %r14, 120(%rsp)
  410. movq (%rsp), %r8
  411. movq 8(%rsp), %r9
  412. movq 16(%rsp), %r10
  413. movq 24(%rsp), %r11
  414. movq 32(%rsp), %r12
  415. movq 40(%rsp), %r13
  416. movq 48(%rsp), %r14
  417. movq 56(%rsp), %r15
  418. call __rsaz_512_reduce
  419. addq 64(%rsp), %r8
  420. adcq 72(%rsp), %r9
  421. adcq 80(%rsp), %r10
  422. adcq 88(%rsp), %r11
  423. adcq 96(%rsp), %r12
  424. adcq 104(%rsp), %r13
  425. adcq 112(%rsp), %r14
  426. adcq 120(%rsp), %r15
  427. sbbq %rcx, %rcx
  428. call __rsaz_512_subtract
  429. movq %r8, %rdx
  430. movq %r9, %rax
  431. movl 128+8(%rsp), $times
  432. movq $out, $inp
  433. decl $times
  434. jnz .Loop_sqr
  435. ___
  436. if ($addx) {
  437. $code.=<<___;
  438. jmp .Lsqr_tail
  439. .align 32
  440. .Loop_sqrx:
  441. movl $times,128+8(%rsp)
  442. movq $out, %xmm0 # off-load
  443. movq %rbp, %xmm1 # off-load
  444. #first iteration
  445. mulx %rax, %r8, %r9
  446. mulx 16($inp), %rcx, %r10
  447. xor %rbp, %rbp # cf=0, of=0
  448. mulx 24($inp), %rax, %r11
  449. adcx %rcx, %r9
  450. mulx 32($inp), %rcx, %r12
  451. adcx %rax, %r10
  452. mulx 40($inp), %rax, %r13
  453. adcx %rcx, %r11
  454. .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
  455. adcx %rax, %r12
  456. adcx %rcx, %r13
  457. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
  458. adcx %rax, %r14
  459. adcx %rbp, %r15 # %rbp is 0
  460. mov %r9, %rcx
  461. shld \$1, %r8, %r9
  462. shl \$1, %r8
  463. xor %ebp, %ebp
  464. mulx %rdx, %rax, %rdx
  465. adcx %rdx, %r8
  466. mov 8($inp), %rdx
  467. adcx %rbp, %r9
  468. mov %rax, (%rsp)
  469. mov %r8, 8(%rsp)
  470. #second iteration
  471. mulx 16($inp), %rax, %rbx
  472. adox %rax, %r10
  473. adcx %rbx, %r11
  474. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
  475. adox $out, %r11
  476. adcx %r8, %r12
  477. mulx 32($inp), %rax, %rbx
  478. adox %rax, %r12
  479. adcx %rbx, %r13
  480. mulx 40($inp), $out, %r8
  481. adox $out, %r13
  482. adcx %r8, %r14
  483. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  484. adox %rax, %r14
  485. adcx %rbx, %r15
  486. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
  487. adox $out, %r15
  488. adcx %rbp, %r8
  489. adox %rbp, %r8
  490. mov %r11, %rbx
  491. shld \$1, %r10, %r11
  492. shld \$1, %rcx, %r10
  493. xor %ebp,%ebp
  494. mulx %rdx, %rax, %rcx
  495. mov 16($inp), %rdx
  496. adcx %rax, %r9
  497. adcx %rcx, %r10
  498. adcx %rbp, %r11
  499. mov %r9, 16(%rsp)
  500. .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
  501. #third iteration
  502. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
  503. adox $out, %r12
  504. adcx %r9, %r13
  505. mulx 32($inp), %rax, %rcx
  506. adox %rax, %r13
  507. adcx %rcx, %r14
  508. mulx 40($inp), $out, %r9
  509. adox $out, %r14
  510. adcx %r9, %r15
  511. .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
  512. adox %rax, %r15
  513. adcx %rcx, %r8
  514. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
  515. adox $out, %r8
  516. adcx %rbp, %r9
  517. adox %rbp, %r9
  518. mov %r13, %rcx
  519. shld \$1, %r12, %r13
  520. shld \$1, %rbx, %r12
  521. xor %ebp, %ebp
  522. mulx %rdx, %rax, %rdx
  523. adcx %rax, %r11
  524. adcx %rdx, %r12
  525. mov 24($inp), %rdx
  526. adcx %rbp, %r13
  527. mov %r11, 32(%rsp)
  528. .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
  529. #fourth iteration
  530. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
  531. adox %rax, %r14
  532. adcx %rbx, %r15
  533. mulx 40($inp), $out, %r10
  534. adox $out, %r15
  535. adcx %r10, %r8
  536. mulx 48($inp), %rax, %rbx
  537. adox %rax, %r8
  538. adcx %rbx, %r9
  539. mulx 56($inp), $out, %r10
  540. adox $out, %r9
  541. adcx %rbp, %r10
  542. adox %rbp, %r10
  543. .byte 0x66
  544. mov %r15, %rbx
  545. shld \$1, %r14, %r15
  546. shld \$1, %rcx, %r14
  547. xor %ebp, %ebp
  548. mulx %rdx, %rax, %rdx
  549. adcx %rax, %r13
  550. adcx %rdx, %r14
  551. mov 32($inp), %rdx
  552. adcx %rbp, %r15
  553. mov %r13, 48(%rsp)
  554. mov %r14, 56(%rsp)
  555. #fifth iteration
  556. .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
  557. adox $out, %r8
  558. adcx %r11, %r9
  559. mulx 48($inp), %rax, %rcx
  560. adox %rax, %r9
  561. adcx %rcx, %r10
  562. mulx 56($inp), $out, %r11
  563. adox $out, %r10
  564. adcx %rbp, %r11
  565. adox %rbp, %r11
  566. mov %r9, %rcx
  567. shld \$1, %r8, %r9
  568. shld \$1, %rbx, %r8
  569. xor %ebp, %ebp
  570. mulx %rdx, %rax, %rdx
  571. adcx %rax, %r15
  572. adcx %rdx, %r8
  573. mov 40($inp), %rdx
  574. adcx %rbp, %r9
  575. mov %r15, 64(%rsp)
  576. mov %r8, 72(%rsp)
  577. #sixth iteration
  578. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  579. adox %rax, %r10
  580. adcx %rbx, %r11
  581. .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
  582. adox $out, %r11
  583. adcx %rbp, %r12
  584. adox %rbp, %r12
  585. mov %r11, %rbx
  586. shld \$1, %r10, %r11
  587. shld \$1, %rcx, %r10
  588. xor %ebp, %ebp
  589. mulx %rdx, %rax, %rdx
  590. adcx %rax, %r9
  591. adcx %rdx, %r10
  592. mov 48($inp), %rdx
  593. adcx %rbp, %r11
  594. mov %r9, 80(%rsp)
  595. mov %r10, 88(%rsp)
  596. #seventh iteration
  597. .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
  598. adox %rax, %r12
  599. adox %rbp, %r13
  600. xor %r14, %r14
  601. shld \$1, %r13, %r14
  602. shld \$1, %r12, %r13
  603. shld \$1, %rbx, %r12
  604. xor %ebp, %ebp
  605. mulx %rdx, %rax, %rdx
  606. adcx %rax, %r11
  607. adcx %rdx, %r12
  608. mov 56($inp), %rdx
  609. adcx %rbp, %r13
  610. .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
  611. .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
  612. #eighth iteration
  613. mulx %rdx, %rax, %rdx
  614. adox %rax, %r13
  615. adox %rbp, %rdx
  616. .byte 0x66
  617. add %rdx, %r14
  618. movq %r13, 112(%rsp)
  619. movq %r14, 120(%rsp)
  620. movq %xmm0, $out
  621. movq %xmm1, %rbp
  622. movq 128(%rsp), %rdx # pull $n0
  623. movq (%rsp), %r8
  624. movq 8(%rsp), %r9
  625. movq 16(%rsp), %r10
  626. movq 24(%rsp), %r11
  627. movq 32(%rsp), %r12
  628. movq 40(%rsp), %r13
  629. movq 48(%rsp), %r14
  630. movq 56(%rsp), %r15
  631. call __rsaz_512_reducex
  632. addq 64(%rsp), %r8
  633. adcq 72(%rsp), %r9
  634. adcq 80(%rsp), %r10
  635. adcq 88(%rsp), %r11
  636. adcq 96(%rsp), %r12
  637. adcq 104(%rsp), %r13
  638. adcq 112(%rsp), %r14
  639. adcq 120(%rsp), %r15
  640. sbbq %rcx, %rcx
  641. call __rsaz_512_subtract
  642. movq %r8, %rdx
  643. movq %r9, %rax
  644. movl 128+8(%rsp), $times
  645. movq $out, $inp
  646. decl $times
  647. jnz .Loop_sqrx
  648. .Lsqr_tail:
  649. ___
  650. }
  651. $code.=<<___;
  652. leaq 128+24+48(%rsp), %rax
  653. movq -48(%rax), %r15
  654. movq -40(%rax), %r14
  655. movq -32(%rax), %r13
  656. movq -24(%rax), %r12
  657. movq -16(%rax), %rbp
  658. movq -8(%rax), %rbx
  659. leaq (%rax), %rsp
  660. .Lsqr_epilogue:
  661. ret
  662. .size rsaz_512_sqr,.-rsaz_512_sqr
  663. ___
  664. }
  665. {
  666. my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
  667. $code.=<<___;
  668. .globl rsaz_512_mul
  669. .type rsaz_512_mul,\@function,5
  670. .align 32
  671. rsaz_512_mul:
  672. push %rbx
  673. push %rbp
  674. push %r12
  675. push %r13
  676. push %r14
  677. push %r15
  678. subq \$128+24, %rsp
  679. .Lmul_body:
  680. movq $out, %xmm0 # off-load arguments
  681. movq $mod, %xmm1
  682. movq $n0, 128(%rsp)
  683. ___
  684. $code.=<<___ if ($addx);
  685. movl \$0x80100,%r11d
  686. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  687. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  688. je .Lmulx
  689. ___
  690. $code.=<<___;
  691. movq ($bp), %rbx # pass b[0]
  692. movq $bp, %rbp # pass argument
  693. call __rsaz_512_mul
  694. movq %xmm0, $out
  695. movq %xmm1, %rbp
  696. movq (%rsp), %r8
  697. movq 8(%rsp), %r9
  698. movq 16(%rsp), %r10
  699. movq 24(%rsp), %r11
  700. movq 32(%rsp), %r12
  701. movq 40(%rsp), %r13
  702. movq 48(%rsp), %r14
  703. movq 56(%rsp), %r15
  704. call __rsaz_512_reduce
  705. ___
  706. $code.=<<___ if ($addx);
  707. jmp .Lmul_tail
  708. .align 32
  709. .Lmulx:
  710. movq $bp, %rbp # pass argument
  711. movq ($bp), %rdx # pass b[0]
  712. call __rsaz_512_mulx
  713. movq %xmm0, $out
  714. movq %xmm1, %rbp
  715. movq 128(%rsp), %rdx # pull $n0
  716. movq (%rsp), %r8
  717. movq 8(%rsp), %r9
  718. movq 16(%rsp), %r10
  719. movq 24(%rsp), %r11
  720. movq 32(%rsp), %r12
  721. movq 40(%rsp), %r13
  722. movq 48(%rsp), %r14
  723. movq 56(%rsp), %r15
  724. call __rsaz_512_reducex
  725. .Lmul_tail:
  726. ___
  727. $code.=<<___;
  728. addq 64(%rsp), %r8
  729. adcq 72(%rsp), %r9
  730. adcq 80(%rsp), %r10
  731. adcq 88(%rsp), %r11
  732. adcq 96(%rsp), %r12
  733. adcq 104(%rsp), %r13
  734. adcq 112(%rsp), %r14
  735. adcq 120(%rsp), %r15
  736. sbbq %rcx, %rcx
  737. call __rsaz_512_subtract
  738. leaq 128+24+48(%rsp), %rax
  739. movq -48(%rax), %r15
  740. movq -40(%rax), %r14
  741. movq -32(%rax), %r13
  742. movq -24(%rax), %r12
  743. movq -16(%rax), %rbp
  744. movq -8(%rax), %rbx
  745. leaq (%rax), %rsp
  746. .Lmul_epilogue:
  747. ret
  748. .size rsaz_512_mul,.-rsaz_512_mul
  749. ___
  750. }
  751. {
  752. my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  753. $code.=<<___;
  754. .globl rsaz_512_mul_gather4
  755. .type rsaz_512_mul_gather4,\@function,6
  756. .align 32
  757. rsaz_512_mul_gather4:
  758. push %rbx
  759. push %rbp
  760. push %r12
  761. push %r13
  762. push %r14
  763. push %r15
  764. subq \$`128+24+($win64?0xb0:0)`, %rsp
  765. ___
  766. $code.=<<___ if ($win64);
  767. movaps %xmm6,0xa0(%rsp)
  768. movaps %xmm7,0xb0(%rsp)
  769. movaps %xmm8,0xc0(%rsp)
  770. movaps %xmm9,0xd0(%rsp)
  771. movaps %xmm10,0xe0(%rsp)
  772. movaps %xmm11,0xf0(%rsp)
  773. movaps %xmm12,0x100(%rsp)
  774. movaps %xmm13,0x110(%rsp)
  775. movaps %xmm14,0x120(%rsp)
  776. movaps %xmm15,0x130(%rsp)
  777. ___
  778. $code.=<<___;
  779. .Lmul_gather4_body:
  780. movd $pwr,%xmm8
  781. movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
  782. movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
  783. pshufd \$0,%xmm8,%xmm8 # broadcast $power
  784. movdqa %xmm1,%xmm7
  785. movdqa %xmm1,%xmm2
  786. ___
  787. ########################################################################
  788. # calculate mask by comparing 0..15 to $power
  789. #
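# A scalar reference model of this constant-time selection (an illustration
# added here, never called by the script): every one of the 16 table entries
# is read, and the one matching $power survives through an all-ones/all-zeros
# mask, so the memory access pattern does not depend on the secret index.
# The assembly derives the masks branch-free with pcmpeqd; the ternary below
# is only a stand-in for that.
sub _ref_gather16 {
	my ($tbl, $power) = @_;            # $tbl: ref to 16 entries, $power: secret index
	my $acc = 0;
	for my $i (0 .. 15) {
		my $mask = ($i == $power) ? ~0 : 0;   # all-ones or all-zeros
		$acc |= $tbl->[$i] & $mask;           # only the matching entry survives
	}
	return $acc;
}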
  790. for($i=0;$i<4;$i++) {
  791. $code.=<<___;
  792. paddd %xmm`$i`,%xmm`$i+1`
  793. pcmpeqd %xmm8,%xmm`$i`
  794. movdqa %xmm7,%xmm`$i+3`
  795. ___
  796. }
  797. for(;$i<7;$i++) {
  798. $code.=<<___;
  799. paddd %xmm`$i`,%xmm`$i+1`
  800. pcmpeqd %xmm8,%xmm`$i`
  801. ___
  802. }
  803. $code.=<<___;
  804. pcmpeqd %xmm8,%xmm7
  805. movdqa 16*0($bp),%xmm8
  806. movdqa 16*1($bp),%xmm9
  807. movdqa 16*2($bp),%xmm10
  808. movdqa 16*3($bp),%xmm11
  809. pand %xmm0,%xmm8
  810. movdqa 16*4($bp),%xmm12
  811. pand %xmm1,%xmm9
  812. movdqa 16*5($bp),%xmm13
  813. pand %xmm2,%xmm10
  814. movdqa 16*6($bp),%xmm14
  815. pand %xmm3,%xmm11
  816. movdqa 16*7($bp),%xmm15
  817. leaq 128($bp), %rbp
  818. pand %xmm4,%xmm12
  819. pand %xmm5,%xmm13
  820. pand %xmm6,%xmm14
  821. pand %xmm7,%xmm15
  822. por %xmm10,%xmm8
  823. por %xmm11,%xmm9
  824. por %xmm12,%xmm8
  825. por %xmm13,%xmm9
  826. por %xmm14,%xmm8
  827. por %xmm15,%xmm9
  828. por %xmm9,%xmm8
  829. pshufd \$0x4e,%xmm8,%xmm9
  830. por %xmm9,%xmm8
  831. ___
  832. $code.=<<___ if ($addx);
  833. movl \$0x80100,%r11d
  834. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  835. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  836. je .Lmulx_gather
  837. ___
  838. $code.=<<___;
  839. movq %xmm8,%rbx
  840. movq $n0, 128(%rsp) # off-load arguments
  841. movq $out, 128+8(%rsp)
  842. movq $mod, 128+16(%rsp)
  843. movq ($ap), %rax
  844. movq 8($ap), %rcx
  845. mulq %rbx # 0 iteration
  846. movq %rax, (%rsp)
  847. movq %rcx, %rax
  848. movq %rdx, %r8
  849. mulq %rbx
  850. addq %rax, %r8
  851. movq 16($ap), %rax
  852. movq %rdx, %r9
  853. adcq \$0, %r9
  854. mulq %rbx
  855. addq %rax, %r9
  856. movq 24($ap), %rax
  857. movq %rdx, %r10
  858. adcq \$0, %r10
  859. mulq %rbx
  860. addq %rax, %r10
  861. movq 32($ap), %rax
  862. movq %rdx, %r11
  863. adcq \$0, %r11
  864. mulq %rbx
  865. addq %rax, %r11
  866. movq 40($ap), %rax
  867. movq %rdx, %r12
  868. adcq \$0, %r12
  869. mulq %rbx
  870. addq %rax, %r12
  871. movq 48($ap), %rax
  872. movq %rdx, %r13
  873. adcq \$0, %r13
  874. mulq %rbx
  875. addq %rax, %r13
  876. movq 56($ap), %rax
  877. movq %rdx, %r14
  878. adcq \$0, %r14
  879. mulq %rbx
  880. addq %rax, %r14
  881. movq ($ap), %rax
  882. movq %rdx, %r15
  883. adcq \$0, %r15
  884. leaq 8(%rsp), %rdi
  885. movl \$7, %ecx
  886. jmp .Loop_mul_gather
  887. .align 32
  888. .Loop_mul_gather:
  889. movdqa 16*0(%rbp),%xmm8
  890. movdqa 16*1(%rbp),%xmm9
  891. movdqa 16*2(%rbp),%xmm10
  892. movdqa 16*3(%rbp),%xmm11
  893. pand %xmm0,%xmm8
  894. movdqa 16*4(%rbp),%xmm12
  895. pand %xmm1,%xmm9
  896. movdqa 16*5(%rbp),%xmm13
  897. pand %xmm2,%xmm10
  898. movdqa 16*6(%rbp),%xmm14
  899. pand %xmm3,%xmm11
  900. movdqa 16*7(%rbp),%xmm15
  901. leaq 128(%rbp), %rbp
  902. pand %xmm4,%xmm12
  903. pand %xmm5,%xmm13
  904. pand %xmm6,%xmm14
  905. pand %xmm7,%xmm15
  906. por %xmm10,%xmm8
  907. por %xmm11,%xmm9
  908. por %xmm12,%xmm8
  909. por %xmm13,%xmm9
  910. por %xmm14,%xmm8
  911. por %xmm15,%xmm9
  912. por %xmm9,%xmm8
  913. pshufd \$0x4e,%xmm8,%xmm9
  914. por %xmm9,%xmm8
  915. movq %xmm8,%rbx
  916. mulq %rbx
  917. addq %rax, %r8
  918. movq 8($ap), %rax
  919. movq %r8, (%rdi)
  920. movq %rdx, %r8
  921. adcq \$0, %r8
  922. mulq %rbx
  923. addq %rax, %r9
  924. movq 16($ap), %rax
  925. adcq \$0, %rdx
  926. addq %r9, %r8
  927. movq %rdx, %r9
  928. adcq \$0, %r9
  929. mulq %rbx
  930. addq %rax, %r10
  931. movq 24($ap), %rax
  932. adcq \$0, %rdx
  933. addq %r10, %r9
  934. movq %rdx, %r10
  935. adcq \$0, %r10
  936. mulq %rbx
  937. addq %rax, %r11
  938. movq 32($ap), %rax
  939. adcq \$0, %rdx
  940. addq %r11, %r10
  941. movq %rdx, %r11
  942. adcq \$0, %r11
  943. mulq %rbx
  944. addq %rax, %r12
  945. movq 40($ap), %rax
  946. adcq \$0, %rdx
  947. addq %r12, %r11
  948. movq %rdx, %r12
  949. adcq \$0, %r12
  950. mulq %rbx
  951. addq %rax, %r13
  952. movq 48($ap), %rax
  953. adcq \$0, %rdx
  954. addq %r13, %r12
  955. movq %rdx, %r13
  956. adcq \$0, %r13
  957. mulq %rbx
  958. addq %rax, %r14
  959. movq 56($ap), %rax
  960. adcq \$0, %rdx
  961. addq %r14, %r13
  962. movq %rdx, %r14
  963. adcq \$0, %r14
  964. mulq %rbx
  965. addq %rax, %r15
  966. movq ($ap), %rax
  967. adcq \$0, %rdx
  968. addq %r15, %r14
  969. movq %rdx, %r15
  970. adcq \$0, %r15
  971. leaq 8(%rdi), %rdi
  972. decl %ecx
  973. jnz .Loop_mul_gather
  974. movq %r8, (%rdi)
  975. movq %r9, 8(%rdi)
  976. movq %r10, 16(%rdi)
  977. movq %r11, 24(%rdi)
  978. movq %r12, 32(%rdi)
  979. movq %r13, 40(%rdi)
  980. movq %r14, 48(%rdi)
  981. movq %r15, 56(%rdi)
  982. movq 128+8(%rsp), $out
  983. movq 128+16(%rsp), %rbp
  984. movq (%rsp), %r8
  985. movq 8(%rsp), %r9
  986. movq 16(%rsp), %r10
  987. movq 24(%rsp), %r11
  988. movq 32(%rsp), %r12
  989. movq 40(%rsp), %r13
  990. movq 48(%rsp), %r14
  991. movq 56(%rsp), %r15
  992. call __rsaz_512_reduce
  993. ___
  994. $code.=<<___ if ($addx);
  995. jmp .Lmul_gather_tail
  996. .align 32
  997. .Lmulx_gather:
  998. movq %xmm8,%rdx
  999. mov $n0, 128(%rsp) # off-load arguments
  1000. mov $out, 128+8(%rsp)
  1001. mov $mod, 128+16(%rsp)
  1002. mulx ($ap), %rbx, %r8 # 0 iteration
  1003. mov %rbx, (%rsp)
  1004. xor %edi, %edi # cf=0, of=0
  1005. mulx 8($ap), %rax, %r9
  1006. mulx 16($ap), %rbx, %r10
  1007. adcx %rax, %r8
  1008. mulx 24($ap), %rax, %r11
  1009. adcx %rbx, %r9
  1010. mulx 32($ap), %rbx, %r12
  1011. adcx %rax, %r10
  1012. mulx 40($ap), %rax, %r13
  1013. adcx %rbx, %r11
  1014. mulx 48($ap), %rbx, %r14
  1015. adcx %rax, %r12
  1016. mulx 56($ap), %rax, %r15
  1017. adcx %rbx, %r13
  1018. adcx %rax, %r14
  1019. .byte 0x67
  1020. mov %r8, %rbx
  1021. adcx %rdi, %r15 # %rdi is 0
  1022. mov \$-7, %rcx
  1023. jmp .Loop_mulx_gather
  1024. .align 32
  1025. .Loop_mulx_gather:
  1026. movdqa 16*0(%rbp),%xmm8
  1027. movdqa 16*1(%rbp),%xmm9
  1028. movdqa 16*2(%rbp),%xmm10
  1029. movdqa 16*3(%rbp),%xmm11
  1030. pand %xmm0,%xmm8
  1031. movdqa 16*4(%rbp),%xmm12
  1032. pand %xmm1,%xmm9
  1033. movdqa 16*5(%rbp),%xmm13
  1034. pand %xmm2,%xmm10
  1035. movdqa 16*6(%rbp),%xmm14
  1036. pand %xmm3,%xmm11
  1037. movdqa 16*7(%rbp),%xmm15
  1038. leaq 128(%rbp), %rbp
  1039. pand %xmm4,%xmm12
  1040. pand %xmm5,%xmm13
  1041. pand %xmm6,%xmm14
  1042. pand %xmm7,%xmm15
  1043. por %xmm10,%xmm8
  1044. por %xmm11,%xmm9
  1045. por %xmm12,%xmm8
  1046. por %xmm13,%xmm9
  1047. por %xmm14,%xmm8
  1048. por %xmm15,%xmm9
  1049. por %xmm9,%xmm8
  1050. pshufd \$0x4e,%xmm8,%xmm9
  1051. por %xmm9,%xmm8
  1052. movq %xmm8,%rdx
  1053. .byte 0xc4,0x62,0xfb,0xf6,0x86,0x00,0x00,0x00,0x00 # mulx ($ap), %rax, %r8
  1054. adcx %rax, %rbx
  1055. adox %r9, %r8
  1056. mulx 8($ap), %rax, %r9
  1057. adcx %rax, %r8
  1058. adox %r10, %r9
  1059. mulx 16($ap), %rax, %r10
  1060. adcx %rax, %r9
  1061. adox %r11, %r10
  1062. .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
  1063. adcx %rax, %r10
  1064. adox %r12, %r11
  1065. mulx 32($ap), %rax, %r12
  1066. adcx %rax, %r11
  1067. adox %r13, %r12
  1068. mulx 40($ap), %rax, %r13
  1069. adcx %rax, %r12
  1070. adox %r14, %r13
  1071. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  1072. adcx %rax, %r13
  1073. .byte 0x67
  1074. adox %r15, %r14
  1075. mulx 56($ap), %rax, %r15
  1076. mov %rbx, 64(%rsp,%rcx,8)
  1077. adcx %rax, %r14
  1078. adox %rdi, %r15
  1079. mov %r8, %rbx
  1080. adcx %rdi, %r15 # cf=0
  1081. inc %rcx # of=0
  1082. jnz .Loop_mulx_gather
  1083. mov %r8, 64(%rsp)
  1084. mov %r9, 64+8(%rsp)
  1085. mov %r10, 64+16(%rsp)
  1086. mov %r11, 64+24(%rsp)
  1087. mov %r12, 64+32(%rsp)
  1088. mov %r13, 64+40(%rsp)
  1089. mov %r14, 64+48(%rsp)
  1090. mov %r15, 64+56(%rsp)
  1091. mov 128(%rsp), %rdx # pull arguments
  1092. mov 128+8(%rsp), $out
  1093. mov 128+16(%rsp), %rbp
  1094. mov (%rsp), %r8
  1095. mov 8(%rsp), %r9
  1096. mov 16(%rsp), %r10
  1097. mov 24(%rsp), %r11
  1098. mov 32(%rsp), %r12
  1099. mov 40(%rsp), %r13
  1100. mov 48(%rsp), %r14
  1101. mov 56(%rsp), %r15
  1102. call __rsaz_512_reducex
  1103. .Lmul_gather_tail:
  1104. ___
  1105. $code.=<<___;
  1106. addq 64(%rsp), %r8
  1107. adcq 72(%rsp), %r9
  1108. adcq 80(%rsp), %r10
  1109. adcq 88(%rsp), %r11
  1110. adcq 96(%rsp), %r12
  1111. adcq 104(%rsp), %r13
  1112. adcq 112(%rsp), %r14
  1113. adcq 120(%rsp), %r15
  1114. sbbq %rcx, %rcx
  1115. call __rsaz_512_subtract
  1116. leaq 128+24+48(%rsp), %rax
  1117. ___
  1118. $code.=<<___ if ($win64);
  1119. movaps 0xa0-0xc8(%rax),%xmm6
  1120. movaps 0xb0-0xc8(%rax),%xmm7
  1121. movaps 0xc0-0xc8(%rax),%xmm8
  1122. movaps 0xd0-0xc8(%rax),%xmm9
  1123. movaps 0xe0-0xc8(%rax),%xmm10
  1124. movaps 0xf0-0xc8(%rax),%xmm11
  1125. movaps 0x100-0xc8(%rax),%xmm12
  1126. movaps 0x110-0xc8(%rax),%xmm13
  1127. movaps 0x120-0xc8(%rax),%xmm14
  1128. movaps 0x130-0xc8(%rax),%xmm15
  1129. lea 0xb0(%rax),%rax
  1130. ___
  1131. $code.=<<___;
  1132. movq -48(%rax), %r15
  1133. movq -40(%rax), %r14
  1134. movq -32(%rax), %r13
  1135. movq -24(%rax), %r12
  1136. movq -16(%rax), %rbp
  1137. movq -8(%rax), %rbx
  1138. leaq (%rax), %rsp
  1139. .Lmul_gather4_epilogue:
  1140. ret
  1141. .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
  1142. ___
  1143. }
  1144. {
  1145. my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  1146. $code.=<<___;
  1147. .globl rsaz_512_mul_scatter4
  1148. .type rsaz_512_mul_scatter4,\@function,6
  1149. .align 32
  1150. rsaz_512_mul_scatter4:
  1151. push %rbx
  1152. push %rbp
  1153. push %r12
  1154. push %r13
  1155. push %r14
  1156. push %r15
  1157. mov $pwr, $pwr
  1158. subq \$128+24, %rsp
  1159. .Lmul_scatter4_body:
  1160. leaq ($tbl,$pwr,8), $tbl
  1161. movq $out, %xmm0 # off-load arguments
  1162. movq $mod, %xmm1
  1163. movq $tbl, %xmm2
  1164. movq $n0, 128(%rsp)
  1165. movq $out, %rbp
  1166. ___
  1167. $code.=<<___ if ($addx);
  1168. movl \$0x80100,%r11d
  1169. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  1170. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  1171. je .Lmulx_scatter
  1172. ___
  1173. $code.=<<___;
  1174. movq ($out),%rbx # pass b[0]
  1175. call __rsaz_512_mul
  1176. movq %xmm0, $out
  1177. movq %xmm1, %rbp
  1178. movq (%rsp), %r8
  1179. movq 8(%rsp), %r9
  1180. movq 16(%rsp), %r10
  1181. movq 24(%rsp), %r11
  1182. movq 32(%rsp), %r12
  1183. movq 40(%rsp), %r13
  1184. movq 48(%rsp), %r14
  1185. movq 56(%rsp), %r15
  1186. call __rsaz_512_reduce
  1187. ___
  1188. $code.=<<___ if ($addx);
  1189. jmp .Lmul_scatter_tail
  1190. .align 32
  1191. .Lmulx_scatter:
  1192. movq ($out), %rdx # pass b[0]
  1193. call __rsaz_512_mulx
  1194. movq %xmm0, $out
  1195. movq %xmm1, %rbp
  1196. movq 128(%rsp), %rdx # pull $n0
  1197. movq (%rsp), %r8
  1198. movq 8(%rsp), %r9
  1199. movq 16(%rsp), %r10
  1200. movq 24(%rsp), %r11
  1201. movq 32(%rsp), %r12
  1202. movq 40(%rsp), %r13
  1203. movq 48(%rsp), %r14
  1204. movq 56(%rsp), %r15
  1205. call __rsaz_512_reducex
  1206. .Lmul_scatter_tail:
  1207. ___
  1208. $code.=<<___;
  1209. addq 64(%rsp), %r8
  1210. adcq 72(%rsp), %r9
  1211. adcq 80(%rsp), %r10
  1212. adcq 88(%rsp), %r11
  1213. adcq 96(%rsp), %r12
  1214. adcq 104(%rsp), %r13
  1215. adcq 112(%rsp), %r14
  1216. adcq 120(%rsp), %r15
  1217. movq %xmm2, $inp
  1218. sbbq %rcx, %rcx
  1219. call __rsaz_512_subtract
  1220. movq %r8, 128*0($inp) # scatter
  1221. movq %r9, 128*1($inp)
  1222. movq %r10, 128*2($inp)
  1223. movq %r11, 128*3($inp)
  1224. movq %r12, 128*4($inp)
  1225. movq %r13, 128*5($inp)
  1226. movq %r14, 128*6($inp)
  1227. movq %r15, 128*7($inp)
  1228. leaq 128+24+48(%rsp), %rax
  1229. movq -48(%rax), %r15
  1230. movq -40(%rax), %r14
  1231. movq -32(%rax), %r13
  1232. movq -24(%rax), %r12
  1233. movq -16(%rax), %rbp
  1234. movq -8(%rax), %rbx
  1235. leaq (%rax), %rsp
  1236. .Lmul_scatter4_epilogue:
  1237. ret
  1238. .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
  1239. ___
  1240. }
  1241. {
  1242. my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
  1243. $code.=<<___;
  1244. .globl rsaz_512_mul_by_one
  1245. .type rsaz_512_mul_by_one,\@function,4
  1246. .align 32
  1247. rsaz_512_mul_by_one:
  1248. push %rbx
  1249. push %rbp
  1250. push %r12
  1251. push %r13
  1252. push %r14
  1253. push %r15
  1254. subq \$128+24, %rsp
  1255. .Lmul_by_one_body:
  1256. ___
  1257. $code.=<<___ if ($addx);
  1258. movl OPENSSL_ia32cap_P+8(%rip),%eax
  1259. ___
  1260. $code.=<<___;
  1261. movq $mod, %rbp # reassign argument
  1262. movq $n0, 128(%rsp)
  1263. movq ($inp), %r8
  1264. pxor %xmm0, %xmm0
  1265. movq 8($inp), %r9
  1266. movq 16($inp), %r10
  1267. movq 24($inp), %r11
  1268. movq 32($inp), %r12
  1269. movq 40($inp), %r13
  1270. movq 48($inp), %r14
  1271. movq 56($inp), %r15
  1272. movdqa %xmm0, (%rsp)
  1273. movdqa %xmm0, 16(%rsp)
  1274. movdqa %xmm0, 32(%rsp)
  1275. movdqa %xmm0, 48(%rsp)
  1276. movdqa %xmm0, 64(%rsp)
  1277. movdqa %xmm0, 80(%rsp)
  1278. movdqa %xmm0, 96(%rsp)
  1279. ___
  1280. $code.=<<___ if ($addx);
  1281. andl \$0x80100,%eax
  1282. cmpl \$0x80100,%eax # check for MULX and ADO/CX
  1283. je .Lby_one_callx
  1284. ___
  1285. $code.=<<___;
  1286. call __rsaz_512_reduce
  1287. ___
  1288. $code.=<<___ if ($addx);
  1289. jmp .Lby_one_tail
  1290. .align 32
  1291. .Lby_one_callx:
  1292. movq 128(%rsp), %rdx # pull $n0
  1293. call __rsaz_512_reducex
  1294. .Lby_one_tail:
  1295. ___
  1296. $code.=<<___;
  1297. movq %r8, ($out)
  1298. movq %r9, 8($out)
  1299. movq %r10, 16($out)
  1300. movq %r11, 24($out)
  1301. movq %r12, 32($out)
  1302. movq %r13, 40($out)
  1303. movq %r14, 48($out)
  1304. movq %r15, 56($out)
  1305. leaq 128+24+48(%rsp), %rax
  1306. movq -48(%rax), %r15
  1307. movq -40(%rax), %r14
  1308. movq -32(%rax), %r13
  1309. movq -24(%rax), %r12
  1310. movq -16(%rax), %rbp
  1311. movq -8(%rax), %rbx
  1312. leaq (%rax), %rsp
  1313. .Lmul_by_one_epilogue:
  1314. ret
  1315. .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
  1316. ___
  1317. }
  1318. { # __rsaz_512_reduce
  1319. #
  1320. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1321. # output: %r8-%r15
  1322. # clobbers: everything except %rbp and %rdi
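#
# For reference (an added illustration, never called): mathematically the
# routine performs eight word-wise Montgomery steps on the low 512 bits of
# the double-width product, assuming n0 = -mod^-1 mod 2^64; the caller then
# adds the saved upper half and does the masked subtraction.
use Math::BigInt;
sub _ref_mont_reduce {
	my ($lo, $mod, $n0) = @_;          # Math::BigInt low half, modulus, n0
	my $w = Math::BigInt->new(1)->blsft(64);
	for (1 .. 8) {
		my $m = ($lo % $w) * $n0 % $w; # chosen so the bottom limb cancels
		$lo = ($lo + $m * $mod) / $w;  # exact division by 2^64
	}
	return $lo;                        # congruent to lo * 2^-512 mod mod
}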
  1323. $code.=<<___;
  1324. .type __rsaz_512_reduce,\@abi-omnipotent
  1325. .align 32
  1326. __rsaz_512_reduce:
  1327. movq %r8, %rbx
  1328. imulq 128+8(%rsp), %rbx
  1329. movq 0(%rbp), %rax
  1330. movl \$8, %ecx
  1331. jmp .Lreduction_loop
  1332. .align 32
  1333. .Lreduction_loop:
  1334. mulq %rbx
  1335. movq 8(%rbp), %rax
  1336. negq %r8
  1337. movq %rdx, %r8
  1338. adcq \$0, %r8
  1339. mulq %rbx
  1340. addq %rax, %r9
  1341. movq 16(%rbp), %rax
  1342. adcq \$0, %rdx
  1343. addq %r9, %r8
  1344. movq %rdx, %r9
  1345. adcq \$0, %r9
  1346. mulq %rbx
  1347. addq %rax, %r10
  1348. movq 24(%rbp), %rax
  1349. adcq \$0, %rdx
  1350. addq %r10, %r9
  1351. movq %rdx, %r10
  1352. adcq \$0, %r10
  1353. mulq %rbx
  1354. addq %rax, %r11
  1355. movq 32(%rbp), %rax
  1356. adcq \$0, %rdx
  1357. addq %r11, %r10
  1358. movq 128+8(%rsp), %rsi
  1359. #movq %rdx, %r11
  1360. #adcq \$0, %r11
  1361. adcq \$0, %rdx
  1362. movq %rdx, %r11
  1363. mulq %rbx
  1364. addq %rax, %r12
  1365. movq 40(%rbp), %rax
  1366. adcq \$0, %rdx
  1367. imulq %r8, %rsi
  1368. addq %r12, %r11
  1369. movq %rdx, %r12
  1370. adcq \$0, %r12
  1371. mulq %rbx
  1372. addq %rax, %r13
  1373. movq 48(%rbp), %rax
  1374. adcq \$0, %rdx
  1375. addq %r13, %r12
  1376. movq %rdx, %r13
  1377. adcq \$0, %r13
  1378. mulq %rbx
  1379. addq %rax, %r14
  1380. movq 56(%rbp), %rax
  1381. adcq \$0, %rdx
  1382. addq %r14, %r13
  1383. movq %rdx, %r14
  1384. adcq \$0, %r14
  1385. mulq %rbx
  1386. movq %rsi, %rbx
  1387. addq %rax, %r15
  1388. movq 0(%rbp), %rax
  1389. adcq \$0, %rdx
  1390. addq %r15, %r14
  1391. movq %rdx, %r15
  1392. adcq \$0, %r15
  1393. decl %ecx
  1394. jne .Lreduction_loop
  1395. ret
  1396. .size __rsaz_512_reduce,.-__rsaz_512_reduce
  1397. ___
  1398. }
  1399. if ($addx) {
  1400. # __rsaz_512_reducex
  1401. #
  1402. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1403. # output: %r8-%r15
  1404. # clobbers: everything except %rbp and %rdi
  1405. $code.=<<___;
  1406. .type __rsaz_512_reducex,\@abi-omnipotent
  1407. .align 32
  1408. __rsaz_512_reducex:
  1409. #movq 128+8(%rsp), %rdx # pull $n0
  1410. imulq %r8, %rdx
  1411. xorq %rsi, %rsi # cf=0,of=0
  1412. movl \$8, %ecx
  1413. jmp .Lreduction_loopx
  1414. .align 32
  1415. .Lreduction_loopx:
  1416. mov %r8, %rbx
  1417. mulx 0(%rbp), %rax, %r8
  1418. adcx %rbx, %rax
  1419. adox %r9, %r8
  1420. mulx 8(%rbp), %rax, %r9
  1421. adcx %rax, %r8
  1422. adox %r10, %r9
  1423. mulx 16(%rbp), %rbx, %r10
  1424. adcx %rbx, %r9
  1425. adox %r11, %r10
  1426. mulx 24(%rbp), %rbx, %r11
  1427. adcx %rbx, %r10
  1428. adox %r12, %r11
  1429. .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
  1430. mov %rdx, %rax
  1431. mov %r8, %rdx
  1432. adcx %rbx, %r11
  1433. adox %r13, %r12
  1434. mulx 128+8(%rsp), %rbx, %rdx
  1435. mov %rax, %rdx
  1436. mulx 40(%rbp), %rax, %r13
  1437. adcx %rax, %r12
  1438. adox %r14, %r13
  1439. .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
  1440. adcx %rax, %r13
  1441. adox %r15, %r14
  1442. mulx 56(%rbp), %rax, %r15
  1443. mov %rbx, %rdx
  1444. adcx %rax, %r14
  1445. adox %rsi, %r15 # %rsi is 0
  1446. adcx %rsi, %r15 # cf=0
  1447. decl %ecx # of=0
  1448. jne .Lreduction_loopx
  1449. ret
  1450. .size __rsaz_512_reducex,.-__rsaz_512_reducex
  1451. ___
  1452. }
  1453. { # __rsaz_512_subtract
  1454. # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
  1455. # output:
  1456. # clobbers: everything but %rdi, %rsi and %rbp
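#
# Reference model of the masked final subtraction (illustration only, not
# called by this script): with the mask all-ones, adding (2^512 - mod) folds
# to r - mod; with the mask zero it leaves r untouched; and the same
# instruction sequence runs either way, so the choice never shows up as a
# branch or a differing memory access.
use Math::BigInt;
sub _ref_masked_subtract {
	my ($r, $mod, $borrow) = @_;       # Math::BigInt value and modulus, $borrow 0 or 1
	my $w    = Math::BigInt->new(1)->blsft(512);
	my $mask = $borrow ? $w->copy->bdec() : Math::BigInt->bzero();
	return ($r + (($w - $mod) & $mask)) % $w;
}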
  1457. $code.=<<___;
  1458. .type __rsaz_512_subtract,\@abi-omnipotent
  1459. .align 32
  1460. __rsaz_512_subtract:
  1461. movq %r8, ($out)
  1462. movq %r9, 8($out)
  1463. movq %r10, 16($out)
  1464. movq %r11, 24($out)
  1465. movq %r12, 32($out)
  1466. movq %r13, 40($out)
  1467. movq %r14, 48($out)
  1468. movq %r15, 56($out)
  1469. movq 0($mod), %r8
  1470. movq 8($mod), %r9
  1471. negq %r8
  1472. notq %r9
  1473. andq %rcx, %r8
  1474. movq 16($mod), %r10
  1475. andq %rcx, %r9
  1476. notq %r10
  1477. movq 24($mod), %r11
  1478. andq %rcx, %r10
  1479. notq %r11
  1480. movq 32($mod), %r12
  1481. andq %rcx, %r11
  1482. notq %r12
  1483. movq 40($mod), %r13
  1484. andq %rcx, %r12
  1485. notq %r13
  1486. movq 48($mod), %r14
  1487. andq %rcx, %r13
  1488. notq %r14
  1489. movq 56($mod), %r15
  1490. andq %rcx, %r14
  1491. notq %r15
  1492. andq %rcx, %r15
  1493. addq ($out), %r8
  1494. adcq 8($out), %r9
  1495. adcq 16($out), %r10
  1496. adcq 24($out), %r11
  1497. adcq 32($out), %r12
  1498. adcq 40($out), %r13
  1499. adcq 48($out), %r14
  1500. adcq 56($out), %r15
  1501. movq %r8, ($out)
  1502. movq %r9, 8($out)
  1503. movq %r10, 16($out)
  1504. movq %r11, 24($out)
  1505. movq %r12, 32($out)
  1506. movq %r13, 40($out)
  1507. movq %r14, 48($out)
  1508. movq %r15, 56($out)
  1509. ret
  1510. .size __rsaz_512_subtract,.-__rsaz_512_subtract
  1511. ___
  1512. }
  1513. { # __rsaz_512_mul
  1514. #
  1515. # input: %rsi - ap, %rbp - bp
  1516. # output:
  1517. # clobbers: everything
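#
# Reference model (an illustrative addition, never called): the unrolled code
# below is plain operand-scanning schoolbook multiplication, one pass per limb
# of b accumulating into a 16-limb product.
use Math::BigInt;
sub _ref_mul_8x8 {
	my ($a, $b) = @_;                  # refs to 8 Math::BigInt limbs each, little-endian
	my $w = Math::BigInt->new(1)->blsft(64);
	my @t = map { Math::BigInt->bzero() } 0 .. 15;
	for my $i (0 .. 7) {
		my $carry = Math::BigInt->bzero();
		for my $j (0 .. 7) {
			my $acc = $t[$i + $j] + $a->[$j] * $b->[$i] + $carry;
			$t[$i + $j] = $acc % $w;   # keep the low 64 bits in place
			$carry      = $acc / $w;   # propagate the high 64 bits
		}
		$t[$i + 8] = $carry;
	}
	return @t;                         # 16 limbs, least significant first
}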
  1518. my ($ap,$bp) = ("%rsi","%rbp");
  1519. $code.=<<___;
  1520. .type __rsaz_512_mul,\@abi-omnipotent
  1521. .align 32
  1522. __rsaz_512_mul:
  1523. leaq 8(%rsp), %rdi
  1524. movq ($ap), %rax
  1525. mulq %rbx
  1526. movq %rax, (%rdi)
  1527. movq 8($ap), %rax
  1528. movq %rdx, %r8
  1529. mulq %rbx
  1530. addq %rax, %r8
  1531. movq 16($ap), %rax
  1532. movq %rdx, %r9
  1533. adcq \$0, %r9
  1534. mulq %rbx
  1535. addq %rax, %r9
  1536. movq 24($ap), %rax
  1537. movq %rdx, %r10
  1538. adcq \$0, %r10
  1539. mulq %rbx
  1540. addq %rax, %r10
  1541. movq 32($ap), %rax
  1542. movq %rdx, %r11
  1543. adcq \$0, %r11
  1544. mulq %rbx
  1545. addq %rax, %r11
  1546. movq 40($ap), %rax
  1547. movq %rdx, %r12
  1548. adcq \$0, %r12
  1549. mulq %rbx
  1550. addq %rax, %r12
  1551. movq 48($ap), %rax
  1552. movq %rdx, %r13
  1553. adcq \$0, %r13
  1554. mulq %rbx
  1555. addq %rax, %r13
  1556. movq 56($ap), %rax
  1557. movq %rdx, %r14
  1558. adcq \$0, %r14
  1559. mulq %rbx
  1560. addq %rax, %r14
  1561. movq ($ap), %rax
  1562. movq %rdx, %r15
  1563. adcq \$0, %r15
  1564. leaq 8($bp), $bp
  1565. leaq 8(%rdi), %rdi
  1566. movl \$7, %ecx
  1567. jmp .Loop_mul
  1568. .align 32
  1569. .Loop_mul:
  1570. movq ($bp), %rbx
  1571. mulq %rbx
  1572. addq %rax, %r8
  1573. movq 8($ap), %rax
  1574. movq %r8, (%rdi)
  1575. movq %rdx, %r8
  1576. adcq \$0, %r8
  1577. mulq %rbx
  1578. addq %rax, %r9
  1579. movq 16($ap), %rax
  1580. adcq \$0, %rdx
  1581. addq %r9, %r8
  1582. movq %rdx, %r9
  1583. adcq \$0, %r9
  1584. mulq %rbx
  1585. addq %rax, %r10
  1586. movq 24($ap), %rax
  1587. adcq \$0, %rdx
  1588. addq %r10, %r9
  1589. movq %rdx, %r10
  1590. adcq \$0, %r10
  1591. mulq %rbx
  1592. addq %rax, %r11
  1593. movq 32($ap), %rax
  1594. adcq \$0, %rdx
  1595. addq %r11, %r10
  1596. movq %rdx, %r11
  1597. adcq \$0, %r11
  1598. mulq %rbx
  1599. addq %rax, %r12
  1600. movq 40($ap), %rax
  1601. adcq \$0, %rdx
  1602. addq %r12, %r11
  1603. movq %rdx, %r12
  1604. adcq \$0, %r12
  1605. mulq %rbx
  1606. addq %rax, %r13
  1607. movq 48($ap), %rax
  1608. adcq \$0, %rdx
  1609. addq %r13, %r12
  1610. movq %rdx, %r13
  1611. adcq \$0, %r13
  1612. mulq %rbx
  1613. addq %rax, %r14
  1614. movq 56($ap), %rax
  1615. adcq \$0, %rdx
  1616. addq %r14, %r13
  1617. movq %rdx, %r14
  1618. leaq 8($bp), $bp
  1619. adcq \$0, %r14
  1620. mulq %rbx
  1621. addq %rax, %r15
  1622. movq ($ap), %rax
  1623. adcq \$0, %rdx
  1624. addq %r15, %r14
  1625. movq %rdx, %r15
  1626. adcq \$0, %r15
  1627. leaq 8(%rdi), %rdi
  1628. decl %ecx
  1629. jnz .Loop_mul
  1630. movq %r8, (%rdi)
  1631. movq %r9, 8(%rdi)
  1632. movq %r10, 16(%rdi)
  1633. movq %r11, 24(%rdi)
  1634. movq %r12, 32(%rdi)
  1635. movq %r13, 40(%rdi)
  1636. movq %r14, 48(%rdi)
  1637. movq %r15, 56(%rdi)
  1638. ret
  1639. .size __rsaz_512_mul,.-__rsaz_512_mul
  1640. ___
  1641. }
  1642. if ($addx) {
  1643. # __rsaz_512_mulx
  1644. #
  1645. # input: %rsi - ap, %rbp - bp
  1646. # output:
  1647. # clobbers: everything
  1648. my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
  1649. $code.=<<___;
  1650. .type __rsaz_512_mulx,\@abi-omnipotent
  1651. .align 32
  1652. __rsaz_512_mulx:
  1653. mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
  1654. mov \$-6, %rcx
  1655. mulx 8($ap), %rax, %r9
  1656. movq %rbx, 8(%rsp)
  1657. mulx 16($ap), %rbx, %r10
  1658. adc %rax, %r8
  1659. mulx 24($ap), %rax, %r11
  1660. adc %rbx, %r9
  1661. mulx 32($ap), %rbx, %r12
  1662. adc %rax, %r10
  1663. mulx 40($ap), %rax, %r13
  1664. adc %rbx, %r11
  1665. mulx 48($ap), %rbx, %r14
  1666. adc %rax, %r12
  1667. mulx 56($ap), %rax, %r15
  1668. mov 8($bp), %rdx
  1669. adc %rbx, %r13
  1670. adc %rax, %r14
  1671. adc \$0, %r15
  1672. xor $zero, $zero # cf=0,of=0
  1673. jmp .Loop_mulx
  1674. .align 32
  1675. .Loop_mulx:
  1676. movq %r8, %rbx
  1677. mulx ($ap), %rax, %r8
  1678. adcx %rax, %rbx
  1679. adox %r9, %r8
  1680. mulx 8($ap), %rax, %r9
  1681. adcx %rax, %r8
  1682. adox %r10, %r9
  1683. mulx 16($ap), %rax, %r10
  1684. adcx %rax, %r9
  1685. adox %r11, %r10
  1686. mulx 24($ap), %rax, %r11
  1687. adcx %rax, %r10
  1688. adox %r12, %r11
  1689. .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
  1690. adcx %rax, %r11
  1691. adox %r13, %r12
  1692. mulx 40($ap), %rax, %r13
  1693. adcx %rax, %r12
  1694. adox %r14, %r13
  1695. mulx 48($ap), %rax, %r14
  1696. adcx %rax, %r13
  1697. adox %r15, %r14
  1698. mulx 56($ap), %rax, %r15
  1699. movq 64($bp,%rcx,8), %rdx
  1700. movq %rbx, 8+64-8(%rsp,%rcx,8)
  1701. adcx %rax, %r14
  1702. adox $zero, %r15
  1703. adcx $zero, %r15 # cf=0
  1704. inc %rcx # of=0
  1705. jnz .Loop_mulx
  1706. movq %r8, %rbx
  1707. mulx ($ap), %rax, %r8
  1708. adcx %rax, %rbx
  1709. adox %r9, %r8
  1710. .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
  1711. adcx %rax, %r8
  1712. adox %r10, %r9
  1713. .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
  1714. adcx %rax, %r9
  1715. adox %r11, %r10
  1716. mulx 24($ap), %rax, %r11
  1717. adcx %rax, %r10
  1718. adox %r12, %r11
  1719. mulx 32($ap), %rax, %r12
  1720. adcx %rax, %r11
  1721. adox %r13, %r12
  1722. mulx 40($ap), %rax, %r13
  1723. adcx %rax, %r12
  1724. adox %r14, %r13
  1725. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  1726. adcx %rax, %r13
  1727. adox %r15, %r14
  1728. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
  1729. adcx %rax, %r14
  1730. adox $zero, %r15
  1731. adcx $zero, %r15
  1732. mov %rbx, 8+64-8(%rsp)
  1733. mov %r8, 8+64(%rsp)
  1734. mov %r9, 8+64+8(%rsp)
  1735. mov %r10, 8+64+16(%rsp)
  1736. mov %r11, 8+64+24(%rsp)
  1737. mov %r12, 8+64+32(%rsp)
  1738. mov %r13, 8+64+40(%rsp)
  1739. mov %r14, 8+64+48(%rsp)
  1740. mov %r15, 8+64+56(%rsp)
  1741. ret
  1742. .size __rsaz_512_mulx,.-__rsaz_512_mulx
  1743. ___
  1744. }
  1745. {
  1746. my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
  1747. $code.=<<___;
  1748. .globl rsaz_512_scatter4
  1749. .type rsaz_512_scatter4,\@abi-omnipotent
  1750. .align 16
  1751. rsaz_512_scatter4:
  1752. leaq ($out,$power,8), $out
  1753. movl \$8, %r9d
  1754. jmp .Loop_scatter
  1755. .align 16
  1756. .Loop_scatter:
  1757. movq ($inp), %rax
  1758. leaq 8($inp), $inp
  1759. movq %rax, ($out)
  1760. leaq 128($out), $out
  1761. decl %r9d
  1762. jnz .Loop_scatter
  1763. ret
  1764. .size rsaz_512_scatter4,.-rsaz_512_scatter4
  1765. .globl rsaz_512_gather4
  1766. .type rsaz_512_gather4,\@abi-omnipotent
  1767. .align 16
  1768. rsaz_512_gather4:
  1769. ___
  1770. $code.=<<___ if ($win64);
  1771. .LSEH_begin_rsaz_512_gather4:
  1772. .byte 0x48,0x81,0xec,0xa8,0x00,0x00,0x00 # sub $0xa8,%rsp
  1773. .byte 0x0f,0x29,0x34,0x24 # movaps %xmm6,(%rsp)
  1774. .byte 0x0f,0x29,0x7c,0x24,0x10 # movaps %xmm7,0x10(%rsp)
  1775. .byte 0x44,0x0f,0x29,0x44,0x24,0x20 # movaps %xmm8,0x20(%rsp)
  1776. .byte 0x44,0x0f,0x29,0x4c,0x24,0x30 # movaps %xmm9,0x30(%rsp)
  1777. .byte 0x44,0x0f,0x29,0x54,0x24,0x40 # movaps %xmm10,0x40(%rsp)
  1778. .byte 0x44,0x0f,0x29,0x5c,0x24,0x50 # movaps %xmm11,0x50(%rsp)
  1779. .byte 0x44,0x0f,0x29,0x64,0x24,0x60 # movaps %xmm12,0x60(%rsp)
  1780. .byte 0x44,0x0f,0x29,0x6c,0x24,0x70 # movaps %xmm13,0x70(%rsp)
  1781. .byte 0x44,0x0f,0x29,0xb4,0x24,0x80,0,0,0 # movaps %xmm14,0x80(%rsp)
  1782. .byte 0x44,0x0f,0x29,0xbc,0x24,0x90,0,0,0 # movaps %xmm15,0x90(%rsp)
  1783. ___
  1784. $code.=<<___;
  1785. movd $power,%xmm8
  1786. movdqa .Linc+16(%rip),%xmm1 # 00000002000000020000000200000002
  1787. movdqa .Linc(%rip),%xmm0 # 00000001000000010000000000000000
  1788. pshufd \$0,%xmm8,%xmm8 # broadcast $power
  1789. movdqa %xmm1,%xmm7
  1790. movdqa %xmm1,%xmm2
  1791. ___
  1792. ########################################################################
  1793. # calculate mask by comparing 0..15 to $power
  1794. #
  1795. for($i=0;$i<4;$i++) {
  1796. $code.=<<___;
  1797. paddd %xmm`$i`,%xmm`$i+1`
  1798. pcmpeqd %xmm8,%xmm`$i`
  1799. movdqa %xmm7,%xmm`$i+3`
  1800. ___
  1801. }
  1802. for(;$i<7;$i++) {
  1803. $code.=<<___;
  1804. paddd %xmm`$i`,%xmm`$i+1`
  1805. pcmpeqd %xmm8,%xmm`$i`
  1806. ___
  1807. }
  1808. $code.=<<___;
  1809. pcmpeqd %xmm8,%xmm7
  1810. movl \$8, %r9d
  1811. jmp .Loop_gather
  1812. .align 16
  1813. .Loop_gather:
  1814. movdqa 16*0($inp),%xmm8
  1815. movdqa 16*1($inp),%xmm9
  1816. movdqa 16*2($inp),%xmm10
  1817. movdqa 16*3($inp),%xmm11
  1818. pand %xmm0,%xmm8
  1819. movdqa 16*4($inp),%xmm12
  1820. pand %xmm1,%xmm9
  1821. movdqa 16*5($inp),%xmm13
  1822. pand %xmm2,%xmm10
  1823. movdqa 16*6($inp),%xmm14
  1824. pand %xmm3,%xmm11
  1825. movdqa 16*7($inp),%xmm15
  1826. leaq 128($inp), $inp
  1827. pand %xmm4,%xmm12
  1828. pand %xmm5,%xmm13
  1829. pand %xmm6,%xmm14
  1830. pand %xmm7,%xmm15
  1831. por %xmm10,%xmm8
  1832. por %xmm11,%xmm9
  1833. por %xmm12,%xmm8
  1834. por %xmm13,%xmm9
  1835. por %xmm14,%xmm8
  1836. por %xmm15,%xmm9
  1837. por %xmm9,%xmm8
  1838. pshufd \$0x4e,%xmm8,%xmm9
  1839. por %xmm9,%xmm8
  1840. movq %xmm8,($out)
  1841. leaq 8($out), $out
  1842. decl %r9d
  1843. jnz .Loop_gather
  1844. ___
  1845. $code.=<<___ if ($win64);
  1846. movaps 0x00(%rsp),%xmm6
  1847. movaps 0x10(%rsp),%xmm7
  1848. movaps 0x20(%rsp),%xmm8
  1849. movaps 0x30(%rsp),%xmm9
  1850. movaps 0x40(%rsp),%xmm10
  1851. movaps 0x50(%rsp),%xmm11
  1852. movaps 0x60(%rsp),%xmm12
  1853. movaps 0x70(%rsp),%xmm13
  1854. movaps 0x80(%rsp),%xmm14
  1855. movaps 0x90(%rsp),%xmm15
  1856. add \$0xa8,%rsp
  1857. ___
  1858. $code.=<<___;
  1859. ret
  1860. .LSEH_end_rsaz_512_gather4:
  1861. .size rsaz_512_gather4,.-rsaz_512_gather4
  1862. .align 64
  1863. .Linc:
  1864. .long 0,0, 1,1
  1865. .long 2,2, 2,2
  1866. ___
  1867. }
  1868. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1869. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
  1870. if ($win64) {
  1871. $rec="%rcx";
  1872. $frame="%rdx";
  1873. $context="%r8";
  1874. $disp="%r9";
  1875. $code.=<<___;
  1876. .extern __imp_RtlVirtualUnwind
  1877. .type se_handler,\@abi-omnipotent
  1878. .align 16
  1879. se_handler:
  1880. push %rsi
  1881. push %rdi
  1882. push %rbx
  1883. push %rbp
  1884. push %r12
  1885. push %r13
  1886. push %r14
  1887. push %r15
  1888. pushfq
  1889. sub \$64,%rsp
  1890. mov 120($context),%rax # pull context->Rax
  1891. mov 248($context),%rbx # pull context->Rip
  1892. mov 8($disp),%rsi # disp->ImageBase
  1893. mov 56($disp),%r11 # disp->HandlerData
  1894. mov 0(%r11),%r10d # HandlerData[0]
  1895. lea (%rsi,%r10),%r10 # end of prologue label
  1896. cmp %r10,%rbx # context->Rip<end of prologue label
  1897. jb .Lcommon_seh_tail
  1898. mov 152($context),%rax # pull context->Rsp
  1899. mov 4(%r11),%r10d # HandlerData[1]
  1900. lea (%rsi,%r10),%r10 # epilogue label
  1901. cmp %r10,%rbx # context->Rip>=epilogue label
  1902. jae .Lcommon_seh_tail
  1903. lea 128+24+48(%rax),%rax
  1904. lea .Lmul_gather4_epilogue(%rip),%rbx
  1905. cmp %r10,%rbx
  1906. jne .Lse_not_in_mul_gather4
  1907. lea 0xb0(%rax),%rax
  1908. lea -48-0xa8(%rax),%rsi
  1909. lea 512($context),%rdi
  1910. mov \$20,%ecx
  1911. .long 0xa548f3fc # cld; rep movsq
  1912. .Lse_not_in_mul_gather4:
  1913. mov -8(%rax),%rbx
  1914. mov -16(%rax),%rbp
  1915. mov -24(%rax),%r12
  1916. mov -32(%rax),%r13
  1917. mov -40(%rax),%r14
  1918. mov -48(%rax),%r15
  1919. mov %rbx,144($context) # restore context->Rbx
  1920. mov %rbp,160($context) # restore context->Rbp
  1921. mov %r12,216($context) # restore context->R12
  1922. mov %r13,224($context) # restore context->R13
  1923. mov %r14,232($context) # restore context->R14
  1924. mov %r15,240($context) # restore context->R15
  1925. .Lcommon_seh_tail:
  1926. mov 8(%rax),%rdi
  1927. mov 16(%rax),%rsi
  1928. mov %rax,152($context) # restore context->Rsp
  1929. mov %rsi,168($context) # restore context->Rsi
  1930. mov %rdi,176($context) # restore context->Rdi
  1931. mov 40($disp),%rdi # disp->ContextRecord
  1932. mov $context,%rsi # context
  1933. mov \$154,%ecx # sizeof(CONTEXT)
  1934. .long 0xa548f3fc # cld; rep movsq
  1935. mov $disp,%rsi
  1936. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1937. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1938. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1939. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1940. mov 40(%rsi),%r10 # disp->ContextRecord
  1941. lea 56(%rsi),%r11 # &disp->HandlerData
  1942. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1943. mov %r10,32(%rsp) # arg5
  1944. mov %r11,40(%rsp) # arg6
  1945. mov %r12,48(%rsp) # arg7
  1946. mov %rcx,56(%rsp) # arg8, (NULL)
  1947. call *__imp_RtlVirtualUnwind(%rip)
  1948. mov \$1,%eax # ExceptionContinueSearch
  1949. add \$64,%rsp
  1950. popfq
  1951. pop %r15
  1952. pop %r14
  1953. pop %r13
  1954. pop %r12
  1955. pop %rbp
  1956. pop %rbx
  1957. pop %rdi
  1958. pop %rsi
  1959. ret
  1960. .size se_handler,.-se_handler
  1961. .section .pdata
  1962. .align 4
  1963. .rva .LSEH_begin_rsaz_512_sqr
  1964. .rva .LSEH_end_rsaz_512_sqr
  1965. .rva .LSEH_info_rsaz_512_sqr
  1966. .rva .LSEH_begin_rsaz_512_mul
  1967. .rva .LSEH_end_rsaz_512_mul
  1968. .rva .LSEH_info_rsaz_512_mul
  1969. .rva .LSEH_begin_rsaz_512_mul_gather4
  1970. .rva .LSEH_end_rsaz_512_mul_gather4
  1971. .rva .LSEH_info_rsaz_512_mul_gather4
  1972. .rva .LSEH_begin_rsaz_512_mul_scatter4
  1973. .rva .LSEH_end_rsaz_512_mul_scatter4
  1974. .rva .LSEH_info_rsaz_512_mul_scatter4
  1975. .rva .LSEH_begin_rsaz_512_mul_by_one
  1976. .rva .LSEH_end_rsaz_512_mul_by_one
  1977. .rva .LSEH_info_rsaz_512_mul_by_one
  1978. .rva .LSEH_begin_rsaz_512_gather4
  1979. .rva .LSEH_end_rsaz_512_gather4
  1980. .rva .LSEH_info_rsaz_512_gather4
  1981. .section .xdata
  1982. .align 8
  1983. .LSEH_info_rsaz_512_sqr:
  1984. .byte 9,0,0,0
  1985. .rva se_handler
  1986. .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
  1987. .LSEH_info_rsaz_512_mul:
  1988. .byte 9,0,0,0
  1989. .rva se_handler
  1990. .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
  1991. .LSEH_info_rsaz_512_mul_gather4:
  1992. .byte 9,0,0,0
  1993. .rva se_handler
  1994. .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
  1995. .LSEH_info_rsaz_512_mul_scatter4:
  1996. .byte 9,0,0,0
  1997. .rva se_handler
  1998. .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
  1999. .LSEH_info_rsaz_512_mul_by_one:
  2000. .byte 9,0,0,0
  2001. .rva se_handler
  2002. .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
  2003. .LSEH_info_rsaz_512_gather4:
  2004. .byte 0x01,0x46,0x16,0x00
  2005. .byte 0x46,0xf8,0x09,0x00 # vmovaps 0x90(rsp),xmm15
  2006. .byte 0x3d,0xe8,0x08,0x00 # vmovaps 0x80(rsp),xmm14
  2007. .byte 0x34,0xd8,0x07,0x00 # vmovaps 0x70(rsp),xmm13
  2008. .byte 0x2e,0xc8,0x06,0x00 # vmovaps 0x60(rsp),xmm12
  2009. .byte 0x28,0xb8,0x05,0x00 # vmovaps 0x50(rsp),xmm11
  2010. .byte 0x22,0xa8,0x04,0x00 # vmovaps 0x40(rsp),xmm10
  2011. .byte 0x1c,0x98,0x03,0x00 # vmovaps 0x30(rsp),xmm9
  2012. .byte 0x16,0x88,0x02,0x00 # vmovaps 0x20(rsp),xmm8
  2013. .byte 0x10,0x78,0x01,0x00 # vmovaps 0x10(rsp),xmm7
  2014. .byte 0x0b,0x68,0x00,0x00 # vmovaps 0x00(rsp),xmm6
  2015. .byte 0x07,0x01,0x15,0x00 # sub rsp,0xa8
  2016. ___
  2017. }
  2018. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  2019. print $code;
  2020. close STDOUT;