  1. #!/usr/bin/env perl
  2. ##############################################################################
  3. # #
  4. # Copyright (c) 2012, Intel Corporation #
  5. # #
  6. # All rights reserved. #
  7. # #
  8. # Redistribution and use in source and binary forms, with or without #
  9. # modification, are permitted provided that the following conditions are #
  10. # met: #
  11. # #
  12. # * Redistributions of source code must retain the above copyright #
  13. # notice, this list of conditions and the following disclaimer. #
  14. # #
  15. # * Redistributions in binary form must reproduce the above copyright #
  16. # notice, this list of conditions and the following disclaimer in the #
  17. # documentation and/or other materials provided with the #
  18. # distribution. #
  19. # #
  20. # * Neither the name of the Intel Corporation nor the names of its #
  21. # contributors may be used to endorse or promote products derived from #
  22. # this software without specific prior written permission. #
  23. # #
  24. # #
  25. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY #
  26. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
  27. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
  28. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
  29. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
  30. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
  31. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
  32. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
  33. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
  34. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
  35. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
  36. # #
  37. ##############################################################################
  38. # Developers and authors: #
  39. # Shay Gueron (1, 2), and Vlad Krasnov (1) #
  40. # (1) Intel Architecture Group, Microprocessor and Chipset Development, #
  41. # Israel Development Center, Haifa, Israel #
  42. # (2) University of Haifa #
  43. ##############################################################################
  44. # Reference: #
  45. # [1] S. Gueron, "Efficient Software Implementations of Modular #
  46. # Exponentiation", http://eprint.iacr.org/2011/239 #
  47. # [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
  48. # IEEE Proceedings of 9th International Conference on Information #
  49. # Technology: New Generations (ITNG 2012), 821-823 (2012). #
  50. # [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
  51. # Journal of Cryptographic Engineering 2:31-43 (2012). #
  52. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
  53. # resistant 512-bit and 1024-bit modular exponentiation for optimizing #
  54. # RSA1024 and RSA2048 on x86_64 platforms", #
  55. # http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
  56. ##############################################################################
  57. # While original submission covers 512- and 1024-bit exponentiation,
  58. # this module is limited to 512-bit version only (and as such
  59. # accelerates RSA1024 sign). This is because improvement for longer
  60. # keys is not high enough to justify the effort, highest measured
  61. # was ~5% on Westmere. [This is relative to OpenSSL 1.0.2, upcoming
  62. # for the moment of this writing!] Nor does this module implement
  63. # "monolithic" complete exponentiation jumbo-subroutine, but adheres
  64. # to more modular mixture of C and assembly. And it's optimized even
  65. # for processors other than Intel Core family (see table below for
  66. # improvement coefficients).
  67. # <appro@openssl.org>
  68. #
  69. # RSA1024 sign/sec   this/original  |this/rsax(*)  this/fips(*)
  70. # ----------------+---------------------------
  71. # Opteron           +13%            |+5%           +20%
  72. # Bulldozer         -0%             |-1%           +10%
  73. # P4                +11%            |+7%           +8%
  74. # Westmere          +5%             |+14%          +17%
  75. # Sandy Bridge      +2%             |+12%          +29%
  76. # Ivy Bridge        +1%             |+11%          +35%
  77. # Haswell(**)       -0%             |+12%          +39%
  78. # Atom              +13%            |+11%          +4%
  79. # VIA Nano          +70%            |+9%           +25%
  80. #
  81. # (*) rsax engine and fips numbers are presented for reference
  82. # purposes;
  83. # (**) MULX was attempted, but found to give only marginal improvement;
  84. $flavour = shift;
  85. $output = shift;
  86. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  87. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  88. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  89. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  90. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  91. die "can't locate x86_64-xlate.pl";
  92. open OUT,"| \"$^X\" $xlate $flavour $output";
  93. *STDOUT=*OUT;
  94. # In upstream, this is controlled by shelling out to the compiler to check
  95. # versions, but BoringSSL is intended to be used with pre-generated perlasm
  96. # output, so this isn't useful anyway.
  97. #
  98. # TODO(davidben): Enable this after testing. $addx goes up to 1.
  99. $addx = 0;
  100. ($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
  101. {
  102. my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
  103. $code.=<<___;
  104. .text
  105. .extern OPENSSL_ia32cap_P
  106. .globl rsaz_512_sqr
  107. .type rsaz_512_sqr,\@function,5
  108. .align 32
  109. rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
  110. push %rbx
  111. push %rbp
  112. push %r12
  113. push %r13
  114. push %r14
  115. push %r15
  116. subq \$128+24, %rsp
  117. .Lsqr_body:
  118. movq $mod, %rbp # common argument
  119. movq ($inp), %rdx
  120. movq 8($inp), %rax
  121. movq $n0, 128(%rsp)
  122. ___
  123. $code.=<<___ if ($addx);
  124. movl \$0x80100,%r11d
  125. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  126. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  127. je .Loop_sqrx
  128. ___
  129. $code.=<<___;
  130. jmp .Loop_sqr
  131. .align 32
  132. .Loop_sqr:
  133. movl $times,128+8(%rsp)
  134. #first iteration
  135. movq %rdx, %rbx
  136. mulq %rdx
  137. movq %rax, %r8
  138. movq 16($inp), %rax
  139. movq %rdx, %r9
  140. mulq %rbx
  141. addq %rax, %r9
  142. movq 24($inp), %rax
  143. movq %rdx, %r10
  144. adcq \$0, %r10
  145. mulq %rbx
  146. addq %rax, %r10
  147. movq 32($inp), %rax
  148. movq %rdx, %r11
  149. adcq \$0, %r11
  150. mulq %rbx
  151. addq %rax, %r11
  152. movq 40($inp), %rax
  153. movq %rdx, %r12
  154. adcq \$0, %r12
  155. mulq %rbx
  156. addq %rax, %r12
  157. movq 48($inp), %rax
  158. movq %rdx, %r13
  159. adcq \$0, %r13
  160. mulq %rbx
  161. addq %rax, %r13
  162. movq 56($inp), %rax
  163. movq %rdx, %r14
  164. adcq \$0, %r14
  165. mulq %rbx
  166. addq %rax, %r14
  167. movq %rbx, %rax
  168. movq %rdx, %r15
  169. adcq \$0, %r15
  170. addq %r8, %r8 #shlq \$1, %r8
  171. movq %r9, %rcx
  172. adcq %r9, %r9 #shld \$1, %r8, %r9
  173. mulq %rax
  174. movq %rax, (%rsp)
  175. addq %rdx, %r8
  176. adcq \$0, %r9
  177. movq %r8, 8(%rsp)
  178. shrq \$63, %rcx
  179. #second iteration
  180. movq 8($inp), %r8
  181. movq 16($inp), %rax
  182. mulq %r8
  183. addq %rax, %r10
  184. movq 24($inp), %rax
  185. movq %rdx, %rbx
  186. adcq \$0, %rbx
  187. mulq %r8
  188. addq %rax, %r11
  189. movq 32($inp), %rax
  190. adcq \$0, %rdx
  191. addq %rbx, %r11
  192. movq %rdx, %rbx
  193. adcq \$0, %rbx
  194. mulq %r8
  195. addq %rax, %r12
  196. movq 40($inp), %rax
  197. adcq \$0, %rdx
  198. addq %rbx, %r12
  199. movq %rdx, %rbx
  200. adcq \$0, %rbx
  201. mulq %r8
  202. addq %rax, %r13
  203. movq 48($inp), %rax
  204. adcq \$0, %rdx
  205. addq %rbx, %r13
  206. movq %rdx, %rbx
  207. adcq \$0, %rbx
  208. mulq %r8
  209. addq %rax, %r14
  210. movq 56($inp), %rax
  211. adcq \$0, %rdx
  212. addq %rbx, %r14
  213. movq %rdx, %rbx
  214. adcq \$0, %rbx
  215. mulq %r8
  216. addq %rax, %r15
  217. movq %r8, %rax
  218. adcq \$0, %rdx
  219. addq %rbx, %r15
  220. movq %rdx, %r8
  221. movq %r10, %rdx
  222. adcq \$0, %r8
  223. add %rdx, %rdx
  224. lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
  225. movq %r11, %rbx
  226. adcq %r11, %r11 #shld \$1, %r10, %r11
  227. mulq %rax
  228. addq %rax, %r9
  229. adcq %rdx, %r10
  230. adcq \$0, %r11
  231. movq %r9, 16(%rsp)
  232. movq %r10, 24(%rsp)
  233. shrq \$63, %rbx
  234. #third iteration
  235. movq 16($inp), %r9
  236. movq 24($inp), %rax
  237. mulq %r9
  238. addq %rax, %r12
  239. movq 32($inp), %rax
  240. movq %rdx, %rcx
  241. adcq \$0, %rcx
  242. mulq %r9
  243. addq %rax, %r13
  244. movq 40($inp), %rax
  245. adcq \$0, %rdx
  246. addq %rcx, %r13
  247. movq %rdx, %rcx
  248. adcq \$0, %rcx
  249. mulq %r9
  250. addq %rax, %r14
  251. movq 48($inp), %rax
  252. adcq \$0, %rdx
  253. addq %rcx, %r14
  254. movq %rdx, %rcx
  255. adcq \$0, %rcx
  256. mulq %r9
  257. movq %r12, %r10
  258. lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
  259. addq %rax, %r15
  260. movq 56($inp), %rax
  261. adcq \$0, %rdx
  262. addq %rcx, %r15
  263. movq %rdx, %rcx
  264. adcq \$0, %rcx
  265. mulq %r9
  266. shrq \$63, %r10
  267. addq %rax, %r8
  268. movq %r9, %rax
  269. adcq \$0, %rdx
  270. addq %rcx, %r8
  271. movq %rdx, %r9
  272. adcq \$0, %r9
  273. movq %r13, %rcx
  274. leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
  275. mulq %rax
  276. addq %rax, %r11
  277. adcq %rdx, %r12
  278. adcq \$0, %r13
  279. movq %r11, 32(%rsp)
  280. movq %r12, 40(%rsp)
  281. shrq \$63, %rcx
  282. #fourth iteration
  283. movq 24($inp), %r10
  284. movq 32($inp), %rax
  285. mulq %r10
  286. addq %rax, %r14
  287. movq 40($inp), %rax
  288. movq %rdx, %rbx
  289. adcq \$0, %rbx
  290. mulq %r10
  291. addq %rax, %r15
  292. movq 48($inp), %rax
  293. adcq \$0, %rdx
  294. addq %rbx, %r15
  295. movq %rdx, %rbx
  296. adcq \$0, %rbx
  297. mulq %r10
  298. movq %r14, %r12
  299. leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
  300. addq %rax, %r8
  301. movq 56($inp), %rax
  302. adcq \$0, %rdx
  303. addq %rbx, %r8
  304. movq %rdx, %rbx
  305. adcq \$0, %rbx
  306. mulq %r10
  307. shrq \$63, %r12
  308. addq %rax, %r9
  309. movq %r10, %rax
  310. adcq \$0, %rdx
  311. addq %rbx, %r9
  312. movq %rdx, %r10
  313. adcq \$0, %r10
  314. movq %r15, %rbx
  315. leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
  316. mulq %rax
  317. addq %rax, %r13
  318. adcq %rdx, %r14
  319. adcq \$0, %r15
  320. movq %r13, 48(%rsp)
  321. movq %r14, 56(%rsp)
  322. shrq \$63, %rbx
  323. #fifth iteration
  324. movq 32($inp), %r11
  325. movq 40($inp), %rax
  326. mulq %r11
  327. addq %rax, %r8
  328. movq 48($inp), %rax
  329. movq %rdx, %rcx
  330. adcq \$0, %rcx
  331. mulq %r11
  332. addq %rax, %r9
  333. movq 56($inp), %rax
  334. adcq \$0, %rdx
  335. movq %r8, %r12
  336. leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
  337. addq %rcx, %r9
  338. movq %rdx, %rcx
  339. adcq \$0, %rcx
  340. mulq %r11
  341. shrq \$63, %r12
  342. addq %rax, %r10
  343. movq %r11, %rax
  344. adcq \$0, %rdx
  345. addq %rcx, %r10
  346. movq %rdx, %r11
  347. adcq \$0, %r11
  348. movq %r9, %rcx
  349. leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
  350. mulq %rax
  351. addq %rax, %r15
  352. adcq %rdx, %r8
  353. adcq \$0, %r9
  354. movq %r15, 64(%rsp)
  355. movq %r8, 72(%rsp)
  356. shrq \$63, %rcx
  357. #sixth iteration
  358. movq 40($inp), %r12
  359. movq 48($inp), %rax
  360. mulq %r12
  361. addq %rax, %r10
  362. movq 56($inp), %rax
  363. movq %rdx, %rbx
  364. adcq \$0, %rbx
  365. mulq %r12
  366. addq %rax, %r11
  367. movq %r12, %rax
  368. movq %r10, %r15
  369. leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
  370. adcq \$0, %rdx
  371. shrq \$63, %r15
  372. addq %rbx, %r11
  373. movq %rdx, %r12
  374. adcq \$0, %r12
  375. movq %r11, %rbx
  376. leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
  377. mulq %rax
  378. addq %rax, %r9
  379. adcq %rdx, %r10
  380. adcq \$0, %r11
  381. movq %r9, 80(%rsp)
  382. movq %r10, 88(%rsp)
  383. #seventh iteration
  384. movq 48($inp), %r13
  385. movq 56($inp), %rax
  386. mulq %r13
  387. addq %rax, %r12
  388. movq %r13, %rax
  389. movq %rdx, %r13
  390. adcq \$0, %r13
  391. xorq %r14, %r14
  392. shlq \$1, %rbx
  393. adcq %r12, %r12 #shld \$1, %rbx, %r12
  394. adcq %r13, %r13 #shld \$1, %r12, %r13
  395. adcq %r14, %r14 #shld \$1, %r13, %r14
  396. mulq %rax
  397. addq %rax, %r11
  398. adcq %rdx, %r12
  399. adcq \$0, %r13
  400. movq %r11, 96(%rsp)
  401. movq %r12, 104(%rsp)
  402. #eighth iteration
  403. movq 56($inp), %rax
  404. mulq %rax
  405. addq %rax, %r13
  406. adcq \$0, %rdx
  407. addq %rdx, %r14
  408. movq %r13, 112(%rsp)
  409. movq %r14, 120(%rsp)
  410. movq (%rsp), %r8
  411. movq 8(%rsp), %r9
  412. movq 16(%rsp), %r10
  413. movq 24(%rsp), %r11
  414. movq 32(%rsp), %r12
  415. movq 40(%rsp), %r13
  416. movq 48(%rsp), %r14
  417. movq 56(%rsp), %r15
  418. call __rsaz_512_reduce
  419. addq 64(%rsp), %r8
  420. adcq 72(%rsp), %r9
  421. adcq 80(%rsp), %r10
  422. adcq 88(%rsp), %r11
  423. adcq 96(%rsp), %r12
  424. adcq 104(%rsp), %r13
  425. adcq 112(%rsp), %r14
  426. adcq 120(%rsp), %r15
  427. sbbq %rcx, %rcx
  428. call __rsaz_512_subtract
  429. movq %r8, %rdx
  430. movq %r9, %rax
  431. movl 128+8(%rsp), $times
  432. movq $out, $inp
  433. decl $times
  434. jnz .Loop_sqr
  435. ___
  436. if ($addx) {
  437. $code.=<<___;
  438. jmp .Lsqr_tail
  439. .align 32
  440. .Loop_sqrx:
  441. movl $times,128+8(%rsp)
  442. movq $out, %xmm0 # off-load
  443. movq %rbp, %xmm1 # off-load
  444. #first iteration
  445. mulx %rax, %r8, %r9
  446. mulx 16($inp), %rcx, %r10
  447. xor %rbp, %rbp # cf=0, of=0
  448. mulx 24($inp), %rax, %r11
  449. adcx %rcx, %r9
  450. mulx 32($inp), %rcx, %r12
  451. adcx %rax, %r10
  452. mulx 40($inp), %rax, %r13
  453. adcx %rcx, %r11
  454. .byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
  455. adcx %rax, %r12
  456. adcx %rcx, %r13
  457. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
  458. adcx %rax, %r14
  459. adcx %rbp, %r15 # %rbp is 0
  460. mov %r9, %rcx
  461. shld \$1, %r8, %r9
  462. shl \$1, %r8
  463. xor %ebp, %ebp
  464. mulx %rdx, %rax, %rdx
  465. adcx %rdx, %r8
  466. mov 8($inp), %rdx
  467. adcx %rbp, %r9
  468. mov %rax, (%rsp)
  469. mov %r8, 8(%rsp)
  470. #second iteration
  471. mulx 16($inp), %rax, %rbx
  472. adox %rax, %r10
  473. adcx %rbx, %r11
  474. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
  475. adox $out, %r11
  476. adcx %r8, %r12
  477. mulx 32($inp), %rax, %rbx
  478. adox %rax, %r12
  479. adcx %rbx, %r13
  480. mulx 40($inp), $out, %r8
  481. adox $out, %r13
  482. adcx %r8, %r14
  483. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  484. adox %rax, %r14
  485. adcx %rbx, %r15
  486. .byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
  487. adox $out, %r15
  488. adcx %rbp, %r8
  489. adox %rbp, %r8
  490. mov %r11, %rbx
  491. shld \$1, %r10, %r11
  492. shld \$1, %rcx, %r10
  493. xor %ebp,%ebp
  494. mulx %rdx, %rax, %rcx
  495. mov 16($inp), %rdx
  496. adcx %rax, %r9
  497. adcx %rcx, %r10
  498. adcx %rbp, %r11
  499. mov %r9, 16(%rsp)
  500. .byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
  501. #third iteration
  502. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
  503. adox $out, %r12
  504. adcx %r9, %r13
  505. mulx 32($inp), %rax, %rcx
  506. adox %rax, %r13
  507. adcx %rcx, %r14
  508. mulx 40($inp), $out, %r9
  509. adox $out, %r14
  510. adcx %r9, %r15
  511. .byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
  512. adox %rax, %r15
  513. adcx %rcx, %r8
  514. .byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
  515. adox $out, %r8
  516. adcx %rbp, %r9
  517. adox %rbp, %r9
  518. mov %r13, %rcx
  519. shld \$1, %r12, %r13
  520. shld \$1, %rbx, %r12
  521. xor %ebp, %ebp
  522. mulx %rdx, %rax, %rdx
  523. adcx %rax, %r11
  524. adcx %rdx, %r12
  525. mov 24($inp), %rdx
  526. adcx %rbp, %r13
  527. mov %r11, 32(%rsp)
  528. .byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
  529. #fourth iteration
  530. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
  531. adox %rax, %r14
  532. adcx %rbx, %r15
  533. mulx 40($inp), $out, %r10
  534. adox $out, %r15
  535. adcx %r10, %r8
  536. mulx 48($inp), %rax, %rbx
  537. adox %rax, %r8
  538. adcx %rbx, %r9
  539. mulx 56($inp), $out, %r10
  540. adox $out, %r9
  541. adcx %rbp, %r10
  542. adox %rbp, %r10
  543. .byte 0x66
  544. mov %r15, %rbx
  545. shld \$1, %r14, %r15
  546. shld \$1, %rcx, %r14
  547. xor %ebp, %ebp
  548. mulx %rdx, %rax, %rdx
  549. adcx %rax, %r13
  550. adcx %rdx, %r14
  551. mov 32($inp), %rdx
  552. adcx %rbp, %r15
  553. mov %r13, 48(%rsp)
  554. mov %r14, 56(%rsp)
  555. #fifth iteration
  556. .byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
  557. adox $out, %r8
  558. adcx %r11, %r9
  559. mulx 48($inp), %rax, %rcx
  560. adox %rax, %r9
  561. adcx %rcx, %r10
  562. mulx 56($inp), $out, %r11
  563. adox $out, %r10
  564. adcx %rbp, %r11
  565. adox %rbp, %r11
  566. mov %r9, %rcx
  567. shld \$1, %r8, %r9
  568. shld \$1, %rbx, %r8
  569. xor %ebp, %ebp
  570. mulx %rdx, %rax, %rdx
  571. adcx %rax, %r15
  572. adcx %rdx, %r8
  573. mov 40($inp), %rdx
  574. adcx %rbp, %r9
  575. mov %r15, 64(%rsp)
  576. mov %r8, 72(%rsp)
  577. #sixth iteration
  578. .byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
  579. adox %rax, %r10
  580. adcx %rbx, %r11
  581. .byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
  582. adox $out, %r11
  583. adcx %rbp, %r12
  584. adox %rbp, %r12
  585. mov %r11, %rbx
  586. shld \$1, %r10, %r11
  587. shld \$1, %rcx, %r10
  588. xor %ebp, %ebp
  589. mulx %rdx, %rax, %rdx
  590. adcx %rax, %r9
  591. adcx %rdx, %r10
  592. mov 48($inp), %rdx
  593. adcx %rbp, %r11
  594. mov %r9, 80(%rsp)
  595. mov %r10, 88(%rsp)
  596. #seventh iteration
  597. .byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
  598. adox %rax, %r12
  599. adox %rbp, %r13
  600. xor %r14, %r14
  601. shld \$1, %r13, %r14
  602. shld \$1, %r12, %r13
  603. shld \$1, %rbx, %r12
  604. xor %ebp, %ebp
  605. mulx %rdx, %rax, %rdx
  606. adcx %rax, %r11
  607. adcx %rdx, %r12
  608. mov 56($inp), %rdx
  609. adcx %rbp, %r13
  610. .byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
  611. .byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
  612. #eighth iteration
  613. mulx %rdx, %rax, %rdx
  614. adox %rax, %r13
  615. adox %rbp, %rdx
  616. .byte 0x66
  617. add %rdx, %r14
  618. movq %r13, 112(%rsp)
  619. movq %r14, 120(%rsp)
  620. movq %xmm0, $out
  621. movq %xmm1, %rbp
  622. movq 128(%rsp), %rdx # pull $n0
  623. movq (%rsp), %r8
  624. movq 8(%rsp), %r9
  625. movq 16(%rsp), %r10
  626. movq 24(%rsp), %r11
  627. movq 32(%rsp), %r12
  628. movq 40(%rsp), %r13
  629. movq 48(%rsp), %r14
  630. movq 56(%rsp), %r15
  631. call __rsaz_512_reducex
  632. addq 64(%rsp), %r8
  633. adcq 72(%rsp), %r9
  634. adcq 80(%rsp), %r10
  635. adcq 88(%rsp), %r11
  636. adcq 96(%rsp), %r12
  637. adcq 104(%rsp), %r13
  638. adcq 112(%rsp), %r14
  639. adcq 120(%rsp), %r15
  640. sbbq %rcx, %rcx
  641. call __rsaz_512_subtract
  642. movq %r8, %rdx
  643. movq %r9, %rax
  644. movl 128+8(%rsp), $times
  645. movq $out, $inp
  646. decl $times
  647. jnz .Loop_sqrx
  648. .Lsqr_tail:
  649. ___
  650. }
  651. $code.=<<___;
  652. leaq 128+24+48(%rsp), %rax
  653. movq -48(%rax), %r15
  654. movq -40(%rax), %r14
  655. movq -32(%rax), %r13
  656. movq -24(%rax), %r12
  657. movq -16(%rax), %rbp
  658. movq -8(%rax), %rbx
  659. leaq (%rax), %rsp
  660. .Lsqr_epilogue:
  661. ret
  662. .size rsaz_512_sqr,.-rsaz_512_sqr
  663. ___
  664. }
  665. {
  666. my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
  667. $code.=<<___;
  668. .globl rsaz_512_mul
  669. .type rsaz_512_mul,\@function,5
  670. .align 32
  671. rsaz_512_mul:
  672. push %rbx
  673. push %rbp
  674. push %r12
  675. push %r13
  676. push %r14
  677. push %r15
  678. subq \$128+24, %rsp
  679. .Lmul_body:
  680. movq $out, %xmm0 # off-load arguments
  681. movq $mod, %xmm1
  682. movq $n0, 128(%rsp)
  683. ___
  684. $code.=<<___ if ($addx);
  685. movl \$0x80100,%r11d
  686. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  687. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  688. je .Lmulx
  689. ___
  690. $code.=<<___;
  691. movq ($bp), %rbx # pass b[0]
  692. movq $bp, %rbp # pass argument
  693. call __rsaz_512_mul
  694. movq %xmm0, $out
  695. movq %xmm1, %rbp
  696. movq (%rsp), %r8
  697. movq 8(%rsp), %r9
  698. movq 16(%rsp), %r10
  699. movq 24(%rsp), %r11
  700. movq 32(%rsp), %r12
  701. movq 40(%rsp), %r13
  702. movq 48(%rsp), %r14
  703. movq 56(%rsp), %r15
  704. call __rsaz_512_reduce
  705. ___
  706. $code.=<<___ if ($addx);
  707. jmp .Lmul_tail
  708. .align 32
  709. .Lmulx:
  710. movq $bp, %rbp # pass argument
  711. movq ($bp), %rdx # pass b[0]
  712. call __rsaz_512_mulx
  713. movq %xmm0, $out
  714. movq %xmm1, %rbp
  715. movq 128(%rsp), %rdx # pull $n0
  716. movq (%rsp), %r8
  717. movq 8(%rsp), %r9
  718. movq 16(%rsp), %r10
  719. movq 24(%rsp), %r11
  720. movq 32(%rsp), %r12
  721. movq 40(%rsp), %r13
  722. movq 48(%rsp), %r14
  723. movq 56(%rsp), %r15
  724. call __rsaz_512_reducex
  725. .Lmul_tail:
  726. ___
  727. $code.=<<___;
  728. addq 64(%rsp), %r8
  729. adcq 72(%rsp), %r9
  730. adcq 80(%rsp), %r10
  731. adcq 88(%rsp), %r11
  732. adcq 96(%rsp), %r12
  733. adcq 104(%rsp), %r13
  734. adcq 112(%rsp), %r14
  735. adcq 120(%rsp), %r15
  736. sbbq %rcx, %rcx
  737. call __rsaz_512_subtract
  738. leaq 128+24+48(%rsp), %rax
  739. movq -48(%rax), %r15
  740. movq -40(%rax), %r14
  741. movq -32(%rax), %r13
  742. movq -24(%rax), %r12
  743. movq -16(%rax), %rbp
  744. movq -8(%rax), %rbx
  745. leaq (%rax), %rsp
  746. .Lmul_epilogue:
  747. ret
  748. .size rsaz_512_mul,.-rsaz_512_mul
  749. ___
  750. }
  751. {
  752. my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  753. $code.=<<___;
  754. .globl rsaz_512_mul_gather4
  755. .type rsaz_512_mul_gather4,\@function,6
  756. .align 32
  757. rsaz_512_mul_gather4:
  758. push %rbx
  759. push %rbp
  760. push %r12
  761. push %r13
  762. push %r14
  763. push %r15
  764. mov $pwr, $pwr
  765. subq \$128+24, %rsp
  766. .Lmul_gather4_body:
  767. ___
  768. $code.=<<___ if ($addx);
  769. movl \$0x80100,%r11d
  770. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  771. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  772. je .Lmulx_gather
  773. ___
  774. $code.=<<___;
  775. movl 64($bp,$pwr,4), %eax
  776. movq $out, %xmm0 # off-load arguments
  777. movl ($bp,$pwr,4), %ebx
  778. movq $mod, %xmm1
  779. movq $n0, 128(%rsp)
  780. shlq \$32, %rax
  781. or %rax, %rbx
  782. movq ($ap), %rax
  783. movq 8($ap), %rcx
  784. leaq 128($bp,$pwr,4), %rbp
  785. mulq %rbx # 0 iteration
  786. movq %rax, (%rsp)
  787. movq %rcx, %rax
  788. movq %rdx, %r8
  789. mulq %rbx
  790. movd (%rbp), %xmm4
  791. addq %rax, %r8
  792. movq 16($ap), %rax
  793. movq %rdx, %r9
  794. adcq \$0, %r9
  795. mulq %rbx
  796. movd 64(%rbp), %xmm5
  797. addq %rax, %r9
  798. movq 24($ap), %rax
  799. movq %rdx, %r10
  800. adcq \$0, %r10
  801. mulq %rbx
  802. pslldq \$4, %xmm5
  803. addq %rax, %r10
  804. movq 32($ap), %rax
  805. movq %rdx, %r11
  806. adcq \$0, %r11
  807. mulq %rbx
  808. por %xmm5, %xmm4
  809. addq %rax, %r11
  810. movq 40($ap), %rax
  811. movq %rdx, %r12
  812. adcq \$0, %r12
  813. mulq %rbx
  814. addq %rax, %r12
  815. movq 48($ap), %rax
  816. movq %rdx, %r13
  817. adcq \$0, %r13
  818. mulq %rbx
  819. leaq 128(%rbp), %rbp
  820. addq %rax, %r13
  821. movq 56($ap), %rax
  822. movq %rdx, %r14
  823. adcq \$0, %r14
  824. mulq %rbx
  825. movq %xmm4, %rbx
  826. addq %rax, %r14
  827. movq ($ap), %rax
  828. movq %rdx, %r15
  829. adcq \$0, %r15
  830. leaq 8(%rsp), %rdi
  831. movl \$7, %ecx
  832. jmp .Loop_mul_gather
  833. .align 32
  834. .Loop_mul_gather:
  835. mulq %rbx
  836. addq %rax, %r8
  837. movq 8($ap), %rax
  838. movq %r8, (%rdi)
  839. movq %rdx, %r8
  840. adcq \$0, %r8
  841. mulq %rbx
  842. movd (%rbp), %xmm4
  843. addq %rax, %r9
  844. movq 16($ap), %rax
  845. adcq \$0, %rdx
  846. addq %r9, %r8
  847. movq %rdx, %r9
  848. adcq \$0, %r9
  849. mulq %rbx
  850. movd 64(%rbp), %xmm5
  851. addq %rax, %r10
  852. movq 24($ap), %rax
  853. adcq \$0, %rdx
  854. addq %r10, %r9
  855. movq %rdx, %r10
  856. adcq \$0, %r10
  857. mulq %rbx
  858. pslldq \$4, %xmm5
  859. addq %rax, %r11
  860. movq 32($ap), %rax
  861. adcq \$0, %rdx
  862. addq %r11, %r10
  863. movq %rdx, %r11
  864. adcq \$0, %r11
  865. mulq %rbx
  866. por %xmm5, %xmm4
  867. addq %rax, %r12
  868. movq 40($ap), %rax
  869. adcq \$0, %rdx
  870. addq %r12, %r11
  871. movq %rdx, %r12
  872. adcq \$0, %r12
  873. mulq %rbx
  874. addq %rax, %r13
  875. movq 48($ap), %rax
  876. adcq \$0, %rdx
  877. addq %r13, %r12
  878. movq %rdx, %r13
  879. adcq \$0, %r13
  880. mulq %rbx
  881. addq %rax, %r14
  882. movq 56($ap), %rax
  883. adcq \$0, %rdx
  884. addq %r14, %r13
  885. movq %rdx, %r14
  886. adcq \$0, %r14
  887. mulq %rbx
  888. movq %xmm4, %rbx
  889. addq %rax, %r15
  890. movq ($ap), %rax
  891. adcq \$0, %rdx
  892. addq %r15, %r14
  893. movq %rdx, %r15
  894. adcq \$0, %r15
  895. leaq 128(%rbp), %rbp
  896. leaq 8(%rdi), %rdi
  897. decl %ecx
  898. jnz .Loop_mul_gather
  899. movq %r8, (%rdi)
  900. movq %r9, 8(%rdi)
  901. movq %r10, 16(%rdi)
  902. movq %r11, 24(%rdi)
  903. movq %r12, 32(%rdi)
  904. movq %r13, 40(%rdi)
  905. movq %r14, 48(%rdi)
  906. movq %r15, 56(%rdi)
  907. movq %xmm0, $out
  908. movq %xmm1, %rbp
  909. movq (%rsp), %r8
  910. movq 8(%rsp), %r9
  911. movq 16(%rsp), %r10
  912. movq 24(%rsp), %r11
  913. movq 32(%rsp), %r12
  914. movq 40(%rsp), %r13
  915. movq 48(%rsp), %r14
  916. movq 56(%rsp), %r15
  917. call __rsaz_512_reduce
  918. ___
  919. $code.=<<___ if ($addx);
  920. jmp .Lmul_gather_tail
  921. .align 32
  922. .Lmulx_gather:
  923. mov 64($bp,$pwr,4), %eax
  924. movq $out, %xmm0 # off-load arguments
  925. lea 128($bp,$pwr,4), %rbp
  926. mov ($bp,$pwr,4), %edx
  927. movq $mod, %xmm1
  928. mov $n0, 128(%rsp)
  929. shl \$32, %rax
  930. or %rax, %rdx
  931. mulx ($ap), %rbx, %r8 # 0 iteration
  932. mov %rbx, (%rsp)
  933. xor %edi, %edi # cf=0, of=0
  934. mulx 8($ap), %rax, %r9
  935. movd (%rbp), %xmm4
  936. mulx 16($ap), %rbx, %r10
  937. movd 64(%rbp), %xmm5
  938. adcx %rax, %r8
  939. mulx 24($ap), %rax, %r11
  940. pslldq \$4, %xmm5
  941. adcx %rbx, %r9
  942. mulx 32($ap), %rbx, %r12
  943. por %xmm5, %xmm4
  944. adcx %rax, %r10
  945. mulx 40($ap), %rax, %r13
  946. adcx %rbx, %r11
  947. mulx 48($ap), %rbx, %r14
  948. lea 128(%rbp), %rbp
  949. adcx %rax, %r12
  950. mulx 56($ap), %rax, %r15
  951. movq %xmm4, %rdx
  952. adcx %rbx, %r13
  953. adcx %rax, %r14
  954. mov %r8, %rbx
  955. adcx %rdi, %r15 # %rdi is 0
  956. mov \$-7, %rcx
  957. jmp .Loop_mulx_gather
  958. .align 32
  959. .Loop_mulx_gather:
  960. mulx ($ap), %rax, %r8
  961. adcx %rax, %rbx
  962. adox %r9, %r8
  963. mulx 8($ap), %rax, %r9
  964. .byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
  965. adcx %rax, %r8
  966. adox %r10, %r9
  967. mulx 16($ap), %rax, %r10
  968. movd 64(%rbp), %xmm5
  969. lea 128(%rbp), %rbp
  970. adcx %rax, %r9
  971. adox %r11, %r10
  972. .byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
  973. pslldq \$4, %xmm5
  974. por %xmm5, %xmm4
  975. adcx %rax, %r10
  976. adox %r12, %r11
  977. mulx 32($ap), %rax, %r12
  978. adcx %rax, %r11
  979. adox %r13, %r12
  980. mulx 40($ap), %rax, %r13
  981. adcx %rax, %r12
  982. adox %r14, %r13
  983. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  984. adcx %rax, %r13
  985. adox %r15, %r14
  986. mulx 56($ap), %rax, %r15
  987. movq %xmm4, %rdx
  988. mov %rbx, 64(%rsp,%rcx,8)
  989. adcx %rax, %r14
  990. adox %rdi, %r15
  991. mov %r8, %rbx
  992. adcx %rdi, %r15 # cf=0
  993. inc %rcx # of=0
  994. jnz .Loop_mulx_gather
  995. mov %r8, 64(%rsp)
  996. mov %r9, 64+8(%rsp)
  997. mov %r10, 64+16(%rsp)
  998. mov %r11, 64+24(%rsp)
  999. mov %r12, 64+32(%rsp)
  1000. mov %r13, 64+40(%rsp)
  1001. mov %r14, 64+48(%rsp)
  1002. mov %r15, 64+56(%rsp)
  1003. movq %xmm0, $out
  1004. movq %xmm1, %rbp
  1005. mov 128(%rsp), %rdx # pull $n0
  1006. mov (%rsp), %r8
  1007. mov 8(%rsp), %r9
  1008. mov 16(%rsp), %r10
  1009. mov 24(%rsp), %r11
  1010. mov 32(%rsp), %r12
  1011. mov 40(%rsp), %r13
  1012. mov 48(%rsp), %r14
  1013. mov 56(%rsp), %r15
  1014. call __rsaz_512_reducex
  1015. .Lmul_gather_tail:
  1016. ___
  1017. $code.=<<___;
  1018. addq 64(%rsp), %r8
  1019. adcq 72(%rsp), %r9
  1020. adcq 80(%rsp), %r10
  1021. adcq 88(%rsp), %r11
  1022. adcq 96(%rsp), %r12
  1023. adcq 104(%rsp), %r13
  1024. adcq 112(%rsp), %r14
  1025. adcq 120(%rsp), %r15
  1026. sbbq %rcx, %rcx
  1027. call __rsaz_512_subtract
  1028. leaq 128+24+48(%rsp), %rax
  1029. movq -48(%rax), %r15
  1030. movq -40(%rax), %r14
  1031. movq -32(%rax), %r13
  1032. movq -24(%rax), %r12
  1033. movq -16(%rax), %rbp
  1034. movq -8(%rax), %rbx
  1035. leaq (%rax), %rsp
  1036. .Lmul_gather4_epilogue:
  1037. ret
  1038. .size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
  1039. ___
  1040. }
  1041. {
  1042. my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
  1043. $code.=<<___;
  1044. .globl rsaz_512_mul_scatter4
  1045. .type rsaz_512_mul_scatter4,\@function,6
  1046. .align 32
  1047. rsaz_512_mul_scatter4:
  1048. push %rbx
  1049. push %rbp
  1050. push %r12
  1051. push %r13
  1052. push %r14
  1053. push %r15
  1054. mov $pwr, $pwr
  1055. subq \$128+24, %rsp
  1056. .Lmul_scatter4_body:
  1057. leaq ($tbl,$pwr,4), $tbl
  1058. movq $out, %xmm0 # off-load arguments
  1059. movq $mod, %xmm1
  1060. movq $tbl, %xmm2
  1061. movq $n0, 128(%rsp)
  1062. movq $out, %rbp
  1063. ___
  1064. $code.=<<___ if ($addx);
  1065. movl \$0x80100,%r11d
  1066. andl OPENSSL_ia32cap_P+8(%rip),%r11d
  1067. cmpl \$0x80100,%r11d # check for MULX and ADO/CX
  1068. je .Lmulx_scatter
  1069. ___
  1070. $code.=<<___;
  1071. movq ($out),%rbx # pass b[0]
  1072. call __rsaz_512_mul
  1073. movq %xmm0, $out
  1074. movq %xmm1, %rbp
  1075. movq (%rsp), %r8
  1076. movq 8(%rsp), %r9
  1077. movq 16(%rsp), %r10
  1078. movq 24(%rsp), %r11
  1079. movq 32(%rsp), %r12
  1080. movq 40(%rsp), %r13
  1081. movq 48(%rsp), %r14
  1082. movq 56(%rsp), %r15
  1083. call __rsaz_512_reduce
  1084. ___
  1085. $code.=<<___ if ($addx);
  1086. jmp .Lmul_scatter_tail
  1087. .align 32
  1088. .Lmulx_scatter:
  1089. movq ($out), %rdx # pass b[0]
  1090. call __rsaz_512_mulx
  1091. movq %xmm0, $out
  1092. movq %xmm1, %rbp
  1093. movq 128(%rsp), %rdx # pull $n0
  1094. movq (%rsp), %r8
  1095. movq 8(%rsp), %r9
  1096. movq 16(%rsp), %r10
  1097. movq 24(%rsp), %r11
  1098. movq 32(%rsp), %r12
  1099. movq 40(%rsp), %r13
  1100. movq 48(%rsp), %r14
  1101. movq 56(%rsp), %r15
  1102. call __rsaz_512_reducex
  1103. .Lmul_scatter_tail:
  1104. ___
  1105. $code.=<<___;
  1106. addq 64(%rsp), %r8
  1107. adcq 72(%rsp), %r9
  1108. adcq 80(%rsp), %r10
  1109. adcq 88(%rsp), %r11
  1110. adcq 96(%rsp), %r12
  1111. adcq 104(%rsp), %r13
  1112. adcq 112(%rsp), %r14
  1113. adcq 120(%rsp), %r15
  1114. movq %xmm2, $inp
  1115. sbbq %rcx, %rcx
  1116. call __rsaz_512_subtract
  1117. movl %r8d, 64*0($inp) # scatter
  1118. shrq \$32, %r8
  1119. movl %r9d, 64*2($inp)
  1120. shrq \$32, %r9
  1121. movl %r10d, 64*4($inp)
  1122. shrq \$32, %r10
  1123. movl %r11d, 64*6($inp)
  1124. shrq \$32, %r11
  1125. movl %r12d, 64*8($inp)
  1126. shrq \$32, %r12
  1127. movl %r13d, 64*10($inp)
  1128. shrq \$32, %r13
  1129. movl %r14d, 64*12($inp)
  1130. shrq \$32, %r14
  1131. movl %r15d, 64*14($inp)
  1132. shrq \$32, %r15
  1133. movl %r8d, 64*1($inp)
  1134. movl %r9d, 64*3($inp)
  1135. movl %r10d, 64*5($inp)
  1136. movl %r11d, 64*7($inp)
  1137. movl %r12d, 64*9($inp)
  1138. movl %r13d, 64*11($inp)
  1139. movl %r14d, 64*13($inp)
  1140. movl %r15d, 64*15($inp)
  1141. leaq 128+24+48(%rsp), %rax
  1142. movq -48(%rax), %r15
  1143. movq -40(%rax), %r14
  1144. movq -32(%rax), %r13
  1145. movq -24(%rax), %r12
  1146. movq -16(%rax), %rbp
  1147. movq -8(%rax), %rbx
  1148. leaq (%rax), %rsp
  1149. .Lmul_scatter4_epilogue:
  1150. ret
  1151. .size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
  1152. ___
  1153. }
  1154. {
  1155. my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
  1156. $code.=<<___;
  1157. .globl rsaz_512_mul_by_one
  1158. .type rsaz_512_mul_by_one,\@function,4
  1159. .align 32
  1160. rsaz_512_mul_by_one:
  1161. push %rbx
  1162. push %rbp
  1163. push %r12
  1164. push %r13
  1165. push %r14
  1166. push %r15
  1167. subq \$128+24, %rsp
  1168. .Lmul_by_one_body:
  1169. ___
  1170. $code.=<<___ if ($addx);
  1171. movl OPENSSL_ia32cap_P+8(%rip),%eax
  1172. ___
  1173. $code.=<<___;
  1174. movq $mod, %rbp # reassign argument
  1175. movq $n0, 128(%rsp)
  1176. movq ($inp), %r8
  1177. pxor %xmm0, %xmm0
  1178. movq 8($inp), %r9
  1179. movq 16($inp), %r10
  1180. movq 24($inp), %r11
  1181. movq 32($inp), %r12
  1182. movq 40($inp), %r13
  1183. movq 48($inp), %r14
  1184. movq 56($inp), %r15
  1185. movdqa %xmm0, (%rsp)
  1186. movdqa %xmm0, 16(%rsp)
  1187. movdqa %xmm0, 32(%rsp)
  1188. movdqa %xmm0, 48(%rsp)
  1189. movdqa %xmm0, 64(%rsp)
  1190. movdqa %xmm0, 80(%rsp)
  1191. movdqa %xmm0, 96(%rsp)
  1192. ___
  1193. $code.=<<___ if ($addx);
  1194. andl \$0x80100,%eax
  1195. cmpl \$0x80100,%eax # check for MULX and ADO/CX
  1196. je .Lby_one_callx
  1197. ___
  1198. $code.=<<___;
  1199. call __rsaz_512_reduce
  1200. ___
  1201. $code.=<<___ if ($addx);
  1202. jmp .Lby_one_tail
  1203. .align 32
  1204. .Lby_one_callx:
  1205. movq 128(%rsp), %rdx # pull $n0
  1206. call __rsaz_512_reducex
  1207. .Lby_one_tail:
  1208. ___
  1209. $code.=<<___;
  1210. movq %r8, ($out)
  1211. movq %r9, 8($out)
  1212. movq %r10, 16($out)
  1213. movq %r11, 24($out)
  1214. movq %r12, 32($out)
  1215. movq %r13, 40($out)
  1216. movq %r14, 48($out)
  1217. movq %r15, 56($out)
  1218. leaq 128+24+48(%rsp), %rax
  1219. movq -48(%rax), %r15
  1220. movq -40(%rax), %r14
  1221. movq -32(%rax), %r13
  1222. movq -24(%rax), %r12
  1223. movq -16(%rax), %rbp
  1224. movq -8(%rax), %rbx
  1225. leaq (%rax), %rsp
  1226. .Lmul_by_one_epilogue:
  1227. ret
  1228. .size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
  1229. ___
  1230. }
  1231. { # __rsaz_512_reduce
  1232. #
  1233. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1234. # output: %r8-%r15
  1235. # clobbers: everything except %rbp and %rdi
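#
# Sketch of the reduction performed below (word-level Montgomery reduction,
# cf. [1]):
#
#	for (i = 0; i < 8; i++) {
#		m = a[0] * n0 mod 2^64;		# n0 = -N^{-1} mod 2^64
#		a = (a + m*N) >> 64;		# lowest limb cancels exactly
#	}
#
# The caller then adds the upper half of the 1024-bit product and finishes
# with __rsaz_512_subtract. Note that 128+8(%rsp) inside this routine is the
# caller's 128(%rsp), because the call pushed a return address.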
  1236. $code.=<<___;
  1237. .type __rsaz_512_reduce,\@abi-omnipotent
  1238. .align 32
  1239. __rsaz_512_reduce:
  1240. movq %r8, %rbx
  1241. imulq 128+8(%rsp), %rbx
  1242. movq 0(%rbp), %rax
  1243. movl \$8, %ecx
  1244. jmp .Lreduction_loop
  1245. .align 32
  1246. .Lreduction_loop:
  1247. mulq %rbx
  1248. movq 8(%rbp), %rax
  1249. negq %r8
  1250. movq %rdx, %r8
  1251. adcq \$0, %r8
  1252. mulq %rbx
  1253. addq %rax, %r9
  1254. movq 16(%rbp), %rax
  1255. adcq \$0, %rdx
  1256. addq %r9, %r8
  1257. movq %rdx, %r9
  1258. adcq \$0, %r9
  1259. mulq %rbx
  1260. addq %rax, %r10
  1261. movq 24(%rbp), %rax
  1262. adcq \$0, %rdx
  1263. addq %r10, %r9
  1264. movq %rdx, %r10
  1265. adcq \$0, %r10
  1266. mulq %rbx
  1267. addq %rax, %r11
  1268. movq 32(%rbp), %rax
  1269. adcq \$0, %rdx
  1270. addq %r11, %r10
  1271. movq 128+8(%rsp), %rsi
  1272. #movq %rdx, %r11
  1273. #adcq \$0, %r11
  1274. adcq \$0, %rdx
  1275. movq %rdx, %r11
  1276. mulq %rbx
  1277. addq %rax, %r12
  1278. movq 40(%rbp), %rax
  1279. adcq \$0, %rdx
  1280. imulq %r8, %rsi
  1281. addq %r12, %r11
  1282. movq %rdx, %r12
  1283. adcq \$0, %r12
  1284. mulq %rbx
  1285. addq %rax, %r13
  1286. movq 48(%rbp), %rax
  1287. adcq \$0, %rdx
  1288. addq %r13, %r12
  1289. movq %rdx, %r13
  1290. adcq \$0, %r13
  1291. mulq %rbx
  1292. addq %rax, %r14
  1293. movq 56(%rbp), %rax
  1294. adcq \$0, %rdx
  1295. addq %r14, %r13
  1296. movq %rdx, %r14
  1297. adcq \$0, %r14
  1298. mulq %rbx
  1299. movq %rsi, %rbx
  1300. addq %rax, %r15
  1301. movq 0(%rbp), %rax
  1302. adcq \$0, %rdx
  1303. addq %r15, %r14
  1304. movq %rdx, %r15
  1305. adcq \$0, %r15
  1306. decl %ecx
  1307. jne .Lreduction_loop
  1308. ret
  1309. .size __rsaz_512_reduce,.-__rsaz_512_reduce
  1310. ___
  1311. }
  1312. if ($addx) {
  1313. # __rsaz_512_reducex
  1314. #
  1315. # input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
  1316. # output: %r8-%r15
  1317. # clobbers: everything except %rbp and %rdi
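#
# Same word-level Montgomery reduction as __rsaz_512_reduce, but expressed
# with MULX/ADCX/ADOX so the two carry chains (CF and OF) run independently;
# the caller preloads n0 into %rdx before the call.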
  1318. $code.=<<___;
  1319. .type __rsaz_512_reducex,\@abi-omnipotent
  1320. .align 32
  1321. __rsaz_512_reducex:
  1322. #movq 128+8(%rsp), %rdx # pull $n0
  1323. imulq %r8, %rdx
  1324. xorq %rsi, %rsi # cf=0,of=0
  1325. movl \$8, %ecx
  1326. jmp .Lreduction_loopx
  1327. .align 32
  1328. .Lreduction_loopx:
  1329. mov %r8, %rbx
  1330. mulx 0(%rbp), %rax, %r8
  1331. adcx %rbx, %rax
  1332. adox %r9, %r8
  1333. mulx 8(%rbp), %rax, %r9
  1334. adcx %rax, %r8
  1335. adox %r10, %r9
  1336. mulx 16(%rbp), %rbx, %r10
  1337. adcx %rbx, %r9
  1338. adox %r11, %r10
  1339. mulx 24(%rbp), %rbx, %r11
  1340. adcx %rbx, %r10
  1341. adox %r12, %r11
  1342. .byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
  1343. mov %rdx, %rax
  1344. mov %r8, %rdx
  1345. adcx %rbx, %r11
  1346. adox %r13, %r12
  1347. mulx 128+8(%rsp), %rbx, %rdx
  1348. mov %rax, %rdx
  1349. mulx 40(%rbp), %rax, %r13
  1350. adcx %rax, %r12
  1351. adox %r14, %r13
  1352. .byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
  1353. adcx %rax, %r13
  1354. adox %r15, %r14
  1355. mulx 56(%rbp), %rax, %r15
  1356. mov %rbx, %rdx
  1357. adcx %rax, %r14
  1358. adox %rsi, %r15 # %rsi is 0
  1359. adcx %rsi, %r15 # cf=0
  1360. decl %ecx # of=0
  1361. jne .Lreduction_loopx
  1362. ret
  1363. .size __rsaz_512_reducex,.-__rsaz_512_reducex
  1364. ___
  1365. }
  1366. { # __rsaz_512_subtract
  1367. # input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
  1368. # output:
  1369. # clobbers: everything but %rdi, %rsi and %rbp
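#
# Branch-free conditional subtraction: %r8-%r15 are stored to $out, then
# $out + ((-$mod) & mask) is written back, where the mask in %rcx is either
# all-zeros (keep the value) or all-ones (subtract the modulus). Because the
# moduli handled here are odd, -$mod can be formed limb-wise as neg(mod[0]),
# not(mod[1]), ..., not(mod[7]) with no carry propagation.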
  1370. $code.=<<___;
  1371. .type __rsaz_512_subtract,\@abi-omnipotent
  1372. .align 32
  1373. __rsaz_512_subtract:
  1374. movq %r8, ($out)
  1375. movq %r9, 8($out)
  1376. movq %r10, 16($out)
  1377. movq %r11, 24($out)
  1378. movq %r12, 32($out)
  1379. movq %r13, 40($out)
  1380. movq %r14, 48($out)
  1381. movq %r15, 56($out)
  1382. movq 0($mod), %r8
  1383. movq 8($mod), %r9
  1384. negq %r8
  1385. notq %r9
  1386. andq %rcx, %r8
  1387. movq 16($mod), %r10
  1388. andq %rcx, %r9
  1389. notq %r10
  1390. movq 24($mod), %r11
  1391. andq %rcx, %r10
  1392. notq %r11
  1393. movq 32($mod), %r12
  1394. andq %rcx, %r11
  1395. notq %r12
  1396. movq 40($mod), %r13
  1397. andq %rcx, %r12
  1398. notq %r13
  1399. movq 48($mod), %r14
  1400. andq %rcx, %r13
  1401. notq %r14
  1402. movq 56($mod), %r15
  1403. andq %rcx, %r14
  1404. notq %r15
  1405. andq %rcx, %r15
  1406. addq ($out), %r8
  1407. adcq 8($out), %r9
  1408. adcq 16($out), %r10
  1409. adcq 24($out), %r11
  1410. adcq 32($out), %r12
  1411. adcq 40($out), %r13
  1412. adcq 48($out), %r14
  1413. adcq 56($out), %r15
  1414. movq %r8, ($out)
  1415. movq %r9, 8($out)
  1416. movq %r10, 16($out)
  1417. movq %r11, 24($out)
  1418. movq %r12, 32($out)
  1419. movq %r13, 40($out)
  1420. movq %r14, 48($out)
  1421. movq %r15, 56($out)
  1422. ret
  1423. .size __rsaz_512_subtract,.-__rsaz_512_subtract
  1424. ___
  1425. }
  1426. { # __rsaz_512_mul
  1427. #
  1428. # input: %rsi - ap, %rbp - bp
  1429. # output:
  1430. # clobbers: everything
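#
# Schoolbook 8x8-limb multiplication: the first pass (before .Loop_mul)
# accumulates a[0..7]*b[0] and writes the lowest limb through %rdi into the
# scratch area at 8(%rsp); .Loop_mul then folds in b[1]..b[7], one limb per
# iteration, leaving the full 1024-bit product in the scratch area for the
# caller to reduce.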
  1431. my ($ap,$bp) = ("%rsi","%rbp");
  1432. $code.=<<___;
  1433. .type __rsaz_512_mul,\@abi-omnipotent
  1434. .align 32
  1435. __rsaz_512_mul:
  1436. leaq 8(%rsp), %rdi
  1437. movq ($ap), %rax
  1438. mulq %rbx
  1439. movq %rax, (%rdi)
  1440. movq 8($ap), %rax
  1441. movq %rdx, %r8
  1442. mulq %rbx
  1443. addq %rax, %r8
  1444. movq 16($ap), %rax
  1445. movq %rdx, %r9
  1446. adcq \$0, %r9
  1447. mulq %rbx
  1448. addq %rax, %r9
  1449. movq 24($ap), %rax
  1450. movq %rdx, %r10
  1451. adcq \$0, %r10
  1452. mulq %rbx
  1453. addq %rax, %r10
  1454. movq 32($ap), %rax
  1455. movq %rdx, %r11
  1456. adcq \$0, %r11
  1457. mulq %rbx
  1458. addq %rax, %r11
  1459. movq 40($ap), %rax
  1460. movq %rdx, %r12
  1461. adcq \$0, %r12
  1462. mulq %rbx
  1463. addq %rax, %r12
  1464. movq 48($ap), %rax
  1465. movq %rdx, %r13
  1466. adcq \$0, %r13
  1467. mulq %rbx
  1468. addq %rax, %r13
  1469. movq 56($ap), %rax
  1470. movq %rdx, %r14
  1471. adcq \$0, %r14
  1472. mulq %rbx
  1473. addq %rax, %r14
  1474. movq ($ap), %rax
  1475. movq %rdx, %r15
  1476. adcq \$0, %r15
  1477. leaq 8($bp), $bp
  1478. leaq 8(%rdi), %rdi
  1479. movl \$7, %ecx
  1480. jmp .Loop_mul
  1481. .align 32
  1482. .Loop_mul:
  1483. movq ($bp), %rbx
  1484. mulq %rbx
  1485. addq %rax, %r8
  1486. movq 8($ap), %rax
  1487. movq %r8, (%rdi)
  1488. movq %rdx, %r8
  1489. adcq \$0, %r8
  1490. mulq %rbx
  1491. addq %rax, %r9
  1492. movq 16($ap), %rax
  1493. adcq \$0, %rdx
  1494. addq %r9, %r8
  1495. movq %rdx, %r9
  1496. adcq \$0, %r9
  1497. mulq %rbx
  1498. addq %rax, %r10
  1499. movq 24($ap), %rax
  1500. adcq \$0, %rdx
  1501. addq %r10, %r9
  1502. movq %rdx, %r10
  1503. adcq \$0, %r10
  1504. mulq %rbx
  1505. addq %rax, %r11
  1506. movq 32($ap), %rax
  1507. adcq \$0, %rdx
  1508. addq %r11, %r10
  1509. movq %rdx, %r11
  1510. adcq \$0, %r11
  1511. mulq %rbx
  1512. addq %rax, %r12
  1513. movq 40($ap), %rax
  1514. adcq \$0, %rdx
  1515. addq %r12, %r11
  1516. movq %rdx, %r12
  1517. adcq \$0, %r12
  1518. mulq %rbx
  1519. addq %rax, %r13
  1520. movq 48($ap), %rax
  1521. adcq \$0, %rdx
  1522. addq %r13, %r12
  1523. movq %rdx, %r13
  1524. adcq \$0, %r13
  1525. mulq %rbx
  1526. addq %rax, %r14
  1527. movq 56($ap), %rax
  1528. adcq \$0, %rdx
  1529. addq %r14, %r13
  1530. movq %rdx, %r14
  1531. leaq 8($bp), $bp
  1532. adcq \$0, %r14
  1533. mulq %rbx
  1534. addq %rax, %r15
  1535. movq ($ap), %rax
  1536. adcq \$0, %rdx
  1537. addq %r15, %r14
  1538. movq %rdx, %r15
  1539. adcq \$0, %r15
  1540. leaq 8(%rdi), %rdi
  1541. decl %ecx
  1542. jnz .Loop_mul
  1543. movq %r8, (%rdi)
  1544. movq %r9, 8(%rdi)
  1545. movq %r10, 16(%rdi)
  1546. movq %r11, 24(%rdi)
  1547. movq %r12, 32(%rdi)
  1548. movq %r13, 40(%rdi)
  1549. movq %r14, 48(%rdi)
  1550. movq %r15, 56(%rdi)
  1551. ret
  1552. .size __rsaz_512_mul,.-__rsaz_512_mul
  1553. ___
  1554. }
  1555. if ($addx) {
  1556. # __rsaz_512_mulx
  1557. #
  1558. # input: %rsi - ap, %rbp - bp
  1559. # output:
  1560. # clobbers: everything
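#
# MULX/ADCX/ADOX counterpart of __rsaz_512_mul: each b[j] sits in %rdx (the
# implicit mulx operand); the low halves of a[i]*b[j] are added over the CF
# chain (adcx) while the previous accumulator limbs fold into the new high
# halves over the independent OF chain (adox). The negative counter in %rcx
# indexes both b[] and the output slots in the scratch area.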
  1561. my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
  1562. $code.=<<___;
  1563. .type __rsaz_512_mulx,\@abi-omnipotent
  1564. .align 32
  1565. __rsaz_512_mulx:
  1566. mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
  1567. mov \$-6, %rcx
  1568. mulx 8($ap), %rax, %r9
  1569. movq %rbx, 8(%rsp)
  1570. mulx 16($ap), %rbx, %r10
  1571. adc %rax, %r8
  1572. mulx 24($ap), %rax, %r11
  1573. adc %rbx, %r9
  1574. mulx 32($ap), %rbx, %r12
  1575. adc %rax, %r10
  1576. mulx 40($ap), %rax, %r13
  1577. adc %rbx, %r11
  1578. mulx 48($ap), %rbx, %r14
  1579. adc %rax, %r12
  1580. mulx 56($ap), %rax, %r15
  1581. mov 8($bp), %rdx
  1582. adc %rbx, %r13
  1583. adc %rax, %r14
  1584. adc \$0, %r15
  1585. xor $zero, $zero # cf=0,of=0
  1586. jmp .Loop_mulx
  1587. .align 32
  1588. .Loop_mulx:
  1589. movq %r8, %rbx
  1590. mulx ($ap), %rax, %r8
  1591. adcx %rax, %rbx
  1592. adox %r9, %r8
  1593. mulx 8($ap), %rax, %r9
  1594. adcx %rax, %r8
  1595. adox %r10, %r9
  1596. mulx 16($ap), %rax, %r10
  1597. adcx %rax, %r9
  1598. adox %r11, %r10
  1599. mulx 24($ap), %rax, %r11
  1600. adcx %rax, %r10
  1601. adox %r12, %r11
  1602. .byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
  1603. adcx %rax, %r11
  1604. adox %r13, %r12
  1605. mulx 40($ap), %rax, %r13
  1606. adcx %rax, %r12
  1607. adox %r14, %r13
  1608. mulx 48($ap), %rax, %r14
  1609. adcx %rax, %r13
  1610. adox %r15, %r14
  1611. mulx 56($ap), %rax, %r15
  1612. movq 64($bp,%rcx,8), %rdx
  1613. movq %rbx, 8+64-8(%rsp,%rcx,8)
  1614. adcx %rax, %r14
  1615. adox $zero, %r15
  1616. adcx $zero, %r15 # cf=0
  1617. inc %rcx # of=0
  1618. jnz .Loop_mulx
  1619. movq %r8, %rbx
  1620. mulx ($ap), %rax, %r8
  1621. adcx %rax, %rbx
  1622. adox %r9, %r8
  1623. .byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
  1624. adcx %rax, %r8
  1625. adox %r10, %r9
  1626. .byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
  1627. adcx %rax, %r9
  1628. adox %r11, %r10
  1629. mulx 24($ap), %rax, %r11
  1630. adcx %rax, %r10
  1631. adox %r12, %r11
  1632. mulx 32($ap), %rax, %r12
  1633. adcx %rax, %r11
  1634. adox %r13, %r12
  1635. mulx 40($ap), %rax, %r13
  1636. adcx %rax, %r12
  1637. adox %r14, %r13
  1638. .byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
  1639. adcx %rax, %r13
  1640. adox %r15, %r14
  1641. .byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
  1642. adcx %rax, %r14
  1643. adox $zero, %r15
  1644. adcx $zero, %r15
  1645. mov %rbx, 8+64-8(%rsp)
  1646. mov %r8, 8+64(%rsp)
  1647. mov %r9, 8+64+8(%rsp)
  1648. mov %r10, 8+64+16(%rsp)
  1649. mov %r11, 8+64+24(%rsp)
  1650. mov %r12, 8+64+32(%rsp)
  1651. mov %r13, 8+64+40(%rsp)
  1652. mov %r14, 8+64+48(%rsp)
  1653. mov %r15, 8+64+56(%rsp)
  1654. ret
  1655. .size __rsaz_512_mulx,.-__rsaz_512_mulx
  1656. ___
  1657. }
  1658. {
  1659. my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
  1660. $code.=<<___;
  1661. .globl rsaz_512_scatter4
  1662. .type rsaz_512_scatter4,\@abi-omnipotent
  1663. .align 16
  1664. rsaz_512_scatter4:
  1665. leaq ($out,$power,4), $out
  1666. movl \$8, %r9d
  1667. jmp .Loop_scatter
  1668. .align 16
  1669. .Loop_scatter:
  1670. movq ($inp), %rax
  1671. leaq 8($inp), $inp
  1672. movl %eax, ($out)
  1673. shrq \$32, %rax
  1674. movl %eax, 64($out)
  1675. leaq 128($out), $out
  1676. decl %r9d
  1677. jnz .Loop_scatter
  1678. ret
  1679. .size rsaz_512_scatter4,.-rsaz_512_scatter4
  1680. .globl rsaz_512_gather4
  1681. .type rsaz_512_gather4,\@abi-omnipotent
  1682. .align 16
  1683. rsaz_512_gather4:
  1684. leaq ($inp,$power,4), $inp
  1685. movl \$8, %r9d
  1686. jmp .Loop_gather
  1687. .align 16
  1688. .Loop_gather:
  1689. movl ($inp), %eax
  1690. movl 64($inp), %r8d
  1691. leaq 128($inp), $inp
  1692. shlq \$32, %r8
  1693. or %r8, %rax
  1694. movq %rax, ($out)
  1695. leaq 8($out), $out
  1696. decl %r9d
  1697. jnz .Loop_gather
  1698. ret
  1699. .size rsaz_512_gather4,.-rsaz_512_gather4
  1700. ___
  1701. }
  1702. # EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
  1703. # CONTEXT *context,DISPATCHER_CONTEXT *disp)
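#
# se_handler: when the faulting RIP lies between the .L*_body and
# .L*_epilogue labels recorded in HandlerData[], the saved non-volatile
# registers are recovered from the 128+24+48-byte frame and written back
# into the CONTEXT; RtlVirtualUnwind then continues the unwind and
# ExceptionContinueSearch is returned.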
  1704. if ($win64) {
  1705. $rec="%rcx";
  1706. $frame="%rdx";
  1707. $context="%r8";
  1708. $disp="%r9";
  1709. $code.=<<___;
  1710. .extern __imp_RtlVirtualUnwind
  1711. .type se_handler,\@abi-omnipotent
  1712. .align 16
  1713. se_handler:
  1714. push %rsi
  1715. push %rdi
  1716. push %rbx
  1717. push %rbp
  1718. push %r12
  1719. push %r13
  1720. push %r14
  1721. push %r15
  1722. pushfq
  1723. sub \$64,%rsp
  1724. mov 120($context),%rax # pull context->Rax
  1725. mov 248($context),%rbx # pull context->Rip
  1726. mov 8($disp),%rsi # disp->ImageBase
  1727. mov 56($disp),%r11 # disp->HandlerData
  1728. mov 0(%r11),%r10d # HandlerData[0]
  1729. lea (%rsi,%r10),%r10 # end of prologue label
  1730. cmp %r10,%rbx # context->Rip<end of prologue label
  1731. jb .Lcommon_seh_tail
  1732. mov 152($context),%rax # pull context->Rsp
  1733. mov 4(%r11),%r10d # HandlerData[1]
  1734. lea (%rsi,%r10),%r10 # epilogue label
  1735. cmp %r10,%rbx # context->Rip>=epilogue label
  1736. jae .Lcommon_seh_tail
  1737. lea 128+24+48(%rax),%rax
  1738. mov -8(%rax),%rbx
  1739. mov -16(%rax),%rbp
  1740. mov -24(%rax),%r12
  1741. mov -32(%rax),%r13
  1742. mov -40(%rax),%r14
  1743. mov -48(%rax),%r15
  1744. mov %rbx,144($context) # restore context->Rbx
  1745. mov %rbp,160($context) # restore context->Rbp
  1746. mov %r12,216($context) # restore context->R12
  1747. mov %r13,224($context) # restore context->R13
  1748. mov %r14,232($context) # restore context->R14
  1749. mov %r15,240($context) # restore context->R15
  1750. .Lcommon_seh_tail:
  1751. mov 8(%rax),%rdi
  1752. mov 16(%rax),%rsi
  1753. mov %rax,152($context) # restore context->Rsp
  1754. mov %rsi,168($context) # restore context->Rsi
  1755. mov %rdi,176($context) # restore context->Rdi
  1756. mov 40($disp),%rdi # disp->ContextRecord
  1757. mov $context,%rsi # context
  1758. mov \$154,%ecx # sizeof(CONTEXT)
  1759. .long 0xa548f3fc # cld; rep movsq
  1760. mov $disp,%rsi
  1761. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1762. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1763. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1764. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1765. mov 40(%rsi),%r10 # disp->ContextRecord
  1766. lea 56(%rsi),%r11 # &disp->HandlerData
  1767. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1768. mov %r10,32(%rsp) # arg5
  1769. mov %r11,40(%rsp) # arg6
  1770. mov %r12,48(%rsp) # arg7
  1771. mov %rcx,56(%rsp) # arg8, (NULL)
  1772. call *__imp_RtlVirtualUnwind(%rip)
  1773. mov \$1,%eax # ExceptionContinueSearch
  1774. add \$64,%rsp
  1775. popfq
  1776. pop %r15
  1777. pop %r14
  1778. pop %r13
  1779. pop %r12
  1780. pop %rbp
  1781. pop %rbx
  1782. pop %rdi
  1783. pop %rsi
  1784. ret
  1785. .size se_handler,.-se_handler
  1786. .section .pdata
  1787. .align 4
  1788. .rva .LSEH_begin_rsaz_512_sqr
  1789. .rva .LSEH_end_rsaz_512_sqr
  1790. .rva .LSEH_info_rsaz_512_sqr
  1791. .rva .LSEH_begin_rsaz_512_mul
  1792. .rva .LSEH_end_rsaz_512_mul
  1793. .rva .LSEH_info_rsaz_512_mul
  1794. .rva .LSEH_begin_rsaz_512_mul_gather4
  1795. .rva .LSEH_end_rsaz_512_mul_gather4
  1796. .rva .LSEH_info_rsaz_512_mul_gather4
  1797. .rva .LSEH_begin_rsaz_512_mul_scatter4
  1798. .rva .LSEH_end_rsaz_512_mul_scatter4
  1799. .rva .LSEH_info_rsaz_512_mul_scatter4
  1800. .rva .LSEH_begin_rsaz_512_mul_by_one
  1801. .rva .LSEH_end_rsaz_512_mul_by_one
  1802. .rva .LSEH_info_rsaz_512_mul_by_one
  1803. .section .xdata
  1804. .align 8
  1805. .LSEH_info_rsaz_512_sqr:
  1806. .byte 9,0,0,0
  1807. .rva se_handler
  1808. .rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
  1809. .LSEH_info_rsaz_512_mul:
  1810. .byte 9,0,0,0
  1811. .rva se_handler
  1812. .rva .Lmul_body,.Lmul_epilogue # HandlerData[]
  1813. .LSEH_info_rsaz_512_mul_gather4:
  1814. .byte 9,0,0,0
  1815. .rva se_handler
  1816. .rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
  1817. .LSEH_info_rsaz_512_mul_scatter4:
  1818. .byte 9,0,0,0
  1819. .rva se_handler
  1820. .rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
  1821. .LSEH_info_rsaz_512_mul_by_one:
  1822. .byte 9,0,0,0
  1823. .rva se_handler
  1824. .rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
  1825. ___
  1826. }
  1827. $code =~ s/\`([^\`]*)\`/eval $1/gem;
  1828. print $code;
  1829. close STDOUT;