Ви не можете вибрати більше 25 тем Теми мають розпочинатися з літери або цифри, можуть містити дефіси (-) і не повинні перевищувати 35 символів.
 
 
 
 
 
 

1895 рядки
51 KiB

  1. #!/usr/bin/env perl
  2. ##############################################################################
  3. # #
  4. # Copyright (c) 2012, Intel Corporation #
  5. # #
  6. # All rights reserved. #
  7. # #
  8. # Redistribution and use in source and binary forms, with or without #
  9. # modification, are permitted provided that the following conditions are #
  10. # met: #
  11. # #
  12. # * Redistributions of source code must retain the above copyright #
  13. # notice, this list of conditions and the following disclaimer. #
  14. # #
  15. # * Redistributions in binary form must reproduce the above copyright #
  16. # notice, this list of conditions and the following disclaimer in the #
  17. # documentation and/or other materials provided with the #
  18. # distribution. #
  19. # #
  20. # * Neither the name of the Intel Corporation nor the names of its #
  21. # contributors may be used to endorse or promote products derived from #
  22. # this software without specific prior written permission. #
  23. # #
  24. # #
  25. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY #
  26. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
  27. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
  28. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
  29. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
  30. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
  31. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
  32. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
  33. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
  34. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
  35. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
  36. # #
  37. ##############################################################################
  38. # Developers and authors: #
  39. # Shay Gueron (1, 2), and Vlad Krasnov (1) #
  40. # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
  41. # (2) University of Haifa, Israel #
  42. ##############################################################################
  43. # Reference: #
  44. # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
  45. # Exponentiation, Using Advanced Vector Instructions Architectures", #
  46. # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
  47. # pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
  48. # [2] S. Gueron: "Efficient Software Implementations of Modular #
  49. # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
  50. # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
  51. # Proceedings of 9th International Conference on Information Technology: #
  52. # New Generations (ITNG 2012), pp.821-823 (2012) #
  53. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
  54. # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
  55. # on AVX2 capable x86_64 platforms", #
  56. # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
  57. ##############################################################################
  58. #
  59. # +13% improvement over original submission by <appro@openssl.org>
  60. #
  61. # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
  62. # 2.3GHz Haswell 621 765/+23% 1113/+79%
  63. #
  64. # (*) if system doesn't support AVX2, for reference purposes;
  65. $flavour = shift; # 1st argument: perlasm flavour (or the output file, see below)
  66. $output = shift; # 2nd argument: output file name
  67. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone argument containing a dot is the output file
  68. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  69. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
  70. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  71. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  72. die "can't locate x86_64-xlate.pl";
  73. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  74. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  75. $avx = ($1>=2.19) + ($1>=2.22); # 0, 1 or 2; the generator below requires $avx>1
  76. $addx = ($1>=2.23);
  77. }
  78. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  79. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  80. $avx = ($1>=2.09) + ($1>=2.10);
  81. $addx = ($1>=2.10);
  82. }
  83. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  84. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  85. $avx = ($1>=10) + ($1>=11);
  86. $addx = ($1>=11);
  87. }
  88. if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([0-9]+)\.([0-9]+)/) { # major version may have several digits (clang 10 and later)
  89. my $ver = $2 + $3/100.0; # 3.1->3.01, 3.10->3.10
  90. $avx = ($ver>=3.0) + ($ver>=3.01);
  91. $addx = ($ver>=3.03);
  92. }
  93. open OUT,"| $^X $xlate $flavour $output" or die "can't call $xlate: $!"; # pipe everything through the translator; stop if the pipe cannot be opened
  94. *STDOUT = *OUT; # from here on, output written to STDOUT goes into that pipe
  95. if ($avx>1) {{{ # generate the AVX2 code only when $avx is greater than 1
  96. { # void AMS_WW(
  97. my $rp="%rdi"; # BN_ULONG *rp,
  98. my $ap="%rsi"; # const BN_ULONG *ap,
  99. my $np="%rdx"; # const BN_ULONG *np,
  100. my $n0="%ecx"; # const BN_ULONG n0,
  101. my $rep="%r8d"; # int repeat);
  102. # The registers that hold the accumulated redundant result
  103. # The AMM works on 1024 bit operands, and redundant word size is 29
  104. # Therefore: ceil(1024/29)/4 = 9
  105. my $ACC0="%ymm0";
  106. my $ACC1="%ymm1";
  107. my $ACC2="%ymm2";
  108. my $ACC3="%ymm3";
  109. my $ACC4="%ymm4";
  110. my $ACC5="%ymm5";
  111. my $ACC6="%ymm6";
  112. my $ACC7="%ymm7";
  113. my $ACC8="%ymm8";
  114. my $ACC9="%ymm9"; # tenth register; the emitted code zeroes it with vpxor and stores it as a zero vector
  115. # Registers that hold the broadcasted words of bp, currently used
  116. my $B1="%ymm10"; # in this squaring routine the words are broadcast from $ap / $tpa
  117. my $B2="%ymm11";
  118. # Registers that hold the broadcasted words of Y, currently used
  119. my $Y1="%ymm12";
  120. my $Y2="%ymm13";
  121. # Helper registers
  122. my $TEMP1="%ymm14";
  123. my $AND_MASK="%ymm15"; # loaded from .Land_mask by the emitted code
  124. # alu registers that hold the first words of the ACC
  125. my $r0="%r9";
  126. my $r1="%r10";
  127. my $r2="%r11";
  128. my $r3="%r12";
  129. my $i="%r14d"; # loop counter
  130. my $tmp = "%r15"; # scratch, used for the page-crossing test on $np
  131. my $FrameSize=32*18+32*8; # place for A^2 and 2*A
  132. my $aap=$r0; # same register as $r0 (%r9); addresses the stored 2*A vectors
  133. my $tp0="%rbx"; # set to 192(%rsp) by the emitted code
  134. my $tp1=$r3; # same register as $r3 (%r12); set to 448(%rsp) by the emitted code
  135. my $tpa=$tmp; # same register as $tmp (%r15); initialised from $ap before the squaring loop
  136. $np="%r13"; # reassigned argument: the emitted prologue copies %rdx into %r13
  137. $code.=<<___;
  138. .text
  139. .globl rsaz_1024_sqr_avx2
  140. .type rsaz_1024_sqr_avx2,\@function,5
  141. .align 64
  142. rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
  143. lea (%rsp), %rax
  144. push %rbx
  145. push %rbp
  146. push %r12
  147. push %r13
  148. push %r14
  149. push %r15
  150. vzeroupper
  151. ___
  152. $code.=<<___ if ($win64);
  153. lea -0xa8(%rsp),%rsp
  154. vmovaps %xmm6,-0xd8(%rax)
  155. vmovaps %xmm7,-0xc8(%rax)
  156. vmovaps %xmm8,-0xb8(%rax)
  157. vmovaps %xmm9,-0xa8(%rax)
  158. vmovaps %xmm10,-0x98(%rax)
  159. vmovaps %xmm11,-0x88(%rax)
  160. vmovaps %xmm12,-0x78(%rax)
  161. vmovaps %xmm13,-0x68(%rax)
  162. vmovaps %xmm14,-0x58(%rax)
  163. vmovaps %xmm15,-0x48(%rax)
  164. .Lsqr_1024_body:
  165. ___
  166. $code.=<<___;
  167. mov %rax,%rbp
  168. mov %rdx, $np # reassigned argument
  169. sub \$$FrameSize, %rsp
  170. mov $np, $tmp
  171. sub \$-128, $rp # size optimization
  172. sub \$-128, $ap
  173. sub \$-128, $np
  174. and \$4095, $tmp # see if $np crosses page
  175. add \$32*10, $tmp
  176. shr \$12, $tmp
  177. vpxor $ACC9,$ACC9,$ACC9
  178. jz .Lsqr_1024_no_n_copy
  179. # unaligned 256-bit load that crosses page boundary can
  180. # cause >2x performance degradation here, so if $np does
  181. # cross page boundary, copy it to stack and make sure stack
  182. # frame doesn't...
  183. sub \$32*10,%rsp
  184. vmovdqu 32*0-128($np), $ACC0
  185. and \$-2048, %rsp
  186. vmovdqu 32*1-128($np), $ACC1
  187. vmovdqu 32*2-128($np), $ACC2
  188. vmovdqu 32*3-128($np), $ACC3
  189. vmovdqu 32*4-128($np), $ACC4
  190. vmovdqu 32*5-128($np), $ACC5
  191. vmovdqu 32*6-128($np), $ACC6
  192. vmovdqu 32*7-128($np), $ACC7
  193. vmovdqu 32*8-128($np), $ACC8
  194. lea $FrameSize+128(%rsp),$np
  195. vmovdqu $ACC0, 32*0-128($np)
  196. vmovdqu $ACC1, 32*1-128($np)
  197. vmovdqu $ACC2, 32*2-128($np)
  198. vmovdqu $ACC3, 32*3-128($np)
  199. vmovdqu $ACC4, 32*4-128($np)
  200. vmovdqu $ACC5, 32*5-128($np)
  201. vmovdqu $ACC6, 32*6-128($np)
  202. vmovdqu $ACC7, 32*7-128($np)
  203. vmovdqu $ACC8, 32*8-128($np)
  204. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
  205. .Lsqr_1024_no_n_copy:
  206. and \$-1024, %rsp
  207. vmovdqu 32*1-128($ap), $ACC1
  208. vmovdqu 32*2-128($ap), $ACC2
  209. vmovdqu 32*3-128($ap), $ACC3
  210. vmovdqu 32*4-128($ap), $ACC4
  211. vmovdqu 32*5-128($ap), $ACC5
  212. vmovdqu 32*6-128($ap), $ACC6
  213. vmovdqu 32*7-128($ap), $ACC7
  214. vmovdqu 32*8-128($ap), $ACC8
  215. lea 192(%rsp), $tp0 # 64+128=192
  216. vpbroadcastq .Land_mask(%rip), $AND_MASK
  217. jmp .LOOP_GRANDE_SQR_1024
  218. .align 32
  219. .LOOP_GRANDE_SQR_1024:
  220. lea 32*18+128(%rsp), $aap # size optimization
  221. lea 448(%rsp), $tp1 # 64+128+256=448
  222. # the squaring is performed as described in Variant B of
  223. # "Speeding up Big-Number Squaring", so start by calculating
  224. # the A*2=A+A vector
  225. vpaddq $ACC1, $ACC1, $ACC1
  226. vpbroadcastq 32*0-128($ap), $B1
  227. vpaddq $ACC2, $ACC2, $ACC2
  228. vmovdqa $ACC1, 32*0-128($aap)
  229. vpaddq $ACC3, $ACC3, $ACC3
  230. vmovdqa $ACC2, 32*1-128($aap)
  231. vpaddq $ACC4, $ACC4, $ACC4
  232. vmovdqa $ACC3, 32*2-128($aap)
  233. vpaddq $ACC5, $ACC5, $ACC5
  234. vmovdqa $ACC4, 32*3-128($aap)
  235. vpaddq $ACC6, $ACC6, $ACC6
  236. vmovdqa $ACC5, 32*4-128($aap)
  237. vpaddq $ACC7, $ACC7, $ACC7
  238. vmovdqa $ACC6, 32*5-128($aap)
  239. vpaddq $ACC8, $ACC8, $ACC8
  240. vmovdqa $ACC7, 32*6-128($aap)
  241. vpxor $ACC9, $ACC9, $ACC9
  242. vmovdqa $ACC8, 32*7-128($aap)
  243. vpmuludq 32*0-128($ap), $B1, $ACC0
  244. vpbroadcastq 32*1-128($ap), $B2
  245. vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
  246. vpmuludq $B1, $ACC1, $ACC1
  247. vmovdqu $ACC9, 32*10-448($tp1)
  248. vpmuludq $B1, $ACC2, $ACC2
  249. vmovdqu $ACC9, 32*11-448($tp1)
  250. vpmuludq $B1, $ACC3, $ACC3
  251. vmovdqu $ACC9, 32*12-448($tp1)
  252. vpmuludq $B1, $ACC4, $ACC4
  253. vmovdqu $ACC9, 32*13-448($tp1)
  254. vpmuludq $B1, $ACC5, $ACC5
  255. vmovdqu $ACC9, 32*14-448($tp1)
  256. vpmuludq $B1, $ACC6, $ACC6
  257. vmovdqu $ACC9, 32*15-448($tp1)
  258. vpmuludq $B1, $ACC7, $ACC7
  259. vmovdqu $ACC9, 32*16-448($tp1)
  260. vpmuludq $B1, $ACC8, $ACC8
  261. vpbroadcastq 32*2-128($ap), $B1
  262. vmovdqu $ACC9, 32*17-448($tp1)
  263. mov $ap, $tpa
  264. mov \$4, $i
  265. jmp .Lsqr_entry_1024
  266. ___
  267. $TEMP0=$Y1; # %ymm12 serves as a scratch register in the text that follows
  268. $TEMP2=$Y2; # %ymm13 serves as a scratch register in the text that follows
  269. $code.=<<___;
  270. .align 32
  271. .LOOP_SQR_1024:
  272. vpbroadcastq 32*1-128($tpa), $B2
  273. vpmuludq 32*0-128($ap), $B1, $ACC0
  274. vpaddq 32*0-192($tp0), $ACC0, $ACC0
  275. vpmuludq 32*0-128($aap), $B1, $ACC1
  276. vpaddq 32*1-192($tp0), $ACC1, $ACC1
  277. vpmuludq 32*1-128($aap), $B1, $ACC2
  278. vpaddq 32*2-192($tp0), $ACC2, $ACC2
  279. vpmuludq 32*2-128($aap), $B1, $ACC3
  280. vpaddq 32*3-192($tp0), $ACC3, $ACC3
  281. vpmuludq 32*3-128($aap), $B1, $ACC4
  282. vpaddq 32*4-192($tp0), $ACC4, $ACC4
  283. vpmuludq 32*4-128($aap), $B1, $ACC5
  284. vpaddq 32*5-192($tp0), $ACC5, $ACC5
  285. vpmuludq 32*5-128($aap), $B1, $ACC6
  286. vpaddq 32*6-192($tp0), $ACC6, $ACC6
  287. vpmuludq 32*6-128($aap), $B1, $ACC7
  288. vpaddq 32*7-192($tp0), $ACC7, $ACC7
  289. vpmuludq 32*7-128($aap), $B1, $ACC8
  290. vpbroadcastq 32*2-128($tpa), $B1
  291. vpaddq 32*8-192($tp0), $ACC8, $ACC8
  292. .Lsqr_entry_1024:
  293. vmovdqu $ACC0, 32*0-192($tp0)
  294. vmovdqu $ACC1, 32*1-192($tp0)
  295. vpmuludq 32*1-128($ap), $B2, $TEMP0
  296. vpaddq $TEMP0, $ACC2, $ACC2
  297. vpmuludq 32*1-128($aap), $B2, $TEMP1
  298. vpaddq $TEMP1, $ACC3, $ACC3
  299. vpmuludq 32*2-128($aap), $B2, $TEMP2
  300. vpaddq $TEMP2, $ACC4, $ACC4
  301. vpmuludq 32*3-128($aap), $B2, $TEMP0
  302. vpaddq $TEMP0, $ACC5, $ACC5
  303. vpmuludq 32*4-128($aap), $B2, $TEMP1
  304. vpaddq $TEMP1, $ACC6, $ACC6
  305. vpmuludq 32*5-128($aap), $B2, $TEMP2
  306. vpaddq $TEMP2, $ACC7, $ACC7
  307. vpmuludq 32*6-128($aap), $B2, $TEMP0
  308. vpaddq $TEMP0, $ACC8, $ACC8
  309. vpmuludq 32*7-128($aap), $B2, $ACC0
  310. vpbroadcastq 32*3-128($tpa), $B2
  311. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  312. vmovdqu $ACC2, 32*2-192($tp0)
  313. vmovdqu $ACC3, 32*3-192($tp0)
  314. vpmuludq 32*2-128($ap), $B1, $TEMP2
  315. vpaddq $TEMP2, $ACC4, $ACC4
  316. vpmuludq 32*2-128($aap), $B1, $TEMP0
  317. vpaddq $TEMP0, $ACC5, $ACC5
  318. vpmuludq 32*3-128($aap), $B1, $TEMP1
  319. vpaddq $TEMP1, $ACC6, $ACC6
  320. vpmuludq 32*4-128($aap), $B1, $TEMP2
  321. vpaddq $TEMP2, $ACC7, $ACC7
  322. vpmuludq 32*5-128($aap), $B1, $TEMP0
  323. vpaddq $TEMP0, $ACC8, $ACC8
  324. vpmuludq 32*6-128($aap), $B1, $TEMP1
  325. vpaddq $TEMP1, $ACC0, $ACC0
  326. vpmuludq 32*7-128($aap), $B1, $ACC1
  327. vpbroadcastq 32*4-128($tpa), $B1
  328. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  329. vmovdqu $ACC4, 32*4-192($tp0)
  330. vmovdqu $ACC5, 32*5-192($tp0)
  331. vpmuludq 32*3-128($ap), $B2, $TEMP0
  332. vpaddq $TEMP0, $ACC6, $ACC6
  333. vpmuludq 32*3-128($aap), $B2, $TEMP1
  334. vpaddq $TEMP1, $ACC7, $ACC7
  335. vpmuludq 32*4-128($aap), $B2, $TEMP2
  336. vpaddq $TEMP2, $ACC8, $ACC8
  337. vpmuludq 32*5-128($aap), $B2, $TEMP0
  338. vpaddq $TEMP0, $ACC0, $ACC0
  339. vpmuludq 32*6-128($aap), $B2, $TEMP1
  340. vpaddq $TEMP1, $ACC1, $ACC1
  341. vpmuludq 32*7-128($aap), $B2, $ACC2
  342. vpbroadcastq 32*5-128($tpa), $B2
  343. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  344. vmovdqu $ACC6, 32*6-192($tp0)
  345. vmovdqu $ACC7, 32*7-192($tp0)
  346. vpmuludq 32*4-128($ap), $B1, $TEMP0
  347. vpaddq $TEMP0, $ACC8, $ACC8
  348. vpmuludq 32*4-128($aap), $B1, $TEMP1
  349. vpaddq $TEMP1, $ACC0, $ACC0
  350. vpmuludq 32*5-128($aap), $B1, $TEMP2
  351. vpaddq $TEMP2, $ACC1, $ACC1
  352. vpmuludq 32*6-128($aap), $B1, $TEMP0
  353. vpaddq $TEMP0, $ACC2, $ACC2
  354. vpmuludq 32*7-128($aap), $B1, $ACC3
  355. vpbroadcastq 32*6-128($tpa), $B1
  356. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  357. vmovdqu $ACC8, 32*8-192($tp0)
  358. vmovdqu $ACC0, 32*9-192($tp0)
  359. lea 8($tp0), $tp0
  360. vpmuludq 32*5-128($ap), $B2, $TEMP2
  361. vpaddq $TEMP2, $ACC1, $ACC1
  362. vpmuludq 32*5-128($aap), $B2, $TEMP0
  363. vpaddq $TEMP0, $ACC2, $ACC2
  364. vpmuludq 32*6-128($aap), $B2, $TEMP1
  365. vpaddq $TEMP1, $ACC3, $ACC3
  366. vpmuludq 32*7-128($aap), $B2, $ACC4
  367. vpbroadcastq 32*7-128($tpa), $B2
  368. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  369. vmovdqu $ACC1, 32*10-448($tp1)
  370. vmovdqu $ACC2, 32*11-448($tp1)
  371. vpmuludq 32*6-128($ap), $B1, $TEMP0
  372. vpaddq $TEMP0, $ACC3, $ACC3
  373. vpmuludq 32*6-128($aap), $B1, $TEMP1
  374. vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
  375. vpaddq $TEMP1, $ACC4, $ACC4
  376. vpmuludq 32*7-128($aap), $B1, $ACC5
  377. vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
  378. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  379. vmovdqu $ACC3, 32*12-448($tp1)
  380. vmovdqu $ACC4, 32*13-448($tp1)
  381. lea 8($tpa), $tpa
  382. vpmuludq 32*7-128($ap), $B2, $TEMP0
  383. vpaddq $TEMP0, $ACC5, $ACC5
  384. vpmuludq 32*7-128($aap), $B2, $ACC6
  385. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  386. vpmuludq 32*8-128($ap), $ACC0, $ACC7
  387. vmovdqu $ACC5, 32*14-448($tp1)
  388. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  389. vmovdqu $ACC6, 32*15-448($tp1)
  390. vmovdqu $ACC7, 32*16-448($tp1)
  391. lea 8($tp1), $tp1
  392. dec $i
  393. jnz .LOOP_SQR_1024
  394. ___
  395. $ZERO = $ACC9; # %ymm9
  396. $TEMP0 = $B1; # %ymm10 (rebinds the earlier $TEMP0)
  397. $TEMP2 = $B2; # %ymm11 (rebinds the earlier $TEMP2)
  398. $TEMP3 = $Y1; # %ymm12
  399. $TEMP4 = $Y2; # %ymm13
  400. $code.=<<___;
  401. #we need to fix indexes 32-39 to avoid overflow
  402. vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
  403. vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
  404. vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
  405. lea 192(%rsp), $tp0 # 64+128=192
  406. vpsrlq \$29, $ACC8, $TEMP1
  407. vpand $AND_MASK, $ACC8, $ACC8
  408. vpsrlq \$29, $ACC1, $TEMP2
  409. vpand $AND_MASK, $ACC1, $ACC1
  410. vpermq \$0x93, $TEMP1, $TEMP1
  411. vpxor $ZERO, $ZERO, $ZERO
  412. vpermq \$0x93, $TEMP2, $TEMP2
  413. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  414. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  415. vpaddq $TEMP0, $ACC8, $ACC8
  416. vpblendd \$3, $TEMP2, $ZERO, $TEMP2
  417. vpaddq $TEMP1, $ACC1, $ACC1
  418. vpaddq $TEMP2, $ACC2, $ACC2
  419. vmovdqu $ACC1, 32*9-192($tp0)
  420. vmovdqu $ACC2, 32*10-192($tp0)
  421. mov (%rsp), %rax
  422. mov 8(%rsp), $r1
  423. mov 16(%rsp), $r2
  424. mov 24(%rsp), $r3
  425. vmovdqu 32*1(%rsp), $ACC1
  426. vmovdqu 32*2-192($tp0), $ACC2
  427. vmovdqu 32*3-192($tp0), $ACC3
  428. vmovdqu 32*4-192($tp0), $ACC4
  429. vmovdqu 32*5-192($tp0), $ACC5
  430. vmovdqu 32*6-192($tp0), $ACC6
  431. vmovdqu 32*7-192($tp0), $ACC7
  432. mov %rax, $r0
  433. imull $n0, %eax
  434. and \$0x1fffffff, %eax
  435. vmovd %eax, $Y1
  436. mov %rax, %rdx
  437. imulq -128($np), %rax
  438. vpbroadcastq $Y1, $Y1
  439. add %rax, $r0
  440. mov %rdx, %rax
  441. imulq 8-128($np), %rax
  442. shr \$29, $r0
  443. add %rax, $r1
  444. mov %rdx, %rax
  445. imulq 16-128($np), %rax
  446. add $r0, $r1
  447. add %rax, $r2
  448. imulq 24-128($np), %rdx
  449. add %rdx, $r3
  450. mov $r1, %rax
  451. imull $n0, %eax
  452. and \$0x1fffffff, %eax
  453. mov \$9, $i
  454. jmp .LOOP_REDUCE_1024
  455. .align 32
  456. .LOOP_REDUCE_1024:
  457. vmovd %eax, $Y2
  458. vpbroadcastq $Y2, $Y2
  459. vpmuludq 32*1-128($np), $Y1, $TEMP0
  460. mov %rax, %rdx
  461. imulq -128($np), %rax
  462. vpaddq $TEMP0, $ACC1, $ACC1
  463. add %rax, $r1
  464. vpmuludq 32*2-128($np), $Y1, $TEMP1
  465. mov %rdx, %rax
  466. imulq 8-128($np), %rax
  467. vpaddq $TEMP1, $ACC2, $ACC2
  468. vpmuludq 32*3-128($np), $Y1, $TEMP2
  469. .byte 0x67
  470. add %rax, $r2
  471. .byte 0x67
  472. mov %rdx, %rax
  473. imulq 16-128($np), %rax
  474. shr \$29, $r1
  475. vpaddq $TEMP2, $ACC3, $ACC3
  476. vpmuludq 32*4-128($np), $Y1, $TEMP0
  477. add %rax, $r3
  478. add $r1, $r2
  479. vpaddq $TEMP0, $ACC4, $ACC4
  480. vpmuludq 32*5-128($np), $Y1, $TEMP1
  481. mov $r2, %rax
  482. imull $n0, %eax
  483. vpaddq $TEMP1, $ACC5, $ACC5
  484. vpmuludq 32*6-128($np), $Y1, $TEMP2
  485. and \$0x1fffffff, %eax
  486. vpaddq $TEMP2, $ACC6, $ACC6
  487. vpmuludq 32*7-128($np), $Y1, $TEMP0
  488. vpaddq $TEMP0, $ACC7, $ACC7
  489. vpmuludq 32*8-128($np), $Y1, $TEMP1
  490. vmovd %eax, $Y1
  491. #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
  492. vpaddq $TEMP1, $ACC8, $ACC8
  493. #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
  494. vpbroadcastq $Y1, $Y1
  495. vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
  496. vmovdqu 32*3-8-128($np), $TEMP1
  497. mov %rax, %rdx
  498. imulq -128($np), %rax
  499. vpaddq $TEMP2, $ACC1, $ACC1
  500. vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
  501. vmovdqu 32*4-8-128($np), $TEMP2
  502. add %rax, $r2
  503. mov %rdx, %rax
  504. imulq 8-128($np), %rax
  505. vpaddq $TEMP0, $ACC2, $ACC2
  506. add $r3, %rax
  507. shr \$29, $r2
  508. vpmuludq $Y2, $TEMP1, $TEMP1
  509. vmovdqu 32*5-8-128($np), $TEMP0
  510. add $r2, %rax
  511. vpaddq $TEMP1, $ACC3, $ACC3
  512. vpmuludq $Y2, $TEMP2, $TEMP2
  513. vmovdqu 32*6-8-128($np), $TEMP1
  514. .byte 0x67
  515. mov %rax, $r3
  516. imull $n0, %eax
  517. vpaddq $TEMP2, $ACC4, $ACC4
  518. vpmuludq $Y2, $TEMP0, $TEMP0
  519. .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
  520. and \$0x1fffffff, %eax
  521. vpaddq $TEMP0, $ACC5, $ACC5
  522. vpmuludq $Y2, $TEMP1, $TEMP1
  523. vmovdqu 32*8-8-128($np), $TEMP0
  524. vpaddq $TEMP1, $ACC6, $ACC6
  525. vpmuludq $Y2, $TEMP2, $TEMP2
  526. vmovdqu 32*9-8-128($np), $ACC9
  527. vmovd %eax, $ACC0 # borrow ACC0 for Y2
  528. imulq -128($np), %rax
  529. vpaddq $TEMP2, $ACC7, $ACC7
  530. vpmuludq $Y2, $TEMP0, $TEMP0
  531. vmovdqu 32*1-16-128($np), $TEMP1
  532. vpbroadcastq $ACC0, $ACC0
  533. vpaddq $TEMP0, $ACC8, $ACC8
  534. vpmuludq $Y2, $ACC9, $ACC9
  535. vmovdqu 32*2-16-128($np), $TEMP2
  536. add %rax, $r3
  537. ___
  538. ($ACC0,$Y2)=($Y2,$ACC0); # swap the names: in the text that follows $ACC0 expands to %ymm13 and $Y2 to %ymm0
  539. $code.=<<___;
  540. vmovdqu 32*1-24-128($np), $ACC0
  541. vpmuludq $Y1, $TEMP1, $TEMP1
  542. vmovdqu 32*3-16-128($np), $TEMP0
  543. vpaddq $TEMP1, $ACC1, $ACC1
  544. vpmuludq $Y2, $ACC0, $ACC0
  545. vpmuludq $Y1, $TEMP2, $TEMP2
  546. .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
  547. vpaddq $ACC1, $ACC0, $ACC0
  548. vpaddq $TEMP2, $ACC2, $ACC2
  549. vpmuludq $Y1, $TEMP0, $TEMP0
  550. vmovdqu 32*5-16-128($np), $TEMP2
  551. .byte 0x67
  552. vmovq $ACC0, %rax
  553. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  554. vpaddq $TEMP0, $ACC3, $ACC3
  555. vpmuludq $Y1, $TEMP1, $TEMP1
  556. vmovdqu 32*6-16-128($np), $TEMP0
  557. vpaddq $TEMP1, $ACC4, $ACC4
  558. vpmuludq $Y1, $TEMP2, $TEMP2
  559. vmovdqu 32*7-16-128($np), $TEMP1
  560. vpaddq $TEMP2, $ACC5, $ACC5
  561. vpmuludq $Y1, $TEMP0, $TEMP0
  562. vmovdqu 32*8-16-128($np), $TEMP2
  563. vpaddq $TEMP0, $ACC6, $ACC6
  564. vpmuludq $Y1, $TEMP1, $TEMP1
  565. shr \$29, $r3
  566. vmovdqu 32*9-16-128($np), $TEMP0
  567. add $r3, %rax
  568. vpaddq $TEMP1, $ACC7, $ACC7
  569. vpmuludq $Y1, $TEMP2, $TEMP2
  570. #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
  571. mov %rax, $r0
  572. imull $n0, %eax
  573. vpaddq $TEMP2, $ACC8, $ACC8
  574. vpmuludq $Y1, $TEMP0, $TEMP0
  575. and \$0x1fffffff, %eax
  576. vmovd %eax, $Y1
  577. vmovdqu 32*3-24-128($np), $TEMP2
  578. .byte 0x67
  579. vpaddq $TEMP0, $ACC9, $ACC9
  580. vpbroadcastq $Y1, $Y1
  581. vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
  582. vmovdqu 32*4-24-128($np), $TEMP0
  583. mov %rax, %rdx
  584. imulq -128($np), %rax
  585. mov 8(%rsp), $r1
  586. vpaddq $TEMP1, $ACC2, $ACC1
  587. vpmuludq $Y2, $TEMP2, $TEMP2
  588. vmovdqu 32*5-24-128($np), $TEMP1
  589. add %rax, $r0
  590. mov %rdx, %rax
  591. imulq 8-128($np), %rax
  592. .byte 0x67
  593. shr \$29, $r0
  594. mov 16(%rsp), $r2
  595. vpaddq $TEMP2, $ACC3, $ACC2
  596. vpmuludq $Y2, $TEMP0, $TEMP0
  597. vmovdqu 32*6-24-128($np), $TEMP2
  598. add %rax, $r1
  599. mov %rdx, %rax
  600. imulq 16-128($np), %rax
  601. vpaddq $TEMP0, $ACC4, $ACC3
  602. vpmuludq $Y2, $TEMP1, $TEMP1
  603. vmovdqu 32*7-24-128($np), $TEMP0
  604. imulq 24-128($np), %rdx # future $r3
  605. add %rax, $r2
  606. lea ($r0,$r1), %rax
  607. vpaddq $TEMP1, $ACC5, $ACC4
  608. vpmuludq $Y2, $TEMP2, $TEMP2
  609. vmovdqu 32*8-24-128($np), $TEMP1
  610. mov %rax, $r1
  611. imull $n0, %eax
  612. vpmuludq $Y2, $TEMP0, $TEMP0
  613. vpaddq $TEMP2, $ACC6, $ACC5
  614. vmovdqu 32*9-24-128($np), $TEMP2
  615. and \$0x1fffffff, %eax
  616. vpaddq $TEMP0, $ACC7, $ACC6
  617. vpmuludq $Y2, $TEMP1, $TEMP1
  618. add 24(%rsp), %rdx
  619. vpaddq $TEMP1, $ACC8, $ACC7
  620. vpmuludq $Y2, $TEMP2, $TEMP2
  621. vpaddq $TEMP2, $ACC9, $ACC8
  622. vmovq $r3, $ACC9
  623. mov %rdx, $r3
  624. dec $i
  625. jnz .LOOP_REDUCE_1024
  626. ___
  627. ($ACC0,$Y2)=($Y2,$ACC0); # swap the names back: $ACC0 expands to %ymm0 and $Y2 to %ymm13 again
  628. $code.=<<___;
  629. lea 448(%rsp), $tp1 # size optimization
  630. vpaddq $ACC9, $Y2, $ACC0
  631. vpxor $ZERO, $ZERO, $ZERO
  632. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  633. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  634. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  635. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  636. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  637. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  638. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  639. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  640. vpaddq 32*17-448($tp1), $ACC8, $ACC8
  641. vpsrlq \$29, $ACC0, $TEMP1
  642. vpand $AND_MASK, $ACC0, $ACC0
  643. vpsrlq \$29, $ACC1, $TEMP2
  644. vpand $AND_MASK, $ACC1, $ACC1
  645. vpsrlq \$29, $ACC2, $TEMP3
  646. vpermq \$0x93, $TEMP1, $TEMP1
  647. vpand $AND_MASK, $ACC2, $ACC2
  648. vpsrlq \$29, $ACC3, $TEMP4
  649. vpermq \$0x93, $TEMP2, $TEMP2
  650. vpand $AND_MASK, $ACC3, $ACC3
  651. vpermq \$0x93, $TEMP3, $TEMP3
  652. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  653. vpermq \$0x93, $TEMP4, $TEMP4
  654. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  655. vpaddq $TEMP0, $ACC0, $ACC0
  656. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  657. vpaddq $TEMP1, $ACC1, $ACC1
  658. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  659. vpaddq $TEMP2, $ACC2, $ACC2
  660. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  661. vpaddq $TEMP3, $ACC3, $ACC3
  662. vpaddq $TEMP4, $ACC4, $ACC4
  663. vpsrlq \$29, $ACC0, $TEMP1
  664. vpand $AND_MASK, $ACC0, $ACC0
  665. vpsrlq \$29, $ACC1, $TEMP2
  666. vpand $AND_MASK, $ACC1, $ACC1
  667. vpsrlq \$29, $ACC2, $TEMP3
  668. vpermq \$0x93, $TEMP1, $TEMP1
  669. vpand $AND_MASK, $ACC2, $ACC2
  670. vpsrlq \$29, $ACC3, $TEMP4
  671. vpermq \$0x93, $TEMP2, $TEMP2
  672. vpand $AND_MASK, $ACC3, $ACC3
  673. vpermq \$0x93, $TEMP3, $TEMP3
  674. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  675. vpermq \$0x93, $TEMP4, $TEMP4
  676. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  677. vpaddq $TEMP0, $ACC0, $ACC0
  678. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  679. vpaddq $TEMP1, $ACC1, $ACC1
  680. vmovdqu $ACC0, 32*0-128($rp)
  681. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  682. vpaddq $TEMP2, $ACC2, $ACC2
  683. vmovdqu $ACC1, 32*1-128($rp)
  684. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  685. vpaddq $TEMP3, $ACC3, $ACC3
  686. vmovdqu $ACC2, 32*2-128($rp)
  687. vpaddq $TEMP4, $ACC4, $ACC4
  688. vmovdqu $ACC3, 32*3-128($rp)
  689. ___
  690. $TEMP5=$ACC0; # %ymm0 doubles as a fifth temporary in the text that follows
  691. $code.=<<___;
  692. vpsrlq \$29, $ACC4, $TEMP1
  693. vpand $AND_MASK, $ACC4, $ACC4
  694. vpsrlq \$29, $ACC5, $TEMP2
  695. vpand $AND_MASK, $ACC5, $ACC5
  696. vpsrlq \$29, $ACC6, $TEMP3
  697. vpermq \$0x93, $TEMP1, $TEMP1
  698. vpand $AND_MASK, $ACC6, $ACC6
  699. vpsrlq \$29, $ACC7, $TEMP4
  700. vpermq \$0x93, $TEMP2, $TEMP2
  701. vpand $AND_MASK, $ACC7, $ACC7
  702. vpsrlq \$29, $ACC8, $TEMP5
  703. vpermq \$0x93, $TEMP3, $TEMP3
  704. vpand $AND_MASK, $ACC8, $ACC8
  705. vpermq \$0x93, $TEMP4, $TEMP4
  706. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  707. vpermq \$0x93, $TEMP5, $TEMP5
  708. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  709. vpaddq $TEMP0, $ACC4, $ACC4
  710. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  711. vpaddq $TEMP1, $ACC5, $ACC5
  712. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  713. vpaddq $TEMP2, $ACC6, $ACC6
  714. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  715. vpaddq $TEMP3, $ACC7, $ACC7
  716. vpaddq $TEMP4, $ACC8, $ACC8
  717. vpsrlq \$29, $ACC4, $TEMP1
  718. vpand $AND_MASK, $ACC4, $ACC4
  719. vpsrlq \$29, $ACC5, $TEMP2
  720. vpand $AND_MASK, $ACC5, $ACC5
  721. vpsrlq \$29, $ACC6, $TEMP3
  722. vpermq \$0x93, $TEMP1, $TEMP1
  723. vpand $AND_MASK, $ACC6, $ACC6
  724. vpsrlq \$29, $ACC7, $TEMP4
  725. vpermq \$0x93, $TEMP2, $TEMP2
  726. vpand $AND_MASK, $ACC7, $ACC7
  727. vpsrlq \$29, $ACC8, $TEMP5
  728. vpermq \$0x93, $TEMP3, $TEMP3
  729. vpand $AND_MASK, $ACC8, $ACC8
  730. vpermq \$0x93, $TEMP4, $TEMP4
  731. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  732. vpermq \$0x93, $TEMP5, $TEMP5
  733. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  734. vpaddq $TEMP0, $ACC4, $ACC4
  735. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  736. vpaddq $TEMP1, $ACC5, $ACC5
  737. vmovdqu $ACC4, 32*4-128($rp)
  738. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  739. vpaddq $TEMP2, $ACC6, $ACC6
  740. vmovdqu $ACC5, 32*5-128($rp)
  741. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  742. vpaddq $TEMP3, $ACC7, $ACC7
  743. vmovdqu $ACC6, 32*6-128($rp)
  744. vpaddq $TEMP4, $ACC8, $ACC8
  745. vmovdqu $ACC7, 32*7-128($rp)
  746. vmovdqu $ACC8, 32*8-128($rp)
  747. mov $rp, $ap
  748. dec $rep
  749. jne .LOOP_GRANDE_SQR_1024
  750. vzeroall
  751. mov %rbp, %rax
  752. ___
  753. $code.=<<___ if ($win64);
  754. movaps -0xd8(%rax),%xmm6
  755. movaps -0xc8(%rax),%xmm7
  756. movaps -0xb8(%rax),%xmm8
  757. movaps -0xa8(%rax),%xmm9
  758. movaps -0x98(%rax),%xmm10
  759. movaps -0x88(%rax),%xmm11
  760. movaps -0x78(%rax),%xmm12
  761. movaps -0x68(%rax),%xmm13
  762. movaps -0x58(%rax),%xmm14
  763. movaps -0x48(%rax),%xmm15
  764. ___
  765. $code.=<<___;
  766. mov -48(%rax),%r15
  767. mov -40(%rax),%r14
  768. mov -32(%rax),%r13
  769. mov -24(%rax),%r12
  770. mov -16(%rax),%rbp
  771. mov -8(%rax),%rbx
  772. lea (%rax),%rsp # restore %rsp
  773. .Lsqr_1024_epilogue:
  774. ret
  775. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  776. ___
  777. }
  778. { # void AMM_WW(
  779. my $rp="%rdi"; # BN_ULONG *rp,
  780. my $ap="%rsi"; # const BN_ULONG *ap,
  781. my $bp="%rdx"; # const BN_ULONG *bp,
  782. my $np="%rcx"; # const BN_ULONG *np,
  783. my $n0="%r8d"; # unsigned int n0);
  784. # The registers that hold the accumulated redundant result
  785. # The AMM works on 1024 bit operands, and redundant word size is 29
  786. # Therefore: ceil(1024/29)/4 = 9
  787. my $ACC0="%ymm0";
  788. my $ACC1="%ymm1";
  789. my $ACC2="%ymm2";
  790. my $ACC3="%ymm3";
  791. my $ACC4="%ymm4";
  792. my $ACC5="%ymm5";
  793. my $ACC6="%ymm6";
  794. my $ACC7="%ymm7";
  795. my $ACC8="%ymm8";
  796. my $ACC9="%ymm9";
  797. # Registers that hold the broadcasted words of multiplier, currently used
  798. my $Bi="%ymm10";
  799. my $Yi="%ymm11";
  800. # Helper registers
  801. my $TEMP0=$ACC0;
  802. my $TEMP1="%ymm12";
  803. my $TEMP2="%ymm13";
  804. my $ZERO="%ymm14";
  805. my $AND_MASK="%ymm15";
  806. # alu registers that hold the first words of the ACC
  807. my $r0="%r9";
  808. my $r1="%r10";
  809. my $r2="%r11";
  810. my $r3="%r12";
  811. my $i="%r14d";
  812. my $tmp="%r15";
  813. $bp="%r13"; # reassigned argument
  814. $code.=<<___;
  815. .globl rsaz_1024_mul_avx2
  816. .type rsaz_1024_mul_avx2,\@function,5
  817. .align 64
  818. rsaz_1024_mul_avx2:
  819. lea (%rsp), %rax
  820. push %rbx
  821. push %rbp
  822. push %r12
  823. push %r13
  824. push %r14
  825. push %r15
  826. ___
  827. $code.=<<___ if ($win64);
  828. vzeroupper
  829. lea -0xa8(%rsp),%rsp
  830. vmovaps %xmm6,-0xd8(%rax)
  831. vmovaps %xmm7,-0xc8(%rax)
  832. vmovaps %xmm8,-0xb8(%rax)
  833. vmovaps %xmm9,-0xa8(%rax)
  834. vmovaps %xmm10,-0x98(%rax)
  835. vmovaps %xmm11,-0x88(%rax)
  836. vmovaps %xmm12,-0x78(%rax)
  837. vmovaps %xmm13,-0x68(%rax)
  838. vmovaps %xmm14,-0x58(%rax)
  839. vmovaps %xmm15,-0x48(%rax)
  840. .Lmul_1024_body:
  841. ___
  842. $code.=<<___;
  843. mov %rax,%rbp
  844. vzeroall
  845. mov %rdx, $bp # reassigned argument
  846. sub \$64,%rsp
  847. # unaligned 256-bit load that crosses page boundary can
  848. # cause severe performance degradation here, so if $ap does
  849. # cross page boundary, swap it with $bp [meaning that caller
  850. # is advised to lay down $ap and $bp next to each other, so
  851. # that only one can cross page boundary].
  852. .byte 0x67,0x67
  853. mov $ap, $tmp
  854. and \$4095, $tmp
  855. add \$32*10, $tmp
  856. shr \$12, $tmp
  857. mov $ap, $tmp
  858. cmovnz $bp, $ap
  859. cmovnz $tmp, $bp
  860. mov $np, $tmp
  861. sub \$-128,$ap # size optimization
  862. sub \$-128,$np
  863. sub \$-128,$rp
  864. and \$4095, $tmp # see if $np crosses page
  865. add \$32*10, $tmp
  866. .byte 0x67,0x67
  867. shr \$12, $tmp
  868. jz .Lmul_1024_no_n_copy
  869. # unaligned 256-bit load that crosses page boundary can
  870. # cause severe performance degradation here, so if $np does
  871. # cross page boundary, copy it to stack and make sure stack
  872. # frame doesn't...
  873. sub \$32*10,%rsp
  874. vmovdqu 32*0-128($np), $ACC0
  875. and \$-512, %rsp
  876. vmovdqu 32*1-128($np), $ACC1
  877. vmovdqu 32*2-128($np), $ACC2
  878. vmovdqu 32*3-128($np), $ACC3
  879. vmovdqu 32*4-128($np), $ACC4
  880. vmovdqu 32*5-128($np), $ACC5
  881. vmovdqu 32*6-128($np), $ACC6
  882. vmovdqu 32*7-128($np), $ACC7
  883. vmovdqu 32*8-128($np), $ACC8
  884. lea 64+128(%rsp),$np
  885. vmovdqu $ACC0, 32*0-128($np)
  886. vpxor $ACC0, $ACC0, $ACC0
  887. vmovdqu $ACC1, 32*1-128($np)
  888. vpxor $ACC1, $ACC1, $ACC1
  889. vmovdqu $ACC2, 32*2-128($np)
  890. vpxor $ACC2, $ACC2, $ACC2
  891. vmovdqu $ACC3, 32*3-128($np)
  892. vpxor $ACC3, $ACC3, $ACC3
  893. vmovdqu $ACC4, 32*4-128($np)
  894. vpxor $ACC4, $ACC4, $ACC4
  895. vmovdqu $ACC5, 32*5-128($np)
  896. vpxor $ACC5, $ACC5, $ACC5
  897. vmovdqu $ACC6, 32*6-128($np)
  898. vpxor $ACC6, $ACC6, $ACC6
  899. vmovdqu $ACC7, 32*7-128($np)
  900. vpxor $ACC7, $ACC7, $ACC7
  901. vmovdqu $ACC8, 32*8-128($np)
  902. vmovdqa $ACC0, $ACC8
  903. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
  904. .Lmul_1024_no_n_copy:
  905. and \$-64,%rsp
  906. mov ($bp), %rbx
  907. vpbroadcastq ($bp), $Bi
  908. vmovdqu $ACC0, (%rsp) # clear top of stack
  909. xor $r0, $r0
  910. .byte 0x67
  911. xor $r1, $r1
  912. xor $r2, $r2
  913. xor $r3, $r3
  914. vmovdqu .Land_mask(%rip), $AND_MASK
  915. mov \$9, $i
  916. vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
  917. jmp .Loop_mul_1024
  918. .align 32
  919. .Loop_mul_1024:
  920. vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
  921. mov %rbx, %rax
  922. imulq -128($ap), %rax
  923. add $r0, %rax
  924. mov %rbx, $r1
  925. imulq 8-128($ap), $r1
  926. add 8(%rsp), $r1
  927. mov %rax, $r0
  928. imull $n0, %eax
  929. and \$0x1fffffff, %eax
  930. mov %rbx, $r2
  931. imulq 16-128($ap), $r2
  932. add 16(%rsp), $r2
  933. mov %rbx, $r3
  934. imulq 24-128($ap), $r3
  935. add 24(%rsp), $r3
  936. vpmuludq 32*1-128($ap),$Bi,$TEMP0
  937. vmovd %eax, $Yi
  938. vpaddq $TEMP0,$ACC1,$ACC1
  939. vpmuludq 32*2-128($ap),$Bi,$TEMP1
  940. vpbroadcastq $Yi, $Yi
  941. vpaddq $TEMP1,$ACC2,$ACC2
  942. vpmuludq 32*3-128($ap),$Bi,$TEMP2
  943. vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
  944. vpaddq $TEMP2,$ACC3,$ACC3
  945. vpmuludq 32*4-128($ap),$Bi,$TEMP0
  946. vpaddq $TEMP0,$ACC4,$ACC4
  947. vpmuludq 32*5-128($ap),$Bi,$TEMP1
  948. vpaddq $TEMP1,$ACC5,$ACC5
  949. vpmuludq 32*6-128($ap),$Bi,$TEMP2
  950. vpaddq $TEMP2,$ACC6,$ACC6
  951. vpmuludq 32*7-128($ap),$Bi,$TEMP0
  952. vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
  953. vpaddq $TEMP0,$ACC7,$ACC7
  954. vpmuludq 32*8-128($ap),$Bi,$TEMP1
  955. vpbroadcastq 8($bp), $Bi
  956. vpaddq $TEMP1,$ACC8,$ACC8
  957. mov %rax,%rdx
  958. imulq -128($np),%rax
  959. add %rax,$r0
  960. mov %rdx,%rax
  961. imulq 8-128($np),%rax
  962. add %rax,$r1
  963. mov %rdx,%rax
  964. imulq 16-128($np),%rax
  965. add %rax,$r2
  966. shr \$29, $r0
  967. imulq 24-128($np),%rdx
  968. add %rdx,$r3
  969. add $r0, $r1
  970. vpmuludq 32*1-128($np),$Yi,$TEMP2
  971. vmovq $Bi, %rbx
  972. vpaddq $TEMP2,$ACC1,$ACC1
  973. vpmuludq 32*2-128($np),$Yi,$TEMP0
  974. vpaddq $TEMP0,$ACC2,$ACC2
  975. vpmuludq 32*3-128($np),$Yi,$TEMP1
  976. vpaddq $TEMP1,$ACC3,$ACC3
  977. vpmuludq 32*4-128($np),$Yi,$TEMP2
  978. vpaddq $TEMP2,$ACC4,$ACC4
  979. vpmuludq 32*5-128($np),$Yi,$TEMP0
  980. vpaddq $TEMP0,$ACC5,$ACC5
  981. vpmuludq 32*6-128($np),$Yi,$TEMP1
  982. vpaddq $TEMP1,$ACC6,$ACC6
  983. vpmuludq 32*7-128($np),$Yi,$TEMP2
  984. vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
  985. vpaddq $TEMP2,$ACC7,$ACC7
  986. vpmuludq 32*8-128($np),$Yi,$TEMP0
  987. vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
  988. vpaddq $TEMP0,$ACC8,$ACC8
  989. mov %rbx, %rax
  990. imulq -128($ap),%rax
  991. add %rax,$r1
  992. vmovdqu -8+32*1-128($ap),$TEMP1
  993. mov %rbx, %rax
  994. imulq 8-128($ap),%rax
  995. add %rax,$r2
  996. vmovdqu -8+32*2-128($ap),$TEMP2
  997. mov $r1, %rax
  998. imull $n0, %eax
  999. and \$0x1fffffff, %eax
  1000. imulq 16-128($ap),%rbx
  1001. add %rbx,$r3
  1002. vpmuludq $Bi,$TEMP1,$TEMP1
  1003. vmovd %eax, $Yi
  1004. vmovdqu -8+32*3-128($ap),$TEMP0
  1005. vpaddq $TEMP1,$ACC1,$ACC1
  1006. vpmuludq $Bi,$TEMP2,$TEMP2
  1007. vpbroadcastq $Yi, $Yi
  1008. vmovdqu -8+32*4-128($ap),$TEMP1
  1009. vpaddq $TEMP2,$ACC2,$ACC2
  1010. vpmuludq $Bi,$TEMP0,$TEMP0
  1011. vmovdqu -8+32*5-128($ap),$TEMP2
  1012. vpaddq $TEMP0,$ACC3,$ACC3
  1013. vpmuludq $Bi,$TEMP1,$TEMP1
  1014. vmovdqu -8+32*6-128($ap),$TEMP0
  1015. vpaddq $TEMP1,$ACC4,$ACC4
  1016. vpmuludq $Bi,$TEMP2,$TEMP2
  1017. vmovdqu -8+32*7-128($ap),$TEMP1
  1018. vpaddq $TEMP2,$ACC5,$ACC5
  1019. vpmuludq $Bi,$TEMP0,$TEMP0
  1020. vmovdqu -8+32*8-128($ap),$TEMP2
  1021. vpaddq $TEMP0,$ACC6,$ACC6
  1022. vpmuludq $Bi,$TEMP1,$TEMP1
  1023. vmovdqu -8+32*9-128($ap),$ACC9
  1024. vpaddq $TEMP1,$ACC7,$ACC7
  1025. vpmuludq $Bi,$TEMP2,$TEMP2
  1026. vpaddq $TEMP2,$ACC8,$ACC8
  1027. vpmuludq $Bi,$ACC9,$ACC9
  1028. vpbroadcastq 16($bp), $Bi
  1029. mov %rax,%rdx
  1030. imulq -128($np),%rax
  1031. add %rax,$r1
  1032. vmovdqu -8+32*1-128($np),$TEMP0
  1033. mov %rdx,%rax
  1034. imulq 8-128($np),%rax
  1035. add %rax,$r2
  1036. vmovdqu -8+32*2-128($np),$TEMP1
  1037. shr \$29, $r1
  1038. imulq 16-128($np),%rdx
  1039. add %rdx,$r3
  1040. add $r1, $r2
  1041. vpmuludq $Yi,$TEMP0,$TEMP0
  1042. vmovq $Bi, %rbx
  1043. vmovdqu -8+32*3-128($np),$TEMP2
  1044. vpaddq $TEMP0,$ACC1,$ACC1
  1045. vpmuludq $Yi,$TEMP1,$TEMP1
  1046. vmovdqu -8+32*4-128($np),$TEMP0
  1047. vpaddq $TEMP1,$ACC2,$ACC2
  1048. vpmuludq $Yi,$TEMP2,$TEMP2
  1049. vmovdqu -8+32*5-128($np),$TEMP1
  1050. vpaddq $TEMP2,$ACC3,$ACC3
  1051. vpmuludq $Yi,$TEMP0,$TEMP0
  1052. vmovdqu -8+32*6-128($np),$TEMP2
  1053. vpaddq $TEMP0,$ACC4,$ACC4
  1054. vpmuludq $Yi,$TEMP1,$TEMP1
  1055. vmovdqu -8+32*7-128($np),$TEMP0
  1056. vpaddq $TEMP1,$ACC5,$ACC5
  1057. vpmuludq $Yi,$TEMP2,$TEMP2
  1058. vmovdqu -8+32*8-128($np),$TEMP1
  1059. vpaddq $TEMP2,$ACC6,$ACC6
  1060. vpmuludq $Yi,$TEMP0,$TEMP0
  1061. vmovdqu -8+32*9-128($np),$TEMP2
  1062. vpaddq $TEMP0,$ACC7,$ACC7
  1063. vpmuludq $Yi,$TEMP1,$TEMP1
  1064. vpaddq $TEMP1,$ACC8,$ACC8
  1065. vpmuludq $Yi,$TEMP2,$TEMP2
  1066. vpaddq $TEMP2,$ACC9,$ACC9
  1067. vmovdqu -16+32*1-128($ap),$TEMP0
  1068. mov %rbx,%rax
  1069. imulq -128($ap),%rax
  1070. add $r2,%rax
  1071. vmovdqu -16+32*2-128($ap),$TEMP1
  1072. mov %rax,$r2
  1073. imull $n0, %eax
  1074. and \$0x1fffffff, %eax
  1075. imulq 8-128($ap),%rbx
  1076. add %rbx,$r3
  1077. vpmuludq $Bi,$TEMP0,$TEMP0
  1078. vmovd %eax, $Yi
  1079. vmovdqu -16+32*3-128($ap),$TEMP2
  1080. vpaddq $TEMP0,$ACC1,$ACC1
  1081. vpmuludq $Bi,$TEMP1,$TEMP1
  1082. vpbroadcastq $Yi, $Yi
  1083. vmovdqu -16+32*4-128($ap),$TEMP0
  1084. vpaddq $TEMP1,$ACC2,$ACC2
  1085. vpmuludq $Bi,$TEMP2,$TEMP2
  1086. vmovdqu -16+32*5-128($ap),$TEMP1
  1087. vpaddq $TEMP2,$ACC3,$ACC3
  1088. vpmuludq $Bi,$TEMP0,$TEMP0
  1089. vmovdqu -16+32*6-128($ap),$TEMP2
  1090. vpaddq $TEMP0,$ACC4,$ACC4
  1091. vpmuludq $Bi,$TEMP1,$TEMP1
  1092. vmovdqu -16+32*7-128($ap),$TEMP0
  1093. vpaddq $TEMP1,$ACC5,$ACC5
  1094. vpmuludq $Bi,$TEMP2,$TEMP2
  1095. vmovdqu -16+32*8-128($ap),$TEMP1
  1096. vpaddq $TEMP2,$ACC6,$ACC6
  1097. vpmuludq $Bi,$TEMP0,$TEMP0
  1098. vmovdqu -16+32*9-128($ap),$TEMP2
  1099. vpaddq $TEMP0,$ACC7,$ACC7
  1100. vpmuludq $Bi,$TEMP1,$TEMP1
  1101. vpaddq $TEMP1,$ACC8,$ACC8
  1102. vpmuludq $Bi,$TEMP2,$TEMP2
  1103. vpbroadcastq 24($bp), $Bi
  1104. vpaddq $TEMP2,$ACC9,$ACC9
  1105. vmovdqu -16+32*1-128($np),$TEMP0
  1106. mov %rax,%rdx
  1107. imulq -128($np),%rax
  1108. add %rax,$r2
  1109. vmovdqu -16+32*2-128($np),$TEMP1
  1110. imulq 8-128($np),%rdx
  1111. add %rdx,$r3
  1112. shr \$29, $r2
  1113. vpmuludq $Yi,$TEMP0,$TEMP0
  1114. vmovq $Bi, %rbx
  1115. vmovdqu -16+32*3-128($np),$TEMP2
  1116. vpaddq $TEMP0,$ACC1,$ACC1
  1117. vpmuludq $Yi,$TEMP1,$TEMP1
  1118. vmovdqu -16+32*4-128($np),$TEMP0
  1119. vpaddq $TEMP1,$ACC2,$ACC2
  1120. vpmuludq $Yi,$TEMP2,$TEMP2
  1121. vmovdqu -16+32*5-128($np),$TEMP1
  1122. vpaddq $TEMP2,$ACC3,$ACC3
  1123. vpmuludq $Yi,$TEMP0,$TEMP0
  1124. vmovdqu -16+32*6-128($np),$TEMP2
  1125. vpaddq $TEMP0,$ACC4,$ACC4
  1126. vpmuludq $Yi,$TEMP1,$TEMP1
  1127. vmovdqu -16+32*7-128($np),$TEMP0
  1128. vpaddq $TEMP1,$ACC5,$ACC5
  1129. vpmuludq $Yi,$TEMP2,$TEMP2
  1130. vmovdqu -16+32*8-128($np),$TEMP1
  1131. vpaddq $TEMP2,$ACC6,$ACC6
  1132. vpmuludq $Yi,$TEMP0,$TEMP0
  1133. vmovdqu -16+32*9-128($np),$TEMP2
  1134. vpaddq $TEMP0,$ACC7,$ACC7
  1135. vpmuludq $Yi,$TEMP1,$TEMP1
  1136. vmovdqu -24+32*1-128($ap),$TEMP0
  1137. vpaddq $TEMP1,$ACC8,$ACC8
  1138. vpmuludq $Yi,$TEMP2,$TEMP2
  1139. vmovdqu -24+32*2-128($ap),$TEMP1
  1140. vpaddq $TEMP2,$ACC9,$ACC9
  1141. add $r2, $r3
  1142. imulq -128($ap),%rbx
  1143. add %rbx,$r3
  1144. mov $r3, %rax
  1145. imull $n0, %eax
  1146. and \$0x1fffffff, %eax
  1147. vpmuludq $Bi,$TEMP0,$TEMP0
  1148. vmovd %eax, $Yi
  1149. vmovdqu -24+32*3-128($ap),$TEMP2
  1150. vpaddq $TEMP0,$ACC1,$ACC1
  1151. vpmuludq $Bi,$TEMP1,$TEMP1
  1152. vpbroadcastq $Yi, $Yi
  1153. vmovdqu -24+32*4-128($ap),$TEMP0
  1154. vpaddq $TEMP1,$ACC2,$ACC2
  1155. vpmuludq $Bi,$TEMP2,$TEMP2
  1156. vmovdqu -24+32*5-128($ap),$TEMP1
  1157. vpaddq $TEMP2,$ACC3,$ACC3
  1158. vpmuludq $Bi,$TEMP0,$TEMP0
  1159. vmovdqu -24+32*6-128($ap),$TEMP2
  1160. vpaddq $TEMP0,$ACC4,$ACC4
  1161. vpmuludq $Bi,$TEMP1,$TEMP1
  1162. vmovdqu -24+32*7-128($ap),$TEMP0
  1163. vpaddq $TEMP1,$ACC5,$ACC5
  1164. vpmuludq $Bi,$TEMP2,$TEMP2
  1165. vmovdqu -24+32*8-128($ap),$TEMP1
  1166. vpaddq $TEMP2,$ACC6,$ACC6
  1167. vpmuludq $Bi,$TEMP0,$TEMP0
  1168. vmovdqu -24+32*9-128($ap),$TEMP2
  1169. vpaddq $TEMP0,$ACC7,$ACC7
  1170. vpmuludq $Bi,$TEMP1,$TEMP1
  1171. vpaddq $TEMP1,$ACC8,$ACC8
  1172. vpmuludq $Bi,$TEMP2,$TEMP2
  1173. vpbroadcastq 32($bp), $Bi
  1174. vpaddq $TEMP2,$ACC9,$ACC9
  1175. add \$32, $bp # $bp++
  1176. vmovdqu -24+32*1-128($np),$TEMP0
  1177. imulq -128($np),%rax
  1178. add %rax,$r3
  1179. shr \$29, $r3
  1180. vmovdqu -24+32*2-128($np),$TEMP1
  1181. vpmuludq $Yi,$TEMP0,$TEMP0
  1182. vmovq $Bi, %rbx
  1183. vmovdqu -24+32*3-128($np),$TEMP2
  1184. vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
  1185. vpmuludq $Yi,$TEMP1,$TEMP1
  1186. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  1187. vpaddq $TEMP1,$ACC2,$ACC1
  1188. vmovdqu -24+32*4-128($np),$TEMP0
  1189. vpmuludq $Yi,$TEMP2,$TEMP2
  1190. vmovdqu -24+32*5-128($np),$TEMP1
  1191. vpaddq $TEMP2,$ACC3,$ACC2
  1192. vpmuludq $Yi,$TEMP0,$TEMP0
  1193. vmovdqu -24+32*6-128($np),$TEMP2
  1194. vpaddq $TEMP0,$ACC4,$ACC3
  1195. vpmuludq $Yi,$TEMP1,$TEMP1
  1196. vmovdqu -24+32*7-128($np),$TEMP0
  1197. vpaddq $TEMP1,$ACC5,$ACC4
  1198. vpmuludq $Yi,$TEMP2,$TEMP2
  1199. vmovdqu -24+32*8-128($np),$TEMP1
  1200. vpaddq $TEMP2,$ACC6,$ACC5
  1201. vpmuludq $Yi,$TEMP0,$TEMP0
  1202. vmovdqu -24+32*9-128($np),$TEMP2
  1203. mov $r3, $r0
  1204. vpaddq $TEMP0,$ACC7,$ACC6
  1205. vpmuludq $Yi,$TEMP1,$TEMP1
  1206. add (%rsp), $r0
  1207. vpaddq $TEMP1,$ACC8,$ACC7
  1208. vpmuludq $Yi,$TEMP2,$TEMP2
  1209. vmovq $r3, $TEMP1
  1210. vpaddq $TEMP2,$ACC9,$ACC8
  1211. dec $i
  1212. jnz .Loop_mul_1024
  1213. ___
  1214. # (*) Original implementation was correcting ACC1-ACC3 for overflow
  1215. # after 7 loop runs, or after 28 iterations, or 56 additions.
  1216. # But as we underutilize resources, it's possible to correct in
  1217. # each iteration with marginal performance loss. But then, as
  1218. # we do it in each iteration, we can correct fewer digits, and
  1219. # avoid performance penalties completely. Also note that we
  1220. # correct only three digits out of four. This works because
  1221. # the most significant digit is subjected to fewer additions.
  1222. $TEMP0 = $ACC9;
  1223. $TEMP3 = $Bi;
  1224. $TEMP4 = $Yi;
  1225. $code.=<<___;
  1226. vpermq \$0, $AND_MASK, $AND_MASK
  1227. vpaddq (%rsp), $TEMP1, $ACC0
  1228. vpsrlq \$29, $ACC0, $TEMP1
  1229. vpand $AND_MASK, $ACC0, $ACC0
  1230. vpsrlq \$29, $ACC1, $TEMP2
  1231. vpand $AND_MASK, $ACC1, $ACC1
  1232. vpsrlq \$29, $ACC2, $TEMP3
  1233. vpermq \$0x93, $TEMP1, $TEMP1
  1234. vpand $AND_MASK, $ACC2, $ACC2
  1235. vpsrlq \$29, $ACC3, $TEMP4
  1236. vpermq \$0x93, $TEMP2, $TEMP2
  1237. vpand $AND_MASK, $ACC3, $ACC3
  1238. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1239. vpermq \$0x93, $TEMP3, $TEMP3
  1240. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1241. vpermq \$0x93, $TEMP4, $TEMP4
  1242. vpaddq $TEMP0, $ACC0, $ACC0
  1243. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1244. vpaddq $TEMP1, $ACC1, $ACC1
  1245. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1246. vpaddq $TEMP2, $ACC2, $ACC2
  1247. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  1248. vpaddq $TEMP3, $ACC3, $ACC3
  1249. vpaddq $TEMP4, $ACC4, $ACC4
  1250. vpsrlq \$29, $ACC0, $TEMP1
  1251. vpand $AND_MASK, $ACC0, $ACC0
  1252. vpsrlq \$29, $ACC1, $TEMP2
  1253. vpand $AND_MASK, $ACC1, $ACC1
  1254. vpsrlq \$29, $ACC2, $TEMP3
  1255. vpermq \$0x93, $TEMP1, $TEMP1
  1256. vpand $AND_MASK, $ACC2, $ACC2
  1257. vpsrlq \$29, $ACC3, $TEMP4
  1258. vpermq \$0x93, $TEMP2, $TEMP2
  1259. vpand $AND_MASK, $ACC3, $ACC3
  1260. vpermq \$0x93, $TEMP3, $TEMP3
  1261. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1262. vpermq \$0x93, $TEMP4, $TEMP4
  1263. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1264. vpaddq $TEMP0, $ACC0, $ACC0
  1265. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1266. vpaddq $TEMP1, $ACC1, $ACC1
  1267. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1268. vpaddq $TEMP2, $ACC2, $ACC2
  1269. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  1270. vpaddq $TEMP3, $ACC3, $ACC3
  1271. vpaddq $TEMP4, $ACC4, $ACC4
  1272. vmovdqu $ACC0, 0-128($rp)
  1273. vmovdqu $ACC1, 32-128($rp)
  1274. vmovdqu $ACC2, 64-128($rp)
  1275. vmovdqu $ACC3, 96-128($rp)
  1276. ___
  1277. $TEMP5=$ACC0;
  1278. $code.=<<___;
  1279. vpsrlq \$29, $ACC4, $TEMP1
  1280. vpand $AND_MASK, $ACC4, $ACC4
  1281. vpsrlq \$29, $ACC5, $TEMP2
  1282. vpand $AND_MASK, $ACC5, $ACC5
  1283. vpsrlq \$29, $ACC6, $TEMP3
  1284. vpermq \$0x93, $TEMP1, $TEMP1
  1285. vpand $AND_MASK, $ACC6, $ACC6
  1286. vpsrlq \$29, $ACC7, $TEMP4
  1287. vpermq \$0x93, $TEMP2, $TEMP2
  1288. vpand $AND_MASK, $ACC7, $ACC7
  1289. vpsrlq \$29, $ACC8, $TEMP5
  1290. vpermq \$0x93, $TEMP3, $TEMP3
  1291. vpand $AND_MASK, $ACC8, $ACC8
  1292. vpermq \$0x93, $TEMP4, $TEMP4
  1293. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1294. vpermq \$0x93, $TEMP5, $TEMP5
  1295. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1296. vpaddq $TEMP0, $ACC4, $ACC4
  1297. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1298. vpaddq $TEMP1, $ACC5, $ACC5
  1299. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1300. vpaddq $TEMP2, $ACC6, $ACC6
  1301. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  1302. vpaddq $TEMP3, $ACC7, $ACC7
  1303. vpaddq $TEMP4, $ACC8, $ACC8
  1304. vpsrlq \$29, $ACC4, $TEMP1
  1305. vpand $AND_MASK, $ACC4, $ACC4
  1306. vpsrlq \$29, $ACC5, $TEMP2
  1307. vpand $AND_MASK, $ACC5, $ACC5
  1308. vpsrlq \$29, $ACC6, $TEMP3
  1309. vpermq \$0x93, $TEMP1, $TEMP1
  1310. vpand $AND_MASK, $ACC6, $ACC6
  1311. vpsrlq \$29, $ACC7, $TEMP4
  1312. vpermq \$0x93, $TEMP2, $TEMP2
  1313. vpand $AND_MASK, $ACC7, $ACC7
  1314. vpsrlq \$29, $ACC8, $TEMP5
  1315. vpermq \$0x93, $TEMP3, $TEMP3
  1316. vpand $AND_MASK, $ACC8, $ACC8
  1317. vpermq \$0x93, $TEMP4, $TEMP4
  1318. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  1319. vpermq \$0x93, $TEMP5, $TEMP5
  1320. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  1321. vpaddq $TEMP0, $ACC4, $ACC4
  1322. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  1323. vpaddq $TEMP1, $ACC5, $ACC5
  1324. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  1325. vpaddq $TEMP2, $ACC6, $ACC6
  1326. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  1327. vpaddq $TEMP3, $ACC7, $ACC7
  1328. vpaddq $TEMP4, $ACC8, $ACC8
  1329. vmovdqu $ACC4, 128-128($rp)
  1330. vmovdqu $ACC5, 160-128($rp)
  1331. vmovdqu $ACC6, 192-128($rp)
  1332. vmovdqu $ACC7, 224-128($rp)
  1333. vmovdqu $ACC8, 256-128($rp)
  1334. vzeroupper
  1335. mov %rbp, %rax
  1336. ___
  1337. $code.=<<___ if ($win64);
  1338. movaps -0xd8(%rax),%xmm6
  1339. movaps -0xc8(%rax),%xmm7
  1340. movaps -0xb8(%rax),%xmm8
  1341. movaps -0xa8(%rax),%xmm9
  1342. movaps -0x98(%rax),%xmm10
  1343. movaps -0x88(%rax),%xmm11
  1344. movaps -0x78(%rax),%xmm12
  1345. movaps -0x68(%rax),%xmm13
  1346. movaps -0x58(%rax),%xmm14
  1347. movaps -0x48(%rax),%xmm15
  1348. ___
  1349. $code.=<<___;
  1350. mov -48(%rax),%r15
  1351. mov -40(%rax),%r14
  1352. mov -32(%rax),%r13
  1353. mov -24(%rax),%r12
  1354. mov -16(%rax),%rbp
  1355. mov -8(%rax),%rbx
  1356. lea (%rax),%rsp # restore %rsp
  1357. .Lmul_1024_epilogue:
  1358. ret
  1359. .size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
  1360. ___
  1361. }
  1362. {
  # Generators for the two layout-conversion routines.  Going by the code
  # emitted below, the "red" layout keeps one 29-bit digit per 8-byte slot,
  # while the "norm" layout is sixteen packed 64-bit words.
  # @T is a pool of four scratch registers (%r8-%r11); it is rotated with
  # push(@T,shift(@T)) as the loops are unrolled, so @T[0] / @T[-1] name
  # different physical registers at different points of the generated code.
  1363. my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
  1364. my @T = map("%r$_",(8..11));
  # Prologue of red2norm: bias $inp by +128 (all later loads use a matching
  # -128 displacement) and clear %rax, which is the running accumulator.
  1365. $code.=<<___;
  1366. .globl rsaz_1024_red2norm_avx2
  1367. .type rsaz_1024_red2norm_avx2,\@abi-omnipotent
  1368. .align 32
  1369. rsaz_1024_red2norm_avx2:
  1370. sub \$-128,$inp # size optimization
  1371. xor %rax,%rax
  1372. ___
  # Fully unrolled: one pass per 64-bit output word $i.  $j is the index of
  # the next input digit and persists across passes; $k and $l count the
  # digits loaded during the current pass.
  1373. for ($j=0,$i=0; $i<16; $i++) {
  1374. my $k=0;
  1375. while (29*$j<64*($i+1)) { # load data till boundary
  1376. $code.=" mov `8*$j-128`($inp), @T[0]\n";
  1377. $j++; $k++; push(@T,shift(@T));
  1378. }
  # After the rotations above the loaded digits sit in @T[-$k] .. @T[-1].
  1379. $l=$k;
  # The literal shift counts generated here can exceed 63 (or be negative
  # further down); the substitution pass at the end of the file rewrites
  # every literal shl/shr/shrd count modulo 64.
  1380. while ($k>1) { # shift loaded data but last value
  1381. $code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
  1382. $k--;
  1383. }
  # The last digit is split in two: @T[-1] keeps the part shifted left into
  # the current word, @T[0] receives the part shifted right out of it.
  1384. $code.=<<___; # shift last value
  1385. mov @T[-1], @T[0]
  1386. shl \$`29*($j-1)`, @T[-1]
  1387. shr \$`-29*($j-1)`, @T[0]
  1388. ___
  1389. while ($l) { # accumulate all values
  1390. $code.=" add @T[-$l], %rax\n";
  1391. $l--;
  1392. }
  # Fold the carry of the last add into the spill-over, store the finished
  # word, and seed %rax with the spill-over for the next pass.
  1393. $code.=<<___;
  1394. adc \$0, @T[0] # consume eventual carry
  1395. mov %rax, 8*$i($out)
  1396. mov @T[0], %rax
  1397. ___
  1398. push(@T,shift(@T));
  1399. }
  # This heredoc both terminates red2norm (ret/.size) and opens norm2red:
  # bias $out by +128, load the first input word into @T[0] and put the
  # 29-bit mask 0x1fffffff into %eax (used below as %rax).
  1400. $code.=<<___;
  1401. ret
  1402. .size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
  1403. .globl rsaz_1024_norm2red_avx2
  1404. .type rsaz_1024_norm2red_avx2,\@abi-omnipotent
  1405. .align 32
  1406. rsaz_1024_norm2red_avx2:
  1407. sub \$-128,$out # size optimization
  1408. mov ($inp),@T[0]
  1409. mov \$0x1fffffff,%eax
  1410. ___
  # Fully unrolled: one pass per 64-bit input word $i; $j is the index of
  # the next output digit.  @T[0] holds the current word and @T[1] the
  # following one (zeroed on the last pass) so that shrd can assemble the
  # digit that spans both words.
  1411. for ($j=0,$i=0; $i<16; $i++) {
  1412. $code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
  1413. $code.=" xor @T[1],@T[1]\n" if ($i==15);
  1414. my $k=1;
  # Digits that lie entirely inside the current word: copy, shift, mask, store.
  1415. while (29*($j+1)<64*($i+1)) {
  1416. $code.=<<___;
  1417. mov @T[0],@T[-$k]
  1418. shr \$`29*$j`,@T[-$k]
  1419. and %rax,@T[-$k] # &0x1fffffff
  1420. mov @T[-$k],`8*$j-128`($out)
  1421. ___
  1422. $j++; $k++;
  1423. }
  # Digit that crosses into the next word: shrd pulls the missing high bits
  # from @T[1] before masking.
  1424. $code.=<<___;
  1425. shrd \$`29*$j`,@T[1],@T[0]
  1426. and %rax,@T[0]
  1427. mov @T[0],`8*$j-128`($out)
  1428. ___
  1429. $j++;
  1430. push(@T,shift(@T));
  1431. }
  # Epilogue: after the final rotation @T[0] is the register that was cleared
  # by the xor in the last pass, so these four stores write zero digits.
  1432. $code.=<<___;
  1433. mov @T[0],`8*$j-128`($out) # zero
  1434. mov @T[0],`8*($j+1)-128`($out)
  1435. mov @T[0],`8*($j+2)-128`($out)
  1436. mov @T[0],`8*($j+3)-128`($out)
  1437. ret
  1438. .size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
  1439. ___
  1440. }
  1441. {
  # Generators for the table scatter/gather routines.  $power is a 32-bit
  # register holding the table index.
  1442. my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
  # scatter5: start at $out + 16*$power, then run nine passes.  Each pass
  # loads 32 bytes from $inp, uses vpermd with .Lscatter_permd to gather
  # dwords 0,2,4,6 into the low 128 bits, stores those 16 bytes and advances
  # $out by 16*32 bytes.
  # gather5's entry label is emitted at the end of the same heredoc; its
  # body follows in the next pieces.
  1443. $code.=<<___;
  1444. .globl rsaz_1024_scatter5_avx2
  1445. .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
  1446. .align 32
  1447. rsaz_1024_scatter5_avx2:
  1448. vzeroupper
  1449. vmovdqu .Lscatter_permd(%rip),%ymm5
  1450. shl \$4,$power
  1451. lea ($out,$power),$out
  1452. mov \$9,%eax
  1453. jmp .Loop_scatter_1024
  1454. .align 32
  1455. .Loop_scatter_1024:
  1456. vmovdqu ($inp),%ymm0
  1457. lea 32($inp),$inp
  1458. vpermd %ymm0,%ymm5,%ymm0
  1459. vmovdqu %xmm0,($out)
  1460. lea 16*32($out),$out
  1461. dec %eax
  1462. jnz .Loop_scatter_1024
  1463. vzeroupper
  1464. ret
  1465. .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
  1466. .globl rsaz_1024_gather5_avx2
  1467. .type rsaz_1024_gather5_avx2,\@abi-omnipotent
  1468. .align 32
  1469. rsaz_1024_gather5_avx2:
  1470. ___
  # Win64 only: lower %rsp by 0xa8 in total and save %xmm6-%xmm15.  The
  # instructions are emitted as raw bytes; the unwind codes in
  # .LSEH_info_rsaz_1024_gather5 carry hard-coded prologue offsets, so the
  # encodings must not change size.
  1471. $code.=<<___ if ($win64);
  1472. lea -0x88(%rsp),%rax
  1473. vzeroupper
  1474. .LSEH_begin_rsaz_1024_gather5:
  1475. # I can't trust assembler to use specific encoding:-(
  1476. .byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
  1477. .byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
  1478. .byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
  1479. .byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
  1480. .byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
  1481. .byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
  1482. .byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
  1483. .byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
  1484. .byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
  1485. .byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
  1486. .byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
  1487. ___
  # gather5 body.  %eax = $power>>2 selects one of eight 64-byte lines and
  # ($power&3)<<4 is the 16-byte offset inside a line.  .Lgather_table is
  # eight zero bytes, one 0xff, seven zero bytes; broadcasting the bytes at
  # offsets 8..1 plus %eax yields eight masks in %xmm8-%xmm15 of which, for
  # $power in 0..31, exactly one is all-ones.  Each of the nine passes ANDs
  # the eight 16-byte candidates (64 bytes apart) with their masks, ORs them
  # together, widens the four dwords to qwords with vpermd/.Lgather_permd
  # and stores 32 bytes; $inp then advances by 8*64 bytes.  A final 32-byte
  # zero vector is stored after the loop.
  # NOTE(review): all eight lines are read on every pass, but the 16-byte
  # offset within each line still depends on $power (see the lea below) --
  # confirm whether that is acceptable for the intended threat model.
  1488. $code.=<<___;
  1489. lea .Lgather_table(%rip),%r11
  1490. mov $power,%eax
  1491. and \$3,$power
  1492. shr \$2,%eax # cache line number
  1493. shl \$4,$power # offset within cache line
  1494. vmovdqu -32(%r11),%ymm7 # .Lgather_permd
  1495. vpbroadcastb 8(%r11,%rax), %xmm8
  1496. vpbroadcastb 7(%r11,%rax), %xmm9
  1497. vpbroadcastb 6(%r11,%rax), %xmm10
  1498. vpbroadcastb 5(%r11,%rax), %xmm11
  1499. vpbroadcastb 4(%r11,%rax), %xmm12
  1500. vpbroadcastb 3(%r11,%rax), %xmm13
  1501. vpbroadcastb 2(%r11,%rax), %xmm14
  1502. vpbroadcastb 1(%r11,%rax), %xmm15
  1503. lea 64($inp,$power),$inp
  1504. mov \$64,%r11 # size optimization
  1505. mov \$9,%eax
  1506. jmp .Loop_gather_1024
  1507. .align 32
  1508. .Loop_gather_1024:
  1509. vpand -64($inp), %xmm8,%xmm0
  1510. vpand ($inp), %xmm9,%xmm1
  1511. vpand 64($inp), %xmm10,%xmm2
  1512. vpand ($inp,%r11,2), %xmm11,%xmm3
  1513. vpor %xmm0,%xmm1,%xmm1
  1514. vpand 64($inp,%r11,2), %xmm12,%xmm4
  1515. vpor %xmm2,%xmm3,%xmm3
  1516. vpand ($inp,%r11,4), %xmm13,%xmm5
  1517. vpor %xmm1,%xmm3,%xmm3
  1518. vpand 64($inp,%r11,4), %xmm14,%xmm6
  1519. vpor %xmm4,%xmm5,%xmm5
  1520. vpand -128($inp,%r11,8), %xmm15,%xmm2
  1521. lea ($inp,%r11,8),$inp
  1522. vpor %xmm3,%xmm5,%xmm5
  1523. vpor %xmm2,%xmm6,%xmm6
  1524. vpor %xmm5,%xmm6,%xmm6
  1525. vpermd %ymm6,%ymm7,%ymm6
  1526. vmovdqu %ymm6,($out)
  1527. lea 32($out),$out
  1528. dec %eax
  1529. jnz .Loop_gather_1024
  1530. vpxor %ymm0,%ymm0,%ymm0
  1531. vmovdqu %ymm0,($out)
  1532. vzeroupper
  1533. ___
  # Win64 only: restore %xmm6-%xmm15 and release the 0xa8 bytes of stack.
  1534. $code.=<<___ if ($win64);
  1535. movaps (%rsp),%xmm6
  1536. movaps 0x10(%rsp),%xmm7
  1537. movaps 0x20(%rsp),%xmm8
  1538. movaps 0x30(%rsp),%xmm9
  1539. movaps 0x40(%rsp),%xmm10
  1540. movaps 0x50(%rsp),%xmm11
  1541. movaps 0x60(%rsp),%xmm12
  1542. movaps 0x70(%rsp),%xmm13
  1543. movaps 0x80(%rsp),%xmm14
  1544. movaps 0x90(%rsp),%xmm15
  1545. lea 0xa8(%rsp),%rsp
  1546. .LSEH_end_rsaz_1024_gather5:
  1547. ___
  1548. $code.=<<___;
  1549. ret
  1550. .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
  1551. ___
  1552. }
  # rsaz_avx2_eligible: returns, as 0 or 1, bit 5 of the dword at
  # OPENSSL_ia32cap_P+8 (presumably the AVX2 capability flag, going by the
  # function name -- confirm against the capability-vector layout).
  1553. $code.=<<___;
  1554. .extern OPENSSL_ia32cap_P
  1555. .globl rsaz_avx2_eligible
  1556. .type rsaz_avx2_eligible,\@abi-omnipotent
  1557. .align 32
  1558. rsaz_avx2_eligible:
  1559. mov OPENSSL_ia32cap_P+8(%rip),%eax
  1560. ___
  # Only when $addx is set: if bits 8 and 19 are both set, %eax is replaced
  # by zero, so the function reports 0.  NOTE(review): presumably another
  # code path is preferred on such CPUs -- confirm with the callers.
  1561. $code.=<<___ if ($addx);
  1562. mov \$`1<<8|1<<19`,%ecx
  1563. mov \$0,%edx
  1564. and %eax,%ecx
  1565. cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
  1566. cmove %edx,%eax
  1567. ___
  # Common tail of rsaz_avx2_eligible followed by the constant tables.
  # .Lgather_permd must stay directly in front of .Lgather_table: gather5
  # loads it through the displacement -32 from the .Lgather_table address.
  1568. $code.=<<___;
  1569. and \$`1<<5`,%eax
  1570. shr \$5,%eax
  1571. ret
  1572. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1573. .align 64
  1574. .Land_mask:
  1575. .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
  1576. .Lscatter_permd:
  1577. .long 0,2,4,6,7,7,7,7
  1578. .Lgather_permd:
  1579. .long 0,7,1,7,2,7,3,7
  1580. .Lgather_table:
  1581. .byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
  1582. .align 64
  1583. ___
  # Win64 structured-exception-handling support: the rsaz_se_handler routine
  # shared by the sqr and mul entry points, followed by the .pdata/.xdata
  # records for sqr, mul and gather5.
  #
  # Handler outline (everything below is one heredoc):
  #  - HandlerData[0]/[1] are image-relative offsets of the "body" and
  #    "epilogue" labels of the faulting function.
  #  - If context->Rip lies between them, the frame pointer kept in
  #    context->Rbp is used to reload %rbx, %rbp and %r12-%r15 into the
  #    CONTEXT and to copy 20 quadwords (ten %xmm registers) from -0xd8 below
  #    it to CONTEXT offset 512.
  #  - The common tail fixes up Rsp/Rsi/Rdi, copies the CONTEXT into
  #    disp->ContextRecord, calls RtlVirtualUnwind and returns
  #    ExceptionContinueSearch.
  # gather5 uses plain unwind codes instead of this handler.
  1584. if ($win64) {
  # Names for the four argument registers of the handler; only $context and
  # $disp are referenced in the heredoc below.
  1585. $rec="%rcx";
  1586. $frame="%rdx";
  1587. $context="%r8";
  1588. $disp="%r9";
  1589. $code.=<<___
  1590. .extern __imp_RtlVirtualUnwind
  1591. .type rsaz_se_handler,\@abi-omnipotent
  1592. .align 16
  1593. rsaz_se_handler:
  1594. push %rsi
  1595. push %rdi
  1596. push %rbx
  1597. push %rbp
  1598. push %r12
  1599. push %r13
  1600. push %r14
  1601. push %r15
  1602. pushfq
  1603. sub \$64,%rsp
  1604. mov 120($context),%rax # pull context->Rax
  1605. mov 248($context),%rbx # pull context->Rip
  1606. mov 8($disp),%rsi # disp->ImageBase
  1607. mov 56($disp),%r11 # disp->HandlerData
  1608. mov 0(%r11),%r10d # HandlerData[0]
  1609. lea (%rsi,%r10),%r10 # prologue label
  1610. cmp %r10,%rbx # context->Rip<prologue label
  1611. jb .Lcommon_seh_tail
  1612. mov 152($context),%rax # pull context->Rsp
  1613. mov 4(%r11),%r10d # HandlerData[1]
  1614. lea (%rsi,%r10),%r10 # epilogue label
  1615. cmp %r10,%rbx # context->Rip>=epilogue label
  1616. jae .Lcommon_seh_tail
  1617. mov 160($context),%rax # pull context->Rbp
  1618. mov -48(%rax),%r15
  1619. mov -40(%rax),%r14
  1620. mov -32(%rax),%r13
  1621. mov -24(%rax),%r12
  1622. mov -16(%rax),%rbp
  1623. mov -8(%rax),%rbx
  1624. mov %r15,240($context)
  1625. mov %r14,232($context)
  1626. mov %r13,224($context)
  1627. mov %r12,216($context)
  1628. mov %rbp,160($context)
  1629. mov %rbx,144($context)
  1630. lea -0xd8(%rax),%rsi # %xmm save area
  1631. lea 512($context),%rdi # & context.Xmm6
  1632. mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
  1633. .long 0xa548f3fc # cld; rep movsq
  1634. .Lcommon_seh_tail:
  1635. mov 8(%rax),%rdi
  1636. mov 16(%rax),%rsi
  1637. mov %rax,152($context) # restore context->Rsp
  1638. mov %rsi,168($context) # restore context->Rsi
  1639. mov %rdi,176($context) # restore context->Rdi
  1640. mov 40($disp),%rdi # disp->ContextRecord
  1641. mov $context,%rsi # context
  1642. mov \$154,%ecx # sizeof(CONTEXT)
  1643. .long 0xa548f3fc # cld; rep movsq
  1644. mov $disp,%rsi
  1645. xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
  1646. mov 8(%rsi),%rdx # arg2, disp->ImageBase
  1647. mov 0(%rsi),%r8 # arg3, disp->ControlPc
  1648. mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
  1649. mov 40(%rsi),%r10 # disp->ContextRecord
  1650. lea 56(%rsi),%r11 # &disp->HandlerData
  1651. lea 24(%rsi),%r12 # &disp->EstablisherFrame
  1652. mov %r10,32(%rsp) # arg5
  1653. mov %r11,40(%rsp) # arg6
  1654. mov %r12,48(%rsp) # arg7
  1655. mov %rcx,56(%rsp) # arg8, (NULL)
  1656. call *__imp_RtlVirtualUnwind(%rip)
  1657. mov \$1,%eax # ExceptionContinueSearch
  1658. add \$64,%rsp
  1659. popfq
  1660. pop %r15
  1661. pop %r14
  1662. pop %r13
  1663. pop %r12
  1664. pop %rbp
  1665. pop %rbx
  1666. pop %rdi
  1667. pop %rsi
  1668. ret
  1669. .size rsaz_se_handler,.-rsaz_se_handler
  1670. .section .pdata
  1671. .align 4
  1672. .rva .LSEH_begin_rsaz_1024_sqr_avx2
  1673. .rva .LSEH_end_rsaz_1024_sqr_avx2
  1674. .rva .LSEH_info_rsaz_1024_sqr_avx2
  1675. .rva .LSEH_begin_rsaz_1024_mul_avx2
  1676. .rva .LSEH_end_rsaz_1024_mul_avx2
  1677. .rva .LSEH_info_rsaz_1024_mul_avx2
  1678. .rva .LSEH_begin_rsaz_1024_gather5
  1679. .rva .LSEH_end_rsaz_1024_gather5
  1680. .rva .LSEH_info_rsaz_1024_gather5
  1681. .section .xdata
  1682. .align 8
  1683. .LSEH_info_rsaz_1024_sqr_avx2:
  1684. .byte 9,0,0,0
  1685. .rva rsaz_se_handler
  1686. .rva .Lsqr_1024_body,.Lsqr_1024_epilogue
  1687. .LSEH_info_rsaz_1024_mul_avx2:
  1688. .byte 9,0,0,0
  1689. .rva rsaz_se_handler
  1690. .rva .Lmul_1024_body,.Lmul_1024_epilogue
  1691. .LSEH_info_rsaz_1024_gather5:
  1692. .byte 0x01,0x33,0x16,0x00
  1693. .byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
  1694. .byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
  1695. .byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
  1696. .byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
  1697. .byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
  1698. .byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
  1699. .byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
  1700. .byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
  1701. .byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
  1702. .byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
  1703. .byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
  1704. ___
  1705. }
  # Final pass over the accumulated $code, one output line at a time:
  #  1. every `...` span is replaced by the result of eval'ing its contents;
  #  2. a literal shl/shr/shld/shrd count is reduced modulo 64;
  #  3. otherwise %ymmN is rewritten to %xmmN for vmovd/vmovq, for vmovdqu
  #     written with a "%x%ymmN" operand, for vpinsr[qd]/vpextr[qd], and for
  #     vpbroadcast[qd] with a register source.
  # Steps 2 and 3 are chained with "or", so the first substitution that
  # matches is the only one applied to a given line.
  1706. foreach (split("\n",$code)) {
  1707. s/\`([^\`]*)\`/eval($1)/ge;
  1708. s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
  1709. s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1710. s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
  1711. s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1712. s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
  1713. s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
  1714. print $_,"\n";
  1715. }
  1716. }}} else {{{
  # Fallback output: rsaz_avx2_eligible unconditionally returns 0, and the
  # six rsaz_1024_* entry points are still exported but all alias a single
  # body that executes ud2.
  1717. print <<___; # assembler is too old
  1718. .text
  1719. .globl rsaz_avx2_eligible
  1720. .type rsaz_avx2_eligible,\@abi-omnipotent
  1721. rsaz_avx2_eligible:
  1722. xor %eax,%eax
  1723. ret
  1724. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1725. .globl rsaz_1024_sqr_avx2
  1726. .globl rsaz_1024_mul_avx2
  1727. .globl rsaz_1024_norm2red_avx2
  1728. .globl rsaz_1024_red2norm_avx2
  1729. .globl rsaz_1024_scatter5_avx2
  1730. .globl rsaz_1024_gather5_avx2
  1731. .type rsaz_1024_sqr_avx2,\@abi-omnipotent
  1732. rsaz_1024_sqr_avx2:
  1733. rsaz_1024_mul_avx2:
  1734. rsaz_1024_norm2red_avx2:
  1735. rsaz_1024_red2norm_avx2:
  1736. rsaz_1024_scatter5_avx2:
  1737. rsaz_1024_gather5_avx2:
  1738. .byte 0x0f,0x0b # ud2
  1739. ret
  1740. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  1741. ___
  1742. }}}
  # Buffered output is flushed when STDOUT is closed; abort with a diagnostic
  # if that fails so a truncated or missing assembly file is not mistaken
  # for a successful run.
  1743. close STDOUT or die "error closing STDOUT: $!";