You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
37047057067077087097107117127137147157167177187197207217227237247257267277287297307317327337347357367377387397407417427437447457467477487497507517527537547557567577587597607617627637647657667677687697707717727737747757767777787797807817827837847857867877887897907917927937947957967977987998008018028038048058068078088098108118128138148158168178188198208218228238248258268278288298308318328338348358368378388398408418428438448458468478488498508518528538548558568578588598608618628638648658668678688698708718728738748758768778788798808818828838848858868878888898908918928938948958968978988999009019029039049059069079089099109119129139149159169179189199209219229239249259269279289299309319329339349359369379389399409419429439449459469479489499509519529539549559569579589599609619629639649659669679689699709719729739749759769779789799809819829839849859869879889899909919929939949959969979989991000100110021003100410051006100710081009101010111012101310141015101610171018101910201021102210231024102510261027102810291030103110321033103410351036103710381039104010411042104310441045104610471048104910501051105210531054105510561057105810591060106110621063106410651066106710681069107010711072107310741075107610771078107910801081108210831084108510861087108810891090109110921093109410951096109710981099110011011102110311041105110611071108110911101111111211131114111511161117111811191120112111221123112411251126112711281129113011311132113311341135113611371138113911401141114211431144114511461147114811491150115111521153115411551156115711581159116011611162116311641165116611671168116911701171117211731174117511761177117811791180118111821183118411851186118711881189119011911192119311941195119611971198119912001201120212031204120512061207120812091210121112121213121412151216121712181219122012211222122312241225122612271228122912301231123212331234123512361237123812391240124112421243124412451246124712481249125012511252125312541255125612571258125912601261126212631264126512661267126812691270127112721273127412751276127
71278127912801281128212831284128512861287128812891290129112921293129412951296129712981299130013011302130313041305130613071308130913101311131213131314131513161317131813191320132113221323132413251326132713281329133013311332133313341335133613371338133913401341134213431344134513461347134813491350135113521353135413551356135713581359136013611362136313641365136613671368136913701371137213731374137513761377137813791380138113821383138413851386138713881389139013911392139313941395139613971398139914001401140214031404140514061407140814091410141114121413141414151416141714181419142014211422142314241425142614271428142914301431143214331434143514361437143814391440144114421443144414451446144714481449145014511452145314541455145614571458145914601461146214631464146514661467146814691470147114721473147414751476147714781479148014811482148314841485148614871488148914901491149214931494149514961497149814991500150115021503150415051506150715081509151015111512151315141515151615171518151915201521152215231524152515261527152815291530153115321533153415351536153715381539154015411542154315441545154615471548154915501551155215531554155515561557155815591560156115621563156415651566156715681569157015711572157315741575157615771578157915801581158215831584158515861587158815891590159115921593159415951596159715981599160016011602160316041605160616071608160916101611161216131614161516161617161816191620162116221623162416251626162716281629163016311632163316341635163616371638163916401641164216431644164516461647164816491650165116521653165416551656165716581659166016611662166316641665166616671668166916701671167216731674167516761677167816791680168116821683168416851686168716881689169016911692169316941695169616971698169917001701170217031704170517061707170817091710171117121713171417151716171717181719172017211722172317241725172617271728172917301731173217331734173517361737173817391740174117421743174417451746174717481749175017511752175317541755175617571758175917601761176217631764176517661767176817691770177117721773177417751776177
7177817791780178117821783178417851786178717881789179017911792179317941795179617971798179918001801180218031804180518061807180818091810181118121813181418151816181718181819182018211822182318241825182618271828182918301831183218331834183518361837183818391840184118421843184418451846184718481849185018511852185318541855185618571858185918601861186218631864186518661867186818691870187118721873187418751876187718781879188018811882188318841885188618871888188918901891189218931894189518961897189818991900190119021903190419051906190719081909191019111912191319141915191619171918191919201921192219231924192519261927192819291930193119321933193419351936193719381939194019411942194319441945
  1. #!/usr/bin/env perl
  2. ##############################################################################
  3. # #
  4. # Copyright (c) 2012, Intel Corporation #
  5. # #
  6. # All rights reserved. #
  7. # #
  8. # Redistribution and use in source and binary forms, with or without #
  9. # modification, are permitted provided that the following conditions are #
  10. # met: #
  11. # #
  12. # * Redistributions of source code must retain the above copyright #
  13. # notice, this list of conditions and the following disclaimer. #
  14. # #
  15. # * Redistributions in binary form must reproduce the above copyright #
  16. # notice, this list of conditions and the following disclaimer in the #
  17. # documentation and/or other materials provided with the #
  18. # distribution. #
  19. # #
  20. # * Neither the name of the Intel Corporation nor the names of its #
  21. # contributors may be used to endorse or promote products derived from #
  22. # this software without specific prior written permission. #
  23. # #
  24. # #
  25. # THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY #
  26. # EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
  27. # IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
  28. # PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
  29. # CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
  30. # EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
  31. # PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
  32. # PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
  33. # LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
  34. # NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
  35. # SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
  36. # #
  37. ##############################################################################
  38. # Developers and authors: #
  39. # Shay Gueron (1, 2), and Vlad Krasnov (1) #
  40. # (1) Intel Corporation, Israel Development Center, Haifa, Israel #
  41. # (2) University of Haifa, Israel #
  42. ##############################################################################
  43. # Reference: #
  44. # [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
  45. # Exponentiation, Using Advanced Vector Instructions Architectures", #
  46. # F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
  47. # pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
  48. # [2] S. Gueron: "Efficient Software Implementations of Modular #
  49. # Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
  50. # [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
  51. # Proceedings of 9th International Conference on Information Technology: #
  52. # New Generations (ITNG 2012), pp.821-823 (2012) #
  53. # [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
  54. # resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
  55. # on AVX2 capable x86_64 platforms", #
  56. # http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
  57. ##############################################################################
  58. #
  59. # +13% improvement over original submission by <appro@openssl.org>
  60. #
  61. # rsa2048 sign/sec OpenSSL 1.0.1 scalar(*) this
  62. # 2.3GHz Haswell 621 765/+23% 1113/+79%
  63. # 2.3GHz Broadwell(**) 688 1200(***)/+74% 1120/+63%
  64. #
  65. # (*) if system doesn't support AVX2, for reference purposes;
  66. # (**) scaled to 2.3GHz to simplify comparison;
  67. # (***) scalar AD*X code is faster than AVX2 and is preferred code
  68. # path for Broadwell;
  69. $flavour = shift; # 1st argument: perlasm flavour (or the output file, see below)
  70. $output = shift; # 2nd argument: output file name
  71. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; } # a lone argument containing '.' is the output file
  72. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/); # Win64 targets get the extra %xmm6-%xmm15 save code
  73. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1; # directory part of this script's own path
  74. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or # translator next to this script...
  75. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or # ...or in ../../perlasm/
  76. die "can't locate x86_64-xlate.pl";
  77. # In upstream, this is controlled by shelling out to the compiler to check
  78. # versions, but BoringSSL is intended to be used with pre-generated perlasm
  79. # output, so this isn't useful anyway.
  80. #
  81. # TODO(davidben): Enable these after testing. $avx goes up to 2 and $addx to 1.
  82. $avx = 0; # the AVX2 code below is only generated when $avx>1
  83. $addx = 0;
  84. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!"; # abort instead of silently emitting nothing
  85. *STDOUT = *OUT; # everything printed to STDOUT from here on is piped through the translator
  86. if ($avx>1) {{{
  87. { # void AMS_WW(
  88. my $rp="%rdi"; # BN_ULONG *rp,
  89. my $ap="%rsi"; # const BN_ULONG *ap,
  90. my $np="%rdx"; # const BN_ULONG *np,
  91. my $n0="%ecx"; # const BN_ULONG n0,
  92. my $rep="%r8d"; # int repeat);
  93. # The registers that hold the accumulated redundant result
  94. # The AMM works on 1024 bit operands, and redundant word size is 29
  95. # Therefore: ceil(1024/29)/4 = 9
  96. my $ACC0="%ymm0";
  97. my $ACC1="%ymm1";
  98. my $ACC2="%ymm2";
  99. my $ACC3="%ymm3";
  100. my $ACC4="%ymm4";
  101. my $ACC5="%ymm5";
  102. my $ACC6="%ymm6";
  103. my $ACC7="%ymm7";
  104. my $ACC8="%ymm8";
  105. my $ACC9="%ymm9";
  106. # Registers that hold the broadcasted words of bp, currently used
  107. my $B1="%ymm10";
  108. my $B2="%ymm11";
  109. # Registers that hold the broadcasted words of Y, currently used
  110. my $Y1="%ymm12";
  111. my $Y2="%ymm13";
  112. # Helper registers
  113. my $TEMP1="%ymm14";
  114. my $AND_MASK="%ymm15";
  115. # alu registers that hold the first words of the ACC
  116. my $r0="%r9";
  117. my $r1="%r10";
  118. my $r2="%r11";
  119. my $r3="%r12";
  120. my $i="%r14d"; # loop counter
  121. my $tmp = "%r15";
  122. my $FrameSize=32*18+32*8; # place for A^2 and 2*A
  123. my $aap=$r0;
  124. my $tp0="%rbx";
  125. my $tp1=$r3;
  126. my $tpa=$tmp;
  127. $np="%r13"; # reassigned argument
  128. $code.=<<___;
  129. .text
  130. .globl rsaz_1024_sqr_avx2
  131. .type rsaz_1024_sqr_avx2,\@function,5
  132. .align 64
  133. rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
  134. lea (%rsp), %rax
  135. push %rbx
  136. push %rbp
  137. push %r12
  138. push %r13
  139. push %r14
  140. push %r15
  141. vzeroupper
  142. ___
  143. $code.=<<___ if ($win64);
  144. lea -0xa8(%rsp),%rsp
  145. vmovaps %xmm6,-0xd8(%rax)
  146. vmovaps %xmm7,-0xc8(%rax)
  147. vmovaps %xmm8,-0xb8(%rax)
  148. vmovaps %xmm9,-0xa8(%rax)
  149. vmovaps %xmm10,-0x98(%rax)
  150. vmovaps %xmm11,-0x88(%rax)
  151. vmovaps %xmm12,-0x78(%rax)
  152. vmovaps %xmm13,-0x68(%rax)
  153. vmovaps %xmm14,-0x58(%rax)
  154. vmovaps %xmm15,-0x48(%rax)
  155. .Lsqr_1024_body:
  156. ___
  157. $code.=<<___;
  158. mov %rax,%rbp
  159. mov %rdx, $np # reassigned argument
  160. sub \$$FrameSize, %rsp
  161. mov $np, $tmp
  162. sub \$-128, $rp # size optimization
  163. sub \$-128, $ap
  164. sub \$-128, $np
  165. and \$4095, $tmp # see if $np crosses page
  166. add \$32*10, $tmp
  167. shr \$12, $tmp
  168. vpxor $ACC9,$ACC9,$ACC9
  169. jz .Lsqr_1024_no_n_copy
  170. # unaligned 256-bit load that crosses page boundary can
  171. # cause >2x performance degradation here, so if $np does
  172. # cross page boundary, copy it to stack and make sure stack
  173. # frame doesn't...
  174. sub \$32*10,%rsp
  175. vmovdqu 32*0-128($np), $ACC0
  176. and \$-2048, %rsp
  177. vmovdqu 32*1-128($np), $ACC1
  178. vmovdqu 32*2-128($np), $ACC2
  179. vmovdqu 32*3-128($np), $ACC3
  180. vmovdqu 32*4-128($np), $ACC4
  181. vmovdqu 32*5-128($np), $ACC5
  182. vmovdqu 32*6-128($np), $ACC6
  183. vmovdqu 32*7-128($np), $ACC7
  184. vmovdqu 32*8-128($np), $ACC8
  185. lea $FrameSize+128(%rsp),$np
  186. vmovdqu $ACC0, 32*0-128($np)
  187. vmovdqu $ACC1, 32*1-128($np)
  188. vmovdqu $ACC2, 32*2-128($np)
  189. vmovdqu $ACC3, 32*3-128($np)
  190. vmovdqu $ACC4, 32*4-128($np)
  191. vmovdqu $ACC5, 32*5-128($np)
  192. vmovdqu $ACC6, 32*6-128($np)
  193. vmovdqu $ACC7, 32*7-128($np)
  194. vmovdqu $ACC8, 32*8-128($np)
  195. vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
  196. .Lsqr_1024_no_n_copy:
  197. and \$-1024, %rsp
  198. vmovdqu 32*1-128($ap), $ACC1
  199. vmovdqu 32*2-128($ap), $ACC2
  200. vmovdqu 32*3-128($ap), $ACC3
  201. vmovdqu 32*4-128($ap), $ACC4
  202. vmovdqu 32*5-128($ap), $ACC5
  203. vmovdqu 32*6-128($ap), $ACC6
  204. vmovdqu 32*7-128($ap), $ACC7
  205. vmovdqu 32*8-128($ap), $ACC8
  206. lea 192(%rsp), $tp0 # 64+128=192
  207. vpbroadcastq .Land_mask(%rip), $AND_MASK
  208. jmp .LOOP_GRANDE_SQR_1024
  209. .align 32
  210. .LOOP_GRANDE_SQR_1024:
  211. lea 32*18+128(%rsp), $aap # size optimization
  212. lea 448(%rsp), $tp1 # 64+128+256=448
  213. # the squaring is performed as described in Variant B of
  214. # "Speeding up Big-Number Squaring", so start by calculating
  215. # the A*2=A+A vector
  216. vpaddq $ACC1, $ACC1, $ACC1
  217. vpbroadcastq 32*0-128($ap), $B1
  218. vpaddq $ACC2, $ACC2, $ACC2
  219. vmovdqa $ACC1, 32*0-128($aap)
  220. vpaddq $ACC3, $ACC3, $ACC3
  221. vmovdqa $ACC2, 32*1-128($aap)
  222. vpaddq $ACC4, $ACC4, $ACC4
  223. vmovdqa $ACC3, 32*2-128($aap)
  224. vpaddq $ACC5, $ACC5, $ACC5
  225. vmovdqa $ACC4, 32*3-128($aap)
  226. vpaddq $ACC6, $ACC6, $ACC6
  227. vmovdqa $ACC5, 32*4-128($aap)
  228. vpaddq $ACC7, $ACC7, $ACC7
  229. vmovdqa $ACC6, 32*5-128($aap)
  230. vpaddq $ACC8, $ACC8, $ACC8
  231. vmovdqa $ACC7, 32*6-128($aap)
  232. vpxor $ACC9, $ACC9, $ACC9
  233. vmovdqa $ACC8, 32*7-128($aap)
  234. vpmuludq 32*0-128($ap), $B1, $ACC0
  235. vpbroadcastq 32*1-128($ap), $B2
  236. vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
  237. vpmuludq $B1, $ACC1, $ACC1
  238. vmovdqu $ACC9, 32*10-448($tp1)
  239. vpmuludq $B1, $ACC2, $ACC2
  240. vmovdqu $ACC9, 32*11-448($tp1)
  241. vpmuludq $B1, $ACC3, $ACC3
  242. vmovdqu $ACC9, 32*12-448($tp1)
  243. vpmuludq $B1, $ACC4, $ACC4
  244. vmovdqu $ACC9, 32*13-448($tp1)
  245. vpmuludq $B1, $ACC5, $ACC5
  246. vmovdqu $ACC9, 32*14-448($tp1)
  247. vpmuludq $B1, $ACC6, $ACC6
  248. vmovdqu $ACC9, 32*15-448($tp1)
  249. vpmuludq $B1, $ACC7, $ACC7
  250. vmovdqu $ACC9, 32*16-448($tp1)
  251. vpmuludq $B1, $ACC8, $ACC8
  252. vpbroadcastq 32*2-128($ap), $B1
  253. vmovdqu $ACC9, 32*17-448($tp1)
  254. mov $ap, $tpa
  255. mov \$4, $i
  256. jmp .Lsqr_entry_1024
  257. ___
  258. $TEMP0=$Y1;
  259. $TEMP2=$Y2;
  260. $code.=<<___;
  261. .align 32
  262. .LOOP_SQR_1024:
  263. vpbroadcastq 32*1-128($tpa), $B2
  264. vpmuludq 32*0-128($ap), $B1, $ACC0
  265. vpaddq 32*0-192($tp0), $ACC0, $ACC0
  266. vpmuludq 32*0-128($aap), $B1, $ACC1
  267. vpaddq 32*1-192($tp0), $ACC1, $ACC1
  268. vpmuludq 32*1-128($aap), $B1, $ACC2
  269. vpaddq 32*2-192($tp0), $ACC2, $ACC2
  270. vpmuludq 32*2-128($aap), $B1, $ACC3
  271. vpaddq 32*3-192($tp0), $ACC3, $ACC3
  272. vpmuludq 32*3-128($aap), $B1, $ACC4
  273. vpaddq 32*4-192($tp0), $ACC4, $ACC4
  274. vpmuludq 32*4-128($aap), $B1, $ACC5
  275. vpaddq 32*5-192($tp0), $ACC5, $ACC5
  276. vpmuludq 32*5-128($aap), $B1, $ACC6
  277. vpaddq 32*6-192($tp0), $ACC6, $ACC6
  278. vpmuludq 32*6-128($aap), $B1, $ACC7
  279. vpaddq 32*7-192($tp0), $ACC7, $ACC7
  280. vpmuludq 32*7-128($aap), $B1, $ACC8
  281. vpbroadcastq 32*2-128($tpa), $B1
  282. vpaddq 32*8-192($tp0), $ACC8, $ACC8
  283. .Lsqr_entry_1024:
  284. vmovdqu $ACC0, 32*0-192($tp0)
  285. vmovdqu $ACC1, 32*1-192($tp0)
  286. vpmuludq 32*1-128($ap), $B2, $TEMP0
  287. vpaddq $TEMP0, $ACC2, $ACC2
  288. vpmuludq 32*1-128($aap), $B2, $TEMP1
  289. vpaddq $TEMP1, $ACC3, $ACC3
  290. vpmuludq 32*2-128($aap), $B2, $TEMP2
  291. vpaddq $TEMP2, $ACC4, $ACC4
  292. vpmuludq 32*3-128($aap), $B2, $TEMP0
  293. vpaddq $TEMP0, $ACC5, $ACC5
  294. vpmuludq 32*4-128($aap), $B2, $TEMP1
  295. vpaddq $TEMP1, $ACC6, $ACC6
  296. vpmuludq 32*5-128($aap), $B2, $TEMP2
  297. vpaddq $TEMP2, $ACC7, $ACC7
  298. vpmuludq 32*6-128($aap), $B2, $TEMP0
  299. vpaddq $TEMP0, $ACC8, $ACC8
  300. vpmuludq 32*7-128($aap), $B2, $ACC0
  301. vpbroadcastq 32*3-128($tpa), $B2
  302. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  303. vmovdqu $ACC2, 32*2-192($tp0)
  304. vmovdqu $ACC3, 32*3-192($tp0)
  305. vpmuludq 32*2-128($ap), $B1, $TEMP2
  306. vpaddq $TEMP2, $ACC4, $ACC4
  307. vpmuludq 32*2-128($aap), $B1, $TEMP0
  308. vpaddq $TEMP0, $ACC5, $ACC5
  309. vpmuludq 32*3-128($aap), $B1, $TEMP1
  310. vpaddq $TEMP1, $ACC6, $ACC6
  311. vpmuludq 32*4-128($aap), $B1, $TEMP2
  312. vpaddq $TEMP2, $ACC7, $ACC7
  313. vpmuludq 32*5-128($aap), $B1, $TEMP0
  314. vpaddq $TEMP0, $ACC8, $ACC8
  315. vpmuludq 32*6-128($aap), $B1, $TEMP1
  316. vpaddq $TEMP1, $ACC0, $ACC0
  317. vpmuludq 32*7-128($aap), $B1, $ACC1
  318. vpbroadcastq 32*4-128($tpa), $B1
  319. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  320. vmovdqu $ACC4, 32*4-192($tp0)
  321. vmovdqu $ACC5, 32*5-192($tp0)
  322. vpmuludq 32*3-128($ap), $B2, $TEMP0
  323. vpaddq $TEMP0, $ACC6, $ACC6
  324. vpmuludq 32*3-128($aap), $B2, $TEMP1
  325. vpaddq $TEMP1, $ACC7, $ACC7
  326. vpmuludq 32*4-128($aap), $B2, $TEMP2
  327. vpaddq $TEMP2, $ACC8, $ACC8
  328. vpmuludq 32*5-128($aap), $B2, $TEMP0
  329. vpaddq $TEMP0, $ACC0, $ACC0
  330. vpmuludq 32*6-128($aap), $B2, $TEMP1
  331. vpaddq $TEMP1, $ACC1, $ACC1
  332. vpmuludq 32*7-128($aap), $B2, $ACC2
  333. vpbroadcastq 32*5-128($tpa), $B2
  334. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  335. vmovdqu $ACC6, 32*6-192($tp0)
  336. vmovdqu $ACC7, 32*7-192($tp0)
  337. vpmuludq 32*4-128($ap), $B1, $TEMP0
  338. vpaddq $TEMP0, $ACC8, $ACC8
  339. vpmuludq 32*4-128($aap), $B1, $TEMP1
  340. vpaddq $TEMP1, $ACC0, $ACC0
  341. vpmuludq 32*5-128($aap), $B1, $TEMP2
  342. vpaddq $TEMP2, $ACC1, $ACC1
  343. vpmuludq 32*6-128($aap), $B1, $TEMP0
  344. vpaddq $TEMP0, $ACC2, $ACC2
  345. vpmuludq 32*7-128($aap), $B1, $ACC3
  346. vpbroadcastq 32*6-128($tpa), $B1
  347. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  348. vmovdqu $ACC8, 32*8-192($tp0)
  349. vmovdqu $ACC0, 32*9-192($tp0)
  350. lea 8($tp0), $tp0
  351. vpmuludq 32*5-128($ap), $B2, $TEMP2
  352. vpaddq $TEMP2, $ACC1, $ACC1
  353. vpmuludq 32*5-128($aap), $B2, $TEMP0
  354. vpaddq $TEMP0, $ACC2, $ACC2
  355. vpmuludq 32*6-128($aap), $B2, $TEMP1
  356. vpaddq $TEMP1, $ACC3, $ACC3
  357. vpmuludq 32*7-128($aap), $B2, $ACC4
  358. vpbroadcastq 32*7-128($tpa), $B2
  359. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  360. vmovdqu $ACC1, 32*10-448($tp1)
  361. vmovdqu $ACC2, 32*11-448($tp1)
  362. vpmuludq 32*6-128($ap), $B1, $TEMP0
  363. vpaddq $TEMP0, $ACC3, $ACC3
  364. vpmuludq 32*6-128($aap), $B1, $TEMP1
  365. vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
  366. vpaddq $TEMP1, $ACC4, $ACC4
  367. vpmuludq 32*7-128($aap), $B1, $ACC5
  368. vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
  369. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  370. vmovdqu $ACC3, 32*12-448($tp1)
  371. vmovdqu $ACC4, 32*13-448($tp1)
  372. lea 8($tpa), $tpa
  373. vpmuludq 32*7-128($ap), $B2, $TEMP0
  374. vpaddq $TEMP0, $ACC5, $ACC5
  375. vpmuludq 32*7-128($aap), $B2, $ACC6
  376. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  377. vpmuludq 32*8-128($ap), $ACC0, $ACC7
  378. vmovdqu $ACC5, 32*14-448($tp1)
  379. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  380. vmovdqu $ACC6, 32*15-448($tp1)
  381. vmovdqu $ACC7, 32*16-448($tp1)
  382. lea 8($tp1), $tp1
  383. dec $i
  384. jnz .LOOP_SQR_1024
  385. ___
  386. $ZERO = $ACC9;
  387. $TEMP0 = $B1;
  388. $TEMP2 = $B2;
  389. $TEMP3 = $Y1;
  390. $TEMP4 = $Y2;
  391. $code.=<<___;
  392. # we need to fix indices 32-39 to avoid overflow
  393. vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
  394. vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
  395. vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
  396. lea 192(%rsp), $tp0 # 64+128=192
  397. vpsrlq \$29, $ACC8, $TEMP1
  398. vpand $AND_MASK, $ACC8, $ACC8
  399. vpsrlq \$29, $ACC1, $TEMP2
  400. vpand $AND_MASK, $ACC1, $ACC1
  401. vpermq \$0x93, $TEMP1, $TEMP1
  402. vpxor $ZERO, $ZERO, $ZERO
  403. vpermq \$0x93, $TEMP2, $TEMP2
  404. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  405. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  406. vpaddq $TEMP0, $ACC8, $ACC8
  407. vpblendd \$3, $TEMP2, $ZERO, $TEMP2
  408. vpaddq $TEMP1, $ACC1, $ACC1
  409. vpaddq $TEMP2, $ACC2, $ACC2
  410. vmovdqu $ACC1, 32*9-192($tp0)
  411. vmovdqu $ACC2, 32*10-192($tp0)
  412. mov (%rsp), %rax
  413. mov 8(%rsp), $r1
  414. mov 16(%rsp), $r2
  415. mov 24(%rsp), $r3
  416. vmovdqu 32*1(%rsp), $ACC1
  417. vmovdqu 32*2-192($tp0), $ACC2
  418. vmovdqu 32*3-192($tp0), $ACC3
  419. vmovdqu 32*4-192($tp0), $ACC4
  420. vmovdqu 32*5-192($tp0), $ACC5
  421. vmovdqu 32*6-192($tp0), $ACC6
  422. vmovdqu 32*7-192($tp0), $ACC7
  423. mov %rax, $r0
  424. imull $n0, %eax
  425. and \$0x1fffffff, %eax
  426. vmovd %eax, $Y1
  427. mov %rax, %rdx
  428. imulq -128($np), %rax
  429. vpbroadcastq $Y1, $Y1
  430. add %rax, $r0
  431. mov %rdx, %rax
  432. imulq 8-128($np), %rax
  433. shr \$29, $r0
  434. add %rax, $r1
  435. mov %rdx, %rax
  436. imulq 16-128($np), %rax
  437. add $r0, $r1
  438. add %rax, $r2
  439. imulq 24-128($np), %rdx
  440. add %rdx, $r3
  441. mov $r1, %rax
  442. imull $n0, %eax
  443. and \$0x1fffffff, %eax
  444. mov \$9, $i
  445. jmp .LOOP_REDUCE_1024
  446. .align 32
  447. .LOOP_REDUCE_1024:
  448. vmovd %eax, $Y2
  449. vpbroadcastq $Y2, $Y2
  450. vpmuludq 32*1-128($np), $Y1, $TEMP0
  451. mov %rax, %rdx
  452. imulq -128($np), %rax
  453. vpaddq $TEMP0, $ACC1, $ACC1
  454. add %rax, $r1
  455. vpmuludq 32*2-128($np), $Y1, $TEMP1
  456. mov %rdx, %rax
  457. imulq 8-128($np), %rax
  458. vpaddq $TEMP1, $ACC2, $ACC2
  459. vpmuludq 32*3-128($np), $Y1, $TEMP2
  460. .byte 0x67
  461. add %rax, $r2
  462. .byte 0x67
  463. mov %rdx, %rax
  464. imulq 16-128($np), %rax
  465. shr \$29, $r1
  466. vpaddq $TEMP2, $ACC3, $ACC3
  467. vpmuludq 32*4-128($np), $Y1, $TEMP0
  468. add %rax, $r3
  469. add $r1, $r2
  470. vpaddq $TEMP0, $ACC4, $ACC4
  471. vpmuludq 32*5-128($np), $Y1, $TEMP1
  472. mov $r2, %rax
  473. imull $n0, %eax
  474. vpaddq $TEMP1, $ACC5, $ACC5
  475. vpmuludq 32*6-128($np), $Y1, $TEMP2
  476. and \$0x1fffffff, %eax
  477. vpaddq $TEMP2, $ACC6, $ACC6
  478. vpmuludq 32*7-128($np), $Y1, $TEMP0
  479. vpaddq $TEMP0, $ACC7, $ACC7
  480. vpmuludq 32*8-128($np), $Y1, $TEMP1
  481. vmovd %eax, $Y1
  482. #vmovdqu 32*1-8-128($np), $TEMP2 # moved below
  483. vpaddq $TEMP1, $ACC8, $ACC8
  484. #vmovdqu 32*2-8-128($np), $TEMP0 # moved below
  485. vpbroadcastq $Y1, $Y1
  486. vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
  487. vmovdqu 32*3-8-128($np), $TEMP1
  488. mov %rax, %rdx
  489. imulq -128($np), %rax
  490. vpaddq $TEMP2, $ACC1, $ACC1
  491. vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
  492. vmovdqu 32*4-8-128($np), $TEMP2
  493. add %rax, $r2
  494. mov %rdx, %rax
  495. imulq 8-128($np), %rax
  496. vpaddq $TEMP0, $ACC2, $ACC2
  497. add $r3, %rax
  498. shr \$29, $r2
  499. vpmuludq $Y2, $TEMP1, $TEMP1
  500. vmovdqu 32*5-8-128($np), $TEMP0
  501. add $r2, %rax
  502. vpaddq $TEMP1, $ACC3, $ACC3
  503. vpmuludq $Y2, $TEMP2, $TEMP2
  504. vmovdqu 32*6-8-128($np), $TEMP1
  505. .byte 0x67
  506. mov %rax, $r3
  507. imull $n0, %eax
  508. vpaddq $TEMP2, $ACC4, $ACC4
  509. vpmuludq $Y2, $TEMP0, $TEMP0
  510. .byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
  511. and \$0x1fffffff, %eax
  512. vpaddq $TEMP0, $ACC5, $ACC5
  513. vpmuludq $Y2, $TEMP1, $TEMP1
  514. vmovdqu 32*8-8-128($np), $TEMP0
  515. vpaddq $TEMP1, $ACC6, $ACC6
  516. vpmuludq $Y2, $TEMP2, $TEMP2
  517. vmovdqu 32*9-8-128($np), $ACC9
  518. vmovd %eax, $ACC0 # borrow ACC0 for Y2
  519. imulq -128($np), %rax
  520. vpaddq $TEMP2, $ACC7, $ACC7
  521. vpmuludq $Y2, $TEMP0, $TEMP0
  522. vmovdqu 32*1-16-128($np), $TEMP1
  523. vpbroadcastq $ACC0, $ACC0
  524. vpaddq $TEMP0, $ACC8, $ACC8
  525. vpmuludq $Y2, $ACC9, $ACC9
  526. vmovdqu 32*2-16-128($np), $TEMP2
  527. add %rax, $r3
  528. ___
  529. ($ACC0,$Y2)=($Y2,$ACC0);
  530. $code.=<<___;
  531. vmovdqu 32*1-24-128($np), $ACC0
  532. vpmuludq $Y1, $TEMP1, $TEMP1
  533. vmovdqu 32*3-16-128($np), $TEMP0
  534. vpaddq $TEMP1, $ACC1, $ACC1
  535. vpmuludq $Y2, $ACC0, $ACC0
  536. vpmuludq $Y1, $TEMP2, $TEMP2
  537. .byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
  538. vpaddq $ACC1, $ACC0, $ACC0
  539. vpaddq $TEMP2, $ACC2, $ACC2
  540. vpmuludq $Y1, $TEMP0, $TEMP0
  541. vmovdqu 32*5-16-128($np), $TEMP2
  542. .byte 0x67
  543. vmovq $ACC0, %rax
  544. vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
  545. vpaddq $TEMP0, $ACC3, $ACC3
  546. vpmuludq $Y1, $TEMP1, $TEMP1
  547. vmovdqu 32*6-16-128($np), $TEMP0
  548. vpaddq $TEMP1, $ACC4, $ACC4
  549. vpmuludq $Y1, $TEMP2, $TEMP2
  550. vmovdqu 32*7-16-128($np), $TEMP1
  551. vpaddq $TEMP2, $ACC5, $ACC5
  552. vpmuludq $Y1, $TEMP0, $TEMP0
  553. vmovdqu 32*8-16-128($np), $TEMP2
  554. vpaddq $TEMP0, $ACC6, $ACC6
  555. vpmuludq $Y1, $TEMP1, $TEMP1
  556. shr \$29, $r3
  557. vmovdqu 32*9-16-128($np), $TEMP0
  558. add $r3, %rax
  559. vpaddq $TEMP1, $ACC7, $ACC7
  560. vpmuludq $Y1, $TEMP2, $TEMP2
  561. #vmovdqu 32*2-24-128($np), $TEMP1 # moved below
  562. mov %rax, $r0
  563. imull $n0, %eax
  564. vpaddq $TEMP2, $ACC8, $ACC8
  565. vpmuludq $Y1, $TEMP0, $TEMP0
  566. and \$0x1fffffff, %eax
  567. vmovd %eax, $Y1
  568. vmovdqu 32*3-24-128($np), $TEMP2
  569. .byte 0x67
  570. vpaddq $TEMP0, $ACC9, $ACC9
  571. vpbroadcastq $Y1, $Y1
  572. vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
  573. vmovdqu 32*4-24-128($np), $TEMP0
  574. mov %rax, %rdx
  575. imulq -128($np), %rax
  576. mov 8(%rsp), $r1
  577. vpaddq $TEMP1, $ACC2, $ACC1
  578. vpmuludq $Y2, $TEMP2, $TEMP2
  579. vmovdqu 32*5-24-128($np), $TEMP1
  580. add %rax, $r0
  581. mov %rdx, %rax
  582. imulq 8-128($np), %rax
  583. .byte 0x67
  584. shr \$29, $r0
  585. mov 16(%rsp), $r2
  586. vpaddq $TEMP2, $ACC3, $ACC2
  587. vpmuludq $Y2, $TEMP0, $TEMP0
  588. vmovdqu 32*6-24-128($np), $TEMP2
  589. add %rax, $r1
  590. mov %rdx, %rax
  591. imulq 16-128($np), %rax
  592. vpaddq $TEMP0, $ACC4, $ACC3
  593. vpmuludq $Y2, $TEMP1, $TEMP1
  594. vmovdqu 32*7-24-128($np), $TEMP0
  595. imulq 24-128($np), %rdx # future $r3
  596. add %rax, $r2
  597. lea ($r0,$r1), %rax
  598. vpaddq $TEMP1, $ACC5, $ACC4
  599. vpmuludq $Y2, $TEMP2, $TEMP2
  600. vmovdqu 32*8-24-128($np), $TEMP1
  601. mov %rax, $r1
  602. imull $n0, %eax
  603. vpmuludq $Y2, $TEMP0, $TEMP0
  604. vpaddq $TEMP2, $ACC6, $ACC5
  605. vmovdqu 32*9-24-128($np), $TEMP2
  606. and \$0x1fffffff, %eax
  607. vpaddq $TEMP0, $ACC7, $ACC6
  608. vpmuludq $Y2, $TEMP1, $TEMP1
  609. add 24(%rsp), %rdx
  610. vpaddq $TEMP1, $ACC8, $ACC7
  611. vpmuludq $Y2, $TEMP2, $TEMP2
  612. vpaddq $TEMP2, $ACC9, $ACC8
  613. vmovq $r3, $ACC9
  614. mov %rdx, $r3
  615. dec $i
  616. jnz .LOOP_REDUCE_1024
  617. ___
  618. ($ACC0,$Y2)=($Y2,$ACC0);
  619. $code.=<<___;
  620. lea 448(%rsp), $tp1 # size optimization
  621. vpaddq $ACC9, $Y2, $ACC0
  622. vpxor $ZERO, $ZERO, $ZERO
  623. vpaddq 32*9-192($tp0), $ACC0, $ACC0
  624. vpaddq 32*10-448($tp1), $ACC1, $ACC1
  625. vpaddq 32*11-448($tp1), $ACC2, $ACC2
  626. vpaddq 32*12-448($tp1), $ACC3, $ACC3
  627. vpaddq 32*13-448($tp1), $ACC4, $ACC4
  628. vpaddq 32*14-448($tp1), $ACC5, $ACC5
  629. vpaddq 32*15-448($tp1), $ACC6, $ACC6
  630. vpaddq 32*16-448($tp1), $ACC7, $ACC7
  631. vpaddq 32*17-448($tp1), $ACC8, $ACC8
  632. vpsrlq \$29, $ACC0, $TEMP1
  633. vpand $AND_MASK, $ACC0, $ACC0
  634. vpsrlq \$29, $ACC1, $TEMP2
  635. vpand $AND_MASK, $ACC1, $ACC1
  636. vpsrlq \$29, $ACC2, $TEMP3
  637. vpermq \$0x93, $TEMP1, $TEMP1
  638. vpand $AND_MASK, $ACC2, $ACC2
  639. vpsrlq \$29, $ACC3, $TEMP4
  640. vpermq \$0x93, $TEMP2, $TEMP2
  641. vpand $AND_MASK, $ACC3, $ACC3
  642. vpermq \$0x93, $TEMP3, $TEMP3
  643. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  644. vpermq \$0x93, $TEMP4, $TEMP4
  645. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  646. vpaddq $TEMP0, $ACC0, $ACC0
  647. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  648. vpaddq $TEMP1, $ACC1, $ACC1
  649. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  650. vpaddq $TEMP2, $ACC2, $ACC2
  651. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  652. vpaddq $TEMP3, $ACC3, $ACC3
  653. vpaddq $TEMP4, $ACC4, $ACC4
  654. vpsrlq \$29, $ACC0, $TEMP1
  655. vpand $AND_MASK, $ACC0, $ACC0
  656. vpsrlq \$29, $ACC1, $TEMP2
  657. vpand $AND_MASK, $ACC1, $ACC1
  658. vpsrlq \$29, $ACC2, $TEMP3
  659. vpermq \$0x93, $TEMP1, $TEMP1
  660. vpand $AND_MASK, $ACC2, $ACC2
  661. vpsrlq \$29, $ACC3, $TEMP4
  662. vpermq \$0x93, $TEMP2, $TEMP2
  663. vpand $AND_MASK, $ACC3, $ACC3
  664. vpermq \$0x93, $TEMP3, $TEMP3
  665. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  666. vpermq \$0x93, $TEMP4, $TEMP4
  667. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  668. vpaddq $TEMP0, $ACC0, $ACC0
  669. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  670. vpaddq $TEMP1, $ACC1, $ACC1
  671. vmovdqu $ACC0, 32*0-128($rp)
  672. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  673. vpaddq $TEMP2, $ACC2, $ACC2
  674. vmovdqu $ACC1, 32*1-128($rp)
  675. vpblendd \$3, $TEMP4, $ZERO, $TEMP4
  676. vpaddq $TEMP3, $ACC3, $ACC3
  677. vmovdqu $ACC2, 32*2-128($rp)
  678. vpaddq $TEMP4, $ACC4, $ACC4
  679. vmovdqu $ACC3, 32*3-128($rp)
  680. ___
  681. $TEMP5=$ACC0;
  682. $code.=<<___;
  683. vpsrlq \$29, $ACC4, $TEMP1
  684. vpand $AND_MASK, $ACC4, $ACC4
  685. vpsrlq \$29, $ACC5, $TEMP2
  686. vpand $AND_MASK, $ACC5, $ACC5
  687. vpsrlq \$29, $ACC6, $TEMP3
  688. vpermq \$0x93, $TEMP1, $TEMP1
  689. vpand $AND_MASK, $ACC6, $ACC6
  690. vpsrlq \$29, $ACC7, $TEMP4
  691. vpermq \$0x93, $TEMP2, $TEMP2
  692. vpand $AND_MASK, $ACC7, $ACC7
  693. vpsrlq \$29, $ACC8, $TEMP5
  694. vpermq \$0x93, $TEMP3, $TEMP3
  695. vpand $AND_MASK, $ACC8, $ACC8
  696. vpermq \$0x93, $TEMP4, $TEMP4
  697. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  698. vpermq \$0x93, $TEMP5, $TEMP5
  699. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  700. vpaddq $TEMP0, $ACC4, $ACC4
  701. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  702. vpaddq $TEMP1, $ACC5, $ACC5
  703. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  704. vpaddq $TEMP2, $ACC6, $ACC6
  705. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  706. vpaddq $TEMP3, $ACC7, $ACC7
  707. vpaddq $TEMP4, $ACC8, $ACC8
  708. vpsrlq \$29, $ACC4, $TEMP1
  709. vpand $AND_MASK, $ACC4, $ACC4
  710. vpsrlq \$29, $ACC5, $TEMP2
  711. vpand $AND_MASK, $ACC5, $ACC5
  712. vpsrlq \$29, $ACC6, $TEMP3
  713. vpermq \$0x93, $TEMP1, $TEMP1
  714. vpand $AND_MASK, $ACC6, $ACC6
  715. vpsrlq \$29, $ACC7, $TEMP4
  716. vpermq \$0x93, $TEMP2, $TEMP2
  717. vpand $AND_MASK, $ACC7, $ACC7
  718. vpsrlq \$29, $ACC8, $TEMP5
  719. vpermq \$0x93, $TEMP3, $TEMP3
  720. vpand $AND_MASK, $ACC8, $ACC8
  721. vpermq \$0x93, $TEMP4, $TEMP4
  722. vpblendd \$3, $ZERO, $TEMP1, $TEMP0
  723. vpermq \$0x93, $TEMP5, $TEMP5
  724. vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
  725. vpaddq $TEMP0, $ACC4, $ACC4
  726. vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
  727. vpaddq $TEMP1, $ACC5, $ACC5
  728. vmovdqu $ACC4, 32*4-128($rp)
  729. vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
  730. vpaddq $TEMP2, $ACC6, $ACC6
  731. vmovdqu $ACC5, 32*5-128($rp)
  732. vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
  733. vpaddq $TEMP3, $ACC7, $ACC7
  734. vmovdqu $ACC6, 32*6-128($rp)
  735. vpaddq $TEMP4, $ACC8, $ACC8
  736. vmovdqu $ACC7, 32*7-128($rp)
  737. vmovdqu $ACC8, 32*8-128($rp)
  738. mov $rp, $ap
  739. dec $rep
  740. jne .LOOP_GRANDE_SQR_1024
  741. vzeroall
  742. mov %rbp, %rax
  743. ___
  744. $code.=<<___ if ($win64);
  745. movaps -0xd8(%rax),%xmm6
  746. movaps -0xc8(%rax),%xmm7
  747. movaps -0xb8(%rax),%xmm8
  748. movaps -0xa8(%rax),%xmm9
  749. movaps -0x98(%rax),%xmm10
  750. movaps -0x88(%rax),%xmm11
  751. movaps -0x78(%rax),%xmm12
  752. movaps -0x68(%rax),%xmm13
  753. movaps -0x58(%rax),%xmm14
  754. movaps -0x48(%rax),%xmm15
  755. ___
  756. $code.=<<___;
  757. mov -48(%rax),%r15
  758. mov -40(%rax),%r14
  759. mov -32(%rax),%r13
  760. mov -24(%rax),%r12
  761. mov -16(%rax),%rbp
  762. mov -8(%rax),%rbx
  763. lea (%rax),%rsp # restore %rsp
  764. .Lsqr_1024_epilogue:
  765. ret
  766. .size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
  767. ___
  768. }
  { # void AMM_WW(
my $rp="%rdi"; # BN_ULONG *rp,
my $ap="%rsi"; # const BN_ULONG *ap,
my $bp="%rdx"; # const BN_ULONG *bp,
my $np="%rcx"; # const BN_ULONG *np,
my $n0="%r8d"; # unsigned int n0);

# Emits rsaz_1024_mul_avx2(rp, ap, bp, np, n0).
#
# All vector operands use the redundant form described below: 29-bit digits,
# one digit per 64-bit lane, four lanes per %ymm register.  The main loop
# runs 9 times.  Each pass consumes four 64-bit words of bp (offsets 0, 8,
# 16 and 24, then bp += 32) and, for each word b, accumulates
#     ACC += ap*b + np*y,   y = (lowest accumulator word * n0) & 0x1fffffff
# The four lowest words are tracked in the scalar registers $r0..$r3 and
# handed over through the 32 bytes at (%rsp); the remaining words live in
# $ACC1..$ACC9.  After the loop the accumulators are carry-propagated twice
# and stored to rp.
#
# Prologue/epilogue: %rbx, %rbp, %r12-%r15 are pushed and reloaded relative
# to the entry %rsp (kept in %rbp); on Win64 %xmm6-%xmm15 are saved too.
# If ap's 320-byte window crosses a 4K page it is swapped with bp; if np's
# window crosses a page, np is copied to an aligned area on the stack.

# The registers that hold the accumulated redundant result
# The AMM works on 1024 bit operands, and redundant word size is 29
# Therefore: ceil(1024/29)/4 = 9
my $ACC0="%ymm0";
my $ACC1="%ymm1";
my $ACC2="%ymm2";
my $ACC3="%ymm3";
my $ACC4="%ymm4";
my $ACC5="%ymm5";
my $ACC6="%ymm6";
my $ACC7="%ymm7";
my $ACC8="%ymm8";
my $ACC9="%ymm9";

# Registers that hold the broadcasted words of multiplier, currently used
my $Bi="%ymm10";
my $Yi="%ymm11";

# Helper registers
my $TEMP0=$ACC0;
my $TEMP1="%ymm12";
my $TEMP2="%ymm13";
my $ZERO="%ymm14";
my $AND_MASK="%ymm15";

# alu registers that hold the first words of the ACC
my $r0="%r9";
my $r1="%r10";
my $r2="%r11";
my $r3="%r12";

my $i="%r14d";
my $tmp="%r15";

$bp="%r13"; # reassigned argument

$code.=<<___;
.globl rsaz_1024_mul_avx2
.type rsaz_1024_mul_avx2,\@function,5
.align 64
rsaz_1024_mul_avx2:
lea (%rsp), %rax
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
___
# Win64 only: %xmm6-%xmm15 are callee-saved there, spill them below the
# pushed general-purpose registers (offsets are relative to entry %rsp).
$code.=<<___ if ($win64);
vzeroupper
lea -0xa8(%rsp),%rsp
vmovaps %xmm6,-0xd8(%rax)
vmovaps %xmm7,-0xc8(%rax)
vmovaps %xmm8,-0xb8(%rax)
vmovaps %xmm9,-0xa8(%rax)
vmovaps %xmm10,-0x98(%rax)
vmovaps %xmm11,-0x88(%rax)
vmovaps %xmm12,-0x78(%rax)
vmovaps %xmm13,-0x68(%rax)
vmovaps %xmm14,-0x58(%rax)
vmovaps %xmm15,-0x48(%rax)
.Lmul_1024_body:
___
# Setup and main loop.
#
# The mask is loaded with vpbroadcastq so that every lane holds 0x1fffffff
# no matter what the 2nd..4th entries of .Land_mask contain.  Inside the
# loop \$ACC3 is corrected on every pass: the bits above bit 28 of each lane
# are shifted out into \$ACC9, rotated up by one lane, and then
#   - lanes 1..3 of the rotated carry are added back to \$ACC3,
#   - lane 0 (the carry out of \$ACC3's top digit) is added to \$ACC4.
# Earlier revisions left the top digit of \$ACC3 unmasked and discarded that
# carry, which allowed the top digit to overflow (CVE-2017-3738).
$code.=<<___;
mov %rax,%rbp
vzeroall
mov %rdx, $bp # reassigned argument
sub \$64,%rsp
# unaligned 256-bit load that crosses page boundary can
# cause severe performance degradation here, so if $ap does
# cross page boundary, swap it with $bp [meaning that caller
# is advised to lay down $ap and $bp next to each other, so
# that only one can cross page boundary].
.byte 0x67,0x67
mov $ap, $tmp
and \$4095, $tmp
add \$32*10, $tmp
shr \$12, $tmp
mov $ap, $tmp
cmovnz $bp, $ap
cmovnz $tmp, $bp
mov $np, $tmp
sub \$-128,$ap # size optimization
sub \$-128,$np
sub \$-128,$rp
and \$4095, $tmp # see if $np crosses page
add \$32*10, $tmp
.byte 0x67,0x67
shr \$12, $tmp
jz .Lmul_1024_no_n_copy
# unaligned 256-bit load that crosses page boundary can
# cause severe performance degradation here, so if $np does
# cross page boundary, copy it to stack and make sure stack
# frame doesn't...
sub \$32*10,%rsp
vmovdqu 32*0-128($np), $ACC0
and \$-512, %rsp
vmovdqu 32*1-128($np), $ACC1
vmovdqu 32*2-128($np), $ACC2
vmovdqu 32*3-128($np), $ACC3
vmovdqu 32*4-128($np), $ACC4
vmovdqu 32*5-128($np), $ACC5
vmovdqu 32*6-128($np), $ACC6
vmovdqu 32*7-128($np), $ACC7
vmovdqu 32*8-128($np), $ACC8
lea 64+128(%rsp),$np
vmovdqu $ACC0, 32*0-128($np)
vpxor $ACC0, $ACC0, $ACC0
vmovdqu $ACC1, 32*1-128($np)
vpxor $ACC1, $ACC1, $ACC1
vmovdqu $ACC2, 32*2-128($np)
vpxor $ACC2, $ACC2, $ACC2
vmovdqu $ACC3, 32*3-128($np)
vpxor $ACC3, $ACC3, $ACC3
vmovdqu $ACC4, 32*4-128($np)
vpxor $ACC4, $ACC4, $ACC4
vmovdqu $ACC5, 32*5-128($np)
vpxor $ACC5, $ACC5, $ACC5
vmovdqu $ACC6, 32*6-128($np)
vpxor $ACC6, $ACC6, $ACC6
vmovdqu $ACC7, 32*7-128($np)
vpxor $ACC7, $ACC7, $ACC7
vmovdqu $ACC8, 32*8-128($np)
vmovdqa $ACC0, $ACC8
vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
.Lmul_1024_no_n_copy:
and \$-64,%rsp
mov ($bp), %rbx
vpbroadcastq ($bp), $Bi
vmovdqu $ACC0, (%rsp) # clear top of stack
xor $r0, $r0
.byte 0x67
xor $r1, $r1
xor $r2, $r2
xor $r3, $r3
vpbroadcastq .Land_mask(%rip), $AND_MASK # 29-bit mask in all four lanes
mov \$9, $i
vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
jmp .Loop_mul_1024
.align 32
.Loop_mul_1024:
vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
mov %rbx, %rax
imulq -128($ap), %rax
add $r0, %rax
mov %rbx, $r1
imulq 8-128($ap), $r1
add 8(%rsp), $r1
mov %rax, $r0
imull $n0, %eax
and \$0x1fffffff, %eax
mov %rbx, $r2
imulq 16-128($ap), $r2
add 16(%rsp), $r2
mov %rbx, $r3
imulq 24-128($ap), $r3
add 24(%rsp), $r3
vpmuludq 32*1-128($ap),$Bi,$TEMP0
vmovd %eax, $Yi
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq 32*2-128($ap),$Bi,$TEMP1
vpbroadcastq $Yi, $Yi
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq 32*3-128($ap),$Bi,$TEMP2
vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq 32*4-128($ap),$Bi,$TEMP0
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq 32*5-128($ap),$Bi,$TEMP1
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq 32*6-128($ap),$Bi,$TEMP2
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq 32*7-128($ap),$Bi,$TEMP0
vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq 32*8-128($ap),$Bi,$TEMP1
vpbroadcastq 8($bp), $Bi
vpaddq $TEMP1,$ACC8,$ACC8
mov %rax,%rdx
imulq -128($np),%rax
add %rax,$r0
mov %rdx,%rax
imulq 8-128($np),%rax
add %rax,$r1
mov %rdx,%rax
imulq 16-128($np),%rax
add %rax,$r2
shr \$29, $r0
imulq 24-128($np),%rdx
add %rdx,$r3
add $r0, $r1
vpmuludq 32*1-128($np),$Yi,$TEMP2
vmovq $Bi, %rbx
vpaddq $TEMP2,$ACC1,$ACC1
vpmuludq 32*2-128($np),$Yi,$TEMP0
vpaddq $TEMP0,$ACC2,$ACC2
vpmuludq 32*3-128($np),$Yi,$TEMP1
vpaddq $TEMP1,$ACC3,$ACC3
vpmuludq 32*4-128($np),$Yi,$TEMP2
vpaddq $TEMP2,$ACC4,$ACC4
vpmuludq 32*5-128($np),$Yi,$TEMP0
vpaddq $TEMP0,$ACC5,$ACC5
vpmuludq 32*6-128($np),$Yi,$TEMP1
vpaddq $TEMP1,$ACC6,$ACC6
vpmuludq 32*7-128($np),$Yi,$TEMP2
vpblendd \$3, $ZERO, $ACC9, $TEMP1 # correct $ACC3
vpaddq $TEMP2,$ACC7,$ACC7
vpmuludq 32*8-128($np),$Yi,$TEMP0
vpaddq $TEMP1, $ACC3, $ACC3 # correct $ACC3
vpaddq $TEMP0,$ACC8,$ACC8
mov %rbx, %rax
imulq -128($ap),%rax
add %rax,$r1
vmovdqu -8+32*1-128($ap),$TEMP1
mov %rbx, %rax
imulq 8-128($ap),%rax
add %rax,$r2
vmovdqu -8+32*2-128($ap),$TEMP2
mov $r1, %rax
vpblendd \$0xfc, $ZERO, $ACC9, $ACC9 # correct $ACC3
imull $n0, %eax
vpaddq $ACC9,$ACC4,$ACC4 # correct $ACC3
and \$0x1fffffff, %eax
imulq 16-128($ap),%rbx
add %rbx,$r3
vpmuludq $Bi,$TEMP1,$TEMP1
vmovd %eax, $Yi
vmovdqu -8+32*3-128($ap),$TEMP0
vpaddq $TEMP1,$ACC1,$ACC1
vpmuludq $Bi,$TEMP2,$TEMP2
vpbroadcastq $Yi, $Yi
vmovdqu -8+32*4-128($ap),$TEMP1
vpaddq $TEMP2,$ACC2,$ACC2
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -8+32*5-128($ap),$TEMP2
vpaddq $TEMP0,$ACC3,$ACC3
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -8+32*6-128($ap),$TEMP0
vpaddq $TEMP1,$ACC4,$ACC4
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -8+32*7-128($ap),$TEMP1
vpaddq $TEMP2,$ACC5,$ACC5
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -8+32*8-128($ap),$TEMP2
vpaddq $TEMP0,$ACC6,$ACC6
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -8+32*9-128($ap),$ACC9
vpaddq $TEMP1,$ACC7,$ACC7
vpmuludq $Bi,$TEMP2,$TEMP2
vpaddq $TEMP2,$ACC8,$ACC8
vpmuludq $Bi,$ACC9,$ACC9
vpbroadcastq 16($bp), $Bi
mov %rax,%rdx
imulq -128($np),%rax
add %rax,$r1
vmovdqu -8+32*1-128($np),$TEMP0
mov %rdx,%rax
imulq 8-128($np),%rax
add %rax,$r2
vmovdqu -8+32*2-128($np),$TEMP1
shr \$29, $r1
imulq 16-128($np),%rdx
add %rdx,$r3
add $r1, $r2
vpmuludq $Yi,$TEMP0,$TEMP0
vmovq $Bi, %rbx
vmovdqu -8+32*3-128($np),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -8+32*4-128($np),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -8+32*5-128($np),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -8+32*6-128($np),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -8+32*7-128($np),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -8+32*8-128($np),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -8+32*9-128($np),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Yi,$TEMP1,$TEMP1
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Yi,$TEMP2,$TEMP2
vpaddq $TEMP2,$ACC9,$ACC9
vmovdqu -16+32*1-128($ap),$TEMP0
mov %rbx,%rax
imulq -128($ap),%rax
add $r2,%rax
vmovdqu -16+32*2-128($ap),$TEMP1
mov %rax,$r2
imull $n0, %eax
and \$0x1fffffff, %eax
imulq 8-128($ap),%rbx
add %rbx,$r3
vpmuludq $Bi,$TEMP0,$TEMP0
vmovd %eax, $Yi
vmovdqu -16+32*3-128($ap),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Bi,$TEMP1,$TEMP1
vpbroadcastq $Yi, $Yi
vmovdqu -16+32*4-128($ap),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -16+32*5-128($ap),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -16+32*6-128($ap),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -16+32*7-128($ap),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -16+32*8-128($ap),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -16+32*9-128($ap),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Bi,$TEMP1,$TEMP1
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Bi,$TEMP2,$TEMP2
vpbroadcastq 24($bp), $Bi
vpaddq $TEMP2,$ACC9,$ACC9
vmovdqu -16+32*1-128($np),$TEMP0
mov %rax,%rdx
imulq -128($np),%rax
add %rax,$r2
vmovdqu -16+32*2-128($np),$TEMP1
imulq 8-128($np),%rdx
add %rdx,$r3
shr \$29, $r2
vpmuludq $Yi,$TEMP0,$TEMP0
vmovq $Bi, %rbx
vmovdqu -16+32*3-128($np),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -16+32*4-128($np),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -16+32*5-128($np),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -16+32*6-128($np),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -16+32*7-128($np),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -16+32*8-128($np),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -16+32*9-128($np),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -24+32*1-128($ap),$TEMP0
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -24+32*2-128($ap),$TEMP1
vpaddq $TEMP2,$ACC9,$ACC9
add $r2, $r3
imulq -128($ap),%rbx
add %rbx,$r3
mov $r3, %rax
imull $n0, %eax
and \$0x1fffffff, %eax
vpmuludq $Bi,$TEMP0,$TEMP0
vmovd %eax, $Yi
vmovdqu -24+32*3-128($ap),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC1
vpmuludq $Bi,$TEMP1,$TEMP1
vpbroadcastq $Yi, $Yi
vmovdqu -24+32*4-128($ap),$TEMP0
vpaddq $TEMP1,$ACC2,$ACC2
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -24+32*5-128($ap),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC3
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -24+32*6-128($ap),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC4
vpmuludq $Bi,$TEMP1,$TEMP1
vmovdqu -24+32*7-128($ap),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC5
vpmuludq $Bi,$TEMP2,$TEMP2
vmovdqu -24+32*8-128($ap),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC6
vpmuludq $Bi,$TEMP0,$TEMP0
vmovdqu -24+32*9-128($ap),$TEMP2
vpaddq $TEMP0,$ACC7,$ACC7
vpmuludq $Bi,$TEMP1,$TEMP1
vpaddq $TEMP1,$ACC8,$ACC8
vpmuludq $Bi,$TEMP2,$TEMP2
vpbroadcastq 32($bp), $Bi
vpaddq $TEMP2,$ACC9,$ACC9
add \$32, $bp # $bp++
vmovdqu -24+32*1-128($np),$TEMP0
imulq -128($np),%rax
add %rax,$r3
shr \$29, $r3
vmovdqu -24+32*2-128($np),$TEMP1
vpmuludq $Yi,$TEMP0,$TEMP0
vmovq $Bi, %rbx
vmovdqu -24+32*3-128($np),$TEMP2
vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
vpaddq $TEMP1,$ACC2,$ACC1
vmovdqu -24+32*4-128($np),$TEMP0
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -24+32*5-128($np),$TEMP1
vpaddq $TEMP2,$ACC3,$ACC2
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -24+32*6-128($np),$TEMP2
vpaddq $TEMP0,$ACC4,$ACC3
vpmuludq $Yi,$TEMP1,$TEMP1
vmovdqu -24+32*7-128($np),$TEMP0
vpaddq $TEMP1,$ACC5,$ACC4
vpmuludq $Yi,$TEMP2,$TEMP2
vmovdqu -24+32*8-128($np),$TEMP1
vpaddq $TEMP2,$ACC6,$ACC5
vpmuludq $Yi,$TEMP0,$TEMP0
vmovdqu -24+32*9-128($np),$TEMP2
mov $r3, $r0
vpaddq $TEMP0,$ACC7,$ACC6
vpmuludq $Yi,$TEMP1,$TEMP1
add (%rsp), $r0
vpaddq $TEMP1,$ACC8,$ACC7
vpmuludq $Yi,$TEMP2,$TEMP2
vmovq $r3, $TEMP1
vpaddq $TEMP2,$ACC9,$ACC8
dec $i
jnz .Loop_mul_1024
___

# (*) Original implementation was correcting ACC1-ACC3 for overflow
# after 7 loop runs, or after 28 iterations, or 56 additions.
# But as we underutilize resources, it's possible to correct in
# each iteration with marginal performance loss. But then, as
# we do it in each iteration, we can correct less digits, and
# avoid performance penalties completely.
#
# All four digits of ACC3 are corrected: the carry out of its most
# significant digit is propagated into the least significant digit of
# ACC4 inside the loop, so $AND_MASK is uniform and needs no re-broadcast
# before the final normalization below.

$TEMP0 = $ACC9;
$TEMP3 = $Bi;
$TEMP4 = $Yi;
# Final normalization, low half: two carry-propagation passes over
# ACC0..ACC3 (spilling into ACC4), then store the four registers to rp.
$code.=<<___;
vpaddq (%rsp), $TEMP1, $ACC0
vpsrlq \$29, $ACC0, $TEMP1
vpand $AND_MASK, $ACC0, $ACC0
vpsrlq \$29, $ACC1, $TEMP2
vpand $AND_MASK, $ACC1, $ACC1
vpsrlq \$29, $ACC2, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC2, $ACC2
vpsrlq \$29, $ACC3, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC3, $ACC3
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP3, $TEMP3
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpermq \$0x93, $TEMP4, $TEMP4
vpaddq $TEMP0, $ACC0, $ACC0
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC1, $ACC1
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC2, $ACC2
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
vpaddq $TEMP3, $ACC3, $ACC3
vpaddq $TEMP4, $ACC4, $ACC4
vpsrlq \$29, $ACC0, $TEMP1
vpand $AND_MASK, $ACC0, $ACC0
vpsrlq \$29, $ACC1, $TEMP2
vpand $AND_MASK, $ACC1, $ACC1
vpsrlq \$29, $ACC2, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC2, $ACC2
vpsrlq \$29, $ACC3, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC3, $ACC3
vpermq \$0x93, $TEMP3, $TEMP3
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP4, $TEMP4
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpaddq $TEMP0, $ACC0, $ACC0
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC1, $ACC1
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC2, $ACC2
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
vpaddq $TEMP3, $ACC3, $ACC3
vpaddq $TEMP4, $ACC4, $ACC4
vmovdqu $ACC0, 0-128($rp)
vmovdqu $ACC1, 32-128($rp)
vmovdqu $ACC2, 64-128($rp)
vmovdqu $ACC3, 96-128($rp)
___

# ACC0 has been stored, so its register is free to serve as a fifth temp.
$TEMP5=$ACC0;
# Final normalization, high half: the same two passes over ACC4..ACC8,
# store to rp, then restore saved registers and return.
$code.=<<___;
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
vpand $AND_MASK, $ACC5, $ACC5
vpsrlq \$29, $ACC6, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC6, $ACC6
vpsrlq \$29, $ACC7, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC7, $ACC7
vpsrlq \$29, $ACC8, $TEMP5
vpermq \$0x93, $TEMP3, $TEMP3
vpand $AND_MASK, $ACC8, $ACC8
vpermq \$0x93, $TEMP4, $TEMP4
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP5, $TEMP5
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpaddq $TEMP0, $ACC4, $ACC4
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC5, $ACC5
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC6, $ACC6
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
vpsrlq \$29, $ACC4, $TEMP1
vpand $AND_MASK, $ACC4, $ACC4
vpsrlq \$29, $ACC5, $TEMP2
vpand $AND_MASK, $ACC5, $ACC5
vpsrlq \$29, $ACC6, $TEMP3
vpermq \$0x93, $TEMP1, $TEMP1
vpand $AND_MASK, $ACC6, $ACC6
vpsrlq \$29, $ACC7, $TEMP4
vpermq \$0x93, $TEMP2, $TEMP2
vpand $AND_MASK, $ACC7, $ACC7
vpsrlq \$29, $ACC8, $TEMP5
vpermq \$0x93, $TEMP3, $TEMP3
vpand $AND_MASK, $ACC8, $ACC8
vpermq \$0x93, $TEMP4, $TEMP4
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
vpermq \$0x93, $TEMP5, $TEMP5
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
vpaddq $TEMP0, $ACC4, $ACC4
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
vpaddq $TEMP1, $ACC5, $ACC5
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
vpaddq $TEMP2, $ACC6, $ACC6
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
vpaddq $TEMP3, $ACC7, $ACC7
vpaddq $TEMP4, $ACC8, $ACC8
vmovdqu $ACC4, 128-128($rp)
vmovdqu $ACC5, 160-128($rp)
vmovdqu $ACC6, 192-128($rp)
vmovdqu $ACC7, 224-128($rp)
vmovdqu $ACC8, 256-128($rp)
vzeroupper
mov %rbp, %rax
___
# Win64 only: reload the callee-saved %xmm6-%xmm15 spilled in the prologue.
$code.=<<___ if ($win64);
movaps -0xd8(%rax),%xmm6
movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
movaps -0x88(%rax),%xmm11
movaps -0x78(%rax),%xmm12
movaps -0x68(%rax),%xmm13
movaps -0x58(%rax),%xmm14
movaps -0x48(%rax),%xmm15
___
$code.=<<___;
mov -48(%rax),%r15
mov -40(%rax),%r14
mov -32(%rax),%r13
mov -24(%rax),%r12
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp # restore %rsp
.Lmul_1024_epilogue:
ret
.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
___
}
  {
# Generators for the two format converters:
#
#   rsaz_1024_red2norm_avx2(out, inp) - packs the 29-bit digits found in
#       consecutive 8-byte slots of inp into sixteen 64-bit words at out.
#   rsaz_1024_norm2red_avx2(out, inp) - splits sixteen 64-bit words of inp
#       into 29-bit digits, one per 8-byte slot of out, and writes four
#       further zero slots after the last digit.
#
# Both are emitted as \@abi-omnipotent, so the argument registers are
# selected here according to the target ABI.
my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");

# Pool of scratch registers %r8..%r11; the pool is rotated as the emitted
# code advances so that @T[0] always names the next register to fill.
my @T = map { "%r$_" } (8..11);
my $rotate = sub { push(@T,shift(@T)); };

$code.=<<___;
.globl rsaz_1024_red2norm_avx2
.type rsaz_1024_red2norm_avx2,\@abi-omnipotent
.align 32
rsaz_1024_red2norm_avx2:
sub \$-128,$inp # size optimization
xor %rax,%rax
___

# $j counts input digits, $i counts output words.
$j=0;
for ($i=0; $i<16; $i++) {
    my $k;

    # Load every digit that starts below the upper bit boundary of word $i.
    for ($k=0; 29*$j<64*($i+1); $j++, $k++) {
	$code.=" mov `8*$j-128`($inp), @T[0]\n";
	$rotate->();
    }
    $l=$k;

    # Shift all loaded digits except the last one into position.
    for (; $k>1; $k--) {
	$code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
    }

    # The last digit may straddle the word boundary: keep its low part in
    # place and move the part that belongs to the next word into @T[0].
    $code.=<<___;
mov @T[-1], @T[0]
shl \$`29*($j-1)`, @T[-1]
shr \$`-29*($j-1)`, @T[0]
___

    # Sum every contribution into %rax (which already holds the part
    # carried over from the previous word).
    for (; $l; $l--) {
	$code.=" add @T[-$l], %rax\n";
    }

    $code.=<<___;
adc \$0, @T[0] # consume eventual carry
mov %rax, 8*$i($out)
mov @T[0], %rax
___
    $rotate->();
}

$code.=<<___;
ret
.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
.globl rsaz_1024_norm2red_avx2
.type rsaz_1024_norm2red_avx2,\@abi-omnipotent
.align 32
rsaz_1024_norm2red_avx2:
sub \$-128,$out # size optimization
mov ($inp),@T[0]
mov \$0x1fffffff,%eax
___

# $i counts input words, $j counts output digits.
$j=0;
for ($i=0; $i<16; $i++) {
    # Fetch the following input word, or use zero after the last one.
    if ($i<15) {
	$code.=" mov `8*($i+1)`($inp),@T[1]\n";
    } else {
	$code.=" xor @T[1],@T[1]\n";
    }

    # Digits that end strictly inside the current word.
    my $k;
    for ($k=1; 29*($j+1)<64*($i+1); $j++, $k++) {
	$code.=<<___;
mov @T[0],@T[-$k]
shr \$`29*$j`,@T[-$k]
and %rax,@T[-$k] # &0x1fffffff
mov @T[-$k],`8*$j-128`($out)
___
    }

    # The digit that reaches the word boundary takes its upper bits from
    # the next word via shrd.
    $code.=<<___;
shrd \$`29*$j`,@T[1],@T[0]
and %rax,@T[0]
mov @T[0],`8*$j-128`($out)
___
    $j++;
    $rotate->();
}

$code.=<<___;
mov @T[0],`8*$j-128`($out) # zero
mov @T[0],`8*($j+1)-128`($out)
mov @T[0],`8*($j+2)-128`($out)
mov @T[0],`8*($j+3)-128`($out)
ret
.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
___
}
  1432. {
# Generators for the table helpers rsaz_1024_scatter5_avx2(out, inp, power)
# and rsaz_1024_gather5_avx2(out, inp, power).  Both are emitted as
# \@abi-omnipotent, so the argument registers are chosen per ABI here.
#
# scatter5: advances out by power*16 bytes, then for each of 9 input
# vectors (32 bytes each) uses vpermd with .Lscatter_permd to gather the
# even dwords into the low 128 bits and stores those 16 bytes; out then
# moves on by 16*32 = 512 bytes, i.e. one table row per input vector.
#
# gather5: builds sixteen 256-bit compare masks (index values from .Linc
# compared against the broadcast power), keeps the first eight on the
# stack and the other eight in %ymm8-%ymm15.  For each of the 9 rows it
# loads all sixteen 32-byte vectors of the row, ANDs each with its mask and
# ORs the results together, so the addresses read do not depend on power.
# The two 128-bit halves are then merged, expanded with .Lgather_permd and
# stored as 32 bytes to out.  A final 32-byte zero vector is written after
# the ninth result.  The row loads use vmovdqa, so the table is expected to
# be 32-byte aligned.
  1433. my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
  1434. $code.=<<___;
  1435. .globl rsaz_1024_scatter5_avx2
  1436. .type rsaz_1024_scatter5_avx2,\@abi-omnipotent
  1437. .align 32
  1438. rsaz_1024_scatter5_avx2:
  1439. vzeroupper
  1440. vmovdqu .Lscatter_permd(%rip),%ymm5
  1441. shl \$4,$power
  1442. lea ($out,$power),$out
  1443. mov \$9,%eax
  1444. jmp .Loop_scatter_1024
  1445. .align 32
  1446. .Loop_scatter_1024:
  1447. vmovdqu ($inp),%ymm0
  1448. lea 32($inp),$inp
  1449. vpermd %ymm0,%ymm5,%ymm0
  1450. vmovdqu %xmm0,($out)
  1451. lea 16*32($out),$out
  1452. dec %eax
  1453. jnz .Loop_scatter_1024
  1454. vzeroupper
  1455. ret
  1456. .size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
  1457. .globl rsaz_1024_gather5_avx2
  1458. .type rsaz_1024_gather5_avx2,\@abi-omnipotent
  1459. .align 32
  1460. rsaz_1024_gather5_avx2:
  1461. vzeroupper
  1462. mov %rsp,%r11
  1463. ___
# Win64 only: save %xmm6-%xmm15 below the entry %rsp.  The instructions are
# hand-encoded as .byte sequences (see the emitted comment) and bracketed
# by .LSEH_begin/.LSEH_end labels; the save area spans -0xa8..-0x08
# relative to the entry %rsp, matching the reloads in the epilogue.
  1464. $code.=<<___ if ($win64);
  1465. lea -0x88(%rsp),%rax
  1466. .LSEH_begin_rsaz_1024_gather5:
  1467. # I can't trust assembler to use specific encoding:-(
  1468. .byte 0x48,0x8d,0x60,0xe0 # lea -0x20(%rax),%rsp
  1469. .byte 0xc5,0xf8,0x29,0x70,0xe0 # vmovaps %xmm6,-0x20(%rax)
  1470. .byte 0xc5,0xf8,0x29,0x78,0xf0 # vmovaps %xmm7,-0x10(%rax)
  1471. .byte 0xc5,0x78,0x29,0x40,0x00 # vmovaps %xmm8,0(%rax)
  1472. .byte 0xc5,0x78,0x29,0x48,0x10 # vmovaps %xmm9,0x10(%rax)
  1473. .byte 0xc5,0x78,0x29,0x50,0x20 # vmovaps %xmm10,0x20(%rax)
  1474. .byte 0xc5,0x78,0x29,0x58,0x30 # vmovaps %xmm11,0x30(%rax)
  1475. .byte 0xc5,0x78,0x29,0x60,0x40 # vmovaps %xmm12,0x40(%rax)
  1476. .byte 0xc5,0x78,0x29,0x68,0x50 # vmovaps %xmm13,0x50(%rax)
  1477. .byte 0xc5,0x78,0x29,0x70,0x60 # vmovaps %xmm14,0x60(%rax)
  1478. .byte 0xc5,0x78,0x29,0x78,0x70 # vmovaps %xmm15,0x70(%rax)
  1479. ___
# Common body: reserve 256 bytes of 32-byte-aligned stack for the first
# eight masks, compute all sixteen masks, then run the 9-row gather loop.
# %r11 keeps the entry %rsp for the epilogue; $power is reused as the loop
# counter once it has been broadcast into %ymm4.
  1480. $code.=<<___;
  1481. lea -0x100(%rsp),%rsp
  1482. and \$-32, %rsp
  1483. lea .Linc(%rip), %r10
  1484. lea -128(%rsp),%rax # control u-op density
  1485. vmovd $power, %xmm4
  1486. vmovdqa (%r10),%ymm0
  1487. vmovdqa 32(%r10),%ymm1
  1488. vmovdqa 64(%r10),%ymm5
  1489. vpbroadcastd %xmm4,%ymm4
  1490. vpaddd %ymm5, %ymm0, %ymm2
  1491. vpcmpeqd %ymm4, %ymm0, %ymm0
  1492. vpaddd %ymm5, %ymm1, %ymm3
  1493. vpcmpeqd %ymm4, %ymm1, %ymm1
  1494. vmovdqa %ymm0, 32*0+128(%rax)
  1495. vpaddd %ymm5, %ymm2, %ymm0
  1496. vpcmpeqd %ymm4, %ymm2, %ymm2
  1497. vmovdqa %ymm1, 32*1+128(%rax)
  1498. vpaddd %ymm5, %ymm3, %ymm1
  1499. vpcmpeqd %ymm4, %ymm3, %ymm3
  1500. vmovdqa %ymm2, 32*2+128(%rax)
  1501. vpaddd %ymm5, %ymm0, %ymm2
  1502. vpcmpeqd %ymm4, %ymm0, %ymm0
  1503. vmovdqa %ymm3, 32*3+128(%rax)
  1504. vpaddd %ymm5, %ymm1, %ymm3
  1505. vpcmpeqd %ymm4, %ymm1, %ymm1
  1506. vmovdqa %ymm0, 32*4+128(%rax)
  1507. vpaddd %ymm5, %ymm2, %ymm8
  1508. vpcmpeqd %ymm4, %ymm2, %ymm2
  1509. vmovdqa %ymm1, 32*5+128(%rax)
  1510. vpaddd %ymm5, %ymm3, %ymm9
  1511. vpcmpeqd %ymm4, %ymm3, %ymm3
  1512. vmovdqa %ymm2, 32*6+128(%rax)
  1513. vpaddd %ymm5, %ymm8, %ymm10
  1514. vpcmpeqd %ymm4, %ymm8, %ymm8
  1515. vmovdqa %ymm3, 32*7+128(%rax)
  1516. vpaddd %ymm5, %ymm9, %ymm11
  1517. vpcmpeqd %ymm4, %ymm9, %ymm9
  1518. vpaddd %ymm5, %ymm10, %ymm12
  1519. vpcmpeqd %ymm4, %ymm10, %ymm10
  1520. vpaddd %ymm5, %ymm11, %ymm13
  1521. vpcmpeqd %ymm4, %ymm11, %ymm11
  1522. vpaddd %ymm5, %ymm12, %ymm14
  1523. vpcmpeqd %ymm4, %ymm12, %ymm12
  1524. vpaddd %ymm5, %ymm13, %ymm15
  1525. vpcmpeqd %ymm4, %ymm13, %ymm13
  1526. vpcmpeqd %ymm4, %ymm14, %ymm14
  1527. vpcmpeqd %ymm4, %ymm15, %ymm15
  1528. vmovdqa -32(%r10),%ymm7 # .Lgather_permd
  1529. lea 128($inp), $inp
  1530. mov \$9,$power
  1531. .Loop_gather_1024:
  1532. vmovdqa 32*0-128($inp), %ymm0
  1533. vmovdqa 32*1-128($inp), %ymm1
  1534. vmovdqa 32*2-128($inp), %ymm2
  1535. vmovdqa 32*3-128($inp), %ymm3
  1536. vpand 32*0+128(%rax), %ymm0, %ymm0
  1537. vpand 32*1+128(%rax), %ymm1, %ymm1
  1538. vpand 32*2+128(%rax), %ymm2, %ymm2
  1539. vpor %ymm0, %ymm1, %ymm4
  1540. vpand 32*3+128(%rax), %ymm3, %ymm3
  1541. vmovdqa 32*4-128($inp), %ymm0
  1542. vmovdqa 32*5-128($inp), %ymm1
  1543. vpor %ymm2, %ymm3, %ymm5
  1544. vmovdqa 32*6-128($inp), %ymm2
  1545. vmovdqa 32*7-128($inp), %ymm3
  1546. vpand 32*4+128(%rax), %ymm0, %ymm0
  1547. vpand 32*5+128(%rax), %ymm1, %ymm1
  1548. vpand 32*6+128(%rax), %ymm2, %ymm2
  1549. vpor %ymm0, %ymm4, %ymm4
  1550. vpand 32*7+128(%rax), %ymm3, %ymm3
  1551. vpand 32*8-128($inp), %ymm8, %ymm0
  1552. vpor %ymm1, %ymm5, %ymm5
  1553. vpand 32*9-128($inp), %ymm9, %ymm1
  1554. vpor %ymm2, %ymm4, %ymm4
  1555. vpand 32*10-128($inp),%ymm10, %ymm2
  1556. vpor %ymm3, %ymm5, %ymm5
  1557. vpand 32*11-128($inp),%ymm11, %ymm3
  1558. vpor %ymm0, %ymm4, %ymm4
  1559. vpand 32*12-128($inp),%ymm12, %ymm0
  1560. vpor %ymm1, %ymm5, %ymm5
  1561. vpand 32*13-128($inp),%ymm13, %ymm1
  1562. vpor %ymm2, %ymm4, %ymm4
  1563. vpand 32*14-128($inp),%ymm14, %ymm2
  1564. vpor %ymm3, %ymm5, %ymm5
  1565. vpand 32*15-128($inp),%ymm15, %ymm3
  1566. lea 32*16($inp), $inp
  1567. vpor %ymm0, %ymm4, %ymm4
  1568. vpor %ymm1, %ymm5, %ymm5
  1569. vpor %ymm2, %ymm4, %ymm4
  1570. vpor %ymm3, %ymm5, %ymm5
  1571. vpor %ymm5, %ymm4, %ymm4
  1572. vextracti128 \$1, %ymm4, %xmm5 # upper half is cleared
  1573. vpor %xmm4, %xmm5, %xmm5
  1574. vpermd %ymm5,%ymm7,%ymm5
  1575. vmovdqu %ymm5,($out)
  1576. lea 32($out),$out
  1577. dec $power
  1578. jnz .Loop_gather_1024
  1579. vpxor %ymm0,%ymm0,%ymm0
  1580. vmovdqu %ymm0,($out)
  1581. vzeroupper
  1582. ___
# Win64 only: reload %xmm6-%xmm15 from the save area, addressed through the
# entry %rsp preserved in %r11.
  1583. $code.=<<___ if ($win64);
  1584. movaps -0xa8(%r11),%xmm6
  1585. movaps -0x98(%r11),%xmm7
  1586. movaps -0x88(%r11),%xmm8
  1587. movaps -0x78(%r11),%xmm9
  1588. movaps -0x68(%r11),%xmm10
  1589. movaps -0x58(%r11),%xmm11
  1590. movaps -0x48(%r11),%xmm12
  1591. movaps -0x38(%r11),%xmm13
  1592. movaps -0x28(%r11),%xmm14
  1593. movaps -0x18(%r11),%xmm15
  1594. .LSEH_end_rsaz_1024_gather5:
  1595. ___
# Common tail: drop the mask area by restoring the entry %rsp and return.
  1596. $code.=<<___;
  1597. lea (%r11),%rsp
  1598. ret
  1599. .size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
  1600. ___
  1601. }
  # rsaz_avx2_eligible(): returns 0 or 1 in %eax.  It reads the 32-bit word
  # at OPENSSL_ia32cap_P+8 and returns bit 5 of it (presumably the AVX2
  # capability flag - confirm against the OPENSSL_ia32cap layout).
  1602. $code.=<<___;
  1603. .extern OPENSSL_ia32cap_P
  1604. .globl rsaz_avx2_eligible
  1605. .type rsaz_avx2_eligible,\@abi-omnipotent
  1606. .align 32
  1607. rsaz_avx2_eligible:
  1608. mov OPENSSL_ia32cap_P+8(%rip),%eax
  1609. ___
# Emitted only when $addx is set: if bits 8 and 19 of the capability word
# are both set (BMI2 and AD*X per the emitted comment), the word is replaced
# by zero so that the function reports "not eligible".
  1610. $code.=<<___ if ($addx);
  1611. mov \$`1<<8|1<<19`,%ecx
  1612. mov \$0,%edx
  1613. and %eax,%ecx
  1614. cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
  1615. cmove %edx,%eax
  1616. ___
# Tail of rsaz_avx2_eligible followed by the shared constant tables:
#   .Land_mask      - 29-bit digit mask (0x1fffffff) in quads 0..2.
#                     NOTE(review): quad 3 is -1, so a full 32-byte load
#                     yields a mask that leaves lane 3 untouched, whereas a
#                     broadcast of quad 0 yields a uniform 29-bit mask.
#                     Check which form each user of this table expects.
#   .Lscatter_permd - vpermd indices selecting the even dwords.
#   .Lgather_permd  - vpermd indices spreading dwords 0..3 to even slots;
#                     index 7 fills the odd slots.
#   .Linc           - starting index vectors {0,0,0,0,1,1,1,1} and
#                     {2,2,2,2,3,3,3,3}, plus the per-step increment of 4.
# .Lgather_permd must stay exactly 32 bytes before .Linc: the gather code
# addresses it as -32 relative to .Linc.
  1617. $code.=<<___;
  1618. and \$`1<<5`,%eax
  1619. shr \$5,%eax
  1620. ret
  1621. .size rsaz_avx2_eligible,.-rsaz_avx2_eligible
  1622. .align 64
  1623. .Land_mask:
  1624. .quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
  1625. .Lscatter_permd:
  1626. .long 0,2,4,6,7,7,7,7
  1627. .Lgather_permd:
  1628. .long 0,7,1,7,2,7,3,7
  1629. .Linc:
  1630. .long 0,0,0,0, 1,1,1,1
  1631. .long 2,2,2,2, 3,3,3,3
  1632. .long 4,4,4,4, 4,4,4,4
  1633. .align 64
  1634. ___
  if ($win64) {
  # Win64 structured-exception support.
  #
  # rsaz_se_handler is the language-specific handler referenced from the
  # .xdata records of rsaz_1024_sqr_avx2 and rsaz_1024_mul_avx2.  Each record
  # carries two RVAs (HandlerData[0] = body label, HandlerData[1] = epilogue
  # label).  When the faulting Rip lies between them, the handler reloads the
  # six saved general-purpose registers and the ten saved %xmm registers
  # from the frame addressed by context->Rbp into the CONTEXT record.  In all
  # cases it then restores Rsp/Rsi/Rdi, copies the CONTEXT to
  # disp->ContextRecord, calls RtlVirtualUnwind and returns
  # ExceptionContinueSearch.
  #
  # rsaz_1024_gather5 uses plain unwind codes instead of a handler.
  ($rec, $frame, $context, $disp) = ("%rcx", "%rdx", "%r8", "%r9");

  $code .= <<___;
.extern	__imp_RtlVirtualUnwind
.type	rsaz_se_handler,\@abi-omnipotent
.align	16
rsaz_se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	160($context),%rax	# pull context->Rbp

	mov	-48(%rax),%r15		# reload saved registers from the frame
	mov	-40(%rax),%r14
	mov	-32(%rax),%r13
	mov	-24(%rax),%r12
	mov	-16(%rax),%rbp
	mov	-8(%rax),%rbx
	mov	%r15,240($context)	# and store them into the CONTEXT record
	mov	%r14,232($context)
	mov	%r13,224($context)
	mov	%r12,216($context)
	mov	%rbp,160($context)
	mov	%rbx,144($context)

	lea	-0xd8(%rax),%rsi	# %xmm save area
	lea	512($context),%rdi	# & context.Xmm6
	mov	\$20,%ecx		# 10*sizeof(%xmm0)/sizeof(%rax)
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	rsaz_se_handler,.-rsaz_se_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_rsaz_1024_sqr_avx2
	.rva	.LSEH_end_rsaz_1024_sqr_avx2
	.rva	.LSEH_info_rsaz_1024_sqr_avx2

	.rva	.LSEH_begin_rsaz_1024_mul_avx2
	.rva	.LSEH_end_rsaz_1024_mul_avx2
	.rva	.LSEH_info_rsaz_1024_mul_avx2

	.rva	.LSEH_begin_rsaz_1024_gather5
	.rva	.LSEH_end_rsaz_1024_gather5
	.rva	.LSEH_info_rsaz_1024_gather5

.section	.xdata
.align	8
.LSEH_info_rsaz_1024_sqr_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lsqr_1024_body,.Lsqr_1024_epilogue
.LSEH_info_rsaz_1024_mul_avx2:
	.byte	9,0,0,0
	.rva	rsaz_se_handler
	.rva	.Lmul_1024_body,.Lmul_1024_epilogue
.LSEH_info_rsaz_1024_gather5:
	.byte	0x01,0x36,0x17,0x0b
	.byte	0x36,0xf8,0x09,0x00	# vmovaps 0x90(rsp),xmm15
	.byte	0x31,0xe8,0x08,0x00	# vmovaps 0x80(rsp),xmm14
	.byte	0x2c,0xd8,0x07,0x00	# vmovaps 0x70(rsp),xmm13
	.byte	0x27,0xc8,0x06,0x00	# vmovaps 0x60(rsp),xmm12
	.byte	0x22,0xb8,0x05,0x00	# vmovaps 0x50(rsp),xmm11
	.byte	0x1d,0xa8,0x04,0x00	# vmovaps 0x40(rsp),xmm10
	.byte	0x18,0x98,0x03,0x00	# vmovaps 0x30(rsp),xmm9
	.byte	0x13,0x88,0x02,0x00	# vmovaps 0x20(rsp),xmm8
	.byte	0x0e,0x78,0x01,0x00	# vmovaps 0x10(rsp),xmm7
	.byte	0x09,0x68,0x00,0x00	# vmovaps 0x00(rsp),xmm6
	.byte	0x04,0x01,0x15,0x00	# sub	  rsp,0xa8
	.byte	0x00,0xb3,0x00,0x00	# set_frame r11
___
  }
  # Final pass: emit the accumulated assembly one line at a time.
  #   1. Every `expr` is replaced by the value of the Perl expression.
  #   2. Then the first matching rewrite below is applied (the chain stops
  #      at the first substitution that succeeds):
  #        - immediate shift counts are reduced modulo 64;
  #        - for vmovd/vmovq, vpinsr[qd], vpextr[qd] and vpbroadcast[qd]
  #          a %ymmN operand name is rewritten to %xmmN;
  #        - for vmovdqu the "%x%ymmN" notation is rewritten to %xmmN.
  for (split /\n/, $code) {
      s/\`([^\`]*)\`/eval($1)/ge;

      s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1 . ($2 % 64)/ge
          || s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g
          || s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/g
          || s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g
          || s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/g
          || s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/g;

      print "$_\n";
  }
  1768. }}} else {{{
  # assembler is too old
  #
  # Fallback output: rsaz_avx2_eligible always returns 0, and every AVX2
  # entry point is an alias of a single stub that executes ud2.
  my @stub_names = map { "rsaz_1024_${_}_avx2" }
                   qw(sqr mul norm2red red2norm scatter5 gather5);

  print <<___;
.text
.globl	rsaz_avx2_eligible
.type	rsaz_avx2_eligible,\@abi-omnipotent
rsaz_avx2_eligible:
	xor	%eax,%eax
	ret
.size	rsaz_avx2_eligible,.-rsaz_avx2_eligible
___

  # Export all entry points, then stack their labels on the shared stub.
  print ".globl\t${_}\n" foreach @stub_names;
  print ".type\t$stub_names[0],\@abi-omnipotent\n";
  print "${_}:\n" foreach @stub_names;

  print <<___;
	.byte	0x0f,0x0b	# ud2
	ret
.size	$stub_names[0],.-$stub_names[0]
___
  1794. }}}
  # close() flushes whatever is still buffered; if that fails (for example
  # a full disk, or a failing downstream command when STDOUT is a pipe) the
  # generated assembly is incomplete.  Abort with a non-zero exit status
  # instead of silently succeeding, so the build does not consume a
  # truncated file.
  close STDOUT or die "error closing STDOUT: $!";