
sha512-x86_64.pl 60 KiB

Enable AVX code for SHA-*.

SHA-1, SHA-256, and SHA-512 get 12-26%, 17-23%, and 33-37% improvements, respectively, on x86-64. SHA-1 and SHA-256 get an 8-20% and 14-17% improvement on x86. (x86 does not have AVX code for SHA-512.) This costs us 12k of binary size on x86-64 and 8k of binary size on x86.

$ bssl speed SHA- (x86-64, before)
Did 4811000 SHA-1 (16 bytes) operations in 1000013us (4810937.5 ops/sec): 77.0 MB/s
Did 1414000 SHA-1 (256 bytes) operations in 1000253us (1413642.3 ops/sec): 361.9 MB/s
Did 56000 SHA-1 (8192 bytes) operations in 1002640us (55852.5 ops/sec): 457.5 MB/s
Did 2536000 SHA-256 (16 bytes) operations in 1000140us (2535645.0 ops/sec): 40.6 MB/s
Did 603000 SHA-256 (256 bytes) operations in 1001613us (602028.9 ops/sec): 154.1 MB/s
Did 25000 SHA-256 (8192 bytes) operations in 1010132us (24749.2 ops/sec): 202.7 MB/s
Did 1767000 SHA-512 (16 bytes) operations in 1000477us (1766157.5 ops/sec): 28.3 MB/s
Did 638000 SHA-512 (256 bytes) operations in 1000933us (637405.3 ops/sec): 163.2 MB/s
Did 32000 SHA-512 (8192 bytes) operations in 1025646us (31199.8 ops/sec): 255.6 MB/s

$ bssl speed SHA- (x86-64, after)
Did 5438000 SHA-1 (16 bytes) operations in 1000060us (5437673.7 ops/sec): 87.0 MB/s
Did 1590000 SHA-1 (256 bytes) operations in 1000181us (1589712.3 ops/sec): 407.0 MB/s
Did 71000 SHA-1 (8192 bytes) operations in 1007958us (70439.4 ops/sec): 577.0 MB/s
Did 2955000 SHA-256 (16 bytes) operations in 1000251us (2954258.5 ops/sec): 47.3 MB/s
Did 740000 SHA-256 (256 bytes) operations in 1000628us (739535.6 ops/sec): 189.3 MB/s
Did 31000 SHA-256 (8192 bytes) operations in 1019619us (30403.5 ops/sec): 249.1 MB/s
Did 2348000 SHA-512 (16 bytes) operations in 1000285us (2347331.0 ops/sec): 37.6 MB/s
Did 878000 SHA-512 (256 bytes) operations in 1001064us (877066.8 ops/sec): 224.5 MB/s
Did 43000 SHA-512 (8192 bytes) operations in 1002485us (42893.4 ops/sec): 351.4 MB/s

$ bssl speed SHA- (x86, before, SHA-512 redacted because irrelevant)
Did 4319000 SHA-1 (16 bytes) operations in 1000066us (4318715.0 ops/sec): 69.1 MB/s
Did 1306000 SHA-1 (256 bytes) operations in 1000437us (1305429.5 ops/sec): 334.2 MB/s
Did 58000 SHA-1 (8192 bytes) operations in 1014807us (57153.7 ops/sec): 468.2 MB/s
Did 2291000 SHA-256 (16 bytes) operations in 1000343us (2290214.5 ops/sec): 36.6 MB/s
Did 594000 SHA-256 (256 bytes) operations in 1000684us (593594.0 ops/sec): 152.0 MB/s
Did 25000 SHA-256 (8192 bytes) operations in 1030688us (24255.6 ops/sec): 198.7 MB/s

$ bssl speed SHA- (x86, after, SHA-512 redacted because irrelevant)
Did 4673000 SHA-1 (16 bytes) operations in 1000063us (4672705.6 ops/sec): 74.8 MB/s
Did 1484000 SHA-1 (256 bytes) operations in 1000453us (1483328.1 ops/sec): 379.7 MB/s
Did 69000 SHA-1 (8192 bytes) operations in 1008305us (68431.7 ops/sec): 560.6 MB/s
Did 2684000 SHA-256 (16 bytes) operations in 1000196us (2683474.0 ops/sec): 42.9 MB/s
Did 679000 SHA-256 (256 bytes) operations in 1000525us (678643.7 ops/sec): 173.7 MB/s
Did 29000 SHA-256 (8192 bytes) operations in 1033251us (28066.8 ops/sec): 229.9 MB/s

Change-Id: I952a3b4fc4c52ebb50690da3b8c97770e8342e98
Reviewed-on: https://boringssl-review.googlesource.com/6470
Reviewed-by: Adam Langley <agl@google.com>
9 years ago
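Each "bssl speed" line above follows the same arithmetic: the operation count divided by the elapsed time gives ops/sec, and ops/sec times the chunk size gives throughput in decimal MB/s. A minimal Perl sketch of that conversion (the helper name is ours, not anything provided by the bssl tool):

    # Reproduce the figures in a "Did N SHA-x (C bytes) operations in T us" line.
    # Example: speed_line(1414000, 1000253, 256) -> (1413642.3 ops/sec, 361.9 MB/s).
    sub speed_line {
        my ($ops, $usecs, $chunk_bytes) = @_;
        my $ops_per_sec = $ops * 1_000_000 / $usecs;          # operations per second
        my $mb_per_sec  = $ops_per_sec * $chunk_bytes / 1e6;  # decimal megabytes per second
        return ($ops_per_sec, $mb_per_sec);
    }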
  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. Rights for redistribution and usage in source and binary
  6. # forms are granted according to the OpenSSL license.
  7. # ====================================================================
  8. #
  9. # sha256/512_block procedure for x86_64.
  10. #
  11. # 40% improvement over compiler-generated code on Opteron. On EM64T
  12. # sha256 was observed to run >80% faster and sha512 - >40%. No magical
  13. # tricks, just straight implementation... I really wonder why gcc
  14. # [being armed with inline assembler] fails to generate as fast code.
  15. # The only thing which is cool about this module is that the very
  16. # same instruction sequence is used for both SHA-256 and SHA-512. In
  17. # the former case the instructions operate on 32-bit operands, while in
  18. # the latter on 64-bit ones. All I had to do was get one flavor right;
  19. # the other one passed the test right away:-)
  20. #
  21. # sha256_block runs in ~1005 cycles on Opteron, which gives you
  22. # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
  23. # frequency in GHz. sha512_block runs in ~1275 cycles, which results
  24. # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
  25. # Well, if you compare it to IA-64 implementation, which maintains
  26. # X[16] in register bank[!], tends to 4 instructions per CPU clock
  27. # cycle and runs in 1003 cycles, 1275 is very good result for 3-way
  28. # issue Opteron pipeline and X[16] maintained in memory. So that *if*
  29. # there is a way to improve it, *then* the only way would be to try to
  30. # offload X[16] updates to SSE unit, but that would require "deeper"
  31. # loop unroll, which in turn would naturally cause size blow-up, not
  32. # to mention increased complexity! And once again, only *if* it's
  33. # actually possible to noticeably improve overall ILP, instruction
  34. # level parallelism, on a given CPU implementation in this case.
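# (Sanity check of the arithmetic above, assuming the 64-byte SHA-256 and
# 128-byte SHA-512 block sizes: bytes_per_block * 10^9 / cycles_per_block
# gives bytes per second at 1 GHz, i.e. 64e9/1005 ~ 63.7e6 B/s for
# sha256_block and 128e9/1275 ~ 100.4e6 B/s for sha512_block, matching the
# MBps-per-GHz figures quoted.)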
  35. #
  36. # Special note on Intel EM64T. While Opteron CPU exhibits perfect
  37. # performance ratio of 1.5 between 64- and 32-bit flavors [see above],
  38. # [currently available] EM64T CPUs apparently are far from it. On the
  39. # contrary, 64-bit version, sha512_block, is ~30% *slower* than 32-bit
  40. # sha256_block:-( This is presumably because 64-bit shifts/rotates
  41. # apparently are not atomic instructions, but implemented in microcode.
  42. #
  43. # May 2012.
  44. #
  45. # Optimization including one of Pavel Semjanov's ideas, alternative
  46. # Maj, resulted in >=5% improvement on most CPUs, +20% SHA256 and
  47. # unfortunately -2% SHA512 on P4 [which nobody should care about
  48. # that much].
  49. #
  50. # June 2012.
  51. #
  52. # Add SIMD code paths, see below for improvement coefficients. SSSE3
  53. # code path was not attempted for SHA512, because improvement is not
  54. # estimated to be high enough, noticeably less than 9%, to justify
  55. # the effort, not on pre-AVX processors. [Obviously with exclusion
  56. # for VIA Nano, but it has SHA512 instruction that is faster and
  57. # should be used instead.] For reference, corresponding estimated
  58. # upper limit for improvement for SSSE3 SHA256 is 28%. The fact that
  59. # higher coefficients are observed on VIA Nano and Bulldozer has more
  60. # to do with specifics of their architecture [which is topic for
  61. # separate discussion].
  62. #
  63. # November 2012.
  64. #
  65. # Add AVX2 code path. Two consecutive input blocks are loaded to
  66. # 256-bit %ymm registers, with data from first block to least
  67. # significant 128-bit halves and data from second to most significant.
  68. # The data is then processed with same SIMD instruction sequence as
  69. # for AVX, but with %ymm as operands. Side effect is increased stack
  70. # frame, 448 additional bytes in SHA256 and 1152 in SHA512, and 1.2KB
  71. # code size increase.
  72. #
  73. # March 2014.
  74. #
  75. # Add support for Intel SHA Extensions.
  76. ######################################################################
  77. # Current performance in cycles per processed byte (less is better):
  78. #
  79. #               SHA256  SSSE3       AVX/XOP(*)       SHA512  AVX/XOP(*)
  80. #
  81. # AMD K8        14.9    -           -                9.57    -
  82. # P4            17.3    -           -                30.8    -
  83. # Core 2        15.6    13.8(+13%)  -                9.97    -
  84. # Westmere      14.8    12.3(+19%)  -                9.58    -
  85. # Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))   11.2    8.10(+38%(**))
  86. # Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)       8.17    7.22(+13%)
  87. # Haswell       12.2    9.28(+31%)  7.80(+56%)       7.66    5.40(+42%)
  88. # Bulldozer     21.1    13.6(+54%)  13.6(+54%(***))  13.5    8.58(+57%)
  89. # VIA Nano      23.0    16.5(+39%)  -                14.7    -
  90. # Atom          23.0    18.9(+22%)  -                14.7    -
  91. # Silvermont    27.4    20.6(+33%)  -                17.5    -
  92. #
  93. # (*) whichever best applicable;
  94. # (**) switch from ror to shrd stands for fair share of improvement;
  95. # (***) execution time is fully determined by remaining integer-only
  96. # part, body_00_15; reducing the amount of SIMD instructions
  97. # below certain limit makes no difference/sense; to conserve
  98. # space SHA256 XOP code path is therefore omitted;
  99. $flavour = shift;
  100. $output = shift;
  101. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  102. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  103. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  104. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  105. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  106. die "can't locate x86_64-xlate.pl";
  107. # In upstream, this is controlled by shelling out to the compiler to check
  108. # versions, but BoringSSL is intended to be used with pre-generated perlasm
  109. # output, so this isn't useful anyway.
  110. #
  111. # TODO(davidben): Enable AVX2 code after testing by setting $avx to 2. Is it
  112. # necessary to disable AVX2 code when SHA Extensions code is disabled? Upstream
  113. # did not tie them together until after $shaext was added.
  114. $avx = 1;
  115. # TODO(davidben): Consider enabling the Intel SHA Extensions code once it's
  116. # been tested.
  117. $shaext=0; ### set to zero if compiling for 1.0.1
  118. $avx=1 if (!$shaext && $avx);
  119. open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
  120. *STDOUT=*OUT;
  121. if ($output =~ /512/) {
  122. $func="sha512_block_data_order";
  123. $TABLE="K512";
  124. $SZ=8;
  125. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
  126. "%r8", "%r9", "%r10","%r11");
  127. ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
  128. @Sigma0=(28,34,39);
  129. @Sigma1=(14,18,41);
  130. @sigma0=(1, 8, 7);
  131. @sigma1=(19,61, 6);
  132. $rounds=80;
  133. } else {
  134. $func="sha256_block_data_order";
  135. $TABLE="K256";
  136. $SZ=4;
  137. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  138. "%r8d","%r9d","%r10d","%r11d");
  139. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
  140. @Sigma0=( 2,13,22);
  141. @Sigma1=( 6,11,25);
  142. @sigma0=( 7,18, 3);
  143. @sigma1=(17,19,10);
  144. $rounds=64;
  145. }
  146. $ctx="%rdi"; # 1st arg, zapped by $a3
  147. $inp="%rsi"; # 2nd arg
  148. $Tbl="%rbp";
  149. $_ctx="16*$SZ+0*8(%rsp)";
  150. $_inp="16*$SZ+1*8(%rsp)";
  151. $_end="16*$SZ+2*8(%rsp)";
  152. $_rsp="16*$SZ+3*8(%rsp)";
  153. $framesz="16*$SZ+4*8";
  154. sub ROUND_00_15()
  155. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  156. my $STRIDE=$SZ;
  157. $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
  158. $code.=<<___;
  159. ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
  160. mov $f,$a2
  161. xor $e,$a0
  162. ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
  163. xor $g,$a2 # f^g
  164. mov $T1,`$SZ*($i&0xf)`(%rsp)
  165. xor $a,$a1
  166. and $e,$a2 # (f^g)&e
  167. ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
  168. add $h,$T1 # T1+=h
  169. xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
  170. ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
  171. xor $e,$a0
  172. add $a2,$T1 # T1+=Ch(e,f,g)
  173. mov $a,$a2
  174. add ($Tbl),$T1 # T1+=K[round]
  175. xor $a,$a1
  176. xor $b,$a2 # a^b, b^c in next round
  177. ror \$$Sigma1[0],$a0 # Sigma1(e)
  178. mov $b,$h
  179. and $a2,$a3
  180. ror \$$Sigma0[0],$a1 # Sigma0(a)
  181. add $a0,$T1 # T1+=Sigma1(e)
  182. xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
  183. add $T1,$d # d+=T1
  184. add $T1,$h # h+=T1
  185. lea $STRIDE($Tbl),$Tbl # round++
  186. ___
  187. $code.=<<___ if ($i<15);
  188. add $a1,$h # h+=Sigma0(a)
  189. ___
  190. ($a2,$a3) = ($a3,$a2);
  191. }
  192. sub ROUND_16_XX()
  193. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  194. $code.=<<___;
  195. mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
  196. mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
  197. mov $a0,$T1
  198. ror \$`$sigma0[1]-$sigma0[0]`,$a0
  199. add $a1,$a # modulo-scheduled h+=Sigma0(a)
  200. mov $a2,$a1
  201. ror \$`$sigma1[1]-$sigma1[0]`,$a2
  202. xor $T1,$a0
  203. shr \$$sigma0[2],$T1
  204. ror \$$sigma0[0],$a0
  205. xor $a1,$a2
  206. shr \$$sigma1[2],$a1
  207. ror \$$sigma1[0],$a2
  208. xor $a0,$T1 # sigma0(X[(i+1)&0xf])
  209. xor $a1,$a2 # sigma1(X[(i+14)&0xf])
  210. add `$SZ*(($i+9)&0xf)`(%rsp),$T1
  211. add `$SZ*($i&0xf)`(%rsp),$T1
  212. mov $e,$a0
  213. add $a2,$T1
  214. mov $a,$a1
  215. ___
  216. &ROUND_00_15(@_);
  217. }
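# A minimal scalar sketch (not part of the upstream module) of the SHA-256
# flavor of the round and message-schedule logic that ROUND_00_15 and
# ROUND_16_XX above spread across registers; the rotation/shift constants
# match @Sigma0, @Sigma1, @sigma0 and @sigma1 as set for $SZ==4.
sub ror32 { my ($x,$n) = @_; return (($x >> $n) | ($x << (32-$n))) & 0xffffffff; }

sub sha256_round_sketch {
    my ($a,$b,$c,$d,$e,$f,$g,$h,$k,$w) = @_;
    my $Sigma1 = ror32($e,6) ^ ror32($e,11) ^ ror32($e,25);
    my $Sigma0 = ror32($a,2) ^ ror32($a,13) ^ ror32($a,22);
    my $Ch  = (($f ^ $g) & $e) ^ $g;                 # same identity as the asm above
    my $Maj = ($a & $b) ^ ($a & $c) ^ ($b & $c);     # computed above as Ch(a^b,c,b)
    my $T1 = ($h + $Sigma1 + $Ch + $k + $w) & 0xffffffff;
    my $T2 = ($Sigma0 + $Maj) & 0xffffffff;
    # new (a..h): a=T1+T2, e=d+T1, everything else shifts down one register
    return (($T1+$T2) & 0xffffffff, $a, $b, $c, ($d+$T1) & 0xffffffff, $e, $f, $g);
}

sub sha256_schedule_sketch {      # W[i] for i >= 16, as ROUND_16_XX computes it
    my ($W, $i) = @_;             # $W: array ref holding the earlier message words
    my $s0 = ror32($W->[$i-15],7)  ^ ror32($W->[$i-15],18) ^ ($W->[$i-15] >> 3);
    my $s1 = ror32($W->[$i-2],17)  ^ ror32($W->[$i-2],19)  ^ ($W->[$i-2] >> 10);
    return ($W->[$i-16] + $s0 + $W->[$i-7] + $s1) & 0xffffffff;
}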
  218. $code=<<___;
  219. .text
  220. .extern OPENSSL_ia32cap_P
  221. .globl $func
  222. .type $func,\@function,3
  223. .align 16
  224. $func:
  225. ___
  226. $code.=<<___ if ($SZ==4 || $avx);
  227. lea OPENSSL_ia32cap_P(%rip),%r11
  228. mov 0(%r11),%r9d
  229. mov 4(%r11),%r10d
  230. mov 8(%r11),%r11d
  231. ___
  232. $code.=<<___ if ($SZ==4 && $shaext);
  233. test \$`1<<29`,%r11d # check for SHA
  234. jnz _shaext_shortcut
  235. ___
  236. $code.=<<___ if ($avx && $SZ==8);
  237. test \$`1<<11`,%r10d # check for XOP
  238. jnz .Lxop_shortcut
  239. ___
  240. $code.=<<___ if ($avx>1);
  241. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  242. cmp \$`1<<8|1<<5|1<<3`,%r11d
  243. je .Lavx2_shortcut
  244. ___
  245. $code.=<<___ if ($avx);
  246. and \$`1<<30`,%r9d # mask "Intel CPU" bit
  247. and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
  248. or %r9d,%r10d
  249. cmp \$`1<<28|1<<9|1<<30`,%r10d
  250. je .Lavx_shortcut
  251. ___
  252. $code.=<<___ if ($SZ==4);
  253. test \$`1<<9`,%r10d
  254. jnz .Lssse3_shortcut
  255. ___
  256. $code.=<<___;
  257. push %rbx
  258. push %rbp
  259. push %r12
  260. push %r13
  261. push %r14
  262. push %r15
  263. mov %rsp,%r11 # copy %rsp
  264. shl \$4,%rdx # num*16
  265. sub \$$framesz,%rsp
  266. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  267. and \$-64,%rsp # align stack frame
  268. mov $ctx,$_ctx # save ctx, 1st arg
  269. mov $inp,$_inp # save inp, 2nd arg
  270. mov %rdx,$_end # save end pointer, "3rd" arg
  271. mov %r11,$_rsp # save copy of %rsp
  272. .Lprologue:
  273. mov $SZ*0($ctx),$A
  274. mov $SZ*1($ctx),$B
  275. mov $SZ*2($ctx),$C
  276. mov $SZ*3($ctx),$D
  277. mov $SZ*4($ctx),$E
  278. mov $SZ*5($ctx),$F
  279. mov $SZ*6($ctx),$G
  280. mov $SZ*7($ctx),$H
  281. jmp .Lloop
  282. .align 16
  283. .Lloop:
  284. mov $B,$a3
  285. lea $TABLE(%rip),$Tbl
  286. xor $C,$a3 # magic
  287. ___
  288. for($i=0;$i<16;$i++) {
  289. $code.=" mov $SZ*$i($inp),$T1\n";
  290. $code.=" mov @ROT[4],$a0\n";
  291. $code.=" mov @ROT[0],$a1\n";
  292. $code.=" bswap $T1\n";
  293. &ROUND_00_15($i,@ROT);
  294. unshift(@ROT,pop(@ROT));
  295. }
  296. $code.=<<___;
  297. jmp .Lrounds_16_xx
  298. .align 16
  299. .Lrounds_16_xx:
  300. ___
  301. for(;$i<32;$i++) {
  302. &ROUND_16_XX($i,@ROT);
  303. unshift(@ROT,pop(@ROT));
  304. }
  305. $code.=<<___;
  306. cmpb \$0,`$SZ-1`($Tbl)
  307. jnz .Lrounds_16_xx
  308. mov $_ctx,$ctx
  309. add $a1,$A # modulo-scheduled h+=Sigma0(a)
  310. lea 16*$SZ($inp),$inp
  311. add $SZ*0($ctx),$A
  312. add $SZ*1($ctx),$B
  313. add $SZ*2($ctx),$C
  314. add $SZ*3($ctx),$D
  315. add $SZ*4($ctx),$E
  316. add $SZ*5($ctx),$F
  317. add $SZ*6($ctx),$G
  318. add $SZ*7($ctx),$H
  319. cmp $_end,$inp
  320. mov $A,$SZ*0($ctx)
  321. mov $B,$SZ*1($ctx)
  322. mov $C,$SZ*2($ctx)
  323. mov $D,$SZ*3($ctx)
  324. mov $E,$SZ*4($ctx)
  325. mov $F,$SZ*5($ctx)
  326. mov $G,$SZ*6($ctx)
  327. mov $H,$SZ*7($ctx)
  328. jb .Lloop
  329. mov $_rsp,%rsi
  330. mov (%rsi),%r15
  331. mov 8(%rsi),%r14
  332. mov 16(%rsi),%r13
  333. mov 24(%rsi),%r12
  334. mov 32(%rsi),%rbp
  335. mov 40(%rsi),%rbx
  336. lea 48(%rsi),%rsp
  337. .Lepilogue:
  338. ret
  339. .size $func,.-$func
  340. ___
  341. if ($SZ==4) {
  342. $code.=<<___;
  343. .align 64
  344. .type $TABLE,\@object
  345. $TABLE:
  346. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  347. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  348. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  349. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  350. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  351. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  352. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  353. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  354. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  355. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  356. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  357. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  358. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  359. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  360. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  361. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  362. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  363. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  364. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  365. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  366. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  367. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  368. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  369. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  370. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  371. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  372. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  373. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  374. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  375. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  376. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  377. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  378. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  379. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  380. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  381. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  382. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  383. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  384. .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  385. ___
  386. } else {
  387. $code.=<<___;
  388. .align 64
  389. .type $TABLE,\@object
  390. $TABLE:
  391. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  392. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  393. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  394. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  395. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  396. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  397. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  398. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  399. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  400. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  401. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  402. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  403. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  404. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  405. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  406. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  407. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  408. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  409. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  410. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  411. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  412. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  413. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  414. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  415. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  416. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  417. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  418. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  419. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  420. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  421. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  422. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  423. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  424. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  425. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  426. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  427. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  428. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  429. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  430. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  431. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  432. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  433. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  434. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  435. .quad 0xd192e819d6ef5218,0xd69906245565a910
  436. .quad 0xd192e819d6ef5218,0xd69906245565a910
  437. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  438. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  439. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  440. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  441. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  442. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  443. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  444. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  445. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  446. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  447. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  448. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  449. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  450. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  451. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  452. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  453. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  454. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  455. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  456. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  457. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  458. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  459. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  460. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  461. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  462. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  463. .quad 0x28db77f523047d84,0x32caab7b40c72493
  464. .quad 0x28db77f523047d84,0x32caab7b40c72493
  465. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  466. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  467. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  468. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  469. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  470. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  471. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  472. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  473. .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  474. ___
  475. }
  476. ######################################################################
  477. # SIMD code paths
  478. #
  479. if ($SZ==4 && $shaext) {{{
  480. ######################################################################
  481. # Intel SHA Extensions implementation of SHA256 update function.
  482. #
  483. my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
  484. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
  485. my @MSG=map("%xmm$_",(3..6));
  486. $code.=<<___;
  487. .type sha256_block_data_order_shaext,\@function,3
  488. .align 64
  489. sha256_block_data_order_shaext:
  490. _shaext_shortcut:
  491. ___
  492. $code.=<<___ if ($win64);
  493. lea `-8-5*16`(%rsp),%rsp
  494. movaps %xmm6,-8-5*16(%rax)
  495. movaps %xmm7,-8-4*16(%rax)
  496. movaps %xmm8,-8-3*16(%rax)
  497. movaps %xmm9,-8-2*16(%rax)
  498. movaps %xmm10,-8-1*16(%rax)
  499. .Lprologue_shaext:
  500. ___
  501. $code.=<<___;
  502. lea K256+0x80(%rip),$Tbl
  503. movdqu ($ctx),$ABEF # DCBA
  504. movdqu 16($ctx),$CDGH # HGFE
  505. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  506. pshufd \$0x1b,$ABEF,$Wi # ABCD
  507. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  508. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  509. movdqa $TMP,$BSWAP # offload
  510. palignr \$8,$CDGH,$ABEF # ABEF
  511. punpcklqdq $Wi,$CDGH # CDGH
  512. jmp .Loop_shaext
  513. .align 16
  514. .Loop_shaext:
  515. movdqu ($inp),@MSG[0]
  516. movdqu 0x10($inp),@MSG[1]
  517. movdqu 0x20($inp),@MSG[2]
  518. pshufb $TMP,@MSG[0]
  519. movdqu 0x30($inp),@MSG[3]
  520. movdqa 0*32-0x80($Tbl),$Wi
  521. paddd @MSG[0],$Wi
  522. pshufb $TMP,@MSG[1]
  523. movdqa $CDGH,$CDGH_SAVE # offload
  524. sha256rnds2 $ABEF,$CDGH # 0-3
  525. pshufd \$0x0e,$Wi,$Wi
  526. nop
  527. movdqa $ABEF,$ABEF_SAVE # offload
  528. sha256rnds2 $CDGH,$ABEF
  529. movdqa 1*32-0x80($Tbl),$Wi
  530. paddd @MSG[1],$Wi
  531. pshufb $TMP,@MSG[2]
  532. sha256rnds2 $ABEF,$CDGH # 4-7
  533. pshufd \$0x0e,$Wi,$Wi
  534. lea 0x40($inp),$inp
  535. sha256msg1 @MSG[1],@MSG[0]
  536. sha256rnds2 $CDGH,$ABEF
  537. movdqa 2*32-0x80($Tbl),$Wi
  538. paddd @MSG[2],$Wi
  539. pshufb $TMP,@MSG[3]
  540. sha256rnds2 $ABEF,$CDGH # 8-11
  541. pshufd \$0x0e,$Wi,$Wi
  542. movdqa @MSG[3],$TMP
  543. palignr \$4,@MSG[2],$TMP
  544. nop
  545. paddd $TMP,@MSG[0]
  546. sha256msg1 @MSG[2],@MSG[1]
  547. sha256rnds2 $CDGH,$ABEF
  548. movdqa 3*32-0x80($Tbl),$Wi
  549. paddd @MSG[3],$Wi
  550. sha256msg2 @MSG[3],@MSG[0]
  551. sha256rnds2 $ABEF,$CDGH # 12-15
  552. pshufd \$0x0e,$Wi,$Wi
  553. movdqa @MSG[0],$TMP
  554. palignr \$4,@MSG[3],$TMP
  555. nop
  556. paddd $TMP,@MSG[1]
  557. sha256msg1 @MSG[3],@MSG[2]
  558. sha256rnds2 $CDGH,$ABEF
  559. ___
  560. for($i=4;$i<16-3;$i++) {
  561. $code.=<<___;
  562. movdqa $i*32-0x80($Tbl),$Wi
  563. paddd @MSG[0],$Wi
  564. sha256msg2 @MSG[0],@MSG[1]
  565. sha256rnds2 $ABEF,$CDGH # 16-19...
  566. pshufd \$0x0e,$Wi,$Wi
  567. movdqa @MSG[1],$TMP
  568. palignr \$4,@MSG[0],$TMP
  569. nop
  570. paddd $TMP,@MSG[2]
  571. sha256msg1 @MSG[0],@MSG[3]
  572. sha256rnds2 $CDGH,$ABEF
  573. ___
  574. push(@MSG,shift(@MSG));
  575. }
  576. $code.=<<___;
  577. movdqa 13*32-0x80($Tbl),$Wi
  578. paddd @MSG[0],$Wi
  579. sha256msg2 @MSG[0],@MSG[1]
  580. sha256rnds2 $ABEF,$CDGH # 52-55
  581. pshufd \$0x0e,$Wi,$Wi
  582. movdqa @MSG[1],$TMP
  583. palignr \$4,@MSG[0],$TMP
  584. sha256rnds2 $CDGH,$ABEF
  585. paddd $TMP,@MSG[2]
  586. movdqa 14*32-0x80($Tbl),$Wi
  587. paddd @MSG[1],$Wi
  588. sha256rnds2 $ABEF,$CDGH # 56-59
  589. pshufd \$0x0e,$Wi,$Wi
  590. sha256msg2 @MSG[1],@MSG[2]
  591. movdqa $BSWAP,$TMP
  592. sha256rnds2 $CDGH,$ABEF
  593. movdqa 15*32-0x80($Tbl),$Wi
  594. paddd @MSG[2],$Wi
  595. nop
  596. sha256rnds2 $ABEF,$CDGH # 60-63
  597. pshufd \$0x0e,$Wi,$Wi
  598. dec $num
  599. nop
  600. sha256rnds2 $CDGH,$ABEF
  601. paddd $CDGH_SAVE,$CDGH
  602. paddd $ABEF_SAVE,$ABEF
  603. jnz .Loop_shaext
  604. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  605. pshufd \$0x1b,$ABEF,$TMP # FEBA
  606. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  607. punpckhqdq $CDGH,$ABEF # DCBA
  608. palignr \$8,$TMP,$CDGH # HGFE
  609. movdqu $ABEF,($ctx)
  610. movdqu $CDGH,16($ctx)
  611. ___
  612. $code.=<<___ if ($win64);
  613. movaps -8-5*16(%rax),%xmm6
  614. movaps -8-4*16(%rax),%xmm7
  615. movaps -8-3*16(%rax),%xmm8
  616. movaps -8-2*16(%rax),%xmm9
  617. movaps -8-1*16(%rax),%xmm10
  618. mov %rax,%rsp
  619. .Lepilogue_shaext:
  620. ___
  621. $code.=<<___;
  622. ret
  623. .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
  624. ___
  625. }}}
  626. {{{
  627. my $a4=$T1;
  628. my ($a,$b,$c,$d,$e,$f,$g,$h);
  629. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  630. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  631. my $arg = pop;
  632. $arg = "\$$arg" if ($arg*1 eq $arg);
  633. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  634. }
  635. sub body_00_15 () {
  636. (
  637. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  638. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  639. '&mov ($a,$a1)',
  640. '&mov ($a4,$f)',
  641. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  642. '&xor ($a0,$e)',
  643. '&xor ($a4,$g)', # f^g
  644. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  645. '&xor ($a1,$a)',
  646. '&and ($a4,$e)', # (f^g)&e
  647. '&xor ($a0,$e)',
  648. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  649. '&mov ($a2,$a)',
  650. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  651. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  652. '&xor ($a2,$b)', # a^b, b^c in next round
  653. '&add ($h,$a4)', # h+=Ch(e,f,g)
  654. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  655. '&and ($a3,$a2)', # (b^c)&(a^b)
  656. '&xor ($a1,$a)',
  657. '&add ($h,$a0)', # h+=Sigma1(e)
  658. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  659. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  660. '&add ($d,$h)', # d+=h
  661. '&add ($h,$a3)', # h+=Maj(a,b,c)
  662. '&mov ($a0,$d)',
  663. '&add ($a1,$h);'. # h+=Sigma0(a)
  664. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  665. );
  666. }
  667. ######################################################################
  668. # SSSE3 code path
  669. #
  670. if ($SZ==4) { # SHA256 only
  671. my @X = map("%xmm$_",(0..3));
  672. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  673. $code.=<<___;
  674. .type ${func}_ssse3,\@function,3
  675. .align 64
  676. ${func}_ssse3:
  677. .Lssse3_shortcut:
  678. push %rbx
  679. push %rbp
  680. push %r12
  681. push %r13
  682. push %r14
  683. push %r15
  684. mov %rsp,%r11 # copy %rsp
  685. shl \$4,%rdx # num*16
  686. sub \$`$framesz+$win64*16*4`,%rsp
  687. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  688. and \$-64,%rsp # align stack frame
  689. mov $ctx,$_ctx # save ctx, 1st arg
  690. mov $inp,$_inp # save inp, 2nd arg
  691. mov %rdx,$_end # save end pointer, "3rd" arg
  692. mov %r11,$_rsp # save copy of %rsp
  693. ___
  694. $code.=<<___ if ($win64);
  695. movaps %xmm6,16*$SZ+32(%rsp)
  696. movaps %xmm7,16*$SZ+48(%rsp)
  697. movaps %xmm8,16*$SZ+64(%rsp)
  698. movaps %xmm9,16*$SZ+80(%rsp)
  699. ___
  700. $code.=<<___;
  701. .Lprologue_ssse3:
  702. mov $SZ*0($ctx),$A
  703. mov $SZ*1($ctx),$B
  704. mov $SZ*2($ctx),$C
  705. mov $SZ*3($ctx),$D
  706. mov $SZ*4($ctx),$E
  707. mov $SZ*5($ctx),$F
  708. mov $SZ*6($ctx),$G
  709. mov $SZ*7($ctx),$H
  710. ___
  711. $code.=<<___;
  712. #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  713. #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  714. jmp .Lloop_ssse3
  715. .align 16
  716. .Lloop_ssse3:
  717. movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  718. movdqu 0x00($inp),@X[0]
  719. movdqu 0x10($inp),@X[1]
  720. movdqu 0x20($inp),@X[2]
  721. pshufb $t3,@X[0]
  722. movdqu 0x30($inp),@X[3]
  723. lea $TABLE(%rip),$Tbl
  724. pshufb $t3,@X[1]
  725. movdqa 0x00($Tbl),$t0
  726. movdqa 0x20($Tbl),$t1
  727. pshufb $t3,@X[2]
  728. paddd @X[0],$t0
  729. movdqa 0x40($Tbl),$t2
  730. pshufb $t3,@X[3]
  731. movdqa 0x60($Tbl),$t3
  732. paddd @X[1],$t1
  733. paddd @X[2],$t2
  734. paddd @X[3],$t3
  735. movdqa $t0,0x00(%rsp)
  736. mov $A,$a1
  737. movdqa $t1,0x10(%rsp)
  738. mov $B,$a3
  739. movdqa $t2,0x20(%rsp)
  740. xor $C,$a3 # magic
  741. movdqa $t3,0x30(%rsp)
  742. mov $E,$a0
  743. jmp .Lssse3_00_47
  744. .align 16
  745. .Lssse3_00_47:
  746. sub \$`-16*2*$SZ`,$Tbl # size optimization
  747. ___
  748. sub Xupdate_256_SSSE3 () {
  749. (
  750. '&movdqa ($t0,@X[1]);',
  751. '&movdqa ($t3,@X[3])',
  752. '&palignr ($t0,@X[0],$SZ)', # X[1..4]
  753. '&palignr ($t3,@X[2],$SZ);', # X[9..12]
  754. '&movdqa ($t1,$t0)',
  755. '&movdqa ($t2,$t0);',
  756. '&psrld ($t0,$sigma0[2])',
  757. '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
  758. '&psrld ($t2,$sigma0[0])',
  759. '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
  760. '&pslld ($t1,8*$SZ-$sigma0[1]);'.
  761. '&pxor ($t0,$t2)',
  762. '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
  763. '&pxor ($t0,$t1)',
  764. '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
  765. '&pxor ($t0,$t2);',
  766. '&movdqa ($t2,$t3)',
  767. '&pxor ($t0,$t1);', # sigma0(X[1..4])
  768. '&psrld ($t3,$sigma1[2])',
  769. '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
  770. '&psrlq ($t2,$sigma1[0])',
  771. '&pxor ($t3,$t2);',
  772. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  773. '&pxor ($t3,$t2)',
  774. '&pshufb ($t3,$t4)', # sigma1(X[14..15])
  775. '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  776. '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
  777. '&movdqa ($t2,$t3);',
  778. '&psrld ($t3,$sigma1[2])',
  779. '&psrlq ($t2,$sigma1[0])',
  780. '&pxor ($t3,$t2);',
  781. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  782. '&pxor ($t3,$t2);',
  783. '&movdqa ($t2,16*2*$j."($Tbl)")',
  784. '&pshufb ($t3,$t5)',
  785. '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
  786. );
  787. }
  788. sub SSSE3_256_00_47 () {
  789. my $j = shift;
  790. my $body = shift;
  791. my @X = @_;
  792. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  793. if (0) {
  794. foreach (Xupdate_256_SSSE3()) { # 36 instructions
  795. eval;
  796. eval(shift(@insns));
  797. eval(shift(@insns));
  798. eval(shift(@insns));
  799. }
  800. } else { # squeeze extra 4% on Westmere and 19% on Atom
  801. eval(shift(@insns)); #@
  802. &movdqa ($t0,@X[1]);
  803. eval(shift(@insns));
  804. eval(shift(@insns));
  805. &movdqa ($t3,@X[3]);
  806. eval(shift(@insns)); #@
  807. eval(shift(@insns));
  808. eval(shift(@insns));
  809. eval(shift(@insns)); #@
  810. eval(shift(@insns));
  811. &palignr ($t0,@X[0],$SZ); # X[1..4]
  812. eval(shift(@insns));
  813. eval(shift(@insns));
  814. &palignr ($t3,@X[2],$SZ); # X[9..12]
  815. eval(shift(@insns));
  816. eval(shift(@insns));
  817. eval(shift(@insns));
  818. eval(shift(@insns)); #@
  819. &movdqa ($t1,$t0);
  820. eval(shift(@insns));
  821. eval(shift(@insns));
  822. &movdqa ($t2,$t0);
  823. eval(shift(@insns)); #@
  824. eval(shift(@insns));
  825. &psrld ($t0,$sigma0[2]);
  826. eval(shift(@insns));
  827. eval(shift(@insns));
  828. eval(shift(@insns));
  829. &paddd (@X[0],$t3); # X[0..3] += X[9..12]
  830. eval(shift(@insns)); #@
  831. eval(shift(@insns));
  832. &psrld ($t2,$sigma0[0]);
  833. eval(shift(@insns));
  834. eval(shift(@insns));
  835. &pshufd ($t3,@X[3],0b11111010); # X[14..15]
  836. eval(shift(@insns));
  837. eval(shift(@insns)); #@
  838. &pslld ($t1,8*$SZ-$sigma0[1]);
  839. eval(shift(@insns));
  840. eval(shift(@insns));
  841. &pxor ($t0,$t2);
  842. eval(shift(@insns)); #@
  843. eval(shift(@insns));
  844. eval(shift(@insns));
  845. eval(shift(@insns)); #@
  846. &psrld ($t2,$sigma0[1]-$sigma0[0]);
  847. eval(shift(@insns));
  848. &pxor ($t0,$t1);
  849. eval(shift(@insns));
  850. eval(shift(@insns));
  851. &pslld ($t1,$sigma0[1]-$sigma0[0]);
  852. eval(shift(@insns));
  853. eval(shift(@insns));
  854. &pxor ($t0,$t2);
  855. eval(shift(@insns));
  856. eval(shift(@insns)); #@
  857. &movdqa ($t2,$t3);
  858. eval(shift(@insns));
  859. eval(shift(@insns));
  860. &pxor ($t0,$t1); # sigma0(X[1..4])
  861. eval(shift(@insns)); #@
  862. eval(shift(@insns));
  863. eval(shift(@insns));
  864. &psrld ($t3,$sigma1[2]);
  865. eval(shift(@insns));
  866. eval(shift(@insns));
  867. &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  868. eval(shift(@insns)); #@
  869. eval(shift(@insns));
  870. &psrlq ($t2,$sigma1[0]);
  871. eval(shift(@insns));
  872. eval(shift(@insns));
  873. eval(shift(@insns));
  874. &pxor ($t3,$t2);
  875. eval(shift(@insns)); #@
  876. eval(shift(@insns));
  877. eval(shift(@insns));
  878. eval(shift(@insns)); #@
  879. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  880. eval(shift(@insns));
  881. eval(shift(@insns));
  882. &pxor ($t3,$t2);
  883. eval(shift(@insns)); #@
  884. eval(shift(@insns));
  885. eval(shift(@insns));
  886. #&pshufb ($t3,$t4); # sigma1(X[14..15])
  887. &pshufd ($t3,$t3,0b10000000);
  888. eval(shift(@insns));
  889. eval(shift(@insns));
  890. eval(shift(@insns));
  891. &psrldq ($t3,8);
  892. eval(shift(@insns));
  893. eval(shift(@insns)); #@
  894. eval(shift(@insns));
  895. eval(shift(@insns));
  896. eval(shift(@insns)); #@
  897. &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  898. eval(shift(@insns));
  899. eval(shift(@insns));
  900. eval(shift(@insns));
  901. &pshufd ($t3,@X[0],0b01010000); # X[16..17]
  902. eval(shift(@insns));
  903. eval(shift(@insns)); #@
  904. eval(shift(@insns));
  905. &movdqa ($t2,$t3);
  906. eval(shift(@insns));
  907. eval(shift(@insns));
  908. &psrld ($t3,$sigma1[2]);
  909. eval(shift(@insns));
  910. eval(shift(@insns)); #@
  911. &psrlq ($t2,$sigma1[0]);
  912. eval(shift(@insns));
  913. eval(shift(@insns));
  914. &pxor ($t3,$t2);
  915. eval(shift(@insns)); #@
  916. eval(shift(@insns));
  917. eval(shift(@insns));
  918. eval(shift(@insns)); #@
  919. eval(shift(@insns));
  920. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  921. eval(shift(@insns));
  922. eval(shift(@insns));
  923. eval(shift(@insns));
  924. &pxor ($t3,$t2);
  925. eval(shift(@insns));
  926. eval(shift(@insns));
  927. eval(shift(@insns)); #@
  928. #&pshufb ($t3,$t5);
  929. &pshufd ($t3,$t3,0b00001000);
  930. eval(shift(@insns));
  931. eval(shift(@insns));
  932. &movdqa ($t2,16*2*$j."($Tbl)");
  933. eval(shift(@insns)); #@
  934. eval(shift(@insns));
  935. &pslldq ($t3,8);
  936. eval(shift(@insns));
  937. eval(shift(@insns));
  938. eval(shift(@insns));
  939. &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  940. eval(shift(@insns)); #@
  941. eval(shift(@insns));
  942. eval(shift(@insns));
  943. }
  944. &paddd ($t2,@X[0]);
  945. foreach (@insns) { eval; } # remaining instructions
  946. &movdqa (16*$j."(%rsp)",$t2);
  947. }
  948. for ($i=0,$j=0; $j<4; $j++) {
  949. &SSSE3_256_00_47($j,\&body_00_15,@X);
  950. push(@X,shift(@X)); # rotate(@X)
  951. }
  952. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  953. &jne (".Lssse3_00_47");
  954. for ($i=0; $i<16; ) {
  955. foreach(body_00_15()) { eval; }
  956. }
  957. $code.=<<___;
  958. mov $_ctx,$ctx
  959. mov $a1,$A
  960. add $SZ*0($ctx),$A
  961. lea 16*$SZ($inp),$inp
  962. add $SZ*1($ctx),$B
  963. add $SZ*2($ctx),$C
  964. add $SZ*3($ctx),$D
  965. add $SZ*4($ctx),$E
  966. add $SZ*5($ctx),$F
  967. add $SZ*6($ctx),$G
  968. add $SZ*7($ctx),$H
  969. cmp $_end,$inp
  970. mov $A,$SZ*0($ctx)
  971. mov $B,$SZ*1($ctx)
  972. mov $C,$SZ*2($ctx)
  973. mov $D,$SZ*3($ctx)
  974. mov $E,$SZ*4($ctx)
  975. mov $F,$SZ*5($ctx)
  976. mov $G,$SZ*6($ctx)
  977. mov $H,$SZ*7($ctx)
  978. jb .Lloop_ssse3
  979. mov $_rsp,%rsi
  980. ___
  981. $code.=<<___ if ($win64);
  982. movaps 16*$SZ+32(%rsp),%xmm6
  983. movaps 16*$SZ+48(%rsp),%xmm7
  984. movaps 16*$SZ+64(%rsp),%xmm8
  985. movaps 16*$SZ+80(%rsp),%xmm9
  986. ___
  987. $code.=<<___;
  988. mov (%rsi),%r15
  989. mov 8(%rsi),%r14
  990. mov 16(%rsi),%r13
  991. mov 24(%rsi),%r12
  992. mov 32(%rsi),%rbp
  993. mov 40(%rsi),%rbx
  994. lea 48(%rsi),%rsp
  995. .Lepilogue_ssse3:
  996. ret
  997. .size ${func}_ssse3,.-${func}_ssse3
  998. ___
  999. }
  1000. if ($avx) {{
  1001. ######################################################################
  1002. # XOP code path
  1003. #
  1004. if ($SZ==8) { # SHA512 only
  1005. $code.=<<___;
  1006. .type ${func}_xop,\@function,3
  1007. .align 64
  1008. ${func}_xop:
  1009. .Lxop_shortcut:
  1010. push %rbx
  1011. push %rbp
  1012. push %r12
  1013. push %r13
  1014. push %r14
  1015. push %r15
  1016. mov %rsp,%r11 # copy %rsp
  1017. shl \$4,%rdx # num*16
  1018. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1019. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1020. and \$-64,%rsp # align stack frame
  1021. mov $ctx,$_ctx # save ctx, 1st arg
  1022. mov $inp,$_inp # save inp, 2nd arg
  1023. mov %rdx,$_end # save end pointer, "3rd" arg
  1024. mov %r11,$_rsp # save copy of %rsp
  1025. ___
  1026. $code.=<<___ if ($win64);
  1027. movaps %xmm6,16*$SZ+32(%rsp)
  1028. movaps %xmm7,16*$SZ+48(%rsp)
  1029. movaps %xmm8,16*$SZ+64(%rsp)
  1030. movaps %xmm9,16*$SZ+80(%rsp)
  1031. ___
  1032. $code.=<<___ if ($win64 && $SZ>4);
  1033. movaps %xmm10,16*$SZ+96(%rsp)
  1034. movaps %xmm11,16*$SZ+112(%rsp)
  1035. ___
  1036. $code.=<<___;
  1037. .Lprologue_xop:
  1038. vzeroupper
  1039. mov $SZ*0($ctx),$A
  1040. mov $SZ*1($ctx),$B
  1041. mov $SZ*2($ctx),$C
  1042. mov $SZ*3($ctx),$D
  1043. mov $SZ*4($ctx),$E
  1044. mov $SZ*5($ctx),$F
  1045. mov $SZ*6($ctx),$G
  1046. mov $SZ*7($ctx),$H
  1047. jmp .Lloop_xop
  1048. ___
  1049. if ($SZ==4) { # SHA256
  1050. my @X = map("%xmm$_",(0..3));
  1051. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  1052. $code.=<<___;
  1053. .align 16
  1054. .Lloop_xop:
  1055. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1056. vmovdqu 0x00($inp),@X[0]
  1057. vmovdqu 0x10($inp),@X[1]
  1058. vmovdqu 0x20($inp),@X[2]
  1059. vmovdqu 0x30($inp),@X[3]
  1060. vpshufb $t3,@X[0],@X[0]
  1061. lea $TABLE(%rip),$Tbl
  1062. vpshufb $t3,@X[1],@X[1]
  1063. vpshufb $t3,@X[2],@X[2]
  1064. vpaddd 0x00($Tbl),@X[0],$t0
  1065. vpshufb $t3,@X[3],@X[3]
  1066. vpaddd 0x20($Tbl),@X[1],$t1
  1067. vpaddd 0x40($Tbl),@X[2],$t2
  1068. vpaddd 0x60($Tbl),@X[3],$t3
  1069. vmovdqa $t0,0x00(%rsp)
  1070. mov $A,$a1
  1071. vmovdqa $t1,0x10(%rsp)
  1072. mov $B,$a3
  1073. vmovdqa $t2,0x20(%rsp)
  1074. xor $C,$a3 # magic
  1075. vmovdqa $t3,0x30(%rsp)
  1076. mov $E,$a0
  1077. jmp .Lxop_00_47
  1078. .align 16
  1079. .Lxop_00_47:
  1080. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1081. ___
  1082. sub XOP_256_00_47 () {
  1083. my $j = shift;
  1084. my $body = shift;
  1085. my @X = @_;
  1086. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1087. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  1088. eval(shift(@insns));
  1089. eval(shift(@insns));
  1090. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  1091. eval(shift(@insns));
  1092. eval(shift(@insns));
  1093. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  1094. eval(shift(@insns));
  1095. eval(shift(@insns));
  1096. &vpsrld ($t0,$t0,$sigma0[2]);
  1097. eval(shift(@insns));
  1098. eval(shift(@insns));
  1099. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  1100. eval(shift(@insns));
  1101. eval(shift(@insns));
  1102. eval(shift(@insns));
  1103. eval(shift(@insns));
  1104. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1105. eval(shift(@insns));
  1106. eval(shift(@insns));
  1107. &vpxor ($t0,$t0,$t1);
  1108. eval(shift(@insns));
  1109. eval(shift(@insns));
  1110. eval(shift(@insns));
  1111. eval(shift(@insns));
  1112. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  1113. eval(shift(@insns));
  1114. eval(shift(@insns));
  1115. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  1116. eval(shift(@insns));
  1117. eval(shift(@insns));
  1118. &vpsrld ($t2,@X[3],$sigma1[2]);
  1119. eval(shift(@insns));
  1120. eval(shift(@insns));
  1121. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  1122. eval(shift(@insns));
  1123. eval(shift(@insns));
  1124. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1125. eval(shift(@insns));
  1126. eval(shift(@insns));
  1127. &vpxor ($t3,$t3,$t2);
  1128. eval(shift(@insns));
  1129. eval(shift(@insns));
  1130. eval(shift(@insns));
  1131. eval(shift(@insns));
  1132. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1133. eval(shift(@insns));
  1134. eval(shift(@insns));
  1135. eval(shift(@insns));
  1136. eval(shift(@insns));
  1137. &vpsrldq ($t3,$t3,8);
  1138. eval(shift(@insns));
  1139. eval(shift(@insns));
  1140. eval(shift(@insns));
  1141. eval(shift(@insns));
  1142. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1143. eval(shift(@insns));
  1144. eval(shift(@insns));
  1145. eval(shift(@insns));
  1146. eval(shift(@insns));
  1147. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  1148. eval(shift(@insns));
  1149. eval(shift(@insns));
  1150. &vpsrld ($t2,@X[0],$sigma1[2]);
  1151. eval(shift(@insns));
  1152. eval(shift(@insns));
  1153. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1154. eval(shift(@insns));
  1155. eval(shift(@insns));
  1156. &vpxor ($t3,$t3,$t2);
  1157. eval(shift(@insns));
  1158. eval(shift(@insns));
  1159. eval(shift(@insns));
  1160. eval(shift(@insns));
  1161. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  1162. eval(shift(@insns));
  1163. eval(shift(@insns));
  1164. eval(shift(@insns));
  1165. eval(shift(@insns));
  1166. &vpslldq ($t3,$t3,8); # 22 instructions
  1167. eval(shift(@insns));
  1168. eval(shift(@insns));
  1169. eval(shift(@insns));
  1170. eval(shift(@insns));
  1171. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  1172. eval(shift(@insns));
  1173. eval(shift(@insns));
  1174. eval(shift(@insns));
  1175. eval(shift(@insns));
  1176. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1177. foreach (@insns) { eval; } # remaining instructions
  1178. &vmovdqa (16*$j."(%rsp)",$t2);
  1179. }
  1180. for ($i=0,$j=0; $j<4; $j++) {
  1181. &XOP_256_00_47($j,\&body_00_15,@X);
  1182. push(@X,shift(@X)); # rotate(@X)
  1183. }
  1184. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1185. &jne (".Lxop_00_47");
  1186. for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
} else { # SHA512
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
$code.=<<___;
.align 16
.Lloop_xop:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0]
vmovdqu 0x30($inp),@X[3]
vpshufb $t3,@X[1],@X[1]
vmovdqu 0x40($inp),@X[4]
vpshufb $t3,@X[2],@X[2]
vmovdqu 0x50($inp),@X[5]
vpshufb $t3,@X[3],@X[3]
vmovdqu 0x60($inp),@X[6]
vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp)
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp)
mov $A,$a1
vmovdqa $t1,0x50(%rsp)
mov $B,$a3
vmovdqa $t2,0x60(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x70(%rsp)
mov $E,$a0
jmp .Lxop_00_47
.align 16
.Lxop_00_47:
add \$`16*2*$SZ`,$Tbl
___
sub XOP_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body); # 52 instructions
&vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
eval(shift(@insns));
eval(shift(@insns));
&vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
eval(shift(@insns));
eval(shift(@insns));
&vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
eval(shift(@insns));
eval(shift(@insns));
&vpsrlq ($t0,$t0,$sigma0[2]);
eval(shift(@insns));
eval(shift(@insns));
&vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
eval(shift(@insns));
eval(shift(@insns));
&vpxor ($t0,$t0,$t1);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
eval(shift(@insns));
eval(shift(@insns));
&vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
eval(shift(@insns));
eval(shift(@insns));
&vpsrlq ($t2,@X[7],$sigma1[2]);
eval(shift(@insns));
eval(shift(@insns));
&vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
eval(shift(@insns));
eval(shift(@insns));
&vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
eval(shift(@insns));
eval(shift(@insns));
&vpxor ($t3,$t3,$t2);
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
}
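# Note (added for clarity): XOP_512_00_47() interleaves the vector update of
# two 64-bit schedule words with two scalar rounds from body_00_15() -- the 52
# queued @insns above.  It follows the same sigma0/sigma1 recurrence as
# Xupdate_512_AVX() further down, but leans on XOP's vprotq, a true packed
# 64-bit rotate, so only the plain right shifts still need vpsrlq.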
for ($i=0,$j=0; $j<8; $j++) {
&XOP_512_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
&jne (".Lxop_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
}
$code.=<<___;
mov $_ctx,$ctx
mov $a1,$A
add $SZ*0($ctx),$A
lea 16*$SZ($inp),$inp
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop_xop
mov $_rsp,%rsi
vzeroupper
___
$code.=<<___ if ($win64);
movaps 16*$SZ+32(%rsp),%xmm6
movaps 16*$SZ+48(%rsp),%xmm7
movaps 16*$SZ+64(%rsp),%xmm8
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+96(%rsp),%xmm10
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lepilogue_xop:
ret
.size ${func}_xop,.-${func}_xop
___
}
######################################################################
# AVX+shrd code path
#
local *ror = sub { &shrd(@_[0],@_) };
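# Note (added for clarity): with identical source and destination registers,
# "shrd \$imm,%reg,%reg" is equivalent to "ror \$imm,%reg", so overriding
# ror() in terms of &shrd lets the existing body_00_15() round code be reused
# unchanged while this code path emits shrd instead of ror.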
$code.=<<___;
.type ${func}_avx,\@function,3
.align 64
${func}_avx:
.Lavx_shortcut:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
shl \$4,%rdx # num*16
sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
and \$-64,%rsp # align stack frame
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
movaps %xmm7,16*$SZ+48(%rsp)
movaps %xmm8,16*$SZ+64(%rsp)
movaps %xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
movaps %xmm10,16*$SZ+96(%rsp)
movaps %xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx:
vzeroupper
mov $SZ*0($ctx),$A
mov $SZ*1($ctx),$B
mov $SZ*2($ctx),$C
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
if ($SZ==4) { # SHA256
my @X = map("%xmm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
$code.=<<___;
vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Lloop_avx
.align 16
.Lloop_avx:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vmovdqu 0x30($inp),@X[3]
vpshufb $t3,@X[0],@X[0]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[1],@X[1]
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
mov $A,$a1
vmovdqa $t1,0x10(%rsp)
mov $B,$a3
vmovdqa $t2,0x20(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x30(%rsp)
mov $E,$a0
jmp .Lavx_00_47
.align 16
.Lavx_00_47:
sub \$`-16*2*$SZ`,$Tbl # size optimization
___
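# Note (added for clarity): in this SHA-256 branch ($SZ==4) the instruction
# above evaluates to "sub \$-128,$Tbl", which advances the table pointer by
# 128 bytes; -128 fits in a sign-extended 8-bit immediate, whereas "add \$128"
# would need a 32-bit immediate -- that is the "size optimization" meant here.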
sub Xupdate_256_AVX () {
(
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
'&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
'&vpsrld ($t2,$t0,$sigma0[0]);',
'&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
'&vpsrld ($t3,$t0,$sigma0[2])',
'&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
'&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrld ($t2,$t3,$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
'&vpsrlq ($t3,$t3,$sigma1[0]);',
'&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
'&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
'&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
'&vpsrld ($t2,$t3,$sigma1[2])',
'&vpsrlq ($t3,$t3,$sigma1[0])',
'&vpxor ($t2,$t2,$t3);',
'&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
'&vpxor ($t2,$t2,$t3)',
'&vpshufb ($t2,$t2,$t5)',
'&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
);
}
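# Note (added for clarity): the list above vectorizes four steps of the
# SHA-256 message schedule from FIPS 180-4,
#   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
# where sigma0(x) = ROTR^7(x) ^ ROTR^18(x) ^ (x >> 3) and
#       sigma1(x) = ROTR^17(x) ^ ROTR^19(x) ^ (x >> 10),
# assuming @sigma0/@sigma1 hold (7,18,3)/(17,19,10) as set earlier in the
# file.  AVX has no packed rotate, so the rotates are pieced together from
# shift, xor and byte-shuffle steps.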
sub AVX_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
foreach (Xupdate_256_AVX()) { # 29 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
&AVX_256_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
&jne (".Lavx_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
} else { # SHA512
my @X = map("%xmm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
$code.=<<___;
jmp .Lloop_avx
.align 16
.Lloop_avx:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu 0x00($inp),@X[0]
lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu 0x10($inp),@X[1]
vmovdqu 0x20($inp),@X[2]
vpshufb $t3,@X[0],@X[0]
vmovdqu 0x30($inp),@X[3]
vpshufb $t3,@X[1],@X[1]
vmovdqu 0x40($inp),@X[4]
vpshufb $t3,@X[2],@X[2]
vmovdqu 0x50($inp),@X[5]
vpshufb $t3,@X[3],@X[3]
vmovdqu 0x60($inp),@X[6]
vpshufb $t3,@X[4],@X[4]
vmovdqu 0x70($inp),@X[7]
vpshufb $t3,@X[5],@X[5]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t3,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t3,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x10(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x20(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x30(%rsp)
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x40(%rsp)
mov $A,$a1
vmovdqa $t1,0x50(%rsp)
mov $B,$a3
vmovdqa $t2,0x60(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x70(%rsp)
mov $E,$a0
jmp .Lavx_00_47
.align 16
.Lavx_00_47:
add \$`16*2*$SZ`,$Tbl
___
sub Xupdate_512_AVX () {
(
'&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
'&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
'&vpsrlq ($t2,$t0,$sigma0[0])',
'&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
'&vpsrlq ($t3,$t0,$sigma0[2])',
'&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
'&vpxor ($t0,$t3,$t2)',
'&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t1)',
'&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
'&vpxor ($t0,$t0,$t2)',
'&vpsrlq ($t3,@X[7],$sigma1[2]);',
'&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
'&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
'&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
'&vpsrlq ($t1,@X[7],$sigma1[0]);',
'&vpxor ($t3,$t3,$t2)',
'&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
'&vpxor ($t3,$t3,$t1)',
'&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
'&vpxor ($t3,$t3,$t2)',
'&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
'&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
);
}
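# Note (added for clarity): same recurrence for SHA-512, two 64-bit schedule
# words at a time,
#   W[t] = sigma1(W[t-2]) + W[t-7] + sigma0(W[t-15]) + W[t-16],
# with sigma0(x) = ROTR^1(x) ^ ROTR^8(x) ^ (x >> 7) and
#      sigma1(x) = ROTR^19(x) ^ ROTR^61(x) ^ (x >> 6);
# here each 64-bit rotate is built from a vpsllq/vpsrlq pair combined with
# vpxor.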
sub AVX_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body); # 52 instructions
foreach (Xupdate_512_AVX()) { # 23 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa (16*$j."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<8; $j++) {
&AVX_512_00_47($j,\&body_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
&jne (".Lavx_00_47");
for ($i=0; $i<16; ) {
foreach(body_00_15()) { eval; }
}
}
$code.=<<___;
mov $_ctx,$ctx
mov $a1,$A
add $SZ*0($ctx),$A
lea 16*$SZ($inp),$inp
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jb .Lloop_avx
mov $_rsp,%rsi
vzeroupper
___
$code.=<<___ if ($win64);
movaps 16*$SZ+32(%rsp),%xmm6
movaps 16*$SZ+48(%rsp),%xmm7
movaps 16*$SZ+64(%rsp),%xmm8
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+96(%rsp),%xmm10
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lepilogue_avx:
ret
.size ${func}_avx,.-${func}_avx
___
if ($avx>1) {{
######################################################################
# AVX2+BMI code path
#
my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
my $PUSH8=8*2*$SZ;
use integer;
sub bodyx_00_15 () {
# at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
(
'($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
'&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
'&and ($a4,$e)', # f&e
'&rorx ($a0,$e,$Sigma1[2])',
'&rorx ($a2,$e,$Sigma1[1])',
'&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
'&lea ($h,"($h,$a4)")',
'&andn ($a4,$e,$g)', # ~e&g
'&xor ($a0,$a2)',
'&rorx ($a1,$e,$Sigma1[0])',
'&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
'&xor ($a0,$a1)', # Sigma1(e)
'&mov ($a2,$a)',
'&rorx ($a4,$a,$Sigma0[2])',
'&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
'&xor ($a2,$b)', # a^b, b^c in next round
'&rorx ($a1,$a,$Sigma0[1])',
'&rorx ($a0,$a,$Sigma0[0])',
'&lea ($d,"($d,$h)")', # d+=h
'&and ($a3,$a2)', # (b^c)&(a^b)
'&xor ($a1,$a4)',
'&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
'&xor ($a1,$a0)', # Sigma0(a)
'&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
'&mov ($a4,$e)', # copy of f in future
'($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
);
# and at the finish one has to $a+=$a1
}
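# Note (added for clarity): bodyx_00_15() emits one SHA-2 round in the usual
# two-temporary form,
#   T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + W[i]
#   T2 = Sigma0(a) + Maj(a,b,c)
#   (a,b,c,d,e,f,g,h) <- (T1+T2, a, b, c, d+T1, e, f, g)
# except that, as the comments note, Sigma0(a) is left in $a1 and only folded
# into the accumulator at the start of the following round ("from the past"),
# hence the final $a+=$a1 mentioned above.  Register renaming is handled by
# rotating @ROT rather than moving values around.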
$code.=<<___;
.type ${func}_avx2,\@function,3
.align 64
${func}_avx2:
.Lavx2_shortcut:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov %rsp,%r11 # copy %rsp
sub \$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
shl \$4,%rdx # num*16
and \$-256*$SZ,%rsp # align stack frame
lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
add \$`2*$SZ*($rounds-8)`,%rsp
mov $ctx,$_ctx # save ctx, 1st arg
mov $inp,$_inp # save inp, 2nd arg
mov %rdx,$_end # save end pointer, "3rd" arg
mov %r11,$_rsp # save copy of %rsp
___
$code.=<<___ if ($win64);
movaps %xmm6,16*$SZ+32(%rsp)
movaps %xmm7,16*$SZ+48(%rsp)
movaps %xmm8,16*$SZ+64(%rsp)
movaps %xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
movaps %xmm10,16*$SZ+96(%rsp)
movaps %xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
vzeroupper
sub \$-16*$SZ,$inp # inp++, size optimization
mov $SZ*0($ctx),$A
mov $inp,%r12 # borrow $T1
mov $SZ*1($ctx),$B
cmp %rdx,$inp # $_end
mov $SZ*2($ctx),$C
cmove %rsp,%r12 # next block or random data
mov $SZ*3($ctx),$D
mov $SZ*4($ctx),$E
mov $SZ*5($ctx),$F
mov $SZ*6($ctx),$G
mov $SZ*7($ctx),$H
___
if ($SZ==4) { # SHA256
my @X = map("%ymm$_",(0..3));
my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));
$code.=<<___;
vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
jmp .Loop_avx2
.align 16
.Loop_avx2:
vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
vmovdqu -16*$SZ+0($inp),%xmm0
vmovdqu -16*$SZ+16($inp),%xmm1
vmovdqu -16*$SZ+32($inp),%xmm2
vmovdqu -16*$SZ+48($inp),%xmm3
#mov $inp,$_inp # offload $inp
vinserti128 \$1,(%r12),@X[0],@X[0]
vinserti128 \$1,16(%r12),@X[1],@X[1]
vpshufb $t3,@X[0],@X[0]
vinserti128 \$1,32(%r12),@X[2],@X[2]
vpshufb $t3,@X[1],@X[1]
vinserti128 \$1,48(%r12),@X[3],@X[3]
lea $TABLE(%rip),$Tbl
vpshufb $t3,@X[2],@X[2]
vpaddd 0x00($Tbl),@X[0],$t0
vpshufb $t3,@X[3],@X[3]
vpaddd 0x20($Tbl),@X[1],$t1
vpaddd 0x40($Tbl),@X[2],$t2
vpaddd 0x60($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
lea -$PUSH8(%rsp),%rsp
mov $B,$a3
vmovdqa $t2,0x00(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x20(%rsp)
mov $F,$a4
sub \$-16*2*$SZ,$Tbl # size optimization
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body); # 96 instructions
my $base = "+2*$PUSH8(%rsp)";
&lea ("%rsp","-$PUSH8(%rsp)") if (($j%2)==0);
foreach (Xupdate_256_AVX()) { # 29 instructions
eval;
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
&vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<4; $j++) {
&AVX2_256_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1)."($Tbl)",0);
&jne (".Lavx2_00_47");
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
}
} else { # SHA512
my @X = map("%ymm$_",(0..7));
my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));
$code.=<<___;
jmp .Loop_avx2
.align 16
.Loop_avx2:
vmovdqu -16*$SZ($inp),%xmm0
vmovdqu -16*$SZ+16($inp),%xmm1
vmovdqu -16*$SZ+32($inp),%xmm2
lea $TABLE+0x80(%rip),$Tbl # size optimization
vmovdqu -16*$SZ+48($inp),%xmm3
vmovdqu -16*$SZ+64($inp),%xmm4
vmovdqu -16*$SZ+80($inp),%xmm5
vmovdqu -16*$SZ+96($inp),%xmm6
vmovdqu -16*$SZ+112($inp),%xmm7
#mov $inp,$_inp # offload $inp
vmovdqa `$SZ*2*$rounds-0x80`($Tbl),$t2
vinserti128 \$1,(%r12),@X[0],@X[0]
vinserti128 \$1,16(%r12),@X[1],@X[1]
vpshufb $t2,@X[0],@X[0]
vinserti128 \$1,32(%r12),@X[2],@X[2]
vpshufb $t2,@X[1],@X[1]
vinserti128 \$1,48(%r12),@X[3],@X[3]
vpshufb $t2,@X[2],@X[2]
vinserti128 \$1,64(%r12),@X[4],@X[4]
vpshufb $t2,@X[3],@X[3]
vinserti128 \$1,80(%r12),@X[5],@X[5]
vpshufb $t2,@X[4],@X[4]
vinserti128 \$1,96(%r12),@X[6],@X[6]
vpshufb $t2,@X[5],@X[5]
vinserti128 \$1,112(%r12),@X[7],@X[7]
vpaddq -0x80($Tbl),@X[0],$t0
vpshufb $t2,@X[6],@X[6]
vpaddq -0x60($Tbl),@X[1],$t1
vpshufb $t2,@X[7],@X[7]
vpaddq -0x40($Tbl),@X[2],$t2
vpaddq -0x20($Tbl),@X[3],$t3
vmovdqa $t0,0x00(%rsp)
vpaddq 0x00($Tbl),@X[4],$t0
vmovdqa $t1,0x20(%rsp)
vpaddq 0x20($Tbl),@X[5],$t1
vmovdqa $t2,0x40(%rsp)
vpaddq 0x40($Tbl),@X[6],$t2
vmovdqa $t3,0x60(%rsp)
lea -$PUSH8(%rsp),%rsp
vpaddq 0x60($Tbl),@X[7],$t3
vmovdqa $t0,0x00(%rsp)
xor $a1,$a1
vmovdqa $t1,0x20(%rsp)
mov $B,$a3
vmovdqa $t2,0x40(%rsp)
xor $C,$a3 # magic
vmovdqa $t3,0x60(%rsp)
mov $F,$a4
add \$16*2*$SZ,$Tbl
jmp .Lavx2_00_47
.align 16
.Lavx2_00_47:
___
sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body); # 48 instructions
my $base = "+2*$PUSH8(%rsp)";
&lea ("%rsp","-$PUSH8(%rsp)") if (($j%4)==0);
foreach (Xupdate_512_AVX()) { # 23 instructions
eval;
if ($_ !~ /\;$/) {
eval(shift(@insns));
eval(shift(@insns));
eval(shift(@insns));
}
}
&vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
foreach (@insns) { eval; } # remaining instructions
&vmovdqa ((32*$j)%$PUSH8."(%rsp)",$t2);
}
for ($i=0,$j=0; $j<8; $j++) {
&AVX2_512_00_47($j,\&bodyx_00_15,@X);
push(@X,shift(@X)); # rotate(@X)
}
&lea ($Tbl,16*2*$SZ."($Tbl)");
&cmpb (($SZ-1-0x80)."($Tbl)",0);
&jne (".Lavx2_00_47");
for ($i=0; $i<16; ) {
my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
foreach(bodyx_00_15()) { eval; }
}
}
$code.=<<___;
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),$Tbl
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
add $SZ*6($ctx),$G
add $SZ*7($ctx),$H
mov $A,$SZ*0($ctx)
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
cmp `$PUSH8+2*8`($Tbl),$inp # $_end
je .Ldone_avx2
xor $a1,$a1
mov $B,$a3
xor $C,$a3 # magic
mov $F,$a4
jmp .Lower_avx2
.align 16
.Lower_avx2:
___
for ($i=0; $i<8; ) {
my $base="+16($Tbl)";
foreach(bodyx_00_15()) { eval; }
}
$code.=<<___;
lea -$PUSH8($Tbl),$Tbl
cmp %rsp,$Tbl
jae .Lower_avx2
mov `2*$SZ*$rounds`(%rsp),$ctx # $_ctx
add $a1,$A
#mov `2*$SZ*$rounds+8`(%rsp),$inp # $_inp
lea `2*$SZ*($rounds-8)`(%rsp),%rsp
add $SZ*0($ctx),$A
add $SZ*1($ctx),$B
add $SZ*2($ctx),$C
add $SZ*3($ctx),$D
add $SZ*4($ctx),$E
add $SZ*5($ctx),$F
lea `2*16*$SZ`($inp),$inp # inp+=2
add $SZ*6($ctx),$G
mov $inp,%r12
add $SZ*7($ctx),$H
cmp $_end,$inp
mov $A,$SZ*0($ctx)
cmove %rsp,%r12 # next block or stale data
mov $B,$SZ*1($ctx)
mov $C,$SZ*2($ctx)
mov $D,$SZ*3($ctx)
mov $E,$SZ*4($ctx)
mov $F,$SZ*5($ctx)
mov $G,$SZ*6($ctx)
mov $H,$SZ*7($ctx)
jbe .Loop_avx2
lea (%rsp),$Tbl
.Ldone_avx2:
lea ($Tbl),%rsp
mov $_rsp,%rsi
vzeroupper
___
$code.=<<___ if ($win64);
movaps 16*$SZ+32(%rsp),%xmm6
movaps 16*$SZ+48(%rsp),%xmm7
movaps 16*$SZ+64(%rsp),%xmm8
movaps 16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
movaps 16*$SZ+96(%rsp),%xmm10
movaps 16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
mov (%rsi),%r15
mov 8(%rsi),%r14
mov 16(%rsi),%r13
mov 24(%rsi),%r12
mov 32(%rsi),%rbp
mov 40(%rsi),%rbx
lea 48(%rsi),%rsp
.Lepilogue_avx2:
ret
.size ${func}_avx2,.-${func}_avx2
___
}}
}}}}}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
___
$code.=<<___ if ($avx>1);
lea .Lavx2_shortcut(%rip),%r10
cmp %r10,%rbx # context->Rip<avx2_shortcut
jb .Lnot_in_avx2
and \$-256*$SZ,%rax
add \$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
mov %rax,%rsi # put aside Rsp
mov 16*$SZ+3*8(%rax),%rax # pull $_rsp
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
lea .Lepilogue(%rip),%r10
cmp %r10,%rbx
jb .Lin_prologue # non-AVX code
lea 16*$SZ+4*8(%rsi),%rsi # Xmm6- save area
lea 512($context),%rdi # &context.Xmm6
mov \$`$SZ==4?8:12`,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lin_prologue:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
___
$code.=<<___ if ($SZ==4 && $shaext);
.type shaext_handler,\@abi-omnipotent
.align 16
shaext_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
lea .Lprologue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip<.Lprologue_shaext
jb .Lin_prologue
lea .Lepilogue_shaext(%rip),%r10
cmp %r10,%rbx # context->Rip>=.Lepilogue_shaext
jae .Lin_prologue
lea -8-5*16(%rax),%rsi
lea 512($context),%rdi # &context.Xmm6
mov \$10,%ecx
.long 0xa548f3fc # cld; rep movsq
jmp .Lin_prologue
.size shaext_handler,.-shaext_handler
___
$code.=<<___;
.section .pdata
.align 4
.rva .LSEH_begin_$func
.rva .LSEH_end_$func
.rva .LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
.rva .LSEH_begin_${func}_shaext
.rva .LSEH_end_${func}_shaext
.rva .LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
.rva .LSEH_begin_${func}_ssse3
.rva .LSEH_end_${func}_ssse3
.rva .LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
.rva .LSEH_begin_${func}_xop
.rva .LSEH_end_${func}_xop
.rva .LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_${func}_avx
.rva .LSEH_end_${func}_avx
.rva .LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_${func}_avx2
.rva .LSEH_end_${func}_avx2
.rva .LSEH_info_${func}_avx2
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_$func:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue,.Lepilogue # HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
.byte 9,0,0,0
.rva shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_xop,.Lepilogue_xop # HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_avx,.Lepilogue_avx # HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
.byte 9,0,0,0
.rva se_handler
.rva .Lprologue_avx2,.Lepilogue_avx2 # HandlerData[]
___
}
sub sha256op38 {
my $instr = shift;
my %opcodelet = (
"sha256rnds2" => 0xcb,
"sha256msg1" => 0xcc,
"sha256msg2" => 0xcd );
if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
my @opcode=(0x0f,0x38);
push @opcode,$opcodelet{$instr};
push @opcode,0xc0|($1&7)|(($2&7)<<3); # ModR/M
return ".byte\t".join(',',@opcode);
} else {
return $instr."\t".@_[0];
}
}
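# Note (added for clarity): sha256op38() hand-encodes the SHA-NI instructions
# for assemblers that do not know them.  As a worked example, the hypothetical
# operand string "%xmm4,%xmm3" with sha256msg1 would come out as
#   .byte 0x0f,0x38,0xcc,0xdc
# since 0xc0|(4&7)|((3&7)<<3) == 0xdc (ModR/M with reg=destination 3,
# r/m=source 4); anything not matching the %xmm0-%xmm7,%xmm0-%xmm7 pattern is
# passed through untouched.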
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval $1/geo;
s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;
print $_,"\n";
}
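# Note (added for clarity): the loop above post-processes $code before
# printing it: backquoted Perl expressions are evaluated in place (for
# instance `$SZ*2*$rounds` becomes 512 for SHA-256 or 1280 for SHA-512, given
# the $SZ/$rounds values chosen earlier in the file), and sha256* mnemonics
# are routed through sha256op38() so they can be emitted as raw .byte
# sequences when needed.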
close STDOUT;