  1. #!/usr/bin/env perl
  2. #
  3. # ====================================================================
  4. # Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
  5. # project. Rights for redistribution and usage in source and binary
  6. # forms are granted according to the OpenSSL license.
  7. # ====================================================================
  8. #
  9. # sha256/512_block procedure for x86_64.
  10. #
  11. # 40% improvement over compiler-generated code on Opteron. On EM64T
  12. # sha256 was observed to run >80% faster and sha512 >40% faster. No
  13. # magical tricks, just a straight implementation... I really wonder why
  14. # gcc [even armed with inline assembler] fails to generate equally fast
  15. # code. The only thing which is cool about this module is that the very
  16. # same instruction sequence is used for both SHA-256 and SHA-512. In the
  17. # former case the instructions operate on 32-bit operands, in the latter
  18. # on 64-bit ones. All I had to do was get one flavor right; the other
  19. # one passed the test right away:-)
  20. #
  21. # sha256_block runs in ~1005 cycles on Opteron, which gives you
  22. # asymptotic performance of 64*1000/1005=63.7MBps times CPU clock
  23. # frequency in GHz. sha512_block runs in ~1275 cycles, which results
  24. # in 128*1000/1275=100MBps per GHz. Is there room for improvement?
  25. # Well, if you compare it to the IA-64 implementation, which maintains
  26. # X[16] in the register bank[!], tends toward 4 instructions per CPU
  27. # clock cycle and runs in 1003 cycles, then 1275 is a very good result
  28. # for the 3-way issue Opteron pipeline with X[16] maintained in memory.
  29. # So *if* there is a way to improve it, *then* the only way would be to
  30. # try to offload the X[16] updates to the SSE unit, but that would
  31. # require a "deeper" loop unroll, which in turn would naturally cause a
  32. # size blow-up, not to mention increased complexity! And once again,
  33. # only *if* it's actually possible to noticeably improve overall ILP,
  34. # instruction-level parallelism, on a given CPU implementation.
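#
# [Illustrative sketch added for clarity, not part of the original notes;
#  the helper name is made up.] The asymptotic-throughput figures above
# follow directly from the block size and the measured cycle count: at
# 1 GHz the core executes 1e9 cycles per second, so bytes-per-cycle times
# 1000 gives MBps per GHz.
#
#	sub mbps_per_ghz { my ($block_bytes,$cycles)=@_;
#	                   return $block_bytes*1000/$cycles; }
#	printf("SHA256: %.1f MBps/GHz\n", mbps_per_ghz(64,1005));	# ~63.7
#	printf("SHA512: %.1f MBps/GHz\n", mbps_per_ghz(128,1275));	# ~100.4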
  35. #
  36. # Special note on Intel EM64T. While the Opteron CPU exhibits a perfect
  37. # performance ratio of 1.5 between the 64- and 32-bit flavors [see above],
  38. # [currently available] EM64T CPUs apparently are far from it. On the
  39. # contrary, the 64-bit version, sha512_block, is ~30% *slower* than the
  40. # 32-bit sha256_block:-( This is presumably because 64-bit shifts/rotates
  41. # are not implemented as single atomic operations, but in microcode.
  42. #
  43. # May 2012.
  44. #
  45. # Optimization including one of Pavel Semjanov's ideas, an alternative
  46. # Maj, resulted in a >=5% improvement on most CPUs, +20% for SHA256 and
  47. # unfortunately -2% for SHA512 on P4 [which nobody should care about
  48. # that much].
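#
# [Illustrative note added for clarity, not from the original author.] The
# "alternative Maj" exploits the identity Maj(a,b,c) = Ch(a^b,c,b): where a
# and b agree the majority is that shared bit (i.e. b), where they differ
# it is c. A brute-force sanity check of the identity, as a sketch:
#
#	sub Ch  { my ($x,$y,$z)=@_; ($x&$y)^(~$x&$z); }
#	sub Maj { my ($x,$y,$z)=@_; ($x&$y)^($x&$z)^($y&$z); }
#	for my $a (0,1) { for my $b (0,1) { for my $c (0,1) {
#	    die "mismatch" if ((Maj($a,$b,$c)^Ch($a^$b,$c,$b))&1);
#	}}}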
  49. #
  50. # June 2012.
  51. #
  52. # Add SIMD code paths, see below for improvement coefficients. The SSSE3
  53. # code path was not attempted for SHA512, because the estimated
  54. # improvement, noticeably less than 9%, is not high enough to justify
  55. # the effort, at least not on pre-AVX processors. [The obvious exception
  56. # is VIA Nano, but it has a SHA512 instruction that is faster and
  57. # should be used instead.] For reference, the corresponding estimated
  58. # upper limit for improvement with SSSE3 SHA256 is 28%. The fact that
  59. # higher coefficients are observed on VIA Nano and Bulldozer has more
  60. # to do with the specifics of their architecture [which is a topic for
  61. # separate discussion].
  62. #
  63. # November 2012.
  64. #
  65. # Add AVX2 code path. Two consecutive input blocks are loaded into
  66. # 256-bit %ymm registers, with data from the first block in the least
  67. # significant 128-bit halves and data from the second in the most
  68. # significant halves. The data is then processed with the same SIMD
  69. # instruction sequence as for AVX, but with %ymm registers as operands.
  70. # The side effect is an increased stack frame, 448 additional bytes in
  71. # SHA256 and 1152 in SHA512, plus a 1.2KB code size increase.
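#
# [Illustrative sketch added for clarity; offsets assume SHA256's 64-byte
#  blocks.] The two-block load amounts to filling the low 128-bit lane from
# block N and the high lane from block N+1, e.g. for the first 16 bytes of
# each:
#
#	vmovdqu		0(%rsi),%xmm0			# block N   -> low lane
#	vinserti128	$1,64(%rsi),%ymm0,%ymm0		# block N+1 -> high lane
#
# The actual code below points the second-block loads at %r12 instead, so
# that on the final block it can be redirected to the stack
# ("cmove %rsp,%r12") and the high lane is merely fed dummy data.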
  72. #
  73. # March 2014.
  74. #
  75. # Add support for Intel SHA Extensions.
  76. ######################################################################
  77. # Current performance in cycles per processed byte (less is better):
  78. #
  79. #                SHA256  SSSE3       AVX/XOP(*)      SHA512  AVX/XOP(*)
  80. #
  81. # AMD K8        14.9    -           -               9.57    -
  82. # P4            17.3    -           -               30.8    -
  83. # Core 2        15.6    13.8(+13%)  -               9.97    -
  84. # Westmere      14.8    12.3(+19%)  -               9.58    -
  85. # Sandy Bridge  17.4    14.2(+23%)  11.6(+50%(**))  11.2    8.10(+38%(**))
  86. # Ivy Bridge    12.6    10.5(+20%)  10.3(+22%)      8.17    7.22(+13%)
  87. # Haswell       12.2    9.28(+31%)  7.80(+56%)      7.66    5.40(+42%)
  88. # Bulldozer     21.1    13.6(+54%)  13.6(+54%(***)) 13.5    8.58(+57%)
  89. # VIA Nano      23.0    16.5(+39%)  -               14.7    -
  90. # Atom          23.0    18.9(+22%)  -               14.7    -
  91. # Silvermont    27.4    20.6(+33%)  -               17.5    -
  92. #
  93. # (*) whichever is best applicable;
  94. # (**) the switch from ror to shrd accounts for a fair share of the improvement;
  95. # (***) execution time is fully determined by the remaining integer-only
  96. #       part, body_00_15; reducing the amount of SIMD instructions below
  97. #       a certain limit makes no difference/sense; to conserve space the
  98. #       SHA256 XOP code path is therefore omitted;
  99. $flavour = shift;
  100. $output = shift;
  101. if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
  102. $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
  103. $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
  104. ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
  105. ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
  106. die "can't locate x86_64-xlate.pl";
  107. if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
  108. =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
  109. $avx = ($1>=2.19) + ($1>=2.22);
  110. }
  111. if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
  112. `nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
  113. $avx = ($1>=2.09) + ($1>=2.10);
  114. }
  115. if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
  116. `ml64 2>&1` =~ /Version ([0-9]+)\./) {
  117. $avx = ($1>=10) + ($1>=11);
  118. }
  119. if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([3-9]\.[0-9]+)/) {
  120. $avx = ($2>=3.0) + ($2>3.0);
  121. }
  122. $shaext=0; ### set to zero if compiling for 1.0.1
  123. $avx=1 if (!$shaext && $avx);
  124. open OUT,"| \"$^X\" $xlate $flavour";
  125. *STDOUT=*OUT;
  126. if ($output =~ /512/) {
  127. $func="sha512_block_data_order";
  128. $TABLE="K512";
  129. $SZ=8;
  130. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%rax","%rbx","%rcx","%rdx",
  131. "%r8", "%r9", "%r10","%r11");
  132. ($T1,$a0,$a1,$a2,$a3)=("%r12","%r13","%r14","%r15","%rdi");
  133. @Sigma0=(28,34,39);
  134. @Sigma1=(14,18,41);
  135. @sigma0=(1, 8, 7);
  136. @sigma1=(19,61, 6);
  137. $rounds=80;
  138. } else {
  139. $func="sha256_block_data_order";
  140. $TABLE="K256";
  141. $SZ=4;
  142. @ROT=($A,$B,$C,$D,$E,$F,$G,$H)=("%eax","%ebx","%ecx","%edx",
  143. "%r8d","%r9d","%r10d","%r11d");
  144. ($T1,$a0,$a1,$a2,$a3)=("%r12d","%r13d","%r14d","%r15d","%edi");
  145. @Sigma0=( 2,13,22);
  146. @Sigma1=( 6,11,25);
  147. @sigma0=( 7,18, 3);
  148. @sigma1=(17,19,10);
  149. $rounds=64;
  150. }
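# Usage sketch (file names are illustrative): the script takes a perlasm
# flavour and an output name, and the output name alone decides which hash
# is generated,
#
#	perl sha512-x86_64.pl elf sha512-x86_64.s	# SHA-512 parameter set
#	perl sha512-x86_64.pl elf sha256-x86_64.s	# SHA-256 parameter set
#
# because the /512/ match above picks between the two parameter blocks
# configured here.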
  151. $ctx="%rdi"; # 1st arg, zapped by $a3
  152. $inp="%rsi"; # 2nd arg
  153. $Tbl="%rbp";
  154. $_ctx="16*$SZ+0*8(%rsp)";
  155. $_inp="16*$SZ+1*8(%rsp)";
  156. $_end="16*$SZ+2*8(%rsp)";
  157. $_rsp="16*$SZ+3*8(%rsp)";
  158. $framesz="16*$SZ+4*8";
  159. sub ROUND_00_15()
  160. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  161. my $STRIDE=$SZ;
  162. $STRIDE += 16 if ($i%(16/$SZ)==(16/$SZ-1));
  163. $code.=<<___;
  164. ror \$`$Sigma1[2]-$Sigma1[1]`,$a0
  165. mov $f,$a2
  166. xor $e,$a0
  167. ror \$`$Sigma0[2]-$Sigma0[1]`,$a1
  168. xor $g,$a2 # f^g
  169. mov $T1,`$SZ*($i&0xf)`(%rsp)
  170. xor $a,$a1
  171. and $e,$a2 # (f^g)&e
  172. ror \$`$Sigma1[1]-$Sigma1[0]`,$a0
  173. add $h,$T1 # T1+=h
  174. xor $g,$a2 # Ch(e,f,g)=((f^g)&e)^g
  175. ror \$`$Sigma0[1]-$Sigma0[0]`,$a1
  176. xor $e,$a0
  177. add $a2,$T1 # T1+=Ch(e,f,g)
  178. mov $a,$a2
  179. add ($Tbl),$T1 # T1+=K[round]
  180. xor $a,$a1
  181. xor $b,$a2 # a^b, b^c in next round
  182. ror \$$Sigma1[0],$a0 # Sigma1(e)
  183. mov $b,$h
  184. and $a2,$a3
  185. ror \$$Sigma0[0],$a1 # Sigma0(a)
  186. add $a0,$T1 # T1+=Sigma1(e)
  187. xor $a3,$h # h=Maj(a,b,c)=Ch(a^b,c,b)
  188. add $T1,$d # d+=T1
  189. add $T1,$h # h+=T1
  190. lea $STRIDE($Tbl),$Tbl # round++
  191. ___
  192. $code.=<<___ if ($i<15);
  193. add $a1,$h # h+=Sigma0(a)
  194. ___
  195. ($a2,$a3) = ($a3,$a2);
  196. }
  197. sub ROUND_16_XX()
  198. { my ($i,$a,$b,$c,$d,$e,$f,$g,$h) = @_;
  199. $code.=<<___;
  200. mov `$SZ*(($i+1)&0xf)`(%rsp),$a0
  201. mov `$SZ*(($i+14)&0xf)`(%rsp),$a2
  202. mov $a0,$T1
  203. ror \$`$sigma0[1]-$sigma0[0]`,$a0
  204. add $a1,$a # modulo-scheduled h+=Sigma0(a)
  205. mov $a2,$a1
  206. ror \$`$sigma1[1]-$sigma1[0]`,$a2
  207. xor $T1,$a0
  208. shr \$$sigma0[2],$T1
  209. ror \$$sigma0[0],$a0
  210. xor $a1,$a2
  211. shr \$$sigma1[2],$a1
  212. ror \$$sigma1[0],$a2
  213. xor $a0,$T1 # sigma0(X[(i+1)&0xf])
  214. xor $a1,$a2 # sigma1(X[(i+14)&0xf])
  215. add `$SZ*(($i+9)&0xf)`(%rsp),$T1
  216. add `$SZ*($i&0xf)`(%rsp),$T1
  217. mov $e,$a0
  218. add $a2,$T1
  219. mov $a,$a1
  220. ___
  221. &ROUND_00_15(@_);
  222. }
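#
# For reference [an illustrative sketch, not used by the generator; helper
# names are made up]: one scalar SHA-256 round, which is what the scheduled
# code above computes with each rotate split into partial "ror"s. The
# assembly evaluates Maj via the a^b trick (carrying b^c across rounds)
# instead of the textbook form used here; the SHA-512 flavor is identical
# with 64-bit words and the @Sigma constants selected above.
#
#	sub ror32 { my ($x,$n)=@_; (($x>>$n)|($x<<(32-$n))) & 0xffffffff; }
#	sub sha256_round {	# one round; $wk = W[i]+K[i]
#	    my ($a,$b,$c,$d,$e,$f,$g,$h,$wk)=@_;
#	    my $S1 = ror32($e,6)^ror32($e,11)^ror32($e,25);	# Sigma1(e)
#	    my $ch = ($e&$f)^(~$e&$g);				# Ch(e,f,g)
#	    my $T1 = ($h+$S1+$ch+$wk) & 0xffffffff;
#	    my $S0 = ror32($a,2)^ror32($a,13)^ror32($a,22);	# Sigma0(a)
#	    my $mj = ($a&$b)^($a&$c)^($b&$c);			# Maj(a,b,c)
#	    return (($T1+$S0+$mj)&0xffffffff,			# new a
#	            $a,$b,$c,($d+$T1)&0xffffffff,$e,$f,$g);	# new b..h
#	}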
  223. $code=<<___;
  224. .text
  225. .extern OPENSSL_ia32cap_P
  226. .globl $func
  227. .type $func,\@function,3
  228. .align 16
  229. $func:
  230. ___
  231. $code.=<<___ if ($SZ==4 || $avx);
  232. lea OPENSSL_ia32cap_P(%rip),%r11
  233. mov 0(%r11),%r9d
  234. mov 4(%r11),%r10d
  235. mov 8(%r11),%r11d
  236. ___
  237. $code.=<<___ if ($SZ==4 && $shaext);
  238. test \$`1<<29`,%r11d # check for SHA
  239. jnz _shaext_shortcut
  240. ___
  241. $code.=<<___ if ($avx && $SZ==8);
  242. test \$`1<<11`,%r10d # check for XOP
  243. jnz .Lxop_shortcut
  244. ___
  245. $code.=<<___ if ($avx>1);
  246. and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1
  247. cmp \$`1<<8|1<<5|1<<3`,%r11d
  248. je .Lavx2_shortcut
  249. ___
  250. $code.=<<___ if ($avx);
  251. and \$`1<<30`,%r9d # mask "Intel CPU" bit
  252. and \$`1<<28|1<<9`,%r10d # mask AVX and SSSE3 bits
  253. or %r9d,%r10d
  254. cmp \$`1<<28|1<<9|1<<30`,%r10d
  255. je .Lavx_shortcut
  256. ___
  257. $code.=<<___ if ($SZ==4);
  258. test \$`1<<9`,%r10d
  259. jnz .Lssse3_shortcut
  260. ___
  261. $code.=<<___;
  262. push %rbx
  263. push %rbp
  264. push %r12
  265. push %r13
  266. push %r14
  267. push %r15
  268. mov %rsp,%r11 # copy %rsp
  269. shl \$4,%rdx # num*16
  270. sub \$$framesz,%rsp
  271. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  272. and \$-64,%rsp # align stack frame
  273. mov $ctx,$_ctx # save ctx, 1st arg
  274. mov $inp,$_inp # save inp, 2nd arg
  275. mov %rdx,$_end # save end pointer, "3rd" arg
  276. mov %r11,$_rsp # save copy of %rsp
  277. .Lprologue:
  278. mov $SZ*0($ctx),$A
  279. mov $SZ*1($ctx),$B
  280. mov $SZ*2($ctx),$C
  281. mov $SZ*3($ctx),$D
  282. mov $SZ*4($ctx),$E
  283. mov $SZ*5($ctx),$F
  284. mov $SZ*6($ctx),$G
  285. mov $SZ*7($ctx),$H
  286. jmp .Lloop
  287. .align 16
  288. .Lloop:
  289. mov $B,$a3
  290. lea $TABLE(%rip),$Tbl
  291. xor $C,$a3 # magic
  292. ___
  293. for($i=0;$i<16;$i++) {
  294. $code.=" mov $SZ*$i($inp),$T1\n";
  295. $code.=" mov @ROT[4],$a0\n";
  296. $code.=" mov @ROT[0],$a1\n";
  297. $code.=" bswap $T1\n";
  298. &ROUND_00_15($i,@ROT);
  299. unshift(@ROT,pop(@ROT));
  300. }
  301. $code.=<<___;
  302. jmp .Lrounds_16_xx
  303. .align 16
  304. .Lrounds_16_xx:
  305. ___
  306. for(;$i<32;$i++) {
  307. &ROUND_16_XX($i,@ROT);
  308. unshift(@ROT,pop(@ROT));
  309. }
  310. $code.=<<___;
  311. cmpb \$0,`$SZ-1`($Tbl)
  312. jnz .Lrounds_16_xx
  313. mov $_ctx,$ctx
  314. add $a1,$A # modulo-scheduled h+=Sigma0(a)
  315. lea 16*$SZ($inp),$inp
  316. add $SZ*0($ctx),$A
  317. add $SZ*1($ctx),$B
  318. add $SZ*2($ctx),$C
  319. add $SZ*3($ctx),$D
  320. add $SZ*4($ctx),$E
  321. add $SZ*5($ctx),$F
  322. add $SZ*6($ctx),$G
  323. add $SZ*7($ctx),$H
  324. cmp $_end,$inp
  325. mov $A,$SZ*0($ctx)
  326. mov $B,$SZ*1($ctx)
  327. mov $C,$SZ*2($ctx)
  328. mov $D,$SZ*3($ctx)
  329. mov $E,$SZ*4($ctx)
  330. mov $F,$SZ*5($ctx)
  331. mov $G,$SZ*6($ctx)
  332. mov $H,$SZ*7($ctx)
  333. jb .Lloop
  334. mov $_rsp,%rsi
  335. mov (%rsi),%r15
  336. mov 8(%rsi),%r14
  337. mov 16(%rsi),%r13
  338. mov 24(%rsi),%r12
  339. mov 32(%rsi),%rbp
  340. mov 40(%rsi),%rbx
  341. lea 48(%rsi),%rsp
  342. .Lepilogue:
  343. ret
  344. .size $func,.-$func
  345. ___
  346. if ($SZ==4) {
  347. $code.=<<___;
  348. .align 64
  349. .type $TABLE,\@object
  350. $TABLE:
  351. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  352. .long 0x428a2f98,0x71374491,0xb5c0fbcf,0xe9b5dba5
  353. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  354. .long 0x3956c25b,0x59f111f1,0x923f82a4,0xab1c5ed5
  355. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  356. .long 0xd807aa98,0x12835b01,0x243185be,0x550c7dc3
  357. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  358. .long 0x72be5d74,0x80deb1fe,0x9bdc06a7,0xc19bf174
  359. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  360. .long 0xe49b69c1,0xefbe4786,0x0fc19dc6,0x240ca1cc
  361. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  362. .long 0x2de92c6f,0x4a7484aa,0x5cb0a9dc,0x76f988da
  363. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  364. .long 0x983e5152,0xa831c66d,0xb00327c8,0xbf597fc7
  365. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  366. .long 0xc6e00bf3,0xd5a79147,0x06ca6351,0x14292967
  367. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  368. .long 0x27b70a85,0x2e1b2138,0x4d2c6dfc,0x53380d13
  369. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  370. .long 0x650a7354,0x766a0abb,0x81c2c92e,0x92722c85
  371. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  372. .long 0xa2bfe8a1,0xa81a664b,0xc24b8b70,0xc76c51a3
  373. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  374. .long 0xd192e819,0xd6990624,0xf40e3585,0x106aa070
  375. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  376. .long 0x19a4c116,0x1e376c08,0x2748774c,0x34b0bcb5
  377. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  378. .long 0x391c0cb3,0x4ed8aa4a,0x5b9cca4f,0x682e6ff3
  379. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  380. .long 0x748f82ee,0x78a5636f,0x84c87814,0x8cc70208
  381. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  382. .long 0x90befffa,0xa4506ceb,0xbef9a3f7,0xc67178f2
  383. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  384. .long 0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f
  385. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  386. .long 0x03020100,0x0b0a0908,0xffffffff,0xffffffff
  387. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  388. .long 0xffffffff,0xffffffff,0x03020100,0x0b0a0908
  389. .asciz "SHA256 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  390. ___
  391. } else {
  392. $code.=<<___;
  393. .align 64
  394. .type $TABLE,\@object
  395. $TABLE:
  396. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  397. .quad 0x428a2f98d728ae22,0x7137449123ef65cd
  398. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  399. .quad 0xb5c0fbcfec4d3b2f,0xe9b5dba58189dbbc
  400. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  401. .quad 0x3956c25bf348b538,0x59f111f1b605d019
  402. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  403. .quad 0x923f82a4af194f9b,0xab1c5ed5da6d8118
  404. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  405. .quad 0xd807aa98a3030242,0x12835b0145706fbe
  406. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  407. .quad 0x243185be4ee4b28c,0x550c7dc3d5ffb4e2
  408. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  409. .quad 0x72be5d74f27b896f,0x80deb1fe3b1696b1
  410. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  411. .quad 0x9bdc06a725c71235,0xc19bf174cf692694
  412. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  413. .quad 0xe49b69c19ef14ad2,0xefbe4786384f25e3
  414. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  415. .quad 0x0fc19dc68b8cd5b5,0x240ca1cc77ac9c65
  416. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  417. .quad 0x2de92c6f592b0275,0x4a7484aa6ea6e483
  418. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  419. .quad 0x5cb0a9dcbd41fbd4,0x76f988da831153b5
  420. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  421. .quad 0x983e5152ee66dfab,0xa831c66d2db43210
  422. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  423. .quad 0xb00327c898fb213f,0xbf597fc7beef0ee4
  424. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  425. .quad 0xc6e00bf33da88fc2,0xd5a79147930aa725
  426. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  427. .quad 0x06ca6351e003826f,0x142929670a0e6e70
  428. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  429. .quad 0x27b70a8546d22ffc,0x2e1b21385c26c926
  430. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  431. .quad 0x4d2c6dfc5ac42aed,0x53380d139d95b3df
  432. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  433. .quad 0x650a73548baf63de,0x766a0abb3c77b2a8
  434. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  435. .quad 0x81c2c92e47edaee6,0x92722c851482353b
  436. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  437. .quad 0xa2bfe8a14cf10364,0xa81a664bbc423001
  438. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  439. .quad 0xc24b8b70d0f89791,0xc76c51a30654be30
  440. .quad 0xd192e819d6ef5218,0xd69906245565a910
  441. .quad 0xd192e819d6ef5218,0xd69906245565a910
  442. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  443. .quad 0xf40e35855771202a,0x106aa07032bbd1b8
  444. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  445. .quad 0x19a4c116b8d2d0c8,0x1e376c085141ab53
  446. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  447. .quad 0x2748774cdf8eeb99,0x34b0bcb5e19b48a8
  448. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  449. .quad 0x391c0cb3c5c95a63,0x4ed8aa4ae3418acb
  450. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  451. .quad 0x5b9cca4f7763e373,0x682e6ff3d6b2b8a3
  452. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  453. .quad 0x748f82ee5defb2fc,0x78a5636f43172f60
  454. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  455. .quad 0x84c87814a1f0ab72,0x8cc702081a6439ec
  456. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  457. .quad 0x90befffa23631e28,0xa4506cebde82bde9
  458. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  459. .quad 0xbef9a3f7b2c67915,0xc67178f2e372532b
  460. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  461. .quad 0xca273eceea26619c,0xd186b8c721c0c207
  462. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  463. .quad 0xeada7dd6cde0eb1e,0xf57d4f7fee6ed178
  464. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  465. .quad 0x06f067aa72176fba,0x0a637dc5a2c898a6
  466. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  467. .quad 0x113f9804bef90dae,0x1b710b35131c471b
  468. .quad 0x28db77f523047d84,0x32caab7b40c72493
  469. .quad 0x28db77f523047d84,0x32caab7b40c72493
  470. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  471. .quad 0x3c9ebe0a15c9bebc,0x431d67c49c100d4c
  472. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  473. .quad 0x4cc5d4becb3e42b6,0x597f299cfc657e2a
  474. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  475. .quad 0x5fcb6fab3ad6faec,0x6c44198c4a475817
  476. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  477. .quad 0x0001020304050607,0x08090a0b0c0d0e0f
  478. .asciz "SHA512 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
  479. ___
  480. }
  481. ######################################################################
  482. # SIMD code paths
  483. #
  484. if ($SZ==4 && $shaext) {{{
  485. ######################################################################
  486. # Intel SHA Extensions implementation of SHA256 update function.
  487. #
  488. my ($ctx,$inp,$num,$Tbl)=("%rdi","%rsi","%rdx","%rcx");
  489. my ($Wi,$ABEF,$CDGH,$TMP,$BSWAP,$ABEF_SAVE,$CDGH_SAVE)=map("%xmm$_",(0..2,7..10));
  490. my @MSG=map("%xmm$_",(3..6));
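# [Added note, an assumption spelled out for clarity rather than taken from
#  the original comments.] sha256rnds2 wants the state split into the two
# halves {A,B,E,F} and {C,D,G,H}, with the W[i]+K[i] pairs supplied
# implicitly in %xmm0; the pshufd/palignr/punpcklqdq sequence below repacks
# the context's natural {A,B,C,D}/{E,F,G,H} layout into that form, and the
# mirror sequence after the loop converts it back before storing:
#
#	ctx in memory:  A B C D | E F G H	(eight 32-bit words)
#	after repack:   $ABEF = {A,B,E,F},  $CDGH = {C,D,G,H}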
  491. $code.=<<___;
  492. .type sha256_block_data_order_shaext,\@function,3
  493. .align 64
  494. sha256_block_data_order_shaext:
  495. _shaext_shortcut:
  496. ___
  497. $code.=<<___ if ($win64);
  498. lea `-8-5*16`(%rsp),%rsp
  499. movaps %xmm6,-8-5*16(%rax)
  500. movaps %xmm7,-8-4*16(%rax)
  501. movaps %xmm8,-8-3*16(%rax)
  502. movaps %xmm9,-8-2*16(%rax)
  503. movaps %xmm10,-8-1*16(%rax)
  504. .Lprologue_shaext:
  505. ___
  506. $code.=<<___;
  507. lea K256+0x80(%rip),$Tbl
  508. movdqu ($ctx),$ABEF # DCBA
  509. movdqu 16($ctx),$CDGH # HGFE
  510. movdqa 0x200-0x80($Tbl),$TMP # byte swap mask
  511. pshufd \$0x1b,$ABEF,$Wi # ABCD
  512. pshufd \$0xb1,$ABEF,$ABEF # CDAB
  513. pshufd \$0x1b,$CDGH,$CDGH # EFGH
  514. movdqa $TMP,$BSWAP # offload
  515. palignr \$8,$CDGH,$ABEF # ABEF
  516. punpcklqdq $Wi,$CDGH # CDGH
  517. jmp .Loop_shaext
  518. .align 16
  519. .Loop_shaext:
  520. movdqu ($inp),@MSG[0]
  521. movdqu 0x10($inp),@MSG[1]
  522. movdqu 0x20($inp),@MSG[2]
  523. pshufb $TMP,@MSG[0]
  524. movdqu 0x30($inp),@MSG[3]
  525. movdqa 0*32-0x80($Tbl),$Wi
  526. paddd @MSG[0],$Wi
  527. pshufb $TMP,@MSG[1]
  528. movdqa $CDGH,$CDGH_SAVE # offload
  529. sha256rnds2 $ABEF,$CDGH # 0-3
  530. pshufd \$0x0e,$Wi,$Wi
  531. nop
  532. movdqa $ABEF,$ABEF_SAVE # offload
  533. sha256rnds2 $CDGH,$ABEF
  534. movdqa 1*32-0x80($Tbl),$Wi
  535. paddd @MSG[1],$Wi
  536. pshufb $TMP,@MSG[2]
  537. sha256rnds2 $ABEF,$CDGH # 4-7
  538. pshufd \$0x0e,$Wi,$Wi
  539. lea 0x40($inp),$inp
  540. sha256msg1 @MSG[1],@MSG[0]
  541. sha256rnds2 $CDGH,$ABEF
  542. movdqa 2*32-0x80($Tbl),$Wi
  543. paddd @MSG[2],$Wi
  544. pshufb $TMP,@MSG[3]
  545. sha256rnds2 $ABEF,$CDGH # 8-11
  546. pshufd \$0x0e,$Wi,$Wi
  547. movdqa @MSG[3],$TMP
  548. palignr \$4,@MSG[2],$TMP
  549. nop
  550. paddd $TMP,@MSG[0]
  551. sha256msg1 @MSG[2],@MSG[1]
  552. sha256rnds2 $CDGH,$ABEF
  553. movdqa 3*32-0x80($Tbl),$Wi
  554. paddd @MSG[3],$Wi
  555. sha256msg2 @MSG[3],@MSG[0]
  556. sha256rnds2 $ABEF,$CDGH # 12-15
  557. pshufd \$0x0e,$Wi,$Wi
  558. movdqa @MSG[0],$TMP
  559. palignr \$4,@MSG[3],$TMP
  560. nop
  561. paddd $TMP,@MSG[1]
  562. sha256msg1 @MSG[3],@MSG[2]
  563. sha256rnds2 $CDGH,$ABEF
  564. ___
  565. for($i=4;$i<16-3;$i++) {
  566. $code.=<<___;
  567. movdqa $i*32-0x80($Tbl),$Wi
  568. paddd @MSG[0],$Wi
  569. sha256msg2 @MSG[0],@MSG[1]
  570. sha256rnds2 $ABEF,$CDGH # 16-19...
  571. pshufd \$0x0e,$Wi,$Wi
  572. movdqa @MSG[1],$TMP
  573. palignr \$4,@MSG[0],$TMP
  574. nop
  575. paddd $TMP,@MSG[2]
  576. sha256msg1 @MSG[0],@MSG[3]
  577. sha256rnds2 $CDGH,$ABEF
  578. ___
  579. push(@MSG,shift(@MSG));
  580. }
  581. $code.=<<___;
  582. movdqa 13*32-0x80($Tbl),$Wi
  583. paddd @MSG[0],$Wi
  584. sha256msg2 @MSG[0],@MSG[1]
  585. sha256rnds2 $ABEF,$CDGH # 52-55
  586. pshufd \$0x0e,$Wi,$Wi
  587. movdqa @MSG[1],$TMP
  588. palignr \$4,@MSG[0],$TMP
  589. sha256rnds2 $CDGH,$ABEF
  590. paddd $TMP,@MSG[2]
  591. movdqa 14*32-0x80($Tbl),$Wi
  592. paddd @MSG[1],$Wi
  593. sha256rnds2 $ABEF,$CDGH # 56-59
  594. pshufd \$0x0e,$Wi,$Wi
  595. sha256msg2 @MSG[1],@MSG[2]
  596. movdqa $BSWAP,$TMP
  597. sha256rnds2 $CDGH,$ABEF
  598. movdqa 15*32-0x80($Tbl),$Wi
  599. paddd @MSG[2],$Wi
  600. nop
  601. sha256rnds2 $ABEF,$CDGH # 60-63
  602. pshufd \$0x0e,$Wi,$Wi
  603. dec $num
  604. nop
  605. sha256rnds2 $CDGH,$ABEF
  606. paddd $CDGH_SAVE,$CDGH
  607. paddd $ABEF_SAVE,$ABEF
  608. jnz .Loop_shaext
  609. pshufd \$0xb1,$CDGH,$CDGH # DCHG
  610. pshufd \$0x1b,$ABEF,$TMP # FEBA
  611. pshufd \$0xb1,$ABEF,$ABEF # BAFE
  612. punpckhqdq $CDGH,$ABEF # DCBA
  613. palignr \$8,$TMP,$CDGH # HGFE
  614. movdqu $ABEF,($ctx)
  615. movdqu $CDGH,16($ctx)
  616. ___
  617. $code.=<<___ if ($win64);
  618. movaps -8-5*16(%rax),%xmm6
  619. movaps -8-4*16(%rax),%xmm7
  620. movaps -8-3*16(%rax),%xmm8
  621. movaps -8-2*16(%rax),%xmm9
  622. movaps -8-1*16(%rax),%xmm10
  623. mov %rax,%rsp
  624. .Lepilogue_shaext:
  625. ___
  626. $code.=<<___;
  627. ret
  628. .size sha256_block_data_order_shaext,.-sha256_block_data_order_shaext
  629. ___
  630. }}}
  631. {{{
  632. my $a4=$T1;
  633. my ($a,$b,$c,$d,$e,$f,$g,$h);
  634. sub AUTOLOAD() # thunk [simplified] 32-bit style perlasm
  635. { my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  636. my $arg = pop;
  637. $arg = "\$$arg" if ($arg*1 eq $arg);
  638. $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
  639. }
  640. sub body_00_15 () {
  641. (
  642. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  643. '&ror ($a0,$Sigma1[2]-$Sigma1[1])',
  644. '&mov ($a,$a1)',
  645. '&mov ($a4,$f)',
  646. '&ror ($a1,$Sigma0[2]-$Sigma0[1])',
  647. '&xor ($a0,$e)',
  648. '&xor ($a4,$g)', # f^g
  649. '&ror ($a0,$Sigma1[1]-$Sigma1[0])',
  650. '&xor ($a1,$a)',
  651. '&and ($a4,$e)', # (f^g)&e
  652. '&xor ($a0,$e)',
  653. '&add ($h,$SZ*($i&15)."(%rsp)")', # h+=X[i]+K[i]
  654. '&mov ($a2,$a)',
  655. '&xor ($a4,$g)', # Ch(e,f,g)=((f^g)&e)^g
  656. '&ror ($a1,$Sigma0[1]-$Sigma0[0])',
  657. '&xor ($a2,$b)', # a^b, b^c in next round
  658. '&add ($h,$a4)', # h+=Ch(e,f,g)
  659. '&ror ($a0,$Sigma1[0])', # Sigma1(e)
  660. '&and ($a3,$a2)', # (b^c)&(a^b)
  661. '&xor ($a1,$a)',
  662. '&add ($h,$a0)', # h+=Sigma1(e)
  663. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  664. '&ror ($a1,$Sigma0[0])', # Sigma0(a)
  665. '&add ($d,$h)', # d+=h
  666. '&add ($h,$a3)', # h+=Maj(a,b,c)
  667. '&mov ($a0,$d)',
  668. '&add ($a1,$h);'. # h+=Sigma0(a)
  669. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  670. );
  671. }
  672. ######################################################################
  673. # SSSE3 code path
  674. #
  675. if ($SZ==4) { # SHA256 only
  676. my @X = map("%xmm$_",(0..3));
  677. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  678. $code.=<<___;
  679. .type ${func}_ssse3,\@function,3
  680. .align 64
  681. ${func}_ssse3:
  682. .Lssse3_shortcut:
  683. push %rbx
  684. push %rbp
  685. push %r12
  686. push %r13
  687. push %r14
  688. push %r15
  689. mov %rsp,%r11 # copy %rsp
  690. shl \$4,%rdx # num*16
  691. sub \$`$framesz+$win64*16*4`,%rsp
  692. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  693. and \$-64,%rsp # align stack frame
  694. mov $ctx,$_ctx # save ctx, 1st arg
  695. mov $inp,$_inp # save inp, 2nd arg
  696. mov %rdx,$_end # save end pointer, "3rd" arg
  697. mov %r11,$_rsp # save copy of %rsp
  698. ___
  699. $code.=<<___ if ($win64);
  700. movaps %xmm6,16*$SZ+32(%rsp)
  701. movaps %xmm7,16*$SZ+48(%rsp)
  702. movaps %xmm8,16*$SZ+64(%rsp)
  703. movaps %xmm9,16*$SZ+80(%rsp)
  704. ___
  705. $code.=<<___;
  706. .Lprologue_ssse3:
  707. mov $SZ*0($ctx),$A
  708. mov $SZ*1($ctx),$B
  709. mov $SZ*2($ctx),$C
  710. mov $SZ*3($ctx),$D
  711. mov $SZ*4($ctx),$E
  712. mov $SZ*5($ctx),$F
  713. mov $SZ*6($ctx),$G
  714. mov $SZ*7($ctx),$H
  715. ___
  716. $code.=<<___;
  717. #movdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  718. #movdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  719. jmp .Lloop_ssse3
  720. .align 16
  721. .Lloop_ssse3:
  722. movdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  723. movdqu 0x00($inp),@X[0]
  724. movdqu 0x10($inp),@X[1]
  725. movdqu 0x20($inp),@X[2]
  726. pshufb $t3,@X[0]
  727. movdqu 0x30($inp),@X[3]
  728. lea $TABLE(%rip),$Tbl
  729. pshufb $t3,@X[1]
  730. movdqa 0x00($Tbl),$t0
  731. movdqa 0x20($Tbl),$t1
  732. pshufb $t3,@X[2]
  733. paddd @X[0],$t0
  734. movdqa 0x40($Tbl),$t2
  735. pshufb $t3,@X[3]
  736. movdqa 0x60($Tbl),$t3
  737. paddd @X[1],$t1
  738. paddd @X[2],$t2
  739. paddd @X[3],$t3
  740. movdqa $t0,0x00(%rsp)
  741. mov $A,$a1
  742. movdqa $t1,0x10(%rsp)
  743. mov $B,$a3
  744. movdqa $t2,0x20(%rsp)
  745. xor $C,$a3 # magic
  746. movdqa $t3,0x30(%rsp)
  747. mov $E,$a0
  748. jmp .Lssse3_00_47
  749. .align 16
  750. .Lssse3_00_47:
  751. sub \$`-16*2*$SZ`,$Tbl # size optimization
  752. ___
  753. sub Xupdate_256_SSSE3 () {
  754. (
  755. '&movdqa ($t0,@X[1]);',
  756. '&movdqa ($t3,@X[3])',
  757. '&palignr ($t0,@X[0],$SZ)', # X[1..4]
  758. '&palignr ($t3,@X[2],$SZ);', # X[9..12]
  759. '&movdqa ($t1,$t0)',
  760. '&movdqa ($t2,$t0);',
  761. '&psrld ($t0,$sigma0[2])',
  762. '&paddd (@X[0],$t3);', # X[0..3] += X[9..12]
  763. '&psrld ($t2,$sigma0[0])',
  764. '&pshufd ($t3,@X[3],0b11111010)',# X[14..15]
  765. '&pslld ($t1,8*$SZ-$sigma0[1]);'.
  766. '&pxor ($t0,$t2)',
  767. '&psrld ($t2,$sigma0[1]-$sigma0[0]);'.
  768. '&pxor ($t0,$t1)',
  769. '&pslld ($t1,$sigma0[1]-$sigma0[0]);'.
  770. '&pxor ($t0,$t2);',
  771. '&movdqa ($t2,$t3)',
  772. '&pxor ($t0,$t1);', # sigma0(X[1..4])
  773. '&psrld ($t3,$sigma1[2])',
  774. '&paddd (@X[0],$t0);', # X[0..3] += sigma0(X[1..4])
  775. '&psrlq ($t2,$sigma1[0])',
  776. '&pxor ($t3,$t2);',
  777. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  778. '&pxor ($t3,$t2)',
  779. '&pshufb ($t3,$t4)', # sigma1(X[14..15])
  780. '&paddd (@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  781. '&pshufd ($t3,@X[0],0b01010000)',# X[16..17]
  782. '&movdqa ($t2,$t3);',
  783. '&psrld ($t3,$sigma1[2])',
  784. '&psrlq ($t2,$sigma1[0])',
  785. '&pxor ($t3,$t2);',
  786. '&psrlq ($t2,$sigma1[1]-$sigma1[0])',
  787. '&pxor ($t3,$t2);',
  788. '&movdqa ($t2,16*2*$j."($Tbl)")',
  789. '&pshufb ($t3,$t5)',
  790. '&paddd (@X[0],$t3)' # X[2..3] += sigma1(X[16..17])
  791. );
  792. }
  793. sub SSSE3_256_00_47 () {
  794. my $j = shift;
  795. my $body = shift;
  796. my @X = @_;
  797. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  798. if (0) {
  799. foreach (Xupdate_256_SSSE3()) { # 36 instructions
  800. eval;
  801. eval(shift(@insns));
  802. eval(shift(@insns));
  803. eval(shift(@insns));
  804. }
  805. } else { # squeeze extra 4% on Westmere and 19% on Atom
  806. eval(shift(@insns)); #@
  807. &movdqa ($t0,@X[1]);
  808. eval(shift(@insns));
  809. eval(shift(@insns));
  810. &movdqa ($t3,@X[3]);
  811. eval(shift(@insns)); #@
  812. eval(shift(@insns));
  813. eval(shift(@insns));
  814. eval(shift(@insns)); #@
  815. eval(shift(@insns));
  816. &palignr ($t0,@X[0],$SZ); # X[1..4]
  817. eval(shift(@insns));
  818. eval(shift(@insns));
  819. &palignr ($t3,@X[2],$SZ); # X[9..12]
  820. eval(shift(@insns));
  821. eval(shift(@insns));
  822. eval(shift(@insns));
  823. eval(shift(@insns)); #@
  824. &movdqa ($t1,$t0);
  825. eval(shift(@insns));
  826. eval(shift(@insns));
  827. &movdqa ($t2,$t0);
  828. eval(shift(@insns)); #@
  829. eval(shift(@insns));
  830. &psrld ($t0,$sigma0[2]);
  831. eval(shift(@insns));
  832. eval(shift(@insns));
  833. eval(shift(@insns));
  834. &paddd (@X[0],$t3); # X[0..3] += X[9..12]
  835. eval(shift(@insns)); #@
  836. eval(shift(@insns));
  837. &psrld ($t2,$sigma0[0]);
  838. eval(shift(@insns));
  839. eval(shift(@insns));
  840. &pshufd ($t3,@X[3],0b11111010); # X[14..15]
  841. eval(shift(@insns));
  842. eval(shift(@insns)); #@
  843. &pslld ($t1,8*$SZ-$sigma0[1]);
  844. eval(shift(@insns));
  845. eval(shift(@insns));
  846. &pxor ($t0,$t2);
  847. eval(shift(@insns)); #@
  848. eval(shift(@insns));
  849. eval(shift(@insns));
  850. eval(shift(@insns)); #@
  851. &psrld ($t2,$sigma0[1]-$sigma0[0]);
  852. eval(shift(@insns));
  853. &pxor ($t0,$t1);
  854. eval(shift(@insns));
  855. eval(shift(@insns));
  856. &pslld ($t1,$sigma0[1]-$sigma0[0]);
  857. eval(shift(@insns));
  858. eval(shift(@insns));
  859. &pxor ($t0,$t2);
  860. eval(shift(@insns));
  861. eval(shift(@insns)); #@
  862. &movdqa ($t2,$t3);
  863. eval(shift(@insns));
  864. eval(shift(@insns));
  865. &pxor ($t0,$t1); # sigma0(X[1..4])
  866. eval(shift(@insns)); #@
  867. eval(shift(@insns));
  868. eval(shift(@insns));
  869. &psrld ($t3,$sigma1[2]);
  870. eval(shift(@insns));
  871. eval(shift(@insns));
  872. &paddd (@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  873. eval(shift(@insns)); #@
  874. eval(shift(@insns));
  875. &psrlq ($t2,$sigma1[0]);
  876. eval(shift(@insns));
  877. eval(shift(@insns));
  878. eval(shift(@insns));
  879. &pxor ($t3,$t2);
  880. eval(shift(@insns)); #@
  881. eval(shift(@insns));
  882. eval(shift(@insns));
  883. eval(shift(@insns)); #@
  884. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  885. eval(shift(@insns));
  886. eval(shift(@insns));
  887. &pxor ($t3,$t2);
  888. eval(shift(@insns)); #@
  889. eval(shift(@insns));
  890. eval(shift(@insns));
  891. #&pshufb ($t3,$t4); # sigma1(X[14..15])
  892. &pshufd ($t3,$t3,0b10000000);
  893. eval(shift(@insns));
  894. eval(shift(@insns));
  895. eval(shift(@insns));
  896. &psrldq ($t3,8);
  897. eval(shift(@insns));
  898. eval(shift(@insns)); #@
  899. eval(shift(@insns));
  900. eval(shift(@insns));
  901. eval(shift(@insns)); #@
  902. &paddd (@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  903. eval(shift(@insns));
  904. eval(shift(@insns));
  905. eval(shift(@insns));
  906. &pshufd ($t3,@X[0],0b01010000); # X[16..17]
  907. eval(shift(@insns));
  908. eval(shift(@insns)); #@
  909. eval(shift(@insns));
  910. &movdqa ($t2,$t3);
  911. eval(shift(@insns));
  912. eval(shift(@insns));
  913. &psrld ($t3,$sigma1[2]);
  914. eval(shift(@insns));
  915. eval(shift(@insns)); #@
  916. &psrlq ($t2,$sigma1[0]);
  917. eval(shift(@insns));
  918. eval(shift(@insns));
  919. &pxor ($t3,$t2);
  920. eval(shift(@insns)); #@
  921. eval(shift(@insns));
  922. eval(shift(@insns));
  923. eval(shift(@insns)); #@
  924. eval(shift(@insns));
  925. &psrlq ($t2,$sigma1[1]-$sigma1[0]);
  926. eval(shift(@insns));
  927. eval(shift(@insns));
  928. eval(shift(@insns));
  929. &pxor ($t3,$t2);
  930. eval(shift(@insns));
  931. eval(shift(@insns));
  932. eval(shift(@insns)); #@
  933. #&pshufb ($t3,$t5);
  934. &pshufd ($t3,$t3,0b00001000);
  935. eval(shift(@insns));
  936. eval(shift(@insns));
  937. &movdqa ($t2,16*2*$j."($Tbl)");
  938. eval(shift(@insns)); #@
  939. eval(shift(@insns));
  940. &pslldq ($t3,8);
  941. eval(shift(@insns));
  942. eval(shift(@insns));
  943. eval(shift(@insns));
  944. &paddd (@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  945. eval(shift(@insns)); #@
  946. eval(shift(@insns));
  947. eval(shift(@insns));
  948. }
  949. &paddd ($t2,@X[0]);
  950. foreach (@insns) { eval; } # remaining instructions
  951. &movdqa (16*$j."(%rsp)",$t2);
  952. }
  953. for ($i=0,$j=0; $j<4; $j++) {
  954. &SSSE3_256_00_47($j,\&body_00_15,@X);
  955. push(@X,shift(@X)); # rotate(@X)
  956. }
  957. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  958. &jne (".Lssse3_00_47");
  959. for ($i=0; $i<16; ) {
  960. foreach(body_00_15()) { eval; }
  961. }
  962. $code.=<<___;
  963. mov $_ctx,$ctx
  964. mov $a1,$A
  965. add $SZ*0($ctx),$A
  966. lea 16*$SZ($inp),$inp
  967. add $SZ*1($ctx),$B
  968. add $SZ*2($ctx),$C
  969. add $SZ*3($ctx),$D
  970. add $SZ*4($ctx),$E
  971. add $SZ*5($ctx),$F
  972. add $SZ*6($ctx),$G
  973. add $SZ*7($ctx),$H
  974. cmp $_end,$inp
  975. mov $A,$SZ*0($ctx)
  976. mov $B,$SZ*1($ctx)
  977. mov $C,$SZ*2($ctx)
  978. mov $D,$SZ*3($ctx)
  979. mov $E,$SZ*4($ctx)
  980. mov $F,$SZ*5($ctx)
  981. mov $G,$SZ*6($ctx)
  982. mov $H,$SZ*7($ctx)
  983. jb .Lloop_ssse3
  984. mov $_rsp,%rsi
  985. ___
  986. $code.=<<___ if ($win64);
  987. movaps 16*$SZ+32(%rsp),%xmm6
  988. movaps 16*$SZ+48(%rsp),%xmm7
  989. movaps 16*$SZ+64(%rsp),%xmm8
  990. movaps 16*$SZ+80(%rsp),%xmm9
  991. ___
  992. $code.=<<___;
  993. mov (%rsi),%r15
  994. mov 8(%rsi),%r14
  995. mov 16(%rsi),%r13
  996. mov 24(%rsi),%r12
  997. mov 32(%rsi),%rbp
  998. mov 40(%rsi),%rbx
  999. lea 48(%rsi),%rsp
  1000. .Lepilogue_ssse3:
  1001. ret
  1002. .size ${func}_ssse3,.-${func}_ssse3
  1003. ___
  1004. }
  1005. if ($avx) {{
  1006. ######################################################################
  1007. # XOP code path
  1008. #
  1009. if ($SZ==8) { # SHA512 only
  1010. $code.=<<___;
  1011. .type ${func}_xop,\@function,3
  1012. .align 64
  1013. ${func}_xop:
  1014. .Lxop_shortcut:
  1015. push %rbx
  1016. push %rbp
  1017. push %r12
  1018. push %r13
  1019. push %r14
  1020. push %r15
  1021. mov %rsp,%r11 # copy %rsp
  1022. shl \$4,%rdx # num*16
  1023. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1024. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1025. and \$-64,%rsp # align stack frame
  1026. mov $ctx,$_ctx # save ctx, 1st arg
  1027. mov $inp,$_inp # save inp, 2nd arg
  1028. mov %rdx,$_end # save end pointer, "3rd" arg
  1029. mov %r11,$_rsp # save copy of %rsp
  1030. ___
  1031. $code.=<<___ if ($win64);
  1032. movaps %xmm6,16*$SZ+32(%rsp)
  1033. movaps %xmm7,16*$SZ+48(%rsp)
  1034. movaps %xmm8,16*$SZ+64(%rsp)
  1035. movaps %xmm9,16*$SZ+80(%rsp)
  1036. ___
  1037. $code.=<<___ if ($win64 && $SZ>4);
  1038. movaps %xmm10,16*$SZ+96(%rsp)
  1039. movaps %xmm11,16*$SZ+112(%rsp)
  1040. ___
  1041. $code.=<<___;
  1042. .Lprologue_xop:
  1043. vzeroupper
  1044. mov $SZ*0($ctx),$A
  1045. mov $SZ*1($ctx),$B
  1046. mov $SZ*2($ctx),$C
  1047. mov $SZ*3($ctx),$D
  1048. mov $SZ*4($ctx),$E
  1049. mov $SZ*5($ctx),$F
  1050. mov $SZ*6($ctx),$G
  1051. mov $SZ*7($ctx),$H
  1052. jmp .Lloop_xop
  1053. ___
  1054. if ($SZ==4) { # SHA256
  1055. my @X = map("%xmm$_",(0..3));
  1056. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
  1057. $code.=<<___;
  1058. .align 16
  1059. .Lloop_xop:
  1060. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1061. vmovdqu 0x00($inp),@X[0]
  1062. vmovdqu 0x10($inp),@X[1]
  1063. vmovdqu 0x20($inp),@X[2]
  1064. vmovdqu 0x30($inp),@X[3]
  1065. vpshufb $t3,@X[0],@X[0]
  1066. lea $TABLE(%rip),$Tbl
  1067. vpshufb $t3,@X[1],@X[1]
  1068. vpshufb $t3,@X[2],@X[2]
  1069. vpaddd 0x00($Tbl),@X[0],$t0
  1070. vpshufb $t3,@X[3],@X[3]
  1071. vpaddd 0x20($Tbl),@X[1],$t1
  1072. vpaddd 0x40($Tbl),@X[2],$t2
  1073. vpaddd 0x60($Tbl),@X[3],$t3
  1074. vmovdqa $t0,0x00(%rsp)
  1075. mov $A,$a1
  1076. vmovdqa $t1,0x10(%rsp)
  1077. mov $B,$a3
  1078. vmovdqa $t2,0x20(%rsp)
  1079. xor $C,$a3 # magic
  1080. vmovdqa $t3,0x30(%rsp)
  1081. mov $E,$a0
  1082. jmp .Lxop_00_47
  1083. .align 16
  1084. .Lxop_00_47:
  1085. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1086. ___
  1087. sub XOP_256_00_47 () {
  1088. my $j = shift;
  1089. my $body = shift;
  1090. my @X = @_;
  1091. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1092. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4]
  1093. eval(shift(@insns));
  1094. eval(shift(@insns));
  1095. &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12]
  1096. eval(shift(@insns));
  1097. eval(shift(@insns));
  1098. &vprotd ($t1,$t0,8*$SZ-$sigma0[1]);
  1099. eval(shift(@insns));
  1100. eval(shift(@insns));
  1101. &vpsrld ($t0,$t0,$sigma0[2]);
  1102. eval(shift(@insns));
  1103. eval(shift(@insns));
  1104. &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12]
  1105. eval(shift(@insns));
  1106. eval(shift(@insns));
  1107. eval(shift(@insns));
  1108. eval(shift(@insns));
  1109. &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1110. eval(shift(@insns));
  1111. eval(shift(@insns));
  1112. &vpxor ($t0,$t0,$t1);
  1113. eval(shift(@insns));
  1114. eval(shift(@insns));
  1115. eval(shift(@insns));
  1116. eval(shift(@insns));
  1117. &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]);
  1118. eval(shift(@insns));
  1119. eval(shift(@insns));
  1120. &vpxor ($t0,$t0,$t2); # sigma0(X[1..4])
  1121. eval(shift(@insns));
  1122. eval(shift(@insns));
  1123. &vpsrld ($t2,@X[3],$sigma1[2]);
  1124. eval(shift(@insns));
  1125. eval(shift(@insns));
  1126. &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4])
  1127. eval(shift(@insns));
  1128. eval(shift(@insns));
  1129. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1130. eval(shift(@insns));
  1131. eval(shift(@insns));
  1132. &vpxor ($t3,$t3,$t2);
  1133. eval(shift(@insns));
  1134. eval(shift(@insns));
  1135. eval(shift(@insns));
  1136. eval(shift(@insns));
  1137. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1138. eval(shift(@insns));
  1139. eval(shift(@insns));
  1140. eval(shift(@insns));
  1141. eval(shift(@insns));
  1142. &vpsrldq ($t3,$t3,8);
  1143. eval(shift(@insns));
  1144. eval(shift(@insns));
  1145. eval(shift(@insns));
  1146. eval(shift(@insns));
  1147. &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1148. eval(shift(@insns));
  1149. eval(shift(@insns));
  1150. eval(shift(@insns));
  1151. eval(shift(@insns));
  1152. &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]);
  1153. eval(shift(@insns));
  1154. eval(shift(@insns));
  1155. &vpsrld ($t2,@X[0],$sigma1[2]);
  1156. eval(shift(@insns));
  1157. eval(shift(@insns));
  1158. &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1159. eval(shift(@insns));
  1160. eval(shift(@insns));
  1161. &vpxor ($t3,$t3,$t2);
  1162. eval(shift(@insns));
  1163. eval(shift(@insns));
  1164. eval(shift(@insns));
  1165. eval(shift(@insns));
  1166. &vpxor ($t3,$t3,$t1); # sigma1(X[16..17])
  1167. eval(shift(@insns));
  1168. eval(shift(@insns));
  1169. eval(shift(@insns));
  1170. eval(shift(@insns));
  1171. &vpslldq ($t3,$t3,8); # 22 instructions
  1172. eval(shift(@insns));
  1173. eval(shift(@insns));
  1174. eval(shift(@insns));
  1175. eval(shift(@insns));
  1176. &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17])
  1177. eval(shift(@insns));
  1178. eval(shift(@insns));
  1179. eval(shift(@insns));
  1180. eval(shift(@insns));
  1181. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1182. foreach (@insns) { eval; } # remaining instructions
  1183. &vmovdqa (16*$j."(%rsp)",$t2);
  1184. }
  1185. for ($i=0,$j=0; $j<4; $j++) {
  1186. &XOP_256_00_47($j,\&body_00_15,@X);
  1187. push(@X,shift(@X)); # rotate(@X)
  1188. }
  1189. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1190. &jne (".Lxop_00_47");
  1191. for ($i=0; $i<16; ) {
  1192. foreach(body_00_15()) { eval; }
  1193. }
  1194. } else { # SHA512
  1195. my @X = map("%xmm$_",(0..7));
  1196. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  1197. $code.=<<___;
  1198. .align 16
  1199. .Lloop_xop:
  1200. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1201. vmovdqu 0x00($inp),@X[0]
  1202. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1203. vmovdqu 0x10($inp),@X[1]
  1204. vmovdqu 0x20($inp),@X[2]
  1205. vpshufb $t3,@X[0],@X[0]
  1206. vmovdqu 0x30($inp),@X[3]
  1207. vpshufb $t3,@X[1],@X[1]
  1208. vmovdqu 0x40($inp),@X[4]
  1209. vpshufb $t3,@X[2],@X[2]
  1210. vmovdqu 0x50($inp),@X[5]
  1211. vpshufb $t3,@X[3],@X[3]
  1212. vmovdqu 0x60($inp),@X[6]
  1213. vpshufb $t3,@X[4],@X[4]
  1214. vmovdqu 0x70($inp),@X[7]
  1215. vpshufb $t3,@X[5],@X[5]
  1216. vpaddq -0x80($Tbl),@X[0],$t0
  1217. vpshufb $t3,@X[6],@X[6]
  1218. vpaddq -0x60($Tbl),@X[1],$t1
  1219. vpshufb $t3,@X[7],@X[7]
  1220. vpaddq -0x40($Tbl),@X[2],$t2
  1221. vpaddq -0x20($Tbl),@X[3],$t3
  1222. vmovdqa $t0,0x00(%rsp)
  1223. vpaddq 0x00($Tbl),@X[4],$t0
  1224. vmovdqa $t1,0x10(%rsp)
  1225. vpaddq 0x20($Tbl),@X[5],$t1
  1226. vmovdqa $t2,0x20(%rsp)
  1227. vpaddq 0x40($Tbl),@X[6],$t2
  1228. vmovdqa $t3,0x30(%rsp)
  1229. vpaddq 0x60($Tbl),@X[7],$t3
  1230. vmovdqa $t0,0x40(%rsp)
  1231. mov $A,$a1
  1232. vmovdqa $t1,0x50(%rsp)
  1233. mov $B,$a3
  1234. vmovdqa $t2,0x60(%rsp)
  1235. xor $C,$a3 # magic
  1236. vmovdqa $t3,0x70(%rsp)
  1237. mov $E,$a0
  1238. jmp .Lxop_00_47
  1239. .align 16
  1240. .Lxop_00_47:
  1241. add \$`16*2*$SZ`,$Tbl
  1242. ___
  1243. sub XOP_512_00_47 () {
  1244. my $j = shift;
  1245. my $body = shift;
  1246. my @X = @_;
  1247. my @insns = (&$body,&$body); # 52 instructions
  1248. &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2]
  1249. eval(shift(@insns));
  1250. eval(shift(@insns));
  1251. &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10]
  1252. eval(shift(@insns));
  1253. eval(shift(@insns));
  1254. &vprotq ($t1,$t0,8*$SZ-$sigma0[1]);
  1255. eval(shift(@insns));
  1256. eval(shift(@insns));
  1257. &vpsrlq ($t0,$t0,$sigma0[2]);
  1258. eval(shift(@insns));
  1259. eval(shift(@insns));
  1260. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10]
  1261. eval(shift(@insns));
  1262. eval(shift(@insns));
  1263. eval(shift(@insns));
  1264. eval(shift(@insns));
  1265. &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]);
  1266. eval(shift(@insns));
  1267. eval(shift(@insns));
  1268. &vpxor ($t0,$t0,$t1);
  1269. eval(shift(@insns));
  1270. eval(shift(@insns));
  1271. eval(shift(@insns));
  1272. eval(shift(@insns));
  1273. &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]);
  1274. eval(shift(@insns));
  1275. eval(shift(@insns));
  1276. &vpxor ($t0,$t0,$t2); # sigma0(X[1..2])
  1277. eval(shift(@insns));
  1278. eval(shift(@insns));
  1279. &vpsrlq ($t2,@X[7],$sigma1[2]);
  1280. eval(shift(@insns));
  1281. eval(shift(@insns));
  1282. &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2])
  1283. eval(shift(@insns));
  1284. eval(shift(@insns));
  1285. &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]);
  1286. eval(shift(@insns));
  1287. eval(shift(@insns));
  1288. &vpxor ($t3,$t3,$t2);
  1289. eval(shift(@insns));
  1290. eval(shift(@insns));
  1291. eval(shift(@insns));
  1292. eval(shift(@insns));
  1293. &vpxor ($t3,$t3,$t1); # sigma1(X[14..15])
  1294. eval(shift(@insns));
  1295. eval(shift(@insns));
  1296. eval(shift(@insns));
  1297. eval(shift(@insns));
  1298. &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15])
  1299. eval(shift(@insns));
  1300. eval(shift(@insns));
  1301. eval(shift(@insns));
  1302. eval(shift(@insns));
  1303. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
  1304. foreach (@insns) { eval; } # remaining instructions
  1305. &vmovdqa (16*$j."(%rsp)",$t2);
  1306. }
  1307. for ($i=0,$j=0; $j<8; $j++) {
  1308. &XOP_512_00_47($j,\&body_00_15,@X);
  1309. push(@X,shift(@X)); # rotate(@X)
  1310. }
  1311. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1312. &jne (".Lxop_00_47");
  1313. for ($i=0; $i<16; ) {
  1314. foreach(body_00_15()) { eval; }
  1315. }
  1316. }
  1317. $code.=<<___;
  1318. mov $_ctx,$ctx
  1319. mov $a1,$A
  1320. add $SZ*0($ctx),$A
  1321. lea 16*$SZ($inp),$inp
  1322. add $SZ*1($ctx),$B
  1323. add $SZ*2($ctx),$C
  1324. add $SZ*3($ctx),$D
  1325. add $SZ*4($ctx),$E
  1326. add $SZ*5($ctx),$F
  1327. add $SZ*6($ctx),$G
  1328. add $SZ*7($ctx),$H
  1329. cmp $_end,$inp
  1330. mov $A,$SZ*0($ctx)
  1331. mov $B,$SZ*1($ctx)
  1332. mov $C,$SZ*2($ctx)
  1333. mov $D,$SZ*3($ctx)
  1334. mov $E,$SZ*4($ctx)
  1335. mov $F,$SZ*5($ctx)
  1336. mov $G,$SZ*6($ctx)
  1337. mov $H,$SZ*7($ctx)
  1338. jb .Lloop_xop
  1339. mov $_rsp,%rsi
  1340. vzeroupper
  1341. ___
  1342. $code.=<<___ if ($win64);
  1343. movaps 16*$SZ+32(%rsp),%xmm6
  1344. movaps 16*$SZ+48(%rsp),%xmm7
  1345. movaps 16*$SZ+64(%rsp),%xmm8
  1346. movaps 16*$SZ+80(%rsp),%xmm9
  1347. ___
  1348. $code.=<<___ if ($win64 && $SZ>4);
  1349. movaps 16*$SZ+96(%rsp),%xmm10
  1350. movaps 16*$SZ+112(%rsp),%xmm11
  1351. ___
  1352. $code.=<<___;
  1353. mov (%rsi),%r15
  1354. mov 8(%rsi),%r14
  1355. mov 16(%rsi),%r13
  1356. mov 24(%rsi),%r12
  1357. mov 32(%rsi),%rbp
  1358. mov 40(%rsi),%rbx
  1359. lea 48(%rsi),%rsp
  1360. .Lepilogue_xop:
  1361. ret
  1362. .size ${func}_xop,.-${func}_xop
  1363. ___
  1364. }
  1365. ######################################################################
  1366. # AVX+shrd code path
  1367. #
  1368. local *ror = sub { &shrd(@_[0],@_) };
  1369. $code.=<<___;
  1370. .type ${func}_avx,\@function,3
  1371. .align 64
  1372. ${func}_avx:
  1373. .Lavx_shortcut:
  1374. push %rbx
  1375. push %rbp
  1376. push %r12
  1377. push %r13
  1378. push %r14
  1379. push %r15
  1380. mov %rsp,%r11 # copy %rsp
  1381. shl \$4,%rdx # num*16
  1382. sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp
  1383. lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ
  1384. and \$-64,%rsp # align stack frame
  1385. mov $ctx,$_ctx # save ctx, 1st arg
  1386. mov $inp,$_inp # save inp, 2nd arg
  1387. mov %rdx,$_end # save end pointer, "3rd" arg
  1388. mov %r11,$_rsp # save copy of %rsp
  1389. ___
  1390. $code.=<<___ if ($win64);
  1391. movaps %xmm6,16*$SZ+32(%rsp)
  1392. movaps %xmm7,16*$SZ+48(%rsp)
  1393. movaps %xmm8,16*$SZ+64(%rsp)
  1394. movaps %xmm9,16*$SZ+80(%rsp)
  1395. ___
  1396. $code.=<<___ if ($win64 && $SZ>4);
  1397. movaps %xmm10,16*$SZ+96(%rsp)
  1398. movaps %xmm11,16*$SZ+112(%rsp)
  1399. ___
  1400. $code.=<<___;
  1401. .Lprologue_avx:
  1402. vzeroupper
  1403. mov $SZ*0($ctx),$A
  1404. mov $SZ*1($ctx),$B
  1405. mov $SZ*2($ctx),$C
  1406. mov $SZ*3($ctx),$D
  1407. mov $SZ*4($ctx),$E
  1408. mov $SZ*5($ctx),$F
  1409. mov $SZ*6($ctx),$G
  1410. mov $SZ*7($ctx),$H
  1411. ___
  1412. if ($SZ==4) { # SHA256
  1413. my @X = map("%xmm$_",(0..3));
  1414. my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%xmm$_",(4..9));
  1415. $code.=<<___;
  1416. vmovdqa $TABLE+`$SZ*2*$rounds`+32(%rip),$t4
  1417. vmovdqa $TABLE+`$SZ*2*$rounds`+64(%rip),$t5
  1418. jmp .Lloop_avx
  1419. .align 16
  1420. .Lloop_avx:
  1421. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1422. vmovdqu 0x00($inp),@X[0]
  1423. vmovdqu 0x10($inp),@X[1]
  1424. vmovdqu 0x20($inp),@X[2]
  1425. vmovdqu 0x30($inp),@X[3]
  1426. vpshufb $t3,@X[0],@X[0]
  1427. lea $TABLE(%rip),$Tbl
  1428. vpshufb $t3,@X[1],@X[1]
  1429. vpshufb $t3,@X[2],@X[2]
  1430. vpaddd 0x00($Tbl),@X[0],$t0
  1431. vpshufb $t3,@X[3],@X[3]
  1432. vpaddd 0x20($Tbl),@X[1],$t1
  1433. vpaddd 0x40($Tbl),@X[2],$t2
  1434. vpaddd 0x60($Tbl),@X[3],$t3
  1435. vmovdqa $t0,0x00(%rsp)
  1436. mov $A,$a1
  1437. vmovdqa $t1,0x10(%rsp)
  1438. mov $B,$a3
  1439. vmovdqa $t2,0x20(%rsp)
  1440. xor $C,$a3 # magic
  1441. vmovdqa $t3,0x30(%rsp)
  1442. mov $E,$a0
  1443. jmp .Lavx_00_47
  1444. .align 16
  1445. .Lavx_00_47:
  1446. sub \$`-16*2*$SZ`,$Tbl # size optimization
  1447. ___
  1448. sub Xupdate_256_AVX () {
  1449. (
  1450. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..4]
  1451. '&vpalignr ($t3,@X[3],@X[2],$SZ)', # X[9..12]
  1452. '&vpsrld ($t2,$t0,$sigma0[0]);',
  1453. '&vpaddd (@X[0],@X[0],$t3)', # X[0..3] += X[9..12]
  1454. '&vpsrld ($t3,$t0,$sigma0[2])',
  1455. '&vpslld ($t1,$t0,8*$SZ-$sigma0[1]);',
  1456. '&vpxor ($t0,$t3,$t2)',
  1457. '&vpshufd ($t3,@X[3],0b11111010)',# X[14..15]
  1458. '&vpsrld ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1459. '&vpxor ($t0,$t0,$t1)',
  1460. '&vpslld ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1461. '&vpxor ($t0,$t0,$t2)',
  1462. '&vpsrld ($t2,$t3,$sigma1[2]);',
  1463. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..4])
  1464. '&vpsrlq ($t3,$t3,$sigma1[0]);',
  1465. '&vpaddd (@X[0],@X[0],$t0)', # X[0..3] += sigma0(X[1..4])
  1466. '&vpxor ($t2,$t2,$t3);',
  1467. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1468. '&vpxor ($t2,$t2,$t3)',
  1469. '&vpshufb ($t2,$t2,$t4)', # sigma1(X[14..15])
  1470. '&vpaddd (@X[0],@X[0],$t2)', # X[0..1] += sigma1(X[14..15])
  1471. '&vpshufd ($t3,@X[0],0b01010000)',# X[16..17]
  1472. '&vpsrld ($t2,$t3,$sigma1[2])',
  1473. '&vpsrlq ($t3,$t3,$sigma1[0])',
  1474. '&vpxor ($t2,$t2,$t3);',
  1475. '&vpsrlq ($t3,$t3,$sigma1[1]-$sigma1[0])',
  1476. '&vpxor ($t2,$t2,$t3)',
  1477. '&vpshufb ($t2,$t2,$t5)',
  1478. '&vpaddd (@X[0],@X[0],$t2)' # X[2..3] += sigma1(X[16..17])
  1479. );
  1480. }
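# [Reference sketch, not used by the generator.] The Xupdate sequence above
# is a 4-way vectorization of the scalar SHA-256 message schedule, with each
# rotate synthesized from shifts using the @sigma0/@sigma1 amounts:
#
#	# ror32 as in the reference-round sketch near ROUND_16_XX
#	sub sigma0 { my $x=shift; ror32($x,7)^ror32($x,18)^($x>>3);   }
#	sub sigma1 { my $x=shift; ror32($x,17)^ror32($x,19)^($x>>10); }
#	# W[i] = sigma1(W[i-2]) + W[i-7] + sigma0(W[i-15]) + W[i-16]  mod 2^32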
  1481. sub AVX_256_00_47 () {
  1482. my $j = shift;
  1483. my $body = shift;
  1484. my @X = @_;
  1485. my @insns = (&$body,&$body,&$body,&$body); # 104 instructions
  1486. foreach (Xupdate_256_AVX()) { # 29 instructions
  1487. eval;
  1488. eval(shift(@insns));
  1489. eval(shift(@insns));
  1490. eval(shift(@insns));
  1491. }
  1492. &vpaddd ($t2,@X[0],16*2*$j."($Tbl)");
  1493. foreach (@insns) { eval; } # remaining instructions
  1494. &vmovdqa (16*$j."(%rsp)",$t2);
  1495. }
  1496. for ($i=0,$j=0; $j<4; $j++) {
  1497. &AVX_256_00_47($j,\&body_00_15,@X);
  1498. push(@X,shift(@X)); # rotate(@X)
  1499. }
  1500. &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0);
  1501. &jne (".Lavx_00_47");
  1502. for ($i=0; $i<16; ) {
  1503. foreach(body_00_15()) { eval; }
  1504. }
  1505. } else { # SHA512
  1506. my @X = map("%xmm$_",(0..7));
  1507. my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11));
  1508. $code.=<<___;
  1509. jmp .Lloop_avx
  1510. .align 16
  1511. .Lloop_avx:
  1512. vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3
  1513. vmovdqu 0x00($inp),@X[0]
  1514. lea $TABLE+0x80(%rip),$Tbl # size optimization
  1515. vmovdqu 0x10($inp),@X[1]
  1516. vmovdqu 0x20($inp),@X[2]
  1517. vpshufb $t3,@X[0],@X[0]
  1518. vmovdqu 0x30($inp),@X[3]
  1519. vpshufb $t3,@X[1],@X[1]
  1520. vmovdqu 0x40($inp),@X[4]
  1521. vpshufb $t3,@X[2],@X[2]
  1522. vmovdqu 0x50($inp),@X[5]
  1523. vpshufb $t3,@X[3],@X[3]
  1524. vmovdqu 0x60($inp),@X[6]
  1525. vpshufb $t3,@X[4],@X[4]
  1526. vmovdqu 0x70($inp),@X[7]
  1527. vpshufb $t3,@X[5],@X[5]
  1528. vpaddq -0x80($Tbl),@X[0],$t0
  1529. vpshufb $t3,@X[6],@X[6]
  1530. vpaddq -0x60($Tbl),@X[1],$t1
  1531. vpshufb $t3,@X[7],@X[7]
  1532. vpaddq -0x40($Tbl),@X[2],$t2
  1533. vpaddq -0x20($Tbl),@X[3],$t3
  1534. vmovdqa $t0,0x00(%rsp)
  1535. vpaddq 0x00($Tbl),@X[4],$t0
  1536. vmovdqa $t1,0x10(%rsp)
  1537. vpaddq 0x20($Tbl),@X[5],$t1
  1538. vmovdqa $t2,0x20(%rsp)
  1539. vpaddq 0x40($Tbl),@X[6],$t2
  1540. vmovdqa $t3,0x30(%rsp)
  1541. vpaddq 0x60($Tbl),@X[7],$t3
  1542. vmovdqa $t0,0x40(%rsp)
  1543. mov $A,$a1
  1544. vmovdqa $t1,0x50(%rsp)
  1545. mov $B,$a3
  1546. vmovdqa $t2,0x60(%rsp)
  1547. xor $C,$a3 # magic
  1548. vmovdqa $t3,0x70(%rsp)
  1549. mov $E,$a0
  1550. jmp .Lavx_00_47
  1551. .align 16
  1552. .Lavx_00_47:
  1553. add \$`16*2*$SZ`,$Tbl
  1554. ___
  1555. sub Xupdate_512_AVX () {
  1556. (
  1557. '&vpalignr ($t0,@X[1],@X[0],$SZ)', # X[1..2]
  1558. '&vpalignr ($t3,@X[5],@X[4],$SZ)', # X[9..10]
  1559. '&vpsrlq ($t2,$t0,$sigma0[0])',
  1560. '&vpaddq (@X[0],@X[0],$t3);', # X[0..1] += X[9..10]
  1561. '&vpsrlq ($t3,$t0,$sigma0[2])',
  1562. '&vpsllq ($t1,$t0,8*$SZ-$sigma0[1]);',
  1563. '&vpxor ($t0,$t3,$t2)',
  1564. '&vpsrlq ($t2,$t2,$sigma0[1]-$sigma0[0]);',
  1565. '&vpxor ($t0,$t0,$t1)',
  1566. '&vpsllq ($t1,$t1,$sigma0[1]-$sigma0[0]);',
  1567. '&vpxor ($t0,$t0,$t2)',
  1568. '&vpsrlq ($t3,@X[7],$sigma1[2]);',
  1569. '&vpxor ($t0,$t0,$t1)', # sigma0(X[1..2])
  1570. '&vpsllq ($t2,@X[7],8*$SZ-$sigma1[1]);',
  1571. '&vpaddq (@X[0],@X[0],$t0)', # X[0..1] += sigma0(X[1..2])
  1572. '&vpsrlq ($t1,@X[7],$sigma1[0]);',
  1573. '&vpxor ($t3,$t3,$t2)',
  1574. '&vpsllq ($t2,$t2,$sigma1[1]-$sigma1[0]);',
  1575. '&vpxor ($t3,$t3,$t1)',
  1576. '&vpsrlq ($t1,$t1,$sigma1[1]-$sigma1[0]);',
  1577. '&vpxor ($t3,$t3,$t2)',
  1578. '&vpxor ($t3,$t3,$t1)', # sigma1(X[14..15])
  1579. '&vpaddq (@X[0],@X[0],$t3)', # X[0..1] += sigma1(X[14..15])
  1580. );
  1581. }
  1582. sub AVX_512_00_47 () {
  1583. my $j = shift;
  1584. my $body = shift;
  1585. my @X = @_;
  1586. my @insns = (&$body,&$body); # 52 instructions
  1587. foreach (Xupdate_512_AVX()) { # 23 instructions
  1588. eval;
  1589. eval(shift(@insns));
  1590. eval(shift(@insns));
  1591. }
  1592. &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)");
  1593. foreach (@insns) { eval; } # remaining instructions
  1594. &vmovdqa (16*$j."(%rsp)",$t2);
  1595. }
  1596. for ($i=0,$j=0; $j<8; $j++) {
  1597. &AVX_512_00_47($j,\&body_00_15,@X);
  1598. push(@X,shift(@X)); # rotate(@X)
  1599. }
  1600. &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0);
  1601. &jne (".Lavx_00_47");
  1602. for ($i=0; $i<16; ) {
  1603. foreach(body_00_15()) { eval; }
  1604. }
  1605. }
  1606. $code.=<<___;
  1607. mov $_ctx,$ctx
  1608. mov $a1,$A
  1609. add $SZ*0($ctx),$A
  1610. lea 16*$SZ($inp),$inp
  1611. add $SZ*1($ctx),$B
  1612. add $SZ*2($ctx),$C
  1613. add $SZ*3($ctx),$D
  1614. add $SZ*4($ctx),$E
  1615. add $SZ*5($ctx),$F
  1616. add $SZ*6($ctx),$G
  1617. add $SZ*7($ctx),$H
  1618. cmp $_end,$inp
  1619. mov $A,$SZ*0($ctx)
  1620. mov $B,$SZ*1($ctx)
  1621. mov $C,$SZ*2($ctx)
  1622. mov $D,$SZ*3($ctx)
  1623. mov $E,$SZ*4($ctx)
  1624. mov $F,$SZ*5($ctx)
  1625. mov $G,$SZ*6($ctx)
  1626. mov $H,$SZ*7($ctx)
  1627. jb .Lloop_avx
  1628. mov $_rsp,%rsi
  1629. vzeroupper
  1630. ___
  1631. $code.=<<___ if ($win64);
  1632. movaps 16*$SZ+32(%rsp),%xmm6
  1633. movaps 16*$SZ+48(%rsp),%xmm7
  1634. movaps 16*$SZ+64(%rsp),%xmm8
  1635. movaps 16*$SZ+80(%rsp),%xmm9
  1636. ___
  1637. $code.=<<___ if ($win64 && $SZ>4);
  1638. movaps 16*$SZ+96(%rsp),%xmm10
  1639. movaps 16*$SZ+112(%rsp),%xmm11
  1640. ___
  1641. $code.=<<___;
  1642. mov (%rsi),%r15
  1643. mov 8(%rsi),%r14
  1644. mov 16(%rsi),%r13
  1645. mov 24(%rsi),%r12
  1646. mov 32(%rsi),%rbp
  1647. mov 40(%rsi),%rbx
  1648. lea 48(%rsi),%rsp
  1649. .Lepilogue_avx:
  1650. ret
  1651. .size ${func}_avx,.-${func}_avx
  1652. ___
  1653. if ($avx>1) {{
  1654. ######################################################################
  1655. # AVX2+BMI code path
  1656. #
  1657. my $a5=$SZ==4?"%esi":"%rsi"; # zap $inp
  1658. my $PUSH8=8*2*$SZ;
  1659. use integer;
  1660. sub bodyx_00_15 () {
  1661. # at start $a1 should be zero, $a3 should hold $b^$c, and $a4 a copy of $f
  1662. (
  1663. '($a,$b,$c,$d,$e,$f,$g,$h)=@ROT;'.
  1664. '&add ($h,(32*($i/(16/$SZ))+$SZ*($i%(16/$SZ)))%$PUSH8.$base)', # h+=X[i]+K[i]
  1665. '&and ($a4,$e)', # f&e
  1666. '&rorx ($a0,$e,$Sigma1[2])',
  1667. '&rorx ($a2,$e,$Sigma1[1])',
  1668. '&lea ($a,"($a,$a1)")', # h+=Sigma0(a) from the past
  1669. '&lea ($h,"($h,$a4)")',
  1670. '&andn ($a4,$e,$g)', # ~e&g
  1671. '&xor ($a0,$a2)',
  1672. '&rorx ($a1,$e,$Sigma1[0])',
  1673. '&lea ($h,"($h,$a4)")', # h+=Ch(e,f,g)=(e&f)+(~e&g)
  1674. '&xor ($a0,$a1)', # Sigma1(e)
  1675. '&mov ($a2,$a)',
  1676. '&rorx ($a4,$a,$Sigma0[2])',
  1677. '&lea ($h,"($h,$a0)")', # h+=Sigma1(e)
  1678. '&xor ($a2,$b)', # a^b, b^c in next round
  1679. '&rorx ($a1,$a,$Sigma0[1])',
  1680. '&rorx ($a0,$a,$Sigma0[0])',
  1681. '&lea ($d,"($d,$h)")', # d+=h
  1682. '&and ($a3,$a2)', # (b^c)&(a^b)
  1683. '&xor ($a1,$a4)',
  1684. '&xor ($a3,$b)', # Maj(a,b,c)=Ch(a^b,c,b)
  1685. '&xor ($a1,$a0)', # Sigma0(a)
  1686. '&lea ($h,"($h,$a3)");'. # h+=Maj(a,b,c)
  1687. '&mov ($a4,$e)', # copy of f in future
  1688. '($a2,$a3) = ($a3,$a2); unshift(@ROT,pop(@ROT)); $i++;'
  1689. );
  1690. # and at the finish one has to $a+=$a1
  1691. }
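# Reading aid only: per round the snippet above computes, with BMI1/BMI2
# instructions (andn, rorx),
#	T1 = h + Sigma1(e) + Ch(e,f,g) + K[i] + X[i]
#	T2 = Sigma0(a) + Maj(a,b,c)
#	h=g; g=f; f=e; e=d+T1; d=c; c=b; b=a; a=T1+T2
# where the stack slot addressed via $base already holds X[i]+K[i],
# Ch(e,f,g) is evaluated as (e&f)+(~e&g), Maj(a,b,c) as b^((a^b)&(b^c)),
# and Sigma0(a) is deferred to the next round in $a1 - hence the final
# $a+=$a1 noted above.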
$code.=<<___;
.type	${func}_avx2,\@function,3
.align	64
${func}_avx2:
.Lavx2_shortcut:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	mov	%rsp,%r11		# copy %rsp
	sub	\$`2*$SZ*$rounds+4*8+$win64*16*($SZ==4?4:6)`,%rsp
	shl	\$4,%rdx		# num*16
	and	\$-256*$SZ,%rsp		# align stack frame
	lea	($inp,%rdx,$SZ),%rdx	# inp+num*16*$SZ
	add	\$`2*$SZ*($rounds-8)`,%rsp
	mov	$ctx,$_ctx		# save ctx, 1st arg
	mov	$inp,$_inp		# save inp, 2nd arg
	mov	%rdx,$_end		# save end pointer, "3rd" arg
	mov	%r11,$_rsp		# save copy of %rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,16*$SZ+32(%rsp)
	movaps	%xmm7,16*$SZ+48(%rsp)
	movaps	%xmm8,16*$SZ+64(%rsp)
	movaps	%xmm9,16*$SZ+80(%rsp)
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	%xmm10,16*$SZ+96(%rsp)
	movaps	%xmm11,16*$SZ+112(%rsp)
___
$code.=<<___;
.Lprologue_avx2:
	vzeroupper
	sub	\$-16*$SZ,$inp		# inp++, size optimization
	mov	$SZ*0($ctx),$A
	mov	$inp,%r12		# borrow $T1
	mov	$SZ*1($ctx),$B
	cmp	%rdx,$inp		# $_end
	mov	$SZ*2($ctx),$C
	cmove	%rsp,%r12		# next block or random data
	mov	$SZ*3($ctx),$D
	mov	$SZ*4($ctx),$E
	mov	$SZ*5($ctx),$F
	mov	$SZ*6($ctx),$G
	mov	$SZ*7($ctx),$H
___
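# The AVX2 path schedules two 16*$SZ-byte blocks per .Loop_avx2 iteration:
# the low 128-bit lanes are loaded from $inp, the high lanes (via the
# vinserti128 instructions below) from %r12, which normally points at the
# following block. When fewer than two blocks remain, the cmove above
# redirects %r12 to %rsp, so the high lanes chew on harmless stack data and
# the corresponding second-block rounds are skipped via je .Ldone_avx2, so
# their result is never used ("next block or random data").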
if ($SZ==4) {	# SHA256
    my @X = map("%ymm$_",(0..3));
    my ($t0,$t1,$t2,$t3, $t4,$t5) = map("%ymm$_",(4..9));

$code.=<<___;
	vmovdqa	$TABLE+`$SZ*2*$rounds`+32(%rip),$t4
	vmovdqa	$TABLE+`$SZ*2*$rounds`+64(%rip),$t5
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqa	$TABLE+`$SZ*2*$rounds`(%rip),$t3
	vmovdqu	-16*$SZ+0($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	vmovdqu	-16*$SZ+48($inp),%xmm3
	#mov	$inp,$_inp	# offload $inp
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t3,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t3,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	lea	$TABLE(%rip),$Tbl
	vpshufb	$t3,@X[2],@X[2]
	vpaddd	0x00($Tbl),@X[0],$t0
	vpshufb	$t3,@X[3],@X[3]
	vpaddd	0x20($Tbl),@X[1],$t1
	vpaddd	0x40($Tbl),@X[2],$t2
	vpaddd	0x60($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	mov	$B,$a3
	vmovdqa	$t2,0x00(%rsp)
	xor	$C,$a3		# magic
	vmovdqa	$t3,0x20(%rsp)
	mov	$F,$a4
	sub	\$-16*2*$SZ,$Tbl	# size optimization
	jmp	.Lavx2_00_47
.align	16
.Lavx2_00_47:
___

sub AVX2_256_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body,&$body,&$body);	# 96 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%2)==0);
	foreach (Xupdate_256_AVX()) {		# 29 instructions
	    eval;
	    eval(shift(@insns));
	    eval(shift(@insns));
	    eval(shift(@insns));
	}
	&vpaddd	($t2,@X[0],16*2*$j."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}

    for ($i=0,$j=0; $j<4; $j++) {
	&AVX2_256_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
} else {	# SHA512
    my @X = map("%ymm$_",(0..7));
    my ($t0,$t1,$t2,$t3) = map("%ymm$_",(8..11));

$code.=<<___;
	jmp	.Loop_avx2
.align	16
.Loop_avx2:
	vmovdqu	-16*$SZ($inp),%xmm0
	vmovdqu	-16*$SZ+16($inp),%xmm1
	vmovdqu	-16*$SZ+32($inp),%xmm2
	lea	$TABLE+0x80(%rip),$Tbl	# size optimization
	vmovdqu	-16*$SZ+48($inp),%xmm3
	vmovdqu	-16*$SZ+64($inp),%xmm4
	vmovdqu	-16*$SZ+80($inp),%xmm5
	vmovdqu	-16*$SZ+96($inp),%xmm6
	vmovdqu	-16*$SZ+112($inp),%xmm7
	#mov	$inp,$_inp	# offload $inp
	vmovdqa	`$SZ*2*$rounds-0x80`($Tbl),$t2
	vinserti128	\$1,(%r12),@X[0],@X[0]
	vinserti128	\$1,16(%r12),@X[1],@X[1]
	vpshufb	$t2,@X[0],@X[0]
	vinserti128	\$1,32(%r12),@X[2],@X[2]
	vpshufb	$t2,@X[1],@X[1]
	vinserti128	\$1,48(%r12),@X[3],@X[3]
	vpshufb	$t2,@X[2],@X[2]
	vinserti128	\$1,64(%r12),@X[4],@X[4]
	vpshufb	$t2,@X[3],@X[3]
	vinserti128	\$1,80(%r12),@X[5],@X[5]
	vpshufb	$t2,@X[4],@X[4]
	vinserti128	\$1,96(%r12),@X[6],@X[6]
	vpshufb	$t2,@X[5],@X[5]
	vinserti128	\$1,112(%r12),@X[7],@X[7]
	vpaddq	-0x80($Tbl),@X[0],$t0
	vpshufb	$t2,@X[6],@X[6]
	vpaddq	-0x60($Tbl),@X[1],$t1
	vpshufb	$t2,@X[7],@X[7]
	vpaddq	-0x40($Tbl),@X[2],$t2
	vpaddq	-0x20($Tbl),@X[3],$t3
	vmovdqa	$t0,0x00(%rsp)
	vpaddq	0x00($Tbl),@X[4],$t0
	vmovdqa	$t1,0x20(%rsp)
	vpaddq	0x20($Tbl),@X[5],$t1
	vmovdqa	$t2,0x40(%rsp)
	vpaddq	0x40($Tbl),@X[6],$t2
	vmovdqa	$t3,0x60(%rsp)
	lea	-$PUSH8(%rsp),%rsp
	vpaddq	0x60($Tbl),@X[7],$t3
	vmovdqa	$t0,0x00(%rsp)
	xor	$a1,$a1
	vmovdqa	$t1,0x20(%rsp)
	mov	$B,$a3
	vmovdqa	$t2,0x40(%rsp)
	xor	$C,$a3		# magic
	vmovdqa	$t3,0x60(%rsp)
	mov	$F,$a4
	add	\$16*2*$SZ,$Tbl
	jmp	.Lavx2_00_47
.align	16
.Lavx2_00_47:
___

sub AVX2_512_00_47 () {
my $j = shift;
my $body = shift;
my @X = @_;
my @insns = (&$body,&$body);			# 48 instructions
my $base = "+2*$PUSH8(%rsp)";

	&lea	("%rsp","-$PUSH8(%rsp)")	if (($j%4)==0);
	foreach (Xupdate_512_AVX()) {		# 23 instructions
	    eval;
	    if ($_ !~ /\;$/) {
		eval(shift(@insns));
		eval(shift(@insns));
		eval(shift(@insns));
	    }
	}
	&vpaddq	($t2,@X[0],16*2*$j-0x80."($Tbl)");
	foreach (@insns) { eval; }		# remaining instructions
	&vmovdqa	((32*$j)%$PUSH8."(%rsp)",$t2);
}
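# Note on the $_ !~ /\;$/ test above: Xupdate_512_AVX() entries that end in
# ';' are not followed by interleaved round-body instructions, i.e. they are
# issued back to back with the next vector instruction; only the remaining
# entries get three scalar round instructions scheduled after them, and any
# left-over @insns are flushed by the trailing foreach loop in the sub.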
    for ($i=0,$j=0; $j<8; $j++) {
	&AVX2_512_00_47($j,\&bodyx_00_15,@X);
	push(@X,shift(@X));			# rotate(@X)
    }
	&lea	($Tbl,16*2*$SZ."($Tbl)");
	&cmpb	(($SZ-1-0x80)."($Tbl)",0);
	&jne	(".Lavx2_00_47");

    for ($i=0; $i<16; ) {
	my $base=$i<8?"+$PUSH8(%rsp)":"(%rsp)";
	foreach(bodyx_00_15()) { eval; }
    }
}
$code.=<<___;
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),$Tbl
	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	add	$SZ*6($ctx),$G
	add	$SZ*7($ctx),$H
	mov	$A,$SZ*0($ctx)
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	cmp	`$PUSH8+2*8`($Tbl),$inp	# $_end
	je	.Ldone_avx2
	xor	$a1,$a1
	mov	$B,$a3
	xor	$C,$a3		# magic
	mov	$F,$a4
	jmp	.Lower_avx2
.align	16
.Lower_avx2:
___
    for ($i=0; $i<8; ) {
	my $base="+16($Tbl)";
	foreach(bodyx_00_15()) { eval; }
    }
$code.=<<___;
	lea	-$PUSH8($Tbl),$Tbl
	cmp	%rsp,$Tbl
	jae	.Lower_avx2
	mov	`2*$SZ*$rounds`(%rsp),$ctx	# $_ctx
	add	$a1,$A
	#mov	`2*$SZ*$rounds+8`(%rsp),$inp	# $_inp
	lea	`2*$SZ*($rounds-8)`(%rsp),%rsp
	add	$SZ*0($ctx),$A
	add	$SZ*1($ctx),$B
	add	$SZ*2($ctx),$C
	add	$SZ*3($ctx),$D
	add	$SZ*4($ctx),$E
	add	$SZ*5($ctx),$F
	lea	`2*16*$SZ`($inp),$inp	# inp+=2
	add	$SZ*6($ctx),$G
	mov	$inp,%r12
	add	$SZ*7($ctx),$H
	cmp	$_end,$inp
	mov	$A,$SZ*0($ctx)
	cmove	%rsp,%r12		# next block or stale data
	mov	$B,$SZ*1($ctx)
	mov	$C,$SZ*2($ctx)
	mov	$D,$SZ*3($ctx)
	mov	$E,$SZ*4($ctx)
	mov	$F,$SZ*5($ctx)
	mov	$G,$SZ*6($ctx)
	mov	$H,$SZ*7($ctx)
	jbe	.Loop_avx2
	lea	(%rsp),$Tbl
.Ldone_avx2:
	lea	($Tbl),%rsp
	mov	$_rsp,%rsi
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	16*$SZ+32(%rsp),%xmm6
	movaps	16*$SZ+48(%rsp),%xmm7
	movaps	16*$SZ+64(%rsp),%xmm8
	movaps	16*$SZ+80(%rsp),%xmm9
___
$code.=<<___ if ($win64 && $SZ>4);
	movaps	16*$SZ+96(%rsp),%xmm10
	movaps	16*$SZ+112(%rsp),%xmm11
___
$code.=<<___;
	mov	(%rsi),%r15
	mov	8(%rsi),%r14
	mov	16(%rsi),%r13
	mov	24(%rsi),%r12
	mov	32(%rsi),%rbp
	mov	40(%rsi),%rbx
	lea	48(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	${func}_avx2,.-${func}_avx2
___
}}
}}}}}

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lin_prologue
	mov	152($context),%rax	# pull context->Rsp
	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lin_prologue
___
$code.=<<___ if ($avx>1);
	lea	.Lavx2_shortcut(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<avx2_shortcut
	jb	.Lnot_in_avx2
	and	\$-256*$SZ,%rax
	add	\$`2*$SZ*($rounds-8)`,%rax
.Lnot_in_avx2:
___
$code.=<<___;
	mov	%rax,%rsi		# put aside Rsp
	mov	16*$SZ+3*8(%rax),%rax	# pull $_rsp
	lea	48(%rax),%rax
	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15
	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx
	jb	.Lin_prologue		# non-AVX code
	lea	16*$SZ+4*8(%rsi),%rsi	# Xmm6- save area
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$`$SZ==4?8:12`,%ecx	# 4 or 6 xmm registers, in quadwords
	.long	0xa548f3fc		# cld; rep movsq
.Lin_prologue:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)/8
	.long	0xa548f3fc		# cld; rep movsq
	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)
	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	se_handler,.-se_handler
___
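# In short: se_handler inspects the faulting RIP; inside the
# prologue/epilogue window it leaves the context alone, otherwise it
# recovers the original %rsp (for the AVX2 frame by re-deriving it from the
# aligned stack, for the other paths from the saved $_rsp slot), reloads
# %rbx/%rbp/%r12-%r15 from the frame, copies the saved %xmm6+ registers back
# into the CONTEXT record for the SIMD paths, and finally calls
# RtlVirtualUnwind so the search for an exception handler can continue.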
$code.=<<___ if ($SZ == 4 && $shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp
	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip
	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue_shaext
	jb	.Lin_prologue
	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue_shaext
	jae	.Lin_prologue
	lea	-8-5*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$10,%ecx
	.long	0xa548f3fc		# cld; rep movsq
	jmp	.Lin_prologue
.size	shaext_handler,.-shaext_handler
___
$code.=<<___;
.section	.pdata
.align	4
	.rva	.LSEH_begin_$func
	.rva	.LSEH_end_$func
	.rva	.LSEH_info_$func
___
$code.=<<___ if ($SZ==4 && $shaext);
	.rva	.LSEH_begin_${func}_shaext
	.rva	.LSEH_end_${func}_shaext
	.rva	.LSEH_info_${func}_shaext
___
$code.=<<___ if ($SZ==4);
	.rva	.LSEH_begin_${func}_ssse3
	.rva	.LSEH_end_${func}_ssse3
	.rva	.LSEH_info_${func}_ssse3
___
$code.=<<___ if ($avx && $SZ==8);
	.rva	.LSEH_begin_${func}_xop
	.rva	.LSEH_end_${func}_xop
	.rva	.LSEH_info_${func}_xop
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_${func}_avx
	.rva	.LSEH_end_${func}_avx
	.rva	.LSEH_info_${func}_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_${func}_avx2
	.rva	.LSEH_end_${func}_avx2
	.rva	.LSEH_info_${func}_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_$func:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue,.Lepilogue			# HandlerData[]
___
$code.=<<___ if ($SZ==4 && $shaext);
.LSEH_info_${func}_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___ if ($SZ==4);
.LSEH_info_${func}_ssse3:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx && $SZ==8);
.LSEH_info_${func}_xop:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_xop,.Lepilogue_xop		# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_${func}_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_${func}_avx2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

sub sha256op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha256rnds2" => 0xcb,
		"sha256msg1"  => 0xcc,
		"sha256msg2"  => 0xcd	);

    if (defined($opcodelet{$instr}) && $_[0] =~ /%xmm([0-7]),\s*%xmm([0-7])/) {
	my @opcode=(0x0f,0x38);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".$_[0];
    }
}
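# sha256op38() evidently works around assemblers that do not know the SHA
# extension mnemonics by emitting raw opcode bytes for register-register
# forms, e.g. "sha256rnds2 %xmm4,%xmm5" (with %xmm0 as the implicit third
# operand) becomes ".byte 0x0f,0x38,0xcb,0xec"; anything it cannot match is
# passed through to the assembler untouched.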
foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha256[^\s]*)\s+(.*)/sha256op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;