#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# sha1_block procedure for x86_64.
#
# It was brought to my attention that on EM64T compiler-generated code
# was far behind 32-bit assembler implementation. This is unlike on
# Opteron where compiler-generated code was only 15% behind 32-bit
# assembler, which originally made it hard to motivate the effort.
# There was a suggestion to mechanically translate 32-bit code, but I
# dismissed it, reasoning that x86_64 offers enough register bank
# capacity to fully utilize SHA-1 parallelism. Therefore this fresh
# implementation:-) However! While 64-bit code does perform better
# on Opteron, I failed to beat 32-bit assembler on EM64T core. Well,
# x86_64 does offer larger *addressable* bank, but out-of-order core
# reaches for even more registers through dynamic aliasing, and EM64T
# core must have managed to run-time optimize even 32-bit code just
# as well as 64-bit one. Performance improvement is summarized in the
# following table:
#
#                gcc 3.4    32-bit asm    cycles/byte
# Opteron        +45%       +20%          6.8
# Xeon P4        +65%       +0%           9.9
# Core2          +60%       +10%          7.0
#
# August 2009.
#
# The code was revised to minimize code size and to maximize
# "distance" between instructions producing input to 'lea'
# instruction and the 'lea' instruction itself, which is essential
# for Intel Atom core.
#
# October 2010.
#
# Add SSSE3, Supplemental[!] SSE3, implementation. The idea behind it
# is to offload message schedule denoted by Wt in NIST specification,
# or Xupdate in OpenSSL source, to SIMD unit. See sha1-586.pl module
# for background and implementation details. The only difference from
# 32-bit code is that 64-bit code doesn't have to spill @X[] elements
# to free temporary registers.
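#
# For orientation (an illustrative sketch, not part of the module):
# the message schedule being offloaded is, in FIPS 180-4 terms,
#
#	W[t] = ROL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1)
#
# or, as plain Perl over 32-bit words:
#
#	sub Xupdate_ref {
#	    my ($X,$t) = @_;	# $X - array ref of W[], $t >= 16
#	    my $w = $X->[$t-3] ^ $X->[$t-8] ^ $X->[$t-14] ^ $X->[$t-16];
#	    $X->[$t] = (($w << 1) | ($w >> 31)) & 0xffffffff;	# rotate left by 1
#	}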
#
# April 2011.
#
# Add AVX code path. See sha1-586.pl for further information.
#
# May 2013.
#
# Add AVX2+BMI code path. Initial attempt (utilizing BMI instructions
# and loading pair of consecutive blocks to 256-bit %ymm registers)
# did not provide impressive performance improvement till a crucial
# hint regarding the number of Xupdate iterations to pre-compute in
# advance was provided by Ilya Albrekht of Intel Corp.
#
# March 2014.
#
# Add support for Intel SHA Extensions.
#
######################################################################
# Current performance is summarized in following table. Numbers are
# CPU clock cycles spent to process single byte (less is better).
#
#                x86_64     SSSE3       AVX[2]
# P4             9.05       -
# Opteron        6.26       -
# Core2          6.55       6.05/+8%    -
# Westmere       6.73       5.30/+27%   -
# Sandy Bridge   7.70       6.10/+26%   4.99/+54%
# Ivy Bridge     6.06       4.67/+30%   4.60/+32%
# Haswell        5.45       4.15/+31%   3.57/+53%
# Bulldozer      9.11       5.95/+53%
# VIA Nano       9.32       7.15/+30%
# Atom           10.3       9.17/+12%
# Silvermont     13.1(*)    9.37/+40%
#
# (*) obviously suboptimal result, nothing was done about it,
#     because SSSE3 code is compiled unconditionally;
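#
# A note on usage (assumed from the shared perlasm conventions, not
# stated in this file): the script takes the output flavour as its
# first argument ("elf", "macosx", "mingw64", "nasm", "masm", ...)
# and an optional output file name, e.g.
#
#	perl sha1-x86_64.pl elf sha1-x86_64.s
#
# The probes below leave $avx at 0, 1 or 2: 1 enables the AVX code
# path, 2 additionally enables the AVX2+BMI one.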

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
	=~ /GNU assembler version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.19) + ($1>=2.22);
}

if (!$avx && $win64 && ($flavour =~ /nasm/ || $ENV{ASM} =~ /nasm/) &&
	`nasm -v 2>&1` =~ /NASM version ([2-9]\.[0-9]+)/) {
	$avx = ($1>=2.09) + ($1>=2.10);
}

if (!$avx && $win64 && ($flavour =~ /masm/ || $ENV{ASM} =~ /ml64/) &&
	`ml64 2>&1` =~ /Version ([0-9]+)\./) {
	$avx = ($1>=10) + ($1>=11);
}

if (!$avx && `$ENV{CC} -v 2>&1` =~ /(^clang version|based on LLVM) ([2-9]\.[0-9]+)/) {
	$avx = ($2>=3.0) + ($2>3.0);
}

$shaext=0;	### set to zero if compiling for 1.0.1
$avx=1 if (!$shaext && $avx);

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;

$ctx="%rdi";	# 1st arg
$inp="%rsi";	# 2nd arg
$num="%rdx";	# 3rd arg

# reassign arguments in order to produce more compact code
$ctx="%r8";
$inp="%r9";
$num="%r10";

$t0="%eax";
$t1="%ebx";
$t2="%ecx";
@xi=("%edx","%ebp","%r14d");
$A="%esi";
$B="%edi";
$C="%r11d";
$D="%r12d";
$E="%r13d";
@V=($A,$B,$C,$D,$E);
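
# Reference for the scalar rounds below (an illustrative sketch, not
# used by the generator): each SHA-1 round computes
#
#	$e += rol($a,5) + F($b,$c,$d) + $K + X[$i];  $b = rol($b,30);
#
# BODY_00_19 uses F = Ch($b,$c,$d) = ($b&$c)|(~$b&$d), implemented
# branchlessly via the equivalent (($c^$d)&$b)^$d, e.g. in plain Perl:
#
#	sub F_00_19_ref { my ($b,$c,$d)=@_; (($c^$d)&$b)^$d }	# reference only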

sub BODY_00_19 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___ if ($i==0);
	mov	`4*$i`($inp),$xi[0]
	bswap	$xi[0]
___
$code.=<<___ if ($i<15);
	mov	`4*$j`($inp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*$i`(%rsp)
	mov	$a,$t2
	bswap	$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	add	$t2,$e
	xor	$d,$t0
	rol	\$30,$b
	add	$t0,$e
___
$code.=<<___ if ($i>=15);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$c,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	and	$b,$t0
	lea	0x5a827999($xi[0],$e),$e
	rol	\$30,$b
	xor	$d,$t0
	add	$t2,$e
	rol	\$1,$xi[1]
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_20_39 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
my $K=($i<40)?0x6ed9eba1:0xca62c1d6;
$code.=<<___ if ($i<79);
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$b,$t0
	`"mov	$xi[0],".4*($i%16)."(%rsp)"	if ($i<72)`
	mov	$a,$t2
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	xor	$d,$t0
	rol	\$5,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	$K($xi[0],$e),$e
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
	rol	\$1,$xi[1]
___
$code.=<<___ if ($i==79);
	mov	$b,$t0
	mov	$a,$t2
	xor	$d,$t0
	lea	$K($xi[0],$e),$e
	rol	\$5,$t2
	xor	$c,$t0
	add	$t2,$e
	rol	\$30,$b
	add	$t0,$e
___
push(@xi,shift(@xi));
}

sub BODY_40_59 {
my ($i,$a,$b,$c,$d,$e)=@_;
my $j=$i+1;
$code.=<<___;
	xor	`4*($j%16)`(%rsp),$xi[1]
	mov	$d,$t0
	mov	$xi[0],`4*($i%16)`(%rsp)
	mov	$d,$t1
	xor	`4*(($j+2)%16)`(%rsp),$xi[1]
	and	$c,$t0
	mov	$a,$t2
	xor	`4*(($j+8)%16)`(%rsp),$xi[1]
	lea	0x8f1bbcdc($xi[0],$e),$e
	xor	$c,$t1
	rol	\$5,$t2
	add	$t0,$e
	rol	\$1,$xi[1]
	and	$b,$t1
	add	$t2,$e
	rol	\$30,$b
	add	$t1,$e
___
push(@xi,shift(@xi));
}
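
# A note on BODY_40_59 above: F = Maj($b,$c,$d) is accumulated into $e
# as two separate terms, $t0 = $c&$d and $t1 = $b&($c^$d). Adding the
# terms one by one instead of combining them with 'or' is safe because
# they are bitwise disjoint: wherever $c&$d has a bit set, $c^$d is zero.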

$code.=<<___;
.text
.extern	OPENSSL_ia32cap_P

.globl	sha1_block_data_order
.type	sha1_block_data_order,\@function,3
.align	16
sha1_block_data_order:
	mov	OPENSSL_ia32cap_P+0(%rip),%r9d
	mov	OPENSSL_ia32cap_P+4(%rip),%r8d
	mov	OPENSSL_ia32cap_P+8(%rip),%r10d
	test	\$`1<<9`,%r8d		# check SSSE3 bit
	jz	.Lialu
___
$code.=<<___ if ($shaext);
	test	\$`1<<29`,%r10d		# check SHA bit
	jnz	_shaext_shortcut
___
$code.=<<___ if ($avx>1);
	and	\$`1<<3|1<<5|1<<8`,%r10d	# check AVX2+BMI1+BMI2
	cmp	\$`1<<3|1<<5|1<<8`,%r10d
	je	_avx2_shortcut
___
$code.=<<___ if ($avx);
	and	\$`1<<28`,%r8d		# mask AVX bit
	and	\$`1<<30`,%r9d		# mask "Intel CPU" bit
	or	%r9d,%r8d
	cmp	\$`1<<28|1<<30`,%r8d
	je	_avx_shortcut
___
$code.=<<___;
	jmp	_ssse3_shortcut

.align	16
.Lialu:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	mov	%rdi,$ctx	# reassigned argument
	sub	\$`8+16*4`,%rsp
	mov	%rsi,$inp	# reassigned argument
	and	\$-64,%rsp
	mov	%rdx,$num	# reassigned argument
	mov	%rax,`16*4`(%rsp)
.Lprologue:

	mov	0($ctx),$A
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	jmp	.Lloop

.align	16
.Lloop:
___
for($i=0;$i<20;$i++)	{ &BODY_00_19($i,@V); unshift(@V,pop(@V)); }
for(;$i<40;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
for(;$i<60;$i++)	{ &BODY_40_59($i,@V); unshift(@V,pop(@V)); }
for(;$i<80;$i++)	{ &BODY_20_39($i,@V); unshift(@V,pop(@V)); }
$code.=<<___;
	add	0($ctx),$A
	add	4($ctx),$B
	add	8($ctx),$C
	add	12($ctx),$D
	add	16($ctx),$E
	mov	$A,0($ctx)
	mov	$B,4($ctx)
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)

	sub	\$1,$num
	lea	`16*4`($inp),$inp
	jnz	.Lloop

	mov	`16*4`(%rsp),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue:
	ret
.size	sha1_block_data_order,.-sha1_block_data_order
___

if ($shaext) {{{
######################################################################
# Intel SHA Extensions implementation of SHA1 update function.
#
my ($ctx,$inp,$num)=("%rdi","%rsi","%rdx");
my ($ABCD,$E,$E_,$BSWAP,$ABCD_SAVE,$E_SAVE)=map("%xmm$_",(0..3,8,9));
my @MSG=map("%xmm$_",(4..7));

$code.=<<___;
.type	sha1_block_data_order_shaext,\@function,3
.align	32
sha1_block_data_order_shaext:
_shaext_shortcut:
___
$code.=<<___ if ($win64);
	lea	`-8-4*16`(%rsp),%rsp
	movaps	%xmm6,-8-4*16(%rax)
	movaps	%xmm7,-8-3*16(%rax)
	movaps	%xmm8,-8-2*16(%rax)
	movaps	%xmm9,-8-1*16(%rax)
.Lprologue_shaext:
___
$code.=<<___;
	movdqu	($ctx),$ABCD
	movd	16($ctx),$E
	movdqa	K_XX_XX+0xa0(%rip),$BSWAP	# byte-n-word swap

	movdqu	($inp),@MSG[0]
	pshufd	\$0b00011011,$ABCD,$ABCD	# flip word order
	movdqu	0x10($inp),@MSG[1]
	pshufd	\$0b00011011,$E,$E		# flip word order
	movdqu	0x20($inp),@MSG[2]
	pshufb	$BSWAP,@MSG[0]
	movdqu	0x30($inp),@MSG[3]
	pshufb	$BSWAP,@MSG[1]
	pshufb	$BSWAP,@MSG[2]
	movdqa	$E,$E_SAVE			# offload $E
	pshufb	$BSWAP,@MSG[3]

	jmp	.Loop_shaext

.align	16
.Loop_shaext:
	dec	$num
	lea	0x40($inp),%rax			# next input block
	paddd	@MSG[0],$E
	cmovne	%rax,$inp
	movdqa	$ABCD,$ABCD_SAVE		# offload $ABCD
___
for($i=0;$i<20-4;$i+=2) {
$code.=<<___;
	sha1msg1	@MSG[1],@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$`int($i/5)`,$E,$ABCD	# 0-3...
	sha1nexte	@MSG[1],$E_
	pxor		@MSG[2],@MSG[0]
	sha1msg1	@MSG[2],@MSG[1]
	sha1msg2	@MSG[3],@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$`int(($i+1)/5)`,$E_,$ABCD
	sha1nexte	@MSG[2],$E
	pxor		@MSG[3],@MSG[1]
	sha1msg2	@MSG[0],@MSG[1]
___
	push(@MSG,shift(@MSG));	push(@MSG,shift(@MSG));
}
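
# How the loop above maps to the SHA extension ISA (a summary, hedged
# against the Intel SDM rather than anything stated in this file):
# each sha1rnds4 performs four rounds, with its immediate selecting the
# round group's F/K pair - hence int($i/5), five double-iterations per
# twenty-round group; sha1nexte folds the rotated E state into the next
# message quadword, and sha1msg1/sha1msg2 plus the pxor implement the
# four-wide Xupdate. Rounds 64-79 follow below, while the next input
# block is pre-loaded and byte-swapped.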
$code.=<<___;
	movdqu		($inp),@MSG[0]
	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 64-67
	sha1nexte	@MSG[1],$E_
	movdqu		0x10($inp),@MSG[1]
	pshufb		$BSWAP,@MSG[0]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 68-71
	sha1nexte	@MSG[2],$E
	movdqu		0x20($inp),@MSG[2]
	pshufb		$BSWAP,@MSG[1]

	movdqa		$ABCD,$E_
	sha1rnds4	\$3,$E,$ABCD		# 72-75
	sha1nexte	@MSG[3],$E_
	movdqu		0x30($inp),@MSG[3]
	pshufb		$BSWAP,@MSG[2]

	movdqa		$ABCD,$E
	sha1rnds4	\$3,$E_,$ABCD		# 76-79
	sha1nexte	$E_SAVE,$E
	pshufb		$BSWAP,@MSG[3]

	paddd		$ABCD_SAVE,$ABCD
	movdqa		$E,$E_SAVE		# offload $E

	jnz	.Loop_shaext

	pshufd	\$0b00011011,$ABCD,$ABCD
	pshufd	\$0b00011011,$E,$E
	movdqu	$ABCD,($ctx)
	movd	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-8-4*16(%rax),%xmm6
	movaps	-8-3*16(%rax),%xmm7
	movaps	-8-2*16(%rax),%xmm8
	movaps	-8-1*16(%rax),%xmm9
	mov	%rax,%rsp
.Lepilogue_shaext:
___
$code.=<<___;
	ret
.size	sha1_block_data_order_shaext,.-sha1_block_data_order_shaext
___
}}}

{{{
my $Xi=4;
my @X=map("%xmm$_",(4..7,0..3));
my @Tx=map("%xmm$_",(8..10));
my $Kx="%xmm11";
my @V=($A,$B,$C,$D,$E)=("%eax","%ebx","%ecx","%edx","%ebp");	# size optimization
my @T=("%esi","%edi");
my $j=0;
my $rx=0;
my $K_XX_XX="%r11";

my $_rol=sub { &rol(@_) };
my $_ror=sub { &ror(@_) };

{ my $sn;
sub align32() {
  ++$sn;
$code.=<<___;
	jmp	.Lalign32_$sn	# see "Decoded ICache" in manual
.align	32
.Lalign32_$sn:
___
}
}

$code.=<<___;
.type	sha1_block_data_order_ssse3,\@function,3
.align	16
sha1_block_data_order_ssse3:
_ssse3_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
___
$code.=<<___ if ($win64);
	movaps	%xmm6,-40-6*16(%rax)
	movaps	%xmm7,-40-5*16(%rax)
	movaps	%xmm8,-40-4*16(%rax)
	movaps	%xmm9,-40-3*16(%rax)
	movaps	%xmm10,-40-2*16(%rax)
	movaps	%xmm11,-40-1*16(%rax)
.Lprologue_ssse3:
___
$code.=<<___;
	mov	%rax,%r14	# original %rsp
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	movdqa	64($K_XX_XX),@X[2]	# pbswap mask
	movdqa	-64($K_XX_XX),@Tx[1]	# K_00_19
	movdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	movdqu	16($inp),@X[-3&7]
	movdqu	32($inp),@X[-2&7]
	movdqu	48($inp),@X[-1&7]
	pshufb	@X[2],@X[-4&7]		# byte swap
	pshufb	@X[2],@X[-3&7]
	pshufb	@X[2],@X[-2&7]
	add	\$64,$inp
	paddd	@Tx[1],@X[-4&7]		# add K_00_19
	pshufb	@X[2],@X[-1&7]
	paddd	@Tx[1],@X[-3&7]
	paddd	@Tx[1],@X[-2&7]
	movdqa	@X[-4&7],0(%rsp)	# X[]+K xfer to IALU
	psubd	@Tx[1],@X[-4&7]		# restore X[]
	movdqa	@X[-3&7],16(%rsp)
	psubd	@Tx[1],@X[-3&7]
	movdqa	@X[-2&7],32(%rsp)
	psubd	@Tx[1],@X[-2&7]
	jmp	.Loop_ssse3
___

sub AUTOLOAD()		# thunk [simplified] 32-bit style perlasm
{ my $opcode = $AUTOLOAD; $opcode =~ s/.*:://;
  my $arg = pop;
  $arg = "\$$arg" if ($arg*1 eq $arg);
  $code .= "\t$opcode\t".join(',',$arg,reverse @_)."\n";
}
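
# Example of the thunk at work (illustrative): with @Tx[0] eq "%xmm8",
# a call such as &psrld(@Tx[0],31) appends "psrld $31,%xmm8" to $code;
# the last (numeric) argument becomes the immediate first operand and
# the remaining arguments are emitted in reverse, AT&T style.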

sub Xupdate_ssse3_16_31()	# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	eval(shift(@insns));		# ror
	&pshufd	(@X[0],@X[-4&7],0xee);	# was &movdqa (@X[0],@X[-3&7]);
	eval(shift(@insns));
	&movdqa	(@Tx[0],@X[-1&7]);
	&paddd	(@Tx[1],@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));
	&punpcklqdq(@X[0],@X[-3&7]);	# compose "X[-14]" in "X[0]", was &palignr(@X[0],@X[-4&7],8);
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns));
	&psrldq	(@Tx[0],4);		# "X[-3]", 3 dwords
	eval(shift(@insns));
	eval(shift(@insns));
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"^="X[-16]"
	eval(shift(@insns));
	eval(shift(@insns));		# ror
	&pxor	(@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	(@Tx[2],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# ror
	&movdqa	(@Tx[0],@X[0]);
	eval(shift(@insns));
	&pslldq	(@Tx[2],12);		# "X[0]"<<96, extract one dword
	&paddd	(@X[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld	(@Tx[0],31);
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns));
	&movdqa	(@Tx[1],@Tx[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld	(@Tx[2],30);
	eval(shift(@insns));
	eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=1
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&pslld	(@Tx[1],2);
	&pxor	(@X[0],@Tx[2]);
	eval(shift(@insns));
	&movdqa	(@Tx[2],eval(2*16*(($Xi)/5)-64)."($K_XX_XX)");	# K_XX_XX
	eval(shift(@insns));		# rol
	eval(shift(@insns));
	eval(shift(@insns));
	&pxor	(@X[0],@Tx[1]);		# "X[0]"^=("X[0]">>96)<<<2
	&pshufd	(@Tx[1],@X[-1&7],0xee)	if ($Xi==7);	# was &movdqa (@Tx[0],@X[-1&7]) in Xupdate_ssse3_32_79

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xupdate_ssse3_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	eval(shift(@insns))		if ($Xi==8);
	&pxor	(@X[0],@X[-4&7]);	# "X[0]"="X[-32]"^"X[-16]"
	eval(shift(@insns))		if ($Xi==8);
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns));
	eval(shift(@insns))		if (@insns[1] =~ /_ror/);
	eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&punpcklqdq(@Tx[0],@X[-1&7]);	# compose "X[-6]", was &palignr(@Tx[0],@X[-2&7],8);
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	&pxor	(@X[0],@X[-7&7]);	# "X[0]"^="X[-28]"
	eval(shift(@insns));
	eval(shift(@insns));
	if ($Xi%5) {
	  &movdqa	(@Tx[2],@Tx[1]);	# "perpetuate" K_XX_XX...
	} else {			# ... or load next one
	  &movdqa	(@Tx[2],eval(2*16*($Xi/5)-64)."($K_XX_XX)");
	}
	eval(shift(@insns));		# ror
	&paddd	(@Tx[1],@X[-1&7]);
	eval(shift(@insns));
	&pxor	(@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns))		if (@insns[0] =~ /_ror/);
	&movdqa	(@Tx[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	eval(shift(@insns));		# ror
	eval(shift(@insns));
	eval(shift(@insns));		# body_20_39
	&pslld	(@X[0],2);
	eval(shift(@insns));
	eval(shift(@insns));
	&psrld	(@Tx[0],30);
	eval(shift(@insns))		if (@insns[0] =~ /_rol/);	# rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# ror
	&por	(@X[0],@Tx[0]);		# "X[0]"<<<=2
	eval(shift(@insns));
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns))		if (@insns[1] =~ /_rol/);
	eval(shift(@insns))		if (@insns[0] =~ /_rol/);
	&pshufd(@Tx[1],@X[-1&7],0xee)	if ($Xi<19);	# was &movdqa (@Tx[1],@X[0])
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
		push(@Tx,shift(@Tx));
}

sub Xuplast_ssse3_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&paddd	(@Tx[1],@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	(eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }	# remaining instructions

	&cmp	($inp,$num);
	&je	(".Ldone_ssse3");

	unshift(@Tx,pop(@Tx));

	&movdqa	(@X[2],"64($K_XX_XX)");		# pbswap mask
	&movdqa	(@Tx[1],"-64($K_XX_XX)");	# K_00_19
	&movdqu	(@X[-4&7],"0($inp)");		# load input
	&movdqu	(@X[-3&7],"16($inp)");
	&movdqu	(@X[-2&7],"32($inp)");
	&movdqu	(@X[-1&7],"48($inp)");
	&pshufb	(@X[-4&7],@X[2]);		# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&pshufb	(@X[($Xi-3)&7],@X[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&paddd	(@X[($Xi-4)&7],@Tx[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&movdqa	(eval(16*$Xi)."(%rsp)",@X[($Xi-4)&7]);	# X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&psubd	(@X[($Xi-4)&7],@Tx[1]);

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_ssse3()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

sub body_00_19 () {	# ((c^d)&b)^d
	# on start @T[0]=(c^d)&b
	return &body_20_39() if ($rx==19); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&$_ror	($b,$j?7:2)',		# $b>>>2
	'&xor	(@T[0],$d)',
	'&mov	(@T[1],$a)',		# $b for next round
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	($b,$c)',		# $c^$d for next round
	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&and	(@T[1],$b)',		# ($b&($c^$d)) for next round
	'&xor	($b,$c)',		# restore $b
	'&add	($e,$a);'.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_20_39 () {	# b^d^c
	# on entry @T[0]=b^d
	return &body_40_59() if ($rx==39); $rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&xor	(@T[0],$d)	if($j==19);'.
	'&xor	(@T[0],$c)	if($j> 19)',	# ($b^$d^$c)
	'&mov	(@T[1],$a)',	# $b for next round
	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j< 79)',	# $b^$d for next round
	'&$_ror	($b,7)',	# $b>>>2
	'&add	($e,$a);'.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

sub body_40_59 () {	# ((b^c)&(c^d))^c
	# on entry @T[0]=(b^c), (c^=d)
	$rx++;
	(
	'($a,$b,$c,$d,$e)=@V;'.
	'&add	($e,eval(4*($j&15))."(%rsp)")',	# X[]+K xfer
	'&and	(@T[0],$c)	if ($j>=40)',	# (b^c)&(c^d)
	'&xor	($c,$d)		if ($j>=40)',	# restore $c
	'&$_ror	($b,7)',	# $b>>>2
	'&mov	(@T[1],$a)',	# $b for next round
	'&xor	(@T[0],$c)',
	'&$_rol	($a,5)',
	'&add	($e,@T[0])',
	'&xor	(@T[1],$c)	if ($j==59);'.
	'&xor	(@T[1],$b)	if ($j< 59)',	# b^c for next round
	'&xor	($b,$c)		if ($j< 59)',	# c^d for next round
	'&add	($e,$a);'.'$j++; unshift(@V,pop(@V)); unshift(@T,pop(@T));'
	);
}

$code.=<<___;
.align	16
.Loop_ssse3:
___
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_16_31(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_00_19);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_40_59);
	&Xupdate_ssse3_32_79(\&body_20_39);
	&Xuplast_ssse3_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);
	&Xloop_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_ssse3

.align	16
.Ldone_ssse3:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);
	&Xtail_ssse3(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16(%r14),%xmm6
	movaps	-40-5*16(%r14),%xmm7
	movaps	-40-4*16(%r14),%xmm8
	movaps	-40-3*16(%r14),%xmm9
	movaps	-40-2*16(%r14),%xmm10
	movaps	-40-1*16(%r14),%xmm11
___
$code.=<<___;
	lea	(%r14),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_ssse3:
	ret
.size	sha1_block_data_order_ssse3,.-sha1_block_data_order_ssse3
___

if ($avx) {
$Xi=4;			# reset variables
@X=map("%xmm$_",(4..7,0..3));
@Tx=map("%xmm$_",(8..10));
$j=0;
$rx=0;

my $done_avx_label=".Ldone_avx";

my $_rol=sub { &shld(@_[0],@_) };
my $_ror=sub { &shrd(@_[0],@_) };

$code.=<<___;
.type	sha1_block_data_order_avx,\@function,3
.align	16
sha1_block_data_order_avx:
_avx_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13		# redundant, done to share Win64 SE handler
	push	%r14
	lea	`-64-($win64?6*16:0)`(%rsp),%rsp
	vzeroupper
___
$code.=<<___ if ($win64);
	vmovaps	%xmm6,-40-6*16(%rax)
	vmovaps	%xmm7,-40-5*16(%rax)
	vmovaps	%xmm8,-40-4*16(%rax)
	vmovaps	%xmm9,-40-3*16(%rax)
	vmovaps	%xmm10,-40-2*16(%rax)
	vmovaps	%xmm11,-40-1*16(%rax)
.Lprologue_avx:
___
$code.=<<___;
	mov	%rax,%r14	# original %rsp
	and	\$-64,%rsp
	mov	%rdi,$ctx	# reassigned argument
	mov	%rsi,$inp	# reassigned argument
	mov	%rdx,$num	# reassigned argument

	shl	\$6,$num
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A	# load context
	mov	4($ctx),$B
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	$B,@T[0]	# magic seed
	mov	16($ctx),$E
	mov	$C,@T[1]
	xor	$D,@T[1]
	and	@T[1],@T[0]

	vmovdqa	64($K_XX_XX),@X[2]	# pbswap mask
	vmovdqa	-64($K_XX_XX),$Kx	# K_00_19
	vmovdqu	0($inp),@X[-4&7]	# load input to %xmm[0-3]
	vmovdqu	16($inp),@X[-3&7]
	vmovdqu	32($inp),@X[-2&7]
	vmovdqu	48($inp),@X[-1&7]
	vpshufb	@X[2],@X[-4&7],@X[-4&7]	# byte swap
	add	\$64,$inp
	vpshufb	@X[2],@X[-3&7],@X[-3&7]
	vpshufb	@X[2],@X[-2&7],@X[-2&7]
	vpshufb	@X[2],@X[-1&7],@X[-1&7]
	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqa	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vmovdqa	@X[1],16(%rsp)
	vmovdqa	@X[2],32(%rsp)
	jmp	.Loop_avx
___

sub Xupdate_avx_16_31()		# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 40 instructions
  my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	(@Tx[0],@X[0],31);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqa ($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	eval(shift(@insns));
	eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 to 44 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	eval(shift(@insns));
	eval(shift(@insns))		if (@insns[0] !~ /&ro[rl]/);
	&vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	&vmovdqa ($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	eval(shift(@insns));		# ror
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	&vpsrld	(@Tx[0],@X[0],30);
	&vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# ror
	eval(shift(@insns));
	&vpslld	(@X[0],@X[0],2);
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# ror
	eval(shift(@insns));
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));		# rol
	eval(shift(@insns));

	foreach (@insns) { eval; }	# remaining instructions

  $Xi++;	push(@X,shift(@X));	# "rotate" X[]
}

sub Xuplast_avx_80()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	&vpaddd	(@Tx[1],$Kx,@X[-1&7]);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqa (eval(16*(($Xi-1)&3))."(%rsp)",@Tx[1]);	# X[]+K xfer IALU

	foreach (@insns) { eval; }	# remaining instructions

	&cmp	($inp,$num);
	&je	($done_avx_label);

	&vmovdqa(@X[2],"64($K_XX_XX)");		# pbswap mask
	&vmovdqa($Kx,"-64($K_XX_XX)");		# K_00_19
	&vmovdqu(@X[-4&7],"0($inp)");		# load input
	&vmovdqu(@X[-3&7],"16($inp)");
	&vmovdqu(@X[-2&7],"32($inp)");
	&vmovdqu(@X[-1&7],"48($inp)");
	&vpshufb(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&add	($inp,64);

  $Xi=0;
}

sub Xloop_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	eval(shift(@insns));
	eval(shift(@insns));
	&vpshufb(@X[($Xi-3)&7],@X[($Xi-3)&7],@X[2]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@X[$Xi&7],@X[($Xi-4)&7],$Kx);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqa(eval(16*$Xi)."(%rsp)",@X[$Xi&7]);	# X[]+K xfer to IALU
	eval(shift(@insns));
	eval(shift(@insns));

	foreach (@insns) { eval; }
  $Xi++;
}

sub Xtail_avx()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

$code.=<<___;
.align	16
.Loop_avx:
___
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_16_31(\&body_00_19);
	&Xupdate_avx_32_79(\&body_00_19);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_40_59);
	&Xupdate_avx_32_79(\&body_20_39);
	&Xuplast_avx_80(\&body_20_39);	# can jump to "done"

				$saved_j=$j; @saved_V=@V;

	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);
	&Xloop_avx(\&body_20_39);

$code.=<<___;
	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	add	12($ctx),$D
	mov	$A,0($ctx)
	add	16($ctx),$E
	mov	@T[0],4($ctx)
	mov	@T[0],$B			# magic seed
	mov	$C,8($ctx)
	mov	$C,@T[1]
	mov	$D,12($ctx)
	xor	$D,@T[1]
	mov	$E,16($ctx)
	and	@T[1],@T[0]
	jmp	.Loop_avx

.align	16
$done_avx_label:
___
				$j=$saved_j; @V=@saved_V;

	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);
	&Xtail_avx(\&body_20_39);

$code.=<<___;
	vzeroupper

	add	0($ctx),$A			# update context
	add	4($ctx),@T[0]
	add	8($ctx),$C
	mov	$A,0($ctx)
	add	12($ctx),$D
	mov	@T[0],4($ctx)
	add	16($ctx),$E
	mov	$C,8($ctx)
	mov	$D,12($ctx)
	mov	$E,16($ctx)
___
$code.=<<___ if ($win64);
	movaps	-40-6*16(%r14),%xmm6
	movaps	-40-5*16(%r14),%xmm7
	movaps	-40-4*16(%r14),%xmm8
	movaps	-40-3*16(%r14),%xmm9
	movaps	-40-2*16(%r14),%xmm10
	movaps	-40-1*16(%r14),%xmm11
___
$code.=<<___;
	lea	(%r14),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_avx:
	ret
.size	sha1_block_data_order_avx,.-sha1_block_data_order_avx
___

if ($avx>1) {
use integer;
$Xi=4;			# reset variables
@X=map("%ymm$_",(4..7,0..3));
@Tx=map("%ymm$_",(8..10));
$Kx="%ymm11";
$j=0;

my @ROTX=("%eax","%ebp","%ebx","%ecx","%edx","%esi");
my ($a5,$t0)=("%r12d","%edi");

my ($A,$F,$B,$C,$D,$E)=@ROTX;
my $rx=0;
my $frame="%r13";

$code.=<<___;
.type	sha1_block_data_order_avx2,\@function,3
.align	16
sha1_block_data_order_avx2:
_avx2_shortcut:
	mov	%rsp,%rax
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	vzeroupper
___
$code.=<<___ if ($win64);
	lea	-6*16(%rsp),%rsp
	vmovaps	%xmm6,-40-6*16(%rax)
	vmovaps	%xmm7,-40-5*16(%rax)
	vmovaps	%xmm8,-40-4*16(%rax)
	vmovaps	%xmm9,-40-3*16(%rax)
	vmovaps	%xmm10,-40-2*16(%rax)
	vmovaps	%xmm11,-40-1*16(%rax)
.Lprologue_avx2:
___
$code.=<<___;
	mov	%rax,%r14		# original %rsp
	mov	%rdi,$ctx		# reassigned argument
	mov	%rsi,$inp		# reassigned argument
	mov	%rdx,$num		# reassigned argument

	lea	-640(%rsp),%rsp
	shl	\$6,$num
	lea	64($inp),$frame
	and	\$-128,%rsp
	add	$inp,$num
	lea	K_XX_XX+64(%rip),$K_XX_XX

	mov	0($ctx),$A		# load context
	cmp	$num,$frame
	cmovae	$inp,$frame		# next or same block
	mov	4($ctx),$F
	mov	8($ctx),$C
	mov	12($ctx),$D
	mov	16($ctx),$E
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask

	vmovdqu		($inp),%xmm0
	vmovdqu		16($inp),%xmm1
	vmovdqu		32($inp),%xmm2
	vmovdqu		48($inp),%xmm3
	lea		64($inp),$inp
	vinserti128	\$1,($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vpshufb		@X[2],@X[-4&7],@X[-4&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vpshufb		@X[2],@X[-3&7],@X[-3&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	vpshufb		@X[2],@X[-2&7],@X[-2&7]
	vmovdqu		-64($K_XX_XX),$Kx	# K_00_19
	vpshufb		@X[2],@X[-1&7],@X[-1&7]

	vpaddd	$Kx,@X[-4&7],@X[0]	# add K_00_19
	vpaddd	$Kx,@X[-3&7],@X[1]
	vmovdqu	@X[0],0(%rsp)		# X[]+K xfer to IALU
	vpaddd	$Kx,@X[-2&7],@X[2]
	vmovdqu	@X[1],32(%rsp)
	vpaddd	$Kx,@X[-1&7],@X[3]
	vmovdqu	@X[2],64(%rsp)
	vmovdqu	@X[3],96(%rsp)
___
for (;$Xi<8;$Xi++) {	# Xupdate_avx2_16_31
    use integer;

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	&vpaddd	(@Tx[1],@X[0],$Kx);
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	push(@X,shift(@X));	# "rotate" X[]
}
$code.=<<___;
	lea	128(%rsp),$frame
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
___

sub bodyx_00_19 () {	# 8 instructions, 3 cycles critical path
	# at start $f=(b&c)^(~b&d), $b>>>=2
	return &bodyx_20_39() if ($rx==19); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&andn	($t0,$a,$c)',			# ~b&d for next round

	'&add	($e,$f)',			# e+=(b&c)^(~b&d)
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 for next round
	'&and	($a,$b)',			# b&c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$t0);'.			# f=(b&c)^(~b&d) for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}
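
# A note on the round body above (reasoning only): with BMI, Ch is
# computed as ($b&$c)^(~$b&$d), which equals ($b&$c)|(~$b&$d) because
# the two terms are bitwise disjoint; 'andn' supplies ~$b&$d and
# 'rorx' provides non-destructive rotates for $a<<<5 and $b>>>2,
# which is what keeps the quoted critical path at 3 cycles.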

sub bodyx_20_39 () {	# 7 instructions, 2 cycles critical path
	# on entry $f=b^c^d, $b>>>=2
	return &bodyx_40_59() if ($rx==39); $rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',

	'&lea	($e,"($e,$f)")',		# e+=b^c^d
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)	if ($j<79)',	# b>>>2 in next round
	'&xor	($a,$b)		if ($j<79)',	# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&xor	($a,$c)		if ($j<79);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub bodyx_40_59 () {	# 10 instructions, 3 cycles critical path
	# on entry $f=((b^c)&(c^d)), $b>>>=2
	$rx++;
	(
	'($a,$f,$b,$c,$d,$e)=@ROTX;'.

	'&add	($e,((32*($j/4)+4*($j%4))%256-128)."($frame)");'.	# e+=X[i]+K
	 '&lea	($frame,"256($frame)")	if ($j%32==31);',
	'&xor	($f,$c)		if ($j>39)',	# (b^c)&(c^d)^c
	'&mov	($t0,$b)	if ($j<59)',	# count on zero latency
	'&xor	($t0,$c)	if ($j<59)',	# c^d for next round

	'&lea	($e,"($e,$f)")',		# e+=(b^c)&(c^d)^c
	'&rorx	($a5,$a,27)',			# a<<<5
	'&rorx	($f,$a,2)',			# b>>>2 in next round
	'&xor	($a,$b)',			# b^c for next round

	'&add	($e,$a5)',			# e+=a<<<5
	'&and	($a,$t0)	if ($j< 59);'.	# f=(b^c)&(c^d) for next round
	'&xor	($a,$c)		if ($j==59);'.	# f=b^c^d for next round

	'unshift(@ROTX,pop(@ROTX)); $j++;'
	)
}

sub Xupdate_avx2_16_31()	# recall that $Xi starts with 4
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@X[0],@X[-3&7],@X[-4&7],8);	# compose "X[-14]" in "X[0]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrldq(@Tx[0],@X[-1&7],4);		# "X[-3]", 3 dwords
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"^="X[-16]"
	&vpxor	(@Tx[0],@Tx[0],@X[-2&7]);	# "X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-3]"^"X[-8]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	(@Tx[0],@X[0],31);
	&vmovdqu($Kx,eval(2*16*(($Xi)/5)-64)."($K_XX_XX)")	if ($Xi%5==0);	# K_XX_XX
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpslldq(@Tx[2],@X[0],12);		# "X[0]"<<96, extract one dword
	&vpaddd	(@X[0],@X[0],@X[0]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	(@Tx[1],@Tx[2],30);
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=1
	eval(shift(@insns));
	eval(shift(@insns));
	&vpslld	(@Tx[2],@Tx[2],2);
	&vpxor	(@X[0],@X[0],@Tx[1]);
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@Tx[2]);		# "X[0]"^=("X[0]">>96)<<<2
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@Tx[1],@X[0],$Kx);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqu(eval(32*($Xi))."(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }	# remaining instructions [if any]

  $Xi++;
  push(@X,shift(@X));	# "rotate" X[]
}

sub Xupdate_avx2_32_79()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 35 to 50 instructions
  my ($a,$b,$c,$d,$e);

	&vpalignr(@Tx[0],@X[-1&7],@X[-2&7],8);	# compose "X[-6]"
	&vpxor	(@X[0],@X[0],@X[-4&7]);		# "X[0]"="X[-32]"^"X[-16]"
	eval(shift(@insns));		# body_20_39
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@X[-7&7]);		# "X[0]"^="X[-28]"
	&vmovdqu($Kx,eval(2*16*($Xi/5)-64)."($K_XX_XX)")	if ($Xi%5==0);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpxor	(@X[0],@X[0],@Tx[0]);		# "X[0]"^="X[-6]"
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpsrld	(@Tx[0],@X[0],30);
	&vpslld	(@X[0],@X[0],2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	#&vpslld (@X[0],@X[0],2);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpor	(@X[0],@X[0],@Tx[0]);		# "X[0]"<<<=2
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vpaddd	(@Tx[1],@X[0],$Kx);
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	eval(shift(@insns));
	&vmovdqu("32*$Xi(%rsp)",@Tx[1]);	# X[]+K xfer to IALU

	foreach (@insns) { eval; }	# remaining instructions

  $Xi++;
  push(@X,shift(@X));	# "rotate" X[]
}

sub Xloop_avx2()
{ use integer;
  my $body = shift;
  my @insns = (&$body,&$body,&$body,&$body,&$body);	# 32 instructions
  my ($a,$b,$c,$d,$e);

	foreach (@insns) { eval; }
}

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);
	&Xupdate_avx2_32_79(\&bodyx_00_19);

	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);
	&Xupdate_avx2_32_79(\&bodyx_20_39);

	&align32();
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);
	&Xupdate_avx2_32_79(\&bodyx_40_59);

	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);
	&Xloop_avx2(\&bodyx_20_39);

$code.=<<___;
	lea	128($inp),$frame
	lea	128($inp),%rdi		# borrow $t0
	cmp	$num,$frame
	cmovae	$inp,$frame		# next or previous block

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]	# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A		# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D		# D=b
	#xchg	@ROTX[5],$F		# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F		# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E		# E=c
	mov	$a5,$C			# C=f
	#xchg	$F,$E			# E=c, F=e

	cmp	$num,$inp
	je	.Ldone_avx2
___

$Xi=4;			# reset variables
@X=map("%ymm$_",(4..7,0..3));

$code.=<<___;
	vmovdqu	64($K_XX_XX),@X[2]	# pbswap mask
	cmp	$num,%rdi		# borrowed $t0
	ja	.Last_avx2

	vmovdqu		-64(%rdi),%xmm0	# low part of @X[-4&7]
	vmovdqu		-48(%rdi),%xmm1
	vmovdqu		-32(%rdi),%xmm2
	vmovdqu		-16(%rdi),%xmm3
	vinserti128	\$1,0($frame),@X[-4&7],@X[-4&7]
	vinserti128	\$1,16($frame),@X[-3&7],@X[-3&7]
	vinserti128	\$1,32($frame),@X[-2&7],@X[-2&7]
	vinserti128	\$1,48($frame),@X[-1&7],@X[-1&7]
	jmp	.Last_avx2

.align	32
.Last_avx2:
	lea	128+16(%rsp),$frame
	rorx	\$2,$F,$B
	andn	$D,$F,$t0
	and	$C,$F
	xor	$t0,$F
	sub	\$-128,$inp
___
	$rx=$j=0;	@ROTX=($A,$F,$B,$C,$D,$E);

	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);
	&Xloop_avx2	(\&bodyx_00_19);

	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	($Kx,"-64($K_XX_XX)");		# K_00_19
	  &vpshufb	(@X[-4&7],@X[-4&7],@X[2]);	# byte swap
	&Xloop_avx2	(\&bodyx_20_39);
	  &vpshufb	(@X[-3&7],@X[-3&7],@X[2]);
	  &vpaddd	(@Tx[0],@X[-4&7],$Kx);		# add K_00_19
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("0(%rsp)",@Tx[0]);
	  &vpshufb	(@X[-2&7],@X[-2&7],@X[2]);
	  &vpaddd	(@Tx[1],@X[-3&7],$Kx);
	&Xloop_avx2	(\&bodyx_20_39);
	  &vmovdqu	("32(%rsp)",@Tx[1]);
	  &vpshufb	(@X[-1&7],@X[-1&7],@X[2]);
	  &vpaddd	(@X[2],@X[-2&7],$Kx);

	&Xloop_avx2	(\&bodyx_40_59);
	&align32	();
	  &vmovdqu	("64(%rsp)",@X[2]);
	  &vpaddd	(@X[3],@X[-1&7],$Kx);
	&Xloop_avx2	(\&bodyx_40_59);
	  &vmovdqu	("96(%rsp)",@X[3]);
	&Xloop_avx2	(\&bodyx_40_59);
	&Xupdate_avx2_16_31(\&bodyx_40_59);

	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xupdate_avx2_16_31(\&bodyx_20_39);
	&Xloop_avx2	(\&bodyx_20_39);

$code.=<<___;
	lea	128(%rsp),$frame

	# output is d-e-[a]-f-b-c => A=d,F=e,C=f,D=b,E=c
	add	0($ctx),@ROTX[0]	# update context
	add	4($ctx),@ROTX[1]
	add	8($ctx),@ROTX[3]
	mov	@ROTX[0],0($ctx)
	add	12($ctx),@ROTX[4]
	mov	@ROTX[1],4($ctx)
	mov	@ROTX[0],$A		# A=d
	add	16($ctx),@ROTX[5]
	mov	@ROTX[3],$a5
	mov	@ROTX[3],8($ctx)
	mov	@ROTX[4],$D		# D=b
	#xchg	@ROTX[5],$F		# F=c, C=f
	mov	@ROTX[4],12($ctx)
	mov	@ROTX[1],$F		# F=e
	mov	@ROTX[5],16($ctx)
	#mov	$F,16($ctx)
	mov	@ROTX[5],$E		# E=c
	mov	$a5,$C			# C=f
	#xchg	$F,$E			# E=c, F=e

	cmp	$num,$inp
	jbe	.Loop_avx2

.Ldone_avx2:
	vzeroupper
___
$code.=<<___ if ($win64);
	movaps	-40-6*16(%r14),%xmm6
	movaps	-40-5*16(%r14),%xmm7
	movaps	-40-4*16(%r14),%xmm8
	movaps	-40-3*16(%r14),%xmm9
	movaps	-40-2*16(%r14),%xmm10
	movaps	-40-1*16(%r14),%xmm11
___
$code.=<<___;
	lea	(%r14),%rsi
	mov	-40(%rsi),%r14
	mov	-32(%rsi),%r13
	mov	-24(%rsi),%r12
	mov	-16(%rsi),%rbp
	mov	-8(%rsi),%rbx
	lea	(%rsi),%rsp
.Lepilogue_avx2:
	ret
.size	sha1_block_data_order_avx2,.-sha1_block_data_order_avx2
___
}
}

$code.=<<___;
.align	64
K_XX_XX:
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x5a827999,0x5a827999,0x5a827999,0x5a827999	# K_00_19
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x6ed9eba1,0x6ed9eba1,0x6ed9eba1,0x6ed9eba1	# K_20_39
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc,0x8f1bbcdc	# K_40_59
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0xca62c1d6,0xca62c1d6,0xca62c1d6,0xca62c1d6	# K_60_79
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.long	0x00010203,0x04050607,0x08090a0b,0x0c0d0e0f	# pbswap mask
.byte	0xf,0xe,0xd,0xc,0xb,0xa,0x9,0x8,0x7,0x6,0x5,0x4,0x3,0x2,0x1,0x0
___
}}}
$code.=<<___;
.asciz	"SHA1 block transform for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	64
___

# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	lea	.Lepilogue(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	mov	`16*4`(%rax),%rax	# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler
___

$code.=<<___ if ($shaext);
.type	shaext_handler,\@abi-omnipotent
.align	16
shaext_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	lea	.Lprologue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	lea	.Lepilogue_shaext(%rip),%r10
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	-8-4*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$8,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	jmp	.Lcommon_seh_tail
.size	shaext_handler,.-shaext_handler
___

$code.=<<___;
.type	ssse3_handler,\@abi-omnipotent
.align	16
ssse3_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	232($context),%rax	# pull context->R14

	lea	-40-6*16(%rax),%rsi
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$12,%ecx
	.long	0xa548f3fc		# cld; rep movsq

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	ssse3_handler,.-ssse3_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_sha1_block_data_order
	.rva	.LSEH_end_sha1_block_data_order
	.rva	.LSEH_info_sha1_block_data_order
___
$code.=<<___ if ($shaext);
	.rva	.LSEH_begin_sha1_block_data_order_shaext
	.rva	.LSEH_end_sha1_block_data_order_shaext
	.rva	.LSEH_info_sha1_block_data_order_shaext
___
$code.=<<___;
	.rva	.LSEH_begin_sha1_block_data_order_ssse3
	.rva	.LSEH_end_sha1_block_data_order_ssse3
	.rva	.LSEH_info_sha1_block_data_order_ssse3
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_sha1_block_data_order_avx
	.rva	.LSEH_end_sha1_block_data_order_avx
	.rva	.LSEH_info_sha1_block_data_order_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_sha1_block_data_order_avx2
	.rva	.LSEH_end_sha1_block_data_order_avx2
	.rva	.LSEH_info_sha1_block_data_order_avx2
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_sha1_block_data_order:
	.byte	9,0,0,0
	.rva	se_handler
___
$code.=<<___ if ($shaext);
.LSEH_info_sha1_block_data_order_shaext:
	.byte	9,0,0,0
	.rva	shaext_handler
___
$code.=<<___;
.LSEH_info_sha1_block_data_order_ssse3:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_ssse3,.Lepilogue_ssse3	# HandlerData[]
___
$code.=<<___ if ($avx);
.LSEH_info_sha1_block_data_order_avx:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx,.Lepilogue_avx		# HandlerData[]
___
$code.=<<___ if ($avx>1);
.LSEH_info_sha1_block_data_order_avx2:
	.byte	9,0,0,0
	.rva	ssse3_handler
	.rva	.Lprologue_avx2,.Lepilogue_avx2		# HandlerData[]
___
}

####################################################################

sub sha1rnds4 {
    if (@_[0] =~ /\$([x0-9a-f]+),\s*%xmm([0-7]),\s*%xmm([0-7])/) {
      my @opcode=(0x0f,0x3a,0xcc);
	push @opcode,0xc0|($2&7)|(($3&7)<<3);	# ModR/M
	my $c=$1;
	push @opcode,$c=~/^0/?oct($c):$c;
	return ".byte\t".join(',',@opcode);
    } else {
	return "sha1rnds4\t".@_[0];
    }
}
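
# For instance (derived directly from the code above): the line
# "sha1rnds4 $3,%xmm4,%xmm0" is hand-assembled into
# ".byte 15,58,204,196,3" - opcode 0f,3a,cc, ModR/M 0xc4 encoding the
# xmm4->xmm0 pair, then the immediate - so the module still builds
# with assemblers that predate the SHA extension mnemonics.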

sub sha1op38 {
    my $instr = shift;
    my %opcodelet = (
		"sha1nexte" => 0xc8,
		"sha1msg1"  => 0xc9,
		"sha1msg2"  => 0xca	);

    if (defined($opcodelet{$instr}) && @_[0] =~ /%xmm([0-9]+),\s*%xmm([0-9]+)/) {
      my @opcode=(0x0f,0x38);
      my $rex=0;
	$rex|=0x04	if ($2>=8);
	$rex|=0x01	if ($1>=8);
	unshift @opcode,0x40|$rex if ($rex);
	push @opcode,$opcodelet{$instr};
	push @opcode,0xc0|($1&7)|(($2&7)<<3);	# ModR/M
	return ".byte\t".join(',',@opcode);
    } else {
	return $instr."\t".@_[0];
    }
}

foreach (split("\n",$code)) {
	s/\`([^\`]*)\`/eval $1/geo;

	s/\b(sha1rnds4)\s+(.*)/sha1rnds4($2)/geo	or
	s/\b(sha1[^\s]*)\s+(.*)/sha1op38($1,$2)/geo;

	print $_,"\n";
}
close STDOUT;