diff --git a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl index 9d53ec47..49278506 100755 --- a/crypto/fipsmodule/sha/asm/sha512-x86_64.pl +++ b/crypto/fipsmodule/sha/asm/sha512-x86_64.pl @@ -108,6 +108,8 @@ # part, body_00_15; reducing the amount of SIMD instructions # below certain limit makes no difference/sense; to conserve # space SHA256 XOP code path is therefore omitted; +# +# Modified from upstream OpenSSL to remove the XOP code. $flavour = shift; $output = shift; @@ -275,9 +277,7 @@ $code.=<<___ if ($SZ==4 && $shaext); test \$`1<<29`,%r11d # check for SHA jnz _shaext_shortcut ___ -$code.=<<___ if ($avx && $SZ==8); - test \$`1<<11`,%r10d # check for XOP - jnz .Lxop_shortcut + # XOP codepath removed. ___ $code.=<<___ if ($avx>1); and \$`1<<8|1<<5|1<<3`,%r11d # check for BMI2+AVX2+BMI1 @@ -1127,399 +1127,6 @@ ___ if ($avx) {{ ###################################################################### -# XOP code path -# -if ($SZ==8) { # SHA512 only -$code.=<<___; -.type ${func}_xop,\@function,3 -.align 64 -${func}_xop: -.cfi_startproc -.Lxop_shortcut: - mov %rsp,%rax # copy %rsp -.cfi_def_cfa_register %rax - push %rbx -.cfi_push %rbx - push %rbp -.cfi_push %rbp - push %r12 -.cfi_push %r12 - push %r13 -.cfi_push %r13 - push %r14 -.cfi_push %r14 - push %r15 -.cfi_push %r15 - shl \$4,%rdx # num*16 - sub \$`$framesz+$win64*16*($SZ==4?4:6)`,%rsp - lea ($inp,%rdx,$SZ),%rdx # inp+num*16*$SZ - and \$-64,%rsp # align stack frame - mov $ctx,$_ctx # save ctx, 1st arg - mov $inp,$_inp # save inp, 2nd arh - mov %rdx,$_end # save end pointer, "3rd" arg - mov %rax,$_rsp # save copy of %rsp -.cfi_cfa_expression $_rsp,deref,+8 -___ -$code.=<<___ if ($win64); - movaps %xmm6,16*$SZ+32(%rsp) - movaps %xmm7,16*$SZ+48(%rsp) - movaps %xmm8,16*$SZ+64(%rsp) - movaps %xmm9,16*$SZ+80(%rsp) -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps %xmm10,16*$SZ+96(%rsp) - movaps %xmm11,16*$SZ+112(%rsp) -___ -$code.=<<___; -.Lprologue_xop: - - vzeroupper - mov $SZ*0($ctx),$A - mov $SZ*1($ctx),$B - mov $SZ*2($ctx),$C - mov $SZ*3($ctx),$D - mov $SZ*4($ctx),$E - mov $SZ*5($ctx),$F - mov $SZ*6($ctx),$G - mov $SZ*7($ctx),$H - jmp .Lloop_xop -___ - if ($SZ==4) { # SHA256 - my @X = map("%xmm$_",(0..3)); - my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7)); - -$code.=<<___; -.align 16 -.Lloop_xop: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[0],@X[0] - lea $TABLE(%rip),$Tbl - vpshufb $t3,@X[1],@X[1] - vpshufb $t3,@X[2],@X[2] - vpaddd 0x00($Tbl),@X[0],$t0 - vpshufb $t3,@X[3],@X[3] - vpaddd 0x20($Tbl),@X[1],$t1 - vpaddd 0x40($Tbl),@X[2],$t2 - vpaddd 0x60($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - mov $A,$a1 - vmovdqa $t1,0x10(%rsp) - mov $B,$a3 - vmovdqa $t2,0x20(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x30(%rsp) - mov $E,$a0 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - sub \$`-16*2*$SZ`,$Tbl # size optimization -___ -sub XOP_256_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body,&$body,&$body); # 104 instructions - - &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..4] - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr ($t3,@X[3],@X[2],$SZ); # X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t0,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t0,$t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[0..3] += X[9..12] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t2,$t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t3,@X[3],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t2); # sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t2,@X[3],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t0); # X[0..3] += sigma0(X[1..4]) - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrldq ($t3,$t3,8); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t3,@X[0],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrld ($t2,@X[0],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotd ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpslldq ($t3,$t3,8); # 22 instructions - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd (@X[0],@X[0],$t3); # X[2..3] += sigma1(X[16..17]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddd ($t2,@X[0],16*2*$j."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<4; $j++) { - &XOP_256_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ."($Tbl)",0); - &jne (".Lxop_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } - - } else { # SHA512 - my @X = map("%xmm$_",(0..7)); - my ($t0,$t1,$t2,$t3) = map("%xmm$_",(8..11)); - -$code.=<<___; -.align 16 -.Lloop_xop: - vmovdqa $TABLE+`$SZ*2*$rounds`(%rip),$t3 - vmovdqu 0x00($inp),@X[0] - lea $TABLE+0x80(%rip),$Tbl # size optimization - vmovdqu 0x10($inp),@X[1] - vmovdqu 0x20($inp),@X[2] - vpshufb $t3,@X[0],@X[0] - vmovdqu 0x30($inp),@X[3] - vpshufb $t3,@X[1],@X[1] - vmovdqu 0x40($inp),@X[4] - vpshufb $t3,@X[2],@X[2] - vmovdqu 0x50($inp),@X[5] - vpshufb $t3,@X[3],@X[3] - vmovdqu 0x60($inp),@X[6] - vpshufb $t3,@X[4],@X[4] - vmovdqu 0x70($inp),@X[7] - vpshufb $t3,@X[5],@X[5] - vpaddq -0x80($Tbl),@X[0],$t0 - vpshufb $t3,@X[6],@X[6] - vpaddq -0x60($Tbl),@X[1],$t1 - vpshufb $t3,@X[7],@X[7] - vpaddq -0x40($Tbl),@X[2],$t2 - vpaddq -0x20($Tbl),@X[3],$t3 - vmovdqa $t0,0x00(%rsp) - vpaddq 0x00($Tbl),@X[4],$t0 - vmovdqa $t1,0x10(%rsp) - vpaddq 0x20($Tbl),@X[5],$t1 - vmovdqa $t2,0x20(%rsp) - vpaddq 0x40($Tbl),@X[6],$t2 - vmovdqa $t3,0x30(%rsp) - vpaddq 0x60($Tbl),@X[7],$t3 - vmovdqa $t0,0x40(%rsp) - mov $A,$a1 - vmovdqa $t1,0x50(%rsp) - mov $B,$a3 - vmovdqa $t2,0x60(%rsp) - xor $C,$a3 # magic - vmovdqa $t3,0x70(%rsp) - mov $E,$a0 - jmp .Lxop_00_47 - -.align 16 -.Lxop_00_47: - add \$`16*2*$SZ`,$Tbl -___ -sub XOP_512_00_47 () { -my $j = shift; -my $body = shift; -my @X = @_; -my @insns = (&$body,&$body); # 52 instructions - - &vpalignr ($t0,@X[1],@X[0],$SZ); # X[1..2] - eval(shift(@insns)); - eval(shift(@insns)); - &vpalignr ($t3,@X[5],@X[4],$SZ); # X[9..10] - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t1,$t0,8*$SZ-$sigma0[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrlq ($t0,$t0,$sigma0[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t3); # X[0..1] += X[9..10] - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t2,$t1,$sigma0[1]-$sigma0[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t1); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t3,@X[7],8*$SZ-$sigma1[1]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t0,$t0,$t2); # sigma0(X[1..2]) - eval(shift(@insns)); - eval(shift(@insns)); - &vpsrlq ($t2,@X[7],$sigma1[2]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t0); # X[0..1] += sigma0(X[1..2]) - eval(shift(@insns)); - eval(shift(@insns)); - &vprotq ($t1,$t3,$sigma1[1]-$sigma1[0]); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t2); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpxor ($t3,$t3,$t1); # sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq (@X[0],@X[0],$t3); # X[0..1] += sigma1(X[14..15]) - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - eval(shift(@insns)); - &vpaddq ($t2,@X[0],16*2*$j-0x80."($Tbl)"); - foreach (@insns) { eval; } # remaining instructions - &vmovdqa (16*$j."(%rsp)",$t2); -} - - for ($i=0,$j=0; $j<8; $j++) { - &XOP_512_00_47($j,\&body_00_15,@X); - push(@X,shift(@X)); # rotate(@X) - } - &cmpb ($SZ-1+16*2*$SZ-0x80."($Tbl)",0); - &jne (".Lxop_00_47"); - - for ($i=0; $i<16; ) { - foreach(body_00_15()) { eval; } - } -} -$code.=<<___; - mov $_ctx,$ctx - mov $a1,$A - - add $SZ*0($ctx),$A - lea 16*$SZ($inp),$inp - add $SZ*1($ctx),$B - add $SZ*2($ctx),$C - add $SZ*3($ctx),$D - add $SZ*4($ctx),$E - add $SZ*5($ctx),$F - add $SZ*6($ctx),$G - add $SZ*7($ctx),$H - - cmp $_end,$inp - - mov $A,$SZ*0($ctx) - mov $B,$SZ*1($ctx) - mov $C,$SZ*2($ctx) - mov $D,$SZ*3($ctx) - mov $E,$SZ*4($ctx) - mov $F,$SZ*5($ctx) - mov $G,$SZ*6($ctx) - mov $H,$SZ*7($ctx) - jb .Lloop_xop - - mov $_rsp,%rsi -.cfi_def_cfa %rsi,8 - vzeroupper -___ -$code.=<<___ if ($win64); - movaps 16*$SZ+32(%rsp),%xmm6 - movaps 16*$SZ+48(%rsp),%xmm7 - movaps 16*$SZ+64(%rsp),%xmm8 - movaps 16*$SZ+80(%rsp),%xmm9 -___ -$code.=<<___ if ($win64 && $SZ>4); - movaps 16*$SZ+96(%rsp),%xmm10 - movaps 16*$SZ+112(%rsp),%xmm11 -___ -$code.=<<___; - mov -48(%rsi),%r15 -.cfi_restore %r15 - mov -40(%rsi),%r14 -.cfi_restore %r14 - mov -32(%rsi),%r13 -.cfi_restore %r13 - mov -24(%rsi),%r12 -.cfi_restore %r12 - mov -16(%rsi),%rbp -.cfi_restore %rbp - mov -8(%rsi),%rbx -.cfi_restore %rbx - lea (%rsi),%rsp -.cfi_def_cfa_register %rsp -.Lepilogue_xop: - ret -.cfi_endproc -.size ${func}_xop,.-${func}_xop -___ -} -###################################################################### # AVX+shrd code path # local *ror = sub { &shrd(@_[0],@_) }; @@ -2409,11 +2016,6 @@ $code.=<<___ if ($SZ==4); .rva .LSEH_end_${func}_ssse3 .rva .LSEH_info_${func}_ssse3 ___ -$code.=<<___ if ($avx && $SZ==8); - .rva .LSEH_begin_${func}_xop - .rva .LSEH_end_${func}_xop - .rva .LSEH_info_${func}_xop -___ $code.=<<___ if ($avx); .rva .LSEH_begin_${func}_avx .rva .LSEH_end_${func}_avx @@ -2443,12 +2045,6 @@ $code.=<<___ if ($SZ==4); .rva se_handler .rva .Lprologue_ssse3,.Lepilogue_ssse3 # HandlerData[] ___ -$code.=<<___ if ($avx && $SZ==8); -.LSEH_info_${func}_xop: - .byte 9,0,0,0 - .rva se_handler - .rva .Lprologue_xop,.Lepilogue_xop # HandlerData[] -___ $code.=<<___ if ($avx); .LSEH_info_${func}_avx: .byte 9,0,0,0