chacha/asm/chacha-x86_64.pl: add AVX512 path optimized for shorter inputs.
(Imports upstream's 3c274a6e2016b6724fbfe3ff1487efa2a536ece4.)

Change-Id: I2f0c0abff04decd347d4770e6d1d190f1e08afa0
Reviewed-on: https://boringssl-review.googlesource.com/13781
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
This commit is contained in:
parent
cf9a98cc0c
commit
004bff3a14
@ -96,6 +96,10 @@ $code.=<<___;
|
||||
.Lsigma:
|
||||
.asciz "expand 32-byte k"
|
||||
.align 64
|
||||
.Lzeroz:
|
||||
.long 0,0,0,0, 1,0,0,0, 2,0,0,0, 3,0,0,0
|
||||
.Lfourz:
|
||||
.long 4,0,0,0, 4,0,0,0, 4,0,0,0, 4,0,0,0
|
||||
.Lincz:
|
||||
.long 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15
|
||||
.Lsixteen:
|
||||
@ -225,6 +229,12 @@ ChaCha20_ctr32:
|
||||
cmp \$0,$len
|
||||
je .Lno_data
|
||||
mov OPENSSL_ia32cap_P+4(%rip),%r10
|
||||
___
|
||||
$code.=<<___ if ($avx>2);
|
||||
bt \$48,%r10 # check for AVX512F
|
||||
jc .LChaCha20_avx512
|
||||
___
|
||||
$code.=<<___;
|
||||
test \$`1<<(41-32)`,%r10d
|
||||
jnz .LChaCha20_ssse3
|
||||
|
||||
@ -427,7 +437,7 @@ $code.=<<___;
|
||||
ja .LChaCha20_4x # but overall it won't be slower
|
||||
|
||||
.Ldo_sse3_after_all:
|
||||
push %rbx
|
||||
push %rbx # just to share SEH handler, no pops
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
@ -452,7 +462,7 @@ $code.=<<___;
|
||||
movdqa $b,0x10(%rsp)
|
||||
movdqa $c,0x20(%rsp)
|
||||
movdqa $d,0x30(%rsp)
|
||||
mov \$10,%ebp
|
||||
mov \$10,$counter # reuse $counter
|
||||
jmp .Loop_ssse3
|
||||
|
||||
.align 32
|
||||
@ -462,7 +472,7 @@ $code.=<<___;
|
||||
movdqa 0x10(%rsp),$b
|
||||
movdqa 0x20(%rsp),$c
|
||||
paddd 0x30(%rsp),$d
|
||||
mov \$10,%ebp
|
||||
mov \$10,$counter
|
||||
movdqa $d,0x30(%rsp)
|
||||
jmp .Loop_ssse3
|
||||
|
||||
@ -480,7 +490,7 @@ ___
|
||||
&pshufd ($b,$b,0b10010011);
|
||||
&pshufd ($d,$d,0b00111001);
|
||||
|
||||
&dec ("%ebp");
|
||||
&dec ($counter);
|
||||
&jnz (".Loop_ssse3");
|
||||
|
||||
$code.=<<___;
|
||||
@ -519,14 +529,14 @@ $code.=<<___;
|
||||
movdqa $b,0x10(%rsp)
|
||||
movdqa $c,0x20(%rsp)
|
||||
movdqa $d,0x30(%rsp)
|
||||
xor %rbx,%rbx
|
||||
xor $counter,$counter
|
||||
|
||||
.Loop_tail_ssse3:
|
||||
movzb ($inp,%rbx),%eax
|
||||
movzb (%rsp,%rbx),%ecx
|
||||
lea 1(%rbx),%rbx
|
||||
movzb ($inp,$counter),%eax
|
||||
movzb (%rsp,$counter),%ecx
|
||||
lea 1($counter),$counter
|
||||
xor %ecx,%eax
|
||||
mov %al,-1($out,%rbx)
|
||||
mov %al,-1($out,$counter)
|
||||
dec $len
|
||||
jnz .Loop_tail_ssse3
|
||||
|
||||
@ -537,13 +547,7 @@ $code.=<<___ if ($win64);
|
||||
movaps 64+48(%rsp),%xmm7
|
||||
___
|
||||
$code.=<<___;
|
||||
add \$64+$xframe,%rsp
|
||||
pop %r15
|
||||
pop %r14
|
||||
pop %r13
|
||||
pop %r12
|
||||
pop %rbp
|
||||
pop %rbx
|
||||
add \$64+$xframe+48,%rsp
|
||||
ret
|
||||
.size ChaCha20_ssse3,.-ChaCha20_ssse3
|
||||
___
|
||||
@ -1261,12 +1265,6 @@ $code.=<<___;
|
||||
.align 32
|
||||
ChaCha20_8x:
|
||||
.LChaCha20_8x:
|
||||
___
|
||||
$code.=<<___ if ($avx>2);
|
||||
test \$`1<<16`,%r10d # check for AVX512F
|
||||
jnz .LChaCha20_16x
|
||||
___
|
||||
$code.=<<___;
|
||||
mov %rsp,%r10
|
||||
sub \$0x280+$xframe,%rsp
|
||||
and \$-32,%rsp
|
||||
@ -1758,7 +1756,7 @@ $code.=<<___;
|
||||
jnz .Loop_tail8x
|
||||
|
||||
.Ldone8x:
|
||||
vzeroupper
|
||||
vzeroall
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x290+0x30(%rsp),%r11
|
||||
@ -1783,6 +1781,228 @@ ___
|
||||
########################################################################
|
||||
# AVX512 code paths
|
||||
if ($avx>2) {
|
||||
# This one handles shorter inputs...
|
||||
|
||||
my ($a,$b,$c,$d, $a_,$b_,$c_,$d_,$fourz) = map("%zmm$_",(0..3,16..20));
|
||||
my ($t0,$t1,$t2,$t3) = map("%xmm$_",(4..7));
|
||||
|
||||
# AVX512ROUND: emit one ChaCha round (add / xor / rotate-left, with rotate
# counts 16, 12, 8, 7) over the four state-row registers $a,$b,$c,$d declared
# just above this sub (%zmm0..%zmm3).  Takes no arguments; it works on those
# file-lexical register names directly.  Operands are written destination
# first, immediate last.
# NOTE(review): the &vpaddd/&vpxord/&vprold helpers are not defined in this
# chunk; presumably they append the instruction text to $code - confirm.
sub AVX512ROUND { # critical path is 14 "SIMD ticks" per round
|
||||
&vpaddd ($a,$a,$b); # a += b
|
||||
&vpxord ($d,$d,$a); # d ^= a
|
||||
&vprold ($d,$d,16); # d <<<= 16
|
||||
|
||||
&vpaddd ($c,$c,$d); # c += d
|
||||
&vpxord ($b,$b,$c); # b ^= c
|
||||
&vprold ($b,$b,12); # b <<<= 12
|
||||
|
||||
&vpaddd ($a,$a,$b); # a += b
|
||||
&vpxord ($d,$d,$a); # d ^= a
|
||||
&vprold ($d,$d,8); # d <<<= 8
|
||||
|
||||
&vpaddd ($c,$c,$d); # c += d
|
||||
&vpxord ($b,$b,$c); # b ^= c
|
||||
&vprold ($b,$b,7); # b <<<= 7
|
||||
}
|
||||
|
||||
my $xframe = $win64 ? 32+32+8 : 24;
|
||||
|
||||
$code.=<<___;
|
||||
.type ChaCha20_avx512,\@function,5
|
||||
.align 32
|
||||
ChaCha20_avx512:
|
||||
.LChaCha20_avx512:
|
||||
cmp \$512,$len
|
||||
ja .LChaCha20_16x
|
||||
|
||||
push %rbx # just to share SEH handler, no pops
|
||||
push %rbp
|
||||
push %r12
|
||||
push %r13
|
||||
push %r14
|
||||
push %r15
|
||||
|
||||
sub \$64+$xframe,%rsp
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps %xmm6,64+32(%rsp)
|
||||
movaps %xmm7,64+48(%rsp)
|
||||
___
|
||||
$code.=<<___;
|
||||
vbroadcasti32x4 .Lsigma(%rip),$a
|
||||
vbroadcasti32x4 ($key),$b
|
||||
vbroadcasti32x4 16($key),$c
|
||||
vbroadcasti32x4 ($counter),$d
|
||||
|
||||
vmovdqa32 $a,$a_
|
||||
vmovdqa32 $b,$b_
|
||||
vmovdqa32 $c,$c_
|
||||
vpaddd .Lzeroz(%rip),$d,$d
|
||||
vmovdqa32 .Lfourz(%rip),$fourz
|
||||
mov \$10,$counter # reuse $counter
|
||||
vmovdqa32 $d,$d_
|
||||
jmp .Loop_avx512
|
||||
|
||||
.align 16
|
||||
.Loop_outer_avx512:
|
||||
vmovdqa32 $a_,$a
|
||||
vmovdqa32 $b_,$b
|
||||
vmovdqa32 $c_,$c
|
||||
vpaddd $fourz,$d_,$d
|
||||
mov \$10,$counter
|
||||
vmovdqa32 $d,$d_
|
||||
jmp .Loop_avx512
|
||||
|
||||
.align 32
|
||||
.Loop_avx512:
|
||||
___
|
||||
# Body of .Loop_avx512: one "double round" per iteration.  The first
# AVX512ROUND works on the rows as loaded; the three vpshufd then rotate the
# dwords of rows b, c and d inside every 128-bit lane by 1, 2 and 3 positions,
# so the second AVX512ROUND operates on the diagonals; the last three vpshufd
# undo that rotation.  $counter is loaded with 10 before the loop is entered,
# which gives 10 double rounds.
&AVX512ROUND();
|
||||
&vpshufd ($c,$c,0b01001110); # c: swap dword pairs (rotate by 2)
|
||||
&vpshufd ($b,$b,0b00111001); # b: rotate dwords by 1
|
||||
&vpshufd ($d,$d,0b10010011); # d: rotate dwords by 3
|
||||
|
||||
&AVX512ROUND();
|
||||
&vpshufd ($c,$c,0b01001110); # c: rotate by 2 again, back to row order
|
||||
&vpshufd ($b,$b,0b10010011); # b: inverse of the rotation applied above
|
||||
&vpshufd ($d,$d,0b00111001); # d: inverse of the rotation applied above
|
||||
|
||||
&dec ($counter); # one double round done
|
||||
&jnz (".Loop_avx512"); # repeat until $counter reaches zero
|
||||
|
||||
$code.=<<___;
|
||||
vpaddd $a_,$a,$a
|
||||
vpaddd $b_,$b,$b
|
||||
vpaddd $c_,$c,$c
|
||||
vpaddd $d_,$d,$d
|
||||
|
||||
sub \$64,$len
|
||||
jb .Ltail64_avx512
|
||||
|
||||
vpxor 0x00($inp),%x#$a,$t0 # xor with input
|
||||
vpxor 0x10($inp),%x#$b,$t1
|
||||
vpxor 0x20($inp),%x#$c,$t2
|
||||
vpxor 0x30($inp),%x#$d,$t3
|
||||
lea 0x40($inp),$inp # inp+=64
|
||||
|
||||
vmovdqu $t0,0x00($out) # write output
|
||||
vmovdqu $t1,0x10($out)
|
||||
vmovdqu $t2,0x20($out)
|
||||
vmovdqu $t3,0x30($out)
|
||||
lea 0x40($out),$out # out+=64
|
||||
|
||||
jz .Ldone_avx512
|
||||
|
||||
vextracti32x4 \$1,$a,$t0
|
||||
vextracti32x4 \$1,$b,$t1
|
||||
vextracti32x4 \$1,$c,$t2
|
||||
vextracti32x4 \$1,$d,$t3
|
||||
|
||||
sub \$64,$len
|
||||
jb .Ltail_avx512
|
||||
|
||||
vpxor 0x00($inp),$t0,$t0 # xor with input
|
||||
vpxor 0x10($inp),$t1,$t1
|
||||
vpxor 0x20($inp),$t2,$t2
|
||||
vpxor 0x30($inp),$t3,$t3
|
||||
lea 0x40($inp),$inp # inp+=64
|
||||
|
||||
vmovdqu $t0,0x00($out) # write output
|
||||
vmovdqu $t1,0x10($out)
|
||||
vmovdqu $t2,0x20($out)
|
||||
vmovdqu $t3,0x30($out)
|
||||
lea 0x40($out),$out # out+=64
|
||||
|
||||
jz .Ldone_avx512
|
||||
|
||||
vextracti32x4 \$2,$a,$t0
|
||||
vextracti32x4 \$2,$b,$t1
|
||||
vextracti32x4 \$2,$c,$t2
|
||||
vextracti32x4 \$2,$d,$t3
|
||||
|
||||
sub \$64,$len
|
||||
jb .Ltail_avx512
|
||||
|
||||
vpxor 0x00($inp),$t0,$t0 # xor with input
|
||||
vpxor 0x10($inp),$t1,$t1
|
||||
vpxor 0x20($inp),$t2,$t2
|
||||
vpxor 0x30($inp),$t3,$t3
|
||||
lea 0x40($inp),$inp # inp+=64
|
||||
|
||||
vmovdqu $t0,0x00($out) # write output
|
||||
vmovdqu $t1,0x10($out)
|
||||
vmovdqu $t2,0x20($out)
|
||||
vmovdqu $t3,0x30($out)
|
||||
lea 0x40($out),$out # out+=64
|
||||
|
||||
jz .Ldone_avx512
|
||||
|
||||
vextracti32x4 \$3,$a,$t0
|
||||
vextracti32x4 \$3,$b,$t1
|
||||
vextracti32x4 \$3,$c,$t2
|
||||
vextracti32x4 \$3,$d,$t3
|
||||
|
||||
sub \$64,$len
|
||||
jb .Ltail_avx512
|
||||
|
||||
vpxor 0x00($inp),$t0,$t0 # xor with input
|
||||
vpxor 0x10($inp),$t1,$t1
|
||||
vpxor 0x20($inp),$t2,$t2
|
||||
vpxor 0x30($inp),$t3,$t3
|
||||
lea 0x40($inp),$inp # inp+=64
|
||||
|
||||
vmovdqu $t0,0x00($out) # write output
|
||||
vmovdqu $t1,0x10($out)
|
||||
vmovdqu $t2,0x20($out)
|
||||
vmovdqu $t3,0x30($out)
|
||||
lea 0x40($out),$out # out+=64
|
||||
|
||||
jnz .Loop_outer_avx512
|
||||
|
||||
jmp .Ldone_avx512
|
||||
|
||||
.align 16
|
||||
.Ltail64_avx512:
|
||||
vmovdqa %x#$a,0x00(%rsp)
|
||||
vmovdqa %x#$b,0x10(%rsp)
|
||||
vmovdqa %x#$c,0x20(%rsp)
|
||||
vmovdqa %x#$d,0x30(%rsp)
|
||||
add \$64,$len
|
||||
jmp .Loop_tail_avx512
|
||||
|
||||
.align 16
|
||||
.Ltail_avx512:
|
||||
vmovdqa $t0,0x00(%rsp)
|
||||
vmovdqa $t1,0x10(%rsp)
|
||||
vmovdqa $t2,0x20(%rsp)
|
||||
vmovdqa $t3,0x30(%rsp)
|
||||
add \$64,$len
|
||||
|
||||
.Loop_tail_avx512: # byte-wise xor of the final partial block; $counter is the byte index and is zero here because the round loop counts it down to zero
|
||||
movzb ($inp,$counter),%eax # next input byte
|
||||
movzb (%rsp,$counter),%ecx # matching key-stream byte saved at the bottom of the frame
|
||||
lea 1($counter),$counter # index += 1
|
||||
xor %ecx,%eax
|
||||
mov %al,-1($out,$counter) # store at the pre-increment index
|
||||
dec $len
|
||||
jnz .Loop_tail_avx512
|
||||
|
||||
vmovdqu32 $a_,0x00(%rsp) # overwrite the 64 stack bytes that held key stream with the saved sigma row; unaligned store, because this function never aligns %rsp to 64 bytes and the aligned 512-bit form would fault
|
||||
|
||||
.Ldone_avx512:
|
||||
vzeroall
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
movaps 64+32(%rsp),%xmm6
|
||||
movaps 64+48(%rsp),%xmm7
|
||||
___
|
||||
$code.=<<___;
|
||||
add \$64+$xframe+48,%rsp
|
||||
ret
|
||||
.size ChaCha20_avx512,.-ChaCha20_avx512
|
||||
___
|
||||
}
|
||||
if ($avx>2) {
|
||||
# This one handles longer inputs...
|
||||
|
||||
my ($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
||||
$xc0,$xc1,$xc2,$xc3, $xd0,$xd1,$xd2,$xd3)=map("%zmm$_",(0..15));
|
||||
my @xx=($xa0,$xa1,$xa2,$xa3, $xb0,$xb1,$xb2,$xb3,
|
||||
@ -2257,8 +2477,11 @@ $code.=<<___;
|
||||
dec $len
|
||||
jnz .Loop_tail16x
|
||||
|
||||
vpxord $xa0,$xa0,$xa0
|
||||
vmovdqa32 $xa0,0(%rsp)
|
||||
|
||||
.Ldone16x:
|
||||
vzeroupper
|
||||
vzeroall
|
||||
___
|
||||
$code.=<<___ if ($win64);
|
||||
lea 0x290+0x30(%rsp),%r11
|
||||
@ -2281,9 +2504,9 @@ ___
|
||||
}
|
||||
|
||||
foreach (split("\n",$code)) {
|
||||
s/\`([^\`]*)\`/eval $1/geo;
|
||||
s/\`([^\`]*)\`/eval $1/ge;
|
||||
|
||||
s/%x#%y/%x/go;
|
||||
s/%x#%[yz]/%x/g; # "down-shift"
|
||||
|
||||
print $_,"\n";
|
||||
}
|
||||
|
Loading…
Reference in New Issue
Block a user