fdd8e9c8c7
Depending on architecture, perlasm differed on which one or both of: perl foo.pl flavor output.S perl foo.pl flavor > output.S Upstream has now unified on the first form after making a number of changes to their files (the second does not even work for their x86 files anymore). Sync those portions of our perlasm scripts with upstream and update CMakeLists.txt and generate_build_files.py per the new convention. This imports various commits like this one: 184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a diff, so I don't have the full list) Confirmed that generate_build_files.py sees no change. BUG=14 Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2 Reviewed-on: https://boringssl-review.googlesource.com/8518 Reviewed-by: Adam Langley <agl@google.com>
2236 lines
51 KiB
Perl
Executable File
2236 lines
51 KiB
Perl
Executable File
#!/usr/bin/env perl
|
|
#
|
|
# ====================================================================
|
|
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
|
# project. The module is, however, dual licensed under OpenSSL and
|
|
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
|
# details see http://www.openssl.org/~appro/cryptogams/.
|
|
# ====================================================================
|
|
#
|
|
# This module implements Poly1305 hash for x86_64.
|
|
#
|
|
# March 2015
|
|
#
|
|
# Numbers are cycles per processed byte with poly1305_blocks alone,
|
|
# measured with rdtsc at fixed clock frequency.
|
|
#
|
|
# IALU/gcc-4.8(*) AVX(**) AVX2
|
|
# P4 4.90/+120% -
|
|
# Core 2 2.39/+90% -
|
|
# Westmere 1.86/+120% -
|
|
# Sandy Bridge 1.39/+140% 1.10
|
|
# Haswell 1.10/+175% 1.11 0.65
|
|
# Skylake 1.12/+120% 0.96 0.51
|
|
# Silvermont 2.83/+95% -
|
|
# VIA Nano 1.82/+150% -
|
|
# Sledgehammer 1.38/+160% -
|
|
# Bulldozer 2.21/+130% 0.97
|
|
#
|
|
# (*) improvement coefficients relative to clang are more modest and
|
|
# are ~50% on most processors, in both cases we are comparing to
|
|
# __int128 code;
|
|
# (**) SSE2 implementation was attempted, but among non-AVX processors
|
|
# it was faster than integer-only code only on older Intel P4 and
|
|
# Core processors (by 50-30%, the gain shrinking the newer the processor), but slower on
|
|
# contemporary ones, for example almost 2x slower on Atom, and as
|
|
# former are naturally disappearing, SSE2 is deemed unnecessary;
|
|
|
|
# Command line: [flavour] output.  A first argument containing a dot is
# taken to be the output file name, i.e. no flavour was given.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# $win64 is set when the flavour names nasm/masm/mingw64 or when the output
# file ends in .asm.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate the x86_64-xlate.pl translator: first next to this script, then
# in ../../perlasm relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Code emission below is conditional on this value: non-zero enables the
# AVX code, values above 1 additionally reference the AVX2 code.
$avx = 2;

# STDOUT is aliased to a pipe into the translator, which receives the
# flavour and the output file name as arguments.  A failed spawn must be
# fatal, otherwise the script would run to completion and produce nothing.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# Register assignments used by the integer-only code.
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len);	# *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
|
|
|
|
sub poly1305_iteration {
# Appends to $code the instruction sequence for one multiply-and-reduce
# step on the base 2^64 representation of the hash.
#
# input:	copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output:	$h0-$h2 *= $r0-$r1
#
# $s1 is expected to hold r1 + (r1 >> 2), as set up by the callers.
# %rax, %rdx and $d1-$d3 are used as scratch.
#
# The last step folds everything above bit 130 back into the low limbs:
# %rax becomes (d3 & ~3) + (d3 >> 2), i.e. 5 * (d3 >> 2), and is added to
# $h0.  The resulting carry has to ripple through $h1 *and* into $h2;
# dropping the carry out of $h1 yields a wrong hash value whenever $h1 is
# all-ones and the addition to $h0 overflows.
$code.=<<___;
	mulq	$h0			# h0*r1
	mov	%rax,$d2
	mov	$r0,%rax
	mov	%rdx,$d3

	mulq	$h0			# h0*r0
	mov	%rax,$h0		# future $h0
	mov	$r0,%rax
	mov	%rdx,$d1

	mulq	$h1			# h1*r0
	add	%rax,$d2
	mov	$s1,%rax
	adc	%rdx,$d3

	mulq	$h1			# h1*s1
	mov	$h2,$h1			# borrow $h1
	add	%rax,$h0
	adc	%rdx,$d1

	imulq	$s1,$h1			# h2*s1
	add	$h1,$d2
	mov	$d1,$h1
	adc	\$0,$d3

	imulq	$r0,$h2			# h2*r0
	add	$d2,$h1
	mov	\$-4,%rax		# mask value
	adc	$h2,$d3

	and	$d3,%rax		# last reduction step
	mov	$d3,$h2
	shr	\$2,$d3
	and	\$3,$h2
	add	$d3,%rax
	add	%rax,$h0
	adc	\$0,$h1
	adc	\$0,$h2			# do not drop the carry out of h1
___
}
|
|
|
|
########################################################################
# Layout of opaque area is following.
#
#	unsigned __int64 h[3];		# current hash value base 2^64
#	unsigned __int64 r[2];		# key value base 2^64

# poly1305_init: zeroes the three hash words at 0/8/16($ctx).  When the key
# pointer ($inp) is NULL it branches to .Lno_key right away; otherwise it
# picks the blocks/emit entry points and stores the clamped key.
$code.=<<___;
.text

.extern	OPENSSL_ia32cap_P

.globl	poly1305_init
.globl	poly1305_blocks
.globl	poly1305_emit
.type	poly1305_init,\@function,3
.align	32
poly1305_init:
	xor	%rax,%rax
	mov	%rax,0($ctx)		# initialize hash value
	mov	%rax,8($ctx)
	mov	%rax,16($ctx)

	cmp	\$0,$inp
	je	.Lno_key

	lea	poly1305_blocks(%rip),%r10
	lea	poly1305_emit(%rip),%r11
___
# Run-time dispatch: %r9 receives 64 bits starting at OPENSSL_ia32cap_P+4,
# so bit 60-32 tests bit 28 of the second capability word and bit 5+32
# tests bit 5 of the third one.
$code.=<<___	if ($avx);
	mov	OPENSSL_ia32cap_P+4(%rip),%r9
	lea	poly1305_blocks_avx(%rip),%rax
	lea	poly1305_emit_avx(%rip),%rcx
	bt	\$`60-32`,%r9		# AVX?
	cmovc	%rax,%r10
	cmovc	%rcx,%r11
___
$code.=<<___	if ($avx>1);
	lea	poly1305_blocks_avx2(%rip),%rax
	bt	\$`5+32`,%r9		# AVX2?
	cmovc	%rax,%r10
___
# Mask the two key words and store them at 24/32($ctx), i.e. r[0] and r[1]
# of the layout above.
$code.=<<___;
	mov	\$0x0ffffffc0fffffff,%rax
	mov	\$0x0ffffffc0ffffffc,%rcx
	and	0($inp),%rax
	and	8($inp),%rcx
	mov	%rax,24($ctx)
	mov	%rcx,32($ctx)
___
# Hand the selected entry points back through the third argument (%rdx):
# two 64-bit stores normally, two 32-bit stores for elf32 flavours.
$code.=<<___	if ($flavour !~ /elf32/);
	mov	%r10,0(%rdx)
	mov	%r11,8(%rdx)
___
$code.=<<___	if ($flavour =~ /elf32/);
	mov	%r10d,0(%rdx)
	mov	%r11d,4(%rdx)
___
|
|
# Tail of poly1305_init followed by the prologue of poly1305_blocks.
# poly1305_init sets %eax to 1 before falling into .Lno_key; the NULL-key
# path reaches .Lno_key with %rax still zero from the xor at function entry.
#
# poly1305_blocks returns immediately when fewer than 16 bytes are given.
# Otherwise it saves the callee-saved registers, loads r and the hash from
# the context, derives s1 = r1 + (r1 >> 2) and enters the per-block loop
# with a copy of r1 in %rax, as poly1305_iteration expects.
$code.=<<___;
	mov	\$1,%eax
.Lno_key:
	ret
.size	poly1305_init,.-poly1305_init

.type	poly1305_blocks,\@function,4
.align	32
poly1305_blocks:
.Lblocks:
	sub	\$16,$len		# too short?
	jc	.Lno_data

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lblocks_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2

	mov	$s1,$r1
	shr	\$2,$s1
	mov	$r1,%rax
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)
	jmp	.Loop

.align	32
.Loop:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
___
	&poly1305_iteration();
# Loop tail and epilogue of poly1305_blocks (registers are reloaded from
# the stack in reverse push order), then poly1305_emit, which adds 5 to the
# hash to decide whether the reduced value must be used, adds the nonce and
# writes the 16-byte result.
$code.=<<___;
	mov	$r1,%rax
	sub	\$16,%r15		# len-=16
	jnc	.Loop

	mov	$h0,0($ctx)		# store hash value
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%r12
	mov	32(%rsp),%rbp
	mov	40(%rsp),%rbx
	lea	48(%rsp),%rsp
.Lno_data:
.Lblocks_epilogue:
	ret
.size	poly1305_blocks,.-poly1305_blocks

.type	poly1305_emit,\@function,3
.align	32
poly1305_emit:
.Lemit:
	mov	0($ctx),%r8	# load hash value
	mov	8($ctx),%r9
	mov	16($ctx),%r10

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
.size	poly1305_emit,.-poly1305_emit
___
|
|
if ($avx) {
|
|
|
|
########################################################################
|
|
# Layout of opaque area is following.
|
|
#
|
|
# unsigned __int32 h[5]; # current hash value base 2^26
|
|
# unsigned __int32 is_base2_26;
|
|
# unsigned __int64 r[2]; # key value base 2^64
|
|
# unsigned __int64 pad;
|
|
# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
|
|
#
|
|
# where r^n are base 2^26 digits of degrees of multiplier key. There are
|
|
# 5 digits, but last four are interleaved with multiples of 5, totalling
|
|
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
|
|
|
|
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
|
|
map("%xmm$_",(0..15));
|
|
|
|
$code.=<<___;
|
|
.type __poly1305_block,\@abi-omnipotent
|
|
.align 32
|
|
__poly1305_block:
|
|
___
|
|
&poly1305_iteration();
|
|
$code.=<<___;
|
|
ret
|
|
.size __poly1305_block,.-__poly1305_block
|
|
|
|
.type __poly1305_init_avx,\@abi-omnipotent
|
|
.align 32
|
|
__poly1305_init_avx:
|
|
mov $r0,$h0
|
|
mov $r1,$h1
|
|
xor $h2,$h2
|
|
|
|
lea 48+64($ctx),$ctx # size optimization
|
|
|
|
mov $r1,%rax
|
|
call __poly1305_block # r^2
|
|
|
|
mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
|
|
mov \$0x3ffffff,%edx
|
|
mov $h0,$d1
|
|
and $h0#d,%eax
|
|
mov $r0,$d2
|
|
and $r0#d,%edx
|
|
mov %eax,`16*0+0-64`($ctx)
|
|
shr \$26,$d1
|
|
mov %edx,`16*0+4-64`($ctx)
|
|
shr \$26,$d2
|
|
|
|
mov \$0x3ffffff,%eax
|
|
mov \$0x3ffffff,%edx
|
|
and $d1#d,%eax
|
|
and $d2#d,%edx
|
|
mov %eax,`16*1+0-64`($ctx)
|
|
lea (%rax,%rax,4),%eax # *5
|
|
mov %edx,`16*1+4-64`($ctx)
|
|
lea (%rdx,%rdx,4),%edx # *5
|
|
mov %eax,`16*2+0-64`($ctx)
|
|
shr \$26,$d1
|
|
mov %edx,`16*2+4-64`($ctx)
|
|
shr \$26,$d2
|
|
|
|
mov $h1,%rax
|
|
mov $r1,%rdx
|
|
shl \$12,%rax
|
|
shl \$12,%rdx
|
|
or $d1,%rax
|
|
or $d2,%rdx
|
|
and \$0x3ffffff,%eax
|
|
and \$0x3ffffff,%edx
|
|
mov %eax,`16*3+0-64`($ctx)
|
|
lea (%rax,%rax,4),%eax # *5
|
|
mov %edx,`16*3+4-64`($ctx)
|
|
lea (%rdx,%rdx,4),%edx # *5
|
|
mov %eax,`16*4+0-64`($ctx)
|
|
mov $h1,$d1
|
|
mov %edx,`16*4+4-64`($ctx)
|
|
mov $r1,$d2
|
|
|
|
mov \$0x3ffffff,%eax
|
|
mov \$0x3ffffff,%edx
|
|
shr \$14,$d1
|
|
shr \$14,$d2
|
|
and $d1#d,%eax
|
|
and $d2#d,%edx
|
|
mov %eax,`16*5+0-64`($ctx)
|
|
lea (%rax,%rax,4),%eax # *5
|
|
mov %edx,`16*5+4-64`($ctx)
|
|
lea (%rdx,%rdx,4),%edx # *5
|
|
mov %eax,`16*6+0-64`($ctx)
|
|
shr \$26,$d1
|
|
mov %edx,`16*6+4-64`($ctx)
|
|
shr \$26,$d2
|
|
|
|
mov $h2,%rax
|
|
shl \$24,%rax
|
|
or %rax,$d1
|
|
mov $d1#d,`16*7+0-64`($ctx)
|
|
lea ($d1,$d1,4),$d1 # *5
|
|
mov $d2#d,`16*7+4-64`($ctx)
|
|
lea ($d2,$d2,4),$d2 # *5
|
|
mov $d1#d,`16*8+0-64`($ctx)
|
|
mov $d2#d,`16*8+4-64`($ctx)
|
|
|
|
mov $r1,%rax
|
|
call __poly1305_block # r^3
|
|
|
|
mov \$0x3ffffff,%eax # save r^3 base 2^26
|
|
mov $h0,$d1
|
|
and $h0#d,%eax
|
|
shr \$26,$d1
|
|
mov %eax,`16*0+12-64`($ctx)
|
|
|
|
mov \$0x3ffffff,%edx
|
|
and $d1#d,%edx
|
|
mov %edx,`16*1+12-64`($ctx)
|
|
lea (%rdx,%rdx,4),%edx # *5
|
|
shr \$26,$d1
|
|
mov %edx,`16*2+12-64`($ctx)
|
|
|
|
mov $h1,%rax
|
|
shl \$12,%rax
|
|
or $d1,%rax
|
|
and \$0x3ffffff,%eax
|
|
mov %eax,`16*3+12-64`($ctx)
|
|
lea (%rax,%rax,4),%eax # *5
|
|
mov $h1,$d1
|
|
mov %eax,`16*4+12-64`($ctx)
|
|
|
|
mov \$0x3ffffff,%edx
|
|
shr \$14,$d1
|
|
and $d1#d,%edx
|
|
mov %edx,`16*5+12-64`($ctx)
|
|
lea (%rdx,%rdx,4),%edx # *5
|
|
shr \$26,$d1
|
|
mov %edx,`16*6+12-64`($ctx)
|
|
|
|
mov $h2,%rax
|
|
shl \$24,%rax
|
|
or %rax,$d1
|
|
mov $d1#d,`16*7+12-64`($ctx)
|
|
lea ($d1,$d1,4),$d1 # *5
|
|
mov $d1#d,`16*8+12-64`($ctx)
|
|
|
|
mov $r1,%rax
|
|
call __poly1305_block # r^4
|
|
|
|
mov \$0x3ffffff,%eax # save r^4 base 2^26
|
|
mov $h0,$d1
|
|
and $h0#d,%eax
|
|
shr \$26,$d1
|
|
mov %eax,`16*0+8-64`($ctx)
|
|
|
|
mov \$0x3ffffff,%edx
|
|
and $d1#d,%edx
|
|
mov %edx,`16*1+8-64`($ctx)
|
|
lea (%rdx,%rdx,4),%edx # *5
|
|
shr \$26,$d1
|
|
mov %edx,`16*2+8-64`($ctx)
|
|
|
|
mov $h1,%rax
|
|
shl \$12,%rax
|
|
or $d1,%rax
|
|
and \$0x3ffffff,%eax
|
|
mov %eax,`16*3+8-64`($ctx)
|
|
lea (%rax,%rax,4),%eax # *5
|
|
mov $h1,$d1
|
|
mov %eax,`16*4+8-64`($ctx)
|
|
|
|
mov \$0x3ffffff,%edx
|
|
shr \$14,$d1
|
|
and $d1#d,%edx
|
|
mov %edx,`16*5+8-64`($ctx)
|
|
lea (%rdx,%rdx,4),%edx # *5
|
|
shr \$26,$d1
|
|
mov %edx,`16*6+8-64`($ctx)
|
|
|
|
mov $h2,%rax
|
|
shl \$24,%rax
|
|
or %rax,$d1
|
|
mov $d1#d,`16*7+8-64`($ctx)
|
|
lea ($d1,$d1,4),$d1 # *5
|
|
mov $d1#d,`16*8+8-64`($ctx)
|
|
|
|
lea -48-64($ctx),$ctx # size [de-]optimization
|
|
ret
|
|
.size __poly1305_init_avx,.-__poly1305_init_avx
|
|
|
|
.type poly1305_blocks_avx,\@function,4
|
|
.align 32
|
|
poly1305_blocks_avx:
|
|
mov 20($ctx),%r8d # is_base2_26
|
|
cmp \$128,$len
|
|
jae .Lblocks_avx
|
|
test %r8d,%r8d
|
|
jz .Lblocks
|
|
|
|
.Lblocks_avx:
|
|
and \$-16,$len
|
|
jz .Lno_data_avx
|
|
|
|
vzeroupper
|
|
|
|
test %r8d,%r8d
|
|
jz .Lbase2_64_avx
|
|
|
|
test \$31,$len
|
|
jz .Leven_avx
|
|
|
|
push %rbx
|
|
push %rbp
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
.Lblocks_avx_body:
|
|
|
|
mov $len,%r15 # reassign $len
|
|
|
|
mov 0($ctx),$d1 # load hash value
|
|
mov 8($ctx),$d2
|
|
mov 16($ctx),$h2#d
|
|
|
|
mov 24($ctx),$r0 # load r
|
|
mov 32($ctx),$s1
|
|
|
|
################################# base 2^26 -> base 2^64
|
|
mov $d1#d,$h0#d
|
|
and \$-1<<31,$d1
|
|
mov $d2,$r1 # borrow $r1
|
|
mov $d2#d,$h1#d
|
|
and \$-1<<31,$d2
|
|
|
|
shr \$6,$d1
|
|
shl \$52,$r1
|
|
add $d1,$h0
|
|
shr \$12,$h1
|
|
shr \$18,$d2
|
|
add $r1,$h0
|
|
adc $d2,$h1
|
|
|
|
mov $h2,$d1
|
|
shl \$40,$d1
|
|
shr \$24,$h2
|
|
add $d1,$h1
|
|
adc \$0,$h2 # can be partially reduced...
|
|
|
|
mov \$-4,$d2 # ... so reduce
|
|
mov $h2,$d1
|
|
and $h2,$d2
|
|
shr \$2,$d1
|
|
and \$3,$h2
|
|
add $d2,$d1 # =*5
|
|
add $d1,$h0
|
|
adc \$0,$h1
|
|
|
|
mov $s1,$r1
|
|
mov $s1,%rax
|
|
shr \$2,$s1
|
|
add $r1,$s1 # s1 = r1 + (r1 >> 2)
|
|
|
|
add 0($inp),$h0 # accumulate input
|
|
adc 8($inp),$h1
|
|
lea 16($inp),$inp
|
|
adc $padbit,$h2
|
|
|
|
call __poly1305_block
|
|
|
|
test $padbit,$padbit # if $padbit is zero,
|
|
jz .Lstore_base2_64_avx # store hash in base 2^64 format
|
|
|
|
################################# base 2^64 -> base 2^26
|
|
mov $h0,%rax
|
|
mov $h0,%rdx
|
|
shr \$52,$h0
|
|
mov $h1,$r0
|
|
mov $h1,$r1
|
|
shr \$26,%rdx
|
|
and \$0x3ffffff,%rax # h[0]
|
|
shl \$12,$r0
|
|
and \$0x3ffffff,%rdx # h[1]
|
|
shr \$14,$h1
|
|
or $r0,$h0
|
|
shl \$24,$h2
|
|
and \$0x3ffffff,$h0 # h[2]
|
|
shr \$40,$r1
|
|
and \$0x3ffffff,$h1 # h[3]
|
|
or $r1,$h2 # h[4]
|
|
|
|
sub \$16,%r15
|
|
jz .Lstore_base2_26_avx
|
|
|
|
vmovd %rax#d,$H0
|
|
vmovd %rdx#d,$H1
|
|
vmovd $h0#d,$H2
|
|
vmovd $h1#d,$H3
|
|
vmovd $h2#d,$H4
|
|
jmp .Lproceed_avx
|
|
|
|
.align 32
|
|
.Lstore_base2_64_avx:
|
|
mov $h0,0($ctx)
|
|
mov $h1,8($ctx)
|
|
mov $h2,16($ctx) # note that is_base2_26 is zeroed
|
|
jmp .Ldone_avx
|
|
|
|
.align 16
|
|
.Lstore_base2_26_avx:
|
|
mov %rax#d,0($ctx) # store hash value base 2^26
|
|
mov %rdx#d,4($ctx)
|
|
mov $h0#d,8($ctx)
|
|
mov $h1#d,12($ctx)
|
|
mov $h2#d,16($ctx)
|
|
.align 16
|
|
.Ldone_avx:
|
|
mov 0(%rsp),%r15
|
|
mov 8(%rsp),%r14
|
|
mov 16(%rsp),%r13
|
|
mov 24(%rsp),%r12
|
|
mov 32(%rsp),%rbp
|
|
mov 40(%rsp),%rbx
|
|
lea 48(%rsp),%rsp
|
|
.Lno_data_avx:
|
|
.Lblocks_avx_epilogue:
|
|
ret
|
|
|
|
.align 32
|
|
.Lbase2_64_avx:
|
|
push %rbx
|
|
push %rbp
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
.Lbase2_64_avx_body:
|
|
|
|
mov $len,%r15 # reassign $len
|
|
|
|
mov 24($ctx),$r0 # load r
|
|
mov 32($ctx),$s1
|
|
|
|
mov 0($ctx),$h0 # load hash value
|
|
mov 8($ctx),$h1
|
|
mov 16($ctx),$h2#d
|
|
|
|
mov $s1,$r1
|
|
mov $s1,%rax
|
|
shr \$2,$s1
|
|
add $r1,$s1 # s1 = r1 + (r1 >> 2)
|
|
|
|
test \$31,$len
|
|
jz .Linit_avx
|
|
|
|
add 0($inp),$h0 # accumulate input
|
|
adc 8($inp),$h1
|
|
lea 16($inp),$inp
|
|
adc $padbit,$h2
|
|
sub \$16,%r15
|
|
|
|
call __poly1305_block
|
|
|
|
.Linit_avx:
|
|
################################# base 2^64 -> base 2^26
|
|
mov $h0,%rax
|
|
mov $h0,%rdx
|
|
shr \$52,$h0
|
|
mov $h1,$d1
|
|
mov $h1,$d2
|
|
shr \$26,%rdx
|
|
and \$0x3ffffff,%rax # h[0]
|
|
shl \$12,$d1
|
|
and \$0x3ffffff,%rdx # h[1]
|
|
shr \$14,$h1
|
|
or $d1,$h0
|
|
shl \$24,$h2
|
|
and \$0x3ffffff,$h0 # h[2]
|
|
shr \$40,$d2
|
|
and \$0x3ffffff,$h1 # h[3]
|
|
or $d2,$h2 # h[4]
|
|
|
|
vmovd %rax#d,$H0
|
|
vmovd %rdx#d,$H1
|
|
vmovd $h0#d,$H2
|
|
vmovd $h1#d,$H3
|
|
vmovd $h2#d,$H4
|
|
movl \$1,20($ctx) # set is_base2_26
|
|
|
|
call __poly1305_init_avx
|
|
|
|
.Lproceed_avx:
|
|
mov %r15,$len
|
|
|
|
mov 0(%rsp),%r15
|
|
mov 8(%rsp),%r14
|
|
mov 16(%rsp),%r13
|
|
mov 24(%rsp),%r12
|
|
mov 32(%rsp),%rbp
|
|
mov 40(%rsp),%rbx
|
|
lea 48(%rsp),%rax
|
|
lea 48(%rsp),%rsp
|
|
.Lbase2_64_avx_epilogue:
|
|
jmp .Ldo_avx
|
|
|
|
.align 32
|
|
.Leven_avx:
|
|
vmovd 4*0($ctx),$H0 # load hash value
|
|
vmovd 4*1($ctx),$H1
|
|
vmovd 4*2($ctx),$H2
|
|
vmovd 4*3($ctx),$H3
|
|
vmovd 4*4($ctx),$H4
|
|
|
|
.Ldo_avx:
|
|
___
|
|
$code.=<<___ if (!$win64);
|
|
lea -0x58(%rsp),%r11
|
|
sub \$0x178,%rsp
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
lea -0xf8(%rsp),%r11
|
|
sub \$0x218,%rsp
|
|
vmovdqa %xmm6,0x50(%r11)
|
|
vmovdqa %xmm7,0x60(%r11)
|
|
vmovdqa %xmm8,0x70(%r11)
|
|
vmovdqa %xmm9,0x80(%r11)
|
|
vmovdqa %xmm10,0x90(%r11)
|
|
vmovdqa %xmm11,0xa0(%r11)
|
|
vmovdqa %xmm12,0xb0(%r11)
|
|
vmovdqa %xmm13,0xc0(%r11)
|
|
vmovdqa %xmm14,0xd0(%r11)
|
|
vmovdqa %xmm15,0xe0(%r11)
|
|
.Ldo_avx_body:
|
|
___
|
|
$code.=<<___;
|
|
sub \$64,$len
|
|
lea -32($inp),%rax
|
|
cmovc %rax,$inp
|
|
|
|
vmovdqu `16*3`($ctx),$D4 # preload r0^2
|
|
lea `16*3+64`($ctx),$ctx # size optimization
|
|
lea .Lconst(%rip),%rcx
|
|
|
|
################################################################
|
|
# load input
|
|
vmovdqu 16*2($inp),$T0
|
|
vmovdqu 16*3($inp),$T1
|
|
vmovdqa 64(%rcx),$MASK # .Lmask26
|
|
|
|
vpsrldq \$6,$T0,$T2 # splat input
|
|
vpsrldq \$6,$T1,$T3
|
|
vpunpckhqdq $T1,$T0,$T4 # 4
|
|
vpunpcklqdq $T1,$T0,$T0 # 0:1
|
|
vpunpcklqdq $T3,$T2,$T3 # 2:3
|
|
|
|
vpsrlq \$40,$T4,$T4 # 4
|
|
vpsrlq \$26,$T0,$T1
|
|
vpand $MASK,$T0,$T0 # 0
|
|
vpsrlq \$4,$T3,$T2
|
|
vpand $MASK,$T1,$T1 # 1
|
|
vpsrlq \$30,$T3,$T3
|
|
vpand $MASK,$T2,$T2 # 2
|
|
vpand $MASK,$T3,$T3 # 3
|
|
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
|
|
|
|
jbe .Lskip_loop_avx
|
|
|
|
# expand and copy pre-calculated table to stack
|
|
vmovdqu `16*1-64`($ctx),$D1
|
|
vmovdqu `16*2-64`($ctx),$D2
|
|
vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
|
|
vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
|
|
vmovdqa $D3,-0x90(%r11)
|
|
vmovdqa $D0,0x00(%rsp)
|
|
vpshufd \$0xEE,$D1,$D4
|
|
vmovdqu `16*3-64`($ctx),$D0
|
|
vpshufd \$0x44,$D1,$D1
|
|
vmovdqa $D4,-0x80(%r11)
|
|
vmovdqa $D1,0x10(%rsp)
|
|
vpshufd \$0xEE,$D2,$D3
|
|
vmovdqu `16*4-64`($ctx),$D1
|
|
vpshufd \$0x44,$D2,$D2
|
|
vmovdqa $D3,-0x70(%r11)
|
|
vmovdqa $D2,0x20(%rsp)
|
|
vpshufd \$0xEE,$D0,$D4
|
|
vmovdqu `16*5-64`($ctx),$D2
|
|
vpshufd \$0x44,$D0,$D0
|
|
vmovdqa $D4,-0x60(%r11)
|
|
vmovdqa $D0,0x30(%rsp)
|
|
vpshufd \$0xEE,$D1,$D3
|
|
vmovdqu `16*6-64`($ctx),$D0
|
|
vpshufd \$0x44,$D1,$D1
|
|
vmovdqa $D3,-0x50(%r11)
|
|
vmovdqa $D1,0x40(%rsp)
|
|
vpshufd \$0xEE,$D2,$D4
|
|
vmovdqu `16*7-64`($ctx),$D1
|
|
vpshufd \$0x44,$D2,$D2
|
|
vmovdqa $D4,-0x40(%r11)
|
|
vmovdqa $D2,0x50(%rsp)
|
|
vpshufd \$0xEE,$D0,$D3
|
|
vmovdqu `16*8-64`($ctx),$D2
|
|
vpshufd \$0x44,$D0,$D0
|
|
vmovdqa $D3,-0x30(%r11)
|
|
vmovdqa $D0,0x60(%rsp)
|
|
vpshufd \$0xEE,$D1,$D4
|
|
vpshufd \$0x44,$D1,$D1
|
|
vmovdqa $D4,-0x20(%r11)
|
|
vmovdqa $D1,0x70(%rsp)
|
|
vpshufd \$0xEE,$D2,$D3
|
|
vmovdqa 0x00(%rsp),$D4 # preload r0^2
|
|
vpshufd \$0x44,$D2,$D2
|
|
vmovdqa $D3,-0x10(%r11)
|
|
vmovdqa $D2,0x80(%rsp)
|
|
|
|
jmp .Loop_avx
|
|
|
|
.align 32
|
|
.Loop_avx:
|
|
################################################################
|
|
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
|
|
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
|
|
# \___________________/
|
|
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
|
|
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
|
|
# \___________________/ \____________________/
|
|
#
|
|
# Note that we start with inp[2:3]*r^2. This is because it
|
|
# doesn't depend on reduction in previous iteration.
|
|
################################################################
|
|
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
|
|
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
|
|
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
|
|
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
|
|
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
|
|
#
|
|
# though note that $Tx and $Hx are "reversed" in this section,
|
|
# and $D4 is preloaded with r0^2...
|
|
|
|
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
|
|
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
|
|
vmovdqa $H2,0x20(%r11) # offload hash
|
|
vpmuludq $T2,$D4,$D2 # d3 = h2*r0
|
|
vmovdqa 0x10(%rsp),$H2 # r1^2
|
|
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
|
|
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
|
|
|
|
vmovdqa $H0,0x00(%r11) #
|
|
vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
|
|
vmovdqa $H1,0x10(%r11) #
|
|
vpmuludq $T3,$H2,$H1 # h3*r1
|
|
vpaddq $H0,$D0,$D0 # d0 += h4*s1
|
|
vpaddq $H1,$D4,$D4 # d4 += h3*r1
|
|
vmovdqa $H3,0x30(%r11) #
|
|
vpmuludq $T2,$H2,$H0 # h2*r1
|
|
vpmuludq $T1,$H2,$H1 # h1*r1
|
|
vpaddq $H0,$D3,$D3 # d3 += h2*r1
|
|
vmovdqa 0x30(%rsp),$H3 # r2^2
|
|
vpaddq $H1,$D2,$D2 # d2 += h1*r1
|
|
vmovdqa $H4,0x40(%r11) #
|
|
vpmuludq $T0,$H2,$H2 # h0*r1
|
|
vpmuludq $T2,$H3,$H0 # h2*r2
|
|
vpaddq $H2,$D1,$D1 # d1 += h0*r1
|
|
|
|
vmovdqa 0x40(%rsp),$H4 # s2^2
|
|
vpaddq $H0,$D4,$D4 # d4 += h2*r2
|
|
vpmuludq $T1,$H3,$H1 # h1*r2
|
|
vpmuludq $T0,$H3,$H3 # h0*r2
|
|
vpaddq $H1,$D3,$D3 # d3 += h1*r2
|
|
vmovdqa 0x50(%rsp),$H2 # r3^2
|
|
vpaddq $H3,$D2,$D2 # d2 += h0*r2
|
|
vpmuludq $T4,$H4,$H0 # h4*s2
|
|
vpmuludq $T3,$H4,$H4 # h3*s2
|
|
vpaddq $H0,$D1,$D1 # d1 += h4*s2
|
|
vmovdqa 0x60(%rsp),$H3 # s3^2
|
|
vpaddq $H4,$D0,$D0 # d0 += h3*s2
|
|
|
|
vmovdqa 0x80(%rsp),$H4 # s4^2
|
|
vpmuludq $T1,$H2,$H1 # h1*r3
|
|
vpmuludq $T0,$H2,$H2 # h0*r3
|
|
vpaddq $H1,$D4,$D4 # d4 += h1*r3
|
|
vpaddq $H2,$D3,$D3 # d3 += h0*r3
|
|
vpmuludq $T4,$H3,$H0 # h4*s3
|
|
vpmuludq $T3,$H3,$H1 # h3*s3
|
|
vpaddq $H0,$D2,$D2 # d2 += h4*s3
|
|
vmovdqu 16*0($inp),$H0 # load input
|
|
vpaddq $H1,$D1,$D1 # d1 += h3*s3
|
|
vpmuludq $T2,$H3,$H3 # h2*s3
|
|
vpmuludq $T2,$H4,$T2 # h2*s4
|
|
vpaddq $H3,$D0,$D0 # d0 += h2*s3
|
|
|
|
vmovdqu 16*1($inp),$H1 #
|
|
vpaddq $T2,$D1,$D1 # d1 += h2*s4
|
|
vpmuludq $T3,$H4,$T3 # h3*s4
|
|
vpmuludq $T4,$H4,$T4 # h4*s4
|
|
vpsrldq \$6,$H0,$H2 # splat input
|
|
vpaddq $T3,$D2,$D2 # d2 += h3*s4
|
|
vpaddq $T4,$D3,$D3 # d3 += h4*s4
|
|
vpsrldq \$6,$H1,$H3 #
|
|
vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
|
|
vpmuludq $T1,$H4,$T0 # h1*s4
|
|
vpunpckhqdq $H1,$H0,$H4 # 4
|
|
vpaddq $T4,$D4,$D4 # d4 += h0*r4
|
|
vmovdqa -0x90(%r11),$T4 # r0^4
|
|
vpaddq $T0,$D0,$D0 # d0 += h1*s4
|
|
|
|
vpunpcklqdq $H1,$H0,$H0 # 0:1
|
|
vpunpcklqdq $H3,$H2,$H3 # 2:3
|
|
|
|
#vpsrlq \$40,$H4,$H4 # 4
|
|
vpsrldq \$`40/8`,$H4,$H4 # 4
|
|
vpsrlq \$26,$H0,$H1
|
|
vpand $MASK,$H0,$H0 # 0
|
|
vpsrlq \$4,$H3,$H2
|
|
vpand $MASK,$H1,$H1 # 1
|
|
vpand 0(%rcx),$H4,$H4 # .Lmask24
|
|
vpsrlq \$30,$H3,$H3
|
|
vpand $MASK,$H2,$H2 # 2
|
|
vpand $MASK,$H3,$H3 # 3
|
|
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
|
|
|
|
vpaddq 0x00(%r11),$H0,$H0 # add hash value
|
|
vpaddq 0x10(%r11),$H1,$H1
|
|
vpaddq 0x20(%r11),$H2,$H2
|
|
vpaddq 0x30(%r11),$H3,$H3
|
|
vpaddq 0x40(%r11),$H4,$H4
|
|
|
|
lea 16*2($inp),%rax
|
|
lea 16*4($inp),$inp
|
|
sub \$64,$len
|
|
cmovc %rax,$inp
|
|
|
|
################################################################
|
|
# Now we accumulate (inp[0:1]+hash)*r^4
|
|
################################################################
|
|
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
|
|
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
|
|
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
|
|
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
|
|
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
|
|
|
|
vpmuludq $H0,$T4,$T0 # h0*r0
|
|
vpmuludq $H1,$T4,$T1 # h1*r0
|
|
vpaddq $T0,$D0,$D0
|
|
vpaddq $T1,$D1,$D1
|
|
vmovdqa -0x80(%r11),$T2 # r1^4
|
|
vpmuludq $H2,$T4,$T0 # h2*r0
|
|
vpmuludq $H3,$T4,$T1 # h3*r0
|
|
vpaddq $T0,$D2,$D2
|
|
vpaddq $T1,$D3,$D3
|
|
vpmuludq $H4,$T4,$T4 # h4*r0
|
|
vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
|
|
vpaddq $T4,$D4,$D4
|
|
|
|
vpaddq $T0,$D0,$D0 # d0 += h4*s1
|
|
vpmuludq $H2,$T2,$T1 # h2*r1
|
|
vpmuludq $H3,$T2,$T0 # h3*r1
|
|
vpaddq $T1,$D3,$D3 # d3 += h2*r1
|
|
vmovdqa -0x60(%r11),$T3 # r2^4
|
|
vpaddq $T0,$D4,$D4 # d4 += h3*r1
|
|
vpmuludq $H1,$T2,$T1 # h1*r1
|
|
vpmuludq $H0,$T2,$T2 # h0*r1
|
|
vpaddq $T1,$D2,$D2 # d2 += h1*r1
|
|
vpaddq $T2,$D1,$D1 # d1 += h0*r1
|
|
|
|
vmovdqa -0x50(%r11),$T4 # s2^4
|
|
vpmuludq $H2,$T3,$T0 # h2*r2
|
|
vpmuludq $H1,$T3,$T1 # h1*r2
|
|
vpaddq $T0,$D4,$D4 # d4 += h2*r2
|
|
vpaddq $T1,$D3,$D3 # d3 += h1*r2
|
|
vmovdqa -0x40(%r11),$T2 # r3^4
|
|
vpmuludq $H0,$T3,$T3 # h0*r2
|
|
vpmuludq $H4,$T4,$T0 # h4*s2
|
|
vpaddq $T3,$D2,$D2 # d2 += h0*r2
|
|
vpaddq $T0,$D1,$D1 # d1 += h4*s2
|
|
vmovdqa -0x30(%r11),$T3 # s3^4
|
|
vpmuludq $H3,$T4,$T4 # h3*s2
|
|
vpmuludq $H1,$T2,$T1 # h1*r3
|
|
vpaddq $T4,$D0,$D0 # d0 += h3*s2
|
|
|
|
vmovdqa -0x10(%r11),$T4 # s4^4
|
|
vpaddq $T1,$D4,$D4 # d4 += h1*r3
|
|
vpmuludq $H0,$T2,$T2 # h0*r3
|
|
vpmuludq $H4,$T3,$T0 # h4*s3
|
|
vpaddq $T2,$D3,$D3 # d3 += h0*r3
|
|
vpaddq $T0,$D2,$D2 # d2 += h4*s3
|
|
vmovdqu 16*2($inp),$T0 # load input
|
|
vpmuludq $H3,$T3,$T2 # h3*s3
|
|
vpmuludq $H2,$T3,$T3 # h2*s3
|
|
vpaddq $T2,$D1,$D1 # d1 += h3*s3
|
|
vmovdqu 16*3($inp),$T1 #
|
|
vpaddq $T3,$D0,$D0 # d0 += h2*s3
|
|
|
|
vpmuludq $H2,$T4,$H2 # h2*s4
|
|
vpmuludq $H3,$T4,$H3 # h3*s4
|
|
vpsrldq \$6,$T0,$T2 # splat input
|
|
vpaddq $H2,$D1,$D1 # d1 += h2*s4
|
|
vpmuludq $H4,$T4,$H4 # h4*s4
|
|
vpsrldq \$6,$T1,$T3 #
|
|
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
|
|
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
|
|
vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
|
|
vpmuludq $H1,$T4,$H0
|
|
vpunpckhqdq $T1,$T0,$T4 # 4
|
|
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
|
|
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
|
|
|
|
vpunpcklqdq $T1,$T0,$T0 # 0:1
|
|
vpunpcklqdq $T3,$T2,$T3 # 2:3
|
|
|
|
#vpsrlq \$40,$T4,$T4 # 4
|
|
vpsrldq \$`40/8`,$T4,$T4 # 4
|
|
vpsrlq \$26,$T0,$T1
|
|
vmovdqa 0x00(%rsp),$D4 # preload r0^2
|
|
vpand $MASK,$T0,$T0 # 0
|
|
vpsrlq \$4,$T3,$T2
|
|
vpand $MASK,$T1,$T1 # 1
|
|
vpand 0(%rcx),$T4,$T4 # .Lmask24
|
|
vpsrlq \$30,$T3,$T3
|
|
vpand $MASK,$T2,$T2 # 2
|
|
vpand $MASK,$T3,$T3 # 3
|
|
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
|
|
|
|
################################################################
|
|
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
|
|
# and P. Schwabe
|
|
|
|
vpsrlq \$26,$H3,$D3
|
|
vpand $MASK,$H3,$H3
|
|
vpaddq $D3,$H4,$H4 # h3 -> h4
|
|
|
|
vpsrlq \$26,$H0,$D0
|
|
vpand $MASK,$H0,$H0
|
|
vpaddq $D0,$D1,$H1 # h0 -> h1
|
|
|
|
vpsrlq \$26,$H4,$D0
|
|
vpand $MASK,$H4,$H4
|
|
|
|
vpsrlq \$26,$H1,$D1
|
|
vpand $MASK,$H1,$H1
|
|
vpaddq $D1,$H2,$H2 # h1 -> h2
|
|
|
|
vpaddq $D0,$H0,$H0
|
|
vpsllq \$2,$D0,$D0
|
|
vpaddq $D0,$H0,$H0 # h4 -> h0
|
|
|
|
vpsrlq \$26,$H2,$D2
|
|
vpand $MASK,$H2,$H2
|
|
vpaddq $D2,$H3,$H3 # h2 -> h3
|
|
|
|
vpsrlq \$26,$H0,$D0
|
|
vpand $MASK,$H0,$H0
|
|
vpaddq $D0,$H1,$H1 # h0 -> h1
|
|
|
|
vpsrlq \$26,$H3,$D3
|
|
vpand $MASK,$H3,$H3
|
|
vpaddq $D3,$H4,$H4 # h3 -> h4
|
|
|
|
ja .Loop_avx
|
|
|
|
.Lskip_loop_avx:
|
|
################################################################
|
|
# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
|
|
|
|
vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
|
|
add \$32,$len
|
|
jnz .Long_tail_avx
|
|
|
|
vpaddq $H2,$T2,$T2
|
|
vpaddq $H0,$T0,$T0
|
|
vpaddq $H1,$T1,$T1
|
|
vpaddq $H3,$T3,$T3
|
|
vpaddq $H4,$T4,$T4
|
|
|
|
.Long_tail_avx:
|
|
vmovdqa $H2,0x20(%r11)
|
|
vmovdqa $H0,0x00(%r11)
|
|
vmovdqa $H1,0x10(%r11)
|
|
vmovdqa $H3,0x30(%r11)
|
|
vmovdqa $H4,0x40(%r11)
|
|
|
|
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
|
|
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
|
|
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
|
|
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
|
|
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
|
|
|
|
vpmuludq $T2,$D4,$D2 # d2 = h2*r0
|
|
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
|
|
vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
|
|
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
|
|
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
|
|
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
|
|
|
|
vpmuludq $T3,$H2,$H0 # h3*r1
|
|
vpaddq $H0,$D4,$D4 # d4 += h3*r1
|
|
vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
|
|
vpmuludq $T2,$H2,$H1 # h2*r1
|
|
vpaddq $H1,$D3,$D3 # d3 += h2*r1
|
|
vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
|
|
vpmuludq $T1,$H2,$H0 # h1*r1
|
|
vpaddq $H0,$D2,$D2 # d2 += h1*r1
|
|
vpmuludq $T0,$H2,$H2 # h0*r1
|
|
vpaddq $H2,$D1,$D1 # d1 += h0*r1
|
|
vpmuludq $T4,$H3,$H3 # h4*s1
|
|
vpaddq $H3,$D0,$D0 # d0 += h4*s1
|
|
|
|
vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
|
|
vpmuludq $T2,$H4,$H1 # h2*r2
|
|
vpaddq $H1,$D4,$D4 # d4 += h2*r2
|
|
vpmuludq $T1,$H4,$H0 # h1*r2
|
|
vpaddq $H0,$D3,$D3 # d3 += h1*r2
|
|
vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
|
|
vpmuludq $T0,$H4,$H4 # h0*r2
|
|
vpaddq $H4,$D2,$D2 # d2 += h0*r2
|
|
vpmuludq $T4,$H2,$H1 # h4*s2
|
|
vpaddq $H1,$D1,$D1 # d1 += h4*s2
|
|
vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
|
|
vpmuludq $T3,$H2,$H2 # h3*s2
|
|
vpaddq $H2,$D0,$D0 # d0 += h3*s2
|
|
|
|
vpmuludq $T1,$H3,$H0 # h1*r3
|
|
vpaddq $H0,$D4,$D4 # d4 += h1*r3
|
|
vpmuludq $T0,$H3,$H3 # h0*r3
|
|
vpaddq $H3,$D3,$D3 # d3 += h0*r3
|
|
vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
|
|
vpmuludq $T4,$H4,$H1 # h4*s3
|
|
vpaddq $H1,$D2,$D2 # d2 += h4*s3
|
|
vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
|
|
vpmuludq $T3,$H4,$H0 # h3*s3
|
|
vpaddq $H0,$D1,$D1 # d1 += h3*s3
|
|
vpmuludq $T2,$H4,$H4 # h2*s3
|
|
vpaddq $H4,$D0,$D0 # d0 += h2*s3
|
|
|
|
vpmuludq $T0,$H2,$H2 # h0*r4
|
|
vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
|
|
vpmuludq $T4,$H3,$H1 # h4*s4
|
|
vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
|
|
vpmuludq $T3,$H3,$H0 # h3*s4
|
|
vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
|
|
vpmuludq $T2,$H3,$H1 # h2*s4
|
|
vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
|
|
vpmuludq $T1,$H3,$H3 # h1*s4
|
|
vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
|
|
|
|
jz .Lshort_tail_avx
|
|
|
|
vmovdqu 16*0($inp),$H0 # load input
|
|
vmovdqu 16*1($inp),$H1
|
|
|
|
vpsrldq \$6,$H0,$H2 # splat input
|
|
vpsrldq \$6,$H1,$H3
|
|
vpunpckhqdq $H1,$H0,$H4 # 4
|
|
vpunpcklqdq $H1,$H0,$H0 # 0:1
|
|
vpunpcklqdq $H3,$H2,$H3 # 2:3
|
|
|
|
vpsrlq \$40,$H4,$H4 # 4
|
|
vpsrlq \$26,$H0,$H1
|
|
vpand $MASK,$H0,$H0 # 0
|
|
vpsrlq \$4,$H3,$H2
|
|
vpand $MASK,$H1,$H1 # 1
|
|
vpsrlq \$30,$H3,$H3
|
|
vpand $MASK,$H2,$H2 # 2
|
|
vpand $MASK,$H3,$H3 # 3
|
|
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
|
|
|
|
vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
|
|
vpaddq 0x00(%r11),$H0,$H0
|
|
vpaddq 0x10(%r11),$H1,$H1
|
|
vpaddq 0x20(%r11),$H2,$H2
|
|
vpaddq 0x30(%r11),$H3,$H3
|
|
vpaddq 0x40(%r11),$H4,$H4
|
|
|
|
################################################################
|
|
# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
|
|
|
|
vpmuludq $H0,$T4,$T0 # h0*r0
|
|
vpaddq $T0,$D0,$D0 # d0 += h0*r0
|
|
vpmuludq $H1,$T4,$T1 # h1*r0
|
|
vpaddq $T1,$D1,$D1 # d1 += h1*r0
|
|
vpmuludq $H2,$T4,$T0 # h2*r0
|
|
vpaddq $T0,$D2,$D2 # d2 += h2*r0
|
|
vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
|
|
vpmuludq $H3,$T4,$T1 # h3*r0
|
|
vpaddq $T1,$D3,$D3 # d3 += h3*r0
|
|
vpmuludq $H4,$T4,$T4 # h4*r0
|
|
vpaddq $T4,$D4,$D4 # d4 += h4*r0
|
|
|
|
vpmuludq $H3,$T2,$T0 # h3*r1
|
|
vpaddq $T0,$D4,$D4 # d4 += h3*r1
|
|
vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
|
|
vpmuludq $H2,$T2,$T1 # h2*r1
|
|
vpaddq $T1,$D3,$D3 # d3 += h2*r1
|
|
vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
|
|
vpmuludq $H1,$T2,$T0 # h1*r1
|
|
vpaddq $T0,$D2,$D2 # d2 += h1*r1
|
|
vpmuludq $H0,$T2,$T2 # h0*r1
|
|
vpaddq $T2,$D1,$D1 # d1 += h0*r1
|
|
vpmuludq $H4,$T3,$T3 # h4*s1
|
|
vpaddq $T3,$D0,$D0 # d0 += h4*s1
|
|
|
|
vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
|
|
vpmuludq $H2,$T4,$T1 # h2*r2
|
|
vpaddq $T1,$D4,$D4 # d4 += h2*r2
|
|
vpmuludq $H1,$T4,$T0 # h1*r2
|
|
vpaddq $T0,$D3,$D3 # d3 += h1*r2
|
|
vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
|
|
vpmuludq $H0,$T4,$T4 # h0*r2
|
|
vpaddq $T4,$D2,$D2 # d2 += h0*r2
|
|
vpmuludq $H4,$T2,$T1 # h4*s2
|
|
vpaddq $T1,$D1,$D1 # d1 += h4*s2
|
|
vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
|
|
vpmuludq $H3,$T2,$T2 # h3*s2
|
|
vpaddq $T2,$D0,$D0 # d0 += h3*s2
|
|
|
|
vpmuludq $H1,$T3,$T0 # h1*r3
|
|
vpaddq $T0,$D4,$D4 # d4 += h1*r3
|
|
vpmuludq $H0,$T3,$T3 # h0*r3
|
|
vpaddq $T3,$D3,$D3 # d3 += h0*r3
|
|
vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
|
|
vpmuludq $H4,$T4,$T1 # h4*s3
|
|
vpaddq $T1,$D2,$D2 # d2 += h4*s3
|
|
vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
|
|
vpmuludq $H3,$T4,$T0 # h3*s3
|
|
vpaddq $T0,$D1,$D1 # d1 += h3*s3
|
|
vpmuludq $H2,$T4,$T4 # h2*s3
|
|
vpaddq $T4,$D0,$D0 # d0 += h2*s3
|
|
|
|
vpmuludq $H0,$T2,$T2 # h0*r4
|
|
vpaddq $T2,$D4,$D4 # d4 += h0*r4
|
|
vpmuludq $H4,$T3,$T1 # h4*s4
|
|
vpaddq $T1,$D3,$D3 # d3 += h4*s4
|
|
vpmuludq $H3,$T3,$T0 # h3*s4
|
|
vpaddq $T0,$D2,$D2 # d2 += h3*s4
|
|
vpmuludq $H2,$T3,$T1 # h2*s4
|
|
vpaddq $T1,$D1,$D1 # d1 += h2*s4
|
|
vpmuludq $H1,$T3,$T3 # h1*s4
|
|
vpaddq $T3,$D0,$D0 # d0 += h1*s4
|
|
|
|
.Lshort_tail_avx:
|
|
################################################################
|
|
# horizontal addition
|
|
|
|
vpsrldq \$8,$D4,$T4
|
|
vpsrldq \$8,$D3,$T3
|
|
vpsrldq \$8,$D1,$T1
|
|
vpsrldq \$8,$D0,$T0
|
|
vpsrldq \$8,$D2,$T2
|
|
vpaddq $T3,$D3,$D3
|
|
vpaddq $T4,$D4,$D4
|
|
vpaddq $T0,$D0,$D0
|
|
vpaddq $T1,$D1,$D1
|
|
vpaddq $T2,$D2,$D2
|
|
|
|
################################################################
|
|
# lazy reduction
|
|
|
|
vpsrlq \$26,$D3,$H3
|
|
vpand $MASK,$D3,$D3
|
|
vpaddq $H3,$D4,$D4 # h3 -> h4
|
|
|
|
vpsrlq \$26,$D0,$H0
|
|
vpand $MASK,$D0,$D0
|
|
vpaddq $H0,$D1,$D1 # h0 -> h1
|
|
|
|
vpsrlq \$26,$D4,$H4
|
|
vpand $MASK,$D4,$D4
|
|
|
|
vpsrlq \$26,$D1,$H1
|
|
vpand $MASK,$D1,$D1
|
|
vpaddq $H1,$D2,$D2 # h1 -> h2
|
|
|
|
vpaddq $H4,$D0,$D0
|
|
vpsllq \$2,$H4,$H4
|
|
vpaddq $H4,$D0,$D0 # h4 -> h0
|
|
|
|
vpsrlq \$26,$D2,$H2
|
|
vpand $MASK,$D2,$D2
|
|
vpaddq $H2,$D3,$D3 # h2 -> h3
|
|
|
|
vpsrlq \$26,$D0,$H0
|
|
vpand $MASK,$D0,$D0
|
|
vpaddq $H0,$D1,$D1 # h0 -> h1
|
|
|
|
vpsrlq \$26,$D3,$H3
|
|
vpand $MASK,$D3,$D3
|
|
vpaddq $H3,$D4,$D4 # h3 -> h4
|
|
|
|
vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
|
|
vmovd $D1,`4*1-48-64`($ctx)
|
|
vmovd $D2,`4*2-48-64`($ctx)
|
|
vmovd $D3,`4*3-48-64`($ctx)
|
|
vmovd $D4,`4*4-48-64`($ctx)
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
vmovdqa 0x50(%r11),%xmm6
|
|
vmovdqa 0x60(%r11),%xmm7
|
|
vmovdqa 0x70(%r11),%xmm8
|
|
vmovdqa 0x80(%r11),%xmm9
|
|
vmovdqa 0x90(%r11),%xmm10
|
|
vmovdqa 0xa0(%r11),%xmm11
|
|
vmovdqa 0xb0(%r11),%xmm12
|
|
vmovdqa 0xc0(%r11),%xmm13
|
|
vmovdqa 0xd0(%r11),%xmm14
|
|
vmovdqa 0xe0(%r11),%xmm15
|
|
lea 0xf8(%r11),%rsp
|
|
.Ldo_avx_epilogue:
|
|
___
|
|
$code.=<<___ if (!$win64);
|
|
lea 0x58(%r11),%rsp
|
|
___
|
|
# Emits the common return sequence of poly1305_blocks_avx (vzeroupper/ret)
# followed by poly1305_emit_avx.
#
# poly1305_emit_avx takes three arguments (the code refers to them through
# $ctx, $mac and $nonce).  When the is_base2_26 flag at 20($ctx) is zero it
# branches to .Lemit.  Otherwise it:
#   1. loads the five 32-bit limbs at 0..16($ctx) and repacks them from
#      base 2^26 into h0:h1:h2 in base 2^64;
#   2. folds everything above bit 130 back into the low limbs multiplied by
#      5, propagating the carry all the way into h2 (the final "adc" below
#      was missing, so a carry out of h1 used to be silently dropped);
#   3. adds 5 and uses bit 130 of the sum to select between h and h+5,
#      i.e. the final conditional subtraction of the modulus;
#   4. adds the 128-bit value at $nonce and stores 16 bytes at $mac.
$code.=<<___;
	vzeroupper
	ret
.size	poly1305_blocks_avx,.-poly1305_blocks_avx

.type	poly1305_emit_avx,\@function,3
.align	32
poly1305_emit_avx:
	cmpl	\$0,20($ctx)	# is_base2_26?
	je	.Lemit

	mov	0($ctx),%eax	# load hash value base 2^26
	mov	4($ctx),%ecx
	mov	8($ctx),%r8d
	mov	12($ctx),%r11d
	mov	16($ctx),%r10d

	shl	\$26,%rcx	# base 2^26 -> base 2^64
	mov	%r8,%r9
	shl	\$52,%r8
	add	%rcx,%rax
	shr	\$12,%r9
	add	%rax,%r8	# h0
	adc	\$0,%r9

	shl	\$14,%r11
	mov	%r10,%rax
	shr	\$24,%r10
	add	%r11,%r9
	shl	\$40,%rax
	add	%rax,%r9	# h1
	adc	\$0,%r10	# h2

	mov	%r10,%rax	# could be partially reduced, so reduce
	mov	%r10,%rcx
	and	\$3,%r10
	shr	\$2,%rax
	and	\$-4,%rcx
	add	%rcx,%rax	# =*5
	add	%rax,%r8
	adc	\$0,%r9
	adc	\$0,%r10	# do not drop the carry out of h1

	mov	%r8,%rax
	add	\$5,%r8		# compare to modulus
	mov	%r9,%rcx
	adc	\$0,%r9
	adc	\$0,%r10
	shr	\$2,%r10	# did 130-bit value overflow?
	cmovnz	%r8,%rax
	cmovnz	%r9,%rcx

	add	0($nonce),%rax	# accumulate nonce
	adc	8($nonce),%rcx
	mov	%rax,0($mac)	# write result
	mov	%rcx,8($mac)

	ret
.size	poly1305_emit_avx,.-poly1305_emit_avx
___
if ($avx>1) {
# Symbolic names for the sixteen ymm registers used by the AVX2 code:
#   $H0..$H4  -> %ymm0..%ymm4
#   $MASK     -> %ymm5
#   $T4       -> %ymm6,  $T0..$T3 -> %ymm7..%ymm10
#   $D0..$D4  -> %ymm11..%ymm15
# $S4 is an alias for $MASK, i.e. both names refer to the same register,
# which is why the code reloads .Lmask26 after it has used $S4.
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
    map("%ymm$_",(0..15));
my $S4=$MASK;

# Entry point and scalar prologue of poly1305_blocks_avx2 (four arguments).
#
# Dispatch, as implemented below:
#   * fewer than 128 bytes and hash not yet in base 2^26 -> jump to .Lblocks;
#   * length is rounded down to a multiple of 16; zero -> return;
#   * hash in base 2^64 -> .Lbase2_64_avx2: process 16-byte blocks with
#     __poly1305_block until the remaining length is a multiple of 64, then
#     convert the hash to base 2^26, set is_base2_26 and call
#     __poly1305_init_avx;
#   * hash in base 2^26 and length not a multiple of 64 -> convert to base
#     2^64, process 16-byte blocks the same way and convert back (or store
#     in base 2^64 when the pad bit is zero);
#   * hash in base 2^26 and length a multiple of 64 -> .Leven_avx2 loads the
#     five limbs straight into vector registers.
# All vector paths fall through / jump to .Ldo_avx2.
#
# Fix relative to the previous text: in the base 2^26 -> base 2^64
# reduction the carry out of h1 is now propagated into h2 instead of being
# dropped.
$code.=<<___;
.type	poly1305_blocks_avx2,\@function,4
.align	32
poly1305_blocks_avx2:
	mov	20($ctx),%r8d		# is_base2_26
	cmp	\$128,$len
	jae	.Lblocks_avx2
	test	%r8d,%r8d
	jz	.Lblocks

.Lblocks_avx2:
	and	\$-16,$len
	jz	.Lno_data_avx2

	vzeroupper

	test	%r8d,%r8d
	jz	.Lbase2_64_avx2

	test	\$63,$len
	jz	.Leven_avx2

	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lblocks_avx2_body:

	mov	$len,%r15		# reassign $len

	mov	0($ctx),$d1		# load hash value
	mov	8($ctx),$d2
	mov	16($ctx),$h2#d

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	################################# base 2^26 -> base 2^64
	mov	$d1#d,$h0#d
	and	\$-1<<31,$d1
	mov	$d2,$r1			# borrow $r1
	mov	$d2#d,$h1#d
	and	\$-1<<31,$d2

	shr	\$6,$d1
	shl	\$52,$r1
	add	$d1,$h0
	shr	\$12,$h1
	shr	\$18,$d2
	add	$r1,$h0
	adc	$d2,$h1

	mov	$h2,$d1
	shl	\$40,$d1
	shr	\$24,$h2
	add	$d1,$h1
	adc	\$0,$h2			# can be partially reduced...

	mov	\$-4,$d2		# ... so reduce
	mov	$h2,$d1
	and	$h2,$d2
	shr	\$2,$d1
	and	\$3,$h2
	add	$d2,$d1			# =*5
	add	$d1,$h0
	adc	\$0,$h1
	adc	\$0,$h2			# do not drop the carry out of h1

	mov	$s1,$r1
	mov	$s1,%rax
	shr	\$2,$s1
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

.Lbase2_26_pre_avx2:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
	sub	\$16,%r15

	call	__poly1305_block
	mov	$r1,%rax

	test	\$63,%r15
	jnz	.Lbase2_26_pre_avx2

	test	$padbit,$padbit		# if $padbit is zero,
	jz	.Lstore_base2_64_avx2	# store hash in base 2^64 format

	################################# base 2^64 -> base 2^26
	mov	$h0,%rax
	mov	$h0,%rdx
	shr	\$52,$h0
	mov	$h1,$r0
	mov	$h1,$r1
	shr	\$26,%rdx
	and	\$0x3ffffff,%rax	# h[0]
	shl	\$12,$r0
	and	\$0x3ffffff,%rdx	# h[1]
	shr	\$14,$h1
	or	$r0,$h0
	shl	\$24,$h2
	and	\$0x3ffffff,$h0		# h[2]
	shr	\$40,$r1
	and	\$0x3ffffff,$h1		# h[3]
	or	$r1,$h2			# h[4]

	test	%r15,%r15
	jz	.Lstore_base2_26_avx2

	vmovd	%rax#d,%x#$H0
	vmovd	%rdx#d,%x#$H1
	vmovd	$h0#d,%x#$H2
	vmovd	$h1#d,%x#$H3
	vmovd	$h2#d,%x#$H4
	jmp	.Lproceed_avx2

.align	32
.Lstore_base2_64_avx2:
	mov	$h0,0($ctx)
	mov	$h1,8($ctx)
	mov	$h2,16($ctx)		# note that is_base2_26 is zeroed
	jmp	.Ldone_avx2

.align	16
.Lstore_base2_26_avx2:
	mov	%rax#d,0($ctx)		# store hash value base 2^26
	mov	%rdx#d,4($ctx)
	mov	$h0#d,8($ctx)
	mov	$h1#d,12($ctx)
	mov	$h2#d,16($ctx)
.align	16
.Ldone_avx2:
	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%r12
	mov	32(%rsp),%rbp
	mov	40(%rsp),%rbx
	lea	48(%rsp),%rsp
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
	ret

.align	32
.Lbase2_64_avx2:
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
.Lbase2_64_avx2_body:

	mov	$len,%r15		# reassign $len

	mov	24($ctx),$r0		# load r
	mov	32($ctx),$s1

	mov	0($ctx),$h0		# load hash value
	mov	8($ctx),$h1
	mov	16($ctx),$h2#d

	mov	$s1,$r1
	mov	$s1,%rax
	shr	\$2,$s1
	add	$r1,$s1			# s1 = r1 + (r1 >> 2)

	test	\$63,$len
	jz	.Linit_avx2

.Lbase2_64_pre_avx2:
	add	0($inp),$h0		# accumulate input
	adc	8($inp),$h1
	lea	16($inp),$inp
	adc	$padbit,$h2
	sub	\$16,%r15

	call	__poly1305_block
	mov	$r1,%rax

	test	\$63,%r15
	jnz	.Lbase2_64_pre_avx2

.Linit_avx2:
	################################# base 2^64 -> base 2^26
	mov	$h0,%rax
	mov	$h0,%rdx
	shr	\$52,$h0
	mov	$h1,$d1
	mov	$h1,$d2
	shr	\$26,%rdx
	and	\$0x3ffffff,%rax	# h[0]
	shl	\$12,$d1
	and	\$0x3ffffff,%rdx	# h[1]
	shr	\$14,$h1
	or	$d1,$h0
	shl	\$24,$h2
	and	\$0x3ffffff,$h0		# h[2]
	shr	\$40,$d2
	and	\$0x3ffffff,$h1		# h[3]
	or	$d2,$h2			# h[4]

	vmovd	%rax#d,%x#$H0
	vmovd	%rdx#d,%x#$H1
	vmovd	$h0#d,%x#$H2
	vmovd	$h1#d,%x#$H3
	vmovd	$h2#d,%x#$H4
	movl	\$1,20($ctx)		# set is_base2_26

	call	__poly1305_init_avx

.Lproceed_avx2:
	mov	%r15,$len

	mov	0(%rsp),%r15
	mov	8(%rsp),%r14
	mov	16(%rsp),%r13
	mov	24(%rsp),%r12
	mov	32(%rsp),%rbp
	mov	40(%rsp),%rbx
	lea	48(%rsp),%rax
	lea	48(%rsp),%rsp
.Lbase2_64_avx2_epilogue:
	jmp	.Ldo_avx2

.align	32
.Leven_avx2:
	vmovd	4*0($ctx),%x#$H0	# load hash value base 2^26
	vmovd	4*1($ctx),%x#$H1
	vmovd	4*2($ctx),%x#$H2
	vmovd	4*3($ctx),%x#$H3
	vmovd	4*4($ctx),%x#$H4

.Ldo_avx2:
___
# Stack frame for the AVX2 vector code.  %r11 remembers where the frame
# started so the epilogue can restore %rsp from it after %rsp has been
# re-aligned.  The non-Win64 variant only reserves scratch space.
$code.=<<___	if (!$win64);
	lea	-8(%rsp),%r11
	sub	\$0x128,%rsp
___
# The Win64 variant reserves a larger frame and additionally saves
# %xmm6-%xmm15 at 0x50..0xe0(%r11); the matching epilogue reloads them
# from the same offsets.
$code.=<<___	if ($win64);
	lea	-0xf8(%rsp),%r11
	sub	\$0x1c8,%rsp
	vmovdqa	%xmm6,0x50(%r11)
	vmovdqa	%xmm7,0x60(%r11)
	vmovdqa	%xmm8,0x70(%r11)
	vmovdqa	%xmm9,0x80(%r11)
	vmovdqa	%xmm10,0x90(%r11)
	vmovdqa	%xmm11,0xa0(%r11)
	vmovdqa	%xmm12,0xb0(%r11)
	vmovdqa	%xmm13,0xc0(%r11)
	vmovdqa	%xmm14,0xd0(%r11)
	vmovdqa	%xmm15,0xe0(%r11)
.Ldo_avx2_body:
___
# Vector body of poly1305_blocks_avx2.
#
#   * Biases $ctx by 48+64 and points %rcx at .Lconst.
#   * Aligns %rsp down to 512 bytes and copies the nine 16-byte table
#     entries found at 16*0-64 .. 16*8-64 relative to the biased $ctx onto
#     the stack as nine 32-byte rows (vpermq/vpshufd expand each entry).
#     Row order as used by the multiplications below:
#     r0, r1, s1, r2, s2, r3, s3, r4, s4.
#   * Loads 64 bytes of input, splits them into five 26-bit limbs per lane
#     and ORs in the pad bit taken from 32(%rcx).
#   * .Loop_avx2 consumes 64 bytes per iteration: multiply the accumulator
#     by the table rows, perform the lazy carry propagation and, interleaved
#     with that, load and split the next 64 bytes.
#   * .Ltail_avx2 repeats the multiplication with the table addresses
#     displaced by 4 bytes, sums the four lanes, reduces once more and
#     stores the five limbs at 4*0-48-64 .. 4*4-48-64 relative to the
#     biased $ctx.
#
# Instruction order is unchanged from the previous text; only comments
# were corrected (inp[] instead of r[] in the Horner scheme, and s4
# instead of r4 where the multiplier is the s4 register).
$code.=<<___;
	lea	48+64($ctx),$ctx	# size optimization
	lea	.Lconst(%rip),%rcx

	# expand and copy pre-calculated table to stack
	vmovdqu	`16*0-64`($ctx),%x#$T2
	and	\$-512,%rsp
	vmovdqu	`16*1-64`($ctx),%x#$T3
	vmovdqu	`16*2-64`($ctx),%x#$T4
	vmovdqu	`16*3-64`($ctx),%x#$D0
	vmovdqu	`16*4-64`($ctx),%x#$D1
	vmovdqu	`16*5-64`($ctx),%x#$D2
	vmovdqu	`16*6-64`($ctx),%x#$D3
	vpermq	\$0x15,$T2,$T2		# 00003412 -> 12343434
	vmovdqu	`16*7-64`($ctx),%x#$D4
	vpermq	\$0x15,$T3,$T3
	vpshufd	\$0xc8,$T2,$T2		# 12343434 -> 14243444
	vmovdqu	`16*8-64`($ctx),%x#$MASK
	vpermq	\$0x15,$T4,$T4
	vpshufd	\$0xc8,$T3,$T3
	vmovdqa	$T2,0x00(%rsp)
	vpermq	\$0x15,$D0,$D0
	vpshufd	\$0xc8,$T4,$T4
	vmovdqa	$T3,0x20(%rsp)
	vpermq	\$0x15,$D1,$D1
	vpshufd	\$0xc8,$D0,$D0
	vmovdqa	$T4,0x40(%rsp)
	vpermq	\$0x15,$D2,$D2
	vpshufd	\$0xc8,$D1,$D1
	vmovdqa	$D0,0x60(%rsp)
	vpermq	\$0x15,$D3,$D3
	vpshufd	\$0xc8,$D2,$D2
	vmovdqa	$D1,0x80(%rsp)
	vpermq	\$0x15,$D4,$D4
	vpshufd	\$0xc8,$D3,$D3
	vmovdqa	$D2,0xa0(%rsp)
	vpermq	\$0x15,$MASK,$MASK
	vpshufd	\$0xc8,$D4,$D4
	vmovdqa	$D3,0xc0(%rsp)
	vpshufd	\$0xc8,$MASK,$MASK
	vmovdqa	$D4,0xe0(%rsp)
	vmovdqa	$MASK,0x100(%rsp)
	vmovdqa	64(%rcx),$MASK		# .Lmask26

	################################################################
	# load input
	vmovdqu	16*0($inp),%x#$T0
	vmovdqu	16*1($inp),%x#$T1
	vinserti128	\$1,16*2($inp),$T0,$T0
	vinserti128	\$1,16*3($inp),$T1,$T1
	lea	16*4($inp),$inp

	vpsrldq	\$6,$T0,$T2		# splat input
	vpsrldq	\$6,$T1,$T3
	vpunpckhqdq	$T1,$T0,$T4	# 4
	vpunpcklqdq	$T3,$T2,$T2	# 2:3
	vpunpcklqdq	$T1,$T0,$T0	# 0:1

	vpsrlq	\$30,$T2,$T3
	vpsrlq	\$4,$T2,$T2
	vpsrlq	\$26,$T0,$T1
	vpsrlq	\$40,$T4,$T4		# 4
	vpand	$MASK,$T2,$T2		# 2
	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always

	lea	0x90(%rsp),%rax		# size optimization
	vpaddq	$H2,$T2,$H2		# accumulate input
	sub	\$64,$len
	jz	.Ltail_avx2
	jmp	.Loop_avx2

.align	32
.Loop_avx2:
	################################################################
	# ((inp[0]*r^4+inp[4])*r^4+inp[8])*r^4
	# ((inp[1]*r^4+inp[5])*r^4+inp[9])*r^3
	# ((inp[2]*r^4+inp[6])*r^4+inp[10])*r^2
	# ((inp[3]*r^4+inp[7])*r^4+inp[11])*r^1
	#   \________/\________/
	################################################################
	#vpaddq	$H2,$T2,$H2		# accumulate input
	vpaddq	$H0,$T0,$H0
	vmovdqa	`32*0`(%rsp),$T0	# r0^4
	vpaddq	$H1,$T1,$H1
	vmovdqa	`32*1`(%rsp),$T1	# r1^4
	vpaddq	$H3,$T3,$H3
	vmovdqa	`32*3`(%rsp),$T2	# r2^4
	vpaddq	$H4,$T4,$H4
	vmovdqa	`32*6-0x90`(%rax),$T3	# s3^4
	vmovdqa	`32*8-0x90`(%rax),$S4	# s4^4

	# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
	# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
	# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
	#
	# however, as h2 is "chronologically" first one available pull
	# corresponding operations up, so it's
	#
	# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
	# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
	# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
	# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
	# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4

	vpmuludq	$H2,$T0,$D2	# d2 = h2*r0
	vpmuludq	$H2,$T1,$D3	# d3 = h2*r1
	vpmuludq	$H2,$T2,$D4	# d4 = h2*r2
	vpmuludq	$H2,$T3,$D0	# d0 = h2*s3
	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4

	vpmuludq	$H0,$T1,$T4	# h0*r1
	vpmuludq	$H1,$T1,$H2	# h1*r1, borrow $H2 as temp
	vpaddq	$T4,$D1,$D1		# d1 += h0*r1
	vpaddq	$H2,$D2,$D2		# d2 += h1*r1
	vpmuludq	$H3,$T1,$T4	# h3*r1
	vpmuludq	`32*2`(%rsp),$H4,$H2	# h4*s1
	vpaddq	$T4,$D4,$D4		# d4 += h3*r1
	vpaddq	$H2,$D0,$D0		# d0 += h4*s1
	vmovdqa	`32*4-0x90`(%rax),$T1	# s2

	vpmuludq	$H0,$T0,$T4	# h0*r0
	vpmuludq	$H1,$T0,$H2	# h1*r0
	vpaddq	$T4,$D0,$D0		# d0 += h0*r0
	vpaddq	$H2,$D1,$D1		# d1 += h1*r0
	vpmuludq	$H3,$T0,$T4	# h3*r0
	vpmuludq	$H4,$T0,$H2	# h4*r0
	vmovdqu	16*0($inp),%x#$T0	# load input
	vpaddq	$T4,$D3,$D3		# d3 += h3*r0
	vpaddq	$H2,$D4,$D4		# d4 += h4*r0
	vinserti128	\$1,16*2($inp),$T0,$T0

	vpmuludq	$H3,$T1,$T4	# h3*s2
	vpmuludq	$H4,$T1,$H2	# h4*s2
	vmovdqu	16*1($inp),%x#$T1
	vpaddq	$T4,$D0,$D0		# d0 += h3*s2
	vpaddq	$H2,$D1,$D1		# d1 += h4*s2
	vmovdqa	`32*5-0x90`(%rax),$H2	# r3
	vpmuludq	$H1,$T2,$T4	# h1*r2
	vpmuludq	$H0,$T2,$T2	# h0*r2
	vpaddq	$T4,$D3,$D3		# d3 += h1*r2
	vpaddq	$T2,$D2,$D2		# d2 += h0*r2
	vinserti128	\$1,16*3($inp),$T1,$T1
	lea	16*4($inp),$inp

	vpmuludq	$H1,$H2,$T4	# h1*r3
	vpmuludq	$H0,$H2,$H2	# h0*r3
	vpsrldq	\$6,$T0,$T2		# splat input
	vpaddq	$T4,$D4,$D4		# d4 += h1*r3
	vpaddq	$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4	# h3*s3
	vpmuludq	$H4,$T3,$H2	# h4*s3
	vpsrldq	\$6,$T1,$T3
	vpaddq	$T4,$D1,$D1		# d1 += h3*s3
	vpaddq	$H2,$D2,$D2		# d2 += h4*s3
	vpunpckhqdq	$T1,$T0,$T4	# 4

	vpmuludq	$H3,$S4,$H3	# h3*s4
	vpmuludq	$H4,$S4,$H4	# h4*s4
	vpunpcklqdq	$T1,$T0,$T0	# 0:1
	vpaddq	$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq	$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpunpcklqdq	$T3,$T2,$T3	# 2:3
	vpmuludq	`32*7-0x90`(%rax),$H0,$H4	# h0*r4
	vpmuludq	$H1,$S4,$H0	# h1*s4
	vmovdqa	64(%rcx),$MASK		# .Lmask26
	vpaddq	$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq	$H0,$D0,$H0		# h0 = d0 + h1*s4

	################################################################
	# lazy reduction (interleaved with tail of input splat)

	vpsrlq	\$26,$H3,$D3
	vpand	$MASK,$H3,$H3
	vpaddq	$D3,$H4,$H4		# h3 -> h4

	vpsrlq	\$26,$H0,$D0
	vpand	$MASK,$H0,$H0
	vpaddq	$D0,$D1,$H1		# h0 -> h1

	vpsrlq	\$26,$H4,$D4
	vpand	$MASK,$H4,$H4

	vpsrlq	\$4,$T3,$T2

	vpsrlq	\$26,$H1,$D1
	vpand	$MASK,$H1,$H1
	vpaddq	$D1,$H2,$H2		# h1 -> h2

	vpaddq	$D4,$H0,$H0
	vpsllq	\$2,$D4,$D4
	vpaddq	$D4,$H0,$H0		# h4 -> h0

	vpand	$MASK,$T2,$T2		# 2
	vpsrlq	\$26,$T0,$T1

	vpsrlq	\$26,$H2,$D2
	vpand	$MASK,$H2,$H2
	vpaddq	$D2,$H3,$H3		# h2 -> h3

	vpaddq	$T2,$H2,$H2		# modulo-scheduled
	vpsrlq	\$30,$T3,$T3

	vpsrlq	\$26,$H0,$D0
	vpand	$MASK,$H0,$H0
	vpaddq	$D0,$H1,$H1		# h0 -> h1

	vpsrlq	\$40,$T4,$T4		# 4

	vpsrlq	\$26,$H3,$D3
	vpand	$MASK,$H3,$H3
	vpaddq	$D3,$H4,$H4		# h3 -> h4

	vpand	$MASK,$T0,$T0		# 0
	vpand	$MASK,$T1,$T1		# 1
	vpand	$MASK,$T3,$T3		# 3
	vpor	32(%rcx),$T4,$T4	# padbit, yes, always

	sub	\$64,$len
	jnz	.Loop_avx2

	.byte	0x66,0x90
.Ltail_avx2:
	################################################################
	# while above multiplications were by r^4 in all lanes, in last
	# iteration we multiply least significant lane by r^4 and most
	# significant one by r, so copy of above except that references
	# to the precomputed table are displaced by 4...

	#vpaddq	$H2,$T2,$H2		# accumulate input
	vpaddq	$H0,$T0,$H0
	vmovdqu	`32*0+4`(%rsp),$T0	# r0^4
	vpaddq	$H1,$T1,$H1
	vmovdqu	`32*1+4`(%rsp),$T1	# r1^4
	vpaddq	$H3,$T3,$H3
	vmovdqu	`32*3+4`(%rsp),$T2	# r2^4
	vpaddq	$H4,$T4,$H4
	vmovdqu	`32*6+4-0x90`(%rax),$T3	# s3^4
	vmovdqu	`32*8+4-0x90`(%rax),$S4	# s4^4

	vpmuludq	$H2,$T0,$D2	# d2 = h2*r0
	vpmuludq	$H2,$T1,$D3	# d3 = h2*r1
	vpmuludq	$H2,$T2,$D4	# d4 = h2*r2
	vpmuludq	$H2,$T3,$D0	# d0 = h2*s3
	vpmuludq	$H2,$S4,$D1	# d1 = h2*s4

	vpmuludq	$H0,$T1,$T4	# h0*r1
	vpmuludq	$H1,$T1,$H2	# h1*r1
	vpaddq	$T4,$D1,$D1		# d1 += h0*r1
	vpaddq	$H2,$D2,$D2		# d2 += h1*r1
	vpmuludq	$H3,$T1,$T4	# h3*r1
	vpmuludq	`32*2+4`(%rsp),$H4,$H2	# h4*s1
	vpaddq	$T4,$D4,$D4		# d4 += h3*r1
	vpaddq	$H2,$D0,$D0		# d0 += h4*s1

	vpmuludq	$H0,$T0,$T4	# h0*r0
	vpmuludq	$H1,$T0,$H2	# h1*r0
	vpaddq	$T4,$D0,$D0		# d0 += h0*r0
	vmovdqu	`32*4+4-0x90`(%rax),$T1	# s2
	vpaddq	$H2,$D1,$D1		# d1 += h1*r0
	vpmuludq	$H3,$T0,$T4	# h3*r0
	vpmuludq	$H4,$T0,$H2	# h4*r0
	vpaddq	$T4,$D3,$D3		# d3 += h3*r0
	vpaddq	$H2,$D4,$D4		# d4 += h4*r0

	vpmuludq	$H3,$T1,$T4	# h3*s2
	vpmuludq	$H4,$T1,$H2	# h4*s2
	vpaddq	$T4,$D0,$D0		# d0 += h3*s2
	vpaddq	$H2,$D1,$D1		# d1 += h4*s2
	vmovdqu	`32*5+4-0x90`(%rax),$H2	# r3
	vpmuludq	$H1,$T2,$T4	# h1*r2
	vpmuludq	$H0,$T2,$T2	# h0*r2
	vpaddq	$T4,$D3,$D3		# d3 += h1*r2
	vpaddq	$T2,$D2,$D2		# d2 += h0*r2

	vpmuludq	$H1,$H2,$T4	# h1*r3
	vpmuludq	$H0,$H2,$H2	# h0*r3
	vpaddq	$T4,$D4,$D4		# d4 += h1*r3
	vpaddq	$H2,$D3,$D3		# d3 += h0*r3
	vpmuludq	$H3,$T3,$T4	# h3*s3
	vpmuludq	$H4,$T3,$H2	# h4*s3
	vpaddq	$T4,$D1,$D1		# d1 += h3*s3
	vpaddq	$H2,$D2,$D2		# d2 += h4*s3

	vpmuludq	$H3,$S4,$H3	# h3*s4
	vpmuludq	$H4,$S4,$H4	# h4*s4
	vpaddq	$H3,$D2,$H2		# h2 = d2 + h3*s4
	vpaddq	$H4,$D3,$H3		# h3 = d3 + h4*s4
	vpmuludq	`32*7+4-0x90`(%rax),$H0,$H4	# h0*r4
	vpmuludq	$H1,$S4,$H0	# h1*s4
	vmovdqa	64(%rcx),$MASK		# .Lmask26
	vpaddq	$H4,$D4,$H4		# h4 = d4 + h0*r4
	vpaddq	$H0,$D0,$H0		# h0 = d0 + h1*s4

	################################################################
	# horizontal addition

	vpsrldq	\$8,$D1,$T1
	vpsrldq	\$8,$H2,$T2
	vpsrldq	\$8,$H3,$T3
	vpsrldq	\$8,$H4,$T4
	vpsrldq	\$8,$H0,$T0
	vpaddq	$T1,$D1,$D1
	vpaddq	$T2,$H2,$H2
	vpaddq	$T3,$H3,$H3
	vpaddq	$T4,$H4,$H4
	vpaddq	$T0,$H0,$H0

	vpermq	\$0x2,$H3,$T3
	vpermq	\$0x2,$H4,$T4
	vpermq	\$0x2,$H0,$T0
	vpermq	\$0x2,$D1,$T1
	vpermq	\$0x2,$H2,$T2
	vpaddq	$T3,$H3,$H3
	vpaddq	$T4,$H4,$H4
	vpaddq	$T0,$H0,$H0
	vpaddq	$T1,$D1,$D1
	vpaddq	$T2,$H2,$H2

	################################################################
	# lazy reduction

	vpsrlq	\$26,$H3,$D3
	vpand	$MASK,$H3,$H3
	vpaddq	$D3,$H4,$H4		# h3 -> h4

	vpsrlq	\$26,$H0,$D0
	vpand	$MASK,$H0,$H0
	vpaddq	$D0,$D1,$H1		# h0 -> h1

	vpsrlq	\$26,$H4,$D4
	vpand	$MASK,$H4,$H4

	vpsrlq	\$26,$H1,$D1
	vpand	$MASK,$H1,$H1
	vpaddq	$D1,$H2,$H2		# h1 -> h2

	vpaddq	$D4,$H0,$H0
	vpsllq	\$2,$D4,$D4
	vpaddq	$D4,$H0,$H0		# h4 -> h0

	vpsrlq	\$26,$H2,$D2
	vpand	$MASK,$H2,$H2
	vpaddq	$D2,$H3,$H3		# h2 -> h3

	vpsrlq	\$26,$H0,$D0
	vpand	$MASK,$H0,$H0
	vpaddq	$D0,$H1,$H1		# h0 -> h1

	vpsrlq	\$26,$H3,$D3
	vpand	$MASK,$H3,$H3
	vpaddq	$D3,$H4,$H4		# h3 -> h4

	vmovd	%x#$H0,`4*0-48-64`($ctx)# save partially reduced
	vmovd	%x#$H1,`4*1-48-64`($ctx)
	vmovd	%x#$H2,`4*2-48-64`($ctx)
	vmovd	%x#$H3,`4*3-48-64`($ctx)
	vmovd	%x#$H4,`4*4-48-64`($ctx)
___
# Epilogue of poly1305_blocks_avx2.  On Win64 the ten registers
# %xmm6-%xmm15 are reloaded from 0x50..0xe0(%r11), the offsets the Win64
# prologue stored them at, and %rsp is restored from %r11.
$code.=<<___	if ($win64);
	vmovdqa	0x50(%r11),%xmm6
	vmovdqa	0x60(%r11),%xmm7
	vmovdqa	0x70(%r11),%xmm8
	vmovdqa	0x80(%r11),%xmm9
	vmovdqa	0x90(%r11),%xmm10
	vmovdqa	0xa0(%r11),%xmm11
	vmovdqa	0xb0(%r11),%xmm12
	vmovdqa	0xc0(%r11),%xmm13
	vmovdqa	0xd0(%r11),%xmm14
	vmovdqa	0xe0(%r11),%xmm15
	lea	0xf8(%r11),%rsp
.Ldo_avx2_epilogue:
___
# Elsewhere only the stack pointer needs restoring (the prologue set
# %r11 to the entry %rsp minus 8).
$code.=<<___	if (!$win64);
	lea	8(%r11),%rsp
___
# Common return: clear the upper halves of the ymm registers, then return.
$code.=<<___;
	vzeroupper
	ret
.size	poly1305_blocks_avx2,.-poly1305_blocks_avx2
___
}
# Constant pool shared by the vector code, 64-byte aligned.  Each entry is
# 32 bytes (eight .long values), so relative to .Lconst the offsets are:
#    0  .Lmask24   low 24 bits of every 64-bit lane
#   32  .L129      1<<24 in every 64-bit lane (ORed in as the "padbit")
#   64  .Lmask26   low 26 bits of every 64-bit lane
#   96  .Lfive     the constant 5 in every 64-bit lane
# The code addresses them as 0(%rcx), 32(%rcx) and 64(%rcx).
$code.=<<___;
.align	64
.Lconst:
.Lmask24:
.long	0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long	1<<24,0,1<<24,0,1<<24,0,1<<24,0
.Lmask26:
.long	0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lfive:
.long	5,0,5,0,5,0,5,0
___
}

# Identification string embedded in the generated object, followed by
# re-alignment to 16 bytes.
$code.=<<___;
.asciz	"Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# Win64 structured-exception-handling support.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Two handlers are emitted; both receive two RVAs (a "prologue" label and
# an "epilogue" label) through HandlerData[]:
#   se_handler  - when the faulting Rip lies between the two labels, reloads
#                 %rbx, %rbp and %r12-%r15 from the 48 bytes the function
#                 pushed and records them in the CONTEXT;
#   avx_handler - when the faulting Rip lies between the two labels, copies
#                 the saved %xmm6-%xmm15 (20 quadwords starting at 0x50
#                 relative to the saved %r11) into the CONTEXT.
# Both then share .Lcommon_seh_tail, which fixes up Rsp/Rsi/Rdi in the
# CONTEXT, copies it to disp->ContextRecord, calls RtlVirtualUnwind and
# returns ExceptionContinueSearch.
#
# The .pdata section maps code ranges to the .xdata records that follow.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind
.type	se_handler,\@abi-omnipotent
.align	16
se_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<.Lprologue
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lepilogue
	jae	.Lcommon_seh_tail

	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	se_handler,.-se_handler

.type	avx_handler,\@abi-omnipotent
.align	16
avx_handler:
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# prologue label
	cmp	%r10,%rbx		# context->Rip<prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	208($context),%rax	# pull context->R11

	lea	0x50(%rax),%rsi
	lea	0xf8(%rax),%rax
	lea	512($context),%rdi	# &context.Xmm6
	mov	\$20,%ecx
	.long	0xa548f3fc		# cld; rep movsq

.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	avx_handler,.-avx_handler

.section	.pdata
.align	4
	.rva	.LSEH_begin_poly1305_init
	.rva	.LSEH_end_poly1305_init
	.rva	.LSEH_info_poly1305_init

	.rva	.LSEH_begin_poly1305_blocks
	.rva	.LSEH_end_poly1305_blocks
	.rva	.LSEH_info_poly1305_blocks

	.rva	.LSEH_begin_poly1305_emit
	.rva	.LSEH_end_poly1305_emit
	.rva	.LSEH_info_poly1305_emit
___
$code.=<<___ if ($avx);
	.rva	.LSEH_begin_poly1305_blocks_avx
	.rva	.Lbase2_64_avx
	.rva	.LSEH_info_poly1305_blocks_avx_1

	.rva	.Lbase2_64_avx
	.rva	.Leven_avx
	.rva	.LSEH_info_poly1305_blocks_avx_2

	.rva	.Leven_avx
	.rva	.LSEH_end_poly1305_blocks_avx
	.rva	.LSEH_info_poly1305_blocks_avx_3

	.rva	.LSEH_begin_poly1305_emit_avx
	.rva	.LSEH_end_poly1305_emit_avx
	.rva	.LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
	.rva	.LSEH_begin_poly1305_blocks_avx2
	.rva	.Lbase2_64_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_1

	.rva	.Lbase2_64_avx2
	.rva	.Leven_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_2

	.rva	.Leven_avx2
	.rva	.LSEH_end_poly1305_blocks_avx2
	.rva	.LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___;
.section	.xdata
.align	8
.LSEH_info_poly1305_init:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init

.LSEH_info_poly1305_blocks:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_body,.Lblocks_epilogue

.LSEH_info_poly1305_emit:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx_body,.Lblocks_avx_epilogue		# HandlerData[]

.LSEH_info_poly1305_blocks_avx_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx_body,.Lbase2_64_avx_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx_body,.Ldo_avx_epilogue			# HandlerData[]

.LSEH_info_poly1305_emit_avx:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lblocks_avx2_body,.Lblocks_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_2:
	.byte	9,0,0,0
	.rva	se_handler
	.rva	.Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue	# HandlerData[]

.LSEH_info_poly1305_blocks_avx2_3:
	.byte	9,0,0,0
	.rva	avx_handler
	.rva	.Ldo_avx2_body,.Ldo_avx2_epilogue		# HandlerData[]
___
}

# Post-process the accumulated assembly text one line at a time and write
# it to STDOUT:
#   * `expr`     -> value of the Perl expression (used for constant offsets);
#   * %rax#d     -> %eax   (32-bit name of a lettered 64-bit register);
#   * %r8#d      -> %r8d   (32-bit name of a numbered 64-bit register);
#   * %x#%ymmN   -> %xmmN  (low 128-bit half of a ymm register).
foreach (split('\n',$code)) {
	s/\`([^\`]*)\`/eval($1)/ge;
	s/%r([a-z]+)#d/%e$1/g;
	s/%r([0-9]+)#d/%r$1d/g;
	s/%x#%y/%x/g;

	print $_,"\n";
}

# close() is where buffered output errors surface; fail loudly instead of
# leaving a truncated assembly file behind.
close STDOUT or die "error closing STDOUT: $!";