boringssl/crypto/poly1305/asm/poly1305-x86_64.pl
David Benjamin fdd8e9c8c7 Switch perlasm calling convention.
Depending on architecture, perlasm differed on which one or both of:

  perl foo.pl flavor output.S
  perl foo.pl flavor > output.S

Upstream has now unified on the first form after making a number of
changes to their files (the second does not even work for their x86
files anymore). Sync those portions of our perlasm scripts with upstream
and update CMakeLists.txt and generate_build_files.py per the new
convention.

This imports various commits like this one:
184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a
diff, so I don't have the full list)

Confirmed that generate_build_files.py sees no change.

BUG=14

Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2
Reviewed-on: https://boringssl-review.googlesource.com/8518
Reviewed-by: Adam Langley <agl@google.com>
2016-06-27 21:59:26 +00:00

2236 lines
51 KiB
Perl
Executable File

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements Poly1305 hash for x86_64.
#
# March 2015
#
# Numbers are cycles per processed byte with poly1305_blocks alone,
# measured with rdtsc at fixed clock frequency.
#
# IALU/gcc-4.8(*) AVX(**) AVX2
# P4 4.90/+120% -
# Core 2 2.39/+90% -
# Westmere 1.86/+120% -
# Sandy Bridge 1.39/+140% 1.10
# Haswell 1.10/+175% 1.11 0.65
# Skylake 1.12/+120% 0.96 0.51
# Silvermont 2.83/+95% -
# VIA Nano 1.82/+150% -
# Sledgehammer 1.38/+160% -
# Bulldozer 2.21/+130% 0.97
#
# (*) improvement coefficients relative to clang are more modest and
# are ~50% on most processors, in both cases we are comparing to
# __int128 code;
# (**) SSE2 implementation was attempted, but among non-AVX processors
# it was faster than integer-only code only on older Intel P4 and
# Core processors, 50-30%, less newer processor is, but slower on
# contemporary ones, for example almost 2x slower on Atom, and as
# former are naturally disappearing, SSE2 is deemed unnecessary;
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
$avx = 2;
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
my ($ctx,$inp,$len,$padbit)=("%rdi","%rsi","%rdx","%rcx");
my ($mac,$nonce)=($inp,$len); # *_emit arguments
my ($d1,$d2,$d3, $r0,$r1,$s1)=map("%r$_",(8..13));
my ($h0,$h1,$h2)=("%r14","%rbx","%rbp");
sub poly1305_iteration {
# input: copy of $r1 in %rax, $h0-$h2, $r0-$r1
# output: $h0-$h2 *= $r0-$r1
$code.=<<___;
mulq $h0 # h0*r1
mov %rax,$d2
mov $r0,%rax
mov %rdx,$d3
mulq $h0 # h0*r0
mov %rax,$h0 # future $h0
mov $r0,%rax
mov %rdx,$d1
mulq $h1 # h1*r0
add %rax,$d2
mov $s1,%rax
adc %rdx,$d3
mulq $h1 # h1*s1
mov $h2,$h1 # borrow $h1
add %rax,$h0
adc %rdx,$d1
imulq $s1,$h1 # h2*s1
add $h1,$d2
mov $d1,$h1
adc \$0,$d3
imulq $r0,$h2 # h2*r0
add $d2,$h1
mov \$-4,%rax # mask value
adc $h2,$d3
and $d3,%rax # last reduction step
mov $d3,$h2
shr \$2,$d3
and \$3,$h2
add $d3,%rax
add %rax,$h0
adc \$0,$h1
___
}
########################################################################
# Layout of opaque area is following.
#
# unsigned __int64 h[3]; # current hash value base 2^64
# unsigned __int64 r[2]; # key value base 2^64
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl poly1305_init
.globl poly1305_blocks
.globl poly1305_emit
.type poly1305_init,\@function,3
.align 32
poly1305_init:
xor %rax,%rax
mov %rax,0($ctx) # initialize hash value
mov %rax,8($ctx)
mov %rax,16($ctx)
cmp \$0,$inp
je .Lno_key
lea poly1305_blocks(%rip),%r10
lea poly1305_emit(%rip),%r11
___
$code.=<<___ if ($avx);
mov OPENSSL_ia32cap_P+4(%rip),%r9
lea poly1305_blocks_avx(%rip),%rax
lea poly1305_emit_avx(%rip),%rcx
bt \$`60-32`,%r9 # AVX?
cmovc %rax,%r10
cmovc %rcx,%r11
___
$code.=<<___ if ($avx>1);
lea poly1305_blocks_avx2(%rip),%rax
bt \$`5+32`,%r9 # AVX2?
cmovc %rax,%r10
___
$code.=<<___;
mov \$0x0ffffffc0fffffff,%rax
mov \$0x0ffffffc0ffffffc,%rcx
and 0($inp),%rax
and 8($inp),%rcx
mov %rax,24($ctx)
mov %rcx,32($ctx)
___
$code.=<<___ if ($flavour !~ /elf32/);
mov %r10,0(%rdx)
mov %r11,8(%rdx)
___
$code.=<<___ if ($flavour =~ /elf32/);
mov %r10d,0(%rdx)
mov %r11d,4(%rdx)
___
$code.=<<___;
mov \$1,%eax
.Lno_key:
ret
.size poly1305_init,.-poly1305_init
.type poly1305_blocks,\@function,4
.align 32
poly1305_blocks:
.Lblocks:
sub \$16,$len # too short?
jc .Lno_data
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lblocks_body:
mov $len,%r15 # reassign $len
mov 24($ctx),$r0 # load r
mov 32($ctx),$s1
mov 0($ctx),$h0 # load hash value
mov 8($ctx),$h1
mov 16($ctx),$h2
mov $s1,$r1
shr \$2,$s1
mov $r1,%rax
add $r1,$s1 # s1 = r1 + (r1 >> 2)
jmp .Loop
.align 32
.Loop:
add 0($inp),$h0 # accumulate input
adc 8($inp),$h1
lea 16($inp),$inp
adc $padbit,$h2
___
&poly1305_iteration();
$code.=<<___;
mov $r1,%rax
sub \$16,%r15 # len-=16
jnc .Loop
mov $h0,0($ctx) # store hash value
mov $h1,8($ctx)
mov $h2,16($ctx)
mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%r12
mov 32(%rsp),%rbp
mov 40(%rsp),%rbx
lea 48(%rsp),%rsp
.Lno_data:
.Lblocks_epilogue:
ret
.size poly1305_blocks,.-poly1305_blocks
.type poly1305_emit,\@function,3
.align 32
poly1305_emit:
.Lemit:
mov 0($ctx),%r8 # load hash value
mov 8($ctx),%r9
mov 16($ctx),%r10
mov %r8,%rax
add \$5,%r8 # compare to modulus
mov %r9,%rcx
adc \$0,%r9
adc \$0,%r10
shr \$2,%r10 # did 130-bit value overfow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
add 0($nonce),%rax # accumulate nonce
adc 8($nonce),%rcx
mov %rax,0($mac) # write result
mov %rcx,8($mac)
ret
.size poly1305_emit,.-poly1305_emit
___
if ($avx) {
########################################################################
# Layout of opaque area is following.
#
# unsigned __int32 h[5]; # current hash value base 2^26
# unsigned __int32 is_base2_26;
# unsigned __int64 r[2]; # key value base 2^64
# unsigned __int64 pad;
# struct { unsigned __int32 r^2, r^1, r^4, r^3; } r[9];
#
# where r^n are base 2^26 digits of degrees of multiplier key. There are
# 5 digits, but last four are interleaved with multiples of 5, totalling
# in 9 elements: r0, r1, 5*r1, r2, 5*r2, r3, 5*r3, r4, 5*r4.
my ($H0,$H1,$H2,$H3,$H4, $T0,$T1,$T2,$T3,$T4, $D0,$D1,$D2,$D3,$D4, $MASK) =
map("%xmm$_",(0..15));
$code.=<<___;
.type __poly1305_block,\@abi-omnipotent
.align 32
__poly1305_block:
___
&poly1305_iteration();
$code.=<<___;
ret
.size __poly1305_block,.-__poly1305_block
.type __poly1305_init_avx,\@abi-omnipotent
.align 32
__poly1305_init_avx:
mov $r0,$h0
mov $r1,$h1
xor $h2,$h2
lea 48+64($ctx),$ctx # size optimization
mov $r1,%rax
call __poly1305_block # r^2
mov \$0x3ffffff,%eax # save interleaved r^2 and r base 2^26
mov \$0x3ffffff,%edx
mov $h0,$d1
and $h0#d,%eax
mov $r0,$d2
and $r0#d,%edx
mov %eax,`16*0+0-64`($ctx)
shr \$26,$d1
mov %edx,`16*0+4-64`($ctx)
shr \$26,$d2
mov \$0x3ffffff,%eax
mov \$0x3ffffff,%edx
and $d1#d,%eax
and $d2#d,%edx
mov %eax,`16*1+0-64`($ctx)
lea (%rax,%rax,4),%eax # *5
mov %edx,`16*1+4-64`($ctx)
lea (%rdx,%rdx,4),%edx # *5
mov %eax,`16*2+0-64`($ctx)
shr \$26,$d1
mov %edx,`16*2+4-64`($ctx)
shr \$26,$d2
mov $h1,%rax
mov $r1,%rdx
shl \$12,%rax
shl \$12,%rdx
or $d1,%rax
or $d2,%rdx
and \$0x3ffffff,%eax
and \$0x3ffffff,%edx
mov %eax,`16*3+0-64`($ctx)
lea (%rax,%rax,4),%eax # *5
mov %edx,`16*3+4-64`($ctx)
lea (%rdx,%rdx,4),%edx # *5
mov %eax,`16*4+0-64`($ctx)
mov $h1,$d1
mov %edx,`16*4+4-64`($ctx)
mov $r1,$d2
mov \$0x3ffffff,%eax
mov \$0x3ffffff,%edx
shr \$14,$d1
shr \$14,$d2
and $d1#d,%eax
and $d2#d,%edx
mov %eax,`16*5+0-64`($ctx)
lea (%rax,%rax,4),%eax # *5
mov %edx,`16*5+4-64`($ctx)
lea (%rdx,%rdx,4),%edx # *5
mov %eax,`16*6+0-64`($ctx)
shr \$26,$d1
mov %edx,`16*6+4-64`($ctx)
shr \$26,$d2
mov $h2,%rax
shl \$24,%rax
or %rax,$d1
mov $d1#d,`16*7+0-64`($ctx)
lea ($d1,$d1,4),$d1 # *5
mov $d2#d,`16*7+4-64`($ctx)
lea ($d2,$d2,4),$d2 # *5
mov $d1#d,`16*8+0-64`($ctx)
mov $d2#d,`16*8+4-64`($ctx)
mov $r1,%rax
call __poly1305_block # r^3
mov \$0x3ffffff,%eax # save r^3 base 2^26
mov $h0,$d1
and $h0#d,%eax
shr \$26,$d1
mov %eax,`16*0+12-64`($ctx)
mov \$0x3ffffff,%edx
and $d1#d,%edx
mov %edx,`16*1+12-64`($ctx)
lea (%rdx,%rdx,4),%edx # *5
shr \$26,$d1
mov %edx,`16*2+12-64`($ctx)
mov $h1,%rax
shl \$12,%rax
or $d1,%rax
and \$0x3ffffff,%eax
mov %eax,`16*3+12-64`($ctx)
lea (%rax,%rax,4),%eax # *5
mov $h1,$d1
mov %eax,`16*4+12-64`($ctx)
mov \$0x3ffffff,%edx
shr \$14,$d1
and $d1#d,%edx
mov %edx,`16*5+12-64`($ctx)
lea (%rdx,%rdx,4),%edx # *5
shr \$26,$d1
mov %edx,`16*6+12-64`($ctx)
mov $h2,%rax
shl \$24,%rax
or %rax,$d1
mov $d1#d,`16*7+12-64`($ctx)
lea ($d1,$d1,4),$d1 # *5
mov $d1#d,`16*8+12-64`($ctx)
mov $r1,%rax
call __poly1305_block # r^4
mov \$0x3ffffff,%eax # save r^4 base 2^26
mov $h0,$d1
and $h0#d,%eax
shr \$26,$d1
mov %eax,`16*0+8-64`($ctx)
mov \$0x3ffffff,%edx
and $d1#d,%edx
mov %edx,`16*1+8-64`($ctx)
lea (%rdx,%rdx,4),%edx # *5
shr \$26,$d1
mov %edx,`16*2+8-64`($ctx)
mov $h1,%rax
shl \$12,%rax
or $d1,%rax
and \$0x3ffffff,%eax
mov %eax,`16*3+8-64`($ctx)
lea (%rax,%rax,4),%eax # *5
mov $h1,$d1
mov %eax,`16*4+8-64`($ctx)
mov \$0x3ffffff,%edx
shr \$14,$d1
and $d1#d,%edx
mov %edx,`16*5+8-64`($ctx)
lea (%rdx,%rdx,4),%edx # *5
shr \$26,$d1
mov %edx,`16*6+8-64`($ctx)
mov $h2,%rax
shl \$24,%rax
or %rax,$d1
mov $d1#d,`16*7+8-64`($ctx)
lea ($d1,$d1,4),$d1 # *5
mov $d1#d,`16*8+8-64`($ctx)
lea -48-64($ctx),$ctx # size [de-]optimization
ret
.size __poly1305_init_avx,.-__poly1305_init_avx
.type poly1305_blocks_avx,\@function,4
.align 32
poly1305_blocks_avx:
mov 20($ctx),%r8d # is_base2_26
cmp \$128,$len
jae .Lblocks_avx
test %r8d,%r8d
jz .Lblocks
.Lblocks_avx:
and \$-16,$len
jz .Lno_data_avx
vzeroupper
test %r8d,%r8d
jz .Lbase2_64_avx
test \$31,$len
jz .Leven_avx
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lblocks_avx_body:
mov $len,%r15 # reassign $len
mov 0($ctx),$d1 # load hash value
mov 8($ctx),$d2
mov 16($ctx),$h2#d
mov 24($ctx),$r0 # load r
mov 32($ctx),$s1
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
and \$-1<<31,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
and \$-1<<31,$d2
shr \$6,$d1
shl \$52,$r1
add $d1,$h0
shr \$12,$h1
shr \$18,$d2
add $r1,$h0
adc $d2,$h1
mov $h2,$d1
shl \$40,$d1
shr \$24,$h2
add $d1,$h1
adc \$0,$h2 # can be partially reduced...
mov \$-4,$d2 # ... so reduce
mov $h2,$d1
and $h2,$d2
shr \$2,$d1
and \$3,$h2
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
mov $s1,$r1
mov $s1,%rax
shr \$2,$s1
add $r1,$s1 # s1 = r1 + (r1 >> 2)
add 0($inp),$h0 # accumulate input
adc 8($inp),$h1
lea 16($inp),$inp
adc $padbit,$h2
call __poly1305_block
test $padbit,$padbit # if $padbit is zero,
jz .Lstore_base2_64_avx # store hash in base 2^64 format
################################# base 2^64 -> base 2^26
mov $h0,%rax
mov $h0,%rdx
shr \$52,$h0
mov $h1,$r0
mov $h1,$r1
shr \$26,%rdx
and \$0x3ffffff,%rax # h[0]
shl \$12,$r0
and \$0x3ffffff,%rdx # h[1]
shr \$14,$h1
or $r0,$h0
shl \$24,$h2
and \$0x3ffffff,$h0 # h[2]
shr \$40,$r1
and \$0x3ffffff,$h1 # h[3]
or $r1,$h2 # h[4]
sub \$16,%r15
jz .Lstore_base2_26_avx
vmovd %rax#d,$H0
vmovd %rdx#d,$H1
vmovd $h0#d,$H2
vmovd $h1#d,$H3
vmovd $h2#d,$H4
jmp .Lproceed_avx
.align 32
.Lstore_base2_64_avx:
mov $h0,0($ctx)
mov $h1,8($ctx)
mov $h2,16($ctx) # note that is_base2_26 is zeroed
jmp .Ldone_avx
.align 16
.Lstore_base2_26_avx:
mov %rax#d,0($ctx) # store hash value base 2^26
mov %rdx#d,4($ctx)
mov $h0#d,8($ctx)
mov $h1#d,12($ctx)
mov $h2#d,16($ctx)
.align 16
.Ldone_avx:
mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%r12
mov 32(%rsp),%rbp
mov 40(%rsp),%rbx
lea 48(%rsp),%rsp
.Lno_data_avx:
.Lblocks_avx_epilogue:
ret
.align 32
.Lbase2_64_avx:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lbase2_64_avx_body:
mov $len,%r15 # reassign $len
mov 24($ctx),$r0 # load r
mov 32($ctx),$s1
mov 0($ctx),$h0 # load hash value
mov 8($ctx),$h1
mov 16($ctx),$h2#d
mov $s1,$r1
mov $s1,%rax
shr \$2,$s1
add $r1,$s1 # s1 = r1 + (r1 >> 2)
test \$31,$len
jz .Linit_avx
add 0($inp),$h0 # accumulate input
adc 8($inp),$h1
lea 16($inp),$inp
adc $padbit,$h2
sub \$16,%r15
call __poly1305_block
.Linit_avx:
################################# base 2^64 -> base 2^26
mov $h0,%rax
mov $h0,%rdx
shr \$52,$h0
mov $h1,$d1
mov $h1,$d2
shr \$26,%rdx
and \$0x3ffffff,%rax # h[0]
shl \$12,$d1
and \$0x3ffffff,%rdx # h[1]
shr \$14,$h1
or $d1,$h0
shl \$24,$h2
and \$0x3ffffff,$h0 # h[2]
shr \$40,$d2
and \$0x3ffffff,$h1 # h[3]
or $d2,$h2 # h[4]
vmovd %rax#d,$H0
vmovd %rdx#d,$H1
vmovd $h0#d,$H2
vmovd $h1#d,$H3
vmovd $h2#d,$H4
movl \$1,20($ctx) # set is_base2_26
call __poly1305_init_avx
.Lproceed_avx:
mov %r15,$len
mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%r12
mov 32(%rsp),%rbp
mov 40(%rsp),%rbx
lea 48(%rsp),%rax
lea 48(%rsp),%rsp
.Lbase2_64_avx_epilogue:
jmp .Ldo_avx
.align 32
.Leven_avx:
vmovd 4*0($ctx),$H0 # load hash value
vmovd 4*1($ctx),$H1
vmovd 4*2($ctx),$H2
vmovd 4*3($ctx),$H3
vmovd 4*4($ctx),$H4
.Ldo_avx:
___
$code.=<<___ if (!$win64);
lea -0x58(%rsp),%r11
sub \$0x178,%rsp
___
$code.=<<___ if ($win64);
lea -0xf8(%rsp),%r11
sub \$0x218,%rsp
vmovdqa %xmm6,0x50(%r11)
vmovdqa %xmm7,0x60(%r11)
vmovdqa %xmm8,0x70(%r11)
vmovdqa %xmm9,0x80(%r11)
vmovdqa %xmm10,0x90(%r11)
vmovdqa %xmm11,0xa0(%r11)
vmovdqa %xmm12,0xb0(%r11)
vmovdqa %xmm13,0xc0(%r11)
vmovdqa %xmm14,0xd0(%r11)
vmovdqa %xmm15,0xe0(%r11)
.Ldo_avx_body:
___
$code.=<<___;
sub \$64,$len
lea -32($inp),%rax
cmovc %rax,$inp
vmovdqu `16*3`($ctx),$D4 # preload r0^2
lea `16*3+64`($ctx),$ctx # size optimization
lea .Lconst(%rip),%rcx
################################################################
# load input
vmovdqu 16*2($inp),$T0
vmovdqu 16*3($inp),$T1
vmovdqa 64(%rcx),$MASK # .Lmask26
vpsrldq \$6,$T0,$T2 # splat input
vpsrldq \$6,$T1,$T3
vpunpckhqdq $T1,$T0,$T4 # 4
vpunpcklqdq $T1,$T0,$T0 # 0:1
vpunpcklqdq $T3,$T2,$T3 # 2:3
vpsrlq \$40,$T4,$T4 # 4
vpsrlq \$26,$T0,$T1
vpand $MASK,$T0,$T0 # 0
vpsrlq \$4,$T3,$T2
vpand $MASK,$T1,$T1 # 1
vpsrlq \$30,$T3,$T3
vpand $MASK,$T2,$T2 # 2
vpand $MASK,$T3,$T3 # 3
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
jbe .Lskip_loop_avx
# expand and copy pre-calculated table to stack
vmovdqu `16*1-64`($ctx),$D1
vmovdqu `16*2-64`($ctx),$D2
vpshufd \$0xEE,$D4,$D3 # 34xx -> 3434
vpshufd \$0x44,$D4,$D0 # xx12 -> 1212
vmovdqa $D3,-0x90(%r11)
vmovdqa $D0,0x00(%rsp)
vpshufd \$0xEE,$D1,$D4
vmovdqu `16*3-64`($ctx),$D0
vpshufd \$0x44,$D1,$D1
vmovdqa $D4,-0x80(%r11)
vmovdqa $D1,0x10(%rsp)
vpshufd \$0xEE,$D2,$D3
vmovdqu `16*4-64`($ctx),$D1
vpshufd \$0x44,$D2,$D2
vmovdqa $D3,-0x70(%r11)
vmovdqa $D2,0x20(%rsp)
vpshufd \$0xEE,$D0,$D4
vmovdqu `16*5-64`($ctx),$D2
vpshufd \$0x44,$D0,$D0
vmovdqa $D4,-0x60(%r11)
vmovdqa $D0,0x30(%rsp)
vpshufd \$0xEE,$D1,$D3
vmovdqu `16*6-64`($ctx),$D0
vpshufd \$0x44,$D1,$D1
vmovdqa $D3,-0x50(%r11)
vmovdqa $D1,0x40(%rsp)
vpshufd \$0xEE,$D2,$D4
vmovdqu `16*7-64`($ctx),$D1
vpshufd \$0x44,$D2,$D2
vmovdqa $D4,-0x40(%r11)
vmovdqa $D2,0x50(%rsp)
vpshufd \$0xEE,$D0,$D3
vmovdqu `16*8-64`($ctx),$D2
vpshufd \$0x44,$D0,$D0
vmovdqa $D3,-0x30(%r11)
vmovdqa $D0,0x60(%rsp)
vpshufd \$0xEE,$D1,$D4
vpshufd \$0x44,$D1,$D1
vmovdqa $D4,-0x20(%r11)
vmovdqa $D1,0x70(%rsp)
vpshufd \$0xEE,$D2,$D3
vmovdqa 0x00(%rsp),$D4 # preload r0^2
vpshufd \$0x44,$D2,$D2
vmovdqa $D3,-0x10(%r11)
vmovdqa $D2,0x80(%rsp)
jmp .Loop_avx
.align 32
.Loop_avx:
################################################################
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^3+inp[7]*r
# \___________________/
# ((inp[0]*r^4+inp[2]*r^2+inp[4])*r^4+inp[6]*r^2+inp[8])*r^2
# ((inp[1]*r^4+inp[3]*r^2+inp[5])*r^4+inp[7]*r^2+inp[9])*r
# \___________________/ \____________________/
#
# Note that we start with inp[2:3]*r^2. This is because it
# doesn't depend on reduction in previous iteration.
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# though note that $Tx and $Hx are "reversed" in this section,
# and $D4 is preloaded with r0^2...
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
vmovdqa $H2,0x20(%r11) # offload hash
vpmuludq $T2,$D4,$D2 # d3 = h2*r0
vmovdqa 0x10(%rsp),$H2 # r1^2
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
vmovdqa $H0,0x00(%r11) #
vpmuludq 0x20(%rsp),$T4,$H0 # h4*s1
vmovdqa $H1,0x10(%r11) #
vpmuludq $T3,$H2,$H1 # h3*r1
vpaddq $H0,$D0,$D0 # d0 += h4*s1
vpaddq $H1,$D4,$D4 # d4 += h3*r1
vmovdqa $H3,0x30(%r11) #
vpmuludq $T2,$H2,$H0 # h2*r1
vpmuludq $T1,$H2,$H1 # h1*r1
vpaddq $H0,$D3,$D3 # d3 += h2*r1
vmovdqa 0x30(%rsp),$H3 # r2^2
vpaddq $H1,$D2,$D2 # d2 += h1*r1
vmovdqa $H4,0x40(%r11) #
vpmuludq $T0,$H2,$H2 # h0*r1
vpmuludq $T2,$H3,$H0 # h2*r2
vpaddq $H2,$D1,$D1 # d1 += h0*r1
vmovdqa 0x40(%rsp),$H4 # s2^2
vpaddq $H0,$D4,$D4 # d4 += h2*r2
vpmuludq $T1,$H3,$H1 # h1*r2
vpmuludq $T0,$H3,$H3 # h0*r2
vpaddq $H1,$D3,$D3 # d3 += h1*r2
vmovdqa 0x50(%rsp),$H2 # r3^2
vpaddq $H3,$D2,$D2 # d2 += h0*r2
vpmuludq $T4,$H4,$H0 # h4*s2
vpmuludq $T3,$H4,$H4 # h3*s2
vpaddq $H0,$D1,$D1 # d1 += h4*s2
vmovdqa 0x60(%rsp),$H3 # s3^2
vpaddq $H4,$D0,$D0 # d0 += h3*s2
vmovdqa 0x80(%rsp),$H4 # s4^2
vpmuludq $T1,$H2,$H1 # h1*r3
vpmuludq $T0,$H2,$H2 # h0*r3
vpaddq $H1,$D4,$D4 # d4 += h1*r3
vpaddq $H2,$D3,$D3 # d3 += h0*r3
vpmuludq $T4,$H3,$H0 # h4*s3
vpmuludq $T3,$H3,$H1 # h3*s3
vpaddq $H0,$D2,$D2 # d2 += h4*s3
vmovdqu 16*0($inp),$H0 # load input
vpaddq $H1,$D1,$D1 # d1 += h3*s3
vpmuludq $T2,$H3,$H3 # h2*s3
vpmuludq $T2,$H4,$T2 # h2*s4
vpaddq $H3,$D0,$D0 # d0 += h2*s3
vmovdqu 16*1($inp),$H1 #
vpaddq $T2,$D1,$D1 # d1 += h2*s4
vpmuludq $T3,$H4,$T3 # h3*s4
vpmuludq $T4,$H4,$T4 # h4*s4
vpsrldq \$6,$H0,$H2 # splat input
vpaddq $T3,$D2,$D2 # d2 += h3*s4
vpaddq $T4,$D3,$D3 # d3 += h4*s4
vpsrldq \$6,$H1,$H3 #
vpmuludq 0x70(%rsp),$T0,$T4 # h0*r4
vpmuludq $T1,$H4,$T0 # h1*s4
vpunpckhqdq $H1,$H0,$H4 # 4
vpaddq $T4,$D4,$D4 # d4 += h0*r4
vmovdqa -0x90(%r11),$T4 # r0^4
vpaddq $T0,$D0,$D0 # d0 += h1*s4
vpunpcklqdq $H1,$H0,$H0 # 0:1
vpunpcklqdq $H3,$H2,$H3 # 2:3
#vpsrlq \$40,$H4,$H4 # 4
vpsrldq \$`40/8`,$H4,$H4 # 4
vpsrlq \$26,$H0,$H1
vpand $MASK,$H0,$H0 # 0
vpsrlq \$4,$H3,$H2
vpand $MASK,$H1,$H1 # 1
vpand 0(%rcx),$H4,$H4 # .Lmask24
vpsrlq \$30,$H3,$H3
vpand $MASK,$H2,$H2 # 2
vpand $MASK,$H3,$H3 # 3
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
vpaddq 0x00(%r11),$H0,$H0 # add hash value
vpaddq 0x10(%r11),$H1,$H1
vpaddq 0x20(%r11),$H2,$H2
vpaddq 0x30(%r11),$H3,$H3
vpaddq 0x40(%r11),$H4,$H4
lea 16*2($inp),%rax
lea 16*4($inp),$inp
sub \$64,$len
cmovc %rax,$inp
################################################################
# Now we accumulate (inp[0:1]+hash)*r^4
################################################################
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
vpmuludq $H0,$T4,$T0 # h0*r0
vpmuludq $H1,$T4,$T1 # h1*r0
vpaddq $T0,$D0,$D0
vpaddq $T1,$D1,$D1
vmovdqa -0x80(%r11),$T2 # r1^4
vpmuludq $H2,$T4,$T0 # h2*r0
vpmuludq $H3,$T4,$T1 # h3*r0
vpaddq $T0,$D2,$D2
vpaddq $T1,$D3,$D3
vpmuludq $H4,$T4,$T4 # h4*r0
vpmuludq -0x70(%r11),$H4,$T0 # h4*s1
vpaddq $T4,$D4,$D4
vpaddq $T0,$D0,$D0 # d0 += h4*s1
vpmuludq $H2,$T2,$T1 # h2*r1
vpmuludq $H3,$T2,$T0 # h3*r1
vpaddq $T1,$D3,$D3 # d3 += h2*r1
vmovdqa -0x60(%r11),$T3 # r2^4
vpaddq $T0,$D4,$D4 # d4 += h3*r1
vpmuludq $H1,$T2,$T1 # h1*r1
vpmuludq $H0,$T2,$T2 # h0*r1
vpaddq $T1,$D2,$D2 # d2 += h1*r1
vpaddq $T2,$D1,$D1 # d1 += h0*r1
vmovdqa -0x50(%r11),$T4 # s2^4
vpmuludq $H2,$T3,$T0 # h2*r2
vpmuludq $H1,$T3,$T1 # h1*r2
vpaddq $T0,$D4,$D4 # d4 += h2*r2
vpaddq $T1,$D3,$D3 # d3 += h1*r2
vmovdqa -0x40(%r11),$T2 # r3^4
vpmuludq $H0,$T3,$T3 # h0*r2
vpmuludq $H4,$T4,$T0 # h4*s2
vpaddq $T3,$D2,$D2 # d2 += h0*r2
vpaddq $T0,$D1,$D1 # d1 += h4*s2
vmovdqa -0x30(%r11),$T3 # s3^4
vpmuludq $H3,$T4,$T4 # h3*s2
vpmuludq $H1,$T2,$T1 # h1*r3
vpaddq $T4,$D0,$D0 # d0 += h3*s2
vmovdqa -0x10(%r11),$T4 # s4^4
vpaddq $T1,$D4,$D4 # d4 += h1*r3
vpmuludq $H0,$T2,$T2 # h0*r3
vpmuludq $H4,$T3,$T0 # h4*s3
vpaddq $T2,$D3,$D3 # d3 += h0*r3
vpaddq $T0,$D2,$D2 # d2 += h4*s3
vmovdqu 16*2($inp),$T0 # load input
vpmuludq $H3,$T3,$T2 # h3*s3
vpmuludq $H2,$T3,$T3 # h2*s3
vpaddq $T2,$D1,$D1 # d1 += h3*s3
vmovdqu 16*3($inp),$T1 #
vpaddq $T3,$D0,$D0 # d0 += h2*s3
vpmuludq $H2,$T4,$H2 # h2*s4
vpmuludq $H3,$T4,$H3 # h3*s4
vpsrldq \$6,$T0,$T2 # splat input
vpaddq $H2,$D1,$D1 # d1 += h2*s4
vpmuludq $H4,$T4,$H4 # h4*s4
vpsrldq \$6,$T1,$T3 #
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*s4
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*s4
vpmuludq -0x20(%r11),$H0,$H4 # h0*r4
vpmuludq $H1,$T4,$H0
vpunpckhqdq $T1,$T0,$T4 # 4
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
vpunpcklqdq $T1,$T0,$T0 # 0:1
vpunpcklqdq $T3,$T2,$T3 # 2:3
#vpsrlq \$40,$T4,$T4 # 4
vpsrldq \$`40/8`,$T4,$T4 # 4
vpsrlq \$26,$T0,$T1
vmovdqa 0x00(%rsp),$D4 # preload r0^2
vpand $MASK,$T0,$T0 # 0
vpsrlq \$4,$T3,$T2
vpand $MASK,$T1,$T1 # 1
vpand 0(%rcx),$T4,$T4 # .Lmask24
vpsrlq \$30,$T3,$T3
vpand $MASK,$T2,$T2 # 2
vpand $MASK,$T3,$T3 # 3
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
################################################################
# lazy reduction as discussed in "NEON crypto" by D.J. Bernstein
# and P. Schwabe
vpsrlq \$26,$H3,$D3
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
vpsrlq \$26,$H0,$D0
vpand $MASK,$H0,$H0
vpaddq $D0,$D1,$H1 # h0 -> h1
vpsrlq \$26,$H4,$D0
vpand $MASK,$H4,$H4
vpsrlq \$26,$H1,$D1
vpand $MASK,$H1,$H1
vpaddq $D1,$H2,$H2 # h1 -> h2
vpaddq $D0,$H0,$H0
vpsllq \$2,$D0,$D0
vpaddq $D0,$H0,$H0 # h4 -> h0
vpsrlq \$26,$H2,$D2
vpand $MASK,$H2,$H2
vpaddq $D2,$H3,$H3 # h2 -> h3
vpsrlq \$26,$H0,$D0
vpand $MASK,$H0,$H0
vpaddq $D0,$H1,$H1 # h0 -> h1
vpsrlq \$26,$H3,$D3
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
ja .Loop_avx
.Lskip_loop_avx:
################################################################
# multiply (inp[0:1]+hash) or inp[2:3] by r^2:r^1
vpshufd \$0x10,$D4,$D4 # r0^n, xx12 -> x1x2
add \$32,$len
jnz .Long_tail_avx
vpaddq $H2,$T2,$T2
vpaddq $H0,$T0,$T0
vpaddq $H1,$T1,$T1
vpaddq $H3,$T3,$T3
vpaddq $H4,$T4,$T4
.Long_tail_avx:
vmovdqa $H2,0x20(%r11)
vmovdqa $H0,0x00(%r11)
vmovdqa $H1,0x10(%r11)
vmovdqa $H3,0x30(%r11)
vmovdqa $H4,0x40(%r11)
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
vpmuludq $T2,$D4,$D2 # d2 = h2*r0
vpmuludq $T0,$D4,$D0 # d0 = h0*r0
vpshufd \$0x10,`16*1-64`($ctx),$H2 # r1^n
vpmuludq $T1,$D4,$D1 # d1 = h1*r0
vpmuludq $T3,$D4,$D3 # d3 = h3*r0
vpmuludq $T4,$D4,$D4 # d4 = h4*r0
vpmuludq $T3,$H2,$H0 # h3*r1
vpaddq $H0,$D4,$D4 # d4 += h3*r1
vpshufd \$0x10,`16*2-64`($ctx),$H3 # s1^n
vpmuludq $T2,$H2,$H1 # h2*r1
vpaddq $H1,$D3,$D3 # d3 += h2*r1
vpshufd \$0x10,`16*3-64`($ctx),$H4 # r2^n
vpmuludq $T1,$H2,$H0 # h1*r1
vpaddq $H0,$D2,$D2 # d2 += h1*r1
vpmuludq $T0,$H2,$H2 # h0*r1
vpaddq $H2,$D1,$D1 # d1 += h0*r1
vpmuludq $T4,$H3,$H3 # h4*s1
vpaddq $H3,$D0,$D0 # d0 += h4*s1
vpshufd \$0x10,`16*4-64`($ctx),$H2 # s2^n
vpmuludq $T2,$H4,$H1 # h2*r2
vpaddq $H1,$D4,$D4 # d4 += h2*r2
vpmuludq $T1,$H4,$H0 # h1*r2
vpaddq $H0,$D3,$D3 # d3 += h1*r2
vpshufd \$0x10,`16*5-64`($ctx),$H3 # r3^n
vpmuludq $T0,$H4,$H4 # h0*r2
vpaddq $H4,$D2,$D2 # d2 += h0*r2
vpmuludq $T4,$H2,$H1 # h4*s2
vpaddq $H1,$D1,$D1 # d1 += h4*s2
vpshufd \$0x10,`16*6-64`($ctx),$H4 # s3^n
vpmuludq $T3,$H2,$H2 # h3*s2
vpaddq $H2,$D0,$D0 # d0 += h3*s2
vpmuludq $T1,$H3,$H0 # h1*r3
vpaddq $H0,$D4,$D4 # d4 += h1*r3
vpmuludq $T0,$H3,$H3 # h0*r3
vpaddq $H3,$D3,$D3 # d3 += h0*r3
vpshufd \$0x10,`16*7-64`($ctx),$H2 # r4^n
vpmuludq $T4,$H4,$H1 # h4*s3
vpaddq $H1,$D2,$D2 # d2 += h4*s3
vpshufd \$0x10,`16*8-64`($ctx),$H3 # s4^n
vpmuludq $T3,$H4,$H0 # h3*s3
vpaddq $H0,$D1,$D1 # d1 += h3*s3
vpmuludq $T2,$H4,$H4 # h2*s3
vpaddq $H4,$D0,$D0 # d0 += h2*s3
vpmuludq $T0,$H2,$H2 # h0*r4
vpaddq $H2,$D4,$D4 # h4 = d4 + h0*r4
vpmuludq $T4,$H3,$H1 # h4*s4
vpaddq $H1,$D3,$D3 # h3 = d3 + h4*s4
vpmuludq $T3,$H3,$H0 # h3*s4
vpaddq $H0,$D2,$D2 # h2 = d2 + h3*s4
vpmuludq $T2,$H3,$H1 # h2*s4
vpaddq $H1,$D1,$D1 # h1 = d1 + h2*s4
vpmuludq $T1,$H3,$H3 # h1*s4
vpaddq $H3,$D0,$D0 # h0 = d0 + h1*s4
jz .Lshort_tail_avx
vmovdqu 16*0($inp),$H0 # load input
vmovdqu 16*1($inp),$H1
vpsrldq \$6,$H0,$H2 # splat input
vpsrldq \$6,$H1,$H3
vpunpckhqdq $H1,$H0,$H4 # 4
vpunpcklqdq $H1,$H0,$H0 # 0:1
vpunpcklqdq $H3,$H2,$H3 # 2:3
vpsrlq \$40,$H4,$H4 # 4
vpsrlq \$26,$H0,$H1
vpand $MASK,$H0,$H0 # 0
vpsrlq \$4,$H3,$H2
vpand $MASK,$H1,$H1 # 1
vpsrlq \$30,$H3,$H3
vpand $MASK,$H2,$H2 # 2
vpand $MASK,$H3,$H3 # 3
vpor 32(%rcx),$H4,$H4 # padbit, yes, always
vpshufd \$0x32,`16*0-64`($ctx),$T4 # r0^n, 34xx -> x3x4
vpaddq 0x00(%r11),$H0,$H0
vpaddq 0x10(%r11),$H1,$H1
vpaddq 0x20(%r11),$H2,$H2
vpaddq 0x30(%r11),$H3,$H3
vpaddq 0x40(%r11),$H4,$H4
################################################################
# multiply (inp[0:1]+hash) by r^4:r^3 and accumulate
vpmuludq $H0,$T4,$T0 # h0*r0
vpaddq $T0,$D0,$D0 # d0 += h0*r0
vpmuludq $H1,$T4,$T1 # h1*r0
vpaddq $T1,$D1,$D1 # d1 += h1*r0
vpmuludq $H2,$T4,$T0 # h2*r0
vpaddq $T0,$D2,$D2 # d2 += h2*r0
vpshufd \$0x32,`16*1-64`($ctx),$T2 # r1^n
vpmuludq $H3,$T4,$T1 # h3*r0
vpaddq $T1,$D3,$D3 # d3 += h3*r0
vpmuludq $H4,$T4,$T4 # h4*r0
vpaddq $T4,$D4,$D4 # d4 += h4*r0
vpmuludq $H3,$T2,$T0 # h3*r1
vpaddq $T0,$D4,$D4 # d4 += h3*r1
vpshufd \$0x32,`16*2-64`($ctx),$T3 # s1
vpmuludq $H2,$T2,$T1 # h2*r1
vpaddq $T1,$D3,$D3 # d3 += h2*r1
vpshufd \$0x32,`16*3-64`($ctx),$T4 # r2
vpmuludq $H1,$T2,$T0 # h1*r1
vpaddq $T0,$D2,$D2 # d2 += h1*r1
vpmuludq $H0,$T2,$T2 # h0*r1
vpaddq $T2,$D1,$D1 # d1 += h0*r1
vpmuludq $H4,$T3,$T3 # h4*s1
vpaddq $T3,$D0,$D0 # d0 += h4*s1
vpshufd \$0x32,`16*4-64`($ctx),$T2 # s2
vpmuludq $H2,$T4,$T1 # h2*r2
vpaddq $T1,$D4,$D4 # d4 += h2*r2
vpmuludq $H1,$T4,$T0 # h1*r2
vpaddq $T0,$D3,$D3 # d3 += h1*r2
vpshufd \$0x32,`16*5-64`($ctx),$T3 # r3
vpmuludq $H0,$T4,$T4 # h0*r2
vpaddq $T4,$D2,$D2 # d2 += h0*r2
vpmuludq $H4,$T2,$T1 # h4*s2
vpaddq $T1,$D1,$D1 # d1 += h4*s2
vpshufd \$0x32,`16*6-64`($ctx),$T4 # s3
vpmuludq $H3,$T2,$T2 # h3*s2
vpaddq $T2,$D0,$D0 # d0 += h3*s2
vpmuludq $H1,$T3,$T0 # h1*r3
vpaddq $T0,$D4,$D4 # d4 += h1*r3
vpmuludq $H0,$T3,$T3 # h0*r3
vpaddq $T3,$D3,$D3 # d3 += h0*r3
vpshufd \$0x32,`16*7-64`($ctx),$T2 # r4
vpmuludq $H4,$T4,$T1 # h4*s3
vpaddq $T1,$D2,$D2 # d2 += h4*s3
vpshufd \$0x32,`16*8-64`($ctx),$T3 # s4
vpmuludq $H3,$T4,$T0 # h3*s3
vpaddq $T0,$D1,$D1 # d1 += h3*s3
vpmuludq $H2,$T4,$T4 # h2*s3
vpaddq $T4,$D0,$D0 # d0 += h2*s3
vpmuludq $H0,$T2,$T2 # h0*r4
vpaddq $T2,$D4,$D4 # d4 += h0*r4
vpmuludq $H4,$T3,$T1 # h4*s4
vpaddq $T1,$D3,$D3 # d3 += h4*s4
vpmuludq $H3,$T3,$T0 # h3*s4
vpaddq $T0,$D2,$D2 # d2 += h3*s4
vpmuludq $H2,$T3,$T1 # h2*s4
vpaddq $T1,$D1,$D1 # d1 += h2*s4
vpmuludq $H1,$T3,$T3 # h1*s4
vpaddq $T3,$D0,$D0 # d0 += h1*s4
.Lshort_tail_avx:
################################################################
# horizontal addition
vpsrldq \$8,$D4,$T4
vpsrldq \$8,$D3,$T3
vpsrldq \$8,$D1,$T1
vpsrldq \$8,$D0,$T0
vpsrldq \$8,$D2,$T2
vpaddq $T3,$D3,$D3
vpaddq $T4,$D4,$D4
vpaddq $T0,$D0,$D0
vpaddq $T1,$D1,$D1
vpaddq $T2,$D2,$D2
################################################################
# lazy reduction
vpsrlq \$26,$D3,$H3
vpand $MASK,$D3,$D3
vpaddq $H3,$D4,$D4 # h3 -> h4
vpsrlq \$26,$D0,$H0
vpand $MASK,$D0,$D0
vpaddq $H0,$D1,$D1 # h0 -> h1
vpsrlq \$26,$D4,$H4
vpand $MASK,$D4,$D4
vpsrlq \$26,$D1,$H1
vpand $MASK,$D1,$D1
vpaddq $H1,$D2,$D2 # h1 -> h2
vpaddq $H4,$D0,$D0
vpsllq \$2,$H4,$H4
vpaddq $H4,$D0,$D0 # h4 -> h0
vpsrlq \$26,$D2,$H2
vpand $MASK,$D2,$D2
vpaddq $H2,$D3,$D3 # h2 -> h3
vpsrlq \$26,$D0,$H0
vpand $MASK,$D0,$D0
vpaddq $H0,$D1,$D1 # h0 -> h1
vpsrlq \$26,$D3,$H3
vpand $MASK,$D3,$D3
vpaddq $H3,$D4,$D4 # h3 -> h4
vmovd $D0,`4*0-48-64`($ctx) # save partially reduced
vmovd $D1,`4*1-48-64`($ctx)
vmovd $D2,`4*2-48-64`($ctx)
vmovd $D3,`4*3-48-64`($ctx)
vmovd $D4,`4*4-48-64`($ctx)
___
$code.=<<___ if ($win64);
vmovdqa 0x50(%r11),%xmm6
vmovdqa 0x60(%r11),%xmm7
vmovdqa 0x70(%r11),%xmm8
vmovdqa 0x80(%r11),%xmm9
vmovdqa 0x90(%r11),%xmm10
vmovdqa 0xa0(%r11),%xmm11
vmovdqa 0xb0(%r11),%xmm12
vmovdqa 0xc0(%r11),%xmm13
vmovdqa 0xd0(%r11),%xmm14
vmovdqa 0xe0(%r11),%xmm15
lea 0xf8(%r11),%rsp
.Ldo_avx_epilogue:
___
$code.=<<___ if (!$win64);
lea 0x58(%r11),%rsp
___
$code.=<<___;
vzeroupper
ret
.size poly1305_blocks_avx,.-poly1305_blocks_avx
.type poly1305_emit_avx,\@function,3
.align 32
poly1305_emit_avx:
cmpl \$0,20($ctx) # is_base2_26?
je .Lemit
mov 0($ctx),%eax # load hash value base 2^26
mov 4($ctx),%ecx
mov 8($ctx),%r8d
mov 12($ctx),%r11d
mov 16($ctx),%r10d
shl \$26,%rcx # base 2^26 -> base 2^64
mov %r8,%r9
shl \$52,%r8
add %rcx,%rax
shr \$12,%r9
add %rax,%r8 # h0
adc \$0,%r9
shl \$14,%r11
mov %r10,%rax
shr \$24,%r10
add %r11,%r9
shl \$40,%rax
add %rax,%r9 # h1
adc \$0,%r10 # h2
mov %r10,%rax # could be partially reduced, so reduce
mov %r10,%rcx
and \$3,%r10
shr \$2,%rax
and \$-4,%rcx
add %rcx,%rax
add %rax,%r8
adc \$0,%r9
mov %r8,%rax
add \$5,%r8 # compare to modulus
mov %r9,%rcx
adc \$0,%r9
adc \$0,%r10
shr \$2,%r10 # did 130-bit value overfow?
cmovnz %r8,%rax
cmovnz %r9,%rcx
add 0($nonce),%rax # accumulate nonce
adc 8($nonce),%rcx
mov %rax,0($mac) # write result
mov %rcx,8($mac)
ret
.size poly1305_emit_avx,.-poly1305_emit_avx
___
if ($avx>1) {
my ($H0,$H1,$H2,$H3,$H4, $MASK, $T4,$T0,$T1,$T2,$T3, $D0,$D1,$D2,$D3,$D4) =
map("%ymm$_",(0..15));
my $S4=$MASK;
$code.=<<___;
.type poly1305_blocks_avx2,\@function,4
.align 32
poly1305_blocks_avx2:
mov 20($ctx),%r8d # is_base2_26
cmp \$128,$len
jae .Lblocks_avx2
test %r8d,%r8d
jz .Lblocks
.Lblocks_avx2:
and \$-16,$len
jz .Lno_data_avx2
vzeroupper
test %r8d,%r8d
jz .Lbase2_64_avx2
test \$63,$len
jz .Leven_avx2
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lblocks_avx2_body:
mov $len,%r15 # reassign $len
mov 0($ctx),$d1 # load hash value
mov 8($ctx),$d2
mov 16($ctx),$h2#d
mov 24($ctx),$r0 # load r
mov 32($ctx),$s1
################################# base 2^26 -> base 2^64
mov $d1#d,$h0#d
and \$-1<<31,$d1
mov $d2,$r1 # borrow $r1
mov $d2#d,$h1#d
and \$-1<<31,$d2
shr \$6,$d1
shl \$52,$r1
add $d1,$h0
shr \$12,$h1
shr \$18,$d2
add $r1,$h0
adc $d2,$h1
mov $h2,$d1
shl \$40,$d1
shr \$24,$h2
add $d1,$h1
adc \$0,$h2 # can be partially reduced...
mov \$-4,$d2 # ... so reduce
mov $h2,$d1
and $h2,$d2
shr \$2,$d1
and \$3,$h2
add $d2,$d1 # =*5
add $d1,$h0
adc \$0,$h1
mov $s1,$r1
mov $s1,%rax
shr \$2,$s1
add $r1,$s1 # s1 = r1 + (r1 >> 2)
.Lbase2_26_pre_avx2:
add 0($inp),$h0 # accumulate input
adc 8($inp),$h1
lea 16($inp),$inp
adc $padbit,$h2
sub \$16,%r15
call __poly1305_block
mov $r1,%rax
test \$63,%r15
jnz .Lbase2_26_pre_avx2
test $padbit,$padbit # if $padbit is zero,
jz .Lstore_base2_64_avx2 # store hash in base 2^64 format
################################# base 2^64 -> base 2^26
mov $h0,%rax
mov $h0,%rdx
shr \$52,$h0
mov $h1,$r0
mov $h1,$r1
shr \$26,%rdx
and \$0x3ffffff,%rax # h[0]
shl \$12,$r0
and \$0x3ffffff,%rdx # h[1]
shr \$14,$h1
or $r0,$h0
shl \$24,$h2
and \$0x3ffffff,$h0 # h[2]
shr \$40,$r1
and \$0x3ffffff,$h1 # h[3]
or $r1,$h2 # h[4]
test %r15,%r15
jz .Lstore_base2_26_avx2
vmovd %rax#d,%x#$H0
vmovd %rdx#d,%x#$H1
vmovd $h0#d,%x#$H2
vmovd $h1#d,%x#$H3
vmovd $h2#d,%x#$H4
jmp .Lproceed_avx2
.align 32
.Lstore_base2_64_avx2:
mov $h0,0($ctx)
mov $h1,8($ctx)
mov $h2,16($ctx) # note that is_base2_26 is zeroed
jmp .Ldone_avx2
.align 16
.Lstore_base2_26_avx2:
mov %rax#d,0($ctx) # store hash value base 2^26
mov %rdx#d,4($ctx)
mov $h0#d,8($ctx)
mov $h1#d,12($ctx)
mov $h2#d,16($ctx)
.align 16
.Ldone_avx2:
mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%r12
mov 32(%rsp),%rbp
mov 40(%rsp),%rbx
lea 48(%rsp),%rsp
.Lno_data_avx2:
.Lblocks_avx2_epilogue:
ret
.align 32
.Lbase2_64_avx2:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
.Lbase2_64_avx2_body:
mov $len,%r15 # reassign $len
mov 24($ctx),$r0 # load r
mov 32($ctx),$s1
mov 0($ctx),$h0 # load hash value
mov 8($ctx),$h1
mov 16($ctx),$h2#d
mov $s1,$r1
mov $s1,%rax
shr \$2,$s1
add $r1,$s1 # s1 = r1 + (r1 >> 2)
test \$63,$len
jz .Linit_avx2
.Lbase2_64_pre_avx2:
add 0($inp),$h0 # accumulate input
adc 8($inp),$h1
lea 16($inp),$inp
adc $padbit,$h2
sub \$16,%r15
call __poly1305_block
mov $r1,%rax
test \$63,%r15
jnz .Lbase2_64_pre_avx2
.Linit_avx2:
################################# base 2^64 -> base 2^26
mov $h0,%rax
mov $h0,%rdx
shr \$52,$h0
mov $h1,$d1
mov $h1,$d2
shr \$26,%rdx
and \$0x3ffffff,%rax # h[0]
shl \$12,$d1
and \$0x3ffffff,%rdx # h[1]
shr \$14,$h1
or $d1,$h0
shl \$24,$h2
and \$0x3ffffff,$h0 # h[2]
shr \$40,$d2
and \$0x3ffffff,$h1 # h[3]
or $d2,$h2 # h[4]
vmovd %rax#d,%x#$H0
vmovd %rdx#d,%x#$H1
vmovd $h0#d,%x#$H2
vmovd $h1#d,%x#$H3
vmovd $h2#d,%x#$H4
movl \$1,20($ctx) # set is_base2_26
call __poly1305_init_avx
.Lproceed_avx2:
mov %r15,$len
mov 0(%rsp),%r15
mov 8(%rsp),%r14
mov 16(%rsp),%r13
mov 24(%rsp),%r12
mov 32(%rsp),%rbp
mov 40(%rsp),%rbx
lea 48(%rsp),%rax
lea 48(%rsp),%rsp
.Lbase2_64_avx2_epilogue:
jmp .Ldo_avx2
.align 32
.Leven_avx2:
vmovd 4*0($ctx),%x#$H0 # load hash value base 2^26
vmovd 4*1($ctx),%x#$H1
vmovd 4*2($ctx),%x#$H2
vmovd 4*3($ctx),%x#$H3
vmovd 4*4($ctx),%x#$H4
.Ldo_avx2:
___
$code.=<<___ if (!$win64);
lea -8(%rsp),%r11
sub \$0x128,%rsp
___
$code.=<<___ if ($win64);
lea -0xf8(%rsp),%r11
sub \$0x1c8,%rsp
vmovdqa %xmm6,0x50(%r11)
vmovdqa %xmm7,0x60(%r11)
vmovdqa %xmm8,0x70(%r11)
vmovdqa %xmm9,0x80(%r11)
vmovdqa %xmm10,0x90(%r11)
vmovdqa %xmm11,0xa0(%r11)
vmovdqa %xmm12,0xb0(%r11)
vmovdqa %xmm13,0xc0(%r11)
vmovdqa %xmm14,0xd0(%r11)
vmovdqa %xmm15,0xe0(%r11)
.Ldo_avx2_body:
___
$code.=<<___;
lea 48+64($ctx),$ctx # size optimization
lea .Lconst(%rip),%rcx
# expand and copy pre-calculated table to stack
vmovdqu `16*0-64`($ctx),%x#$T2
and \$-512,%rsp
vmovdqu `16*1-64`($ctx),%x#$T3
vmovdqu `16*2-64`($ctx),%x#$T4
vmovdqu `16*3-64`($ctx),%x#$D0
vmovdqu `16*4-64`($ctx),%x#$D1
vmovdqu `16*5-64`($ctx),%x#$D2
vmovdqu `16*6-64`($ctx),%x#$D3
vpermq \$0x15,$T2,$T2 # 00003412 -> 12343434
vmovdqu `16*7-64`($ctx),%x#$D4
vpermq \$0x15,$T3,$T3
vpshufd \$0xc8,$T2,$T2 # 12343434 -> 14243444
vmovdqu `16*8-64`($ctx),%x#$MASK
vpermq \$0x15,$T4,$T4
vpshufd \$0xc8,$T3,$T3
vmovdqa $T2,0x00(%rsp)
vpermq \$0x15,$D0,$D0
vpshufd \$0xc8,$T4,$T4
vmovdqa $T3,0x20(%rsp)
vpermq \$0x15,$D1,$D1
vpshufd \$0xc8,$D0,$D0
vmovdqa $T4,0x40(%rsp)
vpermq \$0x15,$D2,$D2
vpshufd \$0xc8,$D1,$D1
vmovdqa $D0,0x60(%rsp)
vpermq \$0x15,$D3,$D3
vpshufd \$0xc8,$D2,$D2
vmovdqa $D1,0x80(%rsp)
vpermq \$0x15,$D4,$D4
vpshufd \$0xc8,$D3,$D3
vmovdqa $D2,0xa0(%rsp)
vpermq \$0x15,$MASK,$MASK
vpshufd \$0xc8,$D4,$D4
vmovdqa $D3,0xc0(%rsp)
vpshufd \$0xc8,$MASK,$MASK
vmovdqa $D4,0xe0(%rsp)
vmovdqa $MASK,0x100(%rsp)
vmovdqa 64(%rcx),$MASK # .Lmask26
################################################################
# load input
vmovdqu 16*0($inp),%x#$T0
vmovdqu 16*1($inp),%x#$T1
vinserti128 \$1,16*2($inp),$T0,$T0
vinserti128 \$1,16*3($inp),$T1,$T1
lea 16*4($inp),$inp
vpsrldq \$6,$T0,$T2 # splat input
vpsrldq \$6,$T1,$T3
vpunpckhqdq $T1,$T0,$T4 # 4
vpunpcklqdq $T3,$T2,$T2 # 2:3
vpunpcklqdq $T1,$T0,$T0 # 0:1
vpsrlq \$30,$T2,$T3
vpsrlq \$4,$T2,$T2
vpsrlq \$26,$T0,$T1
vpsrlq \$40,$T4,$T4 # 4
vpand $MASK,$T2,$T2 # 2
vpand $MASK,$T0,$T0 # 0
vpand $MASK,$T1,$T1 # 1
vpand $MASK,$T3,$T3 # 3
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
lea 0x90(%rsp),%rax # size optimization
vpaddq $H2,$T2,$H2 # accumulate input
sub \$64,$len
jz .Ltail_avx2
jmp .Loop_avx2
.align 32
.Loop_avx2:
################################################################
# ((inp[0]*r^4+r[4])*r^4+r[8])*r^4
# ((inp[1]*r^4+r[5])*r^4+r[9])*r^3
# ((inp[2]*r^4+r[6])*r^4+r[10])*r^2
# ((inp[3]*r^4+r[7])*r^4+r[11])*r^1
# \________/\________/
################################################################
#vpaddq $H2,$T2,$H2 # accumulate input
vpaddq $H0,$T0,$H0
vmovdqa `32*0`(%rsp),$T0 # r0^4
vpaddq $H1,$T1,$H1
vmovdqa `32*1`(%rsp),$T1 # r1^4
vpaddq $H3,$T3,$H3
vmovdqa `32*3`(%rsp),$T2 # r2^4
vpaddq $H4,$T4,$H4
vmovdqa `32*6-0x90`(%rax),$T3 # s3^4
vmovdqa `32*8-0x90`(%rax),$S4 # s4^4
# d4 = h4*r0 + h3*r1 + h2*r2 + h1*r3 + h0*r4
# d3 = h3*r0 + h2*r1 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3 + h2*5*r4
# d0 = h0*r0 + h4*5*r1 + h3*5*r2 + h2*5*r3 + h1*5*r4
#
# however, as h2 is "chronologically" first one available pull
# corresponding operations up, so it's
#
# d4 = h2*r2 + h4*r0 + h3*r1 + h1*r3 + h0*r4
# d3 = h2*r1 + h3*r0 + h1*r2 + h0*r3 + h4*5*r4
# d2 = h2*r0 + h1*r1 + h0*r2 + h4*5*r3 + h3*5*r4
# d1 = h2*5*r4 + h1*r0 + h0*r1 + h4*5*r2 + h3*5*r3
# d0 = h2*5*r3 + h0*r0 + h4*5*r1 + h3*5*r2 + h1*5*r4
vpmuludq $H2,$T0,$D2 # d2 = h2*r0
vpmuludq $H2,$T1,$D3 # d3 = h2*r1
vpmuludq $H2,$T2,$D4 # d4 = h2*r2
vpmuludq $H2,$T3,$D0 # d0 = h2*s3
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
vpmuludq $H0,$T1,$T4 # h0*r1
vpmuludq $H1,$T1,$H2 # h1*r1, borrow $H2 as temp
vpaddq $T4,$D1,$D1 # d1 += h0*r1
vpaddq $H2,$D2,$D2 # d2 += h1*r1
vpmuludq $H3,$T1,$T4 # h3*r1
vpmuludq `32*2`(%rsp),$H4,$H2 # h4*s1
vpaddq $T4,$D4,$D4 # d4 += h3*r1
vpaddq $H2,$D0,$D0 # d0 += h4*s1
vmovdqa `32*4-0x90`(%rax),$T1 # s2
vpmuludq $H0,$T0,$T4 # h0*r0
vpmuludq $H1,$T0,$H2 # h1*r0
vpaddq $T4,$D0,$D0 # d0 += h0*r0
vpaddq $H2,$D1,$D1 # d1 += h1*r0
vpmuludq $H3,$T0,$T4 # h3*r0
vpmuludq $H4,$T0,$H2 # h4*r0
vmovdqu 16*0($inp),%x#$T0 # load input
vpaddq $T4,$D3,$D3 # d3 += h3*r0
vpaddq $H2,$D4,$D4 # d4 += h4*r0
vinserti128 \$1,16*2($inp),$T0,$T0
vpmuludq $H3,$T1,$T4 # h3*s2
vpmuludq $H4,$T1,$H2 # h4*s2
vmovdqu 16*1($inp),%x#$T1
vpaddq $T4,$D0,$D0 # d0 += h3*s2
vpaddq $H2,$D1,$D1 # d1 += h4*s2
vmovdqa `32*5-0x90`(%rax),$H2 # r3
vpmuludq $H1,$T2,$T4 # h1*r2
vpmuludq $H0,$T2,$T2 # h0*r2
vpaddq $T4,$D3,$D3 # d3 += h1*r2
vpaddq $T2,$D2,$D2 # d2 += h0*r2
vinserti128 \$1,16*3($inp),$T1,$T1
lea 16*4($inp),$inp
vpmuludq $H1,$H2,$T4 # h1*r3
vpmuludq $H0,$H2,$H2 # h0*r3
vpsrldq \$6,$T0,$T2 # splat input
vpaddq $T4,$D4,$D4 # d4 += h1*r3
vpaddq $H2,$D3,$D3 # d3 += h0*r3
vpmuludq $H3,$T3,$T4 # h3*s3
vpmuludq $H4,$T3,$H2 # h4*s3
vpsrldq \$6,$T1,$T3
vpaddq $T4,$D1,$D1 # d1 += h3*s3
vpaddq $H2,$D2,$D2 # d2 += h4*s3
vpunpckhqdq $T1,$T0,$T4 # 4
vpmuludq $H3,$S4,$H3 # h3*s4
vpmuludq $H4,$S4,$H4 # h4*s4
vpunpcklqdq $T1,$T0,$T0 # 0:1
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
vpunpcklqdq $T3,$T2,$T3 # 2:3
vpmuludq `32*7-0x90`(%rax),$H0,$H4 # h0*r4
vpmuludq $H1,$S4,$H0 # h1*s4
vmovdqa 64(%rcx),$MASK # .Lmask26
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
################################################################
# lazy reduction (interleaved with tail of input splat)
vpsrlq \$26,$H3,$D3
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
vpsrlq \$26,$H0,$D0
vpand $MASK,$H0,$H0
vpaddq $D0,$D1,$H1 # h0 -> h1
vpsrlq \$26,$H4,$D4
vpand $MASK,$H4,$H4
vpsrlq \$4,$T3,$T2
vpsrlq \$26,$H1,$D1
vpand $MASK,$H1,$H1
vpaddq $D1,$H2,$H2 # h1 -> h2
vpaddq $D4,$H0,$H0
vpsllq \$2,$D4,$D4
vpaddq $D4,$H0,$H0 # h4 -> h0
vpand $MASK,$T2,$T2 # 2
vpsrlq \$26,$T0,$T1
vpsrlq \$26,$H2,$D2
vpand $MASK,$H2,$H2
vpaddq $D2,$H3,$H3 # h2 -> h3
vpaddq $T2,$H2,$H2 # modulo-scheduled
vpsrlq \$30,$T3,$T3
vpsrlq \$26,$H0,$D0
vpand $MASK,$H0,$H0
vpaddq $D0,$H1,$H1 # h0 -> h1
vpsrlq \$40,$T4,$T4 # 4
vpsrlq \$26,$H3,$D3
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
vpand $MASK,$T0,$T0 # 0
vpand $MASK,$T1,$T1 # 1
vpand $MASK,$T3,$T3 # 3
vpor 32(%rcx),$T4,$T4 # padbit, yes, always
sub \$64,$len
jnz .Loop_avx2
.byte 0x66,0x90
.Ltail_avx2:
################################################################
# while above multiplications were by r^4 in all lanes, in last
# iteration we multiply least significant lane by r^4 and most
# significant one by r, so copy of above except that references
# to the precomputed table are displaced by 4...
#vpaddq $H2,$T2,$H2 # accumulate input
vpaddq $H0,$T0,$H0
vmovdqu `32*0+4`(%rsp),$T0 # r0^4
vpaddq $H1,$T1,$H1
vmovdqu `32*1+4`(%rsp),$T1 # r1^4
vpaddq $H3,$T3,$H3
vmovdqu `32*3+4`(%rsp),$T2 # r2^4
vpaddq $H4,$T4,$H4
vmovdqu `32*6+4-0x90`(%rax),$T3 # s3^4
vmovdqu `32*8+4-0x90`(%rax),$S4 # s4^4
vpmuludq $H2,$T0,$D2 # d2 = h2*r0
vpmuludq $H2,$T1,$D3 # d3 = h2*r1
vpmuludq $H2,$T2,$D4 # d4 = h2*r2
vpmuludq $H2,$T3,$D0 # d0 = h2*s3
vpmuludq $H2,$S4,$D1 # d1 = h2*s4
vpmuludq $H0,$T1,$T4 # h0*r1
vpmuludq $H1,$T1,$H2 # h1*r1
vpaddq $T4,$D1,$D1 # d1 += h0*r1
vpaddq $H2,$D2,$D2 # d2 += h1*r1
vpmuludq $H3,$T1,$T4 # h3*r1
vpmuludq `32*2+4`(%rsp),$H4,$H2 # h4*s1
vpaddq $T4,$D4,$D4 # d4 += h3*r1
vpaddq $H2,$D0,$D0 # d0 += h4*s1
vpmuludq $H0,$T0,$T4 # h0*r0
vpmuludq $H1,$T0,$H2 # h1*r0
vpaddq $T4,$D0,$D0 # d0 += h0*r0
vmovdqu `32*4+4-0x90`(%rax),$T1 # s2
vpaddq $H2,$D1,$D1 # d1 += h1*r0
vpmuludq $H3,$T0,$T4 # h3*r0
vpmuludq $H4,$T0,$H2 # h4*r0
vpaddq $T4,$D3,$D3 # d3 += h3*r0
vpaddq $H2,$D4,$D4 # d4 += h4*r0
vpmuludq $H3,$T1,$T4 # h3*s2
vpmuludq $H4,$T1,$H2 # h4*s2
vpaddq $T4,$D0,$D0 # d0 += h3*s2
vpaddq $H2,$D1,$D1 # d1 += h4*s2
vmovdqu `32*5+4-0x90`(%rax),$H2 # r3
vpmuludq $H1,$T2,$T4 # h1*r2
vpmuludq $H0,$T2,$T2 # h0*r2
vpaddq $T4,$D3,$D3 # d3 += h1*r2
vpaddq $T2,$D2,$D2 # d2 += h0*r2
vpmuludq $H1,$H2,$T4 # h1*r3
vpmuludq $H0,$H2,$H2 # h0*r3
vpaddq $T4,$D4,$D4 # d4 += h1*r3
vpaddq $H2,$D3,$D3 # d3 += h0*r3
vpmuludq $H3,$T3,$T4 # h3*s3
vpmuludq $H4,$T3,$H2 # h4*s3
vpaddq $T4,$D1,$D1 # d1 += h3*s3
vpaddq $H2,$D2,$D2 # d2 += h4*s3
vpmuludq $H3,$S4,$H3 # h3*s4
vpmuludq $H4,$S4,$H4 # h4*s4
vpaddq $H3,$D2,$H2 # h2 = d2 + h3*r4
vpaddq $H4,$D3,$H3 # h3 = d3 + h4*r4
vpmuludq `32*7+4-0x90`(%rax),$H0,$H4 # h0*r4
vpmuludq $H1,$S4,$H0 # h1*s4
vmovdqa 64(%rcx),$MASK # .Lmask26
vpaddq $H4,$D4,$H4 # h4 = d4 + h0*r4
vpaddq $H0,$D0,$H0 # h0 = d0 + h1*s4
################################################################
# horizontal addition
vpsrldq \$8,$D1,$T1
vpsrldq \$8,$H2,$T2
vpsrldq \$8,$H3,$T3
vpsrldq \$8,$H4,$T4
vpsrldq \$8,$H0,$T0
vpaddq $T1,$D1,$D1
vpaddq $T2,$H2,$H2
vpaddq $T3,$H3,$H3
vpaddq $T4,$H4,$H4
vpaddq $T0,$H0,$H0
vpermq \$0x2,$H3,$T3
vpermq \$0x2,$H4,$T4
vpermq \$0x2,$H0,$T0
vpermq \$0x2,$D1,$T1
vpermq \$0x2,$H2,$T2
vpaddq $T3,$H3,$H3
vpaddq $T4,$H4,$H4
vpaddq $T0,$H0,$H0
vpaddq $T1,$D1,$D1
vpaddq $T2,$H2,$H2
################################################################
# lazy reduction
vpsrlq \$26,$H3,$D3
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
vpsrlq \$26,$H0,$D0
vpand $MASK,$H0,$H0
vpaddq $D0,$D1,$H1 # h0 -> h1
vpsrlq \$26,$H4,$D4
vpand $MASK,$H4,$H4
vpsrlq \$26,$H1,$D1
vpand $MASK,$H1,$H1
vpaddq $D1,$H2,$H2 # h1 -> h2
vpaddq $D4,$H0,$H0
vpsllq \$2,$D4,$D4
vpaddq $D4,$H0,$H0 # h4 -> h0
vpsrlq \$26,$H2,$D2
vpand $MASK,$H2,$H2
vpaddq $D2,$H3,$H3 # h2 -> h3
vpsrlq \$26,$H0,$D0
vpand $MASK,$H0,$H0
vpaddq $D0,$H1,$H1 # h0 -> h1
vpsrlq \$26,$H3,$D3
vpand $MASK,$H3,$H3
vpaddq $D3,$H4,$H4 # h3 -> h4
vmovd %x#$H0,`4*0-48-64`($ctx)# save partially reduced
vmovd %x#$H1,`4*1-48-64`($ctx)
vmovd %x#$H2,`4*2-48-64`($ctx)
vmovd %x#$H3,`4*3-48-64`($ctx)
vmovd %x#$H4,`4*4-48-64`($ctx)
___
$code.=<<___ if ($win64);
vmovdqa 0x50(%r11),%xmm6
vmovdqa 0x60(%r11),%xmm7
vmovdqa 0x70(%r11),%xmm8
vmovdqa 0x80(%r11),%xmm9
vmovdqa 0x90(%r11),%xmm10
vmovdqa 0xa0(%r11),%xmm11
vmovdqa 0xb0(%r11),%xmm12
vmovdqa 0xc0(%r11),%xmm13
vmovdqa 0xd0(%r11),%xmm14
vmovdqa 0xe0(%r11),%xmm15
lea 0xf8(%r11),%rsp
.Ldo_avx2_epilogue:
___
$code.=<<___ if (!$win64);
lea 8(%r11),%rsp
___
$code.=<<___;
vzeroupper
ret
.size poly1305_blocks_avx2,.-poly1305_blocks_avx2
___
}
$code.=<<___;
.align 64
.Lconst:
.Lmask24:
.long 0x0ffffff,0,0x0ffffff,0,0x0ffffff,0,0x0ffffff,0
.L129:
.long 1<<24,0,1<<24,0,1<<24,0,1<<24,0
.Lmask26:
.long 0x3ffffff,0,0x3ffffff,0,0x3ffffff,0,0x3ffffff,0
.Lfive:
.long 5,0,5,0,5,0,5,0
___
}
$code.=<<___;
.asciz "Poly1305 for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 16
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<.Lprologue
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=.Lepilogue
jae .Lcommon_seh_tail
lea 48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R14
jmp .Lcommon_seh_tail
.size se_handler,.-se_handler
.type avx_handler,\@abi-omnipotent
.align 16
avx_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 208($context),%rax # pull context->R11
lea 0x50(%rax),%rsi
lea 0xf8(%rax),%rax
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx
.long 0xa548f3fc # cld; rep movsq
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size avx_handler,.-avx_handler
.section .pdata
.align 4
.rva .LSEH_begin_poly1305_init
.rva .LSEH_end_poly1305_init
.rva .LSEH_info_poly1305_init
.rva .LSEH_begin_poly1305_blocks
.rva .LSEH_end_poly1305_blocks
.rva .LSEH_info_poly1305_blocks
.rva .LSEH_begin_poly1305_emit
.rva .LSEH_end_poly1305_emit
.rva .LSEH_info_poly1305_emit
___
$code.=<<___ if ($avx);
.rva .LSEH_begin_poly1305_blocks_avx
.rva .Lbase2_64_avx
.rva .LSEH_info_poly1305_blocks_avx_1
.rva .Lbase2_64_avx
.rva .Leven_avx
.rva .LSEH_info_poly1305_blocks_avx_2
.rva .Leven_avx
.rva .LSEH_end_poly1305_blocks_avx
.rva .LSEH_info_poly1305_blocks_avx_3
.rva .LSEH_begin_poly1305_emit_avx
.rva .LSEH_end_poly1305_emit_avx
.rva .LSEH_info_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.rva .LSEH_begin_poly1305_blocks_avx2
.rva .Lbase2_64_avx2
.rva .LSEH_info_poly1305_blocks_avx2_1
.rva .Lbase2_64_avx2
.rva .Leven_avx2
.rva .LSEH_info_poly1305_blocks_avx2_2
.rva .Leven_avx2
.rva .LSEH_end_poly1305_blocks_avx2
.rva .LSEH_info_poly1305_blocks_avx2_3
___
$code.=<<___;
.section .xdata
.align 8
.LSEH_info_poly1305_init:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_init,.LSEH_begin_poly1305_init
.LSEH_info_poly1305_blocks:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_body,.Lblocks_epilogue
.LSEH_info_poly1305_emit:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit,.LSEH_begin_poly1305_emit
___
$code.=<<___ if ($avx);
.LSEH_info_poly1305_blocks_avx_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx_body,.Lblocks_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx_body,.Lbase2_64_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx_body,.Ldo_avx_epilogue # HandlerData[]
.LSEH_info_poly1305_emit_avx:
.byte 9,0,0,0
.rva se_handler
.rva .LSEH_begin_poly1305_emit_avx,.LSEH_begin_poly1305_emit_avx
___
$code.=<<___ if ($avx>1);
.LSEH_info_poly1305_blocks_avx2_1:
.byte 9,0,0,0
.rva se_handler
.rva .Lblocks_avx2_body,.Lblocks_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_2:
.byte 9,0,0,0
.rva se_handler
.rva .Lbase2_64_avx2_body,.Lbase2_64_avx2_epilogue # HandlerData[]
.LSEH_info_poly1305_blocks_avx2_3:
.byte 9,0,0,0
.rva avx_handler
.rva .Ldo_avx2_body,.Ldo_avx2_epilogue # HandlerData[]
___
}
foreach (split('\n',$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
s/%r([a-z]+)#d/%e$1/g;
s/%r([0-9]+)#d/%r$1d/g;
s/%x#%y/%x/g;
print $_,"\n";
}
close STDOUT;