644539191b

Before, attempting to build the code using Yasm as the assembler would result
in warnings like this:

    warning: no non-local label before `.chacha20_consts'

Precede the local labels with a non-local label to suppress these warnings.
It isn't clear why these labels are defined as local labels instead of regular
labels. Making them non-local may be a better idea. For reference, Yasm's
interpretation of local labels is described succinctly at
https://www.tortall.net/projects/yasm/manual/html/nasm-local-label.html.

Change-Id: Ifc92de7fd7379859fe33f1137ab20b6ec282cd0b
Reviewed-on: https://boringssl-review.googlesource.com/13384
Reviewed-by: Adam Langley <agl@google.com>
#!/usr/bin/env perl

# Copyright (c) 2015, CloudFlare Ltd.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

##############################################################################
#                                                                            #
# Author: Vlad Krasnov                                                       #
#                                                                            #
##############################################################################

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$avx = 2;

$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

chacha20_poly1305_constants:

.align 64
.chacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.rol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.rol16:
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.avx2_init:
.long 0,0,0,0
.sse_inc:
.long 1,0,0,0
.avx2_inc:
.long 2,0,0,0,2,0,0,0
.clamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
.align 16
.and_masks:
.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
___
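
# The ".clamp" constant above implements the Poly1305 key clamp: the low 16
# bytes clear the r bits that the multiplication code must not see, and the
# all-ones upper 16 bytes let the 32-byte AVX2 vpand clamp r while passing s
# through unchanged.  A hypothetical scalar model (not used by the generator),
# taking the two little-endian quadwords of r:
sub ref_clamp_r {
    my ($r_lo, $r_hi) = @_;
    return ($r_lo & 0x0FFFFFFC0FFFFFFF, $r_hi & 0x0FFFFFFC0FFFFFFC);
}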

my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
my $r_store="0*16(%rbp)";
my $s_store="1*16(%rbp)";
my $len_store="2*16(%rbp)";
my $state1_store="3*16(%rbp)";
my $state2_store="4*16(%rbp)";
my $tmp_store="5*16(%rbp)";
my $ctr0_store="6*16(%rbp)";
my $ctr1_store="7*16(%rbp)";
my $ctr2_store="8*16(%rbp)";
my $ctr3_store="9*16(%rbp)";

sub chacha_qr {
my ($a,$b,$c,$d,$t,$dir)=@_;
$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
$code.="paddd $b, $a
pxor $a, $d
pshufb .rol16(%rip), $d
paddd $d, $c
pxor $c, $b
movdqa $b, $t
pslld \$12, $t
psrld \$20, $b
pxor $t, $b
paddd $b, $a
pxor $a, $d
pshufb .rol8(%rip), $d
paddd $d, $c
pxor $c, $b
movdqa $b, $t
pslld \$7, $t
psrld \$25, $b
pxor $t, $b\n";
$code.="palignr \$4, $b, $b
palignr \$8, $c, $c
palignr \$12, $d, $d\n" if ($dir =~ /left/);
$code.="palignr \$12, $b, $b
palignr \$8, $c, $c
palignr \$4, $d, $d\n" if ($dir =~ /right/);
$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
}
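
# For reference, a plain-Perl model of one ChaCha20 quarter round (a
# hypothetical helper, never called by the generator): chacha_qr above emits
# the same sequence vectorised over four state words, with the 16- and 8-bit
# rotates done via pshufb and the 12/7-bit rotates via pslld/psrld pairs.
sub ref_chacha_quarter_round {
    my ($a, $b, $c, $d) = @_;
    my $rotl = sub { my ($x, $n) = @_; (($x << $n) | ($x >> (32 - $n))) & 0xffffffff };
    $a = ($a + $b) & 0xffffffff; $d ^= $a; $d = $rotl->($d, 16);
    $c = ($c + $d) & 0xffffffff; $b ^= $c; $b = $rotl->($b, 12);
    $a = ($a + $b) & 0xffffffff; $d ^= $a; $d = $rotl->($d, 8);
    $c = ($c + $d) & 0xffffffff; $b ^= $c; $b = $rotl->($b, 7);
    return ($a, $b, $c, $d);
}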

sub poly_add {
my ($src)=@_;
$code.="add $src, $acc0
adc 8+$src, $acc1
adc \$1, $acc2\n";
}
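
# Reference model of poly_add (hypothetical, not used by the generator): a
# full 16-byte block is absorbed by adding it, together with the implicit
# 2^128 padding bit (the "adc \$1" into the third limb), to the accumulator.
use Math::BigInt;
sub ref_poly_add {
    my ($acc, $block) = @_;          # Math::BigInt accumulator, 128-bit block value
    return $acc + $block + Math::BigInt->new(1)->blsft(128);
}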

sub poly_stage1 {
$code.="mov 0+$r_store, %rax
mov %rax, $t2
mul $acc0
mov %rax, $t0
mov %rdx, $t1
mov 0+$r_store, %rax
mul $acc1
imulq $acc2, $t2
add %rax, $t1
adc %rdx, $t2\n";
}

sub poly_stage2 {
$code.="mov 8+$r_store, %rax
mov %rax, $t3
mul $acc0
add %rax, $t1
adc \$0, %rdx
mov %rdx, $acc0
mov 8+$r_store, %rax
mul $acc1
add %rax, $t2
adc \$0, %rdx\n";
}

sub poly_stage3 {
$code.="imulq $acc2, $t3
add $acc0, $t2
adc %rdx, $t3\n";
}

sub poly_reduce_stage {
$code.="mov $t0, $acc0
mov $t1, $acc1
mov $t2, $acc2
and \$3, $acc2
mov $t2, $t0
and \$-4, $t0
mov $t3, $t1
shrd \$2, $t3, $t2
shr \$2, $t3
add $t0, $acc0
adc $t1, $acc1
adc \$0, $acc2
add $t2, $acc0
adc $t3, $acc1
adc \$0, $acc2\n";
}

sub poly_mul {
&poly_stage1();
&poly_stage2();
&poly_stage3();
&poly_reduce_stage();
}
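
# Reference model of poly_mul (hypothetical, not used by the generator): the
# three stages compute the schoolbook product acc * r, and poly_reduce_stage
# folds everything at or above 2^130 back in using 2^130 = 5 (mod 2^130 - 5);
# the "and \$3 / and \$-4 / shrd \$2" dance is h = (h mod 2^130) + 5*floor(h/2^130).
use Math::BigInt;
sub ref_poly_mul {
    my ($acc, $r) = @_;                            # Math::BigInt values
    my $h      = $acc * $r;
    my $two130 = Math::BigInt->new(1)->blsft(130);
    my $carry  = $h->copy->brsft(130);             # floor(h / 2^130)
    return $h % $two130 + 5 * $carry;              # partially reduced, as in the asm
}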

sub prep_state {
my ($n)=@_;
$code.="movdqa .chacha20_consts(%rip), $A0
movdqa $state1_store, $B0
movdqa $state2_store, $C0\n";
$code.="movdqa $A0, $A1
movdqa $B0, $B1
movdqa $C0, $C1\n" if ($n ge 2);
$code.="movdqa $A0, $A2
movdqa $B0, $B2
movdqa $C0, $C2\n" if ($n ge 3);
$code.="movdqa $A0, $A3
movdqa $B0, $B3
movdqa $C0, $C3\n" if ($n ge 4);
$code.="movdqa $ctr0_store, $D0
paddd .sse_inc(%rip), $D0
movdqa $D0, $ctr0_store\n" if ($n eq 1);
$code.="movdqa $ctr0_store, $D1
paddd .sse_inc(%rip), $D1
movdqa $D1, $D0
paddd .sse_inc(%rip), $D0
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store\n" if ($n eq 2);
$code.="movdqa $ctr0_store, $D2
paddd .sse_inc(%rip), $D2
movdqa $D2, $D1
paddd .sse_inc(%rip), $D1
movdqa $D1, $D0
paddd .sse_inc(%rip), $D0
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store
movdqa $D2, $ctr2_store\n" if ($n eq 3);
$code.="movdqa $ctr0_store, $D3
paddd .sse_inc(%rip), $D3
movdqa $D3, $D2
paddd .sse_inc(%rip), $D2
movdqa $D2, $D1
paddd .sse_inc(%rip), $D1
movdqa $D1, $D0
paddd .sse_inc(%rip), $D0
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store
movdqa $D2, $ctr2_store
movdqa $D3, $ctr3_store\n" if ($n eq 4);
}
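
# For documentation only: the 16-word ChaCha20 state whose four rows
# prep_state loads into $A0/$B0/$C0/$D0 for each block.  A hypothetical helper;
# $key is the 8-word key and $ctr_nonce the 4-word counter||nonce from keyp.
sub ref_chacha_state {
    my ($key, $ctr_nonce, $i) = @_;
    return (0x61707865, 0x3320646e, 0x79622d32, 0x6b206574,   # "expand 32-byte k"
            @$key,                                            # rows 1 and 2
            ($ctr_nonce->[0] + $i) & 0xffffffff,              # per-block counter
            @{$ctr_nonce}[1 .. 3]);                           # 96-bit nonce
}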

sub finalize_state {
my ($n)=@_;
$code.="paddd .chacha20_consts(%rip), $A3
paddd $state1_store, $B3
paddd $state2_store, $C3
paddd $ctr3_store, $D3\n" if ($n eq 4);
$code.="paddd .chacha20_consts(%rip), $A2
paddd $state1_store, $B2
paddd $state2_store, $C2
paddd $ctr2_store, $D2\n" if ($n ge 3);
$code.="paddd .chacha20_consts(%rip), $A1
paddd $state1_store, $B1
paddd $state2_store, $C1
paddd $ctr1_store, $D1\n" if ($n ge 2);
$code.="paddd .chacha20_consts(%rip), $A0
paddd $state1_store, $B0
paddd $state2_store, $C0
paddd $ctr0_store, $D0\n";
}

sub xor_stream {
my ($A, $B, $C, $D, $offset)=@_;
$code.="movdqu 0*16 + $offset($inp), $A3
movdqu 1*16 + $offset($inp), $B3
movdqu 2*16 + $offset($inp), $C3
movdqu 3*16 + $offset($inp), $D3
pxor $A3, $A
pxor $B3, $B
pxor $C3, $C
pxor $D, $D3
movdqu $A, 0*16 + $offset($oup)
movdqu $B, 1*16 + $offset($oup)
movdqu $C, 2*16 + $offset($oup)
movdqu $D3, 3*16 + $offset($oup)\n";
}

sub xor_stream_using_temp {
my ($A, $B, $C, $D, $offset, $temp)=@_;
$code.="movdqa $temp, $tmp_store
movdqu 0*16 + $offset($inp), $temp
pxor $A, $temp
movdqu $temp, 0*16 + $offset($oup)
movdqu 1*16 + $offset($inp), $temp
pxor $B, $temp
movdqu $temp, 1*16 + $offset($oup)
movdqu 2*16 + $offset($inp), $temp
pxor $C, $temp
movdqu $temp, 2*16 + $offset($oup)
movdqu 3*16 + $offset($inp), $temp
pxor $D, $temp
movdqu $temp, 3*16 + $offset($oup)\n";
}

sub gen_chacha_round {
my ($rot1, $rot2, $shift)=@_;
my $round="";
$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
$round.="movdqa $rot2, $C0
paddd $B3, $A3
paddd $B2, $A2
paddd $B1, $A1
paddd $B0, $A0
pxor $A3, $D3
pxor $A2, $D2
pxor $A1, $D1
pxor $A0, $D0
pshufb $C0, $D3
pshufb $C0, $D2
pshufb $C0, $D1
pshufb $C0, $D0
movdqa $tmp_store, $C0
paddd $D3, $C3
paddd $D2, $C2
paddd $D1, $C1
paddd $D0, $C0
pxor $C3, $B3
pxor $C2, $B2
pxor $C1, $B1
pxor $C0, $B0
movdqa $C0, $tmp_store
movdqa $B3, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B3
pxor $C0, $B3
movdqa $B2, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B2
pxor $C0, $B2
movdqa $B1, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B1
pxor $C0, $B1
movdqa $B0, $C0
psrld \$$rot1, $C0
pslld \$32-$rot1, $B0
pxor $C0, $B0\n";
($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
$round.="movdqa $tmp_store, $C0
palignr \$$s1, $B3, $B3
palignr \$$s2, $C3, $C3
palignr \$$s3, $D3, $D3
palignr \$$s1, $B2, $B2
palignr \$$s2, $C2, $C2
palignr \$$s3, $D2, $D2
palignr \$$s1, $B1, $B1
palignr \$$s2, $C1, $C1
palignr \$$s3, $D1, $D1
palignr \$$s1, $B0, $B0
palignr \$$s2, $C0, $C0
palignr \$$s3, $D0, $D0\n"
if (($shift =~ /left/) || ($shift =~ /right/));
return $round;
};

$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
&gen_chacha_round(25, ".rol8(%rip)", "left") .
&gen_chacha_round(20, ".rol16(%rip)") .
&gen_chacha_round(25, ".rol8(%rip)", "right");

my @loop_body = split /\n/, $chacha_body;

sub emit_body {
my ($n)=@_;
for (my $i=0; $i < $n; $i++) {
$code=$code.shift(@loop_body)."\n";
};
}
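
# Note on the scheduling trick used below: gen_chacha_round produces the full
# 4-block ChaCha double round as a flat instruction list, and emit_body()
# lets the main loops splice fixed-size slices of it between the scalar
# Poly1305 stages, e.g. (as in open_sse_main_loop)
#     &emit_body(20);          # 20 vector instructions of the ChaCha round
#     &poly_stage1();          # first part of acc * r on the scalar ports
#     &emit_body(20);
#     &poly_stage2();          # and so on
# so the SIMD and scalar execution units run in parallel.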

{
################################################################################
# void poly_hash_ad_internal();
$code.="
.type poly_hash_ad_internal,\@function,2
.align 64
poly_hash_ad_internal:
.cfi_startproc
xor $acc0, $acc0
xor $acc1, $acc1
xor $acc2, $acc2
cmp \$13, $itr2
jne hash_ad_loop
poly_fast_tls_ad:
# Special treatment for the TLS case of 13 bytes
mov ($adp), $acc0
mov 5($adp), $acc1
shr \$24, $acc1
mov \$1, $acc2\n";
&poly_mul(); $code.="
ret
hash_ad_loop:
# Hash in 16 byte chunk
cmp \$16, $itr2
jb hash_ad_tail\n";
&poly_add("0($adp)");
&poly_mul(); $code.="
lea 1*16($adp), $adp
sub \$16, $itr2
jmp hash_ad_loop
hash_ad_tail:
cmp \$0, $itr2
je 1f
# Hash last < 16 byte tail
xor $t0, $t0
xor $t1, $t1
xor $t2, $t2
add $itr2, $adp
hash_ad_tail_loop:
shld \$8, $t0, $t1
shl \$8, $t0
movzxb -1($adp), $t2
xor $t2, $t0
dec $adp
dec $itr2
jne hash_ad_tail_loop

add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
# Finished AD
1:
ret
.cfi_endproc
.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
}
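
# Reference for the 13-byte TLS fast path above (hypothetical helper): the AD
# is zero-padded to a single 16-byte Poly1305 block, so the two overlapping
# 8-byte loads plus "shr \$24" produce limb0 = bytes 0..7, limb1 = bytes 8..12
# zero-extended, and limb2 = 1, the 2^128 padding bit.
sub ref_tls_ad_limbs {
    my ($ad) = @_;                                  # exactly 13 bytes of AD
    my ($limb0, $limb1) = unpack("Q<Q<", $ad . ("\x00" x 3));
    return ($limb0, $limb1, 1);
}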

{
################################################################################
# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
$code.="
.globl chacha20_poly1305_open
.type chacha20_poly1305_open,\@function,2
.align 64
chacha20_poly1305_open:
.cfi_startproc
push %rbp
.cfi_adjust_cfa_offset 8
push %rbx
.cfi_adjust_cfa_offset 8
push %r12
.cfi_adjust_cfa_offset 8
push %r13
.cfi_adjust_cfa_offset 8
push %r14
.cfi_adjust_cfa_offset 8
push %r15
.cfi_adjust_cfa_offset 8
# We write the calculated authenticator back to keyp at the end, so save
# the pointer on the stack too.
push $keyp
.cfi_adjust_cfa_offset 8
sub \$288 + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
.cfi_offset rbp, -16
.cfi_offset rbx, -24
.cfi_offset r12, -32
.cfi_offset r13, -40
.cfi_offset r14, -48
.cfi_offset r15, -56
lea 32(%rsp), %rbp
and \$-32, %rbp
mov %rdx, 8+$len_store
mov %r8, 0+$len_store
mov %rdx, $inl\n"; $code.="
mov OPENSSL_ia32cap_P+8(%rip), %eax
and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
xor \$`(1<<5) + (1<<8)`, %eax
jz chacha20_poly1305_open_avx2\n" if ($avx>1);
$code.="
1:
cmp \$128, $inl
jbe open_sse_128
# For long buffers, prepare the poly key first
movdqa .chacha20_consts(%rip), $A0
movdqu 0*16($keyp), $B0
movdqu 1*16($keyp), $C0
movdqu 2*16($keyp), $D0
movdqa $D0, $T1
# Store on stack, to free keyp
movdqa $B0, $state1_store
movdqa $C0, $state2_store
movdqa $D0, $ctr0_store
mov \$10, $acc0
1: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
dec $acc0
jne 1b
# A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
paddd .chacha20_consts(%rip), $A0
paddd $state1_store, $B0
# Clamp and store the key
pand .clamp(%rip), $A0
movdqa $A0, $r_store
movdqa $B0, $s_store
# Hash
mov %r8, $itr2
call poly_hash_ad_internal
open_sse_main_loop:
cmp \$16*16, $inl
jb 2f
# Load state, increment counter blocks\n";
&prep_state(4); $code.="
# There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
# hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
mov \$4, $itr1
mov $inp, $itr2
1: \n";
&emit_body(20);
&poly_add("0($itr2)"); $code.="
lea 2*8($itr2), $itr2\n";
&emit_body(20);
&poly_stage1();
&emit_body(20);
&poly_stage2();
&emit_body(20);
&poly_stage3();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $itr1
jge 1b\n";
&poly_add("0($itr2)");
&poly_mul(); $code.="
lea 2*8($itr2), $itr2
cmp \$-6, $itr1
jg 1b\n";
&finalize_state(4);
&xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
&xor_stream($A2, $B2, $C2, $D2, "4*16");
&xor_stream($A1, $B1, $C1, $D1, "8*16");
&xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
lea 16*16($inp), $inp
lea 16*16($oup), $oup
sub \$16*16, $inl
jmp open_sse_main_loop
2:
# Handle the various tail sizes efficiently
test $inl, $inl
jz open_sse_finalize
cmp \$4*16, $inl
ja 3f\n";
###############################################################################
# At most 64 bytes are left
&prep_state(1); $code.="
xor $itr2, $itr2
mov $inl, $itr1
cmp \$16, $itr1
jb 2f
1: \n";
&poly_add("0($inp, $itr2)");
&poly_mul(); $code.="
sub \$16, $itr1
2:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
cmp \$16, $itr1
jae 1b
cmp \$10*16, $itr2
jne 2b\n";
&finalize_state(1); $code.="
jmp open_sse_tail_64_dec_loop
3:
cmp \$8*16, $inl
ja 3f\n";
###############################################################################
# 65 - 128 bytes are left
&prep_state(2); $code.="
mov $inl, $itr1
and \$-16, $itr1
xor $itr2, $itr2
1: \n";
&poly_add("0($inp, $itr2)");
&poly_mul(); $code.="
2:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
cmp $itr1, $itr2
jb 1b
cmp \$10*16, $itr2
jne 2b\n";
&finalize_state(2);
&xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
sub \$4*16, $inl
lea 4*16($inp), $inp
lea 4*16($oup), $oup
jmp open_sse_tail_64_dec_loop
3:
cmp \$12*16, $inl
ja 3f\n";
###############################################################################
# 129 - 192 bytes are left
&prep_state(3); $code.="
mov $inl, $itr1
mov \$10*16, $itr2
cmp \$10*16, $itr1
cmovg $itr2, $itr1
and \$-16, $itr1
xor $itr2, $itr2
1: \n";
&poly_add("0($inp, $itr2)");
&poly_mul(); $code.="
2:
add \$16, $itr2\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
cmp $itr1, $itr2
jb 1b
cmp \$10*16, $itr2
jne 2b
cmp \$11*16, $inl
jb 1f\n";
&poly_add("10*16($inp)");
&poly_mul(); $code.="
cmp \$12*16, $inl
jb 1f\n";
&poly_add("11*16($inp)");
&poly_mul(); $code.="
1: \n";
&finalize_state(3);
&xor_stream($A2, $B2, $C2, $D2, "0*16");
&xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
sub \$8*16, $inl
lea 8*16($inp), $inp
lea 8*16($oup), $oup
jmp open_sse_tail_64_dec_loop
3:
###############################################################################\n";
# 193 - 255 bytes are left
&prep_state(4); $code.="
xor $itr2, $itr2
1: \n";
&poly_add("0($inp, $itr2)");
&chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
&chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
&chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
&poly_stage1();
&chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
&poly_stage2();
&chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
&chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
&poly_stage3();
&chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
&poly_reduce_stage();
&chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
add \$16, $itr2
cmp \$10*16, $itr2
jb 1b
mov $inl, $itr1
and \$-16, $itr1
1: \n";
&poly_add("0($inp, $itr2)");
&poly_mul(); $code.="
add \$16, $itr2
cmp $itr1, $itr2
jb 1b\n";
&finalize_state(4);
&xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
&xor_stream($A2, $B2, $C2, $D2, "4*16");
&xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
movdqa $tmp_store, $D0
sub \$12*16, $inl
lea 12*16($inp), $inp
lea 12*16($oup), $oup
###############################################################################
# Decrypt the remaining data, 16B at a time, using existing stream
open_sse_tail_64_dec_loop:
cmp \$16, $inl
jb 1f
sub \$16, $inl
movdqu ($inp), $T0
pxor $T0, $A0
movdqu $A0, ($oup)
lea 16($inp), $inp
lea 16($oup), $oup
movdqa $B0, $A0
movdqa $C0, $B0
movdqa $D0, $C0
jmp open_sse_tail_64_dec_loop
1:
movdqa $A0, $A1

# Decrypt up to 16 bytes at the end.
open_sse_tail_16:
test $inl, $inl
jz open_sse_finalize

# Read the final bytes into $T0. They need to be read in reverse order so
# that they end up in the correct order in $T0.
pxor $T0, $T0
lea -1($inp, $inl), $inp
movq $inl, $itr2
2:
pslldq \$1, $T0
pinsrb \$0, ($inp), $T0
sub \$1, $inp
sub \$1, $itr2
jnz 2b

3:
movq $T0, $t0
pextrq \$1, $T0, $t1
# The final bytes of keystream are in $A1.
pxor $A1, $T0

# Copy the plaintext bytes out.
2:
pextrb \$0, $T0, ($oup)
psrldq \$1, $T0
add \$1, $oup
sub \$1, $inl
jne 2b

add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="

open_sse_finalize:\n";
&poly_add($len_store);
&poly_mul(); $code.="
# Final reduce
mov $acc0, $t0
mov $acc1, $t1
mov $acc2, $t2
sub \$-5, $acc0
sbb \$-1, $acc1
sbb \$3, $acc2
cmovc $t0, $acc0
cmovc $t1, $acc1
cmovc $t2, $acc2
# Add in s part of the key
add 0+$s_store, $acc0
adc 8+$s_store, $acc1

add \$288 + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
pop $keyp
.cfi_adjust_cfa_offset -8
movq $acc0, ($keyp)
movq $acc1, 8($keyp)

pop %r15
.cfi_adjust_cfa_offset -8
pop %r14
.cfi_adjust_cfa_offset -8
pop %r13
.cfi_adjust_cfa_offset -8
pop %r12
.cfi_adjust_cfa_offset -8
pop %rbx
.cfi_adjust_cfa_offset -8
pop %rbp
.cfi_adjust_cfa_offset -8
ret
.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
###############################################################################
open_sse_128:
movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
movdqu 2*16($keyp), $D0
movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
mov \$10, $acc0
1: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jnz 1b
paddd .chacha20_consts(%rip), $A0
paddd .chacha20_consts(%rip), $A1
paddd .chacha20_consts(%rip), $A2
paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
paddd $T2, $C1\npaddd $T2, $C2
paddd $T3, $D1
paddd .sse_inc(%rip), $T3
paddd $T3, $D2
# Clamp and store the key
pand .clamp(%rip), $A0
movdqa $A0, $r_store
movdqa $B0, $s_store
# Hash
mov %r8, $itr2
call poly_hash_ad_internal
1:
cmp \$16, $inl
jb open_sse_tail_16
sub \$16, $inl\n";
# Load for hashing
&poly_add("0*8($inp)"); $code.="
# Load for decryption
movdqu 0*16($inp), $T0
pxor $T0, $A1
movdqu $A1, 0*16($oup)
lea 1*16($inp), $inp
lea 1*16($oup), $oup\n";
&poly_mul(); $code.="
# Shift the stream left
movdqa $B1, $A1
movdqa $C1, $B1
movdqa $D1, $C1
movdqa $A2, $D1
movdqa $B2, $A2
movdqa $C2, $B2
movdqa $D2, $C2
jmp 1b
jmp open_sse_tail_16
.size chacha20_poly1305_open, .-chacha20_poly1305_open
.cfi_endproc

###############################################################################
###############################################################################
# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
.globl chacha20_poly1305_seal
.type chacha20_poly1305_seal,\@function,2
.align 64
chacha20_poly1305_seal:
.cfi_startproc
push %rbp
.cfi_adjust_cfa_offset 8
push %rbx
.cfi_adjust_cfa_offset 8
push %r12
.cfi_adjust_cfa_offset 8
push %r13
.cfi_adjust_cfa_offset 8
push %r14
.cfi_adjust_cfa_offset 8
push %r15
.cfi_adjust_cfa_offset 8
# We write the calculated authenticator back to keyp at the end, so save
# the pointer on the stack too.
push $keyp
.cfi_adjust_cfa_offset 8
sub \$288 + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
.cfi_offset rbp, -16
.cfi_offset rbx, -24
.cfi_offset r12, -32
.cfi_offset r13, -40
.cfi_offset r14, -48
.cfi_offset r15, -56
lea 32(%rsp), %rbp
and \$-32, %rbp
mov %rdx, 8+$len_store
mov %r8, 0+$len_store
mov %rdx, $inl\n"; $code.="
mov OPENSSL_ia32cap_P+8(%rip), %eax
and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
xor \$`(1<<5) + (1<<8)`, %eax
jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
$code.="
cmp \$128, $inl
jbe seal_sse_128
# For longer buffers, prepare the poly key + some stream
movdqa .chacha20_consts(%rip), $A0
movdqu 0*16($keyp), $B0
movdqu 1*16($keyp), $C0
movdqu 2*16($keyp), $D0
movdqa $A0, $A1
movdqa $A0, $A2
movdqa $A0, $A3
movdqa $B0, $B1
movdqa $B0, $B2
movdqa $B0, $B3
movdqa $C0, $C1
movdqa $C0, $C2
movdqa $C0, $C3
movdqa $D0, $D3
paddd .sse_inc(%rip), $D0
movdqa $D0, $D2
paddd .sse_inc(%rip), $D0
movdqa $D0, $D1
paddd .sse_inc(%rip), $D0
# Store on stack
movdqa $B0, $state1_store
movdqa $C0, $state2_store
movdqa $D0, $ctr0_store
movdqa $D1, $ctr1_store
movdqa $D2, $ctr2_store
movdqa $D3, $ctr3_store
mov \$10, $acc0
1: \n";
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $acc0
jnz 1b\n";
&finalize_state(4); $code.="
# Clamp and store the key
pand .clamp(%rip), $A3
movdqa $A3, $r_store
movdqa $B3, $s_store
# Hash
mov %r8, $itr2
call poly_hash_ad_internal\n";
&xor_stream($A2,$B2,$C2,$D2,"0*16");
&xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
cmp \$12*16, $inl
ja 1f
mov \$8*16, $itr1
sub \$8*16, $inl
lea 8*16($inp), $inp
jmp seal_sse_128_seal_hash
1: \n";
&xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
mov \$12*16, $itr1
sub \$12*16, $inl
lea 12*16($inp), $inp
mov \$2, $itr1
mov \$8, $itr2
cmp \$4*16, $inl
jbe seal_sse_tail_64
cmp \$8*16, $inl
jbe seal_sse_tail_128
cmp \$12*16, $inl
jbe seal_sse_tail_192

1: \n";
# The main loop
&prep_state(4); $code.="
2: \n";
&emit_body(20);
&poly_add("0($oup)");
&emit_body(20);
&poly_stage1();
&emit_body(20);
&poly_stage2();
&emit_body(20);
&poly_stage3();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
lea 16($oup), $oup
dec $itr2
jge 2b\n";
&poly_add("0*8($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
dec $itr1
jg 2b\n";

&finalize_state(4);$code.="
movdqa $D2, $tmp_store\n";
&xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
movdqa $tmp_store, $D2\n";
&xor_stream($A2,$B2,$C2,$D2, 4*16);
&xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
cmp \$16*16, $inl
ja 3f

mov \$12*16, $itr1
sub \$12*16, $inl
lea 12*16($inp), $inp
jmp seal_sse_128_seal_hash
3: \n";
&xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
lea 16*16($inp), $inp
sub \$16*16, $inl
mov \$6, $itr1
mov \$4, $itr2
cmp \$12*16, $inl
jg 1b
mov $inl, $itr1
test $inl, $inl
je seal_sse_128_seal_hash
mov \$6, $itr1
cmp \$4*16, $inl
jg 3f
###############################################################################
seal_sse_tail_64:\n";
&prep_state(1); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
2: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state(1); $code.="
jmp seal_sse_128_seal
3:
cmp \$8*16, $inl
jg 3f
###############################################################################
seal_sse_tail_128:\n";
&prep_state(2); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
2: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0($oup)");
&poly_mul();
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
lea 16($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state(2);
&xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
mov \$4*16, $itr1
sub \$4*16, $inl
lea 4*16($inp), $inp
jmp seal_sse_128_seal_hash
3:
###############################################################################
seal_sse_tail_192:\n";
&prep_state(3); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 16($oup), $oup
2: \n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&poly_add("0($oup)");
&poly_mul();
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
lea 16($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state(3);
&xor_stream($A2,$B2,$C2,$D2,0*16);
&xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
mov \$8*16, $itr1
sub \$8*16, $inl
lea 8*16($inp), $inp
###############################################################################
seal_sse_128_seal_hash:
cmp \$16, $itr1
jb seal_sse_128_seal\n";
&poly_add("0($oup)");
&poly_mul(); $code.="
sub \$16, $itr1
lea 16($oup), $oup
jmp seal_sse_128_seal_hash

seal_sse_128_seal:
cmp \$16, $inl
jb seal_sse_tail_16
sub \$16, $inl
# Load for decryption
movdqu 0*16($inp), $T0
pxor $T0, $A0
movdqu $A0, 0*16($oup)
# Then hash
add 0*8($oup), $acc0
adc 1*8($oup), $acc1
adc \$1, $acc2
lea 1*16($inp), $inp
lea 1*16($oup), $oup\n";
&poly_mul(); $code.="
# Shift the stream left
movdqa $B0, $A0
movdqa $C0, $B0
movdqa $D0, $C0
movdqa $A1, $D0
movdqa $B1, $A1
movdqa $C1, $B1
movdqa $D1, $C1
jmp seal_sse_128_seal

seal_sse_tail_16:
test $inl, $inl
jz seal_sse_finalize
# We can only load the PT one byte at a time to avoid buffer overread
mov $inl, $itr2
shl \$4, $itr2
lea .and_masks(%rip), $t0
mov $inl, $itr1
lea -1($inp, $inl), $inp
pxor $T3, $T3
1:
pslldq \$1, $T3
pinsrb \$0, ($inp), $T3
lea -1($inp), $inp
dec $itr1
jne 1b

# XOR the keystream with the plaintext.
pxor $A0, $T3

# Write ciphertext out, byte-by-byte.
movq $inl, $itr1
movdqu $T3, $A0
2:
pextrb \$0, $A0, ($oup)
psrldq \$1, $A0
add \$1, $oup
sub \$1, $itr1
jnz 2b

pand -16($t0, $itr2), $T3
movq $T3, $t0
pextrq \$1, $T3, $t1
add $t0, $acc0
adc $t1, $acc1
adc \$1, $acc2\n";
&poly_mul(); $code.="
seal_sse_finalize:\n";
&poly_add($len_store);
&poly_mul(); $code.="
# Final reduce
mov $acc0, $t0
mov $acc1, $t1
mov $acc2, $t2
sub \$-5, $acc0
sbb \$-1, $acc1
sbb \$3, $acc2
cmovc $t0, $acc0
cmovc $t1, $acc1
cmovc $t2, $acc2
# Add in s part of the key
add 0+$s_store, $acc0
adc 8+$s_store, $acc1

add \$288 + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
pop $keyp
.cfi_adjust_cfa_offset -8
mov $acc0, 0*8($keyp)
mov $acc1, 1*8($keyp)

pop %r15
.cfi_adjust_cfa_offset -8
pop %r14
.cfi_adjust_cfa_offset -8
pop %r13
.cfi_adjust_cfa_offset -8
pop %r12
.cfi_adjust_cfa_offset -8
pop %rbx
.cfi_adjust_cfa_offset -8
pop %rbp
.cfi_adjust_cfa_offset -8
ret
.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
###############################################################################
seal_sse_128:
movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
movdqu 2*16($keyp), $D2
movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
mov \$10, $acc0
1:\n";
&chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jnz 1b
paddd .chacha20_consts(%rip), $A0
paddd .chacha20_consts(%rip), $A1
paddd .chacha20_consts(%rip), $A2
paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
paddd $T2, $C0\npaddd $T2, $C1
paddd $T3, $D0
paddd .sse_inc(%rip), $T3
paddd $T3, $D1
# Clamp and store the key
pand .clamp(%rip), $A2
movdqa $A2, $r_store
movdqa $B2, $s_store
# Hash
mov %r8, $itr2
call poly_hash_ad_internal
jmp seal_sse_128_seal
.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
}
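
# Reference for the "Final reduce" / "Add in s part of the key" sequences in
# both functions above (hypothetical helper using Math::BigInt): the partially
# reduced accumulator is reduced fully mod 2^130-5 by the conditional
# subtraction, then the s half of the one-time key is added mod 2^128 to form
# the 16-byte tag written back through keyp.
use Math::BigInt;
sub ref_poly1305_tag {
    my ($acc, $s) = @_;
    my $p = Math::BigInt->new(1)->blsft(130)->bsub(5);
    return ($acc % $p + $s) % Math::BigInt->new(1)->blsft(128);
}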

# There should have been a cfi_endproc at the end of that function, but the two
# following blocks of code are jumped to without a stack frame and the CFI
# context which they are used in happens to match the CFI context at the end of
# the previous function. So the CFI table is just extended to the end of them.

if ($avx>1) {

($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
$state1_store="2*32(%rbp)";
$state2_store="3*32(%rbp)";
$tmp_store="4*32(%rbp)";
$ctr0_store="5*32(%rbp)";
$ctr1_store="6*32(%rbp)";
$ctr2_store="7*32(%rbp)";
$ctr3_store="8*32(%rbp)";

sub chacha_qr_avx2 {
my ($a,$b,$c,$d,$t,$dir)=@_;
$code.=<<___ if ($dir =~ /store/);
vmovdqa $t, $tmp_store
___
$code.=<<___;
vpaddd $b, $a, $a
vpxor $a, $d, $d
vpshufb .rol16(%rip), $d, $d
vpaddd $d, $c, $c
vpxor $c, $b, $b
vpsrld \$20, $b, $t
vpslld \$12, $b, $b
vpxor $t, $b, $b
vpaddd $b, $a, $a
vpxor $a, $d, $d
vpshufb .rol8(%rip), $d, $d
vpaddd $d, $c, $c
vpxor $c, $b, $b
vpslld \$7, $b, $t
vpsrld \$25, $b, $b
vpxor $t, $b, $b
___
$code.=<<___ if ($dir =~ /left/);
vpalignr \$12, $d, $d, $d
vpalignr \$8, $c, $c, $c
vpalignr \$4, $b, $b, $b
___
$code.=<<___ if ($dir =~ /right/);
vpalignr \$4, $d, $d, $d
vpalignr \$8, $c, $c, $c
vpalignr \$12, $b, $b, $b
___
$code.=<<___ if ($dir =~ /load/);
vmovdqa $tmp_store, $t
___
}

sub prep_state_avx2 {
my ($n)=@_;
$code.=<<___;
vmovdqa .chacha20_consts(%rip), $A0
vmovdqa $state1_store, $B0
vmovdqa $state2_store, $C0
___
$code.=<<___ if ($n ge 2);
vmovdqa $A0, $A1
vmovdqa $B0, $B1
vmovdqa $C0, $C1
___
$code.=<<___ if ($n ge 3);
vmovdqa $A0, $A2
vmovdqa $B0, $B2
vmovdqa $C0, $C2
___
$code.=<<___ if ($n ge 4);
vmovdqa $A0, $A3
vmovdqa $B0, $B3
vmovdqa $C0, $C3
___
$code.=<<___ if ($n eq 1);
vmovdqa .avx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D0
vmovdqa $D0, $ctr0_store
___
$code.=<<___ if ($n eq 2);
vmovdqa .avx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D1
vpaddd $D1, $D0, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
___
$code.=<<___ if ($n eq 3);
vmovdqa .avx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D2
vpaddd $D2, $D0, $D1
vpaddd $D1, $D0, $D0
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
___
$code.=<<___ if ($n eq 4);
vmovdqa .avx2_inc(%rip), $D0
vpaddd $ctr0_store, $D0, $D3
vpaddd $D3, $D0, $D2
vpaddd $D2, $D0, $D1
vpaddd $D1, $D0, $D0
vmovdqa $D3, $ctr3_store
vmovdqa $D2, $ctr2_store
vmovdqa $D1, $ctr1_store
vmovdqa $D0, $ctr0_store
___
}
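
# Counter layout note for the AVX2 path: each ymm row holds the same ChaCha
# row for two consecutive blocks, the lower-numbered block in the low 128-bit
# lane.  The 32-byte vpaddd load at .avx2_init deliberately runs on into
# .sse_inc, so the high lane's counter starts one higher than the low lane's,
# and .avx2_inc then steps both lanes' counters by two per 2-block batch.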

sub finalize_state_avx2 {
my ($n)=@_;
$code.=<<___ if ($n eq 4);
vpaddd .chacha20_consts(%rip), $A3, $A3
vpaddd $state1_store, $B3, $B3
vpaddd $state2_store, $C3, $C3
vpaddd $ctr3_store, $D3, $D3
___
$code.=<<___ if ($n ge 3);
vpaddd .chacha20_consts(%rip), $A2, $A2
vpaddd $state1_store, $B2, $B2
vpaddd $state2_store, $C2, $C2
vpaddd $ctr2_store, $D2, $D2
___
$code.=<<___ if ($n ge 2);
vpaddd .chacha20_consts(%rip), $A1, $A1
vpaddd $state1_store, $B1, $B1
vpaddd $state2_store, $C1, $C1
vpaddd $ctr1_store, $D1, $D1
___
$code.=<<___;
vpaddd .chacha20_consts(%rip), $A0, $A0
vpaddd $state1_store, $B0, $B0
vpaddd $state2_store, $C0, $C0
vpaddd $ctr0_store, $D0, $D0
___
}

sub xor_stream_avx2 {
my ($A, $B, $C, $D, $offset, $hlp)=@_;
$code.=<<___;
vperm2i128 \$0x02, $A, $B, $hlp
vperm2i128 \$0x13, $A, $B, $B
vperm2i128 \$0x02, $C, $D, $A
vperm2i128 \$0x13, $C, $D, $C
vpxor 0*32+$offset($inp), $hlp, $hlp
vpxor 1*32+$offset($inp), $A, $A
vpxor 2*32+$offset($inp), $B, $B
vpxor 3*32+$offset($inp), $C, $C
vmovdqu $hlp, 0*32+$offset($oup)
vmovdqu $A, 1*32+$offset($oup)
vmovdqu $B, 2*32+$offset($oup)
vmovdqu $C, 3*32+$offset($oup)
___
}
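
# In xor_stream_avx2 the vperm2i128 selectors regroup those lanes before the
# XOR: \$0x02 selects the two low lanes and \$0x13 the two high lanes, so the
# four 32-byte stores write the lower-numbered block's 64 bytes followed by
# the higher-numbered block's, i.e. the keystream comes out contiguously.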

sub finish_stream_avx2 {
my ($A, $B, $C, $D, $hlp)=@_;
$code.=<<___;
vperm2i128 \$0x13, $A, $B, $hlp
vperm2i128 \$0x02, $A, $B, $A
vperm2i128 \$0x02, $C, $D, $B
vperm2i128 \$0x13, $C, $D, $D
vmovdqa $hlp, $C
___
}

sub poly_stage1_mulx {
$code.=<<___;
mov 0+$r_store, %rdx
mov %rdx, $t2
mulx $acc0, $t0, $t1
mulx $acc1, %rax, %rdx
imulq $acc2, $t2
add %rax, $t1
adc %rdx, $t2
___
}

sub poly_stage2_mulx {
$code.=<<___;
mov 8+$r_store, %rdx
mulx $acc0, $acc0, %rax
add $acc0, $t1
mulx $acc1, $acc1, $t3
adc $acc1, $t2
adc \$0, $t3
imulq $acc2, %rdx
___
}

sub poly_stage3_mulx {
$code.=<<___;
add %rax, $t2
adc %rdx, $t3
___
}

sub poly_mul_mulx {
&poly_stage1_mulx();
&poly_stage2_mulx();
&poly_stage3_mulx();
&poly_reduce_stage();
}
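
# The *_mulx stages compute exactly the same product and reduction as
# poly_stage1/2/3 + poly_reduce_stage above; mulx (BMI2, checked together with
# AVX2 at the function entries) takes one operand implicitly from %rdx and
# does not touch the flags, which keeps the add/adc carry chain intact when
# these stages are interleaved with the vector code.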
|
|
|
|
sub gen_chacha_round_avx2 {
|
|
my ($rot1, $rot2, $shift)=@_;
|
|
my $round="";
|
|
$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
|
|
$round=$round ."vmovdqa $rot2, $C0
|
|
vpaddd $B3, $A3, $A3
|
|
vpaddd $B2, $A2, $A2
|
|
vpaddd $B1, $A1, $A1
|
|
vpaddd $B0, $A0, $A0
|
|
vpxor $A3, $D3, $D3
|
|
vpxor $A2, $D2, $D2
|
|
vpxor $A1, $D1, $D1
|
|
vpxor $A0, $D0, $D0
|
|
vpshufb $C0, $D3, $D3
|
|
vpshufb $C0, $D2, $D2
|
|
vpshufb $C0, $D1, $D1
|
|
vpshufb $C0, $D0, $D0
|
|
vmovdqa $tmp_store, $C0
|
|
vpaddd $D3, $C3, $C3
|
|
vpaddd $D2, $C2, $C2
|
|
vpaddd $D1, $C1, $C1
|
|
vpaddd $D0, $C0, $C0
|
|
vpxor $C3, $B3, $B3
|
|
vpxor $C2, $B2, $B2
|
|
vpxor $C1, $B1, $B1
|
|
vpxor $C0, $B0, $B0
|
|
vmovdqa $C0, $tmp_store
|
|
vpsrld \$$rot1, $B3, $C0
|
|
vpslld \$32-$rot1, $B3, $B3
|
|
vpxor $C0, $B3, $B3
|
|
vpsrld \$$rot1, $B2, $C0
|
|
vpslld \$32-$rot1, $B2, $B2
|
|
vpxor $C0, $B2, $B2
|
|
vpsrld \$$rot1, $B1, $C0
|
|
vpslld \$32-$rot1, $B1, $B1
|
|
vpxor $C0, $B1, $B1
|
|
vpsrld \$$rot1, $B0, $C0
|
|
vpslld \$32-$rot1, $B0, $B0
|
|
vpxor $C0, $B0, $B0\n";
|
|
($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
|
|
($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
|
|
$round=$round ."vmovdqa $tmp_store, $C0
|
|
vpalignr \$$s1, $B3, $B3, $B3
|
|
vpalignr \$$s2, $C3, $C3, $C3
|
|
vpalignr \$$s3, $D3, $D3, $D3
|
|
vpalignr \$$s1, $B2, $B2, $B2
|
|
vpalignr \$$s2, $C2, $C2, $C2
|
|
vpalignr \$$s3, $D2, $D2, $D2
|
|
vpalignr \$$s1, $B1, $B1, $B1
|
|
vpalignr \$$s2, $C1, $C1, $C1
|
|
vpalignr \$$s3, $D1, $D1, $D1
|
|
vpalignr \$$s1, $B0, $B0, $B0
|
|
vpalignr \$$s2, $C0, $C0, $C0
|
|
vpalignr \$$s3, $D0, $D0, $D0\n"
|
|
if (($shift =~ /left/) || ($shift =~ /right/));
|
|
return $round;
|
|
};
|
|
|
|
$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
|
|
&gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
|
|
&gen_chacha_round_avx2(20, ".rol16(%rip)") .
|
|
&gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
|
|
|
|
@loop_body = split /\n/, $chacha_body;
|
|
|
|
$code.="
|
|
###############################################################################
|
|
.type chacha20_poly1305_open_avx2,\@function,2
|
|
.align 64
|
|
chacha20_poly1305_open_avx2:
|
|
vzeroupper
|
|
vmovdqa .chacha20_consts(%rip), $A0
|
|
vbroadcasti128 0*16($keyp), $B0
|
|
vbroadcasti128 1*16($keyp), $C0
|
|
vbroadcasti128 2*16($keyp), $D0
|
|
vpaddd .avx2_init(%rip), $D0, $D0
|
|
cmp \$6*32, $inl
|
|
jbe open_avx2_192
|
|
cmp \$10*32, $inl
|
|
jbe open_avx2_320
|
|
|
|
vmovdqa $B0, $state1_store
|
|
vmovdqa $C0, $state2_store
|
|
vmovdqa $D0, $ctr0_store
|
|
mov \$10, $acc0
|
|
1: \n";
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
|
|
dec $acc0
|
|
jne 1b
|
|
vpaddd .chacha20_consts(%rip), $A0, $A0
|
|
vpaddd $state1_store, $B0, $B0
|
|
vpaddd $state2_store, $C0, $C0
|
|
vpaddd $ctr0_store, $D0, $D0
|
|
|
|
vperm2i128 \$0x02, $A0, $B0, $T0
|
|
# Clamp and store key
|
|
vpand .clamp(%rip), $T0, $T0
|
|
vmovdqa $T0, $r_store
|
|
# Stream for the first 64 bytes
|
|
vperm2i128 \$0x13, $A0, $B0, $A0
|
|
vperm2i128 \$0x13, $C0, $D0, $B0
|
|
# Hash AD + first 64 bytes
|
|
mov %r8, $itr2
|
|
call poly_hash_ad_internal
|
|
xor $itr1, $itr1
|
|
# Hash first 64 bytes
|
|
1: \n";
|
|
&poly_add("0($inp, $itr1)");
|
|
&poly_mul(); $code.="
|
|
add \$16, $itr1
|
|
cmp \$2*32, $itr1
|
|
jne 1b
|
|
# Decrypt first 64 bytes
|
|
vpxor 0*32($inp), $A0, $A0
|
|
vpxor 1*32($inp), $B0, $B0
|
|
vmovdqu $A0, 0*32($oup)
|
|
vmovdqu $B0, 1*32($oup)
|
|
lea 2*32($inp), $inp
|
|
lea 2*32($oup), $oup
|
|
sub \$2*32, $inl
|
|
1:
|
|
# Hash and decrypt 512 bytes each iteration
|
|
cmp \$16*32, $inl
|
|
jb 3f\n";
|
|
&prep_state_avx2(4); $code.="
|
|
xor $itr1, $itr1
|
|
2: \n";
|
|
&poly_add("0*8($inp, $itr1)");
|
|
&emit_body(10);
|
|
&poly_stage1_mulx();
|
|
&emit_body(9);
|
|
&poly_stage2_mulx();
|
|
&emit_body(12);
|
|
&poly_stage3_mulx();
|
|
&emit_body(10);
|
|
&poly_reduce_stage();
|
|
&emit_body(9);
|
|
&poly_add("2*8($inp, $itr1)");
|
|
&emit_body(8);
|
|
&poly_stage1_mulx();
|
|
&emit_body(18);
|
|
&poly_stage2_mulx();
|
|
&emit_body(18);
|
|
&poly_stage3_mulx();
|
|
&emit_body(9);
|
|
&poly_reduce_stage();
|
|
&emit_body(8);
|
|
&poly_add("4*8($inp, $itr1)"); $code.="
|
|
lea 6*8($itr1), $itr1\n";
|
|
&emit_body(18);
|
|
&poly_stage1_mulx();
|
|
&emit_body(8);
|
|
&poly_stage2_mulx();
|
|
&emit_body(8);
|
|
&poly_stage3_mulx();
|
|
&emit_body(18);
|
|
&poly_reduce_stage();
|
|
foreach $l (@loop_body) {$code.=$l."\n";}
|
|
@loop_body = split /\n/, $chacha_body; $code.="
|
|
cmp \$10*6*8, $itr1
|
|
jne 2b\n";
|
|
&finalize_state_avx2(4); $code.="
|
|
vmovdqa $A0, $tmp_store\n";
|
|
&poly_add("10*6*8($inp)");
|
|
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
|
|
vmovdqa $tmp_store, $A0\n";
|
|
&poly_mul();
|
|
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
|
|
&poly_add("10*6*8+2*8($inp)");
|
|
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
|
|
&poly_mul();
|
|
&xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
|
|
lea 16*32($inp), $inp
|
|
lea 16*32($oup), $oup
|
|
sub \$16*32, $inl
|
|
jmp 1b
|
|
3:
|
|
test $inl, $inl
|
|
vzeroupper
|
|
je open_sse_finalize
|
|
3:
|
|
cmp \$4*32, $inl
|
|
ja 3f\n";
|
|
###############################################################################
|
|
# 1-128 bytes left
|
|
&prep_state_avx2(1); $code.="
|
|
xor $itr2, $itr2
|
|
mov $inl, $itr1
|
|
and \$-16, $itr1
|
|
test $itr1, $itr1
|
|
je 2f
|
|
1: \n";
|
|
&poly_add("0*8($inp, $itr2)");
|
|
&poly_mul(); $code.="
|
|
2:
|
|
add \$16, $itr2\n";
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
|
|
cmp $itr1, $itr2
|
|
jb 1b
|
|
cmp \$160, $itr2
|
|
jne 2b\n";
|
|
&finalize_state_avx2(1);
|
|
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
|
|
jmp open_avx2_tail_loop
|
|
3:
|
|
cmp \$8*32, $inl
|
|
ja 3f\n";
|
|
###############################################################################
|
|
# 129-256 bytes left
|
|
&prep_state_avx2(2); $code.="
|
|
mov $inl, $tmp_store
|
|
mov $inl, $itr1
|
|
sub \$4*32, $itr1
|
|
shr \$4, $itr1
|
|
mov \$10, $itr2
|
|
cmp \$10, $itr1
|
|
cmovg $itr2, $itr1
|
|
mov $inp, $inl
|
|
xor $itr2, $itr2
|
|
1: \n";
|
|
&poly_add("0*8($inl)");
|
|
&poly_mul_mulx(); $code.="
|
|
lea 16($inl), $inl
|
|
2: \n";
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
|
|
inc $itr2\n";
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
|
|
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
|
|
cmp $itr1, $itr2
|
|
jb 1b
|
|
cmp \$10, $itr2
|
|
jne 2b
|
|
mov $inl, $itr2
|
|
sub $inp, $inl
|
|
mov $inl, $itr1
|
|
mov $tmp_store, $inl
|
|
1:
|
|
add \$16, $itr1
|
|
cmp $inl, $itr1
|
|
jg 1f\n";
|
|
&poly_add("0*8($itr2)");
|
|
&poly_mul_mulx(); $code.="
|
|
lea 16($itr2), $itr2
|
|
jmp 1b
|
|
1: \n";
|
|
&finalize_state_avx2(2);
|
|
&xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
|
|
&finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
|
|
lea 4*32($inp), $inp
|
|
lea 4*32($oup), $oup
|
|
sub \$4*32, $inl
|
|
jmp open_avx2_tail_loop
|
|
3:
|
|
cmp \$12*32, $inl
|
|
ja 3f\n";
|
|
###############################################################################
|
|
# 257-383 bytes left
|
|
&prep_state_avx2(3); $code.="
|
|
mov $inl, $tmp_store
|
|
mov $inl, $itr1
|
|
sub \$8*32, $itr1
|
|
shr \$4, $itr1
|
|
add \$6, $itr1
|
|
mov \$10, $itr2
|
|
cmp \$10, $itr1
|
|
cmovg $itr2, $itr1
|
|
mov $inp, $inl
|
|
xor $itr2, $itr2
|
|
1: \n";
|
|
&poly_add("0*8($inl)");
|
|
&poly_mul_mulx(); $code.="
|
|
lea 16($inl), $inl
|
|
2: \n";
|
|
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
|
|
&poly_add("0*8($inl)");
|
|
&poly_mul(); $code.="
|
|
lea 16($inl), $inl
|
|
inc $itr2\n";
|
|
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
|
|
cmp $itr1, $itr2
|
|
jb 1b
|
|
cmp \$10, $itr2
|
|
jne 2b
|
|
mov $inl, $itr2
|
|
sub $inp, $inl
|
|
mov $inl, $itr1
|
|
mov $tmp_store, $inl
|
|
1:
|
|
add \$16, $itr1
|
|
cmp $inl, $itr1
|
|
jg 1f\n";
|
|
&poly_add("0*8($itr2)");
|
|
&poly_mul_mulx(); $code.="
|
|
lea 16($itr2), $itr2
|
|
jmp 1b
|
|
1: \n";
|
|
&finalize_state_avx2(3);
|
|
&xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
|
|
&xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
|
|
&finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
|
|
lea 8*32($inp), $inp
|
|
lea 8*32($oup), $oup
|
|
sub \$8*32, $inl
|
|
jmp open_avx2_tail_loop
|
|
3: \n";
|
|
###############################################################################
|
|
# 384-512 bytes left
|
|
&prep_state_avx2(4); $code.="
|
|
xor $itr1, $itr1
|
|
mov $inp, $itr2
|
|
1: \n";
|
|
&poly_add("0*8($itr2)");
|
|
&poly_mul(); $code.="
|
|
lea 2*8($itr2), $itr2
|
|
2: \n";
|
|
&emit_body(37);
|
|
&poly_add("0*8($itr2)");
|
|
&poly_mul_mulx();
|
|
&emit_body(48);
|
|
&poly_add("2*8($itr2)");
|
|
&poly_mul_mulx(); $code.="
|
|
lea 4*8($itr2), $itr2\n";
|
|
foreach $l (@loop_body) {$code.=$l."\n";}
|
|
@loop_body = split /\n/, $chacha_body; $code.="
|
|
inc $itr1
|
|
cmp \$4, $itr1
|
|
jl 1b
|
|
cmp \$10, $itr1
|
|
jne 2b
|
|
mov $inl, $itr1
|
|
sub \$12*32, $itr1
|
|
and \$-16, $itr1
|
|
1:
|
|
test $itr1, $itr1
|
|
je 1f\n";
|
|
&poly_add("0*8($itr2)");
|
|
&poly_mul_mulx(); $code.="
|
|
lea 2*8($itr2), $itr2
|
|
sub \$2*8, $itr1
|
|
jmp 1b
|
|
1: \n";
|
|
&finalize_state_avx2(4); $code.="
|
|
vmovdqa $A0, $tmp_store\n";
|
|
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
|
|
vmovdqa $tmp_store, $A0\n";
|
|
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
|
|
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
|
|
&finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
|
|
lea 12*32($inp), $inp
|
|
lea 12*32($oup), $oup
|
|
sub \$12*32, $inl
|
|
open_avx2_tail_loop:
|
|
cmp \$32, $inl
|
|
jb open_avx2_tail
|
|
sub \$32, $inl
|
|
vpxor ($inp), $A0, $A0
|
|
vmovdqu $A0, ($oup)
|
|
lea 1*32($inp), $inp
|
|
lea 1*32($oup), $oup
|
|
vmovdqa $B0, $A0
|
|
vmovdqa $C0, $B0
|
|
vmovdqa $D0, $C0
|
|
jmp open_avx2_tail_loop
|
|
open_avx2_tail:
|
|
cmp \$16, $inl
|
|
vmovdqa $A0x, $A1x
|
|
jb 1f
|
|
sub \$16, $inl
|
|
#load for decryption
|
|
vpxor ($inp), $A0x, $A1x
|
|
vmovdqu $A1x, ($oup)
|
|
lea 1*16($inp), $inp
|
|
lea 1*16($oup), $oup
|
|
vperm2i128 \$0x11, $A0, $A0, $A0
|
|
vmovdqa $A0x, $A1x
|
|
1:
|
|
vzeroupper
|
|
jmp open_sse_tail_16
|
|
###############################################################################
|
|
open_avx2_192:
|
|
vmovdqa $A0, $A1
|
|
vmovdqa $A0, $A2
|
|
vmovdqa $B0, $B1
|
|
vmovdqa $B0, $B2
|
|
vmovdqa $C0, $C1
|
|
vmovdqa $C0, $C2
|
|
vpaddd .avx2_inc(%rip), $D0, $D1
|
|
vmovdqa $D0, $T2
|
|
vmovdqa $D1, $T3
|
|
mov \$10, $acc0
|
|
1: \n";
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
|
|
dec $acc0
|
|
jne 1b
|
|
vpaddd $A2, $A0, $A0
|
|
vpaddd $A2, $A1, $A1
|
|
vpaddd $B2, $B0, $B0
|
|
vpaddd $B2, $B1, $B1
|
|
vpaddd $C2, $C0, $C0
|
|
vpaddd $C2, $C1, $C1
|
|
vpaddd $T2, $D0, $D0
|
|
vpaddd $T3, $D1, $D1
|
|
vperm2i128 \$0x02, $A0, $B0, $T0
|
|
# Clamp and store the key
|
|
vpand .clamp(%rip), $T0, $T0
|
|
vmovdqa $T0, $r_store
|
|
# Stream for up to 192 bytes
|
|
vperm2i128 \$0x13, $A0, $B0, $A0
|
|
vperm2i128 \$0x13, $C0, $D0, $B0
|
|
vperm2i128 \$0x02, $A1, $B1, $C0
|
|
vperm2i128 \$0x02, $C1, $D1, $D0
|
|
vperm2i128 \$0x13, $A1, $B1, $A1
|
|
vperm2i128 \$0x13, $C1, $D1, $B1
|
|
open_avx2_short:
|
|
mov %r8, $itr2
|
|
call poly_hash_ad_internal
|
|
open_avx2_hash_and_xor_loop:
|
|
cmp \$32, $inl
|
|
jb open_avx2_short_tail_32
|
|
sub \$32, $inl\n";
|
|
# Load + hash
|
|
&poly_add("0*8($inp)");
|
|
&poly_mul();
|
|
&poly_add("2*8($inp)");
|
|
&poly_mul(); $code.="
|
|
# Load + decrypt
|
|
vpxor ($inp), $A0, $A0
|
|
vmovdqu $A0, ($oup)
|
|
lea 1*32($inp), $inp
|
|
lea 1*32($oup), $oup
|
|
# Shift stream
|
|
vmovdqa $B0, $A0
|
|
vmovdqa $C0, $B0
|
|
vmovdqa $D0, $C0
|
|
vmovdqa $A1, $D0
|
|
vmovdqa $B1, $A1
|
|
vmovdqa $C1, $B1
|
|
vmovdqa $D1, $C1
|
|
vmovdqa $A2, $D1
|
|
vmovdqa $B2, $A2
|
|
jmp open_avx2_hash_and_xor_loop
|
|
open_avx2_short_tail_32:
|
|
cmp \$16, $inl
|
|
vmovdqa $A0x, $A1x
|
|
jb 1f
|
|
sub \$16, $inl\n";
|
|
&poly_add("0*8($inp)");
|
|
&poly_mul(); $code.="
|
|
vpxor ($inp), $A0x, $A3x
|
|
vmovdqu $A3x, ($oup)
|
|
lea 1*16($inp), $inp
|
|
lea 1*16($oup), $oup
|
|
vextracti128 \$1, $A0, $A1x
|
|
1:
|
|
vzeroupper
|
|
jmp open_sse_tail_16
|
|
###############################################################################
|
|
open_avx2_320:
|
|
vmovdqa $A0, $A1
|
|
vmovdqa $A0, $A2
|
|
vmovdqa $B0, $B1
|
|
vmovdqa $B0, $B2
|
|
vmovdqa $C0, $C1
|
|
vmovdqa $C0, $C2
|
|
vpaddd .avx2_inc(%rip), $D0, $D1
|
|
vpaddd .avx2_inc(%rip), $D1, $D2
|
|
vmovdqa $B0, $T1
|
|
vmovdqa $C0, $T2
|
|
vmovdqa $D0, $ctr0_store
|
|
vmovdqa $D1, $ctr1_store
|
|
vmovdqa $D2, $ctr2_store
|
|
mov \$10, $acc0
|
|
1: \n";
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
|
|
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
|
|
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
|
|
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
|
|
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
|
|
dec $acc0
|
|
jne 1b
|
|
vpaddd .chacha20_consts(%rip), $A0, $A0
|
|
vpaddd .chacha20_consts(%rip), $A1, $A1
|
|
vpaddd .chacha20_consts(%rip), $A2, $A2
|
|
vpaddd $T1, $B0, $B0
|
|
vpaddd $T1, $B1, $B1
|
|
vpaddd $T1, $B2, $B2
|
|
vpaddd $T2, $C0, $C0
|
|
vpaddd $T2, $C1, $C1
|
|
vpaddd $T2, $C2, $C2
|
|
vpaddd $ctr0_store, $D0, $D0
|
|
vpaddd $ctr1_store, $D1, $D1
|
|
vpaddd $ctr2_store, $D2, $D2
|
|
vperm2i128 \$0x02, $A0, $B0, $T0
|
|
# Clamp and store the key
|
|
vpand .clamp(%rip), $T0, $T0
|
|
vmovdqa $T0, $r_store
|
|
# Stream for up to 320 bytes
|
|
vperm2i128 \$0x13, $A0, $B0, $A0
|
|
vperm2i128 \$0x13, $C0, $D0, $B0
|
|
vperm2i128 \$0x02, $A1, $B1, $C0
|
|
vperm2i128 \$0x02, $C1, $D1, $D0
|
|
vperm2i128 \$0x13, $A1, $B1, $A1
|
|
vperm2i128 \$0x13, $C1, $D1, $B1
|
|
vperm2i128 \$0x02, $A2, $B2, $C1
|
|
vperm2i128 \$0x02, $C2, $D2, $D1
|
|
vperm2i128 \$0x13, $A2, $B2, $A2
|
|
vperm2i128 \$0x13, $C2, $D2, $B2
|
|
jmp open_avx2_short
|
|
.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
|
|
###############################################################################
|
|
###############################################################################
|
|
.type chacha20_poly1305_seal_avx2,\@function,2
|
|
.align 64
|
|
chacha20_poly1305_seal_avx2:
|
|
vzeroupper
|
|
vmovdqa .chacha20_consts(%rip), $A0
|
|
vbroadcasti128 0*16($keyp), $B0
|
|
vbroadcasti128 1*16($keyp), $C0
|
|
vbroadcasti128 2*16($keyp), $D0
|
|
vpaddd .avx2_init(%rip), $D0, $D0
|
|
cmp \$6*32, $inl
|
|
jbe seal_avx2_192
|
|
cmp \$10*32, $inl
|
|
jbe seal_avx2_320
|
|
vmovdqa $A0, $A1
|
|
vmovdqa $A0, $A2
|
|
vmovdqa $A0, $A3
|
|
vmovdqa $B0, $B1
|
|
vmovdqa $B0, $B2
|
|
vmovdqa $B0, $B3
|
|
vmovdqa $B0, $state1_store
|
|
vmovdqa $C0, $C1
|
|
vmovdqa $C0, $C2
|
|
vmovdqa $C0, $C3
|
|
vmovdqa $C0, $state2_store
|
|
vmovdqa $D0, $D3
|
|
vpaddd .avx2_inc(%rip), $D3, $D2
|
|
vpaddd .avx2_inc(%rip), $D2, $D1
|
|
vpaddd .avx2_inc(%rip), $D1, $D0
|
|
vmovdqa $D0, $ctr0_store
|
|
vmovdqa $D1, $ctr1_store
|
|
vmovdqa $D2, $ctr2_store
|
|
vmovdqa $D3, $ctr3_store
|
|
mov \$10, $acc0
|
|
1: \n";
|
|
foreach $l (@loop_body) {$code.=$l."\n";}
|
|
@loop_body = split /\n/, $chacha_body; $code.="
|
|
dec $acc0
|
|
jnz 1b\n";
|
|
&finalize_state_avx2(4); $code.="
|
|
vperm2i128 \$0x13, $C3, $D3, $C3
|
|
vperm2i128 \$0x02, $A3, $B3, $D3
|
|
vperm2i128 \$0x13, $A3, $B3, $A3
|
|
vpand .clamp(%rip), $D3, $D3
|
|
vmovdqa $D3, $r_store
|
|
mov %r8, $itr2
|
|
call poly_hash_ad_internal
|
|
# Safely store 320 bytes (otherwise would handle with optimized call)
|
|
vpxor 0*32($inp), $A3, $A3
|
|
vpxor 1*32($inp), $C3, $C3
|
|
vmovdqu $A3, 0*32($oup)
|
|
vmovdqu $C3, 1*32($oup)\n";
|
|
&xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
|
|
&xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
|
|
&finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
|
|
lea 10*32($inp), $inp
|
|
sub \$10*32, $inl
|
|
mov \$10*32, $itr1
|
|
cmp \$4*32, $inl
|
|
jbe seal_avx2_hash
|
|
vpxor 0*32($inp), $A0, $A0
|
|
vpxor 1*32($inp), $B0, $B0
|
|
vpxor 2*32($inp), $C0, $C0
|
|
vpxor 3*32($inp), $D0, $D0
|
|
vmovdqu $A0, 10*32($oup)
|
|
vmovdqu $B0, 11*32($oup)
|
|
vmovdqu $C0, 12*32($oup)
|
|
vmovdqu $D0, 13*32($oup)
|
|
lea 4*32($inp), $inp
|
|
sub \$4*32, $inl
|
|
mov \$8, $itr1
|
|
mov \$2, $itr2
|
|
cmp \$4*32, $inl
|
|
jbe seal_avx2_tail_128
|
|
cmp \$8*32, $inl
|
|
jbe seal_avx2_tail_256
|
|
cmp \$12*32, $inl
|
|
jbe seal_avx2_tail_384
|
|
cmp \$16*32, $inl
|
|
jbe seal_avx2_tail_512\n";
|
|
# We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
|
|
&prep_state_avx2(4);
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body;
&emit_body(41);
@loop_body = split /\n/, $chacha_body; $code.="
sub \$16, $oup
mov \$9, $itr1
jmp 4f
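# First pass: part of the round function was already emitted above, so enter
# the loop at label 4 with 9 iterations left and the output pointer pulled
# back 16 bytes so the interleaved hashing stays aligned with the pending
# ciphertext.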
1: \n";
&prep_state_avx2(4); $code.="
mov \$10, $itr1
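# Steady state: each pass of the loop below runs one ChaCha20 double round on
# the four states while folding three 16-byte blocks of previously written
# ciphertext into the Poly1305 state via the mulx stages.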
2: \n";
&poly_add("0*8($oup)");
&emit_body(10);
&poly_stage1_mulx();
&emit_body(9);
&poly_stage2_mulx();
&emit_body(12);
&poly_stage3_mulx();
&emit_body(10);
&poly_reduce_stage(); $code.="
4: \n";
&emit_body(9);
&poly_add("2*8($oup)");
&emit_body(8);
&poly_stage1_mulx();
&emit_body(18);
&poly_stage2_mulx();
&emit_body(18);
&poly_stage3_mulx();
&emit_body(9);
&poly_reduce_stage();
&emit_body(8);
&poly_add("4*8($oup)"); $code.="
lea 6*8($oup), $oup\n";
&emit_body(18);
&poly_stage1_mulx();
&emit_body(8);
&poly_stage2_mulx();
&emit_body(8);
&poly_stage3_mulx();
&emit_body(18);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
dec $itr1
jne 2b\n";
&finalize_state_avx2(4); $code.="
lea 4*8($oup), $oup
vmovdqa $A0, $tmp_store\n";
&poly_add("-4*8($oup)");
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&poly_mul();
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&poly_add("-2*8($oup)");
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&poly_mul();
&xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
lea 16*32($inp), $inp
sub \$16*32, $inl
cmp \$16*32, $inl
jg 1b\n";
&poly_add("0*8($oup)");
&poly_mul();
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
mov \$10, $itr1
xor $itr2, $itr2
cmp \$4*32, $inl
ja 3f
###############################################################################
seal_avx2_tail_128:\n";
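# Tail paths: hash whatever ciphertext is still pending while generating the
# keystream for the final partial chunk. This variant covers up to 128 bytes
# with a single two-block state.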
&prep_state_avx2(1); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
2: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(1);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
jmp seal_avx2_short_loop
3:
cmp \$8*32, $inl
ja 3f
###############################################################################
seal_avx2_tail_256:\n";
&prep_state_avx2(2); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
2: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(2);
&xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$4*32, $itr1
lea 4*32($inp), $inp
sub \$4*32, $inl
jmp seal_avx2_hash
3:
cmp \$12*32, $inl
ja seal_avx2_tail_512
###############################################################################
seal_avx2_tail_384:\n";
&prep_state_avx2(3); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul(); $code.="
lea 2*8($oup), $oup
2: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&poly_add("0*8($oup)");
&poly_mul();
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&poly_add("2*8($oup)");
&poly_mul();
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(3);
&xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
&xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$8*32, $itr1
lea 8*32($inp), $inp
sub \$8*32, $inl
jmp seal_avx2_hash
###############################################################################
seal_avx2_tail_512:\n";
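# Largest tail (up to 512 bytes): the same four-state round/hash interleaving
# as the main loop, but no further full 512-byte block is written afterwards.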
&prep_state_avx2(4); $code.="
1: \n";
&poly_add("0($oup)");
&poly_mul_mulx(); $code.="
lea 2*8($oup), $oup
2: \n";
&emit_body(20);
&poly_add("0*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
&emit_body(20);
&poly_add("2*8($oup)");
&emit_body(20);
&poly_stage1_mulx();
&emit_body(20);
&poly_stage2_mulx();
&emit_body(20);
&poly_stage3_mulx();
&emit_body(20);
&poly_reduce_stage();
foreach $l (@loop_body) {$code.=$l."\n";}
@loop_body = split /\n/, $chacha_body; $code.="
lea 4*8($oup), $oup
dec $itr1
jg 1b
dec $itr2
jge 2b\n";
&finalize_state_avx2(4); $code.="
vmovdqa $A0, $tmp_store\n";
&xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
vmovdqa $tmp_store, $A0\n";
&xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
&xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
&finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
mov \$12*32, $itr1
lea 12*32($inp), $inp
sub \$12*32, $inl
jmp seal_avx2_hash
################################################################################
seal_avx2_320:
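# 193..320 byte inputs: three two-block states yield 384 bytes of keystream;
# the first 32 bytes of block 0 become the Poly1305 key and up to 320 bytes of
# the rest encrypt the data.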
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .avx2_inc(%rip), $D0, $D1
vpaddd .avx2_inc(%rip), $D1, $D2
vmovdqa $B0, $T1
vmovdqa $C0, $T2
vmovdqa $D0, $ctr0_store
vmovdqa $D1, $ctr1_store
vmovdqa $D2, $ctr2_store
mov \$10, $acc0
1: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
&chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
dec $acc0
jne 1b
vpaddd .chacha20_consts(%rip), $A0, $A0
vpaddd .chacha20_consts(%rip), $A1, $A1
vpaddd .chacha20_consts(%rip), $A2, $A2
vpaddd $T1, $B0, $B0
vpaddd $T1, $B1, $B1
vpaddd $T1, $B2, $B2
vpaddd $T2, $C0, $C0
vpaddd $T2, $C1, $C1
vpaddd $T2, $C2, $C2
vpaddd $ctr0_store, $D0, $D0
vpaddd $ctr1_store, $D1, $D1
vpaddd $ctr2_store, $D2, $D2
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .clamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 320 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
vperm2i128 \$0x02, $A2, $B2, $C1
vperm2i128 \$0x02, $C2, $D2, $D1
vperm2i128 \$0x13, $A2, $B2, $A2
vperm2i128 \$0x13, $C2, $D2, $B2
jmp seal_avx2_short
################################################################################
seal_avx2_192:
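# Inputs of at most 192 bytes: two two-block states yield 256 bytes of
# keystream; the first 32 bytes of block 0 become the Poly1305 key and up to
# 192 bytes of the rest encrypt the data.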
vmovdqa $A0, $A1
vmovdqa $A0, $A2
vmovdqa $B0, $B1
vmovdqa $B0, $B2
vmovdqa $C0, $C1
vmovdqa $C0, $C2
vpaddd .avx2_inc(%rip), $D0, $D1
vmovdqa $D0, $T2
vmovdqa $D1, $T3
mov \$10, $acc0
1: \n";
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
&chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
&chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
dec $acc0
jne 1b
vpaddd $A2, $A0, $A0
vpaddd $A2, $A1, $A1
vpaddd $B2, $B0, $B0
vpaddd $B2, $B1, $B1
vpaddd $C2, $C0, $C0
vpaddd $C2, $C1, $C1
vpaddd $T2, $D0, $D0
vpaddd $T3, $D1, $D1
vperm2i128 \$0x02, $A0, $B0, $T0
# Clamp and store the key
vpand .clamp(%rip), $T0, $T0
vmovdqa $T0, $r_store
# Stream for up to 192 bytes
vperm2i128 \$0x13, $A0, $B0, $A0
vperm2i128 \$0x13, $C0, $D0, $B0
vperm2i128 \$0x02, $A1, $B1, $C0
vperm2i128 \$0x02, $C1, $D1, $D0
vperm2i128 \$0x13, $A1, $B1, $A1
vperm2i128 \$0x13, $C1, $D1, $B1
seal_avx2_short:
mov %r8, $itr2
call poly_hash_ad_internal
xor $itr1, $itr1
seal_avx2_hash:
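# Catch up on hashing: itr1 counts ciphertext bytes already written but not
# yet absorbed; hash them 16 bytes at a time before encrypting anything else.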
cmp \$16, $itr1
jb seal_avx2_short_loop\n";
&poly_add("0($oup)");
&poly_mul(); $code.="
sub \$16, $itr1
add \$16, $oup
jmp seal_avx2_hash
seal_avx2_short_loop:
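# Main short loop: xor 32 bytes of input with the next keystream register,
# hash the 32 bytes just written, then rotate the keystream register queue.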
cmp \$32, $inl
jb seal_avx2_short_tail
sub \$32, $inl
# Encrypt
vpxor ($inp), $A0, $A0
vmovdqu $A0, ($oup)
lea 1*32($inp), $inp
# Load + hash\n";
&poly_add("0*8($oup)");
&poly_mul();
&poly_add("2*8($oup)");
&poly_mul(); $code.="
lea 1*32($oup), $oup
# Shift stream
vmovdqa $B0, $A0
vmovdqa $C0, $B0
vmovdqa $D0, $C0
vmovdqa $A1, $D0
vmovdqa $B1, $A1
vmovdqa $C1, $B1
vmovdqa $D1, $C1
vmovdqa $A2, $D1
vmovdqa $B2, $A2
jmp seal_avx2_short_loop
seal_avx2_short_tail:
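# Between 16 and 31 bytes remain: consume the low 16 keystream bytes, hash
# that block, then move the high half down and let the SSE tail handle the
# final 1..15 bytes.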
cmp \$16, $inl
jb 1f
sub \$16, $inl
vpxor ($inp), $A0x, $A3x
vmovdqu $A3x, ($oup)
lea 1*16($inp), $inp\n";
&poly_add("0*8($oup)");
&poly_mul(); $code.="
lea 1*16($oup), $oup
vextracti128 \$1, $A0, $A0x
1:
vzeroupper
jmp seal_sse_tail_16
.cfi_endproc
";
}
if (!$win64) {
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
} else {
print <<___;
.globl dummy_chacha20_poly1305_asm
.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
dummy_chacha20_poly1305_asm:
ret
___
}
close STDOUT;