- #!/usr/bin/env perl
-
- # Copyright (c) 2015, CloudFlare Ltd.
- #
- # Permission to use, copy, modify, and/or distribute this software for any
- # purpose with or without fee is hereby granted, provided that the above
- # copyright notice and this permission notice appear in all copies.
- #
- # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
-
- ##############################################################################
- # #
- # Author: Vlad Krasnov #
- # #
- ##############################################################################
-
- $flavour = shift;
- $output = shift;
- if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
-
- $win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
-
- $0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
- *STDOUT=*OUT;
-
- $avx = 2;
-
- $code.=<<___;
- .text
- .extern OPENSSL_ia32cap_P
-
- chacha20_poly1305_constants:
-
- .align 64
- .chacha20_consts:
- .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
- .byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
- .rol8:
- .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
- .byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
- .rol16:
- .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
- .byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
- .avx2_init:
- .long 0,0,0,0
- .sse_inc:
- .long 1,0,0,0
- .avx2_inc:
- .long 2,0,0,0,2,0,0,0
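- # Note: .avx2_init is declared as only 16 bytes, but the AVX2 code adds it
- # with a 32-byte vpaddd; that load necessarily runs into the adjacent
- # .sse_inc, giving the ymm constant {0,0,0,0, 1,0,0,0} so the two 128-bit
- # lanes get consecutive block counters. .avx2_inc then steps both lanes by 2.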
- .clamp:
- .quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
- .quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
- .align 16
- .and_masks:
- .byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
- .byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
- ___
-
- my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
- my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
- my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
- my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
- my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
- my $r_store="0*16(%rbp)";
- my $s_store="1*16(%rbp)";
- my $len_store="2*16(%rbp)";
- my $state1_store="3*16(%rbp)";
- my $state2_store="4*16(%rbp)";
- my $tmp_store="5*16(%rbp)";
- my $ctr0_store="6*16(%rbp)";
- my $ctr1_store="7*16(%rbp)";
- my $ctr2_store="8*16(%rbp)";
- my $ctr3_store="9*16(%rbp)";
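-
- # Note the register aliasing above: $adp and $itr1 share %rcx (the AD pointer
- # is only needed before the main loops), and $keyp shares %r9 with $t3 (keyp
- # is pushed to the stack on entry). The slots above form a scratch frame
- # addressed through the 32-byte-aligned %rbp: the clamped Poly1305 "r" half,
- # the "s" half, the |AD|/|CT| length block, the two key rows of the ChaCha20
- # state, a spill slot, and up to four per-block counters.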
-
- sub chacha_qr {
- my ($a,$b,$c,$d,$t,$dir)=@_;
- $code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
- $code.="paddd $b, $a
- pxor $a, $d
- pshufb .rol16(%rip), $d
- paddd $d, $c
- pxor $c, $b
- movdqa $b, $t
- pslld \$12, $t
- psrld \$20, $b
- pxor $t, $b
- paddd $b, $a
- pxor $a, $d
- pshufb .rol8(%rip), $d
- paddd $d, $c
- pxor $c, $b
- movdqa $b, $t
- pslld \$7, $t
- psrld \$25, $b
- pxor $t, $b\n";
- $code.="palignr \$4, $b, $b
- palignr \$8, $c, $c
- palignr \$12, $d, $d\n" if ($dir =~ /left/);
- $code.="palignr \$12, $b, $b
- palignr \$8, $c, $c
- palignr \$4, $d, $d\n" if ($dir =~ /right/);
- $code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
- }
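-
- # For reference, a scalar model of the quarter-round that the SIMD sequence
- # above applies to each 32-bit lane (RFC 8439, section 2.1). These subs are
- # never called by the generator; they are documentation only.
- sub ref_rotl32 { my ($x,$n)=@_; return (($x<<$n)|($x>>(32-$n)))&0xffffffff; }
- sub ref_chacha_qr {
- my ($a,$b,$c,$d)=@_;
- $a=($a+$b)&0xffffffff; $d=ref_rotl32($d^$a,16);
- $c=($c+$d)&0xffffffff; $b=ref_rotl32($b^$c,12);
- $a=($a+$b)&0xffffffff; $d=ref_rotl32($d^$a,8);
- $c=($c+$d)&0xffffffff; $b=ref_rotl32($b^$c,7);
- return ($a,$b,$c,$d);
- }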
-
- sub poly_add {
- my ($src)=@_;
- $code.="add $src, $acc0
- adc 8+$src, $acc1
- adc \$1, $acc2\n";
- }
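-
- # poly_add absorbs one 16-byte block into the 130-bit accumulator held in
- # ($acc0,$acc1,$acc2): the block's low 128 bits plus the Poly1305 pad bit at
- # 2^128 (the "adc \$1"). $acc2 carries the bits at and above 2^128.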
-
- sub poly_stage1 {
- $code.="mov 0+$r_store, %rax
- mov %rax, $t2
- mul $acc0
- mov %rax, $t0
- mov %rdx, $t1
- mov 0+$r_store, %rax
- mul $acc1
- imulq $acc2, $t2
- add %rax, $t1
- adc %rdx, $t2\n";
- }
-
- sub poly_stage2 {
- $code.="mov 8+$r_store, %rax
- mov %rax, $t3
- mul $acc0
- add %rax, $t1
- adc \$0, %rdx
- mov %rdx, $acc0
- mov 8+$r_store, %rax
- mul $acc1
- add %rax, $t2
- adc \$0, %rdx\n";
- }
-
- sub poly_stage3 {
- $code.="imulq $acc2, $t3
- add $acc0, $t2
- adc %rdx, $t3\n";
- }
-
- sub poly_reduce_stage {
- $code.="mov $t0, $acc0
- mov $t1, $acc1
- mov $t2, $acc2
- and \$3, $acc2
- mov $t2, $t0
- and \$-4, $t0
- mov $t3, $t1
- shrd \$2, $t3, $t2
- shr \$2, $t3
- add $t0, $acc0
- adc $t1, $acc1
- adc \$0, $acc2
- add $t2, $acc0
- adc $t3, $acc1
- adc \$0, $acc2\n";
- }
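-
- # The reduction above relies on 2^130 = 5 (mod 2^130-5): the 256-bit product
- # in $t0..$t3 is split into its low 130 bits and the high remainder, and the
- # high part is folded back in as 5*high - added once as 4*high ($t2 with its
- # low two bits masked off, together with $t3) and once as high itself
- # (($t2,$t3) shifted right by 2).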
-
- sub poly_mul {
- &poly_stage1();
- &poly_stage2();
- &poly_stage3();
- &poly_reduce_stage();
- }
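-
- # An uncalled big-integer reference for what poly_add followed by poly_mul
- # computes on a full 16-byte block: acc = (acc + block + 2^128) * r mod
- # 2^130-5. A documentation-only sketch; Math::BigInt is a core module.
- use Math::BigInt;
- sub ref_poly1305_step {
- my ($acc,$r,$block)=@_; # $acc, $r are Math::BigInt; $block is a 16-byte string
- my $p = Math::BigInt->new(2)->bpow(130)->bsub(5);
- # interpret the block as a little-endian number, then set the pad bit
- $acc = $acc->copy->badd(Math::BigInt->new("0x".unpack("H*", scalar reverse $block)));
- $acc->badd(Math::BigInt->new(2)->bpow(128));
- return $acc->bmul($r)->bmod($p);
- }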
-
- sub prep_state {
- my ($n)=@_;
- $code.="movdqa .chacha20_consts(%rip), $A0
- movdqa $state1_store, $B0
- movdqa $state2_store, $C0\n";
- $code.="movdqa $A0, $A1
- movdqa $B0, $B1
- movdqa $C0, $C1\n" if ($n ge 2);
- $code.="movdqa $A0, $A2
- movdqa $B0, $B2
- movdqa $C0, $C2\n" if ($n ge 3);
- $code.="movdqa $A0, $A3
- movdqa $B0, $B3
- movdqa $C0, $C3\n" if ($n ge 4);
- $code.="movdqa $ctr0_store, $D0
- paddd .sse_inc(%rip), $D0
- movdqa $D0, $ctr0_store\n" if ($n eq 1);
- $code.="movdqa $ctr0_store, $D1
- paddd .sse_inc(%rip), $D1
- movdqa $D1, $D0
- paddd .sse_inc(%rip), $D0
- movdqa $D0, $ctr0_store
- movdqa $D1, $ctr1_store\n" if ($n eq 2);
- $code.="movdqa $ctr0_store, $D2
- paddd .sse_inc(%rip), $D2
- movdqa $D2, $D1
- paddd .sse_inc(%rip), $D1
- movdqa $D1, $D0
- paddd .sse_inc(%rip), $D0
- movdqa $D0, $ctr0_store
- movdqa $D1, $ctr1_store
- movdqa $D2, $ctr2_store\n" if ($n eq 3);
- $code.="movdqa $ctr0_store, $D3
- paddd .sse_inc(%rip), $D3
- movdqa $D3, $D2
- paddd .sse_inc(%rip), $D2
- movdqa $D2, $D1
- paddd .sse_inc(%rip), $D1
- movdqa $D1, $D0
- paddd .sse_inc(%rip), $D0
- movdqa $D0, $ctr0_store
- movdqa $D1, $ctr1_store
- movdqa $D2, $ctr2_store
- movdqa $D3, $ctr3_store\n" if ($n eq 4);
- }
-
- sub finalize_state {
- my ($n)=@_;
- $code.="paddd .chacha20_consts(%rip), $A3
- paddd $state1_store, $B3
- paddd $state2_store, $C3
- paddd $ctr3_store, $D3\n" if ($n eq 4);
- $code.="paddd .chacha20_consts(%rip), $A2
- paddd $state1_store, $B2
- paddd $state2_store, $C2
- paddd $ctr2_store, $D2\n" if ($n ge 3);
- $code.="paddd .chacha20_consts(%rip), $A1
- paddd $state1_store, $B1
- paddd $state2_store, $C1
- paddd $ctr1_store, $D1\n" if ($n ge 2);
- $code.="paddd .chacha20_consts(%rip), $A0
- paddd $state1_store, $B0
- paddd $state2_store, $C0
- paddd $ctr0_store, $D0\n";
- }
-
- sub xor_stream {
- my ($A, $B, $C, $D, $offset)=@_;
- $code.="movdqu 0*16 + $offset($inp), $A3
- movdqu 1*16 + $offset($inp), $B3
- movdqu 2*16 + $offset($inp), $C3
- movdqu 3*16 + $offset($inp), $D3
- pxor $A3, $A
- pxor $B3, $B
- pxor $C3, $C
- pxor $D, $D3
- movdqu $A, 0*16 + $offset($oup)
- movdqu $B, 1*16 + $offset($oup)
- movdqu $C, 2*16 + $offset($oup)
- movdqu $D3, 3*16 + $offset($oup)\n";
- }
-
- sub xor_stream_using_temp {
- my ($A, $B, $C, $D, $offset, $temp)=@_;
- $code.="movdqa $temp, $tmp_store
- movdqu 0*16 + $offset($inp), $temp
- pxor $A, $temp
- movdqu $temp, 0*16 + $offset($oup)
- movdqu 1*16 + $offset($inp), $temp
- pxor $B, $temp
- movdqu $temp, 1*16 + $offset($oup)
- movdqu 2*16 + $offset($inp), $temp
- pxor $C, $temp
- movdqu $temp, 2*16 + $offset($oup)
- movdqu 3*16 + $offset($inp), $temp
- pxor $D, $temp
- movdqu $temp, 3*16 + $offset($oup)\n";
- }
-
- sub gen_chacha_round {
- my ($rot1, $rot2, $shift)=@_;
- my $round="";
- $round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
- $round.="movdqa $rot2, $C0
- paddd $B3, $A3
- paddd $B2, $A2
- paddd $B1, $A1
- paddd $B0, $A0
- pxor $A3, $D3
- pxor $A2, $D2
- pxor $A1, $D1
- pxor $A0, $D0
- pshufb $C0, $D3
- pshufb $C0, $D2
- pshufb $C0, $D1
- pshufb $C0, $D0
- movdqa $tmp_store, $C0
- paddd $D3, $C3
- paddd $D2, $C2
- paddd $D1, $C1
- paddd $D0, $C0
- pxor $C3, $B3
- pxor $C2, $B2
- pxor $C1, $B1
- pxor $C0, $B0
- movdqa $C0, $tmp_store
- movdqa $B3, $C0
- psrld \$$rot1, $C0
- pslld \$32-$rot1, $B3
- pxor $C0, $B3
- movdqa $B2, $C0
- psrld \$$rot1, $C0
- pslld \$32-$rot1, $B2
- pxor $C0, $B2
- movdqa $B1, $C0
- psrld \$$rot1, $C0
- pslld \$32-$rot1, $B1
- pxor $C0, $B1
- movdqa $B0, $C0
- psrld \$$rot1, $C0
- pslld \$32-$rot1, $B0
- pxor $C0, $B0\n";
- ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
- ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
- $round.="movdqa $tmp_store, $C0
- palignr \$$s1, $B3, $B3
- palignr \$$s2, $C3, $C3
- palignr \$$s3, $D3, $D3
- palignr \$$s1, $B2, $B2
- palignr \$$s2, $C2, $C2
- palignr \$$s3, $D2, $D2
- palignr \$$s1, $B1, $B1
- palignr \$$s2, $C1, $C1
- palignr \$$s3, $D1, $D1
- palignr \$$s1, $B0, $B0
- palignr \$$s2, $C0, $C0
- palignr \$$s3, $D0, $D0\n"
- if (($shift =~ /left/) || ($shift =~ /right/));
- return $round;
- };
-
- $chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
- &gen_chacha_round(25, ".rol8(%rip)", "left") .
- &gen_chacha_round(20, ".rol16(%rip)") .
- &gen_chacha_round(25, ".rol8(%rip)", "right");
-
- my @loop_body = split /\n/, $chacha_body;
-
- sub emit_body {
- my ($n)=@_;
- for (my $i=0; $i < $n; $i++) {
- $code=$code.shift(@loop_body)."\n";
- };
- }
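-
- # The two-round ChaCha20 body is pre-generated as a flat instruction list in
- # @loop_body, and emit_body(n) emits the next n lines of it. The call sites
- # below interleave the scalar Poly1305 stages between those slices, so the
- # integer multiply chain and the SIMD rounds execute in parallel.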
-
- {
- ################################################################################
- # void poly_hash_ad_internal();
- $code.="
- .type poly_hash_ad_internal,\@function,2
- .align 64
- poly_hash_ad_internal:
- .cfi_startproc
- xor $acc0, $acc0
- xor $acc1, $acc1
- xor $acc2, $acc2
- cmp \$13, $itr2
- jne hash_ad_loop
- poly_fast_tls_ad:
- # Special treatment for the TLS case of 13 bytes
- mov ($adp), $acc0
- mov 5($adp), $acc1
- shr \$24, $acc1
- mov \$1, $acc2\n";
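- # The AEAD construction zero-pads the AD to a full 16-byte Poly1305 block
- # (RFC 8439, section 2.8), so the 13 bytes land in $acc0 (bytes 0-7) and
- # $acc1 (bytes 8-12, via the shift), with the usual pad bit 2^128 in $acc2.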
- &poly_mul(); $code.="
- ret
- hash_ad_loop:
- # Hash the AD in 16-byte chunks
- cmp \$16, $itr2
- jb hash_ad_tail\n";
- &poly_add("0($adp)");
- &poly_mul(); $code.="
- lea 1*16($adp), $adp
- sub \$16, $itr2
- jmp hash_ad_loop
- hash_ad_tail:
- cmp \$0, $itr2
- je 1f
- # Hash the final tail of fewer than 16 bytes
- xor $t0, $t0
- xor $t1, $t1
- xor $t2, $t2
- add $itr2, $adp
- hash_ad_tail_loop:
- shld \$8, $t0, $t1
- shl \$8, $t0
- movzxb -1($adp), $t2
- xor $t2, $t0
- dec $adp
- dec $itr2
- jne hash_ad_tail_loop
-
- add $t0, $acc0
- adc $t1, $acc1
- adc \$1, $acc2\n";
- &poly_mul(); $code.="
- # Finished AD
- 1:
- ret
- .cfi_endproc
- .size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
- }
-
- {
- ################################################################################
- # void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
- $code.="
- .globl chacha20_poly1305_open
- .type chacha20_poly1305_open,\@function,2
- .align 64
- chacha20_poly1305_open:
- .cfi_startproc
- push %rbp
- .cfi_adjust_cfa_offset 8
- push %rbx
- .cfi_adjust_cfa_offset 8
- push %r12
- .cfi_adjust_cfa_offset 8
- push %r13
- .cfi_adjust_cfa_offset 8
- push %r14
- .cfi_adjust_cfa_offset 8
- push %r15
- .cfi_adjust_cfa_offset 8
- # We write the calculated authenticator back to keyp at the end, so save
- # the pointer on the stack too.
- push $keyp
- .cfi_adjust_cfa_offset 8
- sub \$288 + 32, %rsp
- .cfi_adjust_cfa_offset 288 + 32
- .cfi_offset rbp, -16
- .cfi_offset rbx, -24
- .cfi_offset r12, -32
- .cfi_offset r13, -40
- .cfi_offset r14, -48
- .cfi_offset r15, -56
- lea 32(%rsp), %rbp
- and \$-32, %rbp
- mov %rdx, 8+$len_store
- mov %r8, 0+$len_store
- mov %rdx, $inl\n"; $code.="
- mov OPENSSL_ia32cap_P+8(%rip), %eax
- and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
- xor \$`(1<<5) + (1<<8)`, %eax
- jz chacha20_poly1305_open_avx2\n" if ($avx>1);
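- # OPENSSL_ia32cap_P+8 is the CPUID.(EAX=7,ECX=0):EBX word: bit 5 is AVX2 and
- # bit 8 is BMI2. Both are required, since the AVX2 path's Poly1305 uses mulx.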
- $code.="
- 1:
- cmp \$128, $inl
- jbe open_sse_128
- # For long buffers, prepare the poly key first
- movdqa .chacha20_consts(%rip), $A0
- movdqu 0*16($keyp), $B0
- movdqu 1*16($keyp), $C0
- movdqu 2*16($keyp), $D0
- movdqa $D0, $T1
- # Store on stack, to free keyp
- movdqa $B0, $state1_store
- movdqa $C0, $state2_store
- movdqa $D0, $ctr0_store
- mov \$10, $acc0
- 1: \n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
- dec $acc0
- jne 1b
- # A0|B0 hold the Poly1305 32-byte key; C0 and D0 can be discarded
- paddd .chacha20_consts(%rip), $A0
- paddd $state1_store, $B0
- # Clamp and store the key
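- # (the Poly1305 clamp, per RFC 8439: it clears 22 bits of r so that the
- # partial products in poly_mul cannot overflow their 64-bit limbs)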
- pand .clamp(%rip), $A0
- movdqa $A0, $r_store
- movdqa $B0, $s_store
- # Hash
- mov %r8, $itr2
- call poly_hash_ad_internal
- open_sse_main_loop:
- cmp \$16*16, $inl
- jb 2f
- # Load state, increment counter blocks\n";
- &prep_state(4); $code.="
- # The 10 ChaCha20 double-rounds are spread over 10 loop iterations; in 6 of
- # them we hash 2 Poly1305 blocks and in the remaining 4 just 1, for a total
- # of 16 blocks (the 256 bytes processed per main-loop pass)
- mov \$4, $itr1
- mov $inp, $itr2
- 1: \n";
- &emit_body(20);
- &poly_add("0($itr2)"); $code.="
- lea 2*8($itr2), $itr2\n";
- &emit_body(20);
- &poly_stage1();
- &emit_body(20);
- &poly_stage2();
- &emit_body(20);
- &poly_stage3();
- &emit_body(20);
- &poly_reduce_stage();
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- dec $itr1
- jge 1b\n";
- &poly_add("0($itr2)");
- &poly_mul(); $code.="
- lea 2*8($itr2), $itr2
- cmp \$-6, $itr1
- jg 1b\n";
- &finalize_state(4);
- &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
- &xor_stream($A2, $B2, $C2, $D2, "4*16");
- &xor_stream($A1, $B1, $C1, $D1, "8*16");
- &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
- lea 16*16($inp), $inp
- lea 16*16($oup), $oup
- sub \$16*16, $inl
- jmp open_sse_main_loop
- 2:
- # Handle the various tail sizes efficiently
- test $inl, $inl
- jz open_sse_finalize
- cmp \$4*16, $inl
- ja 3f\n";
- ###############################################################################
- # At most 64 bytes are left
- &prep_state(1); $code.="
- xor $itr2, $itr2
- mov $inl, $itr1
- cmp \$16, $itr1
- jb 2f
- 1: \n";
- &poly_add("0($inp, $itr2)");
- &poly_mul(); $code.="
- sub \$16, $itr1
- 2:
- add \$16, $itr2\n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
- cmp \$16, $itr1
- jae 1b
- cmp \$10*16, $itr2
- jne 2b\n";
- &finalize_state(1); $code.="
- jmp open_sse_tail_64_dec_loop
- 3:
- cmp \$8*16, $inl
- ja 3f\n";
- ###############################################################################
- # 65 - 128 bytes are left
- &prep_state(2); $code.="
- mov $inl, $itr1
- and \$-16, $itr1
- xor $itr2, $itr2
- 1: \n";
- &poly_add("0($inp, $itr2)");
- &poly_mul(); $code.="
- 2:
- add \$16, $itr2\n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
- cmp $itr1, $itr2
- jb 1b
- cmp \$10*16, $itr2
- jne 2b\n";
- &finalize_state(2);
- &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
- sub \$4*16, $inl
- lea 4*16($inp), $inp
- lea 4*16($oup), $oup
- jmp open_sse_tail_64_dec_loop
- 3:
- cmp \$12*16, $inl
- ja 3f\n";
- ###############################################################################
- # 129 - 192 bytes are left
- &prep_state(3); $code.="
- mov $inl, $itr1
- mov \$10*16, $itr2
- cmp \$10*16, $itr1
- cmovg $itr2, $itr1
- and \$-16, $itr1
- xor $itr2, $itr2
- 1: \n";
- &poly_add("0($inp, $itr2)");
- &poly_mul(); $code.="
- 2:
- add \$16, $itr2\n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- cmp $itr1, $itr2
- jb 1b
- cmp \$10*16, $itr2
- jne 2b
- cmp \$11*16, $inl
- jb 1f\n";
- &poly_add("10*16($inp)");
- &poly_mul(); $code.="
- cmp \$12*16, $inl
- jb 1f\n";
- &poly_add("11*16($inp)");
- &poly_mul(); $code.="
- 1: \n";
- &finalize_state(3);
- &xor_stream($A2, $B2, $C2, $D2, "0*16");
- &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
- sub \$8*16, $inl
- lea 8*16($inp), $inp
- lea 8*16($oup), $oup
- jmp open_sse_tail_64_dec_loop
- 3:
- ###############################################################################\n";
- # 193 - 255 bytes are left
- &prep_state(4); $code.="
- xor $itr2, $itr2
- 1: \n";
- &poly_add("0($inp, $itr2)");
- &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
- &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
- &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
- &poly_stage1();
- &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
- &poly_stage2();
- &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
- &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
- &poly_stage3();
- &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
- &poly_reduce_stage();
- &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
- add \$16, $itr2
- cmp \$10*16, $itr2
- jb 1b
- mov $inl, $itr1
- and \$-16, $itr1
- 1: \n";
- &poly_add("0($inp, $itr2)");
- &poly_mul(); $code.="
- add \$16, $itr2
- cmp $itr1, $itr2
- jb 1b\n";
- &finalize_state(4);
- &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
- &xor_stream($A2, $B2, $C2, $D2, "4*16");
- &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
- movdqa $tmp_store, $D0
- sub \$12*16, $inl
- lea 12*16($inp), $inp
- lea 12*16($oup), $oup
- ###############################################################################
- # Decrypt the remaining data, 16 bytes at a time, using the existing keystream
- open_sse_tail_64_dec_loop:
- cmp \$16, $inl
- jb 1f
- sub \$16, $inl
- movdqu ($inp), $T0
- pxor $T0, $A0
- movdqu $A0, ($oup)
- lea 16($inp), $inp
- lea 16($oup), $oup
- movdqa $B0, $A0
- movdqa $C0, $B0
- movdqa $D0, $C0
- jmp open_sse_tail_64_dec_loop
- 1:
- movdqa $A0, $A1
-
- # Decrypt the final partial block (fewer than 16 bytes) at the end.
- open_sse_tail_16:
- test $inl, $inl
- jz open_sse_finalize
-
- # Read the final bytes into $T0. They need to be read in reverse order so
- # that they end up in the correct order in $T0.
- pxor $T0, $T0
- lea -1($inp, $inl), $inp
- movq $inl, $itr2
- 2:
- pslldq \$1, $T0
- pinsrb \$0, ($inp), $T0
- sub \$1, $inp
- sub \$1, $itr2
- jnz 2b
-
- 3:
- movq $T0, $t0
- pextrq \$1, $T0, $t1
- # The final bytes of keystream are in $A1.
- pxor $A1, $T0
-
- # Copy the plaintext bytes out.
- 2:
- pextrb \$0, $T0, ($oup)
- psrldq \$1, $T0
- add \$1, $oup
- sub \$1, $inl
- jne 2b
-
- add $t0, $acc0
- adc $t1, $acc1
- adc \$1, $acc2\n";
- &poly_mul(); $code.="
-
- open_sse_finalize:\n";
- &poly_add($len_store);
- &poly_mul(); $code.="
- # Final reduce
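- # This computes acc mod p exactly for p = 2^130-5: subtract p limb-wise (the
- # immediates -5, -1 and 3 are p's three 64-bit limbs) and, if the subtraction
- # borrows (acc < p), roll back to the saved copy with cmovc.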
- mov $acc0, $t0
- mov $acc1, $t1
- mov $acc2, $t2
- sub \$-5, $acc0
- sbb \$-1, $acc1
- sbb \$3, $acc2
- cmovc $t0, $acc0
- cmovc $t1, $acc1
- cmovc $t2, $acc2
- # Add in s part of the key
- add 0+$s_store, $acc0
- adc 8+$s_store, $acc1
-
- add \$288 + 32, %rsp
- .cfi_adjust_cfa_offset -(288 + 32)
- pop $keyp
- .cfi_adjust_cfa_offset -8
- movq $acc0, ($keyp)
- movq $acc1, 8($keyp)
-
- pop %r15
- .cfi_adjust_cfa_offset -8
- pop %r14
- .cfi_adjust_cfa_offset -8
- pop %r13
- .cfi_adjust_cfa_offset -8
- pop %r12
- .cfi_adjust_cfa_offset -8
- pop %rbx
- .cfi_adjust_cfa_offset -8
- pop %rbp
- .cfi_adjust_cfa_offset -8
- ret
- .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
- ###############################################################################
- open_sse_128:
- movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
- movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
- movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
- movdqu 2*16($keyp), $D0
- movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
- movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
- movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
- mov \$10, $acc0
- 1: \n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- dec $acc0
- jnz 1b
- paddd .chacha20_consts(%rip), $A0
- paddd .chacha20_consts(%rip), $A1
- paddd .chacha20_consts(%rip), $A2
- paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
- paddd $T2, $C1\npaddd $T2, $C2
- paddd $T3, $D1
- paddd .sse_inc(%rip), $T3
- paddd $T3, $D2
- # Clamp and store the key
- pand .clamp(%rip), $A0
- movdqa $A0, $r_store
- movdqa $B0, $s_store
- # Hash
- mov %r8, $itr2
- call poly_hash_ad_internal
- 1:
- cmp \$16, $inl
- jb open_sse_tail_16
- sub \$16, $inl\n";
- # Load for hashing
- &poly_add("0*8($inp)"); $code.="
- # Load for decryption
- movdqu 0*16($inp), $T0
- pxor $T0, $A1
- movdqu $A1, 0*16($oup)
- lea 1*16($inp), $inp
- lea 1*16($oup), $oup\n";
- &poly_mul(); $code.="
- # Shift the stream left
- movdqa $B1, $A1
- movdqa $C1, $B1
- movdqa $D1, $C1
- movdqa $A2, $D1
- movdqa $B2, $A2
- movdqa $C2, $B2
- movdqa $D2, $C2
- jmp 1b
- jmp open_sse_tail_16
- .size chacha20_poly1305_open, .-chacha20_poly1305_open
- .cfi_endproc
-
- ################################################################################
- ################################################################################
- # void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
- .globl chacha20_poly1305_seal
- .type chacha20_poly1305_seal,\@function,2
- .align 64
- chacha20_poly1305_seal:
- .cfi_startproc
- push %rbp
- .cfi_adjust_cfa_offset 8
- push %rbx
- .cfi_adjust_cfa_offset 8
- push %r12
- .cfi_adjust_cfa_offset 8
- push %r13
- .cfi_adjust_cfa_offset 8
- push %r14
- .cfi_adjust_cfa_offset 8
- push %r15
- .cfi_adjust_cfa_offset 8
- # We write the calculated authenticator back to keyp at the end, so save
- # the pointer on the stack too.
- push $keyp
- .cfi_adjust_cfa_offset 8
- sub \$288 + 32, %rsp
- .cfi_adjust_cfa_offset 288 + 32
- .cfi_offset rbp, -16
- .cfi_offset rbx, -24
- .cfi_offset r12, -32
- .cfi_offset r13, -40
- .cfi_offset r14, -48
- .cfi_offset r15, -56
- lea 32(%rsp), %rbp
- and \$-32, %rbp
- mov 56($keyp), $inl # extra_in_len
- addq %rdx, $inl
- mov $inl, 8+$len_store
- mov %r8, 0+$len_store
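- # The final Poly1305 length block covers |AD| (low 8 bytes) and the total
- # ciphertext length (high 8 bytes); any extra_in bytes count as ciphertext.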
- mov %rdx, $inl\n"; $code.="
- mov OPENSSL_ia32cap_P+8(%rip), %eax
- and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
- xor \$`(1<<5) + (1<<8)`, %eax
- jz chacha20_poly1305_seal_avx2\n" if ($avx>1);
- $code.="
- cmp \$128, $inl
- jbe seal_sse_128
- # For longer buffers, prepare the poly key + some stream
- movdqa .chacha20_consts(%rip), $A0
- movdqu 0*16($keyp), $B0
- movdqu 1*16($keyp), $C0
- movdqu 2*16($keyp), $D0
- movdqa $A0, $A1
- movdqa $A0, $A2
- movdqa $A0, $A3
- movdqa $B0, $B1
- movdqa $B0, $B2
- movdqa $B0, $B3
- movdqa $C0, $C1
- movdqa $C0, $C2
- movdqa $C0, $C3
- movdqa $D0, $D3
- paddd .sse_inc(%rip), $D0
- movdqa $D0, $D2
- paddd .sse_inc(%rip), $D0
- movdqa $D0, $D1
- paddd .sse_inc(%rip), $D0
- # Store on stack
- movdqa $B0, $state1_store
- movdqa $C0, $state2_store
- movdqa $D0, $ctr0_store
- movdqa $D1, $ctr1_store
- movdqa $D2, $ctr2_store
- movdqa $D3, $ctr3_store
- mov \$10, $acc0
- 1: \n";
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- dec $acc0
- jnz 1b\n";
- &finalize_state(4); $code.="
- # Clamp and store the key
- pand .clamp(%rip), $A3
- movdqa $A3, $r_store
- movdqa $B3, $s_store
- # Hash
- mov %r8, $itr2
- call poly_hash_ad_internal\n";
- &xor_stream($A2,$B2,$C2,$D2,"0*16");
- &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
- cmp \$12*16, $inl
- ja 1f
- mov \$8*16, $itr1
- sub \$8*16, $inl
- lea 8*16($inp), $inp
- jmp seal_sse_128_seal_hash
- 1: \n";
- &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
- mov \$12*16, $itr1
- sub \$12*16, $inl
- lea 12*16($inp), $inp
- mov \$2, $itr1
- mov \$8, $itr2
- cmp \$4*16, $inl
- jbe seal_sse_tail_64
- cmp \$8*16, $inl
- jbe seal_sse_tail_128
- cmp \$12*16, $inl
- jbe seal_sse_tail_192
-
- 1: \n";
- # The main loop
- &prep_state(4); $code.="
- 2: \n";
- &emit_body(20);
- &poly_add("0($oup)");
- &emit_body(20);
- &poly_stage1();
- &emit_body(20);
- &poly_stage2();
- &emit_body(20);
- &poly_stage3();
- &emit_body(20);
- &poly_reduce_stage();
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- lea 16($oup), $oup
- dec $itr2
- jge 2b\n";
- &poly_add("0*8($oup)");
- &poly_mul(); $code.="
- lea 16($oup), $oup
- dec $itr1
- jg 2b\n";
-
- &finalize_state(4);$code.="
- movdqa $D2, $tmp_store\n";
- &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
- movdqa $tmp_store, $D2\n";
- &xor_stream($A2,$B2,$C2,$D2, 4*16);
- &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
- cmp \$16*16, $inl
- ja 3f
-
- mov \$12*16, $itr1
- sub \$12*16, $inl
- lea 12*16($inp), $inp
- jmp seal_sse_128_seal_hash
- 3: \n";
- &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
- lea 16*16($inp), $inp
- sub \$16*16, $inl
- mov \$6, $itr1
- mov \$4, $itr2
- cmp \$12*16, $inl
- jg 1b
- mov $inl, $itr1
- test $inl, $inl
- je seal_sse_128_seal_hash
- mov \$6, $itr1
- cmp \$4*16, $inl
- jg 3f
- ###############################################################################
- seal_sse_tail_64:\n";
- &prep_state(1); $code.="
- 1: \n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- lea 16($oup), $oup
- 2: \n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- lea 16($oup), $oup
- dec $itr1
- jg 1b
- dec $itr2
- jge 2b\n";
- &finalize_state(1); $code.="
- jmp seal_sse_128_seal
- 3:
- cmp \$8*16, $inl
- jg 3f
- ###############################################################################
- seal_sse_tail_128:\n";
- &prep_state(2); $code.="
- 1: \n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- lea 16($oup), $oup
- 2: \n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
- &poly_add("0($oup)");
- &poly_mul();
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
- lea 16($oup), $oup
- dec $itr1
- jg 1b
- dec $itr2
- jge 2b\n";
- &finalize_state(2);
- &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
- mov \$4*16, $itr1
- sub \$4*16, $inl
- lea 4*16($inp), $inp
- jmp seal_sse_128_seal_hash
- 3:
- ###############################################################################
- seal_sse_tail_192:\n";
- &prep_state(3); $code.="
- 1: \n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- lea 16($oup), $oup
- 2: \n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
- &poly_add("0($oup)");
- &poly_mul();
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- lea 16($oup), $oup
- dec $itr1
- jg 1b
- dec $itr2
- jge 2b\n";
- &finalize_state(3);
- &xor_stream($A2,$B2,$C2,$D2,0*16);
- &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
- mov \$8*16, $itr1
- sub \$8*16, $inl
- lea 8*16($inp), $inp
- ###############################################################################
- seal_sse_128_seal_hash:
- cmp \$16, $itr1
- jb seal_sse_128_seal\n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- sub \$16, $itr1
- lea 16($oup), $oup
- jmp seal_sse_128_seal_hash
-
- seal_sse_128_seal:
- cmp \$16, $inl
- jb seal_sse_tail_16
- sub \$16, $inl
- # Load for encryption
- movdqu 0*16($inp), $T0
- pxor $T0, $A0
- movdqu $A0, 0*16($oup)
- # Then hash
- add 0*8($oup), $acc0
- adc 1*8($oup), $acc1
- adc \$1, $acc2
- lea 1*16($inp), $inp
- lea 1*16($oup), $oup\n";
- &poly_mul(); $code.="
- # Shift the stream left
- movdqa $B0, $A0
- movdqa $C0, $B0
- movdqa $D0, $C0
- movdqa $A1, $D0
- movdqa $B1, $A1
- movdqa $C1, $B1
- movdqa $D1, $C1
- jmp seal_sse_128_seal
-
- seal_sse_tail_16:
- test $inl, $inl
- jz process_blocks_of_extra_in
- # We can only load the PT one byte at a time to avoid buffer overread
- mov $inl, $itr2
- mov $inl, $itr1
- lea -1($inp, $inl), $inp
- pxor $T3, $T3
- 1:
- pslldq \$1, $T3
- pinsrb \$0, ($inp), $T3
- lea -1($inp), $inp
- dec $itr1
- jne 1b
-
- # XOR the keystream with the plaintext.
- pxor $A0, $T3
-
- # Write ciphertext out, byte-by-byte.
- movq $inl, $itr1
- movdqu $T3, $A0
- 2:
- pextrb \$0, $A0, ($oup)
- psrldq \$1, $A0
- add \$1, $oup
- sub \$1, $itr1
- jnz 2b
-
- # $T3 contains the final (partial, non-empty) block of ciphertext which
- # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
- # are valid. We need to fill it with extra_in bytes until full, or until we
- # run out of bytes.
- #
- # $keyp points to the tag output, which is actually a struct with the
- # extra_in pointer and length at offset 48.
- movq 288+32(%rsp), $keyp
- movq 56($keyp), $t1 # extra_in_len
- movq 48($keyp), $t0 # extra_in
- test $t1, $t1
- jz process_partial_block # Common case: no bytes of extra_in
-
- movq \$16, $t2
- subq $inl, $t2 # 16-$inl is the number of bytes that fit into $T3.
- cmpq $t2, $t1 # if extra_in_len < 16-$inl, only copy extra_in_len
- # (note that AT&T syntax reverses the arguments)
- jge load_extra_in
- movq $t1, $t2
-
- load_extra_in:
- # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
- # into $T3. They are loaded in reverse order.
- leaq -1($t0, $t2), $inp
- # Update extra_in and extra_in_len to reflect the bytes that are about to
- # be read.
- addq $t2, $t0
- subq $t2, $t1
- movq $t0, 48($keyp)
- movq $t1, 56($keyp)
-
- # Update $itr2, which is used to select the mask later on, to reflect the
- # extra bytes about to be added.
- addq $t2, $itr2
-
- # Load $t2 bytes of extra_in into $T2.
- pxor $T2, $T2
- 3:
- pslldq \$1, $T2
- pinsrb \$0, ($inp), $T2
- lea -1($inp), $inp
- sub \$1, $t2
- jnz 3b
-
- # Shift $T2 up the length of the remainder from the main encryption. Sadly,
- # the shift for an XMM register has to be a constant, thus we loop to do
- # this.
- movq $inl, $t2
-
- 4:
- pslldq \$1, $T2
- sub \$1, $t2
- jnz 4b
-
- # Mask $T3 (the remainder from the main encryption) so that superfluous
- # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
- # disjoint and so we can merge them with an OR.
- lea .and_masks(%rip), $t2
- shl \$4, $inl
- pand -16($t2, $inl), $T3
-
- # Merge $T2 into $T3, forming the remainder block.
- por $T2, $T3
-
- # The block of ciphertext + extra_in is ready to be included in the
- # Poly1305 state.
- movq $T3, $t0
- pextrq \$1, $T3, $t1
- add $t0, $acc0
- adc $t1, $acc1
- adc \$1, $acc2\n";
- &poly_mul(); $code.="
-
- process_blocks_of_extra_in:
- # There may be additional bytes of extra_in to process.
- movq 288+32(%rsp), $keyp
- movq 48($keyp), $inp # extra_in
- movq 56($keyp), $itr2 # extra_in_len
- movq $itr2, $itr1
- shr \$4, $itr2 # number of blocks
-
- 5:
- jz process_extra_in_trailer\n";
- &poly_add("0($inp)");
- &poly_mul(); $code.="
- leaq 16($inp), $inp
- subq \$1, $itr2
- jmp 5b
-
- process_extra_in_trailer:
- andq \$15, $itr1 # remaining num bytes (<16) of extra_in
- movq $itr1, $inl
- jz do_length_block
- leaq -1($inp, $itr1), $inp
-
- 6:
- pslldq \$1, $T3
- pinsrb \$0, ($inp), $T3
- lea -1($inp), $inp
- sub \$1, $itr1
- jnz 6b
-
- process_partial_block:
- # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
- lea .and_masks(%rip), $t2
- shl \$4, $inl
- pand -16($t2, $inl), $T3
- movq $T3, $t0
- pextrq \$1, $T3, $t1
- add $t0, $acc0
- adc $t1, $acc1
- adc \$1, $acc2\n";
- &poly_mul(); $code.="
-
- do_length_block:\n";
- &poly_add($len_store);
- &poly_mul(); $code.="
- # Final reduce
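- # (the same limb-wise subtraction of p = 2^130-5 with cmovc rollback as in
- # the open path above)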
- mov $acc0, $t0
- mov $acc1, $t1
- mov $acc2, $t2
- sub \$-5, $acc0
- sbb \$-1, $acc1
- sbb \$3, $acc2
- cmovc $t0, $acc0
- cmovc $t1, $acc1
- cmovc $t2, $acc2
- # Add in s part of the key
- add 0+$s_store, $acc0
- adc 8+$s_store, $acc1
-
- add \$288 + 32, %rsp
- .cfi_adjust_cfa_offset -(288 + 32)
- pop $keyp
- .cfi_adjust_cfa_offset -8
- mov $acc0, 0*8($keyp)
- mov $acc1, 1*8($keyp)
-
- pop %r15
- .cfi_adjust_cfa_offset -8
- pop %r14
- .cfi_adjust_cfa_offset -8
- pop %r13
- .cfi_adjust_cfa_offset -8
- pop %r12
- .cfi_adjust_cfa_offset -8
- pop %rbx
- .cfi_adjust_cfa_offset -8
- pop %rbp
- .cfi_adjust_cfa_offset -8
- ret
- .cfi_adjust_cfa_offset (8 * 6) + 288 + 32
- ################################################################################
- seal_sse_128:
- movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
- movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
- movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
- movdqu 2*16($keyp), $D2
- movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
- movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
- movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
- mov \$10, $acc0
- 1:\n";
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
- &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- dec $acc0
- jnz 1b
- paddd .chacha20_consts(%rip), $A0
- paddd .chacha20_consts(%rip), $A1
- paddd .chacha20_consts(%rip), $A2
- paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
- paddd $T2, $C0\npaddd $T2, $C1
- paddd $T3, $D0
- paddd .sse_inc(%rip), $T3
- paddd $T3, $D1
- # Clamp and store the key
- pand .clamp(%rip), $A2
- movdqa $A2, $r_store
- movdqa $B2, $s_store
- # Hash
- mov %r8, $itr2
- call poly_hash_ad_internal
- jmp seal_sse_128_seal
- .size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
- }
-
- # There should have been a cfi_endproc at the end of that function, but the two
- # following blocks of code are jumped to without a stack frame and the CFI
- # context which they are used in happens to match the CFI context at the end of
- # the previous function. So the CFI table is just extended to the end of them.
-
- if ($avx>1) {
-
- ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
- my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
- ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
- $state1_store="2*32(%rbp)";
- $state2_store="3*32(%rbp)";
- $tmp_store="4*32(%rbp)";
- $ctr0_store="5*32(%rbp)";
- $ctr1_store="6*32(%rbp)";
- $ctr2_store="7*32(%rbp)";
- $ctr3_store="8*32(%rbp)";
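-
- # The 16-byte r, s and len slots at 0, 1 and 2*16(%rbp) keep their SSE
- # layout; only the ChaCha20 state and counter slots are redefined as 32-byte
- # ymm-sized slots, starting at 2*32(%rbp), just past the len slot.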
-
- sub chacha_qr_avx2 {
- my ($a,$b,$c,$d,$t,$dir)=@_;
- $code.=<<___ if ($dir =~ /store/);
- vmovdqa $t, $tmp_store
- ___
- $code.=<<___;
- vpaddd $b, $a, $a
- vpxor $a, $d, $d
- vpshufb .rol16(%rip), $d, $d
- vpaddd $d, $c, $c
- vpxor $c, $b, $b
- vpsrld \$20, $b, $t
- vpslld \$12, $b, $b
- vpxor $t, $b, $b
- vpaddd $b, $a, $a
- vpxor $a, $d, $d
- vpshufb .rol8(%rip), $d, $d
- vpaddd $d, $c, $c
- vpxor $c, $b, $b
- vpslld \$7, $b, $t
- vpsrld \$25, $b, $b
- vpxor $t, $b, $b
- ___
- $code.=<<___ if ($dir =~ /left/);
- vpalignr \$12, $d, $d, $d
- vpalignr \$8, $c, $c, $c
- vpalignr \$4, $b, $b, $b
- ___
- $code.=<<___ if ($dir =~ /right/);
- vpalignr \$4, $d, $d, $d
- vpalignr \$8, $c, $c, $c
- vpalignr \$12, $b, $b, $b
- ___
- $code.=<<___ if ($dir =~ /load/);
- vmovdqa $tmp_store, $t
- ___
- }
-
- sub prep_state_avx2 {
- my ($n)=@_;
- $code.=<<___;
- vmovdqa .chacha20_consts(%rip), $A0
- vmovdqa $state1_store, $B0
- vmovdqa $state2_store, $C0
- ___
- $code.=<<___ if ($n ge 2);
- vmovdqa $A0, $A1
- vmovdqa $B0, $B1
- vmovdqa $C0, $C1
- ___
- $code.=<<___ if ($n ge 3);
- vmovdqa $A0, $A2
- vmovdqa $B0, $B2
- vmovdqa $C0, $C2
- ___
- $code.=<<___ if ($n ge 4);
- vmovdqa $A0, $A3
- vmovdqa $B0, $B3
- vmovdqa $C0, $C3
- ___
- $code.=<<___ if ($n eq 1);
- vmovdqa .avx2_inc(%rip), $D0
- vpaddd $ctr0_store, $D0, $D0
- vmovdqa $D0, $ctr0_store
- ___
- $code.=<<___ if ($n eq 2);
- vmovdqa .avx2_inc(%rip), $D0
- vpaddd $ctr0_store, $D0, $D1
- vpaddd $D1, $D0, $D0
- vmovdqa $D0, $ctr0_store
- vmovdqa $D1, $ctr1_store
- ___
- $code.=<<___ if ($n eq 3);
- vmovdqa .avx2_inc(%rip), $D0
- vpaddd $ctr0_store, $D0, $D2
- vpaddd $D2, $D0, $D1
- vpaddd $D1, $D0, $D0
- vmovdqa $D0, $ctr0_store
- vmovdqa $D1, $ctr1_store
- vmovdqa $D2, $ctr2_store
- ___
- $code.=<<___ if ($n eq 4);
- vmovdqa .avx2_inc(%rip), $D0
- vpaddd $ctr0_store, $D0, $D3
- vpaddd $D3, $D0, $D2
- vpaddd $D2, $D0, $D1
- vpaddd $D1, $D0, $D0
- vmovdqa $D3, $ctr3_store
- vmovdqa $D2, $ctr2_store
- vmovdqa $D1, $ctr1_store
- vmovdqa $D0, $ctr0_store
- ___
- }
-
- sub finalize_state_avx2 {
- my ($n)=@_;
- $code.=<<___ if ($n eq 4);
- vpaddd .chacha20_consts(%rip), $A3, $A3
- vpaddd $state1_store, $B3, $B3
- vpaddd $state2_store, $C3, $C3
- vpaddd $ctr3_store, $D3, $D3
- ___
- $code.=<<___ if ($n ge 3);
- vpaddd .chacha20_consts(%rip), $A2, $A2
- vpaddd $state1_store, $B2, $B2
- vpaddd $state2_store, $C2, $C2
- vpaddd $ctr2_store, $D2, $D2
- ___
- $code.=<<___ if ($n ge 2);
- vpaddd .chacha20_consts(%rip), $A1, $A1
- vpaddd $state1_store, $B1, $B1
- vpaddd $state2_store, $C1, $C1
- vpaddd $ctr1_store, $D1, $D1
- ___
- $code.=<<___;
- vpaddd .chacha20_consts(%rip), $A0, $A0
- vpaddd $state1_store, $B0, $B0
- vpaddd $state2_store, $C0, $C0
- vpaddd $ctr0_store, $D0, $D0
- ___
- }
-
- sub xor_stream_avx2 {
- my ($A, $B, $C, $D, $offset, $hlp)=@_;
- $code.=<<___;
- vperm2i128 \$0x02, $A, $B, $hlp
- vperm2i128 \$0x13, $A, $B, $B
- vperm2i128 \$0x02, $C, $D, $A
- vperm2i128 \$0x13, $C, $D, $C
- vpxor 0*32+$offset($inp), $hlp, $hlp
- vpxor 1*32+$offset($inp), $A, $A
- vpxor 2*32+$offset($inp), $B, $B
- vpxor 3*32+$offset($inp), $C, $C
- vmovdqu $hlp, 0*32+$offset($oup)
- vmovdqu $A, 1*32+$offset($oup)
- vmovdqu $B, 2*32+$offset($oup)
- vmovdqu $C, 3*32+$offset($oup)
- ___
- }
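-
- # In the AVX2 path each ymm row holds one ChaCha20 state row for two
- # consecutive blocks, one per 128-bit lane. The vperm2i128 shuffles above
- # regather the lanes (imm 0x02 takes the two low lanes, 0x13 the two high
- # lanes) so the two 64-byte blocks are XORed and stored in natural order.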
-
- sub finish_stream_avx2 {
- my ($A, $B, $C, $D, $hlp)=@_;
- $code.=<<___;
- vperm2i128 \$0x13, $A, $B, $hlp
- vperm2i128 \$0x02, $A, $B, $A
- vperm2i128 \$0x02, $C, $D, $B
- vperm2i128 \$0x13, $C, $D, $D
- vmovdqa $hlp, $C
- ___
- }
-
- sub poly_stage1_mulx {
- $code.=<<___;
- mov 0+$r_store, %rdx
- mov %rdx, $t2
- mulx $acc0, $t0, $t1
- mulx $acc1, %rax, %rdx
- imulq $acc2, $t2
- add %rax, $t1
- adc %rdx, $t2
- ___
- }
-
- sub poly_stage2_mulx {
- $code.=<<___;
- mov 8+$r_store, %rdx
- mulx $acc0, $acc0, %rax
- add $acc0, $t1
- mulx $acc1, $acc1, $t3
- adc $acc1, $t2
- adc \$0, $t3
- imulq $acc2, %rdx
- ___
- }
-
- sub poly_stage3_mulx {
- $code.=<<___;
- add %rax, $t2
- adc %rdx, $t3
- ___
- }
-
- sub poly_mul_mulx {
- &poly_stage1_mulx();
- &poly_stage2_mulx();
- &poly_stage3_mulx();
- &poly_reduce_stage();
- }
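-
- # The mulx variants rely on BMI2: mulx performs a 64x64->128 multiply without
- # touching the flags, so the multiplies can be scheduled freely in between
- # the add/adc carry chains of the other stages.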
-
- sub gen_chacha_round_avx2 {
- my ($rot1, $rot2, $shift)=@_;
- my $round="";
- $round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
- $round=$round ."vmovdqa $rot2, $C0
- vpaddd $B3, $A3, $A3
- vpaddd $B2, $A2, $A2
- vpaddd $B1, $A1, $A1
- vpaddd $B0, $A0, $A0
- vpxor $A3, $D3, $D3
- vpxor $A2, $D2, $D2
- vpxor $A1, $D1, $D1
- vpxor $A0, $D0, $D0
- vpshufb $C0, $D3, $D3
- vpshufb $C0, $D2, $D2
- vpshufb $C0, $D1, $D1
- vpshufb $C0, $D0, $D0
- vmovdqa $tmp_store, $C0
- vpaddd $D3, $C3, $C3
- vpaddd $D2, $C2, $C2
- vpaddd $D1, $C1, $C1
- vpaddd $D0, $C0, $C0
- vpxor $C3, $B3, $B3
- vpxor $C2, $B2, $B2
- vpxor $C1, $B1, $B1
- vpxor $C0, $B0, $B0
- vmovdqa $C0, $tmp_store
- vpsrld \$$rot1, $B3, $C0
- vpslld \$32-$rot1, $B3, $B3
- vpxor $C0, $B3, $B3
- vpsrld \$$rot1, $B2, $C0
- vpslld \$32-$rot1, $B2, $B2
- vpxor $C0, $B2, $B2
- vpsrld \$$rot1, $B1, $C0
- vpslld \$32-$rot1, $B1, $B1
- vpxor $C0, $B1, $B1
- vpsrld \$$rot1, $B0, $C0
- vpslld \$32-$rot1, $B0, $B0
- vpxor $C0, $B0, $B0\n";
- ($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
- ($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
- $round=$round ."vmovdqa $tmp_store, $C0
- vpalignr \$$s1, $B3, $B3, $B3
- vpalignr \$$s2, $C3, $C3, $C3
- vpalignr \$$s3, $D3, $D3, $D3
- vpalignr \$$s1, $B2, $B2, $B2
- vpalignr \$$s2, $C2, $C2, $C2
- vpalignr \$$s3, $D2, $D2, $D2
- vpalignr \$$s1, $B1, $B1, $B1
- vpalignr \$$s2, $C1, $C1, $C1
- vpalignr \$$s3, $D1, $D1, $D1
- vpalignr \$$s1, $B0, $B0, $B0
- vpalignr \$$s2, $C0, $C0, $C0
- vpalignr \$$s3, $D0, $D0, $D0\n"
- if (($shift =~ /left/) || ($shift =~ /right/));
- return $round;
- };
-
- $chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
- &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
- &gen_chacha_round_avx2(20, ".rol16(%rip)") .
- &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");
-
- @loop_body = split /\n/, $chacha_body;
-
- $code.="
- ###############################################################################
- .type chacha20_poly1305_open_avx2,\@function,2
- .align 64
- chacha20_poly1305_open_avx2:
- vzeroupper
- vmovdqa .chacha20_consts(%rip), $A0
- vbroadcasti128 0*16($keyp), $B0
- vbroadcasti128 1*16($keyp), $C0
- vbroadcasti128 2*16($keyp), $D0
- vpaddd .avx2_init(%rip), $D0, $D0
- cmp \$6*32, $inl
- jbe open_avx2_192
- cmp \$10*32, $inl
- jbe open_avx2_320
-
- vmovdqa $B0, $state1_store
- vmovdqa $C0, $state2_store
- vmovdqa $D0, $ctr0_store
- mov \$10, $acc0
- 1: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
- dec $acc0
- jne 1b
- vpaddd .chacha20_consts(%rip), $A0, $A0
- vpaddd $state1_store, $B0, $B0
- vpaddd $state2_store, $C0, $C0
- vpaddd $ctr0_store, $D0, $D0
-
- vperm2i128 \$0x02, $A0, $B0, $T0
- # Clamp and store key
- vpand .clamp(%rip), $T0, $T0
- vmovdqa $T0, $r_store
- # Stream for the first 64 bytes
- vperm2i128 \$0x13, $A0, $B0, $A0
- vperm2i128 \$0x13, $C0, $D0, $B0
- # Hash AD + first 64 bytes
- mov %r8, $itr2
- call poly_hash_ad_internal
- xor $itr1, $itr1
- # Hash first 64 bytes
- 1: \n";
- &poly_add("0($inp, $itr1)");
- &poly_mul(); $code.="
- add \$16, $itr1
- cmp \$2*32, $itr1
- jne 1b
- # Decrypt first 64 bytes
- vpxor 0*32($inp), $A0, $A0
- vpxor 1*32($inp), $B0, $B0
- vmovdqu $A0, 0*32($oup)
- vmovdqu $B0, 1*32($oup)
- lea 2*32($inp), $inp
- lea 2*32($oup), $oup
- sub \$2*32, $inl
- 1:
- # Hash and decrypt 512 bytes each iteration
- cmp \$16*32, $inl
- jb 3f\n";
- &prep_state_avx2(4); $code.="
- xor $itr1, $itr1
- 2: \n";
- &poly_add("0*8($inp, $itr1)");
- &emit_body(10);
- &poly_stage1_mulx();
- &emit_body(9);
- &poly_stage2_mulx();
- &emit_body(12);
- &poly_stage3_mulx();
- &emit_body(10);
- &poly_reduce_stage();
- &emit_body(9);
- &poly_add("2*8($inp, $itr1)");
- &emit_body(8);
- &poly_stage1_mulx();
- &emit_body(18);
- &poly_stage2_mulx();
- &emit_body(18);
- &poly_stage3_mulx();
- &emit_body(9);
- &poly_reduce_stage();
- &emit_body(8);
- &poly_add("4*8($inp, $itr1)"); $code.="
- lea 6*8($itr1), $itr1\n";
- &emit_body(18);
- &poly_stage1_mulx();
- &emit_body(8);
- &poly_stage2_mulx();
- &emit_body(8);
- &poly_stage3_mulx();
- &emit_body(18);
- &poly_reduce_stage();
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- cmp \$10*6*8, $itr1
- jne 2b\n";
- &finalize_state_avx2(4); $code.="
- vmovdqa $A0, $tmp_store\n";
- &poly_add("10*6*8($inp)");
- &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
- vmovdqa $tmp_store, $A0\n";
- &poly_mul();
- &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
- &poly_add("10*6*8+2*8($inp)");
- &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
- &poly_mul();
- &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
- lea 16*32($inp), $inp
- lea 16*32($oup), $oup
- sub \$16*32, $inl
- jmp 1b
- 3:
- test $inl, $inl
- vzeroupper
- je open_sse_finalize
- 3:
- cmp \$4*32, $inl
- ja 3f\n";
- ###############################################################################
- # 1-128 bytes left
- &prep_state_avx2(1); $code.="
- xor $itr2, $itr2
- mov $inl, $itr1
- and \$-16, $itr1
- test $itr1, $itr1
- je 2f
- 1: \n";
- &poly_add("0*8($inp, $itr2)");
- &poly_mul(); $code.="
- 2:
- add \$16, $itr2\n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
- cmp $itr1, $itr2
- jb 1b
- cmp \$160, $itr2
- jne 2b\n";
- &finalize_state_avx2(1);
- &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
- jmp open_avx2_tail_loop
- 3:
- cmp \$8*32, $inl
- ja 3f\n";
- ###############################################################################
- # 129-256 bytes left
- &prep_state_avx2(2); $code.="
- mov $inl, $tmp_store
- mov $inl, $itr1
- sub \$4*32, $itr1
- shr \$4, $itr1
- mov \$10, $itr2
- cmp \$10, $itr1
- cmovg $itr2, $itr1
- mov $inp, $inl
- xor $itr2, $itr2
- 1: \n";
- &poly_add("0*8($inl)");
- &poly_mul_mulx(); $code.="
- lea 16($inl), $inl
- 2: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
- inc $itr2\n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- cmp $itr1, $itr2
- jb 1b
- cmp \$10, $itr2
- jne 2b
- mov $inl, $itr2
- sub $inp, $inl
- mov $inl, $itr1
- mov $tmp_store, $inl
- 1:
- add \$16, $itr1
- cmp $inl, $itr1
- jg 1f\n";
- &poly_add("0*8($itr2)");
- &poly_mul_mulx(); $code.="
- lea 16($itr2), $itr2
- jmp 1b
- 1: \n";
- &finalize_state_avx2(2);
- &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
- &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
- lea 4*32($inp), $inp
- lea 4*32($oup), $oup
- sub \$4*32, $inl
- jmp open_avx2_tail_loop
- 3:
- cmp \$12*32, $inl
- ja 3f\n";
- ###############################################################################
- # 257-383 bytes left
- &prep_state_avx2(3); $code.="
- mov $inl, $tmp_store
- mov $inl, $itr1
- sub \$8*32, $itr1
- shr \$4, $itr1
- add \$6, $itr1
- mov \$10, $itr2
- cmp \$10, $itr1
- cmovg $itr2, $itr1
- mov $inp, $inl
- xor $itr2, $itr2
- 1: \n";
- &poly_add("0*8($inl)");
- &poly_mul_mulx(); $code.="
- lea 16($inl), $inl
- 2: \n";
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &poly_add("0*8($inl)");
- &poly_mul(); $code.="
- lea 16($inl), $inl
- inc $itr2\n";
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
- cmp $itr1, $itr2
- jb 1b
- cmp \$10, $itr2
- jne 2b
- mov $inl, $itr2
- sub $inp, $inl
- mov $inl, $itr1
- mov $tmp_store, $inl
- 1:
- add \$16, $itr1
- cmp $inl, $itr1
- jg 1f\n";
- &poly_add("0*8($itr2)");
- &poly_mul_mulx(); $code.="
- lea 16($itr2), $itr2
- jmp 1b
- 1: \n";
- &finalize_state_avx2(3);
- &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
- &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
- &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
- lea 8*32($inp), $inp
- lea 8*32($oup), $oup
- sub \$8*32, $inl
- jmp open_avx2_tail_loop
- 3: \n";
- ###############################################################################
- # 384-512 bytes left
- &prep_state_avx2(4); $code.="
- xor $itr1, $itr1
- mov $inp, $itr2
- 1: \n";
- &poly_add("0*8($itr2)");
- &poly_mul(); $code.="
- lea 2*8($itr2), $itr2
- 2: \n";
- &emit_body(37);
- &poly_add("0*8($itr2)");
- &poly_mul_mulx();
- &emit_body(48);
- &poly_add("2*8($itr2)");
- &poly_mul_mulx(); $code.="
- lea 4*8($itr2), $itr2\n";
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- inc $itr1
- cmp \$4, $itr1
- jl 1b
- cmp \$10, $itr1
- jne 2b
- mov $inl, $itr1
- sub \$12*32, $itr1
- and \$-16, $itr1
- 1:
- test $itr1, $itr1
- je 1f\n";
- &poly_add("0*8($itr2)");
- &poly_mul_mulx(); $code.="
- lea 2*8($itr2), $itr2
- sub \$2*8, $itr1
- jmp 1b
- 1: \n";
- &finalize_state_avx2(4); $code.="
- vmovdqa $A0, $tmp_store\n";
- &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
- vmovdqa $tmp_store, $A0\n";
- &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
- &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
- &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
- lea 12*32($inp), $inp
- lea 12*32($oup), $oup
- sub \$12*32, $inl
- open_avx2_tail_loop:
- cmp \$32, $inl
- jb open_avx2_tail
- sub \$32, $inl
- vpxor ($inp), $A0, $A0
- vmovdqu $A0, ($oup)
- lea 1*32($inp), $inp
- lea 1*32($oup), $oup
- vmovdqa $B0, $A0
- vmovdqa $C0, $B0
- vmovdqa $D0, $C0
- jmp open_avx2_tail_loop
- open_avx2_tail:
- cmp \$16, $inl
- vmovdqa $A0x, $A1x
- jb 1f
- sub \$16, $inl
- # Load for decryption
- vpxor ($inp), $A0x, $A1x
- vmovdqu $A1x, ($oup)
- lea 1*16($inp), $inp
- lea 1*16($oup), $oup
- vperm2i128 \$0x11, $A0, $A0, $A0
- vmovdqa $A0x, $A1x
- 1:
- vzeroupper
- jmp open_sse_tail_16
- ###############################################################################
- open_avx2_192:
- vmovdqa $A0, $A1
- vmovdqa $A0, $A2
- vmovdqa $B0, $B1
- vmovdqa $B0, $B2
- vmovdqa $C0, $C1
- vmovdqa $C0, $C2
- vpaddd .avx2_inc(%rip), $D0, $D1
- vmovdqa $D0, $T2
- vmovdqa $D1, $T3
- mov \$10, $acc0
- 1: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
- dec $acc0
- jne 1b
- vpaddd $A2, $A0, $A0
- vpaddd $A2, $A1, $A1
- vpaddd $B2, $B0, $B0
- vpaddd $B2, $B1, $B1
- vpaddd $C2, $C0, $C0
- vpaddd $C2, $C1, $C1
- vpaddd $T2, $D0, $D0
- vpaddd $T3, $D1, $D1
- vperm2i128 \$0x02, $A0, $B0, $T0
- # Clamp and store the key
- vpand .clamp(%rip), $T0, $T0
- vmovdqa $T0, $r_store
- # Stream for up to 192 bytes
- vperm2i128 \$0x13, $A0, $B0, $A0
- vperm2i128 \$0x13, $C0, $D0, $B0
- vperm2i128 \$0x02, $A1, $B1, $C0
- vperm2i128 \$0x02, $C1, $D1, $D0
- vperm2i128 \$0x13, $A1, $B1, $A1
- vperm2i128 \$0x13, $C1, $D1, $B1
- open_avx2_short:
- mov %r8, $itr2
- call poly_hash_ad_internal
- open_avx2_hash_and_xor_loop:
- cmp \$32, $inl
- jb open_avx2_short_tail_32
- sub \$32, $inl\n";
- # Load + hash
- &poly_add("0*8($inp)");
- &poly_mul();
- &poly_add("2*8($inp)");
- &poly_mul(); $code.="
- # Load + decrypt
- vpxor ($inp), $A0, $A0
- vmovdqu $A0, ($oup)
- lea 1*32($inp), $inp
- lea 1*32($oup), $oup
- # Shift stream
- vmovdqa $B0, $A0
- vmovdqa $C0, $B0
- vmovdqa $D0, $C0
- vmovdqa $A1, $D0
- vmovdqa $B1, $A1
- vmovdqa $C1, $B1
- vmovdqa $D1, $C1
- vmovdqa $A2, $D1
- vmovdqa $B2, $A2
- jmp open_avx2_hash_and_xor_loop
- open_avx2_short_tail_32:
- cmp \$16, $inl
- vmovdqa $A0x, $A1x
- jb 1f
- sub \$16, $inl\n";
- &poly_add("0*8($inp)");
- &poly_mul(); $code.="
- vpxor ($inp), $A0x, $A3x
- vmovdqu $A3x, ($oup)
- lea 1*16($inp), $inp
- lea 1*16($oup), $oup
- vextracti128 \$1, $A0, $A1x
- 1:
- vzeroupper
- jmp open_sse_tail_16
- ###############################################################################
- open_avx2_320:
- vmovdqa $A0, $A1
- vmovdqa $A0, $A2
- vmovdqa $B0, $B1
- vmovdqa $B0, $B2
- vmovdqa $C0, $C1
- vmovdqa $C0, $C2
- vpaddd .avx2_inc(%rip), $D0, $D1
- vpaddd .avx2_inc(%rip), $D1, $D2
- vmovdqa $B0, $T1
- vmovdqa $C0, $T2
- vmovdqa $D0, $ctr0_store
- vmovdqa $D1, $ctr1_store
- vmovdqa $D2, $ctr2_store
- mov \$10, $acc0
- 1: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- dec $acc0
- jne 1b
- vpaddd .chacha20_consts(%rip), $A0, $A0
- vpaddd .chacha20_consts(%rip), $A1, $A1
- vpaddd .chacha20_consts(%rip), $A2, $A2
- vpaddd $T1, $B0, $B0
- vpaddd $T1, $B1, $B1
- vpaddd $T1, $B2, $B2
- vpaddd $T2, $C0, $C0
- vpaddd $T2, $C1, $C1
- vpaddd $T2, $C2, $C2
- vpaddd $ctr0_store, $D0, $D0
- vpaddd $ctr1_store, $D1, $D1
- vpaddd $ctr2_store, $D2, $D2
- vperm2i128 \$0x02, $A0, $B0, $T0
- # Clamp and store the key
- vpand .clamp(%rip), $T0, $T0
- vmovdqa $T0, $r_store
- # Stream for up to 320 bytes
- vperm2i128 \$0x13, $A0, $B0, $A0
- vperm2i128 \$0x13, $C0, $D0, $B0
- vperm2i128 \$0x02, $A1, $B1, $C0
- vperm2i128 \$0x02, $C1, $D1, $D0
- vperm2i128 \$0x13, $A1, $B1, $A1
- vperm2i128 \$0x13, $C1, $D1, $B1
- vperm2i128 \$0x02, $A2, $B2, $C1
- vperm2i128 \$0x02, $C2, $D2, $D1
- vperm2i128 \$0x13, $A2, $B2, $A2
- vperm2i128 \$0x13, $C2, $D2, $B2
- jmp open_avx2_short
- .size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
- ###############################################################################
- ###############################################################################
- .type chacha20_poly1305_seal_avx2,\@function,2
- .align 64
- chacha20_poly1305_seal_avx2:
- vzeroupper
- vmovdqa .chacha20_consts(%rip), $A0
- vbroadcasti128 0*16($keyp), $B0
- vbroadcasti128 1*16($keyp), $C0
- vbroadcasti128 2*16($keyp), $D0
- vpaddd .avx2_init(%rip), $D0, $D0
- cmp \$6*32, $inl
- jbe seal_avx2_192
- cmp \$10*32, $inl
- jbe seal_avx2_320
- vmovdqa $A0, $A1
- vmovdqa $A0, $A2
- vmovdqa $A0, $A3
- vmovdqa $B0, $B1
- vmovdqa $B0, $B2
- vmovdqa $B0, $B3
- vmovdqa $B0, $state1_store
- vmovdqa $C0, $C1
- vmovdqa $C0, $C2
- vmovdqa $C0, $C3
- vmovdqa $C0, $state2_store
- vmovdqa $D0, $D3
- vpaddd .avx2_inc(%rip), $D3, $D2
- vpaddd .avx2_inc(%rip), $D2, $D1
- vpaddd .avx2_inc(%rip), $D1, $D0
- vmovdqa $D0, $ctr0_store
- vmovdqa $D1, $ctr1_store
- vmovdqa $D2, $ctr2_store
- vmovdqa $D3, $ctr3_store
- mov \$10, $acc0
- 1: \n";
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- dec $acc0
- jnz 1b\n";
- &finalize_state_avx2(4); $code.="
- vperm2i128 \$0x13, $C3, $D3, $C3
- vperm2i128 \$0x02, $A3, $B3, $D3
- vperm2i128 \$0x13, $A3, $B3, $A3
- vpand .clamp(%rip), $D3, $D3
- vmovdqa $D3, $r_store
- mov %r8, $itr2
- call poly_hash_ad_internal
- # It is safe to write a full 320 bytes of output here; inputs of 320 bytes
- # or fewer were already routed to the optimized 192/320-byte paths above
- vpxor 0*32($inp), $A3, $A3
- vpxor 1*32($inp), $C3, $C3
- vmovdqu $A3, 0*32($oup)
- vmovdqu $C3, 1*32($oup)\n";
- &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
- &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
- &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
- lea 10*32($inp), $inp
- sub \$10*32, $inl
- mov \$10*32, $itr1
- cmp \$4*32, $inl
- jbe seal_avx2_hash
- vpxor 0*32($inp), $A0, $A0
- vpxor 1*32($inp), $B0, $B0
- vpxor 2*32($inp), $C0, $C0
- vpxor 3*32($inp), $D0, $D0
- vmovdqu $A0, 10*32($oup)
- vmovdqu $B0, 11*32($oup)
- vmovdqu $C0, 12*32($oup)
- vmovdqu $D0, 13*32($oup)
- lea 4*32($inp), $inp
- sub \$4*32, $inl
- mov \$8, $itr1
- mov \$2, $itr2
- cmp \$4*32, $inl
- jbe seal_avx2_tail_128
- cmp \$8*32, $inl
- jbe seal_avx2_tail_256
- cmp \$12*32, $inl
- jbe seal_avx2_tail_384
- cmp \$16*32, $inl
- jbe seal_avx2_tail_512\n";
- # We have 448 bytes to hash, but the main loop hashes 512 bytes at a time,
- # so perform some ChaCha20 rounds up front, before entering the main loop
- &prep_state_avx2(4);
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body;
- &emit_body(41);
- @loop_body = split /\n/, $chacha_body; $code.="
- sub \$16, $oup
- mov \$9, $itr1
- jmp 4f
- 1: \n";
- &prep_state_avx2(4); $code.="
- mov \$10, $itr1
- 2: \n";
- &poly_add("0*8($oup)");
- &emit_body(10);
- &poly_stage1_mulx();
- &emit_body(9);
- &poly_stage2_mulx();
- &emit_body(12);
- &poly_stage3_mulx();
- &emit_body(10);
- &poly_reduce_stage(); $code.="
- 4: \n";
- &emit_body(9);
- &poly_add("2*8($oup)");
- &emit_body(8);
- &poly_stage1_mulx();
- &emit_body(18);
- &poly_stage2_mulx();
- &emit_body(18);
- &poly_stage3_mulx();
- &emit_body(9);
- &poly_reduce_stage();
- &emit_body(8);
- &poly_add("4*8($oup)"); $code.="
- lea 6*8($oup), $oup\n";
- &emit_body(18);
- &poly_stage1_mulx();
- &emit_body(8);
- &poly_stage2_mulx();
- &emit_body(8);
- &poly_stage3_mulx();
- &emit_body(18);
- &poly_reduce_stage();
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- dec $itr1
- jne 2b\n";
- &finalize_state_avx2(4); $code.="
- lea 4*8($oup), $oup
- vmovdqa $A0, $tmp_store\n";
- &poly_add("-4*8($oup)");
- &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
- vmovdqa $tmp_store, $A0\n";
- &poly_mul();
- &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
- &poly_add("-2*8($oup)");
- &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
- &poly_mul();
- &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
- lea 16*32($inp), $inp
- sub \$16*32, $inl
- cmp \$16*32, $inl
- jg 1b\n";
- &poly_add("0*8($oup)");
- &poly_mul();
- &poly_add("2*8($oup)");
- &poly_mul(); $code.="
- lea 4*8($oup), $oup
- mov \$10, $itr1
- xor $itr2, $itr2
- cmp \$4*32, $inl
- ja 3f
- ###############################################################################
- seal_avx2_tail_128:\n";
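- # At most 4*32 = 128 bytes of plaintext remain: one two-block state
- # produces the tail keystream while the pending ciphertext is hashed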
- &prep_state_avx2(1); $code.="
- 1: \n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- lea 2*8($oup), $oup
- 2: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &poly_add("0*8($oup)");
- &poly_mul();
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &poly_add("2*8($oup)");
- &poly_mul(); $code.="
- lea 4*8($oup), $oup
- dec $itr1
- jg 1b
- dec $itr2
- jge 2b\n";
- &finalize_state_avx2(1);
- &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
- jmp seal_avx2_short_loop
- 3:
- cmp \$8*32, $inl
- ja 3f
- ###############################################################################
- seal_avx2_tail_256:\n";
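- # At most 8*32 = 256 bytes remain: two states for the tail keystream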
- &prep_state_avx2(2); $code.="
- 1: \n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- lea 2*8($oup), $oup
- 2: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
- &poly_add("0*8($oup)");
- &poly_mul();
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
- &poly_add("2*8($oup)");
- &poly_mul(); $code.="
- lea 4*8($oup), $oup
- dec $itr1
- jg 1b
- dec $itr2
- jge 2b\n";
- &finalize_state_avx2(2);
- &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
- &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
- mov \$4*32, $itr1
- lea 4*32($inp), $inp
- sub \$4*32, $inl
- jmp seal_avx2_hash
- 3:
- cmp \$12*32, $inl
- ja seal_avx2_tail_512
- ###############################################################################
- seal_avx2_tail_384:\n";
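- # At most 12*32 = 384 bytes remain: three states for the tail keystream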
- &prep_state_avx2(3); $code.="
- 1: \n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- lea 2*8($oup), $oup
- 2: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
- &poly_add("0*8($oup)");
- &poly_mul();
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &poly_add("2*8($oup)");
- &poly_mul();
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- lea 4*8($oup), $oup
- dec $itr1
- jg 1b
- dec $itr2
- jge 2b\n";
- &finalize_state_avx2(3);
- &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
- &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
- &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
- mov \$8*32, $itr1
- lea 8*32($inp), $inp
- sub \$8*32, $inl
- jmp seal_avx2_hash
- ###############################################################################
- seal_avx2_tail_512:\n";
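- # At most 16*32 = 512 bytes remain: four states, hashed with the fully
- # interleaved mulx path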
- &prep_state_avx2(4); $code.="
- 1: \n";
- &poly_add("0($oup)");
- &poly_mul_mulx(); $code.="
- lea 2*8($oup), $oup
- 2: \n";
- &emit_body(20);
- &poly_add("0*8($oup)");
- &emit_body(20);
- &poly_stage1_mulx();
- &emit_body(20);
- &poly_stage2_mulx();
- &emit_body(20);
- &poly_stage3_mulx();
- &emit_body(20);
- &poly_reduce_stage();
- &emit_body(20);
- &poly_add("2*8($oup)");
- &emit_body(20);
- &poly_stage1_mulx();
- &emit_body(20);
- &poly_stage2_mulx();
- &emit_body(20);
- &poly_stage3_mulx();
- &emit_body(20);
- &poly_reduce_stage();
- foreach $l (@loop_body) {$code.=$l."\n";}
- @loop_body = split /\n/, $chacha_body; $code.="
- lea 4*8($oup), $oup
- dec $itr1
- jg 1b
- dec $itr2
- jge 2b\n";
- &finalize_state_avx2(4); $code.="
- vmovdqa $A0, $tmp_store\n";
- &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
- vmovdqa $tmp_store, $A0\n";
- &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
- &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
- &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
- mov \$12*32, $itr1
- lea 12*32($inp), $inp
- sub \$12*32, $inl
- jmp seal_avx2_hash
- ################################################################################
- seal_avx2_320:
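- # Short input (at most 320 bytes): three two-block states give 384 bytes
- # of keystream, of which the first 32 bytes form the Poly1305 key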
- vmovdqa $A0, $A1
- vmovdqa $A0, $A2
- vmovdqa $B0, $B1
- vmovdqa $B0, $B2
- vmovdqa $C0, $C1
- vmovdqa $C0, $C2
- vpaddd .avx2_inc(%rip), $D0, $D1
- vpaddd .avx2_inc(%rip), $D1, $D2
- vmovdqa $B0, $T1
- vmovdqa $C0, $T2
- vmovdqa $D0, $ctr0_store
- vmovdqa $D1, $ctr1_store
- vmovdqa $D2, $ctr2_store
- mov \$10, $acc0
- 1: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
- &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
- dec $acc0
- jne 1b
- vpaddd .chacha20_consts(%rip), $A0, $A0
- vpaddd .chacha20_consts(%rip), $A1, $A1
- vpaddd .chacha20_consts(%rip), $A2, $A2
- vpaddd $T1, $B0, $B0
- vpaddd $T1, $B1, $B1
- vpaddd $T1, $B2, $B2
- vpaddd $T2, $C0, $C0
- vpaddd $T2, $C1, $C1
- vpaddd $T2, $C2, $C2
- vpaddd $ctr0_store, $D0, $D0
- vpaddd $ctr1_store, $D1, $D1
- vpaddd $ctr2_store, $D2, $D2
- vperm2i128 \$0x02, $A0, $B0, $T0
- # Clamp and store the Poly1305 key
- vpand .clamp(%rip), $T0, $T0
- vmovdqa $T0, $r_store
- # Stream for up to 320 bytes
- vperm2i128 \$0x13, $A0, $B0, $A0
- vperm2i128 \$0x13, $C0, $D0, $B0
- vperm2i128 \$0x02, $A1, $B1, $C0
- vperm2i128 \$0x02, $C1, $D1, $D0
- vperm2i128 \$0x13, $A1, $B1, $A1
- vperm2i128 \$0x13, $C1, $D1, $B1
- vperm2i128 \$0x02, $A2, $B2, $C1
- vperm2i128 \$0x02, $C2, $D2, $D1
- vperm2i128 \$0x13, $A2, $B2, $A2
- vperm2i128 \$0x13, $C2, $D2, $B2
- jmp seal_avx2_short
- ################################################################################
- seal_avx2_192:
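- # Short input (at most 192 bytes): two two-block states suffice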
- vmovdqa $A0, $A1
- vmovdqa $A0, $A2
- vmovdqa $B0, $B1
- vmovdqa $B0, $B2
- vmovdqa $C0, $C1
- vmovdqa $C0, $C2
- vpaddd .avx2_inc(%rip), $D0, $D1
- vmovdqa $D0, $T2
- vmovdqa $D1, $T3
- mov \$10, $acc0
- 1: \n";
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
- &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
- &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
- dec $acc0
- jne 1b
- vpaddd $A2, $A0, $A0
- vpaddd $A2, $A1, $A1
- vpaddd $B2, $B0, $B0
- vpaddd $B2, $B1, $B1
- vpaddd $C2, $C0, $C0
- vpaddd $C2, $C1, $C1
- vpaddd $T2, $D0, $D0
- vpaddd $T3, $D1, $D1
- vperm2i128 \$0x02, $A0, $B0, $T0
- # Clamp and store the Poly1305 key
- vpand .clamp(%rip), $T0, $T0
- vmovdqa $T0, $r_store
- # Stream for up to 192 bytes
- vperm2i128 \$0x13, $A0, $B0, $A0
- vperm2i128 \$0x13, $C0, $D0, $B0
- vperm2i128 \$0x02, $A1, $B1, $C0
- vperm2i128 \$0x02, $C1, $D1, $D0
- vperm2i128 \$0x13, $A1, $B1, $A1
- vperm2i128 \$0x13, $C1, $D1, $B1
- seal_avx2_short:
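- # Hash the additional data before encrypting the remaining plaintext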
- mov %r8, $itr2
- call poly_hash_ad_internal
- xor $itr1, $itr1
- seal_avx2_hash:
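- # Hash the ciphertext bytes that were stored but not yet hashed (count in itr1)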
- cmp \$16, $itr1
- jb seal_avx2_short_loop\n";
- &poly_add("0($oup)");
- &poly_mul(); $code.="
- sub \$16, $itr1
- add \$16, $oup
- jmp seal_avx2_hash
- seal_avx2_short_loop:
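- # Encrypt and hash 32 bytes per iteration, rotating the precomputed
- # keystream down through the register file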
- cmp \$32, $inl
- jb seal_avx2_short_tail
- sub \$32, $inl
- # Encrypt
- vpxor ($inp), $A0, $A0
- vmovdqu $A0, ($oup)
- lea 1*32($inp), $inp
- # Load + hash\n";
- &poly_add("0*8($oup)");
- &poly_mul();
- &poly_add("2*8($oup)");
- &poly_mul(); $code.="
- lea 1*32($oup), $oup
- # Shift stream
- vmovdqa $B0, $A0
- vmovdqa $C0, $B0
- vmovdqa $D0, $C0
- vmovdqa $A1, $D0
- vmovdqa $B1, $A1
- vmovdqa $C1, $B1
- vmovdqa $D1, $C1
- vmovdqa $A2, $D1
- vmovdqa $B2, $A2
- jmp seal_avx2_short_loop
- seal_avx2_short_tail:
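- # At most 31 bytes remain: handle one 16-byte chunk here, then let the
- # SSE tail code finish any final partial block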
- cmp \$16, $inl
- jb 1f
- sub \$16, $inl
- vpxor ($inp), $A0x, $A3x
- vmovdqu $A3x, ($oup)
- lea 1*16($inp), $inp\n";
- &poly_add("0*8($oup)");
- &poly_mul(); $code.="
- lea 1*16($oup), $oup
- vextracti128 \$1, $A0, $A0x
- 1:
- vzeroupper
- jmp seal_sse_tail_16
- .cfi_endproc
- ";
- }
-
- if (!$win64) {
- $code =~ s/\`([^\`]*)\`/eval $1/gem;
- print $code;
- } else {
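- # The Windows ABI is not supported by this code; emit a dummy symbol so
- # the file still assembles into a valid module on win64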
- print <<___;
- .globl dummy_chacha20_poly1305_asm
- .type dummy_chacha20_poly1305_asm,\@abi-omnipotent
- dummy_chacha20_poly1305_asm:
- ret
- ___
- }
-
- close STDOUT;