#!/usr/bin/env perl

# Copyright (c) 2015, CloudFlare Ltd.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

##############################################################################
#                                                                            #
# Author:  Vlad Krasnov                                                      #
#                                                                            #
##############################################################################

$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$avx = 2;

$code.=<<___;
.text
.extern OPENSSL_ia32cap_P

chacha20_poly1305_constants:

.align 64
.chacha20_consts:
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.byte 'e','x','p','a','n','d',' ','3','2','-','b','y','t','e',' ','k'
.rol8:
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.byte 3,0,1,2, 7,4,5,6, 11,8,9,10, 15,12,13,14
.rol16:
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.byte 2,3,0,1, 6,7,4,5, 10,11,8,9, 14,15,12,13
.avx2_init:
.long 0,0,0,0
.sse_inc:
.long 1,0,0,0
.avx2_inc:
.long 2,0,0,0,2,0,0,0
.clamp:
.quad 0x0FFFFFFC0FFFFFFF, 0x0FFFFFFC0FFFFFFC
.quad 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF
.align 16
.and_masks:
.byte 0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0x00
.byte 0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff,0xff
___

my ($oup,$inp,$inl,$adp,$keyp,$itr1,$itr2)=("%rdi","%rsi","%rbx","%rcx","%r9","%rcx","%r8");
my ($acc0,$acc1,$acc2)=map("%r$_",(10..12));
my ($t0,$t1,$t2,$t3)=("%r13","%r14","%r15","%r9");
my ($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%xmm$_",(0..15));
my ($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
my $r_store="0*16(%rbp)";
my $s_store="1*16(%rbp)";
my $len_store="2*16(%rbp)";
my $state1_store="3*16(%rbp)";
my $state2_store="4*16(%rbp)";
my $tmp_store="5*16(%rbp)";
my $ctr0_store="6*16(%rbp)";
my $ctr1_store="7*16(%rbp)";
my $ctr2_store="8*16(%rbp)";
my $ctr3_store="9*16(%rbp)";

sub chacha_qr {
my ($a,$b,$c,$d,$t,$dir)=@_;
$code.="movdqa $t, $tmp_store\n" if ($dir =~ /store/);
$code.="paddd $b, $a
        pxor $a, $d
        pshufb .rol16(%rip), $d
        paddd $d, $c
        pxor $c, $b
        movdqa $b, $t
        pslld \$12, $t
        psrld \$20, $b
        pxor $t, $b
        paddd $b, $a
        pxor $a, $d
        pshufb .rol8(%rip), $d
        paddd $d, $c
        pxor $c, $b
        movdqa $b, $t
        pslld \$7, $t
        psrld \$25, $b
        pxor $t, $b\n";
$code.="palignr \$4, $b, $b
        palignr \$8, $c, $c
        palignr \$12, $d, $d\n" if ($dir =~ /left/);
$code.="palignr \$12, $b, $b
        palignr \$8, $c, $c
        palignr \$4, $d, $d\n" if ($dir =~ /right/);
$code.="movdqa $tmp_store, $t\n" if ($dir =~ /load/);
}

sub poly_add {
my ($src)=@_;
$code.="add $src, $acc0
        adc 8+$src, $acc1
        adc \$1, $acc2\n";
}

sub poly_stage1 {
$code.="mov 0+$r_store, %rax
        mov %rax, $t2
        mul $acc0
        mov %rax, $t0
        mov %rdx, $t1
        mov 0+$r_store, %rax
        mul $acc1
        imulq $acc2, $t2
        add %rax, $t1
        adc %rdx, $t2\n";
}

sub poly_stage2 {
$code.="mov 8+$r_store, %rax
        mov %rax, $t3
        mul $acc0
        add %rax, $t1
        adc \$0, %rdx
        mov %rdx, $acc0
        mov 8+$r_store, %rax
        mul $acc1
        add %rax, $t2
        adc \$0, %rdx\n";
}

sub poly_stage3 {
$code.="imulq $acc2, $t3
        add $acc0, $t2
        adc %rdx, $t3\n";
}

sub poly_reduce_stage {
$code.="mov $t0, $acc0
        mov $t1, $acc1
        mov $t2, $acc2
        and \$3, $acc2
        mov $t2, $t0
        and \$-4, $t0
        mov $t3, $t1
        shrd \$2, $t3, $t2
        shr \$2, $t3
        add $t0, $acc0
        adc $t1, $acc1
        adc \$0, $acc2
        add $t2, $acc0
        adc $t3, $acc1
        adc \$0, $acc2\n";
}

sub poly_mul {
    &poly_stage1();
    &poly_stage2();
    &poly_stage3();
    &poly_reduce_stage();
}

sub prep_state {
my ($n)=@_;
$code.="movdqa .chacha20_consts(%rip), $A0
        movdqa $state1_store, $B0
        movdqa $state2_store, $C0\n";
$code.="movdqa $A0, $A1
        movdqa $B0, $B1
        movdqa $C0, $C1\n" if ($n ge 2);
$code.="movdqa $A0, $A2
        movdqa $B0, $B2
        movdqa $C0, $C2\n" if ($n ge 3);
$code.="movdqa $A0, $A3
        movdqa $B0, $B3
        movdqa $C0, $C3\n" if ($n ge 4);
$code.="movdqa $ctr0_store, $D0
        paddd .sse_inc(%rip), $D0
        movdqa $D0, $ctr0_store\n" if ($n eq 1);
$code.="movdqa $ctr0_store, $D1
        paddd .sse_inc(%rip), $D1
        movdqa $D1, $D0
        paddd .sse_inc(%rip), $D0
        movdqa $D0, $ctr0_store
        movdqa $D1, $ctr1_store\n" if ($n eq 2);
$code.="movdqa $ctr0_store, $D2
        paddd .sse_inc(%rip), $D2
        movdqa $D2, $D1
        paddd .sse_inc(%rip), $D1
        movdqa $D1, $D0
        paddd .sse_inc(%rip), $D0
        movdqa $D0, $ctr0_store
        movdqa $D1, $ctr1_store
        movdqa $D2, $ctr2_store\n" if ($n eq 3);
$code.="movdqa $ctr0_store, $D3
        paddd .sse_inc(%rip), $D3
        movdqa $D3, $D2
        paddd .sse_inc(%rip), $D2
        movdqa $D2, $D1
        paddd .sse_inc(%rip), $D1
        movdqa $D1, $D0
        paddd .sse_inc(%rip), $D0
        movdqa $D0, $ctr0_store
        movdqa $D1, $ctr1_store
        movdqa $D2, $ctr2_store
        movdqa $D3, $ctr3_store\n" if ($n eq 4);
}

sub finalize_state {
my ($n)=@_;
$code.="paddd .chacha20_consts(%rip), $A3
        paddd $state1_store, $B3
        paddd $state2_store, $C3
        paddd $ctr3_store, $D3\n" if ($n eq 4);
$code.="paddd .chacha20_consts(%rip), $A2
        paddd $state1_store, $B2
        paddd $state2_store, $C2
        paddd $ctr2_store, $D2\n" if ($n ge 3);
$code.="paddd .chacha20_consts(%rip), $A1
        paddd $state1_store, $B1
        paddd $state2_store, $C1
        paddd $ctr1_store, $D1\n" if ($n ge 2);
$code.="paddd .chacha20_consts(%rip), $A0
        paddd $state1_store, $B0
        paddd $state2_store, $C0
        paddd $ctr0_store, $D0\n";
}

sub xor_stream {
my ($A, $B, $C, $D, $offset)=@_;
$code.="movdqu 0*16 + $offset($inp), $A3
        movdqu 1*16 + $offset($inp), $B3
        movdqu 2*16 + $offset($inp), $C3
        movdqu 3*16 + $offset($inp), $D3
        pxor $A3, $A
        pxor $B3, $B
        pxor $C3, $C
        pxor $D, $D3
        movdqu $A, 0*16 + $offset($oup)
        movdqu $B, 1*16 + $offset($oup)
        movdqu $C, 2*16 + $offset($oup)
        movdqu $D3, 3*16 + $offset($oup)\n";
}

sub xor_stream_using_temp {
my ($A, $B, $C, $D, $offset, $temp)=@_;
$code.="movdqa $temp, $tmp_store
        movdqu 0*16 + $offset($inp), $temp
        pxor $A, $temp
        movdqu $temp, 0*16 + $offset($oup)
        movdqu 1*16 + $offset($inp), $temp
        pxor $B, $temp
        movdqu $temp, 1*16 + $offset($oup)
        movdqu 2*16 + $offset($inp), $temp
        pxor $C, $temp
        movdqu $temp, 2*16 + $offset($oup)
        movdqu 3*16 + $offset($inp), $temp
        pxor $D, $temp
        movdqu $temp, 3*16 + $offset($oup)\n";
}

sub gen_chacha_round {
my ($rot1, $rot2, $shift)=@_;
my $round="";
$round.="movdqa $C0, $tmp_store\n" if ($rot1 eq 20);
$round.="movdqa $rot2, $C0
         paddd $B3, $A3
         paddd $B2, $A2
         paddd $B1, $A1
         paddd $B0, $A0
         pxor $A3, $D3
         pxor $A2, $D2
         pxor $A1, $D1
         pxor $A0, $D0
         pshufb $C0, $D3
         pshufb $C0, $D2
         pshufb $C0, $D1
         pshufb $C0, $D0
         movdqa $tmp_store, $C0
         paddd $D3, $C3
         paddd $D2, $C2
         paddd $D1, $C1
         paddd $D0, $C0
         pxor $C3, $B3
         pxor $C2, $B2
         pxor $C1, $B1
         pxor $C0, $B0
         movdqa $C0, $tmp_store
         movdqa $B3, $C0
         psrld \$$rot1, $C0
         pslld \$32-$rot1, $B3
         pxor $C0, $B3
         movdqa $B2, $C0
         psrld \$$rot1, $C0
         pslld \$32-$rot1, $B2
         pxor $C0, $B2
         movdqa $B1, $C0
         psrld \$$rot1, $C0
         pslld \$32-$rot1, $B1
         pxor $C0, $B1
         movdqa $B0, $C0
         psrld \$$rot1, $C0
         pslld \$32-$rot1, $B0
         pxor $C0, $B0\n";
($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
$round.="movdqa $tmp_store, $C0
         palignr \$$s1, $B3, $B3
         palignr \$$s2, $C3, $C3
         palignr \$$s3, $D3, $D3
         palignr \$$s1, $B2, $B2
         palignr \$$s2, $C2, $C2
         palignr \$$s3, $D2, $D2
         palignr \$$s1, $B1, $B1
         palignr \$$s2, $C1, $C1
         palignr \$$s3, $D1, $D1
         palignr \$$s1, $B0, $B0
         palignr \$$s2, $C0, $C0
         palignr \$$s3, $D0, $D0\n"
if (($shift =~ /left/) || ($shift =~ /right/));
return $round;
};

$chacha_body = &gen_chacha_round(20, ".rol16(%rip)") .
               &gen_chacha_round(25, ".rol8(%rip)", "left") .
               &gen_chacha_round(20, ".rol16(%rip)") .
               &gen_chacha_round(25, ".rol8(%rip)", "right");

my @loop_body = split /\n/, $chacha_body;

sub emit_body {
my ($n)=@_;
    for (my $i=0; $i < $n; $i++) {
        $code=$code.shift(@loop_body)."\n";
    };
}

{
################################################################################
# void poly_hash_ad_internal();
$code.="
.type poly_hash_ad_internal,\@function,2
.align 64
poly_hash_ad_internal:
.cfi_startproc
    xor $acc0, $acc0
    xor $acc1, $acc1
    xor $acc2, $acc2
    cmp \$13,  $itr2
    jne hash_ad_loop
poly_fast_tls_ad:
    # Special treatment for the TLS case of 13 bytes
    mov ($adp), $acc0
    mov 5($adp), $acc1
    shr \$24, $acc1
    mov \$1, $acc2\n";
    &poly_mul(); $code.="
    ret
hash_ad_loop:
        # Hash in 16 byte chunk
        cmp \$16, $itr2
        jb hash_ad_tail\n";
        &poly_add("0($adp)");
        &poly_mul(); $code.="
        lea 1*16($adp), $adp
        sub \$16, $itr2
    jmp hash_ad_loop
hash_ad_tail:
    cmp \$0, $itr2
    je 1f
    # Hash last < 16 byte tail
    xor $t0, $t0
    xor $t1, $t1
    xor $t2, $t2
    add $itr2, $adp
hash_ad_tail_loop:
        shld \$8, $t0, $t1
        shl \$8, $t0
        movzxb -1($adp), $t2
        xor $t2, $t0
        dec $adp
        dec $itr2
    jne hash_ad_tail_loop

    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    &poly_mul(); $code.="
    # Finished AD
1:
    ret
.cfi_endproc
.size poly_hash_ad_internal, .-poly_hash_ad_internal\n";
}

{
################################################################################
# void chacha20_poly1305_open(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
$code.="
.globl chacha20_poly1305_open
.type chacha20_poly1305_open,\@function,2
.align 64
chacha20_poly1305_open:
.cfi_startproc
    push %rbp
.cfi_adjust_cfa_offset 8
    push %rbx
.cfi_adjust_cfa_offset 8
    push %r12
.cfi_adjust_cfa_offset 8
    push %r13
.cfi_adjust_cfa_offset 8
    push %r14
.cfi_adjust_cfa_offset 8
    push %r15
.cfi_adjust_cfa_offset 8
    # We write the calculated authenticator back to keyp at the end, so save
    # the pointer on the stack too.
    push $keyp
.cfi_adjust_cfa_offset 8
    sub \$288 + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
.cfi_offset rbp, -16
.cfi_offset rbx, -24
.cfi_offset r12, -32
.cfi_offset r13, -40
.cfi_offset r14, -48
.cfi_offset r15, -56
    lea 32(%rsp), %rbp
    and \$-32, %rbp
    mov %rdx, 8+$len_store
    mov %r8, 0+$len_store
    mov %rdx, $inl\n"; $code.="
    mov OPENSSL_ia32cap_P+8(%rip), %eax
    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
    xor \$`(1<<5) + (1<<8)`, %eax
    jz  chacha20_poly1305_open_avx2\n" if ($avx>1);
$code.="
1:
    cmp \$128, $inl
    jbe open_sse_128
    # For long buffers, prepare the poly key first
    movdqa .chacha20_consts(%rip), $A0
    movdqu 0*16($keyp), $B0
    movdqu 1*16($keyp), $C0
    movdqu 2*16($keyp), $D0
    movdqa $D0, $T1
    # Store on stack, to free keyp
    movdqa $B0, $state1_store
    movdqa $C0, $state2_store
    movdqa $D0, $ctr0_store
    mov \$10, $acc0
1:  \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
        dec $acc0
    jne 1b
    # A0|B0 hold the Poly1305 32-byte key, C0,D0 can be discarded
    paddd .chacha20_consts(%rip), $A0
    paddd $state1_store, $B0
    # Clamp and store the key
    pand .clamp(%rip), $A0
    movdqa $A0, $r_store
    movdqa $B0, $s_store
    # Hash
    mov %r8, $itr2
    call poly_hash_ad_internal
open_sse_main_loop:
        cmp \$16*16, $inl
        jb 2f
        # Load state, increment counter blocks\n";
        &prep_state(4); $code.="
        # There are 10 ChaCha20 iterations of 2QR each, so for 6 iterations we
        # hash 2 blocks, and for the remaining 4 only 1 block - for a total of 16
        mov \$4, $itr1
        mov $inp, $itr2
1:  \n";
            &emit_body(20);
            &poly_add("0($itr2)"); $code.="
            lea 2*8($itr2), $itr2\n";
            &emit_body(20);
            &poly_stage1();
            &emit_body(20);
            &poly_stage2();
            &emit_body(20);
            &poly_stage3();
            &emit_body(20);
            &poly_reduce_stage();
            foreach $l (@loop_body) {$code.=$l."\n";}
            @loop_body = split /\n/, $chacha_body; $code.="
            dec $itr1
        jge 1b\n";
            &poly_add("0($itr2)");
            &poly_mul(); $code.="
            lea 2*8($itr2), $itr2
            cmp \$-6, $itr1
        jg 1b\n";
        &finalize_state(4);
        &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
        &xor_stream($A2, $B2, $C2, $D2, "4*16");
        &xor_stream($A1, $B1, $C1, $D1, "8*16");
        &xor_stream($A0, $B0, $C0, $tmp_store, "12*16"); $code.="
        lea 16*16($inp), $inp
        lea 16*16($oup), $oup
        sub \$16*16, $inl
    jmp open_sse_main_loop
2:
    # Handle the various tail sizes efficiently
    test $inl, $inl
    jz open_sse_finalize
    cmp \$4*16, $inl
    ja 3f\n";
###############################################################################
    # At most 64 bytes are left
    &prep_state(1); $code.="
    xor $itr2, $itr2
    mov $inl, $itr1
    cmp \$16, $itr1
    jb 2f
1:  \n";
        &poly_add("0($inp, $itr2)");
        &poly_mul(); $code.="
        sub \$16, $itr1
2:
        add \$16, $itr2\n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right"); $code.="
        cmp \$16, $itr1
    jae 1b
        cmp \$10*16, $itr2
    jne 2b\n";
    &finalize_state(1); $code.="
    jmp open_sse_tail_64_dec_loop
3:
    cmp \$8*16, $inl
    ja 3f\n";
###############################################################################
    # 65 - 128 bytes are left
    &prep_state(2); $code.="
    mov $inl, $itr1
    and \$-16, $itr1
    xor $itr2, $itr2
1:  \n";
        &poly_add("0($inp, $itr2)");
        &poly_mul(); $code.="
2:
        add \$16, $itr2\n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");$code.="
        cmp $itr1, $itr2
    jb 1b
        cmp \$10*16, $itr2
    jne 2b\n";
    &finalize_state(2);
    &xor_stream($A1, $B1, $C1, $D1, "0*16"); $code.="
    sub \$4*16, $inl
    lea 4*16($inp), $inp
    lea 4*16($oup), $oup
    jmp open_sse_tail_64_dec_loop
3:
    cmp \$12*16, $inl
    ja 3f\n";
###############################################################################
    # 129 - 192 bytes are left
    &prep_state(3); $code.="
    mov $inl, $itr1
    mov \$10*16, $itr2
    cmp \$10*16, $itr1
    cmovg $itr2, $itr1
    and \$-16, $itr1
    xor $itr2, $itr2
1:  \n";
        &poly_add("0($inp, $itr2)");
        &poly_mul(); $code.="
2:
        add \$16, $itr2\n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        cmp $itr1, $itr2
    jb 1b
        cmp \$10*16, $itr2
    jne 2b
    cmp \$11*16, $inl
    jb 1f\n";
    &poly_add("10*16($inp)");
    &poly_mul(); $code.="
    cmp \$12*16, $inl
    jb 1f\n";
    &poly_add("11*16($inp)");
    &poly_mul(); $code.="
1:  \n";
    &finalize_state(3);
    &xor_stream($A2, $B2, $C2, $D2, "0*16");
    &xor_stream($A1, $B1, $C1, $D1, "4*16"); $code.="
    sub \$8*16, $inl
    lea 8*16($inp), $inp
    lea 8*16($oup), $oup
    jmp open_sse_tail_64_dec_loop
3:
###############################################################################\n";
    # 193 - 255 bytes are left
    &prep_state(4); $code.="
    xor $itr2, $itr2
1:  \n";
        &poly_add("0($inp, $itr2)");
        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_left");
        &chacha_qr($A1,$B1,$C1,$D1,$C3,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$C3,"left_load");
        &poly_stage1();
        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_left_load");
        &poly_stage2();
        &chacha_qr($A0,$B0,$C0,$D0,$C3,"store_right");
        &chacha_qr($A1,$B1,$C1,$D1,$C3,"right");
        &poly_stage3();
        &chacha_qr($A2,$B2,$C2,$D2,$C3,"right_load");
        &poly_reduce_stage();
        &chacha_qr($A3,$B3,$C3,$D3,$C1,"store_right_load"); $code.="
        add \$16, $itr2
        cmp \$10*16, $itr2
    jb 1b
    mov $inl, $itr1
    and \$-16, $itr1
1:  \n";
        &poly_add("0($inp, $itr2)");
        &poly_mul(); $code.="
        add \$16, $itr2
        cmp $itr1, $itr2
    jb 1b\n";
    &finalize_state(4);
    &xor_stream_using_temp($A3, $B3, $C3, $D3, "0*16", $D0);
    &xor_stream($A2, $B2, $C2, $D2, "4*16");
    &xor_stream($A1, $B1, $C1, $D1, "8*16"); $code.="
    movdqa $tmp_store, $D0
    sub \$12*16, $inl
    lea 12*16($inp), $inp
    lea 12*16($oup), $oup
###############################################################################
    # Decrypt the remaining data, 16B at a time, using existing stream
open_sse_tail_64_dec_loop:
    cmp \$16, $inl
    jb 1f
        sub \$16, $inl
        movdqu ($inp), $T0
        pxor $T0, $A0
        movdqu $A0, ($oup)
        lea 16($inp), $inp
        lea 16($oup), $oup
        movdqa $B0, $A0
        movdqa $C0, $B0
        movdqa $D0, $C0
    jmp open_sse_tail_64_dec_loop
1:
    movdqa $A0, $A1

    # Decrypt up to 16 bytes at the end.
open_sse_tail_16:
    test $inl, $inl
    jz open_sse_finalize

    # Read the final bytes into $T0. They need to be read in reverse order so
    # that they end up in the correct order in $T0.
    pxor $T0, $T0
    lea -1($inp, $inl), $inp
    movq $inl, $itr2
2:
        pslldq \$1, $T0
        pinsrb \$0, ($inp), $T0
        sub \$1, $inp
        sub \$1, $itr2
        jnz 2b

3:
    movq $T0, $t0
    pextrq \$1, $T0, $t1
    # The final bytes of keystream are in $A1.
    pxor $A1, $T0

    # Copy the plaintext bytes out.
2:
        pextrb \$0, $T0, ($oup)
        psrldq \$1, $T0
        add \$1, $oup
        sub \$1, $inl
    jne 2b

    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    &poly_mul(); $code.="

open_sse_finalize:\n";
    &poly_add($len_store);
    &poly_mul(); $code.="
    # Final reduce
    mov $acc0, $t0
    mov $acc1, $t1
    mov $acc2, $t2
    sub \$-5, $acc0
    sbb \$-1, $acc1
    sbb \$3, $acc2
    cmovc $t0, $acc0
    cmovc $t1, $acc1
    cmovc $t2, $acc2
    # Add in s part of the key
    add 0+$s_store, $acc0
    adc 8+$s_store, $acc1

    add \$288 + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
    pop $keyp
.cfi_adjust_cfa_offset -8
    movq $acc0, ($keyp)
    movq $acc1, 8($keyp)

    pop %r15
.cfi_adjust_cfa_offset -8
    pop %r14
.cfi_adjust_cfa_offset -8
    pop %r13
.cfi_adjust_cfa_offset -8
    pop %r12
.cfi_adjust_cfa_offset -8
    pop %rbx
.cfi_adjust_cfa_offset -8
    pop %rbp
.cfi_adjust_cfa_offset -8
    ret
.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
###############################################################################
open_sse_128:
    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
    movdqu 2*16($keyp), $D0
    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
    movdqa $D1, $D2\npaddd .sse_inc(%rip), $D2
    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D1, $T3
    mov \$10, $acc0
1:  \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
    dec $acc0
    jnz 1b
    paddd .chacha20_consts(%rip), $A0
    paddd .chacha20_consts(%rip), $A1
    paddd .chacha20_consts(%rip), $A2
    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
    paddd $T2, $C1\npaddd $T2, $C2
    paddd $T3, $D1
    paddd .sse_inc(%rip), $T3
    paddd $T3, $D2
    # Clamp and store the key
    pand .clamp(%rip), $A0
    movdqa $A0, $r_store
    movdqa $B0, $s_store
    # Hash
    mov %r8, $itr2
    call poly_hash_ad_internal
1:
        cmp \$16, $inl
        jb open_sse_tail_16
        sub \$16, $inl\n";
        # Load for hashing
        &poly_add("0*8($inp)"); $code.="
        # Load for decryption
        movdqu 0*16($inp), $T0
        pxor $T0, $A1
        movdqu $A1, 0*16($oup)
        lea 1*16($inp), $inp
        lea 1*16($oup), $oup\n";
        &poly_mul(); $code.="
        # Shift the stream left
        movdqa $B1, $A1
        movdqa $C1, $B1
        movdqa $D1, $C1
        movdqa $A2, $D1
        movdqa $B2, $A2
        movdqa $C2, $B2
        movdqa $D2, $C2
    jmp 1b
    jmp open_sse_tail_16
.size chacha20_poly1305_open, .-chacha20_poly1305_open
.cfi_endproc

################################################################################
################################################################################
# void chacha20_poly1305_seal(uint8_t *pt, uint8_t *ct, size_t len_in, uint8_t *ad, size_t len_ad, uint8_t *keyp);
.globl  chacha20_poly1305_seal
.type chacha20_poly1305_seal,\@function,2
.align 64
chacha20_poly1305_seal:
.cfi_startproc
    push %rbp
.cfi_adjust_cfa_offset 8
    push %rbx
.cfi_adjust_cfa_offset 8
    push %r12
.cfi_adjust_cfa_offset 8
    push %r13
.cfi_adjust_cfa_offset 8
    push %r14
.cfi_adjust_cfa_offset 8
    push %r15
.cfi_adjust_cfa_offset 8
    # We write the calculated authenticator back to keyp at the end, so save
    # the pointer on the stack too.
    push $keyp
.cfi_adjust_cfa_offset 8
    sub \$288 + 32, %rsp
.cfi_adjust_cfa_offset 288 + 32
.cfi_offset rbp, -16
.cfi_offset rbx, -24
.cfi_offset r12, -32
.cfi_offset r13, -40
.cfi_offset r14, -48
.cfi_offset r15, -56
    lea 32(%rsp), %rbp
    and \$-32, %rbp
    mov 56($keyp), $inl  # extra_in_len
    addq %rdx, $inl
    mov $inl, 8+$len_store
    mov %r8, 0+$len_store
    mov %rdx, $inl\n"; $code.="
    mov OPENSSL_ia32cap_P+8(%rip), %eax
    and \$`(1<<5) + (1<<8)`, %eax # Check both BMI2 and AVX2 are present
    xor \$`(1<<5) + (1<<8)`, %eax
    jz  chacha20_poly1305_seal_avx2\n" if ($avx>1);
$code.="
    cmp \$128, $inl
    jbe seal_sse_128
    # For longer buffers, prepare the poly key + some stream
    movdqa .chacha20_consts(%rip), $A0
    movdqu 0*16($keyp), $B0
    movdqu 1*16($keyp), $C0
    movdqu 2*16($keyp), $D0
    movdqa $A0, $A1
    movdqa $A0, $A2
    movdqa $A0, $A3
    movdqa $B0, $B1
    movdqa $B0, $B2
    movdqa $B0, $B3
    movdqa $C0, $C1
    movdqa $C0, $C2
    movdqa $C0, $C3
    movdqa $D0, $D3
    paddd .sse_inc(%rip), $D0
    movdqa $D0, $D2
    paddd .sse_inc(%rip), $D0
    movdqa $D0, $D1
    paddd .sse_inc(%rip), $D0
    # Store on stack
    movdqa $B0, $state1_store
    movdqa $C0, $state2_store
    movdqa $D0, $ctr0_store
    movdqa $D1, $ctr1_store
    movdqa $D2, $ctr2_store
    movdqa $D3, $ctr3_store
    mov \$10, $acc0
1:  \n";
        foreach $l (@loop_body) {$code.=$l."\n";}
        @loop_body = split /\n/, $chacha_body; $code.="
        dec $acc0
    jnz 1b\n";
    &finalize_state(4); $code.="
    # Clamp and store the key
    pand .clamp(%rip), $A3
    movdqa $A3, $r_store
    movdqa $B3, $s_store
    # Hash
    mov %r8, $itr2
    call poly_hash_ad_internal\n";
    &xor_stream($A2,$B2,$C2,$D2,"0*16");
    &xor_stream($A1,$B1,$C1,$D1,"4*16"); $code.="
    cmp \$12*16, $inl
    ja 1f
    mov \$8*16, $itr1
    sub \$8*16, $inl
    lea 8*16($inp), $inp
    jmp seal_sse_128_seal_hash
1:  \n";
    &xor_stream($A0, $B0, $C0, $D0, "8*16"); $code.="
    mov \$12*16, $itr1
    sub \$12*16, $inl
    lea 12*16($inp), $inp
    mov \$2, $itr1
    mov \$8, $itr2
    cmp \$4*16, $inl
    jbe seal_sse_tail_64
    cmp \$8*16, $inl
    jbe seal_sse_tail_128
    cmp \$12*16, $inl
    jbe seal_sse_tail_192

1:  \n";
    # The main loop
        &prep_state(4); $code.="
2:  \n";
            &emit_body(20);
            &poly_add("0($oup)");
            &emit_body(20);
            &poly_stage1();
            &emit_body(20);
            &poly_stage2();
            &emit_body(20);
            &poly_stage3();
            &emit_body(20);
            &poly_reduce_stage();
            foreach $l (@loop_body) {$code.=$l."\n";}
            @loop_body = split /\n/, $chacha_body; $code.="
            lea 16($oup), $oup
            dec $itr2
        jge 2b\n";
            &poly_add("0*8($oup)");
            &poly_mul(); $code.="
            lea 16($oup), $oup
            dec $itr1
        jg 2b\n";

        &finalize_state(4);$code.="
        movdqa $D2, $tmp_store\n";
        &xor_stream_using_temp($A3,$B3,$C3,$D3,0*16,$D2); $code.="
        movdqa $tmp_store, $D2\n";
        &xor_stream($A2,$B2,$C2,$D2, 4*16);
        &xor_stream($A1,$B1,$C1,$D1, 8*16); $code.="
        cmp \$16*16, $inl
        ja 3f

        mov \$12*16, $itr1
        sub \$12*16, $inl
        lea 12*16($inp), $inp
        jmp seal_sse_128_seal_hash
3:  \n";
        &xor_stream($A0,$B0,$C0,$D0,"12*16"); $code.="
        lea 16*16($inp), $inp
        sub \$16*16, $inl
        mov \$6, $itr1
        mov \$4, $itr2
        cmp \$12*16, $inl
    jg 1b
    mov $inl, $itr1
    test $inl, $inl
    je seal_sse_128_seal_hash
    mov \$6, $itr1
    cmp \$4*16, $inl
    jg 3f
###############################################################################
seal_sse_tail_64:\n";
    &prep_state(1); $code.="
1:  \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
2:  \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
    dec $itr1
    jg 1b
    dec $itr2
    jge 2b\n";
    &finalize_state(1); $code.="
    jmp seal_sse_128_seal
3:
    cmp \$8*16, $inl
    jg 3f
###############################################################################
seal_sse_tail_128:\n";
    &prep_state(2); $code.="
1:  \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
2:  \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &poly_add("0($oup)");
        &poly_mul();
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right"); $code.="
        lea 16($oup), $oup
    dec $itr1
    jg 1b
    dec $itr2
    jge 2b\n";
    &finalize_state(2);
    &xor_stream($A1,$B1,$C1,$D1,0*16); $code.="
    mov \$4*16, $itr1
    sub \$4*16, $inl
    lea 4*16($inp), $inp
    jmp seal_sse_128_seal_hash
3:
###############################################################################
seal_sse_tail_192:\n";
    &prep_state(3); $code.="
1:  \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 16($oup), $oup
2:  \n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &poly_add("0($oup)");
        &poly_mul();
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        lea 16($oup), $oup
    dec $itr1
    jg 1b
    dec $itr2
    jge 2b\n";
    &finalize_state(3);
    &xor_stream($A2,$B2,$C2,$D2,0*16);
    &xor_stream($A1,$B1,$C1,$D1,4*16); $code.="
    mov \$8*16, $itr1
    sub \$8*16, $inl
    lea 8*16($inp), $inp
###############################################################################
seal_sse_128_seal_hash:
        cmp \$16, $itr1
        jb seal_sse_128_seal\n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        sub \$16, $itr1
        lea 16($oup), $oup
    jmp seal_sse_128_seal_hash

seal_sse_128_seal:
        cmp \$16, $inl
        jb seal_sse_tail_16
        sub \$16, $inl
        # Load for decryption
        movdqu 0*16($inp), $T0
        pxor $T0, $A0
        movdqu $A0, 0*16($oup)
        # Then hash
        add 0*8($oup), $acc0
        adc 1*8($oup), $acc1
        adc \$1, $acc2
        lea 1*16($inp), $inp
        lea 1*16($oup), $oup\n";
        &poly_mul(); $code.="
        # Shift the stream left
        movdqa $B0, $A0
        movdqa $C0, $B0
        movdqa $D0, $C0
        movdqa $A1, $D0
        movdqa $B1, $A1
        movdqa $C1, $B1
        movdqa $D1, $C1
    jmp seal_sse_128_seal

seal_sse_tail_16:
    test $inl, $inl
    jz process_blocks_of_extra_in
    # We can only load the PT one byte at a time to avoid buffer overread
    mov $inl, $itr2
    mov $inl, $itr1
    lea -1($inp, $inl), $inp
    pxor $T3, $T3
1:
        pslldq \$1, $T3
        pinsrb \$0, ($inp), $T3
        lea -1($inp), $inp
        dec $itr1
        jne 1b

    # XOR the keystream with the plaintext.
    pxor $A0, $T3

    # Write ciphertext out, byte-by-byte.
    movq $inl, $itr1
    movdqu $T3, $A0
2:
        pextrb \$0, $A0, ($oup)
        psrldq \$1, $A0
        add \$1, $oup
        sub \$1, $itr1
        jnz 2b

    # $T3 contains the final (partial, non-empty) block of ciphertext which
    # needs to be fed into the Poly1305 state. The right-most $inl bytes of it
    # are valid. We need to fill it with extra_in bytes until full, or until we
    # run out of bytes.
    #
    # $keyp points to the tag output, which is actually a struct with the
    # extra_in pointer and length at offset 48.
    movq 288+32(%rsp), $keyp
    movq 56($keyp), $t1  # extra_in_len
    movq 48($keyp), $t0  # extra_in
    test $t1, $t1
    jz process_partial_block  # Common case: no bytes of extra_in

    movq \$16, $t2
    subq $inl, $t2  # 16-$inl is the number of bytes that fit into $T3.
    cmpq $t2, $t1   # if extra_in_len < 16-$inl, only copy extra_in_len
                    # (note that AT&T syntax reverses the arguments)
    jge load_extra_in
    movq $t1, $t2

load_extra_in:
    # $t2 contains the number of bytes of extra_in (pointed to by $t0) to load
    # into $T3. They are loaded in reverse order.
    leaq -1($t0, $t2), $inp
    # Update extra_in and extra_in_len to reflect the bytes that are about to
    # be read.
    addq $t2, $t0
    subq $t2, $t1
    movq $t0, 48($keyp)
    movq $t1, 56($keyp)

    # Update $itr2, which is used to select the mask later on, to reflect the
    # extra bytes about to be added.
    addq $t2, $itr2

    # Load $t2 bytes of extra_in into $T2.
    pxor $T2, $T2
3:
        pslldq \$1, $T2
        pinsrb \$0, ($inp), $T2
        lea -1($inp), $inp
        sub \$1, $t2
        jnz 3b

    # Shift $T2 up the length of the remainder from the main encryption. Sadly,
    # the shift for an XMM register has to be a constant, thus we loop to do
    # this.
    movq $inl, $t2

4:
        pslldq \$1, $T2
        sub \$1, $t2
        jnz 4b

    # Mask $T3 (the remainder from the main encryption) so that superfluous
    # bytes are zero. This means that the non-zero bytes in $T2 and $T3 are
    # disjoint and so we can merge them with an OR.
    lea .and_masks(%rip), $t2
    shl \$4, $inl
    pand -16($t2, $inl), $T3

    # Merge $T2 into $T3, forming the remainder block.
    por $T2, $T3

    # The block of ciphertext + extra_in is ready to be included in the
    # Poly1305 state.
    movq $T3, $t0
    pextrq \$1, $T3, $t1
    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    &poly_mul(); $code.="

process_blocks_of_extra_in:
    # There may be additional bytes of extra_in to process.
    movq 288+32(%rsp), $keyp
    movq 48($keyp), $inp   # extra_in
    movq 56($keyp), $itr2  # extra_in_len
    movq $itr2, $itr1
    shr \$4, $itr2         # number of blocks

5:
        jz process_extra_in_trailer\n";
        &poly_add("0($inp)");
        &poly_mul(); $code.="
        leaq 16($inp), $inp
        subq \$1, $itr2
        jmp 5b

process_extra_in_trailer:
    andq \$15, $itr1       # remaining num bytes (<16) of extra_in
    movq $itr1, $inl
    jz do_length_block
    leaq -1($inp, $itr1), $inp

6:
        pslldq \$1, $T3
        pinsrb \$0, ($inp), $T3
        lea -1($inp), $inp
        sub \$1, $itr1
        jnz 6b

process_partial_block:
    # $T3 contains $inl bytes of data to be fed into Poly1305. $inl != 0
    lea .and_masks(%rip), $t2
    shl \$4, $inl
    pand -16($t2, $inl), $T3
    movq $T3, $t0
    pextrq \$1, $T3, $t1
    add $t0, $acc0
    adc $t1, $acc1
    adc \$1, $acc2\n";
    &poly_mul(); $code.="

do_length_block:\n";
    &poly_add($len_store);
    &poly_mul(); $code.="
    # Final reduce
    mov $acc0, $t0
    mov $acc1, $t1
    mov $acc2, $t2
    sub \$-5, $acc0
    sbb \$-1, $acc1
    sbb \$3, $acc2
    cmovc $t0, $acc0
    cmovc $t1, $acc1
    cmovc $t2, $acc2
    # Add in s part of the key
    add 0+$s_store, $acc0
    adc 8+$s_store, $acc1

    add \$288 + 32, %rsp
.cfi_adjust_cfa_offset -(288 + 32)
    pop $keyp
.cfi_adjust_cfa_offset -8
    mov $acc0, 0*8($keyp)
    mov $acc1, 1*8($keyp)

    pop %r15
.cfi_adjust_cfa_offset -8
    pop %r14
.cfi_adjust_cfa_offset -8
    pop %r13
.cfi_adjust_cfa_offset -8
    pop %r12
.cfi_adjust_cfa_offset -8
    pop %rbx
.cfi_adjust_cfa_offset -8
    pop %rbp
.cfi_adjust_cfa_offset -8
    ret
.cfi_adjust_cfa_offset (8 * 6) + 288 + 32
################################################################################
seal_sse_128:
    movdqu .chacha20_consts(%rip), $A0\nmovdqa $A0, $A1\nmovdqa $A0, $A2
    movdqu 0*16($keyp), $B0\nmovdqa $B0, $B1\nmovdqa $B0, $B2
    movdqu 1*16($keyp), $C0\nmovdqa $C0, $C1\nmovdqa $C0, $C2
    movdqu 2*16($keyp), $D2
    movdqa $D2, $D0\npaddd .sse_inc(%rip), $D0
    movdqa $D0, $D1\npaddd .sse_inc(%rip), $D1
    movdqa $B0, $T1\nmovdqa $C0, $T2\nmovdqa $D0, $T3
    mov \$10, $acc0
1:\n";
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        dec $acc0
    jnz 1b
    paddd .chacha20_consts(%rip), $A0
    paddd .chacha20_consts(%rip), $A1
    paddd .chacha20_consts(%rip), $A2
    paddd $T1, $B0\npaddd $T1, $B1\npaddd $T1, $B2
    paddd $T2, $C0\npaddd $T2, $C1
    paddd $T3, $D0
    paddd .sse_inc(%rip), $T3
    paddd $T3, $D1
    # Clamp and store the key
    pand .clamp(%rip), $A2
    movdqa $A2, $r_store
    movdqa $B2, $s_store
    # Hash
    mov %r8, $itr2
    call poly_hash_ad_internal
    jmp seal_sse_128_seal
.size chacha20_poly1305_seal, .-chacha20_poly1305_seal\n";
}

# There should have been a cfi_endproc at the end of that function, but the two
# following blocks of code are jumped to without a stack frame and the CFI
# context which they are used in happens to match the CFI context at the end of
# the previous function. So the CFI table is just extended to the end of them.

if ($avx>1) {

($A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$D0,$D1,$D2,$D3)=map("%ymm$_",(0..15));
my ($A0x,$A1x,$A2x,$A3x,$B0x,$B1x,$B2x,$B3x,$C0x,$C1x,$C2x,$C3x,$D0x,$D1x,$D2x,$D3x)=map("%xmm$_",(0..15));
($T0,$T1,$T2,$T3)=($A3,$B3,$C3,$D3);
$state1_store="2*32(%rbp)";
$state2_store="3*32(%rbp)";
$tmp_store="4*32(%rbp)";
$ctr0_store="5*32(%rbp)";
$ctr1_store="6*32(%rbp)";
$ctr2_store="7*32(%rbp)";
$ctr3_store="8*32(%rbp)";

sub chacha_qr_avx2 {
my ($a,$b,$c,$d,$t,$dir)=@_;
$code.=<<___ if ($dir =~ /store/);
    vmovdqa $t, $tmp_store
___
$code.=<<___;
    vpaddd $b, $a, $a
    vpxor $a, $d, $d
    vpshufb .rol16(%rip), $d, $d
    vpaddd $d, $c, $c
    vpxor $c, $b, $b
    vpsrld \$20, $b, $t
    vpslld \$12, $b, $b
    vpxor $t, $b, $b
    vpaddd $b, $a, $a
    vpxor $a, $d, $d
    vpshufb .rol8(%rip), $d, $d
    vpaddd $d, $c, $c
    vpxor $c, $b, $b
    vpslld \$7, $b, $t
    vpsrld \$25, $b, $b
    vpxor $t, $b, $b
___
$code.=<<___ if ($dir =~ /left/);
    vpalignr \$12, $d, $d, $d
    vpalignr \$8, $c, $c, $c
    vpalignr \$4, $b, $b, $b
___
$code.=<<___ if ($dir =~ /right/);
    vpalignr \$4, $d, $d, $d
    vpalignr \$8, $c, $c, $c
    vpalignr \$12, $b, $b, $b
___
$code.=<<___ if ($dir =~ /load/);
    vmovdqa $tmp_store, $t
___
}

sub prep_state_avx2 {
my ($n)=@_;
$code.=<<___;
    vmovdqa .chacha20_consts(%rip), $A0
    vmovdqa $state1_store, $B0
    vmovdqa $state2_store, $C0
___
$code.=<<___ if ($n ge 2);
    vmovdqa $A0, $A1
    vmovdqa $B0, $B1
    vmovdqa $C0, $C1
___
$code.=<<___ if ($n ge 3);
    vmovdqa $A0, $A2
    vmovdqa $B0, $B2
    vmovdqa $C0, $C2
___
$code.=<<___ if ($n ge 4);
    vmovdqa $A0, $A3
    vmovdqa $B0, $B3
    vmovdqa $C0, $C3
___
$code.=<<___ if ($n eq 1);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D0
    vmovdqa $D0, $ctr0_store
___
$code.=<<___ if ($n eq 2);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D1
    vpaddd $D1, $D0, $D0
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
___
$code.=<<___ if ($n eq 3);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D2
    vpaddd $D2, $D0, $D1
    vpaddd $D1, $D0, $D0
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
___
$code.=<<___ if ($n eq 4);
    vmovdqa .avx2_inc(%rip), $D0
    vpaddd $ctr0_store, $D0, $D3
    vpaddd $D3, $D0, $D2
    vpaddd $D2, $D0, $D1
    vpaddd $D1, $D0, $D0
    vmovdqa $D3, $ctr3_store
    vmovdqa $D2, $ctr2_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D0, $ctr0_store
___
}

sub finalize_state_avx2 {
my ($n)=@_;
$code.=<<___ if ($n eq 4);
    vpaddd .chacha20_consts(%rip), $A3, $A3
    vpaddd $state1_store, $B3, $B3
    vpaddd $state2_store, $C3, $C3
    vpaddd $ctr3_store, $D3, $D3
___
$code.=<<___ if ($n ge 3);
    vpaddd .chacha20_consts(%rip), $A2, $A2
    vpaddd $state1_store, $B2, $B2
    vpaddd $state2_store, $C2, $C2
    vpaddd $ctr2_store, $D2, $D2
___
$code.=<<___ if ($n ge 2);
    vpaddd .chacha20_consts(%rip), $A1, $A1
    vpaddd $state1_store, $B1, $B1
    vpaddd $state2_store, $C1, $C1
    vpaddd $ctr1_store, $D1, $D1
___
$code.=<<___;
    vpaddd .chacha20_consts(%rip), $A0, $A0
    vpaddd $state1_store, $B0, $B0
    vpaddd $state2_store, $C0, $C0
    vpaddd $ctr0_store, $D0, $D0
___
}

sub xor_stream_avx2 {
my ($A, $B, $C, $D, $offset, $hlp)=@_;
$code.=<<___;
    vperm2i128 \$0x02, $A, $B, $hlp
    vperm2i128 \$0x13, $A, $B, $B
    vperm2i128 \$0x02, $C, $D, $A
    vperm2i128 \$0x13, $C, $D, $C
    vpxor 0*32+$offset($inp), $hlp, $hlp
    vpxor 1*32+$offset($inp), $A, $A
    vpxor 2*32+$offset($inp), $B, $B
    vpxor 3*32+$offset($inp), $C, $C
    vmovdqu $hlp, 0*32+$offset($oup)
    vmovdqu $A, 1*32+$offset($oup)
    vmovdqu $B, 2*32+$offset($oup)
    vmovdqu $C, 3*32+$offset($oup)
___
}

sub finish_stream_avx2 {
my ($A, $B, $C, $D, $hlp)=@_;
$code.=<<___;
    vperm2i128 \$0x13, $A, $B, $hlp
    vperm2i128 \$0x02, $A, $B, $A
    vperm2i128 \$0x02, $C, $D, $B
    vperm2i128 \$0x13, $C, $D, $D
    vmovdqa $hlp, $C
___
}

sub poly_stage1_mulx {
$code.=<<___;
    mov 0+$r_store, %rdx
    mov %rdx, $t2
    mulx $acc0, $t0, $t1
    mulx $acc1, %rax, %rdx
    imulq $acc2, $t2
    add %rax, $t1
    adc %rdx, $t2
___
}

sub poly_stage2_mulx {
$code.=<<___;
    mov 8+$r_store, %rdx
    mulx $acc0, $acc0, %rax
    add $acc0, $t1
    mulx $acc1, $acc1, $t3
    adc $acc1, $t2
    adc \$0, $t3
    imulq $acc2, %rdx
___
}

sub poly_stage3_mulx {
$code.=<<___;
    add %rax, $t2
    adc %rdx, $t3
___
}

sub poly_mul_mulx {
    &poly_stage1_mulx();
    &poly_stage2_mulx();
    &poly_stage3_mulx();
    &poly_reduce_stage();
}

sub gen_chacha_round_avx2 {
my ($rot1, $rot2, $shift)=@_;
my $round="";
$round=$round ."vmovdqa $C0, $tmp_store\n" if ($rot1 eq 20);
$round=$round ."vmovdqa $rot2, $C0
                vpaddd $B3, $A3, $A3
                vpaddd $B2, $A2, $A2
                vpaddd $B1, $A1, $A1
                vpaddd $B0, $A0, $A0
                vpxor $A3, $D3, $D3
                vpxor $A2, $D2, $D2
                vpxor $A1, $D1, $D1
                vpxor $A0, $D0, $D0
                vpshufb $C0, $D3, $D3
                vpshufb $C0, $D2, $D2
                vpshufb $C0, $D1, $D1
                vpshufb $C0, $D0, $D0
                vmovdqa $tmp_store, $C0
                vpaddd $D3, $C3, $C3
                vpaddd $D2, $C2, $C2
                vpaddd $D1, $C1, $C1
                vpaddd $D0, $C0, $C0
                vpxor $C3, $B3, $B3
                vpxor $C2, $B2, $B2
                vpxor $C1, $B1, $B1
                vpxor $C0, $B0, $B0
                vmovdqa $C0, $tmp_store
                vpsrld \$$rot1, $B3, $C0
                vpslld \$32-$rot1, $B3, $B3
                vpxor $C0, $B3, $B3
                vpsrld \$$rot1, $B2, $C0
                vpslld \$32-$rot1, $B2, $B2
                vpxor $C0, $B2, $B2
                vpsrld \$$rot1, $B1, $C0
                vpslld \$32-$rot1, $B1, $B1
                vpxor $C0, $B1, $B1
                vpsrld \$$rot1, $B0, $C0
                vpslld \$32-$rot1, $B0, $B0
                vpxor $C0, $B0, $B0\n";
($s1,$s2,$s3)=(4,8,12) if ($shift =~ /left/);
($s1,$s2,$s3)=(12,8,4) if ($shift =~ /right/);
$round=$round ."vmovdqa $tmp_store, $C0
                vpalignr \$$s1, $B3, $B3, $B3
                vpalignr \$$s2, $C3, $C3, $C3
                vpalignr \$$s3, $D3, $D3, $D3
                vpalignr \$$s1, $B2, $B2, $B2
                vpalignr \$$s2, $C2, $C2, $C2
                vpalignr \$$s3, $D2, $D2, $D2
                vpalignr \$$s1, $B1, $B1, $B1
                vpalignr \$$s2, $C1, $C1, $C1
                vpalignr \$$s3, $D1, $D1, $D1
                vpalignr \$$s1, $B0, $B0, $B0
                vpalignr \$$s2, $C0, $C0, $C0
                vpalignr \$$s3, $D0, $D0, $D0\n"
if (($shift =~ /left/) || ($shift =~ /right/));
return $round;
};

$chacha_body = &gen_chacha_round_avx2(20, ".rol16(%rip)") .
               &gen_chacha_round_avx2(25, ".rol8(%rip)", "left") .
               &gen_chacha_round_avx2(20, ".rol16(%rip)") .
               &gen_chacha_round_avx2(25, ".rol8(%rip)", "right");

@loop_body = split /\n/, $chacha_body;

$code.="
###############################################################################
.type chacha20_poly1305_open_avx2,\@function,2
.align 64
chacha20_poly1305_open_avx2:
    vzeroupper
    vmovdqa .chacha20_consts(%rip), $A0
    vbroadcasti128 0*16($keyp), $B0
    vbroadcasti128 1*16($keyp), $C0
    vbroadcasti128 2*16($keyp), $D0
    vpaddd .avx2_init(%rip), $D0, $D0
    cmp \$6*32, $inl
    jbe open_avx2_192
    cmp \$10*32, $inl
    jbe open_avx2_320

    vmovdqa $B0, $state1_store
    vmovdqa $C0, $state2_store
    vmovdqa $D0, $ctr0_store
    mov \$10, $acc0
1:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
        dec $acc0
    jne 1b
    vpaddd .chacha20_consts(%rip), $A0, $A0
    vpaddd $state1_store, $B0, $B0
    vpaddd $state2_store, $C0, $C0
    vpaddd $ctr0_store, $D0, $D0

    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store key
    vpand .clamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for the first 64 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    # Hash AD + first 64 bytes
    mov %r8, $itr2
    call poly_hash_ad_internal
    xor $itr1, $itr1
    # Hash first 64 bytes
1:  \n";
       &poly_add("0($inp, $itr1)");
       &poly_mul(); $code.="
       add \$16, $itr1
       cmp \$2*32, $itr1
    jne 1b
    # Decrypt first 64 bytes
    vpxor 0*32($inp), $A0, $A0
    vpxor 1*32($inp), $B0, $B0
    vmovdqu $A0, 0*32($oup)
    vmovdqu $B0, 1*32($oup)
    lea 2*32($inp), $inp
    lea 2*32($oup), $oup
    sub \$2*32, $inl
1:
        # Hash and decrypt 512 bytes each iteration
        cmp \$16*32, $inl
        jb 3f\n";
        &prep_state_avx2(4); $code.="
        xor $itr1, $itr1
2:  \n";
            &poly_add("0*8($inp, $itr1)");
            &emit_body(10);
            &poly_stage1_mulx();
            &emit_body(9);
            &poly_stage2_mulx();
            &emit_body(12);
            &poly_stage3_mulx();
            &emit_body(10);
            &poly_reduce_stage();
            &emit_body(9);
            &poly_add("2*8($inp, $itr1)");
            &emit_body(8);
            &poly_stage1_mulx();
            &emit_body(18);
            &poly_stage2_mulx();
            &emit_body(18);
            &poly_stage3_mulx();
            &emit_body(9);
            &poly_reduce_stage();
            &emit_body(8);
            &poly_add("4*8($inp, $itr1)"); $code.="
            lea 6*8($itr1), $itr1\n";
            &emit_body(18);
            &poly_stage1_mulx();
            &emit_body(8);
            &poly_stage2_mulx();
            &emit_body(8);
            &poly_stage3_mulx();
            &emit_body(18);
            &poly_reduce_stage();
            foreach $l (@loop_body) {$code.=$l."\n";}
            @loop_body = split /\n/, $chacha_body; $code.="
            cmp \$10*6*8, $itr1
        jne 2b\n";
        &finalize_state_avx2(4); $code.="
        vmovdqa $A0, $tmp_store\n";
        &poly_add("10*6*8($inp)");
        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
        vmovdqa $tmp_store, $A0\n";
        &poly_mul();
        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
        &poly_add("10*6*8+2*8($inp)");
        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
        &poly_mul();
        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
        lea 16*32($inp), $inp
        lea 16*32($oup), $oup
        sub \$16*32, $inl
    jmp 1b
3:
    test $inl, $inl
    vzeroupper
    je open_sse_finalize
3:
    cmp \$4*32, $inl
    ja 3f\n";
###############################################################################
    # 1-128 bytes left
    &prep_state_avx2(1); $code.="
    xor $itr2, $itr2
    mov $inl, $itr1
    and \$-16, $itr1
    test $itr1, $itr1
    je 2f
1:  \n";
        &poly_add("0*8($inp, $itr2)");
        &poly_mul(); $code.="
2:
        add \$16, $itr2\n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
        cmp $itr1, $itr2
    jb 1b
        cmp \$160, $itr2
    jne 2b\n";
    &finalize_state_avx2(1);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    jmp open_avx2_tail_loop
3:
    cmp \$8*32, $inl
    ja 3f\n";
###############################################################################
    # 129-256 bytes left
    &prep_state_avx2(2); $code.="
    mov $inl, $tmp_store
    mov $inl, $itr1
    sub \$4*32, $itr1
    shr \$4, $itr1
    mov \$10, $itr2
    cmp \$10, $itr1
    cmovg $itr2, $itr1
    mov $inp, $inl
    xor $itr2, $itr2
1:  \n";
        &poly_add("0*8($inl)");
        &poly_mul_mulx(); $code.="
        lea 16($inl), $inl
2:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left"); $code.="
        inc $itr2\n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        cmp $itr1, $itr2
    jb 1b
        cmp \$10, $itr2
    jne 2b
    mov $inl, $itr2
    sub $inp, $inl
    mov $inl, $itr1
    mov $tmp_store, $inl
1:
        add \$16, $itr1
        cmp $inl, $itr1
        jg 1f\n";
        &poly_add("0*8($itr2)");
        &poly_mul_mulx(); $code.="
        lea 16($itr2), $itr2
    jmp 1b
1:  \n";
    &finalize_state_avx2(2);
    &xor_stream_avx2($A1, $B1, $C1, $D1, 0*32, $T0);
    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
    lea 4*32($inp), $inp
    lea 4*32($oup), $oup
    sub \$4*32, $inl
    jmp open_avx2_tail_loop
3:
    cmp \$12*32, $inl
    ja 3f\n";
###############################################################################
    # 257-383 bytes left
    &prep_state_avx2(3); $code.="
    mov $inl, $tmp_store
    mov $inl, $itr1
    sub \$8*32, $itr1
    shr \$4, $itr1
    add \$6, $itr1
    mov \$10, $itr2
    cmp \$10, $itr1
    cmovg $itr2, $itr1
    mov $inp, $inl
    xor $itr2, $itr2
1:  \n";
        &poly_add("0*8($inl)");
        &poly_mul_mulx(); $code.="
        lea 16($inl), $inl
2:  \n";
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &poly_add("0*8($inl)");
        &poly_mul(); $code.="
        lea 16($inl), $inl
        inc $itr2\n";
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right"); $code.="
        cmp $itr1, $itr2
    jb 1b
        cmp \$10, $itr2
    jne 2b
    mov $inl, $itr2
    sub $inp, $inl
    mov $inl, $itr1
    mov $tmp_store, $inl
1:
        add \$16, $itr1
        cmp $inl, $itr1
        jg 1f\n";
        &poly_add("0*8($itr2)");
        &poly_mul_mulx(); $code.="
        lea 16($itr2), $itr2
    jmp 1b
1:  \n";
    &finalize_state_avx2(3);
    &xor_stream_avx2($A2, $B2, $C2, $D2, 0*32, $T0);
    &xor_stream_avx2($A1, $B1, $C1, $D1, 4*32, $T0);
    &finish_stream_avx2($A0, $B0, $C0, $D0, $T0); $code.="
    lea 8*32($inp), $inp
    lea 8*32($oup), $oup
    sub \$8*32, $inl
    jmp open_avx2_tail_loop
3:  \n";
###############################################################################
    # 384-512 bytes left
    &prep_state_avx2(4); $code.="
    xor $itr1, $itr1
    mov $inp, $itr2
1:  \n";
        &poly_add("0*8($itr2)");
        &poly_mul(); $code.="
        lea 2*8($itr2), $itr2
2:  \n";
        &emit_body(37);
        &poly_add("0*8($itr2)");
        &poly_mul_mulx();
        &emit_body(48);
        &poly_add("2*8($itr2)");
        &poly_mul_mulx(); $code.="
        lea 4*8($itr2), $itr2\n";
        foreach $l (@loop_body) {$code.=$l."\n";}
        @loop_body = split /\n/, $chacha_body; $code.="
        inc $itr1
        cmp \$4, $itr1
    jl  1b
        cmp \$10, $itr1
    jne 2b
    mov $inl, $itr1
    sub \$12*32, $itr1
    and \$-16, $itr1
1:
        test $itr1, $itr1
        je 1f\n";
        &poly_add("0*8($itr2)");
        &poly_mul_mulx(); $code.="
        lea 2*8($itr2), $itr2
        sub \$2*8, $itr1
    jmp 1b
1:  \n";
    &finalize_state_avx2(4); $code.="
    vmovdqa $A0, $tmp_store\n";
    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
    vmovdqa $tmp_store, $A0\n";
    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
    &finish_stream_avx2($A0, $B0, $C0, $D0, $A3); $code.="
    lea 12*32($inp), $inp
    lea 12*32($oup), $oup
    sub \$12*32, $inl
open_avx2_tail_loop:
    cmp \$32, $inl
    jb open_avx2_tail
        sub \$32, $inl
        vpxor ($inp), $A0, $A0
        vmovdqu $A0, ($oup)
        lea 1*32($inp), $inp
        lea 1*32($oup), $oup
        vmovdqa $B0, $A0
        vmovdqa $C0, $B0
        vmovdqa $D0, $C0
    jmp open_avx2_tail_loop
open_avx2_tail:
    cmp \$16, $inl
    vmovdqa $A0x, $A1x
    jb 1f
    sub \$16, $inl
    #load for decryption
    vpxor ($inp), $A0x, $A1x
    vmovdqu $A1x, ($oup)
    lea 1*16($inp), $inp
    lea 1*16($oup), $oup
    vperm2i128 \$0x11, $A0, $A0, $A0
    vmovdqa $A0x, $A1x
1:
    vzeroupper
    jmp open_sse_tail_16
###############################################################################
open_avx2_192:
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .avx2_inc(%rip), $D0, $D1
    vmovdqa $D0, $T2
    vmovdqa $D1, $T3
    mov \$10, $acc0
1:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
        dec $acc0
    jne 1b
    vpaddd $A2, $A0, $A0
    vpaddd $A2, $A1, $A1
    vpaddd $B2, $B0, $B0
    vpaddd $B2, $B1, $B1
    vpaddd $C2, $C0, $C0
    vpaddd $C2, $C1, $C1
    vpaddd $T2, $D0, $D0
    vpaddd $T3, $D1, $D1
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .clamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for up to 192 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
open_avx2_short:
    mov %r8, $itr2
    call poly_hash_ad_internal
open_avx2_hash_and_xor_loop:
        cmp \$32, $inl
        jb open_avx2_short_tail_32
        sub \$32, $inl\n";
        # Load + hash
        &poly_add("0*8($inp)");
        &poly_mul();
        &poly_add("2*8($inp)");
        &poly_mul(); $code.="
        # Load + decrypt
        vpxor ($inp), $A0, $A0
        vmovdqu $A0, ($oup)
        lea 1*32($inp), $inp
        lea 1*32($oup), $oup
        # Shift stream
        vmovdqa $B0, $A0
        vmovdqa $C0, $B0
        vmovdqa $D0, $C0
        vmovdqa $A1, $D0
        vmovdqa $B1, $A1
        vmovdqa $C1, $B1
        vmovdqa $D1, $C1
        vmovdqa $A2, $D1
        vmovdqa $B2, $A2
    jmp open_avx2_hash_and_xor_loop
open_avx2_short_tail_32:
    cmp \$16, $inl
    vmovdqa $A0x, $A1x
    jb 1f
    sub \$16, $inl\n";
    &poly_add("0*8($inp)");
    &poly_mul(); $code.="
    vpxor ($inp), $A0x, $A3x
    vmovdqu $A3x, ($oup)
    lea 1*16($inp), $inp
    lea 1*16($oup), $oup
    vextracti128 \$1, $A0, $A1x
1:
    vzeroupper
    jmp open_sse_tail_16
###############################################################################
open_avx2_320:
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .avx2_inc(%rip), $D0, $D1
    vpaddd .avx2_inc(%rip), $D1, $D2
    vmovdqa $B0, $T1
    vmovdqa $C0, $T2
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
    mov \$10, $acc0
1:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        dec $acc0
    jne 1b
    vpaddd .chacha20_consts(%rip), $A0, $A0
    vpaddd .chacha20_consts(%rip), $A1, $A1
    vpaddd .chacha20_consts(%rip), $A2, $A2
    vpaddd $T1, $B0, $B0
    vpaddd $T1, $B1, $B1
    vpaddd $T1, $B2, $B2
    vpaddd $T2, $C0, $C0
    vpaddd $T2, $C1, $C1
    vpaddd $T2, $C2, $C2
    vpaddd $ctr0_store, $D0, $D0
    vpaddd $ctr1_store, $D1, $D1
    vpaddd $ctr2_store, $D2, $D2
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .clamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for up to 320 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
    vperm2i128 \$0x02, $A2, $B2, $C1
    vperm2i128 \$0x02, $C2, $D2, $D1
    vperm2i128 \$0x13, $A2, $B2, $A2
    vperm2i128 \$0x13, $C2, $D2, $B2
    jmp open_avx2_short
.size chacha20_poly1305_open_avx2, .-chacha20_poly1305_open_avx2
###############################################################################
###############################################################################
.type chacha20_poly1305_seal_avx2,\@function,2
.align 64
chacha20_poly1305_seal_avx2:
    vzeroupper
    vmovdqa .chacha20_consts(%rip), $A0
    vbroadcasti128 0*16($keyp), $B0
    vbroadcasti128 1*16($keyp), $C0
    vbroadcasti128 2*16($keyp), $D0
    vpaddd .avx2_init(%rip), $D0, $D0
    cmp \$6*32, $inl
    jbe seal_avx2_192
    cmp \$10*32, $inl
    jbe seal_avx2_320
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $A0, $A3
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $B0, $B3
    vmovdqa $B0, $state1_store
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vmovdqa $C0, $C3
    vmovdqa $C0, $state2_store
    vmovdqa $D0, $D3
    vpaddd .avx2_inc(%rip), $D3, $D2
    vpaddd .avx2_inc(%rip), $D2, $D1
    vpaddd .avx2_inc(%rip), $D1, $D0
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
    vmovdqa $D3, $ctr3_store
    mov \$10, $acc0
1:  \n";
        foreach $l (@loop_body) {$code.=$l."\n";}
        @loop_body = split /\n/, $chacha_body; $code.="
        dec $acc0
        jnz 1b\n";
    &finalize_state_avx2(4); $code.="
    vperm2i128 \$0x13, $C3, $D3, $C3
    vperm2i128 \$0x02, $A3, $B3, $D3
    vperm2i128 \$0x13, $A3, $B3, $A3
    vpand .clamp(%rip), $D3, $D3
    vmovdqa $D3, $r_store
    mov %r8, $itr2
    call poly_hash_ad_internal
    # Safely store 320 bytes (otherwise would handle with optimized call)
    vpxor 0*32($inp), $A3, $A3
    vpxor 1*32($inp), $C3, $C3
    vmovdqu $A3, 0*32($oup)
    vmovdqu $C3, 1*32($oup)\n";
    &xor_stream_avx2($A2,$B2,$C2,$D2,2*32,$T3);
    &xor_stream_avx2($A1,$B1,$C1,$D1,6*32,$T3);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T3); $code.="
    lea 10*32($inp), $inp
    sub \$10*32, $inl
    mov \$10*32, $itr1
    cmp \$4*32, $inl
    jbe seal_avx2_hash
    vpxor 0*32($inp), $A0, $A0
    vpxor 1*32($inp), $B0, $B0
    vpxor 2*32($inp), $C0, $C0
    vpxor 3*32($inp), $D0, $D0
    vmovdqu $A0, 10*32($oup)
    vmovdqu $B0, 11*32($oup)
    vmovdqu $C0, 12*32($oup)
    vmovdqu $D0, 13*32($oup)
    lea 4*32($inp), $inp
    sub \$4*32, $inl
    mov \$8, $itr1
    mov \$2, $itr2
    cmp \$4*32, $inl
    jbe seal_avx2_tail_128
    cmp \$8*32, $inl
    jbe seal_avx2_tail_256
    cmp \$12*32, $inl
    jbe seal_avx2_tail_384
    cmp \$16*32, $inl
    jbe seal_avx2_tail_512\n";
    # We have 448 bytes to hash, but main loop hashes 512 bytes at a time - perform some rounds, before the main loop
    &prep_state_avx2(4);
    foreach $l (@loop_body) {$code.=$l."\n";}
    @loop_body = split /\n/, $chacha_body;
    &emit_body(41);
    @loop_body = split /\n/, $chacha_body; $code.="
    sub \$16, $oup
    mov \$9, $itr1
    jmp 4f
1:  \n";
        &prep_state_avx2(4); $code.="
        mov \$10, $itr1
2:  \n";
            &poly_add("0*8($oup)");
            &emit_body(10);
            &poly_stage1_mulx();
            &emit_body(9);
            &poly_stage2_mulx();
            &emit_body(12);
            &poly_stage3_mulx();
            &emit_body(10);
            &poly_reduce_stage(); $code.="
4:  \n";
            &emit_body(9);
            &poly_add("2*8($oup)");
            &emit_body(8);
            &poly_stage1_mulx();
            &emit_body(18);
            &poly_stage2_mulx();
            &emit_body(18);
            &poly_stage3_mulx();
            &emit_body(9);
            &poly_reduce_stage();
            &emit_body(8);
            &poly_add("4*8($oup)"); $code.="
            lea 6*8($oup), $oup\n";
            &emit_body(18);
            &poly_stage1_mulx();
            &emit_body(8);
            &poly_stage2_mulx();
            &emit_body(8);
            &poly_stage3_mulx();
            &emit_body(18);
            &poly_reduce_stage();
            foreach $l (@loop_body) {$code.=$l."\n";}
            @loop_body = split /\n/, $chacha_body; $code.="
            dec $itr1
        jne 2b\n";
        &finalize_state_avx2(4); $code.="
        lea 4*8($oup), $oup
        vmovdqa $A0, $tmp_store\n";
        &poly_add("-4*8($oup)");
        &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
        vmovdqa $tmp_store, $A0\n";
        &poly_mul();
        &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
        &poly_add("-2*8($oup)");
        &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
        &poly_mul();
        &xor_stream_avx2($A0, $B0, $C0, $D0, 12*32, $A3); $code.="
        lea 16*32($inp), $inp
        sub \$16*32, $inl
        cmp \$16*32, $inl
    jg 1b\n";
    &poly_add("0*8($oup)");
    &poly_mul();
    &poly_add("2*8($oup)");
    &poly_mul(); $code.="
    lea 4*8($oup), $oup
    mov \$10, $itr1
    xor $itr2, $itr2
    cmp \$4*32, $inl
    ja 3f
###############################################################################
seal_avx2_tail_128:\n";
    &prep_state_avx2(1); $code.="
1:  \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 2*8($oup), $oup
2:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &poly_add("0*8($oup)");
        &poly_mul();
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &poly_add("2*8($oup)");
        &poly_mul(); $code.="
        lea 4*8($oup), $oup
        dec $itr1
    jg 1b
        dec $itr2
    jge 2b\n";
    &finalize_state_avx2(1);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    jmp seal_avx2_short_loop
3:
    cmp \$8*32, $inl
    ja 3f
###############################################################################
seal_avx2_tail_256:\n";
    &prep_state_avx2(2); $code.="
1:  \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 2*8($oup), $oup
2:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &poly_add("0*8($oup)");
        &poly_mul();
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
        &poly_add("2*8($oup)");
        &poly_mul(); $code.="
        lea 4*8($oup), $oup
        dec $itr1
    jg 1b
        dec $itr2
    jge 2b\n";
    &finalize_state_avx2(2);
    &xor_stream_avx2($A1,$B1,$C1,$D1,0*32,$T0);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    mov \$4*32, $itr1
    lea 4*32($inp), $inp
    sub \$4*32, $inl
    jmp seal_avx2_hash
3:
    cmp \$12*32, $inl
    ja seal_avx2_tail_512
###############################################################################
seal_avx2_tail_384:\n";
    &prep_state_avx2(3); $code.="
1:  \n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        lea 2*8($oup), $oup
2:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &poly_add("0*8($oup)");
        &poly_mul();
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &poly_add("2*8($oup)");
        &poly_mul();
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        lea 4*8($oup), $oup
        dec $itr1
    jg 1b
        dec $itr2
    jge 2b\n";
    &finalize_state_avx2(3);
    &xor_stream_avx2($A2,$B2,$C2,$D2,0*32,$T0);
    &xor_stream_avx2($A1,$B1,$C1,$D1,4*32,$T0);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    mov \$8*32, $itr1
    lea 8*32($inp), $inp
    sub \$8*32, $inl
    jmp seal_avx2_hash
###############################################################################
seal_avx2_tail_512:\n";
    &prep_state_avx2(4); $code.="
1:  \n";
        &poly_add("0($oup)");
        &poly_mul_mulx(); $code.="
        lea 2*8($oup), $oup
2:  \n";
        &emit_body(20);
        &poly_add("0*8($oup)");
        &emit_body(20);
        &poly_stage1_mulx();
        &emit_body(20);
        &poly_stage2_mulx();
        &emit_body(20);
        &poly_stage3_mulx();
        &emit_body(20);
        &poly_reduce_stage();
        &emit_body(20);
        &poly_add("2*8($oup)");
        &emit_body(20);
        &poly_stage1_mulx();
        &emit_body(20);
        &poly_stage2_mulx();
        &emit_body(20);
        &poly_stage3_mulx();
        &emit_body(20);
        &poly_reduce_stage();
        foreach $l (@loop_body) {$code.=$l."\n";}
        @loop_body = split /\n/, $chacha_body; $code.="
        lea 4*8($oup), $oup
        dec $itr1
    jg 1b
        dec $itr2
    jge 2b\n";
    &finalize_state_avx2(4); $code.="
    vmovdqa $A0, $tmp_store\n";
    &xor_stream_avx2($A3, $B3, $C3, $D3, 0*32, $A0); $code.="
    vmovdqa $tmp_store, $A0\n";
    &xor_stream_avx2($A2, $B2, $C2, $D2, 4*32, $A3);
    &xor_stream_avx2($A1, $B1, $C1, $D1, 8*32, $A3);
    &finish_stream_avx2($A0,$B0,$C0,$D0,$T0); $code.="
    mov \$12*32, $itr1
    lea 12*32($inp), $inp
    sub \$12*32, $inl
    jmp seal_avx2_hash
################################################################################
seal_avx2_320:
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .avx2_inc(%rip), $D0, $D1
    vpaddd .avx2_inc(%rip), $D1, $D2
    vmovdqa $B0, $T1
    vmovdqa $C0, $T2
    vmovdqa $D0, $ctr0_store
    vmovdqa $D1, $ctr1_store
    vmovdqa $D2, $ctr2_store
    mov \$10, $acc0
1:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right");
        &chacha_qr_avx2($A2,$B2,$C2,$D2,$T0,"right"); $code.="
        dec $acc0
    jne 1b
    vpaddd .chacha20_consts(%rip), $A0, $A0
    vpaddd .chacha20_consts(%rip), $A1, $A1
    vpaddd .chacha20_consts(%rip), $A2, $A2
    vpaddd $T1, $B0, $B0
    vpaddd $T1, $B1, $B1
    vpaddd $T1, $B2, $B2
    vpaddd $T2, $C0, $C0
    vpaddd $T2, $C1, $C1
    vpaddd $T2, $C2, $C2
    vpaddd $ctr0_store, $D0, $D0
    vpaddd $ctr1_store, $D1, $D1
    vpaddd $ctr2_store, $D2, $D2
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .clamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for up to 320 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
    vperm2i128 \$0x02, $A2, $B2, $C1
    vperm2i128 \$0x02, $C2, $D2, $D1
    vperm2i128 \$0x13, $A2, $B2, $A2
    vperm2i128 \$0x13, $C2, $D2, $B2
    jmp seal_avx2_short
################################################################################
seal_avx2_192:
    vmovdqa $A0, $A1
    vmovdqa $A0, $A2
    vmovdqa $B0, $B1
    vmovdqa $B0, $B2
    vmovdqa $C0, $C1
    vmovdqa $C0, $C2
    vpaddd .avx2_inc(%rip), $D0, $D1
    vmovdqa $D0, $T2
    vmovdqa $D1, $T3
    mov \$10, $acc0
1:  \n";
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"left");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"left");
        &chacha_qr_avx2($A0,$B0,$C0,$D0,$T0,"right");
        &chacha_qr_avx2($A1,$B1,$C1,$D1,$T0,"right"); $code.="
        dec $acc0
    jne 1b
    vpaddd $A2, $A0, $A0
    vpaddd $A2, $A1, $A1
    vpaddd $B2, $B0, $B0
    vpaddd $B2, $B1, $B1
    vpaddd $C2, $C0, $C0
    vpaddd $C2, $C1, $C1
    vpaddd $T2, $D0, $D0
    vpaddd $T3, $D1, $D1
    vperm2i128 \$0x02, $A0, $B0, $T0
    # Clamp and store the key
    vpand .clamp(%rip), $T0, $T0
    vmovdqa $T0, $r_store
    # Stream for up to 192 bytes
    vperm2i128 \$0x13, $A0, $B0, $A0
    vperm2i128 \$0x13, $C0, $D0, $B0
    vperm2i128 \$0x02, $A1, $B1, $C0
    vperm2i128 \$0x02, $C1, $D1, $D0
    vperm2i128 \$0x13, $A1, $B1, $A1
    vperm2i128 \$0x13, $C1, $D1, $B1
seal_avx2_short:
    mov %r8, $itr2
    call poly_hash_ad_internal
    xor $itr1, $itr1
seal_avx2_hash:
        cmp \$16, $itr1
        jb seal_avx2_short_loop\n";
        &poly_add("0($oup)");
        &poly_mul(); $code.="
        sub \$16, $itr1
        add \$16, $oup
    jmp seal_avx2_hash
seal_avx2_short_loop:
        cmp \$32, $inl
        jb seal_avx2_short_tail
        sub \$32, $inl
        # Encrypt
        vpxor ($inp), $A0, $A0
        vmovdqu $A0, ($oup)
        lea 1*32($inp), $inp
        # Load + hash\n";
        &poly_add("0*8($oup)");
        &poly_mul();
        &poly_add("2*8($oup)");
        &poly_mul(); $code.="
        lea 1*32($oup), $oup
        # Shift stream
        vmovdqa $B0, $A0
        vmovdqa $C0, $B0
        vmovdqa $D0, $C0
        vmovdqa $A1, $D0
        vmovdqa $B1, $A1
        vmovdqa $C1, $B1
        vmovdqa $D1, $C1
        vmovdqa $A2, $D1
        vmovdqa $B2, $A2
    jmp seal_avx2_short_loop
seal_avx2_short_tail:
    cmp \$16, $inl
    jb 1f
    sub \$16, $inl
    vpxor ($inp), $A0x, $A3x
    vmovdqu $A3x, ($oup)
    lea 1*16($inp), $inp\n";
    &poly_add("0*8($oup)");
    &poly_mul(); $code.="
    lea 1*16($oup), $oup
    vextracti128 \$1, $A0, $A0x
1:
    vzeroupper
    jmp seal_sse_tail_16
.cfi_endproc
";
}

if (!$win64) {
  $code =~ s/\`([^\`]*)\`/eval $1/gem;
  print $code;
} else {
  print <<___;
.globl dummy_chacha20_poly1305_asm
.type dummy_chacha20_poly1305_asm,\@abi-omnipotent
dummy_chacha20_poly1305_asm:
    ret
___
}

close STDOUT;