|
- #!/usr/bin/env perl
-
- # Copyright (c) 2017, Shay Gueron.
- # Copyright (c) 2017, Google Inc.
- #
- # Permission to use, copy, modify, and/or distribute this software for any
- # purpose with or without fee is hereby granted, provided that the above
- # copyright notice and this permission notice appear in all copies.
- #
- # THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
- # WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
- # MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
- # SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
- # WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
- # OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
- # CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */
-
- use warnings FATAL => 'all';
-
- # Standard perlasm preamble, invoked as "script.pl [flavour] output".
- # A first argument containing a '.' is taken to be the output file, which
- # leaves the flavour unset.
- $flavour = shift;
- $output = shift;
- if (defined $flavour && $flavour =~ /\./) { $output = $flavour; undef $flavour; }
- die "usage: $0 [flavour] output\n" unless defined $output;
-
- # Warnings are fatal in this file, so every use of $flavour is guarded with
- # defined(); otherwise an unset flavour aborts with an unrelated
- # "uninitialized value" message.
- $win64=0;
- $win64=1 if ((defined $flavour && $flavour =~ /[nm]asm|mingw64/) || $output =~ /\.asm$/);
-
- # Directory part of this script's path, or "" when the script was invoked by
- # bare file name (the pattern then fails and $1 must not be consulted).
- $dir = ($0 =~ m/(.*[\/\\])[^\/\\]+$/) ? $1 : "";
- ( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
- ( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
- die "can't locate x86_64-xlate.pl";
-
- # Everything printed to STDOUT from here on is piped through the translator.
- # Fail loudly if the pipe cannot be started instead of silently discarding
- # the generated code.
- {
- my $flavour_arg = defined $flavour ? $flavour : "";
- open OUT,"| \"$^X\" \"$xlate\" $flavour_arg \"$output\""
- or die "can't call $xlate: $!";
- }
- *STDOUT=*OUT;
-
- # Constants emitted into the .data section. The routines in this file
- # reference them RIP-relative.
- #
- # one .. eight 128-bit values whose low quadword is 1..8; used as vpaddd
- # addends to step the counter blocks.
- # OR_MASK only the top bit of the fourth dword is set; OR-ed into the
- # tag to build the initial counter block.
- # poly second operand of the vpclmulqdq reduction steps.
- # mask vpshufb control used by the key-schedule routines.
- # con1, con2 values 1 and 0x1b replicated in all four dwords; loaded as
- # the round-constant operand of vaesenclast in the key
- # schedules.
- # con3 vpshufb control used by the key-schedule routines (-1
- # entries produce zero bytes).
- # and_mask all-ones except the first dword; used by the KDF to clear
- # the counter dword.
- $code.=<<___;
- .data
-
- .align 16
- one:
- .quad 1,0
- two:
- .quad 2,0
- three:
- .quad 3,0
- four:
- .quad 4,0
- five:
- .quad 5,0
- six:
- .quad 6,0
- seven:
- .quad 7,0
- eight:
- .quad 8,0
-
- OR_MASK:
- .long 0x00000000,0x00000000,0x00000000,0x80000000
- poly:
- .quad 0x1, 0xc200000000000000
- mask:
- .long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
- con1:
- .long 1,1,1,1
- con2:
- .long 0x1b,0x1b,0x1b,0x1b
- con3:
- .byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
- and_mask:
- .long 0,0xffffffff, 0xffffffff, 0xffffffff
- ___
-
- # All code emitted after this point goes into the .text section.
- $code.=<<___;
- .text
- ___
-
- sub gfmul {
- # Emits GFMUL, a file-local helper with a register-based calling convention:
- # in: %xmm0 = A, %xmm1 = B (B is left unchanged)
- # out: %xmm0 = product of A and B, reduced using |poly|
- # Scratch registers: %xmm2 .. %xmm5.
- # Conceptually: __m128i GFMUL(__m128i A, __m128i B);
-
- my ($acc, $mulcand) = ("%xmm0", "%xmm1");
- # Partial products: low half, the two cross terms, high half.
- my ($lo, $mid, $midtmp, $hi) = ("%xmm2", "%xmm3", "%xmm4", "%xmm5");
-
- $code.=<<___;
- .type GFMUL,\@abi-omnipotent
- .align 16
- GFMUL:
- .cfi_startproc
- vpclmulqdq \$0x00, $mulcand, $acc, $lo
- vpclmulqdq \$0x11, $mulcand, $acc, $hi
- vpclmulqdq \$0x10, $mulcand, $acc, $mid
- vpclmulqdq \$0x01, $mulcand, $acc, $midtmp
- vpxor $midtmp, $mid, $mid
- vpslldq \$8, $mid, $midtmp
- vpsrldq \$8, $mid, $mid
- vpxor $midtmp, $lo, $lo
- vpxor $mid, $hi, $hi
-
- vpclmulqdq \$0x10, poly(%rip), $lo, $mid
- vpshufd \$78, $lo, $midtmp
- vpxor $midtmp, $mid, $lo
-
- vpclmulqdq \$0x10, poly(%rip), $lo, $mid
- vpshufd \$78, $lo, $midtmp
- vpxor $midtmp, $mid, $lo
-
- vpxor $hi, $lo, $acc
- ret
- .cfi_endproc
- .size GFMUL, .-GFMUL
- ___
- }
- gfmul();
-
- sub aesgcmsiv_htable_init {
- # Emits aesgcmsiv_htable_init, which fills |out_htable| with eight 16-byte
- # entries: H, H^2, ..., H^8 (each obtained from the previous one via GFMUL).
- # void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
-
- my ($table, $key_ptr) = ("%rdi", "%rsi");
- # %xmm0 carries the running power; %xmm1 keeps H as GFMUL's fixed operand.
- my ($power_reg, $h_reg) = ("%xmm0", "%xmm1");
-
- # Prologue: load H, duplicate it for GFMUL and store the first entry.
- $code.=<<___;
- .globl aesgcmsiv_htable_init
- .type aesgcmsiv_htable_init,\@function,2
- .align 16
- aesgcmsiv_htable_init:
- .cfi_startproc
- vmovdqa ($key_ptr), $power_reg
- vmovdqa $power_reg, $h_reg
- vmovdqa $power_reg, ($table) # H
- ___
-
- # Entries 2..8: multiply by H once more and store at the next 16-byte slot.
- foreach my $exp (2 .. 8) {
- my $disp = 16 * ($exp - 1);
- $code.=<<___;
- call GFMUL
- vmovdqa $power_reg, $disp($table) # H^$exp
- ___
- }
-
- $code.=<<___;
- ret
- .cfi_endproc
- .size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
- ___
- }
- aesgcmsiv_htable_init();
-
- sub aesgcmsiv_htable6_init {
- # Emits aesgcmsiv_htable6_init, the six-entry variant of the table builder:
- # |out_htable| receives H, H^2, ..., H^6, 16 bytes each.
- # void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
-
- my ($table, $key_ptr) = ("%rdi", "%rsi");
- # %xmm0 carries the running power; %xmm1 keeps H as GFMUL's fixed operand.
- my ($power_reg, $h_reg) = ("%xmm0", "%xmm1");
-
- # Prologue: load H, duplicate it for GFMUL and store the first entry.
- $code.=<<___;
- .globl aesgcmsiv_htable6_init
- .type aesgcmsiv_htable6_init,\@function,2
- .align 16
- aesgcmsiv_htable6_init:
- .cfi_startproc
- vmovdqa ($key_ptr), $power_reg
- vmovdqa $power_reg, $h_reg
- vmovdqa $power_reg, ($table) # H
- ___
-
- # Entries 2..6: multiply by H once more and store at the next 16-byte slot.
- foreach my $exp (2 .. 6) {
- my $disp = 16 * ($exp - 1);
- $code.=<<___;
- call GFMUL
- vmovdqa $power_reg, $disp($table) # H^$exp
- ___
- }
-
- $code.=<<___;
- ret
- .cfi_endproc
- .size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
- ___
- }
- aesgcmsiv_htable6_init();
-
- sub aesgcmsiv_htable_polyval {
- # Emits aesgcmsiv_htable_polyval, which updates the POLYVAL accumulator at |T|
- # with |LEN| bytes of input, using a table of powers of H.
- #
- # void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
- # parameter 1: %rdi Htable - pointer to Htable
- # parameter 2: %rsi INp - pointer to input
- # parameter 3: %rdx LEN - length of BUFFER in bytes
- # parameter 4: %rcx T - pointer to POLYVAL output
- #
- # Shape of the emitted routine:
- # * returns immediately when LEN is zero;
- # * first consumes LEN mod 128 bytes (the "prefix"), one block at a time;
- # * then consumes 128 bytes (8 blocks) per main-loop iteration, with the
- # reduction of the previous iteration's result interleaved between the
- # multiplications of the current one;
- # * performs the final reduction and stores the result back to |T|.
- # |T| is loaded with vmovdqa, so it must be 16-byte aligned; the input is
- # read with vmovdqu.
-
- my $DATA = "%xmm0";
- my $hlp0 = "%r11";
- my $Htbl = "%rdi";
- my $inp = "%rsi";
- my $len = "%rdx";
- my $TMP0 = "%xmm3";
- my $TMP1 = "%xmm4";
- my $TMP2 = "%xmm5";
- my $TMP3 = "%xmm6";
- my $TMP4 = "%xmm7";
- my $Tp = "%rcx";
- my $T = "%xmm1";
- my $Xhi = "%xmm9";
-
- # $SCHOOLBOOK_AAD->($i) returns the code that multiplies $DATA by the table
- # entry at byte offset 16*$i and accumulates the four partial products:
- # low halves into $TMP0, high halves into $TMP1 and both cross terms into
- # $TMP2. $TMP3 is scratch.
- my $SCHOOLBOOK_AAD = sub {
- my ($i)=@_;
- return <<___;
- vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
- vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
- vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
- vpxor $TMP3, $TMP1, $TMP1
- vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
- ___
- };
-
- $code.=<<___;
- .globl aesgcmsiv_htable_polyval
- .type aesgcmsiv_htable_polyval,\@function,4
- .align 16
- aesgcmsiv_htable_polyval:
- .cfi_startproc
- test $len, $len
- jnz .Lhtable_polyval_start
- ret
-
- .Lhtable_polyval_start:
- vzeroall
-
- # We hash 8 blocks each iteration. If the total number of blocks is not a
- # multiple of 8, we first hash the leading n%8 blocks.
- movq $len, $hlp0
- andq \$127, $hlp0
-
- jz .Lhtable_polyval_no_prefix
-
- vpxor $Xhi, $Xhi, $Xhi
- vmovdqa ($Tp), $T
- sub $hlp0, $len
-
- sub \$16, $hlp0
-
- # hash first prefix block
- vmovdqu ($inp), $DATA
- vpxor $T, $DATA, $DATA
-
- vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
- vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
- vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
- vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
-
- lea 16($inp), $inp
- test $hlp0, $hlp0
- jnz .Lhtable_polyval_prefix_loop
- jmp .Lhtable_polyval_prefix_complete
-
- # hash remaining prefix bocks (up to 7 total prefix blocks)
- .align 64
- .Lhtable_polyval_prefix_loop:
- sub \$16, $hlp0
-
- vmovdqu ($inp), $DATA # next data block
-
- vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
- vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
- vpxor $TMP3, $TMP1, $TMP1
- vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
- vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
-
- test $hlp0, $hlp0
-
- lea 16($inp), $inp
-
- jnz .Lhtable_polyval_prefix_loop
-
- .Lhtable_polyval_prefix_complete:
- vpsrldq \$8, $TMP2, $TMP3
- vpslldq \$8, $TMP2, $TMP2
-
- vpxor $TMP3, $TMP1, $Xhi
- vpxor $TMP2, $TMP0, $T
-
- jmp .Lhtable_polyval_main_loop
-
- .Lhtable_polyval_no_prefix:
- # At this point we know the number of blocks is a multiple of 8. However,
- # the reduction in the main loop includes a multiplication by x^(-128). In
- # order to counter this, the existing tag needs to be multipled by x^128.
- # In practice, this just means that it is loaded into $Xhi, not $T.
- vpxor $T, $T, $T
- vmovdqa ($Tp), $Xhi
-
- .align 64
- .Lhtable_polyval_main_loop:
- sub \$0x80, $len
- jb .Lhtable_polyval_out
-
- vmovdqu 16*7($inp), $DATA # Ii
-
- vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
- vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
- vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
- vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
-
- #########################################################
- vmovdqu 16*6($inp), $DATA
- ${\$SCHOOLBOOK_AAD->(1)}
-
- #########################################################
- vmovdqu 16*5($inp), $DATA
-
- vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a
- vpalignr \$8, $T, $T, $T
-
- ${\$SCHOOLBOOK_AAD->(2)}
-
- vpxor $TMP4, $T, $T # reduction stage 1b
- #########################################################
- vmovdqu 16*4($inp), $DATA
-
- ${\$SCHOOLBOOK_AAD->(3)}
- #########################################################
- vmovdqu 16*3($inp), $DATA
-
- vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a
- vpalignr \$8, $T, $T, $T
-
- ${\$SCHOOLBOOK_AAD->(4)}
-
- vpxor $TMP4, $T, $T # reduction stage 2b
- #########################################################
- vmovdqu 16*2($inp), $DATA
-
- ${\$SCHOOLBOOK_AAD->(5)}
-
- vpxor $Xhi, $T, $T # reduction finalize
- #########################################################
- vmovdqu 16*1($inp), $DATA
-
- ${\$SCHOOLBOOK_AAD->(6)}
- #########################################################
- vmovdqu 16*0($inp), $DATA
- vpxor $T, $DATA, $DATA
-
- ${\$SCHOOLBOOK_AAD->(7)}
- #########################################################
- vpsrldq \$8, $TMP2, $TMP3
- vpslldq \$8, $TMP2, $TMP2
-
- vpxor $TMP3, $TMP1, $Xhi
- vpxor $TMP2, $TMP0, $T
-
- lea 16*8($inp), $inp
- jmp .Lhtable_polyval_main_loop
-
- #########################################################
-
- .Lhtable_polyval_out:
- vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
- vpalignr \$8, $T, $T, $T
- vpxor $TMP3, $T, $T
-
- vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
- vpalignr \$8, $T, $T, $T
- vpxor $TMP3, $T, $T
- vpxor $Xhi, $T, $T
-
- vmovdqu $T, ($Tp)
- vzeroupper
- ret
- .cfi_endproc
- .size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
- ___
- }
- aesgcmsiv_htable_polyval();
-
- sub aesgcmsiv_polyval_horner {
- # Emits aesgcmsiv_polyval_horner, which folds |blocks| 16-byte input blocks
- # into the accumulator at |T|, one block at a time:
- # T = GFMUL(T xor block, H)
- #
- #void aesgcmsiv_polyval_horner(unsigned char T[16], // in/out accumulator
- # const unsigned char* H, // H
- # unsigned char* BUF, // Buffer
- # unsigned int blocks); // Len2
- #
- # parameter 1: %rdi T - pointer to the POLYVAL accumulator (read, then
- # overwritten with the result)
- # parameter 2: %rsi Hp - pointer to H (user key)
- # parameter 3: %rdx INp - pointer to input
- # parameter 4: %rcx L - total number of blocks in input BUFFER
- #
- # The routine returns immediately when the block count is zero.
- # |T| and |H| are accessed with vmovdqa and therefore must be 16-byte
- # aligned.
- # NOTE(review): the prototype comment above says "unsigned int", but the code
- # tests and shifts the full 64-bit %rcx; confirm the C declaration passes a
- # 64-bit count.
- #
- my $T = "%rdi";
- my $Hp = "%rsi";
- my $INp = "%rdx";
- my $L = "%rcx";
- my $LOC = "%r10"; # byte offset of the current block within the input
- my $H = "%xmm1"; # GFMUL's fixed multiplicand
- my $RES = "%xmm0"; # GFMUL's in/out operand
-
- $code.=<<___;
- .globl aesgcmsiv_polyval_horner
- .type aesgcmsiv_polyval_horner,\@function,4
- .align 16
- aesgcmsiv_polyval_horner:
- .cfi_startproc
- test $L, $L
- jnz .Lpolyval_horner_start
- ret
-
- .Lpolyval_horner_start:
- # We will start with L GFMULS for POLYVAL(BIG_BUFFER)
- # RES = GFMUL(RES, H)
-
- xorq $LOC, $LOC
- shlq \$4, $L # L contains number of bytes to process
-
- vmovdqa ($Hp), $H
- vmovdqa ($T), $RES
-
- .Lpolyval_horner_loop:
- vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi
- call GFMUL # RES = RES * H
-
- add \$16, $LOC
- cmp $LOC, $L
- jne .Lpolyval_horner_loop
-
- # calculation of T is complete. RES=T
- vmovdqa $RES, ($T)
- ret
- .cfi_endproc
- .size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
- ___
- }
- aesgcmsiv_polyval_horner();
-
- # Emits aes128gcmsiv_aes_ks, the AES-128 key expansion.
- #
- # void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
- # parameter 1: %rdi
- # parameter 2: %rsi
- #
- # The routine writes 11 round keys (176 bytes) to |out_expanded_key|:
- # * round key 0 is the user key itself;
- # * the loop produces round keys 1..8, starting from the con1 constant and
- # doubling it (vpslld by 1) after every round;
- # * the last two round keys are produced after reloading the constant
- # from con2.
- # |key| is read with vmovdqu; |out_expanded_key| is written with vmovdqa and
- # so must be 16-byte aligned.
- # In the loop, the jne consumes the flags set by the subq at the top of the
- # iteration; the vector instructions in between do not modify flags.
- $code.=<<___;
- .globl aes128gcmsiv_aes_ks
- .type aes128gcmsiv_aes_ks,\@function,2
- .align 16
- aes128gcmsiv_aes_ks:
- .cfi_startproc
- vmovdqu (%rdi), %xmm1 # xmm1 = user key
- vmovdqa %xmm1, (%rsi) # rsi points to output
-
- vmovdqa con1(%rip), %xmm0
- vmovdqa mask(%rip), %xmm15
-
- movq \$8, %rax
-
- .Lks128_loop:
- addq \$16, %rsi # rsi points for next key
- subq \$1, %rax
- vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key
- vaesenclast %xmm0, %xmm2, %xmm2
- vpslld \$1, %xmm0, %xmm0
- vpslldq \$4, %xmm1, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpslldq \$4, %xmm3, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpslldq \$4, %xmm3, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpxor %xmm2, %xmm1, %xmm1
- vmovdqa %xmm1, (%rsi)
- jne .Lks128_loop
-
- vmovdqa con2(%rip), %xmm0
- vpshufb %xmm15, %xmm1, %xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- vpslld \$1, %xmm0, %xmm0
- vpslldq \$4, %xmm1, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpslldq \$4, %xmm3, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpslldq \$4, %xmm3, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpxor %xmm2, %xmm1, %xmm1
- vmovdqa %xmm1, 16(%rsi)
-
- vpshufb %xmm15, %xmm1, %xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- vpslldq \$4, %xmm1, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpslldq \$4, %xmm3, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpslldq \$4, %xmm3, %xmm3
- vpxor %xmm3, %xmm1, %xmm1
- vpxor %xmm2, %xmm1, %xmm1
- vmovdqa %xmm1, 32(%rsi)
- ret
- .cfi_endproc
- .size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
- ___
-
- # Emits aes256gcmsiv_aes_ks, the AES-256 key expansion.
- #
- # void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
- # parameter 1: %rdi
- # parameter 2: %rsi
- #
- # The routine writes 15 round keys (240 bytes) to |out_expanded_key|:
- # * round keys 0 and 1 are the two halves of the 32-byte user key;
- # * each of the six loop iterations produces two more round keys, doubling
- # the round constant (initially con1) once per iteration;
- # * one final round key is produced after the loop.
- # |key| is read with vmovdqu; |out_expanded_key| is written with vmovdqa and
- # so must be 16-byte aligned.
- # In the loop, the jne consumes the flags set by the subq at the top of the
- # iteration; the vector instructions in between do not modify flags.
- #
- # The trailing .size directive closes the function like every other routine
- # in this file.
- $code.=<<___;
- .globl aes256gcmsiv_aes_ks
- .type aes256gcmsiv_aes_ks,\@function,2
- .align 16
- aes256gcmsiv_aes_ks:
- .cfi_startproc
- vmovdqu (%rdi), %xmm1
- vmovdqu 16(%rdi), %xmm3
- vmovdqa %xmm1, (%rsi)
- vmovdqa %xmm3, 16(%rsi)
- vmovdqa con1(%rip), %xmm0
- vmovdqa mask(%rip), %xmm15
- vpxor %xmm14, %xmm14, %xmm14
- mov \$6, %rax
-
- .Lks256_loop:
- add \$32, %rsi
- subq \$1, %rax
- vpshufb %xmm15, %xmm3, %xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- vpslld \$1, %xmm0, %xmm0
- vpsllq \$32, %xmm1, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpshufb con3(%rip), %xmm1, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpxor %xmm2, %xmm1, %xmm1
- vmovdqa %xmm1, (%rsi)
- vpshufd \$0xff, %xmm1, %xmm2
- vaesenclast %xmm14, %xmm2, %xmm2
- vpsllq \$32, %xmm3, %xmm4
- vpxor %xmm4, %xmm3, %xmm3
- vpshufb con3(%rip), %xmm3, %xmm4
- vpxor %xmm4, %xmm3, %xmm3
- vpxor %xmm2, %xmm3, %xmm3
- vmovdqa %xmm3, 16(%rsi)
- jne .Lks256_loop
-
- vpshufb %xmm15, %xmm3, %xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- vpsllq \$32, %xmm1, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpshufb con3(%rip), %xmm1, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpxor %xmm2, %xmm1, %xmm1
- vmovdqa %xmm1, 32(%rsi)
- ret
- .cfi_endproc
- .size aes256gcmsiv_aes_ks,.-aes256gcmsiv_aes_ks
- ___
-
- sub aes128gcmsiv_aes_ks_enc_x1 {
- # Emits aes128gcmsiv_aes_ks_enc_x1, which expands an AES-128 key and encrypts
- # a single block in the same pass: every freshly derived round key is both
- # applied to the block and stored into the key schedule.
- my $KS1_REGA = "%xmm1";
- my $KS1_REGB = "%xmm2";
- my $BLOCK1 = "%xmm4";
- my $AUXREG = "%xmm3";
-
- # $KS_BLOCK->($reg, $reg2, $auxReg) returns the code that turns the previous
- # round key in $reg into the next one: it combines the dwords of $reg using
- # a 32-bit shift within each quadword and the con3 shuffle, then xors in
- # $reg2. $auxReg is scratch.
- my $KS_BLOCK = sub {
- my ($reg, $reg2, $auxReg) = @_;
- return <<___;
- vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3
- vpxor $auxReg, $reg, $reg
- vpshufb con3(%rip), $reg, $auxReg
- vpxor $auxReg, $reg, $reg
- vpxor $reg2, $reg, $reg
- ___
- };
-
- # $round->($i, $j) returns the code for one middle round: derive round key
- # $i into %xmm1, double the round constant in %xmm0, apply the key to the
- # block with vaesenc and store it at offset 16*$i from register $j.
- my $round = sub {
- my ($i, $j) = @_;
- return <<___;
- vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- vpslld \$1, %xmm0, %xmm0
- ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
- vaesenc %xmm1, $BLOCK1, $BLOCK1
- vmovdqa %xmm1, ${\eval(16*$i)}($j)
- ___
- };
-
- # $roundlast->($i, $j) is the final-round variant of $round: it uses
- # vaesenclast on the block and does not update the round constant.
- my $roundlast = sub {
- my ($i, $j) = @_;
- return <<___;
- vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- ${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
- vaesenclast %xmm1, $BLOCK1, $BLOCK1
- vmovdqa %xmm1, ${\eval(16*$i)}($j)
- ___
- };
-
- # parameter 1: %rdi Pointer to PT (one 16-byte block, read)
- # parameter 2: %rsi Pointer to CT (one 16-byte block, written)
- # parameter 3: %rdx Pointer to keys (expanded key schedule, written)
- # parameter 4: %rcx Pointer to initial key
- # All four pointers are accessed with vmovdqa and so must be 16-byte aligned.
- $code.=<<___;
- .globl aes128gcmsiv_aes_ks_enc_x1
- .type aes128gcmsiv_aes_ks_enc_x1,\@function,4
- .align 16
- aes128gcmsiv_aes_ks_enc_x1:
- .cfi_startproc
- vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key
- vmovdqa 0*16(%rdi), $BLOCK1
-
- vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key
- vpxor %xmm1, $BLOCK1, $BLOCK1
-
- vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1
- vmovdqa mask(%rip), %xmm15 # xmm15 = mask
-
- ${\$round->(1, "%rdx")}
- ${\$round->(2, "%rdx")}
- ${\$round->(3, "%rdx")}
- ${\$round->(4, "%rdx")}
- ${\$round->(5, "%rdx")}
- ${\$round->(6, "%rdx")}
- ${\$round->(7, "%rdx")}
- ${\$round->(8, "%rdx")}
-
- vmovdqa con2(%rip), %xmm0
-
- ${\$round->(9, "%rdx")}
- ${\$roundlast->(10, "%rdx")}
-
- vmovdqa $BLOCK1, 0*16(%rsi)
- ret
- .cfi_endproc
- .size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
- ___
- }
- aes128gcmsiv_aes_ks_enc_x1();
-
- sub aes128gcmsiv_kdf {
- # Emits aes128gcmsiv_kdf, which derives 64 bytes of key material by
- # encrypting four counter blocks with an already expanded AES-128 key.
- #
- # Each counter block is built from the nonce: the first three dwords of the
- # nonce are moved to dwords 1..3 (vpshufd 0x90), dword 0 is cleared with
- # and_mask and then holds the counter value 0, 1, 2 or 3.
- my $BLOCK1 = "%xmm9";
- my $BLOCK2 = "%xmm10";
- my $BLOCK3 = "%xmm11";
- my $BLOCK4 = "%xmm12";
- my $ONE = "%xmm13";
-
- # $enc_roundx4->($i, $j) returns the code that loads round key $i from the
- # key schedule at %rdx into register $j and applies it to all four blocks.
- my $enc_roundx4 = sub {
- my ($i, $j) = @_;
- return <<___;
- vmovdqa ${\eval($i*16)}(%rdx), $j
- vaesenc $j, $BLOCK1, $BLOCK1
- vaesenc $j, $BLOCK2, $BLOCK2
- vaesenc $j, $BLOCK3, $BLOCK3
- vaesenc $j, $BLOCK4, $BLOCK4
- ___
- };
-
- # Final-round variant of $enc_roundx4 (vaesenclast instead of vaesenc).
- my $enc_roundlastx4 = sub {
- my ($i, $j) = @_;
- return <<___;
- vmovdqa ${\eval($i*16)}(%rdx), $j
- vaesenclast $j, $BLOCK1, $BLOCK1
- vaesenclast $j, $BLOCK2, $BLOCK2
- vaesenclast $j, $BLOCK3, $BLOCK3
- vaesenclast $j, $BLOCK4, $BLOCK4
- ___
- };
-
- # void aes128gcmsiv_kdf(const uint8_t nonce[16],
- # uint8_t *out_key_material,
- # const uint8_t *key_schedule);
- # %rdi = nonce, %rsi = out_key_material (64 bytes written),
- # %rdx = key_schedule (third parameter; round keys 0..10 are read).
- # All three pointers are accessed with vmovdqa and so must be 16-byte
- # aligned.
- $code.=<<___;
- .globl aes128gcmsiv_kdf
- .type aes128gcmsiv_kdf,\@function,3
- .align 16
- aes128gcmsiv_kdf:
- .cfi_startproc
- # parameter 1: %rdi Pointer to NONCE
- # parameter 2: %rsi Pointer to CT
- # parameter 4: %rdx Pointer to keys
-
- vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
- vmovdqa 0*16(%rdi), $BLOCK1
- vmovdqa and_mask(%rip), $BLOCK4
- vmovdqa one(%rip), $ONE
- vpshufd \$0x90, $BLOCK1, $BLOCK1
- vpand $BLOCK4, $BLOCK1, $BLOCK1
- vpaddd $ONE, $BLOCK1, $BLOCK2
- vpaddd $ONE, $BLOCK2, $BLOCK3
- vpaddd $ONE, $BLOCK3, $BLOCK4
-
- vpxor %xmm1, $BLOCK1, $BLOCK1
- vpxor %xmm1, $BLOCK2, $BLOCK2
- vpxor %xmm1, $BLOCK3, $BLOCK3
- vpxor %xmm1, $BLOCK4, $BLOCK4
-
- ${\$enc_roundx4->(1, "%xmm1")}
- ${\$enc_roundx4->(2, "%xmm2")}
- ${\$enc_roundx4->(3, "%xmm1")}
- ${\$enc_roundx4->(4, "%xmm2")}
- ${\$enc_roundx4->(5, "%xmm1")}
- ${\$enc_roundx4->(6, "%xmm2")}
- ${\$enc_roundx4->(7, "%xmm1")}
- ${\$enc_roundx4->(8, "%xmm2")}
- ${\$enc_roundx4->(9, "%xmm1")}
- ${\$enc_roundlastx4->(10, "%xmm2")}
-
- vmovdqa $BLOCK1, 0*16(%rsi)
- vmovdqa $BLOCK2, 1*16(%rsi)
- vmovdqa $BLOCK3, 2*16(%rsi)
- vmovdqa $BLOCK4, 3*16(%rsi)
- ret
- .cfi_endproc
- .size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
- ___
- }
- aes128gcmsiv_kdf();
-
- sub aes128gcmsiv_enc_msg_x4 {
- # Emits aes128gcmsiv_enc_msg_x4, AES-128 counter-mode encryption that handles
- # four blocks per main-loop iteration and any remaining whole blocks one at
- # a time. Only whole 16-byte blocks are processed (the byte length is
- # shifted right by 4); the routine returns immediately for a zero length.
- my $CTR1 = "%xmm0";
- my $CTR2 = "%xmm1";
- my $CTR3 = "%xmm2";
- my $CTR4 = "%xmm3";
- my $ADDER = "%xmm4";
-
- my $STATE1 = "%xmm5";
- my $STATE2 = "%xmm6";
- my $STATE3 = "%xmm7";
- my $STATE4 = "%xmm8";
-
- my $TMP = "%xmm12";
- my $IV = "%xmm15";
-
- my $PT = "%rdi";
- my $CT = "%rsi";
- my $TAG = "%rdx";
- my $KS = "%rcx";
- my $LEN = "%r8";
-
- # $aes_round->($i) returns the code that loads round key $i and applies it
- # to the four in-flight blocks.
- my $aes_round = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $TMP
- vaesenc $TMP, $STATE1, $STATE1
- vaesenc $TMP, $STATE2, $STATE2
- vaesenc $TMP, $STATE3, $STATE3
- vaesenc $TMP, $STATE4, $STATE4
- ___
- };
-
- # Final-round variant of $aes_round (vaesenclast instead of vaesenc).
- my $aes_lastround = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $TMP
- vaesenclast $TMP, $STATE1, $STATE1
- vaesenclast $TMP, $STATE2, $STATE2
- vaesenclast $TMP, $STATE3, $STATE3
- vaesenclast $TMP, $STATE4, $STATE4
- ___
- };
-
- # void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
- # unsigned char* TAG, unsigned char* KS,
- # size_t byte_len);
- # parameter 1: %rdi #PT
- # parameter 2: %rsi #CT
- # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
- # parameter 4: %rcx #KS
- # parameter 5: %r8 #LEN MSG_length in bytes
- #
- # The initial counter block is TAG with OR_MASK applied. The four counters
- # advance by four per main-loop iteration; the single-block tail loop
- # continues from CTR1. |TAG| is loaded with vmovdqa and so must be 16-byte
- # aligned.
- $code.=<<___;
- .globl aes128gcmsiv_enc_msg_x4
- .type aes128gcmsiv_enc_msg_x4,\@function,5
- .align 16
- aes128gcmsiv_enc_msg_x4:
- .cfi_startproc
- test $LEN, $LEN
- jnz .L128_enc_msg_x4_start
- ret
-
- .L128_enc_msg_x4_start:
- pushq %r12
- .cfi_push %r12
- pushq %r13
- .cfi_push %r13
-
- shrq \$4, $LEN # LEN = num of blocks
- movq $LEN, %r10
- shlq \$62, %r10
- shrq \$62, %r10
-
- # make IV from TAG
- vmovdqa ($TAG), $IV
- vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00]
-
- vmovdqu four(%rip), $ADDER # Register to increment counters
- vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
- vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
- vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
- vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
-
- shrq \$2, $LEN
- je .L128_enc_msg_x4_check_remainder
-
- subq \$64, $CT
- subq \$64, $PT
-
- .L128_enc_msg_x4_loop1:
- addq \$64, $CT
- addq \$64, $PT
-
- vmovdqa $CTR1, $STATE1
- vmovdqa $CTR2, $STATE2
- vmovdqa $CTR3, $STATE3
- vmovdqa $CTR4, $STATE4
-
- vpxor ($KS), $STATE1, $STATE1
- vpxor ($KS), $STATE2, $STATE2
- vpxor ($KS), $STATE3, $STATE3
- vpxor ($KS), $STATE4, $STATE4
-
- ${\$aes_round->(1)}
- vpaddd $ADDER, $CTR1, $CTR1
- ${\$aes_round->(2)}
- vpaddd $ADDER, $CTR2, $CTR2
- ${\$aes_round->(3)}
- vpaddd $ADDER, $CTR3, $CTR3
- ${\$aes_round->(4)}
- vpaddd $ADDER, $CTR4, $CTR4
-
- ${\$aes_round->(5)}
- ${\$aes_round->(6)}
- ${\$aes_round->(7)}
- ${\$aes_round->(8)}
- ${\$aes_round->(9)}
- ${\$aes_lastround->(10)}
-
- # XOR with Plaintext
- vpxor 0*16($PT), $STATE1, $STATE1
- vpxor 1*16($PT), $STATE2, $STATE2
- vpxor 2*16($PT), $STATE3, $STATE3
- vpxor 3*16($PT), $STATE4, $STATE4
-
- subq \$1, $LEN
-
- vmovdqu $STATE1, 0*16($CT)
- vmovdqu $STATE2, 1*16($CT)
- vmovdqu $STATE3, 2*16($CT)
- vmovdqu $STATE4, 3*16($CT)
-
- jne .L128_enc_msg_x4_loop1
-
- addq \$64,$CT
- addq \$64,$PT
-
- .L128_enc_msg_x4_check_remainder:
- cmpq \$0, %r10
- je .L128_enc_msg_x4_out
-
- .L128_enc_msg_x4_loop2:
- # enc each block separately
- # CTR1 is the highest counter (even if no LOOP done)
- vmovdqa $CTR1, $STATE1
- vpaddd one(%rip), $CTR1, $CTR1 # inc counter
-
- vpxor ($KS), $STATE1, $STATE1
- vaesenc 16($KS), $STATE1, $STATE1
- vaesenc 32($KS), $STATE1, $STATE1
- vaesenc 48($KS), $STATE1, $STATE1
- vaesenc 64($KS), $STATE1, $STATE1
- vaesenc 80($KS), $STATE1, $STATE1
- vaesenc 96($KS), $STATE1, $STATE1
- vaesenc 112($KS), $STATE1, $STATE1
- vaesenc 128($KS), $STATE1, $STATE1
- vaesenc 144($KS), $STATE1, $STATE1
- vaesenclast 160($KS), $STATE1, $STATE1
-
- # XOR with plaintext
- vpxor ($PT), $STATE1, $STATE1
- vmovdqu $STATE1, ($CT)
-
- addq \$16, $PT
- addq \$16, $CT
-
- subq \$1, %r10
- jne .L128_enc_msg_x4_loop2
-
- .L128_enc_msg_x4_out:
- popq %r13
- .cfi_pop %r13
- popq %r12
- .cfi_pop %r12
- ret
- .cfi_endproc
- .size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
- ___
- }
- aes128gcmsiv_enc_msg_x4();
-
- sub aes128gcmsiv_enc_msg_x8 {
- # Emits aes128gcmsiv_enc_msg_x8, AES-128 counter-mode encryption that handles
- # eight blocks per main-loop iteration and any remaining whole blocks one at
- # a time. Only whole 16-byte blocks are processed (the byte length is
- # shifted right by 4); the routine returns immediately for a zero length.
- my $STATE1 = "%xmm1";
- my $STATE2 = "%xmm2";
- my $STATE3 = "%xmm3";
- my $STATE4 = "%xmm4";
- my $STATE5 = "%xmm5";
- my $STATE6 = "%xmm6";
- my $STATE7 = "%xmm7";
- my $STATE8 = "%xmm8";
-
- # Seven counters live in registers; the eighth is kept in a stack slot at
- # (%rsp) because %xmm15 is needed for the round key.
- my $CTR1 = "%xmm0";
- my $CTR2 = "%xmm9";
- my $CTR3 = "%xmm10";
- my $CTR4 = "%xmm11";
- my $CTR5 = "%xmm12";
- my $CTR6 = "%xmm13";
- my $CTR7 = "%xmm14";
- my $SCHED = "%xmm15";
-
- # Deliberately the same register as $STATE1: it only holds the IV while the
- # counters are being set up, before $STATE1 is first written.
- my $TMP1 = "%xmm1";
-
- my $PT = "%rdi";
- my $CT = "%rsi";
- my $TAG = "%rdx";
- my $KS = "%rcx";
- my $LEN = "%r8";
-
- # $aes_round8->($i) returns the code that loads round key $i and applies it
- # to the eight in-flight blocks.
- my $aes_round8 = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $SCHED
- vaesenc $SCHED, $STATE1, $STATE1
- vaesenc $SCHED, $STATE2, $STATE2
- vaesenc $SCHED, $STATE3, $STATE3
- vaesenc $SCHED, $STATE4, $STATE4
- vaesenc $SCHED, $STATE5, $STATE5
- vaesenc $SCHED, $STATE6, $STATE6
- vaesenc $SCHED, $STATE7, $STATE7
- vaesenc $SCHED, $STATE8, $STATE8
- ___
- };
-
- # Final-round variant of $aes_round8 (vaesenclast instead of vaesenc).
- my $aes_lastround8 = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $SCHED
- vaesenclast $SCHED, $STATE1, $STATE1
- vaesenclast $SCHED, $STATE2, $STATE2
- vaesenclast $SCHED, $STATE3, $STATE3
- vaesenclast $SCHED, $STATE4, $STATE4
- vaesenclast $SCHED, $STATE5, $STATE5
- vaesenclast $SCHED, $STATE6, $STATE6
- vaesenclast $SCHED, $STATE7, $STATE7
- vaesenclast $SCHED, $STATE8, $STATE8
- ___
- };
-
- # void aes128gcmsiv_enc_msg_x8(unsigned char* PT,
- # unsigned char* CT,
- # unsigned char* TAG,
- # unsigned char* KS,
- # size_t byte_len);
- # parameter 1: %rdi #PT
- # parameter 2: %rsi #CT
- # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
- # parameter 4: %rcx #KS
- # parameter 5: %r8 #LEN MSG_length in bytes
- #
- # The initial counter block is TAG with OR_MASK applied. In the main loop
- # the eighth counter is advanced by eight in its stack slot; the updated
- # value minus one then becomes the new CTR7, and CTR1..CTR6 are each advanced
- # by eight. The single-block tail loop continues from CTR1.
- $code.=<<___;
- .globl aes128gcmsiv_enc_msg_x8
- .type aes128gcmsiv_enc_msg_x8,\@function,5
- .align 16
- aes128gcmsiv_enc_msg_x8:
- .cfi_startproc
- test $LEN, $LEN
- jnz .L128_enc_msg_x8_start
- ret
-
- .L128_enc_msg_x8_start:
- pushq %r12
- .cfi_push %r12
- pushq %r13
- .cfi_push %r13
- pushq %rbp
- .cfi_push %rbp
- movq %rsp, %rbp
- .cfi_def_cfa_register rbp
-
- # Place in stack
- subq \$128, %rsp
- andq \$-64, %rsp
-
- shrq \$4, $LEN # LEN = num of blocks
- movq $LEN, %r10
- shlq \$61, %r10
- shrq \$61, %r10
-
- # make IV from TAG
- vmovdqu ($TAG), $TMP1
- vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
-
- # store counter8 in the stack
- vpaddd seven(%rip), $TMP1, $CTR1
- vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07]
- vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
- vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
- vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
- vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
- vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
- vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
- vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
-
- shrq \$3, $LEN
- je .L128_enc_msg_x8_check_remainder
-
- subq \$128, $CT
- subq \$128, $PT
-
- .L128_enc_msg_x8_loop1:
- addq \$128, $CT
- addq \$128, $PT
-
- vmovdqa $CTR1, $STATE1
- vmovdqa $CTR2, $STATE2
- vmovdqa $CTR3, $STATE3
- vmovdqa $CTR4, $STATE4
- vmovdqa $CTR5, $STATE5
- vmovdqa $CTR6, $STATE6
- vmovdqa $CTR7, $STATE7
- # move from stack
- vmovdqu (%rsp), $STATE8
-
- vpxor ($KS), $STATE1, $STATE1
- vpxor ($KS), $STATE2, $STATE2
- vpxor ($KS), $STATE3, $STATE3
- vpxor ($KS), $STATE4, $STATE4
- vpxor ($KS), $STATE5, $STATE5
- vpxor ($KS), $STATE6, $STATE6
- vpxor ($KS), $STATE7, $STATE7
- vpxor ($KS), $STATE8, $STATE8
-
- ${\$aes_round8->(1)}
- vmovdqu (%rsp), $CTR7 # deal with CTR8
- vpaddd eight(%rip), $CTR7, $CTR7
- vmovdqu $CTR7, (%rsp)
- ${\$aes_round8->(2)}
- vpsubd one(%rip), $CTR7, $CTR7
- ${\$aes_round8->(3)}
- vpaddd eight(%rip), $CTR1, $CTR1
- ${\$aes_round8->(4)}
- vpaddd eight(%rip), $CTR2, $CTR2
- ${\$aes_round8->(5)}
- vpaddd eight(%rip), $CTR3, $CTR3
- ${\$aes_round8->(6)}
- vpaddd eight(%rip), $CTR4, $CTR4
- ${\$aes_round8->(7)}
- vpaddd eight(%rip), $CTR5, $CTR5
- ${\$aes_round8->(8)}
- vpaddd eight(%rip), $CTR6, $CTR6
- ${\$aes_round8->(9)}
- ${\$aes_lastround8->(10)}
-
- # XOR with Plaintext
- vpxor 0*16($PT), $STATE1, $STATE1
- vpxor 1*16($PT), $STATE2, $STATE2
- vpxor 2*16($PT), $STATE3, $STATE3
- vpxor 3*16($PT), $STATE4, $STATE4
- vpxor 4*16($PT), $STATE5, $STATE5
- vpxor 5*16($PT), $STATE6, $STATE6
- vpxor 6*16($PT), $STATE7, $STATE7
- vpxor 7*16($PT), $STATE8, $STATE8
-
- dec $LEN
-
- vmovdqu $STATE1, 0*16($CT)
- vmovdqu $STATE2, 1*16($CT)
- vmovdqu $STATE3, 2*16($CT)
- vmovdqu $STATE4, 3*16($CT)
- vmovdqu $STATE5, 4*16($CT)
- vmovdqu $STATE6, 5*16($CT)
- vmovdqu $STATE7, 6*16($CT)
- vmovdqu $STATE8, 7*16($CT)
-
- jne .L128_enc_msg_x8_loop1
-
- addq \$128, $CT
- addq \$128, $PT
-
- .L128_enc_msg_x8_check_remainder:
- cmpq \$0, %r10
- je .L128_enc_msg_x8_out
-
- .L128_enc_msg_x8_loop2:
- # enc each block separately
- # CTR1 is the highest counter (even if no LOOP done)
- vmovdqa $CTR1, $STATE1
- vpaddd one(%rip), $CTR1, $CTR1 # inc counter
-
- vpxor ($KS), $STATE1, $STATE1
- vaesenc 16($KS), $STATE1, $STATE1
- vaesenc 32($KS), $STATE1, $STATE1
- vaesenc 48($KS), $STATE1, $STATE1
- vaesenc 64($KS), $STATE1, $STATE1
- vaesenc 80($KS), $STATE1, $STATE1
- vaesenc 96($KS), $STATE1, $STATE1
- vaesenc 112($KS), $STATE1, $STATE1
- vaesenc 128($KS), $STATE1, $STATE1
- vaesenc 144($KS), $STATE1, $STATE1
- vaesenclast 160($KS), $STATE1, $STATE1
-
- # XOR with Plaintext
- vpxor ($PT), $STATE1, $STATE1
-
- vmovdqu $STATE1, ($CT)
-
- addq \$16, $PT
- addq \$16, $CT
-
- decq %r10
- jne .L128_enc_msg_x8_loop2
-
- .L128_enc_msg_x8_out:
- movq %rbp, %rsp
- .cfi_def_cfa_register %rsp
- popq %rbp
- .cfi_pop %rbp
- popq %r13
- .cfi_pop %r13
- popq %r12
- .cfi_pop %r12
- ret
- .cfi_endproc
- .size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
- ___
- }
- aes128gcmsiv_enc_msg_x8();
-
- sub aesgcmsiv_dec {
- my ($aes256) = @_;
-
- my $T = "%xmm0";
- my $TMP0 = "%xmm1";
- my $TMP1 = "%xmm2";
- my $TMP2 = "%xmm3";
- my $TMP3 = "%xmm4";
- my $TMP4 = "%xmm5";
- my $TMP5 = "%xmm6";
- my $CTR1 = "%xmm7";
- my $CTR2 = "%xmm8";
- my $CTR3 = "%xmm9";
- my $CTR4 = "%xmm10";
- my $CTR5 = "%xmm11";
- my $CTR6 = "%xmm12";
- my $CTR = "%xmm15";
- my $CT = "%rdi";
- my $PT = "%rsi";
- my $POL = "%rdx";
- my $Htbl = "%rcx";
- my $KS = "%r8";
- my $LEN = "%r9";
- my $secureBuffer = "%rax";
- my $HTABLE_ROUNDS = "%xmm13";
-
- my $labelPrefix = "128";
- if ($aes256) {
- $labelPrefix = "256";
- }
-
- my $aes_round_dec = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $TMP3
- vaesenc $TMP3, $CTR1, $CTR1
- vaesenc $TMP3, $CTR2, $CTR2
- vaesenc $TMP3, $CTR3, $CTR3
- vaesenc $TMP3, $CTR4, $CTR4
- vaesenc $TMP3, $CTR5, $CTR5
- vaesenc $TMP3, $CTR6, $CTR6
- ___
- };
-
- my $aes_lastround_dec = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $TMP3
- vaesenclast $TMP3, $CTR1, $CTR1
- vaesenclast $TMP3, $CTR2, $CTR2
- vaesenclast $TMP3, $CTR3, $CTR3
- vaesenclast $TMP3, $CTR4, $CTR4
- vaesenclast $TMP3, $CTR5, $CTR5
- vaesenclast $TMP3, $CTR6, $CTR6
- ___
- };
-
- my $schoolbook = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
- vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
-
- vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
- vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
- vpxor $TMP3, $TMP1, $TMP1
- vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
- vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
- ___
- };
-
- if ($aes256) {
- $code.=<<___;
- .globl aes256gcmsiv_dec
- .type aes256gcmsiv_dec,\@function,6
- .align 16
- aes256gcmsiv_dec:
- ___
- } else {
- $code.=<<___;
- .globl aes128gcmsiv_dec
- .type aes128gcmsiv_dec,\@function,6
- .align 16
- aes128gcmsiv_dec:
- ___
- }
-
- $code.=<<___;
- .cfi_startproc
- test \$~15, $LEN
- jnz .L${labelPrefix}_dec_start
- ret
-
- .L${labelPrefix}_dec_start:
- vzeroupper
- vmovdqa ($POL), $T
- movq $POL, $secureBuffer
-
- leaq 32($secureBuffer), $secureBuffer
- leaq 32($Htbl), $Htbl
-
- # make CTRBLKs from given tag.
- vmovdqu ($CT,$LEN), $CTR
- vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00]
- andq \$~15, $LEN
-
- # If less then 6 blocks, make singles
- cmp \$96, $LEN
- jb .L${labelPrefix}_dec_loop2
-
- # Decrypt the first six blocks
- sub \$96, $LEN
- vmovdqa $CTR, $CTR1
- vpaddd one(%rip), $CTR1, $CTR2
- vpaddd two(%rip), $CTR1, $CTR3
- vpaddd one(%rip), $CTR3, $CTR4
- vpaddd two(%rip), $CTR3, $CTR5
- vpaddd one(%rip), $CTR5, $CTR6
- vpaddd two(%rip), $CTR5, $CTR
-
- vpxor ($KS), $CTR1, $CTR1
- vpxor ($KS), $CTR2, $CTR2
- vpxor ($KS), $CTR3, $CTR3
- vpxor ($KS), $CTR4, $CTR4
- vpxor ($KS), $CTR5, $CTR5
- vpxor ($KS), $CTR6, $CTR6
-
- ${\$aes_round_dec->(1)}
- ${\$aes_round_dec->(2)}
- ${\$aes_round_dec->(3)}
- ${\$aes_round_dec->(4)}
- ${\$aes_round_dec->(5)}
- ${\$aes_round_dec->(6)}
- ${\$aes_round_dec->(7)}
- ${\$aes_round_dec->(8)}
- ${\$aes_round_dec->(9)}
- ___
-
- if ($aes256) {
- $code.=<<___;
- ${\$aes_round_dec->(10)}
- ${\$aes_round_dec->(11)}
- ${\$aes_round_dec->(12)}
- ${\$aes_round_dec->(13)}
- ${\$aes_lastround_dec->(14)}
- ___
- } else {
- $code.=<<___;
- ${\$aes_lastround_dec->(10)}
- ___
- }
-
- $code.=<<___;
- # XOR with CT
- vpxor 0*16($CT), $CTR1, $CTR1
- vpxor 1*16($CT), $CTR2, $CTR2
- vpxor 2*16($CT), $CTR3, $CTR3
- vpxor 3*16($CT), $CTR4, $CTR4
- vpxor 4*16($CT), $CTR5, $CTR5
- vpxor 5*16($CT), $CTR6, $CTR6
-
- vmovdqu $CTR1, 0*16($PT)
- vmovdqu $CTR2, 1*16($PT)
- vmovdqu $CTR3, 2*16($PT)
- vmovdqu $CTR4, 3*16($PT)
- vmovdqu $CTR5, 4*16($PT)
- vmovdqu $CTR6, 5*16($PT)
-
- addq \$96, $CT
- addq \$96, $PT
- jmp .L${labelPrefix}_dec_loop1
-
- # Decrypt 6 blocks each time while hashing previous 6 blocks
- .align 64
- .L${labelPrefix}_dec_loop1:
- cmp \$96, $LEN
- jb .L${labelPrefix}_dec_finish_96
- sub \$96, $LEN
-
- vmovdqa $CTR6, $TMP5
- vmovdqa $CTR5, 1*16-32($secureBuffer)
- vmovdqa $CTR4, 2*16-32($secureBuffer)
- vmovdqa $CTR3, 3*16-32($secureBuffer)
- vmovdqa $CTR2, 4*16-32($secureBuffer)
- vmovdqa $CTR1, 5*16-32($secureBuffer)
-
- vmovdqa $CTR, $CTR1
- vpaddd one(%rip), $CTR1, $CTR2
- vpaddd two(%rip), $CTR1, $CTR3
- vpaddd one(%rip), $CTR3, $CTR4
- vpaddd two(%rip), $CTR3, $CTR5
- vpaddd one(%rip), $CTR5, $CTR6
- vpaddd two(%rip), $CTR5, $CTR
-
- vmovdqa ($KS), $TMP3
- vpxor $TMP3, $CTR1, $CTR1
- vpxor $TMP3, $CTR2, $CTR2
- vpxor $TMP3, $CTR3, $CTR3
- vpxor $TMP3, $CTR4, $CTR4
- vpxor $TMP3, $CTR5, $CTR5
- vpxor $TMP3, $CTR6, $CTR6
-
- vmovdqu 0*16-32($Htbl), $TMP3
- vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
- vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
- vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
- vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
-
- ${\$aes_round_dec->(1)}
- ${\$schoolbook->(1)}
-
- ${\$aes_round_dec->(2)}
- ${\$schoolbook->(2)}
-
- ${\$aes_round_dec->(3)}
- ${\$schoolbook->(3)}
-
- ${\$aes_round_dec->(4)}
- ${\$schoolbook->(4)}
-
- ${\$aes_round_dec->(5)}
- ${\$aes_round_dec->(6)}
- ${\$aes_round_dec->(7)}
-
- vmovdqa 5*16-32($secureBuffer), $TMP5
- vpxor $T, $TMP5, $TMP5
- vmovdqu 5*16-32($Htbl), $TMP4
-
- vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
- vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP1, $TMP1
- vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
- vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
-
- ${\$aes_round_dec->(8)}
-
- vpsrldq \$8, $TMP0, $TMP3
- vpxor $TMP3, $TMP1, $TMP4
- vpslldq \$8, $TMP0, $TMP3
- vpxor $TMP3, $TMP2, $T
-
- vmovdqa poly(%rip), $TMP2
-
- ${\$aes_round_dec->(9)}
- ___
-
- if ($aes256) {
- $code.=<<___;
- ${\$aes_round_dec->(10)}
- ${\$aes_round_dec->(11)}
- ${\$aes_round_dec->(12)}
- ${\$aes_round_dec->(13)}
- vmovdqu 14*16($KS), $TMP5
- ___
- } else {
- $code.=<<___;
- vmovdqu 10*16($KS), $TMP5
- ___
- }
-
- $code.=<<___;
- vpalignr \$8, $T, $T, $TMP1
- vpclmulqdq \$0x10, $TMP2, $T, $T
- vpxor $T, $TMP1, $T
-
- vpxor 0*16($CT), $TMP5, $TMP3
- vaesenclast $TMP3, $CTR1, $CTR1
- vpxor 1*16($CT), $TMP5, $TMP3
- vaesenclast $TMP3, $CTR2, $CTR2
- vpxor 2*16($CT), $TMP5, $TMP3
- vaesenclast $TMP3, $CTR3, $CTR3
- vpxor 3*16($CT), $TMP5, $TMP3
- vaesenclast $TMP3, $CTR4, $CTR4
- vpxor 4*16($CT), $TMP5, $TMP3
- vaesenclast $TMP3, $CTR5, $CTR5
- vpxor 5*16($CT), $TMP5, $TMP3
- vaesenclast $TMP3, $CTR6, $CTR6
-
- vpalignr \$8, $T, $T, $TMP1
- vpclmulqdq \$0x10, $TMP2, $T, $T
- vpxor $T, $TMP1, $T
-
- vmovdqu $CTR1, 0*16($PT)
- vmovdqu $CTR2, 1*16($PT)
- vmovdqu $CTR3, 2*16($PT)
- vmovdqu $CTR4, 3*16($PT)
- vmovdqu $CTR5, 4*16($PT)
- vmovdqu $CTR6, 5*16($PT)
-
- vpxor $TMP4, $T, $T
-
- lea 96($CT), $CT
- lea 96($PT), $PT
- jmp .L${labelPrefix}_dec_loop1
-
- .L${labelPrefix}_dec_finish_96:
- vmovdqa $CTR6, $TMP5
- vmovdqa $CTR5, 1*16-32($secureBuffer)
- vmovdqa $CTR4, 2*16-32($secureBuffer)
- vmovdqa $CTR3, 3*16-32($secureBuffer)
- vmovdqa $CTR2, 4*16-32($secureBuffer)
- vmovdqa $CTR1, 5*16-32($secureBuffer)
-
- vmovdqu 0*16-32($Htbl), $TMP3
- vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
- vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
- vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
- vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
-
- ${\$schoolbook->(1)}
- ${\$schoolbook->(2)}
- ${\$schoolbook->(3)}
- ${\$schoolbook->(4)}
-
- vmovdqu 5*16-32($secureBuffer), $TMP5
- vpxor $T, $TMP5, $TMP5
- vmovdqu 5*16-32($Htbl), $TMP4
- vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP1, $TMP1
- vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP2, $TMP2
- vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
- vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
- vpxor $TMP3, $TMP0, $TMP0
-
- vpsrldq \$8, $TMP0, $TMP3
- vpxor $TMP3, $TMP1, $TMP4
- vpslldq \$8, $TMP0, $TMP3
- vpxor $TMP3, $TMP2, $T
-
- vmovdqa poly(%rip), $TMP2
-
- vpalignr \$8, $T, $T, $TMP1
- vpclmulqdq \$0x10, $TMP2, $T, $T
- vpxor $T, $TMP1, $T
-
- vpalignr \$8, $T, $T, $TMP1
- vpclmulqdq \$0x10, $TMP2, $T, $T
- vpxor $T, $TMP1, $T
-
- vpxor $TMP4, $T, $T
-
- .L${labelPrefix}_dec_loop2:
- # Here we encrypt any remaining whole block
-
- # if there are no whole blocks
- cmp \$16, $LEN
- jb .L${labelPrefix}_dec_out
- sub \$16, $LEN
-
- vmovdqa $CTR, $TMP1
- vpaddd one(%rip), $CTR, $CTR
-
- vpxor 0*16($KS), $TMP1, $TMP1
- vaesenc 1*16($KS), $TMP1, $TMP1
- vaesenc 2*16($KS), $TMP1, $TMP1
- vaesenc 3*16($KS), $TMP1, $TMP1
- vaesenc 4*16($KS), $TMP1, $TMP1
- vaesenc 5*16($KS), $TMP1, $TMP1
- vaesenc 6*16($KS), $TMP1, $TMP1
- vaesenc 7*16($KS), $TMP1, $TMP1
- vaesenc 8*16($KS), $TMP1, $TMP1
- vaesenc 9*16($KS), $TMP1, $TMP1
- ___
- if ($aes256) {
- $code.=<<___;
- vaesenc 10*16($KS), $TMP1, $TMP1
- vaesenc 11*16($KS), $TMP1, $TMP1
- vaesenc 12*16($KS), $TMP1, $TMP1
- vaesenc 13*16($KS), $TMP1, $TMP1
- vaesenclast 14*16($KS), $TMP1, $TMP1
- ___
- } else {
- $code.=<<___;
- vaesenclast 10*16($KS), $TMP1, $TMP1
- ___
- }
-
- $code.=<<___;
- vpxor ($CT), $TMP1, $TMP1
- vmovdqu $TMP1, ($PT)
- addq \$16, $CT
- addq \$16, $PT
-
- vpxor $TMP1, $T, $T
- vmovdqa -32($Htbl), $TMP0
- call GFMUL
-
- jmp .L${labelPrefix}_dec_loop2
-
- .L${labelPrefix}_dec_out:
- vmovdqu $T, ($POL)
- ret
- .cfi_endproc
- ___
-
- if ($aes256) {
- $code.=<<___;
- .size aes256gcmsiv_dec, .-aes256gcmsiv_dec
- ___
- } else {
- $code.=<<___;
- .size aes128gcmsiv_dec, .-aes128gcmsiv_dec
- ___
- }
- }
-
- # Instantiate the decryption generator with a false $aes256 argument, which
- # selects the aes128gcmsiv_dec entry point and makes round 10 the last round.
- aesgcmsiv_dec(0); # emit 128-bit version
-
- # Emits aes128gcmsiv_ecb_enc_block, a routine that runs one 16-byte block
- # through the AES rounds using the eleven round keys (offsets 0*16 through
- # 10*16) found at the key-schedule pointer.
- #
- # parameter 1: PT %rdi (pointer to 128 bit)
- # parameter 2: CT %rsi (pointer to 128 bit)
- # parameter 3: ks %rdx (pointer to ks)
- sub aes128gcmsiv_ecb_enc_block {
- my $block = "%xmm1";
- my $sched = "%rdx";
-
- # Prologue: load the input block and XOR it with round key 0.
- $code.=<<___;
- .globl aes128gcmsiv_ecb_enc_block
- .type aes128gcmsiv_ecb_enc_block,\@function,3
- .align 16
- aes128gcmsiv_ecb_enc_block:
- .cfi_startproc
- vmovdqa (%rdi), $block
-
- vpxor ($sched), $block, $block
- ___
-
- # Rounds 1 through 9 differ only in the round-key offset.
- foreach my $round (1 .. 9) {
- $code .= "vaesenc ${round}*16($sched), $block, $block\n";
- }
-
- # Final round, store of the result, and epilogue.
- $code.=<<___;
- vaesenclast 10*16($sched), $block, $block # STATE_1 == IV
-
- vmovdqa $block, (%rsi)
-
- ret
- .cfi_endproc
- .size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
- ___
- }
- aes128gcmsiv_ecb_enc_block();
-
- # Emits aes256gcmsiv_aes_ks_enc_x1: expands the 32-byte key at $KEYp into 15
- # round keys written to $KS and, as each round key is produced, applies it to
- # the single block loaded from $PT; the encrypted block is stored to $CT.
- sub aes256gcmsiv_aes_ks_enc_x1 {
- my $KS = "%rdx";
- my $KEYp = "%rcx";
- my $CON_MASK = "%xmm0";
- my $MASK_256 = "%xmm15";
- my $KEY_1 = "%xmm1";
- my $KEY_2 = "%xmm3";
- my $BLOCK1 = "%xmm8";
- my $AUX_REG = "%xmm14";
- my $PT = "%rdi";
- my $CT = "%rsi";
-
- # Returns the assembly that derives two consecutive round keys and stores
- # them at offsets 16*$i and 16*$j from $KS, applying each to $BLOCK1 with
- # vaesenc as soon as it is ready. The template names registers directly:
- # %xmm0 is $CON_MASK, %xmm15 is $MASK_256, %xmm1 and %xmm3 are $KEY_1 and
- # $KEY_2, %xmm14 is $AUX_REG; %xmm2 and %xmm4 serve as scratch. The first
- # half also shifts the constant in %xmm0 left by one bit for the next call.
- my $round_double = sub {
- my ($i, $j) = @_;
- return <<___;
- vpshufb %xmm15, %xmm3, %xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- vpslld \$1, %xmm0, %xmm0
- vpslldq \$4, %xmm1, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpslldq \$4, %xmm4, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpslldq \$4, %xmm4, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpxor %xmm2, %xmm1, %xmm1
- vaesenc %xmm1, $BLOCK1, $BLOCK1
- vmovdqu %xmm1, ${\eval(16*$i)}($KS)
-
- vpshufd \$0xff, %xmm1, %xmm2
- vaesenclast %xmm14, %xmm2, %xmm2
- vpslldq \$4, %xmm3, %xmm4
- vpxor %xmm4, %xmm3, %xmm3
- vpslldq \$4, %xmm4, %xmm4
- vpxor %xmm4, %xmm3, %xmm3
- vpslldq \$4, %xmm4, %xmm4
- vpxor %xmm4, %xmm3, %xmm3
- vpxor %xmm2, %xmm3, %xmm3
- vaesenc %xmm3, $BLOCK1, $BLOCK1
- vmovdqu %xmm3, ${\eval(16*$j)}($KS)
- ___
- };
-
- # Returns the assembly for the final round key only (stored at offset
- # 16*$i from $KS). It mirrors the first half of $round_double, except that
- # the constant in %xmm0 is not shifted and the block is finished with
- # vaesenclast instead of vaesenc.
- my $round_last = sub {
- my ($i) = @_;
- return <<___;
- vpshufb %xmm15, %xmm3, %xmm2
- vaesenclast %xmm0, %xmm2, %xmm2
- vpslldq \$4, %xmm1, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpslldq \$4, %xmm4, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpslldq \$4, %xmm4, %xmm4
- vpxor %xmm4, %xmm1, %xmm1
- vpxor %xmm2, %xmm1, %xmm1
- vaesenclast %xmm1, $BLOCK1, $BLOCK1
- vmovdqu %xmm1, ${\eval(16*$i)}($KS)
- ___
- };
-
- # parameter 1: %rdi Pointer to PT1
- # parameter 2: %rsi Pointer to CT1
- # parameter 3: %rdx Pointer to KS
- # parameter 4: %rcx Pointer to initial key
- #
- # The two halves of the user key are stored unchanged as round keys 0 and 1;
- # six $round_double expansions produce round keys 2 through 13 and
- # $round_last produces round key 14. $AUX_REG is zeroed before the first
- # expansion because the second half of $round_double uses it as the
- # vaesenclast round key.
- $code.=<<___;
- .globl aes256gcmsiv_aes_ks_enc_x1
- .type aes256gcmsiv_aes_ks_enc_x1,\@function,4
- .align 16
- aes256gcmsiv_aes_ks_enc_x1:
- .cfi_startproc
- vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1
- vmovdqa mask(%rip), $MASK_256 # MASK_256
- vmovdqa ($PT), $BLOCK1
- vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key
- vmovdqa 16($KEYp), $KEY_2
- vpxor $KEY_1, $BLOCK1, $BLOCK1
- vaesenc $KEY_2, $BLOCK1, $BLOCK1
- vmovdqu $KEY_1, ($KS) # First round key
- vmovdqu $KEY_2, 16($KS)
- vpxor $AUX_REG, $AUX_REG, $AUX_REG
-
- ${\$round_double->(2, 3)}
- ${\$round_double->(4, 5)}
- ${\$round_double->(6, 7)}
- ${\$round_double->(8, 9)}
- ${\$round_double->(10, 11)}
- ${\$round_double->(12, 13)}
- ${\$round_last->(14)}
- vmovdqa $BLOCK1, ($CT)
- ret
- .cfi_endproc
- .size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
- ___
- }
- aes256gcmsiv_aes_ks_enc_x1();
-
- # Emits aes256gcmsiv_ecb_enc_block, a routine that runs one 16-byte block
- # through the AES rounds using the fifteen round keys (offsets 0*16 through
- # 14*16) found at the key-schedule pointer.
- #
- # parameter 1: PT %rdi (pointer to 128 bit)
- # parameter 2: CT %rsi (pointer to 128 bit)
- # parameter 3: ks %rdx (pointer to ks)
- sub aes256gcmsiv_ecb_enc_block {
- my $block = "%xmm1";
- my $in = "%rdi";
- my $out = "%rsi";
- my $sched = "%rdx";
-
- # Prologue: load the input block and XOR it with round key 0.
- $code.=<<___;
- .globl aes256gcmsiv_ecb_enc_block
- .type aes256gcmsiv_ecb_enc_block,\@function,3
- .align 16
- aes256gcmsiv_ecb_enc_block:
- .cfi_startproc
- vmovdqa ($in), $block
- vpxor ($sched), $block, $block
- ___
-
- # Rounds 1 through 13 differ only in the round-key offset.
- $code .= "vaesenc ${_}*16($sched), $block, $block\n" for 1 .. 13;
-
- # Final round, store of the result, and epilogue.
- $code.=<<___;
- vaesenclast 14*16($sched), $block, $block # $block == IV
- vmovdqa $block, ($out)
- ret
- .cfi_endproc
- .size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
- ___
- }
- aes256gcmsiv_ecb_enc_block();
-
- # Emits aes256gcmsiv_enc_msg_x4, a counter-mode encryption routine that
- # processes four blocks per main-loop iteration with a 14-round key schedule
- # and then handles any leftover blocks one at a time.
- sub aes256gcmsiv_enc_msg_x4 {
- my $CTR1 = "%xmm0";
- my $CTR2 = "%xmm1";
- my $CTR3 = "%xmm2";
- my $CTR4 = "%xmm3";
- my $ADDER = "%xmm4";
-
- my $STATE1 = "%xmm5";
- my $STATE2 = "%xmm6";
- my $STATE3 = "%xmm7";
- my $STATE4 = "%xmm8";
-
- # NOTE(review): $TMP2 and $TMP3 are declared but never referenced in this sub.
- my $TMP = "%xmm12";
- my $TMP2 = "%xmm13";
- my $TMP3 = "%xmm14";
- my $IV = "%xmm15";
-
- my $PT = "%rdi";
- my $CT = "%rsi";
- my $TAG = "%rdx";
- my $KS = "%rcx";
- my $LEN = "%r8";
-
- # Returns the assembly that loads round key $i from $KS into $TMP and
- # applies one vaesenc round to all four state registers.
- my $aes_round = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $TMP
- vaesenc $TMP, $STATE1, $STATE1
- vaesenc $TMP, $STATE2, $STATE2
- vaesenc $TMP, $STATE3, $STATE3
- vaesenc $TMP, $STATE4, $STATE4
- ___
- };
-
- # Same as $aes_round, but emits the closing vaesenclast round.
- my $aes_lastround = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $TMP
- vaesenclast $TMP, $STATE1, $STATE1
- vaesenclast $TMP, $STATE2, $STATE2
- vaesenclast $TMP, $STATE3, $STATE3
- vaesenclast $TMP, $STATE4, $STATE4
- ___
- };
-
- # void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
- # unsigned char* TAG, unsigned char* KS,
- # size_t byte_len);
- # parameter 1: %rdi #PT
- # parameter 2: %rsi #CT
- # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
- # parameter 4: %rcx #KS
- # parameter 5: %r8 #LEN MSG_length in bytes
- #
- # Flow of the emitted routine:
- # * It returns immediately when the byte length is zero.
- # * The byte length is converted to a block count, rounded up when it is
- # not a multiple of 16. %r10 then keeps the block count modulo 4 and
- # $LEN becomes the number of four-block iterations.
- # * The initial counter is the tag ORed with OR_MASK; $CTR1..$CTR4 start
- # at that value plus 0..3 and each advances by $ADDER (four) inside the
- # main loop, interleaved with the first four AES rounds.
- # * $PT and $CT are biased by -64 before the loop because the loop body
- # begins by adding 64 to both; the subq that drives the loop sits ahead
- # of the stores, and the stores leave the flags intact for the jne.
- # * The remainder loop encrypts %r10 single blocks, taking its counter
- # from $CTR1 and incrementing it by one each time.
- $code.=<<___;
- .globl aes256gcmsiv_enc_msg_x4
- .type aes256gcmsiv_enc_msg_x4,\@function,5
- .align 16
- aes256gcmsiv_enc_msg_x4:
- .cfi_startproc
- test $LEN, $LEN
- jnz .L256_enc_msg_x4_start
- ret
-
- .L256_enc_msg_x4_start:
- movq $LEN, %r10
- shrq \$4, $LEN # LEN = num of blocks
- shlq \$60, %r10
- jz .L256_enc_msg_x4_start2
- addq \$1, $LEN
-
- .L256_enc_msg_x4_start2:
- movq $LEN, %r10
- shlq \$62, %r10
- shrq \$62, %r10
-
- # make IV from TAG
- vmovdqa ($TAG), $IV
- vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]
-
- vmovdqa four(%rip), $ADDER # Register to increment counters
- vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
- vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
- vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
- vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
-
- shrq \$2, $LEN
- je .L256_enc_msg_x4_check_remainder
-
- subq \$64, $CT
- subq \$64, $PT
-
- .L256_enc_msg_x4_loop1:
- addq \$64, $CT
- addq \$64, $PT
-
- vmovdqa $CTR1, $STATE1
- vmovdqa $CTR2, $STATE2
- vmovdqa $CTR3, $STATE3
- vmovdqa $CTR4, $STATE4
-
- vpxor ($KS), $STATE1, $STATE1
- vpxor ($KS), $STATE2, $STATE2
- vpxor ($KS), $STATE3, $STATE3
- vpxor ($KS), $STATE4, $STATE4
-
- ${\$aes_round->(1)}
- vpaddd $ADDER, $CTR1, $CTR1
- ${\$aes_round->(2)}
- vpaddd $ADDER, $CTR2, $CTR2
- ${\$aes_round->(3)}
- vpaddd $ADDER, $CTR3, $CTR3
- ${\$aes_round->(4)}
- vpaddd $ADDER, $CTR4, $CTR4
-
- ${\$aes_round->(5)}
- ${\$aes_round->(6)}
- ${\$aes_round->(7)}
- ${\$aes_round->(8)}
- ${\$aes_round->(9)}
- ${\$aes_round->(10)}
- ${\$aes_round->(11)}
- ${\$aes_round->(12)}
- ${\$aes_round->(13)}
- ${\$aes_lastround->(14)}
-
- # XOR with Plaintext
- vpxor 0*16($PT), $STATE1, $STATE1
- vpxor 1*16($PT), $STATE2, $STATE2
- vpxor 2*16($PT), $STATE3, $STATE3
- vpxor 3*16($PT), $STATE4, $STATE4
-
- subq \$1, $LEN
-
- vmovdqu $STATE1, 0*16($CT)
- vmovdqu $STATE2, 1*16($CT)
- vmovdqu $STATE3, 2*16($CT)
- vmovdqu $STATE4, 3*16($CT)
-
- jne .L256_enc_msg_x4_loop1
-
- addq \$64, $CT
- addq \$64, $PT
-
- .L256_enc_msg_x4_check_remainder:
- cmpq \$0, %r10
- je .L256_enc_msg_x4_out
-
- .L256_enc_msg_x4_loop2:
- # encrypt each block separately
- # CTR1 is the highest counter (even if no LOOP done)
-
- vmovdqa $CTR1, $STATE1
- vpaddd one(%rip), $CTR1, $CTR1 # inc counter
- vpxor ($KS), $STATE1, $STATE1
- vaesenc 16($KS), $STATE1, $STATE1
- vaesenc 32($KS), $STATE1, $STATE1
- vaesenc 48($KS), $STATE1, $STATE1
- vaesenc 64($KS), $STATE1, $STATE1
- vaesenc 80($KS), $STATE1, $STATE1
- vaesenc 96($KS), $STATE1, $STATE1
- vaesenc 112($KS), $STATE1, $STATE1
- vaesenc 128($KS), $STATE1, $STATE1
- vaesenc 144($KS), $STATE1, $STATE1
- vaesenc 160($KS), $STATE1, $STATE1
- vaesenc 176($KS), $STATE1, $STATE1
- vaesenc 192($KS), $STATE1, $STATE1
- vaesenc 208($KS), $STATE1, $STATE1
- vaesenclast 224($KS), $STATE1, $STATE1
-
- # XOR with Plaintext
- vpxor ($PT), $STATE1, $STATE1
-
- vmovdqu $STATE1, ($CT)
-
- addq \$16, $PT
- addq \$16, $CT
-
- subq \$1, %r10
- jne .L256_enc_msg_x4_loop2
-
- .L256_enc_msg_x4_out:
- ret
- .cfi_endproc
- .size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
- ___
- }
- aes256gcmsiv_enc_msg_x4();
-
- # Emits aes256gcmsiv_enc_msg_x8, a counter-mode encryption routine that
- # processes eight blocks per main-loop iteration with a 14-round key schedule
- # and then handles any leftover blocks one at a time.
- #
- # The sub takes no arguments and is declared without a prototype, matching the
- # other generator subs in this file.
- sub aes256gcmsiv_enc_msg_x8 {
- my $STATE1 = "%xmm1";
- my $STATE2 = "%xmm2";
- my $STATE3 = "%xmm3";
- my $STATE4 = "%xmm4";
- my $STATE5 = "%xmm5";
- my $STATE6 = "%xmm6";
- my $STATE7 = "%xmm7";
- my $STATE8 = "%xmm8";
- my $CTR1 = "%xmm0";
- my $CTR2 = "%xmm9";
- my $CTR3 = "%xmm10";
- my $CTR4 = "%xmm11";
- my $CTR5 = "%xmm12";
- my $CTR6 = "%xmm13";
- my $CTR7 = "%xmm14";
- # $TMP1 deliberately shares %xmm1 with $STATE1; it is only used while the
- # initial counters are derived, before the main loop overwrites $STATE1.
- my $TMP1 = "%xmm1";
- my $KS = "%rcx";
- my $LEN = "%r8";
- my $PT = "%rdi";
- my $CT = "%rsi";
- my $TAG = "%rdx";
- my $SCHED = "%xmm15";
-
- # Returns the assembly that loads round key $i from $KS into $SCHED and
- # applies one vaesenc round to all eight state registers.
- my $aes_round8 = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $SCHED
- vaesenc $SCHED, $STATE1, $STATE1
- vaesenc $SCHED, $STATE2, $STATE2
- vaesenc $SCHED, $STATE3, $STATE3
- vaesenc $SCHED, $STATE4, $STATE4
- vaesenc $SCHED, $STATE5, $STATE5
- vaesenc $SCHED, $STATE6, $STATE6
- vaesenc $SCHED, $STATE7, $STATE7
- vaesenc $SCHED, $STATE8, $STATE8
- ___
- };
-
- # Same as $aes_round8, but emits the closing vaesenclast round.
- my $aes_lastround8 = sub {
- my ($i) = @_;
- return <<___;
- vmovdqu ${\eval($i*16)}($KS), $SCHED
- vaesenclast $SCHED, $STATE1, $STATE1
- vaesenclast $SCHED, $STATE2, $STATE2
- vaesenclast $SCHED, $STATE3, $STATE3
- vaesenclast $SCHED, $STATE4, $STATE4
- vaesenclast $SCHED, $STATE5, $STATE5
- vaesenclast $SCHED, $STATE6, $STATE6
- vaesenclast $SCHED, $STATE7, $STATE7
- vaesenclast $SCHED, $STATE8, $STATE8
- ___
- };
-
- # void ENC_MSG_x8(unsigned char* PT,
- # unsigned char* CT,
- # unsigned char* TAG,
- # unsigned char* KS,
- # size_t byte_len);
- # parameter 1: %rdi #PT
- # parameter 2: %rsi #CT
- # parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
- # parameter 4: %rcx #KS
- # parameter 5: %r8 #LEN MSG_length in bytes
- #
- # Flow of the emitted routine:
- # * It returns immediately when the byte length is zero.
- # * %r11 is set to a 64-byte-aligned address computed from %rsp - 16
- # (%rsp itself is not moved); the eighth counter lives at that address
- # because only seven counter registers are allocated.
- # * The byte length is converted to a block count, rounded up when it is
- # not a multiple of 16. %r10 then keeps the block count modulo 8 and
- # $LEN becomes the number of eight-block iterations.
- # * Inside the main loop the in-memory eighth counter is advanced by eight
- # through $CTR7, and $CTR7 is then recovered by subtracting one from that
- # new value; the other counters advance by eight between AES rounds.
- # * $PT and $CT are biased by -128 before the loop because the loop body
- # begins by adding 128 to both.
- # * The remainder loop encrypts %r10 single blocks, taking its counter
- # from $CTR1 and incrementing it by one each time.
- $code.=<<___;
- .globl aes256gcmsiv_enc_msg_x8
- .type aes256gcmsiv_enc_msg_x8,\@function,5
- .align 16
- aes256gcmsiv_enc_msg_x8:
- .cfi_startproc
- test $LEN, $LEN
- jnz .L256_enc_msg_x8_start
- ret
-
- .L256_enc_msg_x8_start:
- # Place in stack
- movq %rsp, %r11
- subq \$16, %r11
- andq \$-64, %r11
-
- movq $LEN, %r10
- shrq \$4, $LEN # LEN = num of blocks
- shlq \$60, %r10
- jz .L256_enc_msg_x8_start2
- addq \$1, $LEN
-
- .L256_enc_msg_x8_start2:
- movq $LEN, %r10
- shlq \$61, %r10
- shrq \$61, %r10
-
- # Make IV from TAG
- vmovdqa ($TAG), $TMP1
- vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
-
- # store counter8 on the stack
- vpaddd seven(%rip), $TMP1, $CTR1
- vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07]
- vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
- vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
- vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
- vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
- vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
- vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
- vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
-
- shrq \$3, $LEN
- jz .L256_enc_msg_x8_check_remainder
-
- subq \$128, $CT
- subq \$128, $PT
-
- .L256_enc_msg_x8_loop1:
- addq \$128, $CT
- addq \$128, $PT
-
- vmovdqa $CTR1, $STATE1
- vmovdqa $CTR2, $STATE2
- vmovdqa $CTR3, $STATE3
- vmovdqa $CTR4, $STATE4
- vmovdqa $CTR5, $STATE5
- vmovdqa $CTR6, $STATE6
- vmovdqa $CTR7, $STATE7
- # move from stack
- vmovdqa (%r11), $STATE8
-
- vpxor ($KS), $STATE1, $STATE1
- vpxor ($KS), $STATE2, $STATE2
- vpxor ($KS), $STATE3, $STATE3
- vpxor ($KS), $STATE4, $STATE4
- vpxor ($KS), $STATE5, $STATE5
- vpxor ($KS), $STATE6, $STATE6
- vpxor ($KS), $STATE7, $STATE7
- vpxor ($KS), $STATE8, $STATE8
-
- ${\$aes_round8->(1)}
- vmovdqa (%r11), $CTR7 # deal with CTR8
- vpaddd eight(%rip), $CTR7, $CTR7
- vmovdqa $CTR7, (%r11)
- ${\$aes_round8->(2)}
- vpsubd one(%rip), $CTR7, $CTR7
- ${\$aes_round8->(3)}
- vpaddd eight(%rip), $CTR1, $CTR1
- ${\$aes_round8->(4)}
- vpaddd eight(%rip), $CTR2, $CTR2
- ${\$aes_round8->(5)}
- vpaddd eight(%rip), $CTR3, $CTR3
- ${\$aes_round8->(6)}
- vpaddd eight(%rip), $CTR4, $CTR4
- ${\$aes_round8->(7)}
- vpaddd eight(%rip), $CTR5, $CTR5
- ${\$aes_round8->(8)}
- vpaddd eight(%rip), $CTR6, $CTR6
- ${\$aes_round8->(9)}
- ${\$aes_round8->(10)}
- ${\$aes_round8->(11)}
- ${\$aes_round8->(12)}
- ${\$aes_round8->(13)}
- ${\$aes_lastround8->(14)}
-
- # XOR with Plaintext
- vpxor 0*16($PT), $STATE1, $STATE1
- vpxor 1*16($PT), $STATE2, $STATE2
- vpxor 2*16($PT), $STATE3, $STATE3
- vpxor 3*16($PT), $STATE4, $STATE4
- vpxor 4*16($PT), $STATE5, $STATE5
- vpxor 5*16($PT), $STATE6, $STATE6
- vpxor 6*16($PT), $STATE7, $STATE7
- vpxor 7*16($PT), $STATE8, $STATE8
-
- subq \$1, $LEN
-
- vmovdqu $STATE1, 0*16($CT)
- vmovdqu $STATE2, 1*16($CT)
- vmovdqu $STATE3, 2*16($CT)
- vmovdqu $STATE4, 3*16($CT)
- vmovdqu $STATE5, 4*16($CT)
- vmovdqu $STATE6, 5*16($CT)
- vmovdqu $STATE7, 6*16($CT)
- vmovdqu $STATE8, 7*16($CT)
-
- jne .L256_enc_msg_x8_loop1
-
- addq \$128, $CT
- addq \$128, $PT
-
- .L256_enc_msg_x8_check_remainder:
- cmpq \$0, %r10
- je .L256_enc_msg_x8_out
-
- .L256_enc_msg_x8_loop2:
- # encrypt each block separately
- # CTR1 is the highest counter (even if no LOOP done)
- vmovdqa $CTR1, $STATE1
- vpaddd one(%rip), $CTR1, $CTR1
-
- vpxor ($KS), $STATE1, $STATE1
- vaesenc 16($KS), $STATE1, $STATE1
- vaesenc 32($KS), $STATE1, $STATE1
- vaesenc 48($KS), $STATE1, $STATE1
- vaesenc 64($KS), $STATE1, $STATE1
- vaesenc 80($KS), $STATE1, $STATE1
- vaesenc 96($KS), $STATE1, $STATE1
- vaesenc 112($KS), $STATE1, $STATE1
- vaesenc 128($KS), $STATE1, $STATE1
- vaesenc 144($KS), $STATE1, $STATE1
- vaesenc 160($KS), $STATE1, $STATE1
- vaesenc 176($KS), $STATE1, $STATE1
- vaesenc 192($KS), $STATE1, $STATE1
- vaesenc 208($KS), $STATE1, $STATE1
- vaesenclast 224($KS), $STATE1, $STATE1
-
- # XOR with Plaintext
- vpxor ($PT), $STATE1, $STATE1
-
- vmovdqu $STATE1, ($CT)
-
- addq \$16, $PT
- addq \$16, $CT
- subq \$1, %r10
- jnz .L256_enc_msg_x8_loop2
-
- .L256_enc_msg_x8_out:
- ret
-
- .cfi_endproc
- .size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
- ___
- }
- aes256gcmsiv_enc_msg_x8();
- # Instantiate the decryption generator with a true $aes256 argument, which
- # selects the aes256gcmsiv_dec entry point and makes round 14 the last round.
- aesgcmsiv_dec(1);
-
- # Emits aes256gcmsiv_kdf, which builds six input blocks from the nonce,
- # encrypts each of them with the 14-round key schedule at %rdx and stores the
- # six 16-byte results consecutively at %rsi.
- sub aes256gcmsiv_kdf {
- my $ONE = "%xmm8";
- my $BLOCK1 = "%xmm4";
- my $BLOCK2 = "%xmm6";
- my $BLOCK3 = "%xmm7";
- my $BLOCK4 = "%xmm11";
- my $BLOCK5 = "%xmm12";
- my $BLOCK6 = "%xmm13";
-
- # Returns the assembly that loads round key $i from (%rdx) into register $j
- # and applies one vaesenc round to all six blocks. Callers alternate $j
- # between %xmm1 and %xmm2.
- my $enc_roundx6 = sub {
- my ($i, $j) = @_;
- return <<___;
- vmovdqa ${\eval($i*16)}(%rdx), $j
- vaesenc $j, $BLOCK1, $BLOCK1
- vaesenc $j, $BLOCK2, $BLOCK2
- vaesenc $j, $BLOCK3, $BLOCK3
- vaesenc $j, $BLOCK4, $BLOCK4
- vaesenc $j, $BLOCK5, $BLOCK5
- vaesenc $j, $BLOCK6, $BLOCK6
- ___
- };
-
- # Same as $enc_roundx6, but emits the closing vaesenclast round.
- my $enc_roundlastx6 = sub {
- my ($i, $j) = @_;
- return <<___;
- vmovdqa ${\eval($i*16)}(%rdx), $j
- vaesenclast $j, $BLOCK1, $BLOCK1
- vaesenclast $j, $BLOCK2, $BLOCK2
- vaesenclast $j, $BLOCK3, $BLOCK3
- vaesenclast $j, $BLOCK4, $BLOCK4
- vaesenclast $j, $BLOCK5, $BLOCK5
- vaesenclast $j, $BLOCK6, $BLOCK6
- ___
- };
-
- # void aes256gcmsiv_kdf(const uint8_t nonce[16],
- # uint8_t *out_key_material,
- # const uint8_t *key_schedule);
- #
- # Block construction: vpshufd \$0x90 moves the first three dwords of the
- # nonce into the upper three dword lanes, and and_mask clears the lowest
- # lane. $BLOCK4 is borrowed to hold and_mask before it receives its own
- # value. $BLOCK2..$BLOCK6 are then formed by repeatedly adding $ONE, so the
- # lowest lane runs from 0 to 5 across the six blocks.
- $code.=<<___;
- .globl aes256gcmsiv_kdf
- .type aes256gcmsiv_kdf,\@function,3
- .align 16
- aes256gcmsiv_kdf:
- .cfi_startproc
- # parameter 1: %rdi Pointer to NONCE
- # parameter 2: %rsi Pointer to CT
- # parameter 3: %rdx Pointer to keys
-
- vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
- vmovdqa 0*16(%rdi), $BLOCK1
- vmovdqa and_mask(%rip), $BLOCK4
- vmovdqa one(%rip), $ONE
- vpshufd \$0x90, $BLOCK1, $BLOCK1
- vpand $BLOCK4, $BLOCK1, $BLOCK1
- vpaddd $ONE, $BLOCK1, $BLOCK2
- vpaddd $ONE, $BLOCK2, $BLOCK3
- vpaddd $ONE, $BLOCK3, $BLOCK4
- vpaddd $ONE, $BLOCK4, $BLOCK5
- vpaddd $ONE, $BLOCK5, $BLOCK6
-
- vpxor %xmm1, $BLOCK1, $BLOCK1
- vpxor %xmm1, $BLOCK2, $BLOCK2
- vpxor %xmm1, $BLOCK3, $BLOCK3
- vpxor %xmm1, $BLOCK4, $BLOCK4
- vpxor %xmm1, $BLOCK5, $BLOCK5
- vpxor %xmm1, $BLOCK6, $BLOCK6
-
- ${\$enc_roundx6->(1, "%xmm1")}
- ${\$enc_roundx6->(2, "%xmm2")}
- ${\$enc_roundx6->(3, "%xmm1")}
- ${\$enc_roundx6->(4, "%xmm2")}
- ${\$enc_roundx6->(5, "%xmm1")}
- ${\$enc_roundx6->(6, "%xmm2")}
- ${\$enc_roundx6->(7, "%xmm1")}
- ${\$enc_roundx6->(8, "%xmm2")}
- ${\$enc_roundx6->(9, "%xmm1")}
- ${\$enc_roundx6->(10, "%xmm2")}
- ${\$enc_roundx6->(11, "%xmm1")}
- ${\$enc_roundx6->(12, "%xmm2")}
- ${\$enc_roundx6->(13, "%xmm1")}
- ${\$enc_roundlastx6->(14, "%xmm2")}
-
- vmovdqa $BLOCK1, 0*16(%rsi)
- vmovdqa $BLOCK2, 1*16(%rsi)
- vmovdqa $BLOCK3, 2*16(%rsi)
- vmovdqa $BLOCK4, 3*16(%rsi)
- vmovdqa $BLOCK5, 4*16(%rsi)
- vmovdqa $BLOCK6, 5*16(%rsi)
- ret
- .cfi_endproc
- .size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
- ___
- }
- aes256gcmsiv_kdf();
-
- # Send the accumulated assembly through the x86_64-xlate.pl pipe that STDOUT
- # was aliased to at the top of the script.
- print $code;
-
- # Closing a piped handle flushes the buffered output and waits for the child,
- # so a false return means the write failed or the translator exited with a
- # non-zero status. Abort in that case so the caller does not see a successful
- # exit. $! is zero when the only problem was the child's exit status, which
- # is then available in $?.
- close STDOUT
- or die($! ? "error closing STDOUT: $!"
- : "error closing STDOUT: x86_64-xlate.pl exited with wait status $?");
|