#!/usr/bin/env perl
# Copyright (c) 2017, Shay Gueron.
# Copyright (c) 2017, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
use warnings FATAL => 'all';

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$code.=<<___;
.data
.align 16
one:
.quad 1,0
two:
.quad 2,0
three:
.quad 3,0
four:
.quad 4,0
five:
.quad 5,0
six:
.quad 6,0
seven:
.quad 7,0
eight:
.quad 8,0
OR_MASK:
.long 0x00000000,0x00000000,0x00000000,0x80000000
poly:
.quad 0x1, 0xc200000000000000
mask:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
con1:
.long 1,1,1,1
con2:
.long 0x1b,0x1b,0x1b,0x1b
con3:
.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
and_mask:
.long 0,0xffffffff, 0xffffffff, 0xffffffff
___
$code.=<<___;
.text
___
sub gfmul {
#########################
# a = T
# b = TMP0 - remains unchanged
# res = T
# uses also TMP1,TMP2,TMP3,TMP4
# __m128i GFMUL(__m128i A, __m128i B);
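# GFMUL multiplies two POLYVAL field elements in GF(2^128): the four
# VPCLMULQDQ instructions form the 256-bit carry-less schoolbook
# product, and two 64-bit folds against the high half of poly(%rip)
# reduce it modulo the POLYVAL polynomial x^128+x^127+x^126+x^121+1.
# As a sketch of the instruction sequence below (not a separate
# implementation):
#   [hi:lo] = clmul256(a, b)               # schoolbook product
#   twice: lo = swap64(lo) ^ clmul64(lo.low, 0xc200000000000000)
#   return lo ^ hi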
my $T = "%xmm0";
my $TMP0 = "%xmm1";
my $TMP1 = "%xmm2";
my $TMP2 = "%xmm3";
my $TMP3 = "%xmm4";
my $TMP4 = "%xmm5";
$code.=<<___;
.type GFMUL,\@abi-omnipotent
.align 16
GFMUL:
.cfi_startproc
vpclmulqdq \$0x00, $TMP0, $T, $TMP1
vpclmulqdq \$0x11, $TMP0, $T, $TMP4
vpclmulqdq \$0x10, $TMP0, $T, $TMP2
vpclmulqdq \$0x01, $TMP0, $T, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpslldq \$8, $TMP2, $TMP3
vpsrldq \$8, $TMP2, $TMP2
vpxor $TMP3, $TMP1, $TMP1
vpxor $TMP2, $TMP4, $TMP4
vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
vpshufd \$78, $TMP1, $TMP3
vpxor $TMP3, $TMP2, $TMP1
vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
vpshufd \$78, $TMP1, $TMP3
vpxor $TMP3, $TMP2, $TMP1
vpxor $TMP4, $TMP1, $T
ret
.cfi_endproc
.size GFMUL, .-GFMUL
___
}
gfmul();
sub aesgcmsiv_htable_init {
# aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
# |out_htable|.
# void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);
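# The table layout is Htbl[i] = H^(i+1), i.e. Htbl[0] = H and
# Htbl[7] = H^8, in the order expected by aesgcmsiv_htable_polyval.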
my $Htbl = "%rdi";
my $H = "%rsi";
my $T = "%xmm0";
my $TMP0 = "%xmm1";
$code.=<<___;
.globl aesgcmsiv_htable_init
.type aesgcmsiv_htable_init,\@function,2
.align 16
aesgcmsiv_htable_init:
.cfi_startproc
vmovdqa ($H), $T
vmovdqa $T, $TMP0
vmovdqa $T, ($Htbl) # H
call GFMUL
vmovdqa $T, 16($Htbl) # H^2
call GFMUL
vmovdqa $T, 32($Htbl) # H^3
call GFMUL
vmovdqa $T, 48($Htbl) # H^4
call GFMUL
vmovdqa $T, 64($Htbl) # H^5
call GFMUL
vmovdqa $T, 80($Htbl) # H^6
call GFMUL
vmovdqa $T, 96($Htbl) # H^7
call GFMUL
vmovdqa $T, 112($Htbl) # H^8
ret
.cfi_endproc
.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
___
}
aesgcmsiv_htable_init();
sub aesgcmsiv_htable6_init {
# aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
# |out_htable|.
# void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
#
my $Htbl = "%rdi";
my $H = "%rsi";
my $T = "%xmm0";
my $TMP0 = "%xmm1";
$code.=<<___;
.globl aesgcmsiv_htable6_init
.type aesgcmsiv_htable6_init,\@function,2
.align 16
aesgcmsiv_htable6_init:
.cfi_startproc
vmovdqa ($H), $T
vmovdqa $T, $TMP0
vmovdqa $T, ($Htbl) # H
call GFMUL
vmovdqa $T, 16($Htbl) # H^2
call GFMUL
vmovdqa $T, 32($Htbl) # H^3
call GFMUL
vmovdqa $T, 48($Htbl) # H^4
call GFMUL
vmovdqa $T, 64($Htbl) # H^5
call GFMUL
vmovdqa $T, 80($Htbl) # H^6
ret
.cfi_endproc
.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
___
}
aesgcmsiv_htable6_init();
sub aesgcmsiv_htable_polyval {
# void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
# parameter 1: %rdi Htable - pointer to Htable
# parameter 2: %rsi INp - pointer to input
# parameter 3: %rdx LEN - length of BUFFER in bytes
# parameter 4: %rcx T - pointer to POLYVAL output
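# The function evaluates the POLYVAL Horner recurrence T = (T ^ X_i)*H
# eight blocks at a time: each 128-byte chunk contributes
#   T' = (T ^ B_0)*H^8 ^ B_1*H^7 ^ ... ^ B_7*H,
# one schoolbook multiply per block against the precomputed powers in
# Htbl, with a single deferred reduction spread across the rounds. Any
# leading LEN mod 8 blocks are hashed first against the matching lower
# powers so the main loop always sees whole 8-block chunks.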
my $DATA = "%xmm0";
my $hlp0 = "%r11";
my $Htbl = "%rdi";
my $inp = "%rsi";
my $len = "%rdx";
my $TMP0 = "%xmm3";
my $TMP1 = "%xmm4";
my $TMP2 = "%xmm5";
my $TMP3 = "%xmm6";
my $TMP4 = "%xmm7";
my $Tp = "%rcx";
my $T = "%xmm1";
my $Xhi = "%xmm9";
my $SCHOOLBOOK_AAD = sub {
my ($i)=@_;
return <<___;
vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
___
};
$code.=<<___;
.globl aesgcmsiv_htable_polyval
.type aesgcmsiv_htable_polyval,\@function,4
.align 16
aesgcmsiv_htable_polyval:
.cfi_startproc
test $len, $len
jnz .Lhtable_polyval_start
ret
.Lhtable_polyval_start:
vzeroall
# We hash 8 blocks each iteration. If the total number of blocks is not a
# multiple of 8, we first hash the leading n%8 blocks.
movq $len, $hlp0
andq \$127, $hlp0
jz .Lhtable_polyval_no_prefix
vpxor $Xhi, $Xhi, $Xhi
vmovdqa ($Tp), $T
sub $hlp0, $len
sub \$16, $hlp0
# hash first prefix block
vmovdqu ($inp), $DATA
vpxor $T, $DATA, $DATA
vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
lea 16($inp), $inp
test $hlp0, $hlp0
jnz .Lhtable_polyval_prefix_loop
jmp .Lhtable_polyval_prefix_complete
# hash remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lhtable_polyval_prefix_loop:
sub \$16, $hlp0
vmovdqu ($inp), $DATA # next data block
vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
test $hlp0, $hlp0
lea 16($inp), $inp
jnz .Lhtable_polyval_prefix_loop
.Lhtable_polyval_prefix_complete:
vpsrldq \$8, $TMP2, $TMP3
vpslldq \$8, $TMP2, $TMP2
vpxor $TMP3, $TMP1, $Xhi
vpxor $TMP2, $TMP0, $T
jmp .Lhtable_polyval_main_loop
.Lhtable_polyval_no_prefix:
# At this point we know the number of blocks is a multiple of 8. However,
# the reduction in the main loop includes a multiplication by x^(-128). In
# order to counter this, the existing tag needs to be multiplied by x^128.
# In practice, this just means that it is loaded into $Xhi, not $T.
vpxor $T, $T, $T
vmovdqa ($Tp), $Xhi
.align 64
.Lhtable_polyval_main_loop:
sub \$0x80, $len
jb .Lhtable_polyval_out
vmovdqu 16*7($inp), $DATA # Ii
vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
#########################################################
vmovdqu 16*6($inp), $DATA
${\$SCHOOLBOOK_AAD->(1)}
#########################################################
vmovdqu 16*5($inp), $DATA
vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a
vpalignr \$8, $T, $T, $T
${\$SCHOOLBOOK_AAD->(2)}
vpxor $TMP4, $T, $T # reduction stage 1b
#########################################################
vmovdqu 16*4($inp), $DATA
${\$SCHOOLBOOK_AAD->(3)}
#########################################################
vmovdqu 16*3($inp), $DATA
vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a
vpalignr \$8, $T, $T, $T
${\$SCHOOLBOOK_AAD->(4)}
vpxor $TMP4, $T, $T # reduction stage 2b
#########################################################
vmovdqu 16*2($inp), $DATA
${\$SCHOOLBOOK_AAD->(5)}
vpxor $Xhi, $T, $T # reduction finalize
#########################################################
vmovdqu 16*1($inp), $DATA
${\$SCHOOLBOOK_AAD->(6)}
#########################################################
vmovdqu 16*0($inp), $DATA
vpxor $T, $DATA, $DATA
${\$SCHOOLBOOK_AAD->(7)}
#########################################################
vpsrldq \$8, $TMP2, $TMP3
vpslldq \$8, $TMP2, $TMP2
vpxor $TMP3, $TMP1, $Xhi
vpxor $TMP2, $TMP0, $T
lea 16*8($inp), $inp
jmp .Lhtable_polyval_main_loop
#########################################################
.Lhtable_polyval_out:
vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
vpalignr \$8, $T, $T, $T
vpxor $TMP3, $T, $T
vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
vpalignr \$8, $T, $T, $T
vpxor $TMP3, $T, $T
vpxor $Xhi, $T, $T
vmovdqu $T, ($Tp)
vzeroupper
ret
.cfi_endproc
.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
___
}
aesgcmsiv_htable_polyval();
sub aesgcmsiv_polyval_horner {
#void aesgcmsiv_polyval_horner(unsigned char T[16], // output
# const unsigned char* H, // H
# unsigned char* BUF, // Buffer
# unsigned int blocks); // Len2
#
# parameter 1: %rdi T - pointers to POLYVAL output
# parameter 2: %rsi Hp - pointer to H (user key)
# parameter 3: %rdx INp - pointer to input
# parameter 4: %rcx L - total number of blocks in input BUFFER
#
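# Equivalent C-like pseudocode (illustrative only):
#   for (i = 0; i < L; i++) {
#     RES ^= BUF[i];       // XOR in the next 16-byte block
#     RES = GFMUL(RES, H); // multiply in GF(2^128)
#   }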
my $T = "%rdi";
my $Hp = "%rsi";
my $INp = "%rdx";
my $L = "%rcx";
my $LOC = "%r10";
my $LEN = "%eax";
my $H = "%xmm1";
my $RES = "%xmm0";
$code.=<<___;
.globl aesgcmsiv_polyval_horner
.type aesgcmsiv_polyval_horner,\@function,4
.align 16
aesgcmsiv_polyval_horner:
.cfi_startproc
test $L, $L
jnz .Lpolyval_horner_start
ret
.Lpolyval_horner_start:
# Perform L GFMUL operations for POLYVAL(BIG_BUFFER):
# RES = GFMUL(RES ^ Xi, H) for each 16-byte block Xi
xorq $LOC, $LOC
shlq \$4, $L # L contains number of bytes to process
vmovdqa ($Hp), $H
vmovdqa ($T), $RES
.Lpolyval_horner_loop:
vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi
call GFMUL # RES = RES * H
add \$16, $LOC
cmp $LOC, $L
jne .Lpolyval_horner_loop
# calculation of T is complete. RES=T
vmovdqa $RES, ($T)
ret
.cfi_endproc
.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
___
}
aesgcmsiv_polyval_horner();
# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi
# parameter 2: %rsi
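# The expansion uses the usual AES-NI key-schedule trick: vpshufb with
# mask(%rip) broadcasts RotWord of the previous round key's last word,
# vaesenclast with the round constant in %xmm0 then applies SubWord and
# folds in Rcon (ShiftRows is a no-op because all four words are equal),
# and the vpslldq/vpxor chain computes the running XOR of the four
# words. The loop covers Rcon = 0x01..0x80; con2 supplies 0x1b, doubled
# to 0x36, for the final two rounds.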
$code.=<<___;
.globl aes128gcmsiv_aes_ks
.type aes128gcmsiv_aes_ks,\@function,2
.align 16
aes128gcmsiv_aes_ks:
.cfi_startproc
vmovdqu (%rdi), %xmm1 # xmm1 = user key
vmovdqa %xmm1, (%rsi) # rsi points to output
vmovdqa con1(%rip), %xmm0
vmovdqa mask(%rip), %xmm15
movq \$8, %rax
.Lks128_loop:
addq \$16, %rsi # rsi points to the next round key
subq \$1, %rax
vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, (%rsi)
jne .Lks128_loop
vmovdqa con2(%rip), %xmm0
vpshufb %xmm15, %xmm1, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 16(%rsi)
vpshufb %xmm15, %xmm1, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 32(%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
___
# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi
# parameter 2: %rsi
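# AES-256 expands two 128-bit halves in alternation: even-numbered round
# keys (kept in %xmm1) use RotWord+SubWord+Rcon via vpshufb/vaesenclast
# as in the 128-bit schedule, while odd-numbered ones (in %xmm3) use
# SubWord only: vpshufd 0xff broadcasts the last word and vaesenclast
# with the zero key in %xmm14 applies SubBytes without a constant. The
# vpsllq/vpshufb(con3) pair computes the same running XOR of the four
# words as the vpslldq chain above.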
$code.=<<___;
.globl aes256gcmsiv_aes_ks
.type aes256gcmsiv_aes_ks,\@function,2
.align 16
aes256gcmsiv_aes_ks:
.cfi_startproc
vmovdqu (%rdi), %xmm1
vmovdqu 16(%rdi), %xmm3
vmovdqa %xmm1, (%rsi)
vmovdqa %xmm3, 16(%rsi)
vmovdqa con1(%rip), %xmm0
vmovdqa mask(%rip), %xmm15
vpxor %xmm14, %xmm14, %xmm14
mov \$6, %rax
.Lks256_loop:
add \$32, %rsi
subq \$1, %rax
vpshufb %xmm15, %xmm3, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpsllq \$32, %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpshufb con3(%rip), %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, (%rsi)
vpshufd \$0xff, %xmm1, %xmm2
vaesenclast %xmm14, %xmm2, %xmm2
vpsllq \$32, %xmm3, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpshufb con3(%rip), %xmm3, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpxor %xmm2, %xmm3, %xmm3
vmovdqa %xmm3, 16(%rsi)
jne .Lks256_loop
vpshufb %xmm15, %xmm3, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpsllq \$32, %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpshufb con3(%rip), %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 32(%rsi)
ret
.cfi_endproc
___
sub aes128gcmsiv_aes_ks_enc_x1 {
my $KS1_REGA = "%xmm1";
my $KS1_REGB = "%xmm2";
my $BLOCK1 = "%xmm4";
my $AUXREG = "%xmm3";
my $KS_BLOCK = sub {
my ($reg, $reg2, $auxReg) = @_;
return <<___;
vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3
vpxor $auxReg, $reg, $reg
vpshufb con3(%rip), $reg, $auxReg
vpxor $auxReg, $reg, $reg
vpxor $reg2, $reg, $reg
___
};
my $round = sub {
my ($i, $j) = @_;
return <<___;
vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
vaesenc %xmm1, $BLOCK1, $BLOCK1
vmovdqa %xmm1, ${\eval(16*$i)}($j)
___
};
my $roundlast = sub {
my ($i, $j) = @_;
return <<___;
vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
vaesenclast %xmm0, %xmm2, %xmm2
${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
vaesenclast %xmm1, $BLOCK1, $BLOCK1
vmovdqa %xmm1, ${\eval(16*$i)}($j)
___
};
# parameter 1: %rdi Pointer to PT
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
# parameter 4: %rcx Pointer to initial key
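# The schedule is computed exactly as in aes128gcmsiv_aes_ks above, but
# each freshly derived round key is also fed straight into vaesenc, so a
# single pass both writes the expanded key to (%rdx) and encrypts the
# block at (%rdi).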
$code.=<<___;
.globl aes128gcmsiv_aes_ks_enc_x1
.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes128gcmsiv_aes_ks_enc_x1:
.cfi_startproc
vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key
vmovdqa 0*16(%rdi), $BLOCK1
vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key
vpxor %xmm1, $BLOCK1, $BLOCK1
vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1
vmovdqa mask(%rip), %xmm15 # xmm15 = mask
${\$round->(1, "%rdx")}
${\$round->(2, "%rdx")}
${\$round->(3, "%rdx")}
${\$round->(4, "%rdx")}
${\$round->(5, "%rdx")}
${\$round->(6, "%rdx")}
${\$round->(7, "%rdx")}
${\$round->(8, "%rdx")}
vmovdqa con2(%rip), %xmm0
${\$round->(9, "%rdx")}
${\$roundlast->(10, "%rdx")}
vmovdqa $BLOCK1, 0*16(%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
___
}
aes128gcmsiv_aes_ks_enc_x1();
sub aes128gcmsiv_kdf {
my $BLOCK1 = "%xmm9";
my $BLOCK2 = "%xmm10";
my $BLOCK3 = "%xmm11";
my $BLOCK4 = "%xmm12";
my $BLOCK5 = "%xmm13";
my $BLOCK6 = "%xmm14";
my $ONE = "%xmm13";
my $KSp = "%rdx";
my $STATE_1 = "%xmm1";
my $enc_roundx4 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenc $j, $BLOCK1, $BLOCK1
vaesenc $j, $BLOCK2, $BLOCK2
vaesenc $j, $BLOCK3, $BLOCK3
vaesenc $j, $BLOCK4, $BLOCK4
___
};
my $enc_roundlastx4 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenclast $j, $BLOCK1, $BLOCK1
vaesenclast $j, $BLOCK2, $BLOCK2
vaesenclast $j, $BLOCK3, $BLOCK3
vaesenclast $j, $BLOCK4, $BLOCK4
___
};
# void aes128gcmsiv_kdf(const uint8_t nonce[16],
# uint8_t *out_key_material,
# const uint8_t *key_schedule);
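# vpshufd 0x90 plus the and_mask leave the 96-bit nonce in the top three
# dwords and a little-endian 32-bit counter (0..3) in the first dword.
# Encrypting the four counter blocks yields 64 bytes of output; the
# caller assembles the POLYVAL key and the AES-128 message key from the
# prescribed halves of these blocks (per the AES-GCM-SIV KDF, RFC 8452).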
$code.=<<___;
.globl aes128gcmsiv_kdf
.type aes128gcmsiv_kdf,\@function,3
.align 16
aes128gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi Pointer to NONCE
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
vmovdqa 0*16(%rdi), $BLOCK1
vmovdqa and_mask(%rip), $BLOCK4
vmovdqa one(%rip), $ONE
vpshufd \$0x90, $BLOCK1, $BLOCK1
vpand $BLOCK4, $BLOCK1, $BLOCK1
vpaddd $ONE, $BLOCK1, $BLOCK2
vpaddd $ONE, $BLOCK2, $BLOCK3
vpaddd $ONE, $BLOCK3, $BLOCK4
vpxor %xmm1, $BLOCK1, $BLOCK1
vpxor %xmm1, $BLOCK2, $BLOCK2
vpxor %xmm1, $BLOCK3, $BLOCK3
vpxor %xmm1, $BLOCK4, $BLOCK4
${\$enc_roundx4->(1, "%xmm1")}
${\$enc_roundx4->(2, "%xmm2")}
${\$enc_roundx4->(3, "%xmm1")}
${\$enc_roundx4->(4, "%xmm2")}
${\$enc_roundx4->(5, "%xmm1")}
${\$enc_roundx4->(6, "%xmm2")}
${\$enc_roundx4->(7, "%xmm1")}
${\$enc_roundx4->(8, "%xmm2")}
${\$enc_roundx4->(9, "%xmm1")}
${\$enc_roundlastx4->(10, "%xmm2")}
vmovdqa $BLOCK1, 0*16(%rsi)
vmovdqa $BLOCK2, 1*16(%rsi)
vmovdqa $BLOCK3, 2*16(%rsi)
vmovdqa $BLOCK4, 3*16(%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
___
}
aes128gcmsiv_kdf();
sub aes128gcmsiv_enc_msg_x4 {
my $CTR1 = "%xmm0";
my $CTR2 = "%xmm1";
my $CTR3 = "%xmm2";
my $CTR4 = "%xmm3";
my $ADDER = "%xmm4";
my $STATE1 = "%xmm5";
my $STATE2 = "%xmm6";
my $STATE3 = "%xmm7";
my $STATE4 = "%xmm8";
my $TMP = "%xmm12";
my $TMP2 = "%xmm13";
my $TMP3 = "%xmm14";
my $IV = "%xmm15";
my $PT = "%rdi";
my $CT = "%rsi";
my $TAG = "%rdx";
my $KS = "%rcx";
my $LEN = "%r8";
my $aes_round = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP
vaesenc $TMP, $STATE1, $STATE1
vaesenc $TMP, $STATE2, $STATE2
vaesenc $TMP, $STATE3, $STATE3
vaesenc $TMP, $STATE4, $STATE4
___
};
my $aes_lastround = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP
vaesenclast $TMP, $STATE1, $STATE1
vaesenclast $TMP, $STATE2, $STATE2
vaesenclast $TMP, $STATE3, $STATE3
vaesenclast $TMP, $STATE4, $STATE4
___
};
# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
# unsigned char* TAG, unsigned char* KS,
# size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
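# CTR mode as used by AES-GCM-SIV: the initial counter block is the tag
# with its most significant bit forced to 1 (vpor with OR_MASK), and the
# counter is the first 32-bit word, incremented little-endian by vpaddd.
# Four blocks are keystreamed per main-loop iteration; %r10 holds the
# leftover 0..3 blocks, which are handled one at a time afterwards.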
$code.=<<___;
.globl aes128gcmsiv_enc_msg_x4
.type aes128gcmsiv_enc_msg_x4,\@function,5
.align 16
aes128gcmsiv_enc_msg_x4:
.cfi_startproc
test $LEN, $LEN
jnz .L128_enc_msg_x4_start
ret
.L128_enc_msg_x4_start:
pushq %r12
.cfi_push %r12
pushq %r13
.cfi_push %r13
shrq \$4, $LEN # LEN = num of blocks
movq $LEN, %r10
shlq \$62, %r10
shrq \$62, %r10
# make IV from TAG
vmovdqa ($TAG), $IV
vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00]
vmovdqu four(%rip), $ADDER # Register to increment counters
vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
shrq \$2, $LEN
je .L128_enc_msg_x4_check_remainder
subq \$64, $CT
subq \$64, $PT
.L128_enc_msg_x4_loop1:
addq \$64, $CT
addq \$64, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
${\$aes_round->(1)}
vpaddd $ADDER, $CTR1, $CTR1
${\$aes_round->(2)}
vpaddd $ADDER, $CTR2, $CTR2
${\$aes_round->(3)}
vpaddd $ADDER, $CTR3, $CTR3
${\$aes_round->(4)}
vpaddd $ADDER, $CTR4, $CTR4
${\$aes_round->(5)}
${\$aes_round->(6)}
${\$aes_round->(7)}
${\$aes_round->(8)}
${\$aes_round->(9)}
${\$aes_lastround->(10)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
subq \$1, $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
jne .L128_enc_msg_x4_loop1
addq \$64,$CT
addq \$64,$PT
.L128_enc_msg_x4_check_remainder:
cmpq \$0, %r10
je .L128_enc_msg_x4_out
.L128_enc_msg_x4_loop2:
# enc each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenclast 160($KS), $STATE1, $STATE1
# XOR with plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
subq \$1, %r10
jne .L128_enc_msg_x4_loop2
.L128_enc_msg_x4_out:
popq %r13
.cfi_pop %r13
popq %r12
.cfi_pop %r12
ret
.cfi_endproc
.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
___
}
aes128gcmsiv_enc_msg_x4();
sub aes128gcmsiv_enc_msg_x8 {
my $STATE1 = "%xmm1";
my $STATE2 = "%xmm2";
my $STATE3 = "%xmm3";
my $STATE4 = "%xmm4";
my $STATE5 = "%xmm5";
my $STATE6 = "%xmm6";
my $STATE7 = "%xmm7";
my $STATE8 = "%xmm8";
my $CTR1 = "%xmm0";
my $CTR2 = "%xmm9";
my $CTR3 = "%xmm10";
my $CTR4 = "%xmm11";
my $CTR5 = "%xmm12";
my $CTR6 = "%xmm13";
my $CTR7 = "%xmm14";
my $SCHED = "%xmm15";
my $TMP1 = "%xmm1";
my $TMP2 = "%xmm2";
my $PT = "%rdi";
my $CT = "%rsi";
my $TAG = "%rdx";
my $KS = "%rcx";
my $LEN = "%r8";
my $aes_round8 = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $SCHED
vaesenc $SCHED, $STATE1, $STATE1
vaesenc $SCHED, $STATE2, $STATE2
vaesenc $SCHED, $STATE3, $STATE3
vaesenc $SCHED, $STATE4, $STATE4
vaesenc $SCHED, $STATE5, $STATE5
vaesenc $SCHED, $STATE6, $STATE6
vaesenc $SCHED, $STATE7, $STATE7
vaesenc $SCHED, $STATE8, $STATE8
___
};
my $aes_lastround8 = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $SCHED
vaesenclast $SCHED, $STATE1, $STATE1
vaesenclast $SCHED, $STATE2, $STATE2
vaesenclast $SCHED, $STATE3, $STATE3
vaesenclast $SCHED, $STATE4, $STATE4
vaesenclast $SCHED, $STATE5, $STATE5
vaesenclast $SCHED, $STATE6, $STATE6
vaesenclast $SCHED, $STATE7, $STATE7
vaesenclast $SCHED, $STATE8, $STATE8
___
};
# void aes128gcmsiv_enc_msg_x8(unsigned char* PT,
# unsigned char* CT,
# unsigned char* TAG,
# unsigned char* KS,
# size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
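# Same construction as the x4 version, but eight blocks per iteration.
# The seven counters, eight states and the round key already occupy all
# sixteen xmm registers, so the eighth counter block lives in an aligned
# stack slot and is reloaded and advanced inside the round sequence.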
$code.=<<___;
.globl aes128gcmsiv_enc_msg_x8
.type aes128gcmsiv_enc_msg_x8,\@function,5
.align 16
aes128gcmsiv_enc_msg_x8:
.cfi_startproc
test $LEN, $LEN
jnz .L128_enc_msg_x8_start
ret
.L128_enc_msg_x8_start:
pushq %r12
.cfi_push %r12
pushq %r13
.cfi_push %r13
pushq %rbp
.cfi_push %rbp
movq %rsp, %rbp
.cfi_def_cfa_register rbp
# Reserve an aligned scratch slot on the stack
subq \$128, %rsp
andq \$-64, %rsp
shrq \$4, $LEN # LEN = num of blocks
movq $LEN, %r10
shlq \$61, %r10
shrq \$61, %r10
# make IV from TAG
vmovdqu ($TAG), $TMP1
vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
# store counter8 on the stack
vpaddd seven(%rip), $TMP1, $CTR1
vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07]
vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
shrq \$3, $LEN
je .L128_enc_msg_x8_check_remainder
subq \$128, $CT
subq \$128, $PT
.L128_enc_msg_x8_loop1:
addq \$128, $CT
addq \$128, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vmovdqa $CTR5, $STATE5
vmovdqa $CTR6, $STATE6
vmovdqa $CTR7, $STATE7
# move from stack
vmovdqu (%rsp), $STATE8
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
vpxor ($KS), $STATE5, $STATE5
vpxor ($KS), $STATE6, $STATE6
vpxor ($KS), $STATE7, $STATE7
vpxor ($KS), $STATE8, $STATE8
${\$aes_round8->(1)}
vmovdqu (%rsp), $CTR7 # deal with CTR8
vpaddd eight(%rip), $CTR7, $CTR7
vmovdqu $CTR7, (%rsp)
${\$aes_round8->(2)}
vpsubd one(%rip), $CTR7, $CTR7
${\$aes_round8->(3)}
vpaddd eight(%rip), $CTR1, $CTR1
${\$aes_round8->(4)}
vpaddd eight(%rip), $CTR2, $CTR2
${\$aes_round8->(5)}
vpaddd eight(%rip), $CTR3, $CTR3
${\$aes_round8->(6)}
vpaddd eight(%rip), $CTR4, $CTR4
${\$aes_round8->(7)}
vpaddd eight(%rip), $CTR5, $CTR5
${\$aes_round8->(8)}
vpaddd eight(%rip), $CTR6, $CTR6
${\$aes_round8->(9)}
${\$aes_lastround8->(10)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
vpxor 4*16($PT), $STATE5, $STATE5
vpxor 5*16($PT), $STATE6, $STATE6
vpxor 6*16($PT), $STATE7, $STATE7
vpxor 7*16($PT), $STATE8, $STATE8
dec $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
vmovdqu $STATE5, 4*16($CT)
vmovdqu $STATE6, 5*16($CT)
vmovdqu $STATE7, 6*16($CT)
vmovdqu $STATE8, 7*16($CT)
jne .L128_enc_msg_x8_loop1
addq \$128, $CT
addq \$128, $PT
.L128_enc_msg_x8_check_remainder:
cmpq \$0, %r10
je .L128_enc_msg_x8_out
.L128_enc_msg_x8_loop2:
# enc each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenclast 160($KS), $STATE1, $STATE1
# XOR with Plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
decq %r10
jne .L128_enc_msg_x8_loop2
.L128_enc_msg_x8_out:
movq %rbp, %rsp
.cfi_def_cfa_register %rsp
popq %rbp
.cfi_pop %rbp
popq %r13
.cfi_pop %r13
popq %r12
.cfi_pop %r12
ret
.cfi_endproc
.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
___
}
aes128gcmsiv_enc_msg_x8();
sub aesgcmsiv_dec {
my ($aes256) = @_;
my $T = "%xmm0";
my $TMP0 = "%xmm1";
my $TMP1 = "%xmm2";
my $TMP2 = "%xmm3";
my $TMP3 = "%xmm4";
my $TMP4 = "%xmm5";
my $TMP5 = "%xmm6";
my $CTR1 = "%xmm7";
my $CTR2 = "%xmm8";
my $CTR3 = "%xmm9";
my $CTR4 = "%xmm10";
my $CTR5 = "%xmm11";
my $CTR6 = "%xmm12";
my $CTR = "%xmm15";
my $CT = "%rdi";
my $PT = "%rsi";
my $POL = "%rdx";
my $Htbl = "%rcx";
my $KS = "%r8";
my $LEN = "%r9";
my $secureBuffer = "%rax";
my $HTABLE_ROUNDS = "%xmm13";
my $labelPrefix = "128";
if ($aes256) {
$labelPrefix = "256";
}
my $aes_round_dec = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP3
vaesenc $TMP3, $CTR1, $CTR1
vaesenc $TMP3, $CTR2, $CTR2
vaesenc $TMP3, $CTR3, $CTR3
vaesenc $TMP3, $CTR4, $CTR4
vaesenc $TMP3, $CTR5, $CTR5
vaesenc $TMP3, $CTR6, $CTR6
___
};
my $aes_lastround_dec = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP3
vaesenclast $TMP3, $CTR1, $CTR1
vaesenclast $TMP3, $CTR2, $CTR2
vaesenclast $TMP3, $CTR3, $CTR3
vaesenclast $TMP3, $CTR4, $CTR4
vaesenclast $TMP3, $CTR5, $CTR5
vaesenclast $TMP3, $CTR6, $CTR6
___
};
my $schoolbook = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
___
};
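# Decryption interleaves the two data paths: while six fresh counter
# blocks run through the AES rounds, the six plaintext blocks recovered
# in the previous iteration (parked in the scratch area addressed via
# $secureBuffer, 32 bytes past the POLYVAL pointer) are folded into the
# POLYVAL accumulator using the precomputed powers H^1..H^6 in $Htbl.
# Each $schoolbook call above multiplies one saved block by one table
# power, accumulating the unreduced 256-bit sum in $TMP0/$TMP1/$TMP2.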
if ($aes256) {
$code.=<<___;
.globl aes256gcmsiv_dec
.type aes256gcmsiv_dec,\@function,6
.align 16
aes256gcmsiv_dec:
___
} else {
$code.=<<___;
.globl aes128gcmsiv_dec
.type aes128gcmsiv_dec,\@function,6
.align 16
aes128gcmsiv_dec:
___
}
$code.=<<___;
.cfi_startproc
test \$~15, $LEN
jnz .L${labelPrefix}_dec_start
ret
.L${labelPrefix}_dec_start:
vzeroupper
vmovdqa ($POL), $T
movq $POL, $secureBuffer
leaq 32($secureBuffer), $secureBuffer
leaq 32($Htbl), $Htbl
# make CTRBLKs from given tag.
vmovdqu ($CT,$LEN), $CTR
vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00]
andq \$~15, $LEN
# If fewer than 6 blocks remain, process them one at a time
cmp \$96, $LEN
jb .L${labelPrefix}_dec_loop2
# Decrypt the first six blocks
sub \$96, $LEN
vmovdqa $CTR, $CTR1
vpaddd one(%rip), $CTR1, $CTR2
vpaddd two(%rip), $CTR1, $CTR3
vpaddd one(%rip), $CTR3, $CTR4
vpaddd two(%rip), $CTR3, $CTR5
vpaddd one(%rip), $CTR5, $CTR6
vpaddd two(%rip), $CTR5, $CTR
vpxor ($KS), $CTR1, $CTR1
vpxor ($KS), $CTR2, $CTR2
vpxor ($KS), $CTR3, $CTR3
vpxor ($KS), $CTR4, $CTR4
vpxor ($KS), $CTR5, $CTR5
vpxor ($KS), $CTR6, $CTR6
${\$aes_round_dec->(1)}
${\$aes_round_dec->(2)}
${\$aes_round_dec->(3)}
${\$aes_round_dec->(4)}
${\$aes_round_dec->(5)}
${\$aes_round_dec->(6)}
${\$aes_round_dec->(7)}
${\$aes_round_dec->(8)}
${\$aes_round_dec->(9)}
___
if ($aes256) {
$code.=<<___;
${\$aes_round_dec->(10)}
${\$aes_round_dec->(11)}
${\$aes_round_dec->(12)}
${\$aes_round_dec->(13)}
${\$aes_lastround_dec->(14)}
___
} else {
$code.=<<___;
${\$aes_lastround_dec->(10)}
___
}
$code.=<<___;
# XOR with CT
vpxor 0*16($CT), $CTR1, $CTR1
vpxor 1*16($CT), $CTR2, $CTR2
vpxor 2*16($CT), $CTR3, $CTR3
vpxor 3*16($CT), $CTR4, $CTR4
vpxor 4*16($CT), $CTR5, $CTR5
vpxor 5*16($CT), $CTR6, $CTR6
vmovdqu $CTR1, 0*16($PT)
vmovdqu $CTR2, 1*16($PT)
vmovdqu $CTR3, 2*16($PT)
vmovdqu $CTR4, 3*16($PT)
vmovdqu $CTR5, 4*16($PT)
vmovdqu $CTR6, 5*16($PT)
addq \$96, $CT
addq \$96, $PT
jmp .L${labelPrefix}_dec_loop1
# Decrypt 6 blocks each time while hashing previous 6 blocks
.align 64
.L${labelPrefix}_dec_loop1:
cmp \$96, $LEN
jb .L${labelPrefix}_dec_finish_96
sub \$96, $LEN
vmovdqa $CTR6, $TMP5
vmovdqa $CTR5, 1*16-32($secureBuffer)
vmovdqa $CTR4, 2*16-32($secureBuffer)
vmovdqa $CTR3, 3*16-32($secureBuffer)
vmovdqa $CTR2, 4*16-32($secureBuffer)
vmovdqa $CTR1, 5*16-32($secureBuffer)
vmovdqa $CTR, $CTR1
vpaddd one(%rip), $CTR1, $CTR2
vpaddd two(%rip), $CTR1, $CTR3
vpaddd one(%rip), $CTR3, $CTR4
vpaddd two(%rip), $CTR3, $CTR5
vpaddd one(%rip), $CTR5, $CTR6
vpaddd two(%rip), $CTR5, $CTR
vmovdqa ($KS), $TMP3
vpxor $TMP3, $CTR1, $CTR1
vpxor $TMP3, $CTR2, $CTR2
vpxor $TMP3, $CTR3, $CTR3
vpxor $TMP3, $CTR4, $CTR4
vpxor $TMP3, $CTR5, $CTR5
vpxor $TMP3, $CTR6, $CTR6
vmovdqu 0*16-32($Htbl), $TMP3
vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
${\$aes_round_dec->(1)}
${\$schoolbook->(1)}
${\$aes_round_dec->(2)}
${\$schoolbook->(2)}
${\$aes_round_dec->(3)}
${\$schoolbook->(3)}
${\$aes_round_dec->(4)}
${\$schoolbook->(4)}
${\$aes_round_dec->(5)}
${\$aes_round_dec->(6)}
${\$aes_round_dec->(7)}
vmovdqa 5*16-32($secureBuffer), $TMP5
vpxor $T, $TMP5, $TMP5
vmovdqu 5*16-32($Htbl), $TMP4
vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
${\$aes_round_dec->(8)}
vpsrldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP1, $TMP4
vpslldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP2, $T
vmovdqa poly(%rip), $TMP2
${\$aes_round_dec->(9)}
___
if ($aes256) {
$code.=<<___;
${\$aes_round_dec->(10)}
${\$aes_round_dec->(11)}
${\$aes_round_dec->(12)}
${\$aes_round_dec->(13)}
vmovdqu 14*16($KS), $TMP5
___
} else {
$code.=<<___;
vmovdqu 10*16($KS), $TMP5
___
}
$code.=<<___;
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vpxor 0*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR1, $CTR1
vpxor 1*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR2, $CTR2
vpxor 2*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR3, $CTR3
vpxor 3*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR4, $CTR4
vpxor 4*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR5, $CTR5
vpxor 5*16($CT), $TMP5, $TMP3
vaesenclast $TMP3, $CTR6, $CTR6
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vmovdqu $CTR1, 0*16($PT)
vmovdqu $CTR2, 1*16($PT)
vmovdqu $CTR3, 2*16($PT)
vmovdqu $CTR4, 3*16($PT)
vmovdqu $CTR5, 4*16($PT)
vmovdqu $CTR6, 5*16($PT)
vpxor $TMP4, $T, $T
lea 96($CT), $CT
lea 96($PT), $PT
jmp .L${labelPrefix}_dec_loop1
.L${labelPrefix}_dec_finish_96:
vmovdqa $CTR6, $TMP5
vmovdqa $CTR5, 1*16-32($secureBuffer)
vmovdqa $CTR4, 2*16-32($secureBuffer)
vmovdqa $CTR3, 3*16-32($secureBuffer)
vmovdqa $CTR2, 4*16-32($secureBuffer)
vmovdqa $CTR1, 5*16-32($secureBuffer)
vmovdqu 0*16-32($Htbl), $TMP3
vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
${\$schoolbook->(1)}
${\$schoolbook->(2)}
${\$schoolbook->(3)}
${\$schoolbook->(4)}
vmovdqu 5*16-32($secureBuffer), $TMP5
vpxor $T, $TMP5, $TMP5
vmovdqu 5*16-32($Htbl), $TMP4
vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpsrldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP1, $TMP4
vpslldq \$8, $TMP0, $TMP3
vpxor $TMP3, $TMP2, $T
vmovdqa poly(%rip), $TMP2
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vpalignr \$8, $T, $T, $TMP1
vpclmulqdq \$0x10, $TMP2, $T, $T
vpxor $T, $TMP1, $T
vpxor $TMP4, $T, $T
.L${labelPrefix}_dec_loop2:
# Decrypt any remaining whole blocks one at a time,
# falling through to the exit when none remain
cmp \$16, $LEN
jb .L${labelPrefix}_dec_out
sub \$16, $LEN
vmovdqa $CTR, $TMP1
vpaddd one(%rip), $CTR, $CTR
vpxor 0*16($KS), $TMP1, $TMP1
vaesenc 1*16($KS), $TMP1, $TMP1
vaesenc 2*16($KS), $TMP1, $TMP1
vaesenc 3*16($KS), $TMP1, $TMP1
vaesenc 4*16($KS), $TMP1, $TMP1
vaesenc 5*16($KS), $TMP1, $TMP1
vaesenc 6*16($KS), $TMP1, $TMP1
vaesenc 7*16($KS), $TMP1, $TMP1
vaesenc 8*16($KS), $TMP1, $TMP1
vaesenc 9*16($KS), $TMP1, $TMP1
___
if ($aes256) {
$code.=<<___;
vaesenc 10*16($KS), $TMP1, $TMP1
vaesenc 11*16($KS), $TMP1, $TMP1
vaesenc 12*16($KS), $TMP1, $TMP1
vaesenc 13*16($KS), $TMP1, $TMP1
vaesenclast 14*16($KS), $TMP1, $TMP1
___
} else {
$code.=<<___;
vaesenclast 10*16($KS), $TMP1, $TMP1
___
}
$code.=<<___;
vpxor ($CT), $TMP1, $TMP1
vmovdqu $TMP1, ($PT)
addq \$16, $CT
addq \$16, $PT
vpxor $TMP1, $T, $T
vmovdqa -32($Htbl), $TMP0
call GFMUL
jmp .L${labelPrefix}_dec_loop2
.L${labelPrefix}_dec_out:
vmovdqu $T, ($POL)
ret
.cfi_endproc
___
if ($aes256) {
$code.=<<___;
.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
___
} else {
$code.=<<___;
.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
___
}
}
aesgcmsiv_dec(0); # emit the AES-128 version
sub aes128gcmsiv_ecb_enc_block {
my $STATE_1 = "%xmm1";
my $KSp = "%rdx";
# parameter 1: PT %rdi (pointer to 128 bit)
# parameter 2: CT %rsi (pointer to 128 bit)
# parameter 3: ks %rdx (pointer to ks)
$code.=<<___;
.globl aes128gcmsiv_ecb_enc_block
.type aes128gcmsiv_ecb_enc_block,\@function,3
.align 16
aes128gcmsiv_ecb_enc_block:
.cfi_startproc
vmovdqa (%rdi), $STATE_1
vpxor ($KSp), $STATE_1, $STATE_1
vaesenc 1*16($KSp), $STATE_1, $STATE_1
vaesenc 2*16($KSp), $STATE_1, $STATE_1
vaesenc 3*16($KSp), $STATE_1, $STATE_1
vaesenc 4*16($KSp), $STATE_1, $STATE_1
vaesenc 5*16($KSp), $STATE_1, $STATE_1
vaesenc 6*16($KSp), $STATE_1, $STATE_1
vaesenc 7*16($KSp), $STATE_1, $STATE_1
vaesenc 8*16($KSp), $STATE_1, $STATE_1
vaesenc 9*16($KSp), $STATE_1, $STATE_1
vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV
vmovdqa $STATE_1, (%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
___
}
aes128gcmsiv_ecb_enc_block();
sub aes256gcmsiv_aes_ks_enc_x1 {
my $KS = "%rdx";
my $KEYp = "%rcx";
my $CON_MASK = "%xmm0";
my $MASK_256 = "%xmm15";
my $KEY_1 = "%xmm1";
my $KEY_2 = "%xmm3";
my $BLOCK1 = "%xmm8";
my $AUX_REG = "%xmm14";
my $PT = "%rdi";
my $CT = "%rsi";
my $round_double = sub {
my ($i, $j) = @_;
return <<___;
vpshufb %xmm15, %xmm3, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpslldq \$4, %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpslldq \$4, %xmm4, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpslldq \$4, %xmm4, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vaesenc %xmm1, $BLOCK1, $BLOCK1
vmovdqu %xmm1, ${\eval(16*$i)}($KS)
vpshufd \$0xff, %xmm1, %xmm2
vaesenclast %xmm14, %xmm2, %xmm2
vpslldq \$4, %xmm3, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpslldq \$4, %xmm4, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpslldq \$4, %xmm4, %xmm4
vpxor %xmm4, %xmm3, %xmm3
vpxor %xmm2, %xmm3, %xmm3
vaesenc %xmm3, $BLOCK1, $BLOCK1
vmovdqu %xmm3, ${\eval(16*$j)}($KS)
___
};
my $round_last = sub {
my ($i) = @_;
return <<___;
vpshufb %xmm15, %xmm3, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslldq \$4, %xmm1, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpslldq \$4, %xmm4, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpslldq \$4, %xmm4, %xmm4
vpxor %xmm4, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vaesenclast %xmm1, $BLOCK1, $BLOCK1
vmovdqu %xmm1, ${\eval(16*$i)}($KS)
___
};
# parameter 1: %rdi Pointer to PT1
# parameter 2: %rsi Pointer to CT1
# parameter 3: %rdx Pointer to KS
# parameter 4: %rcx Pointer to initial key
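# As in the 128-bit variant, key expansion and a single block encryption
# are fused: each $round_double call derives two round keys (one from
# each 128-bit half of the key, per the AES-256 schedule), stores them
# at ($KS), and advances $BLOCK1 by two vaesenc rounds.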
$code.=<<___;
.globl aes256gcmsiv_aes_ks_enc_x1
.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
.align 16
aes256gcmsiv_aes_ks_enc_x1:
.cfi_startproc
vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1
vmovdqa mask(%rip), $MASK_256 # MASK_256
vmovdqa ($PT), $BLOCK1
vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key
vmovdqa 16($KEYp), $KEY_2
vpxor $KEY_1, $BLOCK1, $BLOCK1
vaesenc $KEY_2, $BLOCK1, $BLOCK1
vmovdqu $KEY_1, ($KS) # First round key
vmovdqu $KEY_2, 16($KS)
vpxor $AUX_REG, $AUX_REG, $AUX_REG
${\$round_double->(2, 3)}
${\$round_double->(4, 5)}
${\$round_double->(6, 7)}
${\$round_double->(8, 9)}
${\$round_double->(10, 11)}
${\$round_double->(12, 13)}
${\$round_last->(14)}
vmovdqa $BLOCK1, ($CT)
ret
.cfi_endproc
.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
___
}
aes256gcmsiv_aes_ks_enc_x1();
sub aes256gcmsiv_ecb_enc_block {
my $STATE_1 = "%xmm1";
my $PT = "%rdi";
my $CT = "%rsi";
my $KSp = "%rdx";
# parameter 1: PT %rdi (pointer to 128 bit)
# parameter 2: CT %rsi (pointer to 128 bit)
# parameter 3: ks %rdx (pointer to ks)
$code.=<<___;
.globl aes256gcmsiv_ecb_enc_block
.type aes256gcmsiv_ecb_enc_block,\@function,3
.align 16
aes256gcmsiv_ecb_enc_block:
.cfi_startproc
vmovdqa (%rdi), $STATE_1
vpxor ($KSp), $STATE_1, $STATE_1
vaesenc 1*16($KSp), $STATE_1, $STATE_1
vaesenc 2*16($KSp), $STATE_1, $STATE_1
vaesenc 3*16($KSp), $STATE_1, $STATE_1
vaesenc 4*16($KSp), $STATE_1, $STATE_1
vaesenc 5*16($KSp), $STATE_1, $STATE_1
vaesenc 6*16($KSp), $STATE_1, $STATE_1
vaesenc 7*16($KSp), $STATE_1, $STATE_1
vaesenc 8*16($KSp), $STATE_1, $STATE_1
vaesenc 9*16($KSp), $STATE_1, $STATE_1
vaesenc 10*16($KSp), $STATE_1, $STATE_1
vaesenc 11*16($KSp), $STATE_1, $STATE_1
vaesenc 12*16($KSp), $STATE_1, $STATE_1
vaesenc 13*16($KSp), $STATE_1, $STATE_1
vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV
vmovdqa $STATE_1, (%rsi)
ret
.cfi_endproc
.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
___
}
aes256gcmsiv_ecb_enc_block();
sub aes256gcmsiv_enc_msg_x4 {
my $CTR1 = "%xmm0";
my $CTR2 = "%xmm1";
my $CTR3 = "%xmm2";
my $CTR4 = "%xmm3";
my $ADDER = "%xmm4";
my $STATE1 = "%xmm5";
my $STATE2 = "%xmm6";
my $STATE3 = "%xmm7";
my $STATE4 = "%xmm8";
my $TMP = "%xmm12";
my $TMP2 = "%xmm13";
my $TMP3 = "%xmm14";
my $IV = "%xmm15";
my $PT = "%rdi";
my $CT = "%rsi";
my $TAG = "%rdx";
my $KS = "%rcx";
my $LEN = "%r8";
my $aes_round = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP
vaesenc $TMP, $STATE1, $STATE1
vaesenc $TMP, $STATE2, $STATE2
vaesenc $TMP, $STATE3, $STATE3
vaesenc $TMP, $STATE4, $STATE4
___
};
my $aes_lastround = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $TMP
vaesenclast $TMP, $STATE1, $STATE1
vaesenclast $TMP, $STATE2, $STATE2
vaesenclast $TMP, $STATE3, $STATE3
vaesenclast $TMP, $STATE4, $STATE4
___
};
# void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
# unsigned char* TAG, unsigned char* KS,
# size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
$code.=<<___;
.globl aes256gcmsiv_enc_msg_x4
.type aes256gcmsiv_enc_msg_x4,\@function,5
.align 16
aes256gcmsiv_enc_msg_x4:
.cfi_startproc
test $LEN, $LEN
jnz .L256_enc_msg_x4_start
ret
.L256_enc_msg_x4_start:
movq $LEN, %r10
shrq \$4, $LEN # LEN = num of blocks
shlq \$60, %r10
jz .L256_enc_msg_x4_start2
addq \$1, $LEN
.L256_enc_msg_x4_start2:
movq $LEN, %r10
shlq \$62, %r10
shrq \$62, %r10
# make IV from TAG
vmovdqa ($TAG), $IV
vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]
vmovdqa four(%rip), $ADDER # Register to increment counters
vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
shrq \$2, $LEN
je .L256_enc_msg_x4_check_remainder
subq \$64, $CT
subq \$64, $PT
.L256_enc_msg_x4_loop1:
addq \$64, $CT
addq \$64, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
${\$aes_round->(1)}
vpaddd $ADDER, $CTR1, $CTR1
${\$aes_round->(2)}
vpaddd $ADDER, $CTR2, $CTR2
${\$aes_round->(3)}
vpaddd $ADDER, $CTR3, $CTR3
${\$aes_round->(4)}
vpaddd $ADDER, $CTR4, $CTR4
${\$aes_round->(5)}
${\$aes_round->(6)}
${\$aes_round->(7)}
${\$aes_round->(8)}
${\$aes_round->(9)}
${\$aes_round->(10)}
${\$aes_round->(11)}
${\$aes_round->(12)}
${\$aes_round->(13)}
${\$aes_lastround->(14)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
subq \$1, $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
jne .L256_enc_msg_x4_loop1
addq \$64, $CT
addq \$64, $PT
.L256_enc_msg_x4_check_remainder:
cmpq \$0, %r10
je .L256_enc_msg_x4_out
.L256_enc_msg_x4_loop2:
# encrypt each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenc 160($KS), $STATE1, $STATE1
vaesenc 176($KS), $STATE1, $STATE1
vaesenc 192($KS), $STATE1, $STATE1
vaesenc 208($KS), $STATE1, $STATE1
vaesenclast 224($KS), $STATE1, $STATE1
# XOR with Plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
subq \$1, %r10
jne .L256_enc_msg_x4_loop2
.L256_enc_msg_x4_out:
ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
___
}
aes256gcmsiv_enc_msg_x4();
sub aes256gcmsiv_enc_msg_x8 {
my $STATE1 = "%xmm1";
my $STATE2 = "%xmm2";
my $STATE3 = "%xmm3";
my $STATE4 = "%xmm4";
my $STATE5 = "%xmm5";
my $STATE6 = "%xmm6";
my $STATE7 = "%xmm7";
my $STATE8 = "%xmm8";
my $CTR1 = "%xmm0";
my $CTR2 = "%xmm9";
my $CTR3 = "%xmm10";
my $CTR4 = "%xmm11";
my $CTR5 = "%xmm12";
my $CTR6 = "%xmm13";
my $CTR7 = "%xmm14";
my $TMP1 = "%xmm1";
my $TMP2 = "%xmm2";
my $KS = "%rcx";
my $LEN = "%r8";
my $PT = "%rdi";
my $CT = "%rsi";
my $TAG = "%rdx";
my $SCHED = "%xmm15";
my $aes_round8 = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $SCHED
vaesenc $SCHED, $STATE1, $STATE1
vaesenc $SCHED, $STATE2, $STATE2
vaesenc $SCHED, $STATE3, $STATE3
vaesenc $SCHED, $STATE4, $STATE4
vaesenc $SCHED, $STATE5, $STATE5
vaesenc $SCHED, $STATE6, $STATE6
vaesenc $SCHED, $STATE7, $STATE7
vaesenc $SCHED, $STATE8, $STATE8
___
};
my $aes_lastround8 = sub {
my ($i) = @_;
return <<___;
vmovdqu ${\eval($i*16)}($KS), $SCHED
vaesenclast $SCHED, $STATE1, $STATE1
vaesenclast $SCHED, $STATE2, $STATE2
vaesenclast $SCHED, $STATE3, $STATE3
vaesenclast $SCHED, $STATE4, $STATE4
vaesenclast $SCHED, $STATE5, $STATE5
vaesenclast $SCHED, $STATE6, $STATE6
vaesenclast $SCHED, $STATE7, $STATE7
vaesenclast $SCHED, $STATE8, $STATE8
___
};
# void aes256gcmsiv_enc_msg_x8(unsigned char* PT,
# unsigned char* CT,
# unsigned char* TAG,
# unsigned char* KS,
# size_t byte_len);
# parameter 1: %rdi #PT
# parameter 2: %rsi #CT
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
# parameter 4: %rcx #KS
# parameter 5: %r8 #LEN MSG_length in bytes
$code.=<<___;
.globl aes256gcmsiv_enc_msg_x8
.type aes256gcmsiv_enc_msg_x8,\@function,5
.align 16
aes256gcmsiv_enc_msg_x8:
.cfi_startproc
test $LEN, $LEN
jnz .L256_enc_msg_x8_start
ret
.L256_enc_msg_x8_start:
# Use an aligned 16-byte scratch slot just below the stack pointer
movq %rsp, %r11
subq \$16, %r11
andq \$-64, %r11
movq $LEN, %r10
shrq \$4, $LEN # LEN = num of blocks
shlq \$60, %r10
jz .L256_enc_msg_x8_start2
addq \$1, $LEN
.L256_enc_msg_x8_start2:
movq $LEN, %r10
shlq \$61, %r10
shrq \$61, %r10
# Make IV from TAG
vmovdqa ($TAG), $TMP1
vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
# store counter8 on the stack
vpaddd seven(%rip), $TMP1, $CTR1
vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07]
vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
shrq \$3, $LEN
jz .L256_enc_msg_x8_check_remainder
subq \$128, $CT
subq \$128, $PT
.L256_enc_msg_x8_loop1:
addq \$128, $CT
addq \$128, $PT
vmovdqa $CTR1, $STATE1
vmovdqa $CTR2, $STATE2
vmovdqa $CTR3, $STATE3
vmovdqa $CTR4, $STATE4
vmovdqa $CTR5, $STATE5
vmovdqa $CTR6, $STATE6
vmovdqa $CTR7, $STATE7
# move from stack
vmovdqa (%r11), $STATE8
vpxor ($KS), $STATE1, $STATE1
vpxor ($KS), $STATE2, $STATE2
vpxor ($KS), $STATE3, $STATE3
vpxor ($KS), $STATE4, $STATE4
vpxor ($KS), $STATE5, $STATE5
vpxor ($KS), $STATE6, $STATE6
vpxor ($KS), $STATE7, $STATE7
vpxor ($KS), $STATE8, $STATE8
${\$aes_round8->(1)}
vmovdqa (%r11), $CTR7 # deal with CTR8
vpaddd eight(%rip), $CTR7, $CTR7
vmovdqa $CTR7, (%r11)
${\$aes_round8->(2)}
vpsubd one(%rip), $CTR7, $CTR7
${\$aes_round8->(3)}
vpaddd eight(%rip), $CTR1, $CTR1
${\$aes_round8->(4)}
vpaddd eight(%rip), $CTR2, $CTR2
${\$aes_round8->(5)}
vpaddd eight(%rip), $CTR3, $CTR3
${\$aes_round8->(6)}
vpaddd eight(%rip), $CTR4, $CTR4
${\$aes_round8->(7)}
vpaddd eight(%rip), $CTR5, $CTR5
${\$aes_round8->(8)}
vpaddd eight(%rip), $CTR6, $CTR6
${\$aes_round8->(9)}
${\$aes_round8->(10)}
${\$aes_round8->(11)}
${\$aes_round8->(12)}
${\$aes_round8->(13)}
${\$aes_lastround8->(14)}
# XOR with Plaintext
vpxor 0*16($PT), $STATE1, $STATE1
vpxor 1*16($PT), $STATE2, $STATE2
vpxor 2*16($PT), $STATE3, $STATE3
vpxor 3*16($PT), $STATE4, $STATE4
vpxor 4*16($PT), $STATE5, $STATE5
vpxor 5*16($PT), $STATE6, $STATE6
vpxor 6*16($PT), $STATE7, $STATE7
vpxor 7*16($PT), $STATE8, $STATE8
subq \$1, $LEN
vmovdqu $STATE1, 0*16($CT)
vmovdqu $STATE2, 1*16($CT)
vmovdqu $STATE3, 2*16($CT)
vmovdqu $STATE4, 3*16($CT)
vmovdqu $STATE5, 4*16($CT)
vmovdqu $STATE6, 5*16($CT)
vmovdqu $STATE7, 6*16($CT)
vmovdqu $STATE8, 7*16($CT)
jne .L256_enc_msg_x8_loop1
addq \$128, $CT
addq \$128, $PT
.L256_enc_msg_x8_check_remainder:
cmpq \$0, %r10
je .L256_enc_msg_x8_out
.L256_enc_msg_x8_loop2:
# encrypt each block separately
# CTR1 is the highest counter (even if no LOOP done)
vmovdqa $CTR1, $STATE1
vpaddd one(%rip), $CTR1, $CTR1
vpxor ($KS), $STATE1, $STATE1
vaesenc 16($KS), $STATE1, $STATE1
vaesenc 32($KS), $STATE1, $STATE1
vaesenc 48($KS), $STATE1, $STATE1
vaesenc 64($KS), $STATE1, $STATE1
vaesenc 80($KS), $STATE1, $STATE1
vaesenc 96($KS), $STATE1, $STATE1
vaesenc 112($KS), $STATE1, $STATE1
vaesenc 128($KS), $STATE1, $STATE1
vaesenc 144($KS), $STATE1, $STATE1
vaesenc 160($KS), $STATE1, $STATE1
vaesenc 176($KS), $STATE1, $STATE1
vaesenc 192($KS), $STATE1, $STATE1
vaesenc 208($KS), $STATE1, $STATE1
vaesenclast 224($KS), $STATE1, $STATE1
# XOR with Plaintext
vpxor ($PT), $STATE1, $STATE1
vmovdqu $STATE1, ($CT)
addq \$16, $PT
addq \$16, $CT
subq \$1, %r10
jnz .L256_enc_msg_x8_loop2
.L256_enc_msg_x8_out:
ret
.cfi_endproc
.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
___
}
aes256gcmsiv_enc_msg_x8();
aesgcmsiv_dec(1); # emit the AES-256 version
sub aes256gcmsiv_kdf {
my $ONE = "%xmm8";
my $BLOCK1 = "%xmm4";
my $BLOCK2 = "%xmm6";
my $BLOCK3 = "%xmm7";
my $BLOCK4 = "%xmm11";
my $BLOCK5 = "%xmm12";
my $BLOCK6 = "%xmm13";
my $enc_roundx6 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenc $j, $BLOCK1, $BLOCK1
vaesenc $j, $BLOCK2, $BLOCK2
vaesenc $j, $BLOCK3, $BLOCK3
vaesenc $j, $BLOCK4, $BLOCK4
vaesenc $j, $BLOCK5, $BLOCK5
vaesenc $j, $BLOCK6, $BLOCK6
___
};
my $enc_roundlastx6 = sub {
my ($i, $j) = @_;
return <<___;
vmovdqa ${\eval($i*16)}(%rdx), $j
vaesenclast $j, $BLOCK1, $BLOCK1
vaesenclast $j, $BLOCK2, $BLOCK2
vaesenclast $j, $BLOCK3, $BLOCK3
vaesenclast $j, $BLOCK4, $BLOCK4
vaesenclast $j, $BLOCK5, $BLOCK5
vaesenclast $j, $BLOCK6, $BLOCK6
___
};
# void aes256gcmsiv_kdf(const uint8_t nonce[16],
# uint8_t *out_key_material,
# const uint8_t *key_schedule);
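# Same construction as aes128gcmsiv_kdf, but with six counter blocks
# (0..5) and the 14-round AES-256 schedule: the caller needs 96 bytes of
# output to assemble the POLYVAL key and the 256-bit message key from
# the prescribed block halves.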
$code.=<<___;
.globl aes256gcmsiv_kdf
.type aes256gcmsiv_kdf,\@function,3
.align 16
aes256gcmsiv_kdf:
.cfi_startproc
# parameter 1: %rdi Pointer to NONCE
# parameter 2: %rsi Pointer to CT
# parameter 3: %rdx Pointer to keys
vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
vmovdqa 0*16(%rdi), $BLOCK1
vmovdqa and_mask(%rip), $BLOCK4
vmovdqa one(%rip), $ONE
vpshufd \$0x90, $BLOCK1, $BLOCK1
vpand $BLOCK4, $BLOCK1, $BLOCK1
vpaddd $ONE, $BLOCK1, $BLOCK2
vpaddd $ONE, $BLOCK2, $BLOCK3
vpaddd $ONE, $BLOCK3, $BLOCK4
vpaddd $ONE, $BLOCK4, $BLOCK5
vpaddd $ONE, $BLOCK5, $BLOCK6
vpxor %xmm1, $BLOCK1, $BLOCK1
vpxor %xmm1, $BLOCK2, $BLOCK2
vpxor %xmm1, $BLOCK3, $BLOCK3
vpxor %xmm1, $BLOCK4, $BLOCK4
vpxor %xmm1, $BLOCK5, $BLOCK5
vpxor %xmm1, $BLOCK6, $BLOCK6
${\$enc_roundx6->(1, "%xmm1")}
${\$enc_roundx6->(2, "%xmm2")}
${\$enc_roundx6->(3, "%xmm1")}
${\$enc_roundx6->(4, "%xmm2")}
${\$enc_roundx6->(5, "%xmm1")}
${\$enc_roundx6->(6, "%xmm2")}
${\$enc_roundx6->(7, "%xmm1")}
${\$enc_roundx6->(8, "%xmm2")}
${\$enc_roundx6->(9, "%xmm1")}
${\$enc_roundx6->(10, "%xmm2")}
${\$enc_roundx6->(11, "%xmm1")}
${\$enc_roundx6->(12, "%xmm2")}
${\$enc_roundx6->(13, "%xmm1")}
${\$enc_roundlastx6->(14, "%xmm2")}
vmovdqa $BLOCK1, 0*16(%rsi)
vmovdqa $BLOCK2, 1*16(%rsi)
vmovdqa $BLOCK3, 2*16(%rsi)
vmovdqa $BLOCK4, 3*16(%rsi)
vmovdqa $BLOCK5, 4*16(%rsi)
vmovdqa $BLOCK6, 5*16(%rsi)
ret
.cfi_endproc
.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
___
}
aes256gcmsiv_kdf();
print $code;
close STDOUT;