c5e9ac1cac
In order to use AES-GCM-SIV in the open-source QUIC boxer, it needs to be
moved out from OPENSSL_SMALL. (Hopefully the linker can still discard it in
the vast majority of cases.) Additionally, the input to the key schedule
function comes from outside and may not be aligned, thus we need to use
unaligned instructions to read it.

Change-Id: I02c261fe0663d13a96c428174943c7e5ac8415a7
Reviewed-on: https://boringssl-review.googlesource.com/16824
Commit-Queue: David Benjamin <davidben@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
CQ-Verified: CQ bot account: commit-bot@chromium.org <commit-bot@chromium.org>
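As a small illustration of the alignment point (an editorial example, not part of the commit): a routine that reads a caller-supplied key has to use the unaligned load form, e.g. vmovdqu (%rdi), %xmm1, because the aligned vmovdqa (%rdi), %xmm1 can fault if the caller's buffer is not 16-byte aligned. This is the form aes128gcmsiv_aes_ks below uses to read the user key.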
#!/usr/bin/env perl

# Copyright (c) 2017, Shay Gueron.
# Copyright (c) 2017, Google Inc.
#
# Permission to use, copy, modify, and/or distribute this software for any
# purpose with or without fee is hereby granted, provided that the above
# copyright notice and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */

use warnings FATAL => 'all';

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;

$code.=<<___;
.data

.align 16
one:
.quad 1,0
two:
.quad 2,0
three:
.quad 3,0
four:
.quad 4,0
five:
.quad 5,0
six:
.quad 6,0
seven:
.quad 7,0
eight:
.quad 8,0

OR_MASK:
.long 0x00000000,0x00000000,0x00000000,0x80000000
poly:
.quad 0x1, 0xc200000000000000
mask:
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
con1:
.long 1,1,1,1
con2:
.long 0x1b,0x1b,0x1b,0x1b
con3:
.byte -1,-1,-1,-1,-1,-1,-1,-1,4,5,6,7,4,5,6,7
and_mask:
.long 0,0xffffffff, 0xffffffff, 0xffffffff
___

$code.=<<___;
.text
___

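# A note on the constants above (an editorial reading of how they are used
# later in this file): one..eight are the 128-bit increments added to the
# counter blocks in the CTR loops; OR_MASK forces the most-significant bit of
# the tag to one when the tag is turned into the initial counter block, as
# AES-GCM-SIV requires; poly holds the 0xc2...-style constant used to reduce
# carry-less products modulo the POLYVAL polynomial; mask, con1, con2 and con3
# feed the AES key-schedule helpers (byte shuffle and round constants); and
# and_mask clears the 32-bit counter lane when the KDF builds its
# counter||nonce input blocks.
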
sub gfmul {
#########################
# a = T
# b = TMP0 - remains unchanged
# res = T
# uses also TMP1,TMP2,TMP3,TMP4
# __m128i GFMUL(__m128i A, __m128i B);

my $T = "%xmm0";
my $TMP0 = "%xmm1";
my $TMP1 = "%xmm2";
my $TMP2 = "%xmm3";
my $TMP3 = "%xmm4";
my $TMP4 = "%xmm5";

$code.=<<___;
.type GFMUL,\@abi-omnipotent
.align 16
GFMUL:
.cfi_startproc
vpclmulqdq \$0x00, $TMP0, $T, $TMP1
vpclmulqdq \$0x11, $TMP0, $T, $TMP4
vpclmulqdq \$0x10, $TMP0, $T, $TMP2
vpclmulqdq \$0x01, $TMP0, $T, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpslldq \$8, $TMP2, $TMP3
vpsrldq \$8, $TMP2, $TMP2
vpxor $TMP3, $TMP1, $TMP1
vpxor $TMP2, $TMP4, $TMP4

vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
vpshufd \$78, $TMP1, $TMP3
vpxor $TMP3, $TMP2, $TMP1

vpclmulqdq \$0x10, poly(%rip), $TMP1, $TMP2
vpshufd \$78, $TMP1, $TMP3
vpxor $TMP3, $TMP2, $TMP1

vpxor $TMP4, $TMP1, $T
ret
.cfi_endproc
.size GFMUL, .-GFMUL
___
}
gfmul();

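# The GFMUL routine above is the POLYVAL block multiplication: four vpclmulqdq
# instructions form the full 256-bit carry-less product, the middle terms are
# folded into the low and high halves, and two further vpclmulqdq/vpshufd
# steps against the poly constant reduce the result modulo the POLYVAL
# polynomial x^128 + x^127 + x^126 + x^121 + 1. The multiplicand in $TMP0 is
# left untouched, which is what the H-table builders below rely on when they
# call GFMUL repeatedly with the same H.
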
sub aesgcmsiv_htable_init {
# aesgcmsiv_htable_init writes an eight-entry table of powers of |H| to
# |out_htable|.
# void aesgcmsiv_htable_init(uint8_t out_htable[16*8], uint8_t *H);

my $Htbl = "%rdi";
my $H = "%rsi";
my $T = "%xmm0";
my $TMP0 = "%xmm1";

$code.=<<___;
.globl aesgcmsiv_htable_init
.type aesgcmsiv_htable_init,\@function,2
.align 16
aesgcmsiv_htable_init:
.cfi_startproc
vmovdqa ($H), $T
vmovdqa $T, $TMP0
vmovdqa $T, ($Htbl) # H
call GFMUL
vmovdqa $T, 16($Htbl) # H^2
call GFMUL
vmovdqa $T, 32($Htbl) # H^3
call GFMUL
vmovdqa $T, 48($Htbl) # H^4
call GFMUL
vmovdqa $T, 64($Htbl) # H^5
call GFMUL
vmovdqa $T, 80($Htbl) # H^6
call GFMUL
vmovdqa $T, 96($Htbl) # H^7
call GFMUL
vmovdqa $T, 112($Htbl) # H^8
ret
.cfi_endproc
.size aesgcmsiv_htable_init, .-aesgcmsiv_htable_init
___
}
aesgcmsiv_htable_init();

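# The table written above is laid out as Htbl[16*i] = H^(i+1) for i = 0..7.
# Having eight powers of H lets aesgcmsiv_htable_polyval below multiply eight
# message blocks by decreasing powers of H and perform only one field
# reduction per eight blocks (the aggregated, "schoolbook" method), rather
# than reducing after every block as the Horner form does.
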
sub aesgcmsiv_htable6_init {
# aesgcmsiv_htable6_init writes a six-entry table of powers of |H| to
# |out_htable|.
# void aesgcmsiv_htable6_init(uint8_t out_htable[16*6], uint8_t *H);
#
my $Htbl = "%rdi";
my $H = "%rsi";
my $T = "%xmm0";
my $TMP0 = "%xmm1";

$code.=<<___;
.globl aesgcmsiv_htable6_init
.type aesgcmsiv_htable6_init,\@function,2
.align 16
aesgcmsiv_htable6_init:
.cfi_startproc
vmovdqa ($H), $T
vmovdqa $T, $TMP0
vmovdqa $T, ($Htbl) # H
call GFMUL
vmovdqa $T, 16($Htbl) # H^2
call GFMUL
vmovdqa $T, 32($Htbl) # H^3
call GFMUL
vmovdqa $T, 48($Htbl) # H^4
call GFMUL
vmovdqa $T, 64($Htbl) # H^5
call GFMUL
vmovdqa $T, 80($Htbl) # H^6
ret
.cfi_endproc
.size aesgcmsiv_htable6_init, .-aesgcmsiv_htable6_init
___
}
aesgcmsiv_htable6_init();

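# The six-entry table mirrors the eight-entry one for the decryption path:
# aesgcmsiv_dec further below interleaves counter-mode decryption with
# POLYVAL over six blocks per iteration, so it only needs H^1..H^6.
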
sub aesgcmsiv_htable_polyval {
# void aesgcmsiv_htable_polyval(uint8_t Htbl[16*8], uint8_t *MSG, uint64_t LEN, uint8_t *T);
# parameter 1: %rdi Htable - pointer to Htable
# parameter 2: %rsi INp - pointer to input
# parameter 3: %rdx LEN - length of BUFFER in bytes
# parameter 4: %rcx T - pointer to POLYVAL output

my $DATA = "%xmm0";
my $hlp0 = "%r11";
my $Htbl = "%rdi";
my $inp = "%rsi";
my $len = "%rdx";
my $TMP0 = "%xmm3";
my $TMP1 = "%xmm4";
my $TMP2 = "%xmm5";
my $TMP3 = "%xmm6";
my $TMP4 = "%xmm7";
my $Tp = "%rcx";
my $T = "%xmm1";
my $Xhi = "%xmm9";

my $SCHOOLBOOK_AAD = sub {
my ($i)=@_;
return <<___;
vpclmulqdq \$0x01, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x00, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x10, ${\eval(16*$i)}($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
___
};

$code.=<<___;
.globl aesgcmsiv_htable_polyval
.type aesgcmsiv_htable_polyval,\@function,4
.align 16
aesgcmsiv_htable_polyval:
.cfi_startproc
test $len, $len
jnz .Lhtable_polyval_start
ret

.Lhtable_polyval_start:
vzeroall

# We hash 8 blocks each iteration. If the total number of blocks is not a
# multiple of 8, we first hash the leading n%8 blocks.
movq $len, $hlp0
andq \$127, $hlp0

jz .Lhtable_polyval_no_prefix

vpxor $Xhi, $Xhi, $Xhi
vmovdqa ($Tp), $T
sub $hlp0, $len

sub \$16, $hlp0

# hash first prefix block
vmovdqu ($inp), $DATA
vpxor $T, $DATA, $DATA

vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP2
vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP0
vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP1
vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2

lea 16($inp), $inp
test $hlp0, $hlp0
jnz .Lhtable_polyval_prefix_loop
jmp .Lhtable_polyval_prefix_complete

# hash remaining prefix blocks (up to 7 total prefix blocks)
.align 64
.Lhtable_polyval_prefix_loop:
sub \$16, $hlp0

vmovdqu ($inp), $DATA # next data block

vpclmulqdq \$0x00, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP0, $TMP0
vpclmulqdq \$0x11, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP1, $TMP1
vpclmulqdq \$0x01, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2
vpclmulqdq \$0x10, ($Htbl,$hlp0), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2

test $hlp0, $hlp0

lea 16($inp), $inp

jnz .Lhtable_polyval_prefix_loop

.Lhtable_polyval_prefix_complete:
vpsrldq \$8, $TMP2, $TMP3
vpslldq \$8, $TMP2, $TMP2

vpxor $TMP3, $TMP1, $Xhi
vpxor $TMP2, $TMP0, $T

jmp .Lhtable_polyval_main_loop

.Lhtable_polyval_no_prefix:
# At this point we know the number of blocks is a multiple of 8. However,
# the reduction in the main loop includes a multiplication by x^(-128). In
# order to counter this, the existing tag needs to be multiplied by x^128.
# In practice, this just means that it is loaded into $Xhi, not $T.
vpxor $T, $T, $T
vmovdqa ($Tp), $Xhi

.align 64
.Lhtable_polyval_main_loop:
sub \$0x80, $len
jb .Lhtable_polyval_out

vmovdqu 16*7($inp), $DATA # Ii

vpclmulqdq \$0x01, ($Htbl), $DATA, $TMP2
vpclmulqdq \$0x00, ($Htbl), $DATA, $TMP0
vpclmulqdq \$0x11, ($Htbl), $DATA, $TMP1
vpclmulqdq \$0x10, ($Htbl), $DATA, $TMP3
vpxor $TMP3, $TMP2, $TMP2

#########################################################
vmovdqu 16*6($inp), $DATA
${\$SCHOOLBOOK_AAD->(1)}

#########################################################
vmovdqu 16*5($inp), $DATA

vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 1a
vpalignr \$8, $T, $T, $T

${\$SCHOOLBOOK_AAD->(2)}

vpxor $TMP4, $T, $T # reduction stage 1b
#########################################################
vmovdqu 16*4($inp), $DATA

${\$SCHOOLBOOK_AAD->(3)}
#########################################################
vmovdqu 16*3($inp), $DATA

vpclmulqdq \$0x10, poly(%rip), $T, $TMP4 # reduction stage 2a
vpalignr \$8, $T, $T, $T

${\$SCHOOLBOOK_AAD->(4)}

vpxor $TMP4, $T, $T # reduction stage 2b
#########################################################
vmovdqu 16*2($inp), $DATA

${\$SCHOOLBOOK_AAD->(5)}

vpxor $Xhi, $T, $T # reduction finalize
#########################################################
vmovdqu 16*1($inp), $DATA

${\$SCHOOLBOOK_AAD->(6)}
#########################################################
vmovdqu 16*0($inp), $DATA
vpxor $T, $DATA, $DATA

${\$SCHOOLBOOK_AAD->(7)}
#########################################################
vpsrldq \$8, $TMP2, $TMP3
vpslldq \$8, $TMP2, $TMP2

vpxor $TMP3, $TMP1, $Xhi
vpxor $TMP2, $TMP0, $T

lea 16*8($inp), $inp
jmp .Lhtable_polyval_main_loop

#########################################################

.Lhtable_polyval_out:
vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
vpalignr \$8, $T, $T, $T
vpxor $TMP3, $T, $T

vpclmulqdq \$0x10, poly(%rip), $T, $TMP3
vpalignr \$8, $T, $T, $T
vpxor $TMP3, $T, $T
vpxor $Xhi, $T, $T

vmovdqu $T, ($Tp)
vzeroupper
ret
.cfi_endproc
.size aesgcmsiv_htable_polyval,.-aesgcmsiv_htable_polyval
___
}
aesgcmsiv_htable_polyval();

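# For reference (a sketch of the math, with "+" meaning xor and "*" the
# POLYVAL field multiplication), the quantity aesgcmsiv_htable_polyval
# accumulates over n blocks is
#   T' = (T + X_1)*H^n + X_2*H^(n-1) + ... + X_n*H
# The prefix loop consumes len mod 8 blocks first so that the unrolled main
# loop always handles exactly eight blocks per iteration, and the two folds at
# .Lhtable_polyval_out plus the final xor with $Xhi complete the reduction of
# the last accumulated product before the result is stored.
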
sub aesgcmsiv_polyval_horner {
#void aesgcmsiv_polyval_horner(unsigned char T[16], // output
# const unsigned char* H, // H
# unsigned char* BUF, // Buffer
# unsigned int blocks); // Len2
#
# parameter 1: %rdi T - pointer to POLYVAL output
# parameter 2: %rsi Hp - pointer to H (user key)
# parameter 3: %rdx INp - pointer to input
# parameter 4: %rcx L - total number of blocks in input BUFFER
#
my $T = "%rdi";
my $Hp = "%rsi";
my $INp = "%rdx";
my $L = "%rcx";
my $LOC = "%r10";
my $LEN = "%eax";
my $H = "%xmm1";
my $RES = "%xmm0";

$code.=<<___;
.globl aesgcmsiv_polyval_horner
.type aesgcmsiv_polyval_horner,\@function,4
.align 16
aesgcmsiv_polyval_horner:
.cfi_startproc
test $L, $L
jnz .Lpolyval_horner_start
ret

.Lpolyval_horner_start:
# We will start with L GFMULS for POLYVAL(BIG_BUFFER)
# RES = GFMUL(RES, H)

xorq $LOC, $LOC
shlq \$4, $L # L contains number of bytes to process

vmovdqa ($Hp), $H
vmovdqa ($T), $RES

.Lpolyval_horner_loop:
vpxor ($INp,$LOC), $RES, $RES # RES = RES + Xi
call GFMUL # RES = RES * H

add \$16, $LOC
cmp $LOC, $L
jne .Lpolyval_horner_loop

# calculation of T is complete. RES=T
vmovdqa $RES, ($T)
ret
.cfi_endproc
.size aesgcmsiv_polyval_horner,.-aesgcmsiv_polyval_horner
___
}
aesgcmsiv_polyval_horner();

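# aesgcmsiv_polyval_horner is the straightforward one-block-at-a-time form of
# the same computation: for each 16-byte block it sets RES = (RES xor X_i) * H
# via GFMUL. It needs no precomputed table, which makes it the convenient
# choice for short inputs where building an H table would not pay off.
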
# void aes128gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
# parameter 1: %rdi
# parameter 2: %rsi
$code.=<<___;
.globl aes128gcmsiv_aes_ks
.type aes128gcmsiv_aes_ks,\@function,2
.align 16
aes128gcmsiv_aes_ks:
.cfi_startproc
vmovdqu (%rdi), %xmm1 # xmm1 = user key
vmovdqa %xmm1, (%rsi) # rsi points to output

vmovdqa con1(%rip), %xmm0
vmovdqa mask(%rip), %xmm15

movq \$8, %rax

.Lks128_loop:
addq \$16, %rsi # rsi points to the next round key
subq \$1, %rax
vpshufb %xmm15, %xmm1, %xmm2 # xmm2 = shuffled user key
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, (%rsi)
jne .Lks128_loop

vmovdqa con2(%rip), %xmm0
vpshufb %xmm15, %xmm1, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslld \$1, %xmm0, %xmm0
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 16(%rsi)

vpshufb %xmm15, %xmm1, %xmm2
vaesenclast %xmm0, %xmm2, %xmm2
vpslldq \$4, %xmm1, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpslldq \$4, %xmm3, %xmm3
vpxor %xmm3, %xmm1, %xmm1
vpxor %xmm2, %xmm1, %xmm1
vmovdqa %xmm1, 32(%rsi)
ret
.cfi_endproc
.size aes128gcmsiv_aes_ks,.-aes128gcmsiv_aes_ks
___

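# An editorial reading of the key-schedule code above (the same trick appears
# in the 256-bit variant that follows): vpshufb with the mask constant rotates
# the last word of the previous round key and broadcasts it to all four
# lanes; because every column is then identical, the ShiftRows step inside
# vaesenclast is a no-op, so vaesenclast with the round constant in %xmm0
# effectively computes SubWord(RotWord(w3)) xor rcon in one instruction.
# %xmm0 starts at con1 (rcon = 1) and is doubled with vpslld each iteration;
# con2 (0x1b) is loaded explicitly for the final rounds, where a plain left
# shift of 0x80 would give 0x100 instead of the reduced round constant. The
# vpslldq/vpxor ladder computes the running xor of the previous round key's
# four words.
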
# void aes256gcmsiv_aes_ks(const uint8_t *key, uint8_t *out_expanded_key);
|
|
# parameter 1: %rdi
|
|
# parameter 2: %rsi
|
|
$code.=<<___;
|
|
.globl aes256gcmsiv_aes_ks
|
|
.type aes256gcmsiv_aes_ks,\@function,2
|
|
.align 16
|
|
aes256gcmsiv_aes_ks:
|
|
.cfi_startproc
|
|
vmovdqu (%rdi), %xmm1
|
|
vmovdqu 16(%rdi), %xmm3
|
|
vmovdqa %xmm1, (%rsi)
|
|
vmovdqa %xmm3, 16(%rsi)
|
|
vmovdqa con1(%rip), %xmm0
|
|
vmovdqa mask(%rip), %xmm15
|
|
vpxor %xmm14, %xmm14, %xmm14
|
|
mov \$6, %rax
|
|
|
|
.Lks256_loop:
|
|
add \$32, %rsi
|
|
subq \$1, %rax
|
|
vpshufb %xmm15, %xmm3, %xmm2
|
|
vaesenclast %xmm0, %xmm2, %xmm2
|
|
vpslld \$1, %xmm0, %xmm0
|
|
vpsllq \$32, %xmm1, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpshufb con3(%rip), %xmm1, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
vmovdqa %xmm1, (%rsi)
|
|
vpshufd \$0xff, %xmm1, %xmm2
|
|
vaesenclast %xmm14, %xmm2, %xmm2
|
|
vpsllq \$32, %xmm3, %xmm4
|
|
vpxor %xmm4, %xmm3, %xmm3
|
|
vpshufb con3(%rip), %xmm3, %xmm4
|
|
vpxor %xmm4, %xmm3, %xmm3
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
vmovdqa %xmm3, 16(%rsi)
|
|
jne .Lks256_loop
|
|
|
|
vpshufb %xmm15, %xmm3, %xmm2
|
|
vaesenclast %xmm0, %xmm2, %xmm2
|
|
vpsllq \$32, %xmm1, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpshufb con3(%rip), %xmm1, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
vmovdqa %xmm1, 32(%rsi)
|
|
ret
|
|
.cfi_endproc
|
|
___
|
|
|
|
sub aes128gcmsiv_aes_ks_enc_x1 {
|
|
my $KS1_REGA = "%xmm1";
|
|
my $KS1_REGB = "%xmm2";
|
|
my $BLOCK1 = "%xmm4";
|
|
my $AUXREG = "%xmm3";
|
|
|
|
my $KS_BLOCK = sub {
|
|
my ($reg, $reg2, $auxReg) = @_;
|
|
return <<___;
|
|
vpsllq \$32, $reg, $auxReg #!!saving mov instruction to xmm3
|
|
vpxor $auxReg, $reg, $reg
|
|
vpshufb con3(%rip), $reg, $auxReg
|
|
vpxor $auxReg, $reg, $reg
|
|
vpxor $reg2, $reg, $reg
|
|
___
|
|
};
|
|
|
|
my $round = sub {
|
|
my ($i, $j) = @_;
|
|
return <<___;
|
|
vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
|
|
vaesenclast %xmm0, %xmm2, %xmm2
|
|
vpslld \$1, %xmm0, %xmm0
|
|
${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
|
|
vaesenc %xmm1, $BLOCK1, $BLOCK1
|
|
vmovdqa %xmm1, ${\eval(16*$i)}($j)
|
|
___
|
|
};
|
|
|
|
my $roundlast = sub {
|
|
my ($i, $j) = @_;
|
|
return <<___;
|
|
vpshufb %xmm15, %xmm1, %xmm2 #!!saving mov instruction to xmm2
|
|
vaesenclast %xmm0, %xmm2, %xmm2
|
|
${\$KS_BLOCK->($KS1_REGA, $KS1_REGB, $AUXREG)}
|
|
vaesenclast %xmm1, $BLOCK1, $BLOCK1
|
|
vmovdqa %xmm1, ${\eval(16*$i)}($j)
|
|
___
|
|
};
|
|
|
|
# parameter 1: %rdi Pointer to PT
|
|
# parameter 2: %rsi Pointer to CT
|
|
# parameter 4: %rdx Pointer to keys
|
|
# parameter 5: %rcx Pointer to initial key
|
|
$code.=<<___;
|
|
.globl aes128gcmsiv_aes_ks_enc_x1
|
|
.type aes128gcmsiv_aes_ks_enc_x1,\@function,4
|
|
.align 16
|
|
aes128gcmsiv_aes_ks_enc_x1:
|
|
.cfi_startproc
|
|
vmovdqa (%rcx), %xmm1 # xmm1 = first 16 bytes of random key
|
|
vmovdqa 0*16(%rdi), $BLOCK1
|
|
|
|
vmovdqa %xmm1, (%rdx) # KEY[0] = first 16 bytes of random key
|
|
vpxor %xmm1, $BLOCK1, $BLOCK1
|
|
|
|
vmovdqa con1(%rip), %xmm0 # xmm0 = 1,1,1,1
|
|
vmovdqa mask(%rip), %xmm15 # xmm15 = mask
|
|
|
|
${\$round->(1, "%rdx")}
|
|
${\$round->(2, "%rdx")}
|
|
${\$round->(3, "%rdx")}
|
|
${\$round->(4, "%rdx")}
|
|
${\$round->(5, "%rdx")}
|
|
${\$round->(6, "%rdx")}
|
|
${\$round->(7, "%rdx")}
|
|
${\$round->(8, "%rdx")}
|
|
|
|
vmovdqa con2(%rip), %xmm0
|
|
|
|
${\$round->(9, "%rdx")}
|
|
${\$roundlast->(10, "%rdx")}
|
|
|
|
vmovdqa $BLOCK1, 0*16(%rsi)
|
|
ret
|
|
.cfi_endproc
|
|
.size aes128gcmsiv_aes_ks_enc_x1,.-aes128gcmsiv_aes_ks_enc_x1
|
|
___
|
|
}
|
|
aes128gcmsiv_aes_ks_enc_x1();
|
|
|
|
sub aes128gcmsiv_kdf {
|
|
my $BLOCK1 = "%xmm9";
|
|
my $BLOCK2 = "%xmm10";
|
|
my $BLOCK3 = "%xmm11";
|
|
my $BLOCK4 = "%xmm12";
|
|
my $BLOCK5 = "%xmm13";
|
|
my $BLOCK6 = "%xmm14";
|
|
my $ONE = "%xmm13";
|
|
my $KSp = "%rdx";
|
|
my $STATE_1 = "%xmm1";
|
|
|
|
my $enc_roundx4 = sub {
|
|
my ($i, $j) = @_;
|
|
return <<___;
|
|
vmovdqa ${\eval($i*16)}(%rdx), $j
|
|
vaesenc $j, $BLOCK1, $BLOCK1
|
|
vaesenc $j, $BLOCK2, $BLOCK2
|
|
vaesenc $j, $BLOCK3, $BLOCK3
|
|
vaesenc $j, $BLOCK4, $BLOCK4
|
|
___
|
|
};
|
|
|
|
my $enc_roundlastx4 = sub {
|
|
my ($i, $j) = @_;
|
|
return <<___;
|
|
vmovdqa ${\eval($i*16)}(%rdx), $j
|
|
vaesenclast $j, $BLOCK1, $BLOCK1
|
|
vaesenclast $j, $BLOCK2, $BLOCK2
|
|
vaesenclast $j, $BLOCK3, $BLOCK3
|
|
vaesenclast $j, $BLOCK4, $BLOCK4
|
|
___
|
|
};
|
|
|
|
# void aes128gcmsiv_kdf(const uint8_t nonce[16],
|
|
# uint8_t *out_key_material,
|
|
# const uint8_t *key_schedule);
|
|
$code.=<<___;
|
|
.globl aes128gcmsiv_kdf
|
|
.type aes128gcmsiv_kdf,\@function,3
|
|
.align 16
|
|
aes128gcmsiv_kdf:
|
|
.cfi_startproc
|
|
# parameter 1: %rdi Pointer to NONCE
|
|
# parameter 2: %rsi Pointer to CT
|
|
# parameter 4: %rdx Pointer to keys
|
|
|
|
vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
|
|
vmovdqa 0*16(%rdi), $BLOCK1
|
|
vmovdqa and_mask(%rip), $BLOCK4
|
|
vmovdqa one(%rip), $ONE
|
|
vpshufd \$0x90, $BLOCK1, $BLOCK1
|
|
vpand $BLOCK4, $BLOCK1, $BLOCK1
|
|
vpaddd $ONE, $BLOCK1, $BLOCK2
|
|
vpaddd $ONE, $BLOCK2, $BLOCK3
|
|
vpaddd $ONE, $BLOCK3, $BLOCK4
|
|
|
|
vpxor %xmm1, $BLOCK1, $BLOCK1
|
|
vpxor %xmm1, $BLOCK2, $BLOCK2
|
|
vpxor %xmm1, $BLOCK3, $BLOCK3
|
|
vpxor %xmm1, $BLOCK4, $BLOCK4
|
|
|
|
${\$enc_roundx4->(1, "%xmm1")}
|
|
${\$enc_roundx4->(2, "%xmm2")}
|
|
${\$enc_roundx4->(3, "%xmm1")}
|
|
${\$enc_roundx4->(4, "%xmm2")}
|
|
${\$enc_roundx4->(5, "%xmm1")}
|
|
${\$enc_roundx4->(6, "%xmm2")}
|
|
${\$enc_roundx4->(7, "%xmm1")}
|
|
${\$enc_roundx4->(8, "%xmm2")}
|
|
${\$enc_roundx4->(9, "%xmm1")}
|
|
${\$enc_roundlastx4->(10, "%xmm2")}
|
|
|
|
vmovdqa $BLOCK1, 0*16(%rsi)
|
|
vmovdqa $BLOCK2, 1*16(%rsi)
|
|
vmovdqa $BLOCK3, 2*16(%rsi)
|
|
vmovdqa $BLOCK4, 3*16(%rsi)
|
|
ret
|
|
.cfi_endproc
|
|
.size aes128gcmsiv_kdf,.-aes128gcmsiv_kdf
|
|
___
|
|
}
|
|
aes128gcmsiv_kdf();
|
|
|
|
sub aes128gcmsiv_enc_msg_x4 {
|
|
my $CTR1 = "%xmm0";
|
|
my $CTR2 = "%xmm1";
|
|
my $CTR3 = "%xmm2";
|
|
my $CTR4 = "%xmm3";
|
|
my $ADDER = "%xmm4";
|
|
|
|
my $STATE1 = "%xmm5";
|
|
my $STATE2 = "%xmm6";
|
|
my $STATE3 = "%xmm7";
|
|
my $STATE4 = "%xmm8";
|
|
|
|
my $TMP = "%xmm12";
|
|
my $TMP2 = "%xmm13";
|
|
my $TMP3 = "%xmm14";
|
|
my $IV = "%xmm15";
|
|
|
|
my $PT = "%rdi";
|
|
my $CT = "%rsi";
|
|
my $TAG = "%rdx";
|
|
my $KS = "%rcx";
|
|
my $LEN = "%r8";
|
|
|
|
my $aes_round = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $TMP
|
|
vaesenc $TMP, $STATE1, $STATE1
|
|
vaesenc $TMP, $STATE2, $STATE2
|
|
vaesenc $TMP, $STATE3, $STATE3
|
|
vaesenc $TMP, $STATE4, $STATE4
|
|
___
|
|
};
|
|
|
|
my $aes_lastround = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $TMP
|
|
vaesenclast $TMP, $STATE1, $STATE1
|
|
vaesenclast $TMP, $STATE2, $STATE2
|
|
vaesenclast $TMP, $STATE3, $STATE3
|
|
vaesenclast $TMP, $STATE4, $STATE4
|
|
___
|
|
};
|
|
|
|
# void aes128gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
|
|
# unsigned char* TAG, unsigned char* KS,
|
|
# size_t byte_len);
|
|
# parameter 1: %rdi #PT
|
|
# parameter 2: %rsi #CT
|
|
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
|
|
# parameter 4: %rcx #KS
|
|
# parameter 5: %r8 #LEN MSG_length in bytes
|
|
$code.=<<___;
|
|
.globl aes128gcmsiv_enc_msg_x4
|
|
.type aes128gcmsiv_enc_msg_x4,\@function,5
|
|
.align 16
|
|
aes128gcmsiv_enc_msg_x4:
|
|
.cfi_startproc
|
|
test $LEN, $LEN
|
|
jnz .L128_enc_msg_x4_start
|
|
ret
|
|
|
|
.L128_enc_msg_x4_start:
|
|
pushq %r12
|
|
.cfi_push %r12
|
|
pushq %r13
|
|
.cfi_push %r13
|
|
|
|
shrq \$4, $LEN # LEN = num of blocks
|
|
movq $LEN, %r10
|
|
shlq \$62, %r10
|
|
shrq \$62, %r10
|
|
|
|
# make IV from TAG
|
|
vmovdqa ($TAG), $IV
|
|
vpor OR_MASK(%rip), $IV, $IV #IV = [1]TAG[126...32][00..00]
|
|
|
|
vmovdqu four(%rip), $ADDER # Register to increment counters
|
|
vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
|
|
vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
|
|
vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
|
|
vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
|
|
|
|
shrq \$2, $LEN
|
|
je .L128_enc_msg_x4_check_remainder
|
|
|
|
subq \$64, $CT
|
|
subq \$64, $PT
|
|
|
|
.L128_enc_msg_x4_loop1:
|
|
addq \$64, $CT
|
|
addq \$64, $PT
|
|
|
|
vmovdqa $CTR1, $STATE1
|
|
vmovdqa $CTR2, $STATE2
|
|
vmovdqa $CTR3, $STATE3
|
|
vmovdqa $CTR4, $STATE4
|
|
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vpxor ($KS), $STATE2, $STATE2
|
|
vpxor ($KS), $STATE3, $STATE3
|
|
vpxor ($KS), $STATE4, $STATE4
|
|
|
|
${\$aes_round->(1)}
|
|
vpaddd $ADDER, $CTR1, $CTR1
|
|
${\$aes_round->(2)}
|
|
vpaddd $ADDER, $CTR2, $CTR2
|
|
${\$aes_round->(3)}
|
|
vpaddd $ADDER, $CTR3, $CTR3
|
|
${\$aes_round->(4)}
|
|
vpaddd $ADDER, $CTR4, $CTR4
|
|
|
|
${\$aes_round->(5)}
|
|
${\$aes_round->(6)}
|
|
${\$aes_round->(7)}
|
|
${\$aes_round->(8)}
|
|
${\$aes_round->(9)}
|
|
${\$aes_lastround->(10)}
|
|
|
|
# XOR with Plaintext
|
|
vpxor 0*16($PT), $STATE1, $STATE1
|
|
vpxor 1*16($PT), $STATE2, $STATE2
|
|
vpxor 2*16($PT), $STATE3, $STATE3
|
|
vpxor 3*16($PT), $STATE4, $STATE4
|
|
|
|
subq \$1, $LEN
|
|
|
|
vmovdqu $STATE1, 0*16($CT)
|
|
vmovdqu $STATE2, 1*16($CT)
|
|
vmovdqu $STATE3, 2*16($CT)
|
|
vmovdqu $STATE4, 3*16($CT)
|
|
|
|
jne .L128_enc_msg_x4_loop1
|
|
|
|
addq \$64,$CT
|
|
addq \$64,$PT
|
|
|
|
.L128_enc_msg_x4_check_remainder:
|
|
cmpq \$0, %r10
|
|
je .L128_enc_msg_x4_out
|
|
|
|
.L128_enc_msg_x4_loop2:
|
|
# enc each block separately
|
|
# CTR1 is the highest counter (even if no LOOP done)
|
|
vmovdqa $CTR1, $STATE1
|
|
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
|
|
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vaesenc 16($KS), $STATE1, $STATE1
|
|
vaesenc 32($KS), $STATE1, $STATE1
|
|
vaesenc 48($KS), $STATE1, $STATE1
|
|
vaesenc 64($KS), $STATE1, $STATE1
|
|
vaesenc 80($KS), $STATE1, $STATE1
|
|
vaesenc 96($KS), $STATE1, $STATE1
|
|
vaesenc 112($KS), $STATE1, $STATE1
|
|
vaesenc 128($KS), $STATE1, $STATE1
|
|
vaesenc 144($KS), $STATE1, $STATE1
|
|
vaesenclast 160($KS), $STATE1, $STATE1
|
|
|
|
# XOR with plaintext
|
|
vpxor ($PT), $STATE1, $STATE1
|
|
vmovdqu $STATE1, ($CT)
|
|
|
|
addq \$16, $PT
|
|
addq \$16, $CT
|
|
|
|
subq \$1, %r10
|
|
jne .L128_enc_msg_x4_loop2
|
|
|
|
.L128_enc_msg_x4_out:
|
|
popq %r13
|
|
.cfi_pop %r13
|
|
popq %r12
|
|
.cfi_pop %r12
|
|
ret
|
|
.cfi_endproc
|
|
.size aes128gcmsiv_enc_msg_x4,.-aes128gcmsiv_enc_msg_x4
|
|
___
|
|
}
|
|
aes128gcmsiv_enc_msg_x4();
|
|
|
|
sub aes128gcmsiv_enc_msg_x8 {
|
|
my $STATE1 = "%xmm1";
|
|
my $STATE2 = "%xmm2";
|
|
my $STATE3 = "%xmm3";
|
|
my $STATE4 = "%xmm4";
|
|
my $STATE5 = "%xmm5";
|
|
my $STATE6 = "%xmm6";
|
|
my $STATE7 = "%xmm7";
|
|
my $STATE8 = "%xmm8";
|
|
|
|
my $CTR1 = "%xmm0";
|
|
my $CTR2 = "%xmm9";
|
|
my $CTR3 = "%xmm10";
|
|
my $CTR4 = "%xmm11";
|
|
my $CTR5 = "%xmm12";
|
|
my $CTR6 = "%xmm13";
|
|
my $CTR7 = "%xmm14";
|
|
my $SCHED = "%xmm15";
|
|
|
|
my $TMP1 = "%xmm1";
|
|
my $TMP2 = "%xmm2";
|
|
|
|
my $PT = "%rdi";
|
|
my $CT = "%rsi";
|
|
my $TAG = "%rdx";
|
|
my $KS = "%rcx";
|
|
my $LEN = "%r8";
|
|
|
|
my $aes_round8 = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $SCHED
|
|
vaesenc $SCHED, $STATE1, $STATE1
|
|
vaesenc $SCHED, $STATE2, $STATE2
|
|
vaesenc $SCHED, $STATE3, $STATE3
|
|
vaesenc $SCHED, $STATE4, $STATE4
|
|
vaesenc $SCHED, $STATE5, $STATE5
|
|
vaesenc $SCHED, $STATE6, $STATE6
|
|
vaesenc $SCHED, $STATE7, $STATE7
|
|
vaesenc $SCHED, $STATE8, $STATE8
|
|
___
|
|
};
|
|
|
|
my $aes_lastround8 = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $SCHED
|
|
vaesenclast $SCHED, $STATE1, $STATE1
|
|
vaesenclast $SCHED, $STATE2, $STATE2
|
|
vaesenclast $SCHED, $STATE3, $STATE3
|
|
vaesenclast $SCHED, $STATE4, $STATE4
|
|
vaesenclast $SCHED, $STATE5, $STATE5
|
|
vaesenclast $SCHED, $STATE6, $STATE6
|
|
vaesenclast $SCHED, $STATE7, $STATE7
|
|
vaesenclast $SCHED, $STATE8, $STATE8
|
|
___
|
|
};
|
|
|
|
# void ENC_MSG_x8(unsigned char* PT,
|
|
# unsigned char* CT,
|
|
# unsigned char* TAG,
|
|
# unsigned char* KS,
|
|
# size_t byte_len);
|
|
# parameter 1: %rdi #PT
|
|
# parameter 2: %rsi #CT
|
|
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
|
|
# parameter 4: %rcx #KS
|
|
# parameter 5: %r8 #LEN MSG_length in bytes
|
|
$code.=<<___;
|
|
.globl aes128gcmsiv_enc_msg_x8
|
|
.type aes128gcmsiv_enc_msg_x8,\@function,5
|
|
.align 16
|
|
aes128gcmsiv_enc_msg_x8:
|
|
.cfi_startproc
|
|
test $LEN, $LEN
|
|
jnz .L128_enc_msg_x8_start
|
|
ret
|
|
|
|
.L128_enc_msg_x8_start:
|
|
pushq %r12
|
|
.cfi_push %r12
|
|
pushq %r13
|
|
.cfi_push %r13
|
|
pushq %rbp
|
|
.cfi_push %rbp
|
|
movq %rsp, %rbp
|
|
.cfi_def_cfa_register rbp
|
|
|
|
# Place in stack
|
|
subq \$128, %rsp
|
|
andq \$-64, %rsp
|
|
|
|
shrq \$4, $LEN # LEN = num of blocks
|
|
movq $LEN, %r10
|
|
shlq \$61, %r10
|
|
shrq \$61, %r10
|
|
|
|
# make IV from TAG
|
|
vmovdqu ($TAG), $TMP1
|
|
vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
|
|
|
|
# store counter8 in the stack
|
|
vpaddd seven(%rip), $TMP1, $CTR1
|
|
vmovdqu $CTR1, (%rsp) # CTR8 = TAG[127...32][00..07]
|
|
vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
|
|
vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
|
|
vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
|
|
vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
|
|
vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
|
|
vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
|
|
vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
|
|
|
|
shrq \$3, $LEN
|
|
je .L128_enc_msg_x8_check_remainder
|
|
|
|
subq \$128, $CT
|
|
subq \$128, $PT
|
|
|
|
.L128_enc_msg_x8_loop1:
|
|
addq \$128, $CT
|
|
addq \$128, $PT
|
|
|
|
vmovdqa $CTR1, $STATE1
|
|
vmovdqa $CTR2, $STATE2
|
|
vmovdqa $CTR3, $STATE3
|
|
vmovdqa $CTR4, $STATE4
|
|
vmovdqa $CTR5, $STATE5
|
|
vmovdqa $CTR6, $STATE6
|
|
vmovdqa $CTR7, $STATE7
|
|
# move from stack
|
|
vmovdqu (%rsp), $STATE8
|
|
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vpxor ($KS), $STATE2, $STATE2
|
|
vpxor ($KS), $STATE3, $STATE3
|
|
vpxor ($KS), $STATE4, $STATE4
|
|
vpxor ($KS), $STATE5, $STATE5
|
|
vpxor ($KS), $STATE6, $STATE6
|
|
vpxor ($KS), $STATE7, $STATE7
|
|
vpxor ($KS), $STATE8, $STATE8
|
|
|
|
${\$aes_round8->(1)}
|
|
vmovdqu (%rsp), $CTR7 # deal with CTR8
|
|
vpaddd eight(%rip), $CTR7, $CTR7
|
|
vmovdqu $CTR7, (%rsp)
|
|
${\$aes_round8->(2)}
|
|
vpsubd one(%rip), $CTR7, $CTR7
|
|
${\$aes_round8->(3)}
|
|
vpaddd eight(%rip), $CTR1, $CTR1
|
|
${\$aes_round8->(4)}
|
|
vpaddd eight(%rip), $CTR2, $CTR2
|
|
${\$aes_round8->(5)}
|
|
vpaddd eight(%rip), $CTR3, $CTR3
|
|
${\$aes_round8->(6)}
|
|
vpaddd eight(%rip), $CTR4, $CTR4
|
|
${\$aes_round8->(7)}
|
|
vpaddd eight(%rip), $CTR5, $CTR5
|
|
${\$aes_round8->(8)}
|
|
vpaddd eight(%rip), $CTR6, $CTR6
|
|
${\$aes_round8->(9)}
|
|
${\$aes_lastround8->(10)}
|
|
|
|
# XOR with Plaintext
|
|
vpxor 0*16($PT), $STATE1, $STATE1
|
|
vpxor 1*16($PT), $STATE2, $STATE2
|
|
vpxor 2*16($PT), $STATE3, $STATE3
|
|
vpxor 3*16($PT), $STATE4, $STATE4
|
|
vpxor 4*16($PT), $STATE5, $STATE5
|
|
vpxor 5*16($PT), $STATE6, $STATE6
|
|
vpxor 6*16($PT), $STATE7, $STATE7
|
|
vpxor 7*16($PT), $STATE8, $STATE8
|
|
|
|
dec $LEN
|
|
|
|
vmovdqu $STATE1, 0*16($CT)
|
|
vmovdqu $STATE2, 1*16($CT)
|
|
vmovdqu $STATE3, 2*16($CT)
|
|
vmovdqu $STATE4, 3*16($CT)
|
|
vmovdqu $STATE5, 4*16($CT)
|
|
vmovdqu $STATE6, 5*16($CT)
|
|
vmovdqu $STATE7, 6*16($CT)
|
|
vmovdqu $STATE8, 7*16($CT)
|
|
|
|
jne .L128_enc_msg_x8_loop1
|
|
|
|
addq \$128, $CT
|
|
addq \$128, $PT
|
|
|
|
.L128_enc_msg_x8_check_remainder:
|
|
cmpq \$0, %r10
|
|
je .L128_enc_msg_x8_out
|
|
|
|
.L128_enc_msg_x8_loop2:
|
|
# enc each block separately
|
|
# CTR1 is the highest counter (even if no LOOP done)
|
|
vmovdqa $CTR1, $STATE1
|
|
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
|
|
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vaesenc 16($KS), $STATE1, $STATE1
|
|
vaesenc 32($KS), $STATE1, $STATE1
|
|
vaesenc 48($KS), $STATE1, $STATE1
|
|
vaesenc 64($KS), $STATE1, $STATE1
|
|
vaesenc 80($KS), $STATE1, $STATE1
|
|
vaesenc 96($KS), $STATE1, $STATE1
|
|
vaesenc 112($KS), $STATE1, $STATE1
|
|
vaesenc 128($KS), $STATE1, $STATE1
|
|
vaesenc 144($KS), $STATE1, $STATE1
|
|
vaesenclast 160($KS), $STATE1, $STATE1
|
|
|
|
# XOR with Plaintext
|
|
vpxor ($PT), $STATE1, $STATE1
|
|
|
|
vmovdqu $STATE1, ($CT)
|
|
|
|
addq \$16, $PT
|
|
addq \$16, $CT
|
|
|
|
decq %r10
|
|
jne .L128_enc_msg_x8_loop2
|
|
|
|
.L128_enc_msg_x8_out:
|
|
movq %rbp, %rsp
|
|
.cfi_def_cfa_register %rsp
|
|
popq %rbp
|
|
.cfi_pop %rbp
|
|
popq %r13
|
|
.cfi_pop %r13
|
|
popq %r12
|
|
.cfi_pop %r12
|
|
ret
|
|
.cfi_endproc
|
|
.size aes128gcmsiv_enc_msg_x8,.-aes128gcmsiv_enc_msg_x8
|
|
___
|
|
}
|
|
aes128gcmsiv_enc_msg_x8();
|
|
|
|
sub aesgcmsiv_dec {
|
|
my ($aes256) = @_;
|
|
|
|
my $T = "%xmm0";
|
|
my $TMP0 = "%xmm1";
|
|
my $TMP1 = "%xmm2";
|
|
my $TMP2 = "%xmm3";
|
|
my $TMP3 = "%xmm4";
|
|
my $TMP4 = "%xmm5";
|
|
my $TMP5 = "%xmm6";
|
|
my $CTR1 = "%xmm7";
|
|
my $CTR2 = "%xmm8";
|
|
my $CTR3 = "%xmm9";
|
|
my $CTR4 = "%xmm10";
|
|
my $CTR5 = "%xmm11";
|
|
my $CTR6 = "%xmm12";
|
|
my $CTR = "%xmm15";
|
|
my $CT = "%rdi";
|
|
my $PT = "%rsi";
|
|
my $POL = "%rdx";
|
|
my $Htbl = "%rcx";
|
|
my $KS = "%r8";
|
|
my $LEN = "%r9";
|
|
my $secureBuffer = "%rax";
|
|
my $HTABLE_ROUNDS = "%xmm13";
|
|
|
|
my $labelPrefix = "128";
|
|
if ($aes256) {
|
|
$labelPrefix = "256";
|
|
}
|
|
|
|
my $aes_round_dec = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $TMP3
|
|
vaesenc $TMP3, $CTR1, $CTR1
|
|
vaesenc $TMP3, $CTR2, $CTR2
|
|
vaesenc $TMP3, $CTR3, $CTR3
|
|
vaesenc $TMP3, $CTR4, $CTR4
|
|
vaesenc $TMP3, $CTR5, $CTR5
|
|
vaesenc $TMP3, $CTR6, $CTR6
|
|
___
|
|
};
|
|
|
|
my $aes_lastround_dec = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $TMP3
|
|
vaesenclast $TMP3, $CTR1, $CTR1
|
|
vaesenclast $TMP3, $CTR2, $CTR2
|
|
vaesenclast $TMP3, $CTR3, $CTR3
|
|
vaesenclast $TMP3, $CTR4, $CTR4
|
|
vaesenclast $TMP3, $CTR5, $CTR5
|
|
vaesenclast $TMP3, $CTR6, $CTR6
|
|
___
|
|
};
|
|
|
|
my $schoolbook = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16-32)}($secureBuffer), $TMP5
|
|
vmovdqu ${\eval($i*16-32)}($Htbl), $HTABLE_ROUNDS
|
|
|
|
vpclmulqdq \$0x10, $HTABLE_ROUNDS, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
vpclmulqdq \$0x11, $HTABLE_ROUNDS, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP1, $TMP1
|
|
vpclmulqdq \$0x00, $HTABLE_ROUNDS, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP2, $TMP2
|
|
vpclmulqdq \$0x01, $HTABLE_ROUNDS, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
___
|
|
};
|
|
|
|
if ($aes256) {
|
|
$code.=<<___;
|
|
.globl aes256gcmsiv_dec
|
|
.type aes256gcmsiv_dec,\@function,6
|
|
.align 16
|
|
aes256gcmsiv_dec:
|
|
___
|
|
} else {
|
|
$code.=<<___;
|
|
.globl aes128gcmsiv_dec
|
|
.type aes128gcmsiv_dec,\@function,6
|
|
.align 16
|
|
aes128gcmsiv_dec:
|
|
___
|
|
}
|
|
|
|
$code.=<<___;
|
|
.cfi_startproc
|
|
test \$~15, $LEN
|
|
jnz .L${labelPrefix}_dec_start
|
|
ret
|
|
|
|
.L${labelPrefix}_dec_start:
|
|
vzeroupper
|
|
vmovdqa ($POL), $T
|
|
movq $POL, $secureBuffer
|
|
|
|
leaq 32($secureBuffer), $secureBuffer
|
|
leaq 32($Htbl), $Htbl
|
|
|
|
# make CTRBLKs from given tag.
|
|
vmovdqu ($CT,$LEN), $CTR
|
|
vpor OR_MASK(%rip), $CTR, $CTR # CTR = [1]TAG[126...32][00..00]
|
|
andq \$~15, $LEN
|
|
|
|
# If less then 6 blocks, make singles
|
|
cmp \$96, $LEN
|
|
jb .L${labelPrefix}_dec_loop2
|
|
|
|
# Decrypt the first six blocks
|
|
sub \$96, $LEN
|
|
vmovdqa $CTR, $CTR1
|
|
vpaddd one(%rip), $CTR1, $CTR2
|
|
vpaddd two(%rip), $CTR1, $CTR3
|
|
vpaddd one(%rip), $CTR3, $CTR4
|
|
vpaddd two(%rip), $CTR3, $CTR5
|
|
vpaddd one(%rip), $CTR5, $CTR6
|
|
vpaddd two(%rip), $CTR5, $CTR
|
|
|
|
vpxor ($KS), $CTR1, $CTR1
|
|
vpxor ($KS), $CTR2, $CTR2
|
|
vpxor ($KS), $CTR3, $CTR3
|
|
vpxor ($KS), $CTR4, $CTR4
|
|
vpxor ($KS), $CTR5, $CTR5
|
|
vpxor ($KS), $CTR6, $CTR6
|
|
|
|
${\$aes_round_dec->(1)}
|
|
${\$aes_round_dec->(2)}
|
|
${\$aes_round_dec->(3)}
|
|
${\$aes_round_dec->(4)}
|
|
${\$aes_round_dec->(5)}
|
|
${\$aes_round_dec->(6)}
|
|
${\$aes_round_dec->(7)}
|
|
${\$aes_round_dec->(8)}
|
|
${\$aes_round_dec->(9)}
|
|
___
|
|
|
|
if ($aes256) {
|
|
$code.=<<___;
|
|
${\$aes_round_dec->(10)}
|
|
${\$aes_round_dec->(11)}
|
|
${\$aes_round_dec->(12)}
|
|
${\$aes_round_dec->(13)}
|
|
${\$aes_lastround_dec->(14)}
|
|
___
|
|
} else {
|
|
$code.=<<___;
|
|
${\$aes_lastround_dec->(10)}
|
|
___
|
|
}
|
|
|
|
$code.=<<___;
|
|
# XOR with CT
|
|
vpxor 0*16($CT), $CTR1, $CTR1
|
|
vpxor 1*16($CT), $CTR2, $CTR2
|
|
vpxor 2*16($CT), $CTR3, $CTR3
|
|
vpxor 3*16($CT), $CTR4, $CTR4
|
|
vpxor 4*16($CT), $CTR5, $CTR5
|
|
vpxor 5*16($CT), $CTR6, $CTR6
|
|
|
|
vmovdqu $CTR1, 0*16($PT)
|
|
vmovdqu $CTR2, 1*16($PT)
|
|
vmovdqu $CTR3, 2*16($PT)
|
|
vmovdqu $CTR4, 3*16($PT)
|
|
vmovdqu $CTR5, 4*16($PT)
|
|
vmovdqu $CTR6, 5*16($PT)
|
|
|
|
addq \$96, $CT
|
|
addq \$96, $PT
|
|
jmp .L${labelPrefix}_dec_loop1
|
|
|
|
# Decrypt 6 blocks each time while hashing previous 6 blocks
|
|
.align 64
|
|
.L${labelPrefix}_dec_loop1:
|
|
cmp \$96, $LEN
|
|
jb .L${labelPrefix}_dec_finish_96
|
|
sub \$96, $LEN
|
|
|
|
vmovdqa $CTR6, $TMP5
|
|
vmovdqa $CTR5, 1*16-32($secureBuffer)
|
|
vmovdqa $CTR4, 2*16-32($secureBuffer)
|
|
vmovdqa $CTR3, 3*16-32($secureBuffer)
|
|
vmovdqa $CTR2, 4*16-32($secureBuffer)
|
|
vmovdqa $CTR1, 5*16-32($secureBuffer)
|
|
|
|
vmovdqa $CTR, $CTR1
|
|
vpaddd one(%rip), $CTR1, $CTR2
|
|
vpaddd two(%rip), $CTR1, $CTR3
|
|
vpaddd one(%rip), $CTR3, $CTR4
|
|
vpaddd two(%rip), $CTR3, $CTR5
|
|
vpaddd one(%rip), $CTR5, $CTR6
|
|
vpaddd two(%rip), $CTR5, $CTR
|
|
|
|
vmovdqa ($KS), $TMP3
|
|
vpxor $TMP3, $CTR1, $CTR1
|
|
vpxor $TMP3, $CTR2, $CTR2
|
|
vpxor $TMP3, $CTR3, $CTR3
|
|
vpxor $TMP3, $CTR4, $CTR4
|
|
vpxor $TMP3, $CTR5, $CTR5
|
|
vpxor $TMP3, $CTR6, $CTR6
|
|
|
|
vmovdqu 0*16-32($Htbl), $TMP3
|
|
vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
|
|
vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
|
|
vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP0
|
|
vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
|
|
${\$aes_round_dec->(1)}
|
|
${\$schoolbook->(1)}
|
|
|
|
${\$aes_round_dec->(2)}
|
|
${\$schoolbook->(2)}
|
|
|
|
${\$aes_round_dec->(3)}
|
|
${\$schoolbook->(3)}
|
|
|
|
${\$aes_round_dec->(4)}
|
|
${\$schoolbook->(4)}
|
|
|
|
${\$aes_round_dec->(5)}
|
|
${\$aes_round_dec->(6)}
|
|
${\$aes_round_dec->(7)}
|
|
|
|
vmovdqa 5*16-32($secureBuffer), $TMP5
|
|
vpxor $T, $TMP5, $TMP5
|
|
vmovdqu 5*16-32($Htbl), $TMP4
|
|
|
|
vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP1, $TMP1
|
|
vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP2, $TMP2
|
|
vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
|
|
${\$aes_round_dec->(8)}
|
|
|
|
vpsrldq \$8, $TMP0, $TMP3
|
|
vpxor $TMP3, $TMP1, $TMP4
|
|
vpslldq \$8, $TMP0, $TMP3
|
|
vpxor $TMP3, $TMP2, $T
|
|
|
|
vmovdqa poly(%rip), $TMP2
|
|
|
|
${\$aes_round_dec->(9)}
|
|
___
|
|
|
|
if ($aes256) {
|
|
$code.=<<___;
|
|
${\$aes_round_dec->(10)}
|
|
${\$aes_round_dec->(11)}
|
|
${\$aes_round_dec->(12)}
|
|
${\$aes_round_dec->(13)}
|
|
vmovdqu 14*16($KS), $TMP5
|
|
___
|
|
} else {
|
|
$code.=<<___;
|
|
vmovdqu 10*16($KS), $TMP5
|
|
___
|
|
}
|
|
|
|
$code.=<<___;
|
|
vpalignr \$8, $T, $T, $TMP1
|
|
vpclmulqdq \$0x10, $TMP2, $T, $T
|
|
vpxor $T, $TMP1, $T
|
|
|
|
vpxor 0*16($CT), $TMP5, $TMP3
|
|
vaesenclast $TMP3, $CTR1, $CTR1
|
|
vpxor 1*16($CT), $TMP5, $TMP3
|
|
vaesenclast $TMP3, $CTR2, $CTR2
|
|
vpxor 2*16($CT), $TMP5, $TMP3
|
|
vaesenclast $TMP3, $CTR3, $CTR3
|
|
vpxor 3*16($CT), $TMP5, $TMP3
|
|
vaesenclast $TMP3, $CTR4, $CTR4
|
|
vpxor 4*16($CT), $TMP5, $TMP3
|
|
vaesenclast $TMP3, $CTR5, $CTR5
|
|
vpxor 5*16($CT), $TMP5, $TMP3
|
|
vaesenclast $TMP3, $CTR6, $CTR6
|
|
|
|
vpalignr \$8, $T, $T, $TMP1
|
|
vpclmulqdq \$0x10, $TMP2, $T, $T
|
|
vpxor $T, $TMP1, $T
|
|
|
|
vmovdqu $CTR1, 0*16($PT)
|
|
vmovdqu $CTR2, 1*16($PT)
|
|
vmovdqu $CTR3, 2*16($PT)
|
|
vmovdqu $CTR4, 3*16($PT)
|
|
vmovdqu $CTR5, 4*16($PT)
|
|
vmovdqu $CTR6, 5*16($PT)
|
|
|
|
vpxor $TMP4, $T, $T
|
|
|
|
lea 96($CT), $CT
|
|
lea 96($PT), $PT
|
|
jmp .L${labelPrefix}_dec_loop1
|
|
|
|
.L${labelPrefix}_dec_finish_96:
|
|
vmovdqa $CTR6, $TMP5
|
|
vmovdqa $CTR5, 1*16-32($secureBuffer)
|
|
vmovdqa $CTR4, 2*16-32($secureBuffer)
|
|
vmovdqa $CTR3, 3*16-32($secureBuffer)
|
|
vmovdqa $CTR2, 4*16-32($secureBuffer)
|
|
vmovdqa $CTR1, 5*16-32($secureBuffer)
|
|
|
|
vmovdqu 0*16-32($Htbl), $TMP3
|
|
vpclmulqdq \$0x10, $TMP3, $TMP5, $TMP0
|
|
vpclmulqdq \$0x11, $TMP3, $TMP5, $TMP1
|
|
vpclmulqdq \$0x00, $TMP3, $TMP5, $TMP2
|
|
vpclmulqdq \$0x01, $TMP3, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
|
|
${\$schoolbook->(1)}
|
|
${\$schoolbook->(2)}
|
|
${\$schoolbook->(3)}
|
|
${\$schoolbook->(4)}
|
|
|
|
vmovdqu 5*16-32($secureBuffer), $TMP5
|
|
vpxor $T, $TMP5, $TMP5
|
|
vmovdqu 5*16-32($Htbl), $TMP4
|
|
vpclmulqdq \$0x11, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP1, $TMP1
|
|
vpclmulqdq \$0x00, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP2, $TMP2
|
|
vpclmulqdq \$0x10, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
vpclmulqdq \$0x01, $TMP4, $TMP5, $TMP3
|
|
vpxor $TMP3, $TMP0, $TMP0
|
|
|
|
vpsrldq \$8, $TMP0, $TMP3
|
|
vpxor $TMP3, $TMP1, $TMP4
|
|
vpslldq \$8, $TMP0, $TMP3
|
|
vpxor $TMP3, $TMP2, $T
|
|
|
|
vmovdqa poly(%rip), $TMP2
|
|
|
|
vpalignr \$8, $T, $T, $TMP1
|
|
vpclmulqdq \$0x10, $TMP2, $T, $T
|
|
vpxor $T, $TMP1, $T
|
|
|
|
vpalignr \$8, $T, $T, $TMP1
|
|
vpclmulqdq \$0x10, $TMP2, $T, $T
|
|
vpxor $T, $TMP1, $T
|
|
|
|
vpxor $TMP4, $T, $T
|
|
|
|
.L${labelPrefix}_dec_loop2:
|
|
# Here we encrypt any remaining whole block
|
|
|
|
# if there are no whole blocks
|
|
cmp \$16, $LEN
|
|
jb .L${labelPrefix}_dec_out
|
|
sub \$16, $LEN
|
|
|
|
vmovdqa $CTR, $TMP1
|
|
vpaddd one(%rip), $CTR, $CTR
|
|
|
|
vpxor 0*16($KS), $TMP1, $TMP1
|
|
vaesenc 1*16($KS), $TMP1, $TMP1
|
|
vaesenc 2*16($KS), $TMP1, $TMP1
|
|
vaesenc 3*16($KS), $TMP1, $TMP1
|
|
vaesenc 4*16($KS), $TMP1, $TMP1
|
|
vaesenc 5*16($KS), $TMP1, $TMP1
|
|
vaesenc 6*16($KS), $TMP1, $TMP1
|
|
vaesenc 7*16($KS), $TMP1, $TMP1
|
|
vaesenc 8*16($KS), $TMP1, $TMP1
|
|
vaesenc 9*16($KS), $TMP1, $TMP1
|
|
___
|
|
if ($aes256) {
|
|
$code.=<<___;
|
|
vaesenc 10*16($KS), $TMP1, $TMP1
|
|
vaesenc 11*16($KS), $TMP1, $TMP1
|
|
vaesenc 12*16($KS), $TMP1, $TMP1
|
|
vaesenc 13*16($KS), $TMP1, $TMP1
|
|
vaesenclast 14*16($KS), $TMP1, $TMP1
|
|
___
|
|
} else {
|
|
$code.=<<___;
|
|
vaesenclast 10*16($KS), $TMP1, $TMP1
|
|
___
|
|
}
|
|
|
|
$code.=<<___;
|
|
vpxor ($CT), $TMP1, $TMP1
|
|
vmovdqu $TMP1, ($PT)
|
|
addq \$16, $CT
|
|
addq \$16, $PT
|
|
|
|
vpxor $TMP1, $T, $T
|
|
vmovdqa -32($Htbl), $TMP0
|
|
call GFMUL
|
|
|
|
jmp .L${labelPrefix}_dec_loop2
|
|
|
|
.L${labelPrefix}_dec_out:
|
|
vmovdqu $T, ($POL)
|
|
ret
|
|
.cfi_endproc
|
|
___
|
|
|
|
if ($aes256) {
|
|
$code.=<<___;
|
|
.size aes256gcmsiv_dec, .-aes256gcmsiv_dec
|
|
___
|
|
} else {
|
|
$code.=<<___;
|
|
.size aes128gcmsiv_dec, .-aes128gcmsiv_dec
|
|
___
|
|
}
|
|
}
|
|
|
|
aesgcmsiv_dec(0); # emit 128-bit version
|
|
|
|
sub aes128gcmsiv_ecb_enc_block {
|
|
my $STATE_1 = "%xmm1";
|
|
my $KSp = "%rdx";
|
|
|
|
# parameter 1: PT %rdi (pointer to 128 bit)
|
|
# parameter 2: CT %rsi (pointer to 128 bit)
|
|
# parameter 3: ks %rdx (pointer to ks)
|
|
$code.=<<___;
|
|
.globl aes128gcmsiv_ecb_enc_block
|
|
.type aes128gcmsiv_ecb_enc_block,\@function,3
|
|
.align 16
|
|
aes128gcmsiv_ecb_enc_block:
|
|
.cfi_startproc
|
|
vmovdqa (%rdi), $STATE_1
|
|
|
|
vpxor ($KSp), $STATE_1, $STATE_1
|
|
vaesenc 1*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 2*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 3*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 4*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 5*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 6*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 7*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 8*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 9*16($KSp), $STATE_1, $STATE_1
|
|
vaesenclast 10*16($KSp), $STATE_1, $STATE_1 # STATE_1 == IV
|
|
|
|
vmovdqa $STATE_1, (%rsi)
|
|
|
|
ret
|
|
.cfi_endproc
|
|
.size aes128gcmsiv_ecb_enc_block,.-aes128gcmsiv_ecb_enc_block
|
|
___
|
|
}
|
|
aes128gcmsiv_ecb_enc_block();
|
|
|
|
sub aes256gcmsiv_aes_ks_enc_x1 {
|
|
my $KS = "%rdx";
|
|
my $KEYp = "%rcx";
|
|
my $CON_MASK = "%xmm0";
|
|
my $MASK_256 = "%xmm15";
|
|
my $KEY_1 = "%xmm1";
|
|
my $KEY_2 = "%xmm3";
|
|
my $BLOCK1 = "%xmm8";
|
|
my $AUX_REG = "%xmm14";
|
|
my $PT = "%rdi";
|
|
my $CT = "%rsi";
|
|
|
|
my $round_double = sub {
|
|
my ($i, $j) = @_;
|
|
return <<___;
|
|
vpshufb %xmm15, %xmm3, %xmm2
|
|
vaesenclast %xmm0, %xmm2, %xmm2
|
|
vpslld \$1, %xmm0, %xmm0
|
|
vpslldq \$4, %xmm1, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpslldq \$4, %xmm4, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpslldq \$4, %xmm4, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
vaesenc %xmm1, $BLOCK1, $BLOCK1
|
|
vmovdqu %xmm1, ${\eval(16*$i)}($KS)
|
|
|
|
vpshufd \$0xff, %xmm1, %xmm2
|
|
vaesenclast %xmm14, %xmm2, %xmm2
|
|
vpslldq \$4, %xmm3, %xmm4
|
|
vpxor %xmm4, %xmm3, %xmm3
|
|
vpslldq \$4, %xmm4, %xmm4
|
|
vpxor %xmm4, %xmm3, %xmm3
|
|
vpslldq \$4, %xmm4, %xmm4
|
|
vpxor %xmm4, %xmm3, %xmm3
|
|
vpxor %xmm2, %xmm3, %xmm3
|
|
vaesenc %xmm3, $BLOCK1, $BLOCK1
|
|
vmovdqu %xmm3, ${\eval(16*$j)}($KS)
|
|
___
|
|
};
|
|
|
|
my $round_last = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vpshufb %xmm15, %xmm3, %xmm2
|
|
vaesenclast %xmm0, %xmm2, %xmm2
|
|
vpslldq \$4, %xmm1, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpslldq \$4, %xmm4, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpslldq \$4, %xmm4, %xmm4
|
|
vpxor %xmm4, %xmm1, %xmm1
|
|
vpxor %xmm2, %xmm1, %xmm1
|
|
vaesenclast %xmm1, $BLOCK1, $BLOCK1
|
|
vmovdqu %xmm1, ${\eval(16*$i)}($KS)
|
|
___
|
|
};
|
|
|
|
# parameter 1: %rdi Pointer to PT1
|
|
# parameter 2: %rsi Pointer to CT1
|
|
# parameter 3: %rdx Pointer to KS
|
|
# parameter 4: %rcx Pointer to initial key
|
|
$code.=<<___;
|
|
.globl aes256gcmsiv_aes_ks_enc_x1
|
|
.type aes256gcmsiv_aes_ks_enc_x1,\@function,4
|
|
.align 16
|
|
aes256gcmsiv_aes_ks_enc_x1:
|
|
.cfi_startproc
|
|
vmovdqa con1(%rip), $CON_MASK # CON_MASK = 1,1,1,1
|
|
vmovdqa mask(%rip), $MASK_256 # MASK_256
|
|
vmovdqa ($PT), $BLOCK1
|
|
vmovdqa ($KEYp), $KEY_1 # KEY_1 || KEY_2 [0..7] = user key
|
|
vmovdqa 16($KEYp), $KEY_2
|
|
vpxor $KEY_1, $BLOCK1, $BLOCK1
|
|
vaesenc $KEY_2, $BLOCK1, $BLOCK1
|
|
vmovdqu $KEY_1, ($KS) # First round key
|
|
vmovdqu $KEY_2, 16($KS)
|
|
vpxor $AUX_REG, $AUX_REG, $AUX_REG
|
|
|
|
${\$round_double->(2, 3)}
|
|
${\$round_double->(4, 5)}
|
|
${\$round_double->(6, 7)}
|
|
${\$round_double->(8, 9)}
|
|
${\$round_double->(10, 11)}
|
|
${\$round_double->(12, 13)}
|
|
${\$round_last->(14)}
|
|
vmovdqa $BLOCK1, ($CT)
|
|
ret
|
|
.cfi_endproc
|
|
.size aes256gcmsiv_aes_ks_enc_x1,.-aes256gcmsiv_aes_ks_enc_x1
|
|
___
|
|
}
|
|
aes256gcmsiv_aes_ks_enc_x1();
|
|
|
|
sub aes256gcmsiv_ecb_enc_block {
|
|
my $STATE_1 = "%xmm1";
|
|
my $PT = "%rdi";
|
|
my $CT = "%rsi";
|
|
my $KSp = "%rdx";
|
|
|
|
# parameter 1: PT %rdi (pointer to 128 bit)
|
|
# parameter 2: CT %rsi (pointer to 128 bit)
|
|
# parameter 3: ks %rdx (pointer to ks)
|
|
$code.=<<___;
|
|
.globl aes256gcmsiv_ecb_enc_block
|
|
.type aes256gcmsiv_ecb_enc_block,\@function,3
|
|
.align 16
|
|
aes256gcmsiv_ecb_enc_block:
|
|
.cfi_startproc
|
|
vmovdqa (%rdi), $STATE_1
|
|
vpxor ($KSp), $STATE_1, $STATE_1
|
|
vaesenc 1*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 2*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 3*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 4*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 5*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 6*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 7*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 8*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 9*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 10*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 11*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 12*16($KSp), $STATE_1, $STATE_1
|
|
vaesenc 13*16($KSp), $STATE_1, $STATE_1
|
|
vaesenclast 14*16($KSp), $STATE_1, $STATE_1 # $STATE_1 == IV
|
|
vmovdqa $STATE_1, (%rsi)
|
|
ret
|
|
.cfi_endproc
|
|
.size aes256gcmsiv_ecb_enc_block,.-aes256gcmsiv_ecb_enc_block
|
|
___
|
|
}
|
|
aes256gcmsiv_ecb_enc_block();
|
|
|
|
sub aes256gcmsiv_enc_msg_x4 {
|
|
my $CTR1 = "%xmm0";
|
|
my $CTR2 = "%xmm1";
|
|
my $CTR3 = "%xmm2";
|
|
my $CTR4 = "%xmm3";
|
|
my $ADDER = "%xmm4";
|
|
|
|
my $STATE1 = "%xmm5";
|
|
my $STATE2 = "%xmm6";
|
|
my $STATE3 = "%xmm7";
|
|
my $STATE4 = "%xmm8";
|
|
|
|
my $TMP = "%xmm12";
|
|
my $TMP2 = "%xmm13";
|
|
my $TMP3 = "%xmm14";
|
|
my $IV = "%xmm15";
|
|
|
|
my $PT = "%rdi";
|
|
my $CT = "%rsi";
|
|
my $TAG = "%rdx";
|
|
my $KS = "%rcx";
|
|
my $LEN = "%r8";
|
|
|
|
my $aes_round = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $TMP
|
|
vaesenc $TMP, $STATE1, $STATE1
|
|
vaesenc $TMP, $STATE2, $STATE2
|
|
vaesenc $TMP, $STATE3, $STATE3
|
|
vaesenc $TMP, $STATE4, $STATE4
|
|
___
|
|
};
|
|
|
|
my $aes_lastround = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $TMP
|
|
vaesenclast $TMP, $STATE1, $STATE1
|
|
vaesenclast $TMP, $STATE2, $STATE2
|
|
vaesenclast $TMP, $STATE3, $STATE3
|
|
vaesenclast $TMP, $STATE4, $STATE4
|
|
___
|
|
};
|
|
|
|
# void aes256gcmsiv_enc_msg_x4(unsigned char* PT, unsigned char* CT,
|
|
# unsigned char* TAG, unsigned char* KS,
|
|
# size_t byte_len);
|
|
# parameter 1: %rdi #PT
|
|
# parameter 2: %rsi #CT
|
|
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
|
|
# parameter 4: %rcx #KS
|
|
# parameter 5: %r8 #LEN MSG_length in bytes
|
|
$code.=<<___;
|
|
.globl aes256gcmsiv_enc_msg_x4
|
|
.type aes256gcmsiv_enc_msg_x4,\@function,5
|
|
.align 16
|
|
aes256gcmsiv_enc_msg_x4:
|
|
.cfi_startproc
|
|
test $LEN, $LEN
|
|
jnz .L256_enc_msg_x4_start
|
|
ret
|
|
|
|
.L256_enc_msg_x4_start:
|
|
movq $LEN, %r10
|
|
shrq \$4, $LEN # LEN = num of blocks
|
|
shlq \$60, %r10
|
|
jz .L256_enc_msg_x4_start2
|
|
addq \$1, $LEN
|
|
|
|
.L256_enc_msg_x4_start2:
|
|
movq $LEN, %r10
|
|
shlq \$62, %r10
|
|
shrq \$62, %r10
|
|
|
|
# make IV from TAG
|
|
vmovdqa ($TAG), $IV
|
|
vpor OR_MASK(%rip), $IV, $IV # IV = [1]TAG[126...32][00..00]
|
|
|
|
vmovdqa four(%rip), $ADDER # Register to increment counters
|
|
vmovdqa $IV, $CTR1 # CTR1 = TAG[1][127...32][00..00]
|
|
vpaddd one(%rip), $IV, $CTR2 # CTR2 = TAG[1][127...32][00..01]
|
|
vpaddd two(%rip), $IV, $CTR3 # CTR3 = TAG[1][127...32][00..02]
|
|
vpaddd three(%rip), $IV, $CTR4 # CTR4 = TAG[1][127...32][00..03]
|
|
|
|
shrq \$2, $LEN
|
|
je .L256_enc_msg_x4_check_remainder
|
|
|
|
subq \$64, $CT
|
|
subq \$64, $PT
|
|
|
|
.L256_enc_msg_x4_loop1:
|
|
addq \$64, $CT
|
|
addq \$64, $PT
|
|
|
|
vmovdqa $CTR1, $STATE1
|
|
vmovdqa $CTR2, $STATE2
|
|
vmovdqa $CTR3, $STATE3
|
|
vmovdqa $CTR4, $STATE4
|
|
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vpxor ($KS), $STATE2, $STATE2
|
|
vpxor ($KS), $STATE3, $STATE3
|
|
vpxor ($KS), $STATE4, $STATE4
|
|
|
|
${\$aes_round->(1)}
|
|
vpaddd $ADDER, $CTR1, $CTR1
|
|
${\$aes_round->(2)}
|
|
vpaddd $ADDER, $CTR2, $CTR2
|
|
${\$aes_round->(3)}
|
|
vpaddd $ADDER, $CTR3, $CTR3
|
|
${\$aes_round->(4)}
|
|
vpaddd $ADDER, $CTR4, $CTR4
|
|
|
|
${\$aes_round->(5)}
|
|
${\$aes_round->(6)}
|
|
${\$aes_round->(7)}
|
|
${\$aes_round->(8)}
|
|
${\$aes_round->(9)}
|
|
${\$aes_round->(10)}
|
|
${\$aes_round->(11)}
|
|
${\$aes_round->(12)}
|
|
${\$aes_round->(13)}
|
|
${\$aes_lastround->(14)}
|
|
|
|
# XOR with Plaintext
|
|
vpxor 0*16($PT), $STATE1, $STATE1
|
|
vpxor 1*16($PT), $STATE2, $STATE2
|
|
vpxor 2*16($PT), $STATE3, $STATE3
|
|
vpxor 3*16($PT), $STATE4, $STATE4
|
|
|
|
subq \$1, $LEN
|
|
|
|
vmovdqu $STATE1, 0*16($CT)
|
|
vmovdqu $STATE2, 1*16($CT)
|
|
vmovdqu $STATE3, 2*16($CT)
|
|
vmovdqu $STATE4, 3*16($CT)
|
|
|
|
jne .L256_enc_msg_x4_loop1
|
|
|
|
addq \$64, $CT
|
|
addq \$64, $PT
|
|
|
|
.L256_enc_msg_x4_check_remainder:
|
|
cmpq \$0, %r10
|
|
je .L256_enc_msg_x4_out
|
|
|
|
.L256_enc_msg_x4_loop2:
|
|
# encrypt each block separately
|
|
# CTR1 is the highest counter (even if no LOOP done)
|
|
|
|
vmovdqa $CTR1, $STATE1
|
|
vpaddd one(%rip), $CTR1, $CTR1 # inc counter
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vaesenc 16($KS), $STATE1, $STATE1
|
|
vaesenc 32($KS), $STATE1, $STATE1
|
|
vaesenc 48($KS), $STATE1, $STATE1
|
|
vaesenc 64($KS), $STATE1, $STATE1
|
|
vaesenc 80($KS), $STATE1, $STATE1
|
|
vaesenc 96($KS), $STATE1, $STATE1
|
|
vaesenc 112($KS), $STATE1, $STATE1
|
|
vaesenc 128($KS), $STATE1, $STATE1
|
|
vaesenc 144($KS), $STATE1, $STATE1
|
|
vaesenc 160($KS), $STATE1, $STATE1
|
|
vaesenc 176($KS), $STATE1, $STATE1
|
|
vaesenc 192($KS), $STATE1, $STATE1
|
|
vaesenc 208($KS), $STATE1, $STATE1
|
|
vaesenclast 224($KS), $STATE1, $STATE1
|
|
|
|
# XOR with Plaintext
|
|
vpxor ($PT), $STATE1, $STATE1
|
|
|
|
vmovdqu $STATE1, ($CT)
|
|
|
|
addq \$16, $PT
|
|
addq \$16, $CT
|
|
|
|
subq \$1, %r10
|
|
jne .L256_enc_msg_x4_loop2
|
|
|
|
.L256_enc_msg_x4_out:
|
|
ret
|
|
.cfi_endproc
|
|
.size aes256gcmsiv_enc_msg_x4,.-aes256gcmsiv_enc_msg_x4
|
|
___
|
|
}
|
|
aes256gcmsiv_enc_msg_x4();
|
|
|
|
sub aes256gcmsiv_enc_msg_x8() {
|
|
my $STATE1 = "%xmm1";
|
|
my $STATE2 = "%xmm2";
|
|
my $STATE3 = "%xmm3";
|
|
my $STATE4 = "%xmm4";
|
|
my $STATE5 = "%xmm5";
|
|
my $STATE6 = "%xmm6";
|
|
my $STATE7 = "%xmm7";
|
|
my $STATE8 = "%xmm8";
|
|
my $CTR1 = "%xmm0";
|
|
my $CTR2 = "%xmm9";
|
|
my $CTR3 = "%xmm10";
|
|
my $CTR4 = "%xmm11";
|
|
my $CTR5 = "%xmm12";
|
|
my $CTR6 = "%xmm13";
|
|
my $CTR7 = "%xmm14";
|
|
my $TMP1 = "%xmm1";
|
|
my $TMP2 = "%xmm2";
|
|
my $KS = "%rcx";
|
|
my $LEN = "%r8";
|
|
my $PT = "%rdi";
|
|
my $CT = "%rsi";
|
|
my $TAG = "%rdx";
|
|
my $SCHED = "%xmm15";
|
|
|
|
my $aes_round8 = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $SCHED
|
|
vaesenc $SCHED, $STATE1, $STATE1
|
|
vaesenc $SCHED, $STATE2, $STATE2
|
|
vaesenc $SCHED, $STATE3, $STATE3
|
|
vaesenc $SCHED, $STATE4, $STATE4
|
|
vaesenc $SCHED, $STATE5, $STATE5
|
|
vaesenc $SCHED, $STATE6, $STATE6
|
|
vaesenc $SCHED, $STATE7, $STATE7
|
|
vaesenc $SCHED, $STATE8, $STATE8
|
|
___
|
|
};
|
|
|
|
my $aes_lastround8 = sub {
|
|
my ($i) = @_;
|
|
return <<___;
|
|
vmovdqu ${\eval($i*16)}($KS), $SCHED
|
|
vaesenclast $SCHED, $STATE1, $STATE1
|
|
vaesenclast $SCHED, $STATE2, $STATE2
|
|
vaesenclast $SCHED, $STATE3, $STATE3
|
|
vaesenclast $SCHED, $STATE4, $STATE4
|
|
vaesenclast $SCHED, $STATE5, $STATE5
|
|
vaesenclast $SCHED, $STATE6, $STATE6
|
|
vaesenclast $SCHED, $STATE7, $STATE7
|
|
vaesenclast $SCHED, $STATE8, $STATE8
|
|
___
|
|
};
|
|
|
|
# void ENC_MSG_x8(unsigned char* PT,
|
|
# unsigned char* CT,
|
|
# unsigned char* TAG,
|
|
# unsigned char* KS,
|
|
# size_t byte_len);
|
|
# parameter 1: %rdi #PT
|
|
# parameter 2: %rsi #CT
|
|
# parameter 3: %rdx #TAG [127 126 ... 0] IV=[127...32]
|
|
# parameter 4: %rcx #KS
|
|
# parameter 5: %r8 #LEN MSG_length in bytes
|
|
$code.=<<___;
|
|
.globl aes256gcmsiv_enc_msg_x8
|
|
.type aes256gcmsiv_enc_msg_x8,\@function,5
|
|
.align 16
|
|
aes256gcmsiv_enc_msg_x8:
|
|
.cfi_startproc
|
|
test $LEN, $LEN
|
|
jnz .L256_enc_msg_x8_start
|
|
ret
|
|
|
|
.L256_enc_msg_x8_start:
|
|
# Place in stack
|
|
movq %rsp, %r11
|
|
subq \$16, %r11
|
|
andq \$-64, %r11
|
|
|
|
movq $LEN, %r10
|
|
shrq \$4, $LEN # LEN = num of blocks
|
|
shlq \$60, %r10
|
|
jz .L256_enc_msg_x8_start2
|
|
addq \$1, $LEN
|
|
|
|
.L256_enc_msg_x8_start2:
|
|
movq $LEN, %r10
|
|
shlq \$61, %r10
|
|
shrq \$61, %r10
|
|
|
|
# Make IV from TAG
|
|
vmovdqa ($TAG), $TMP1
|
|
vpor OR_MASK(%rip), $TMP1, $TMP1 # TMP1= IV = [1]TAG[126...32][00..00]
|
|
|
|
# store counter8 on the stack
|
|
vpaddd seven(%rip), $TMP1, $CTR1
|
|
vmovdqa $CTR1, (%r11) # CTR8 = TAG[127...32][00..07]
|
|
vpaddd one(%rip), $TMP1, $CTR2 # CTR2 = TAG[127...32][00..01]
|
|
vpaddd two(%rip), $TMP1, $CTR3 # CTR3 = TAG[127...32][00..02]
|
|
vpaddd three(%rip), $TMP1, $CTR4 # CTR4 = TAG[127...32][00..03]
|
|
vpaddd four(%rip), $TMP1, $CTR5 # CTR5 = TAG[127...32][00..04]
|
|
vpaddd five(%rip), $TMP1, $CTR6 # CTR6 = TAG[127...32][00..05]
|
|
vpaddd six(%rip), $TMP1, $CTR7 # CTR7 = TAG[127...32][00..06]
|
|
vmovdqa $TMP1, $CTR1 # CTR1 = TAG[127...32][00..00]
|
|
|
|
shrq \$3, $LEN
|
|
jz .L256_enc_msg_x8_check_remainder
|
|
|
|
subq \$128, $CT
|
|
subq \$128, $PT
|
|
|
|
.L256_enc_msg_x8_loop1:
|
|
addq \$128, $CT
|
|
addq \$128, $PT
|
|
|
|
vmovdqa $CTR1, $STATE1
|
|
vmovdqa $CTR2, $STATE2
|
|
vmovdqa $CTR3, $STATE3
|
|
vmovdqa $CTR4, $STATE4
|
|
vmovdqa $CTR5, $STATE5
|
|
vmovdqa $CTR6, $STATE6
|
|
vmovdqa $CTR7, $STATE7
|
|
# move from stack
|
|
vmovdqa (%r11), $STATE8
|
|
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vpxor ($KS), $STATE2, $STATE2
|
|
vpxor ($KS), $STATE3, $STATE3
|
|
vpxor ($KS), $STATE4, $STATE4
|
|
vpxor ($KS), $STATE5, $STATE5
|
|
vpxor ($KS), $STATE6, $STATE6
|
|
vpxor ($KS), $STATE7, $STATE7
|
|
vpxor ($KS), $STATE8, $STATE8
|
|
|
|
${\$aes_round8->(1)}
|
|
vmovdqa (%r11), $CTR7 # deal with CTR8
|
|
vpaddd eight(%rip), $CTR7, $CTR7
|
|
vmovdqa $CTR7, (%r11)
|
|
${\$aes_round8->(2)}
|
|
vpsubd one(%rip), $CTR7, $CTR7
|
|
${\$aes_round8->(3)}
|
|
vpaddd eight(%rip), $CTR1, $CTR1
|
|
${\$aes_round8->(4)}
|
|
vpaddd eight(%rip), $CTR2, $CTR2
|
|
${\$aes_round8->(5)}
|
|
vpaddd eight(%rip), $CTR3, $CTR3
|
|
${\$aes_round8->(6)}
|
|
vpaddd eight(%rip), $CTR4, $CTR4
|
|
${\$aes_round8->(7)}
|
|
vpaddd eight(%rip), $CTR5, $CTR5
|
|
${\$aes_round8->(8)}
|
|
vpaddd eight(%rip), $CTR6, $CTR6
|
|
${\$aes_round8->(9)}
|
|
${\$aes_round8->(10)}
|
|
${\$aes_round8->(11)}
|
|
${\$aes_round8->(12)}
|
|
${\$aes_round8->(13)}
|
|
${\$aes_lastround8->(14)}
|
|
|
|
# XOR with Plaintext
|
|
vpxor 0*16($PT), $STATE1, $STATE1
|
|
vpxor 1*16($PT), $STATE2, $STATE2
|
|
vpxor 2*16($PT), $STATE3, $STATE3
|
|
vpxor 3*16($PT), $STATE4, $STATE4
|
|
vpxor 4*16($PT), $STATE5, $STATE5
|
|
vpxor 5*16($PT), $STATE6, $STATE6
|
|
vpxor 6*16($PT), $STATE7, $STATE7
|
|
vpxor 7*16($PT), $STATE8, $STATE8
|
|
|
|
subq \$1, $LEN
|
|
|
|
vmovdqu $STATE1, 0*16($CT)
|
|
vmovdqu $STATE2, 1*16($CT)
|
|
vmovdqu $STATE3, 2*16($CT)
|
|
vmovdqu $STATE4, 3*16($CT)
|
|
vmovdqu $STATE5, 4*16($CT)
|
|
vmovdqu $STATE6, 5*16($CT)
|
|
vmovdqu $STATE7, 6*16($CT)
|
|
vmovdqu $STATE8, 7*16($CT)
|
|
|
|
jne .L256_enc_msg_x8_loop1
|
|
|
|
addq \$128, $CT
|
|
addq \$128, $PT
|
|
|
|
.L256_enc_msg_x8_check_remainder:
|
|
cmpq \$0, %r10
|
|
je .L256_enc_msg_x8_out
|
|
|
|
.L256_enc_msg_x8_loop2:
|
|
# encrypt each block separately
|
|
# CTR1 is the highest counter (even if no LOOP done)
|
|
vmovdqa $CTR1, $STATE1
|
|
vpaddd one(%rip), $CTR1, $CTR1
|
|
|
|
vpxor ($KS), $STATE1, $STATE1
|
|
vaesenc 16($KS), $STATE1, $STATE1
|
|
vaesenc 32($KS), $STATE1, $STATE1
|
|
vaesenc 48($KS), $STATE1, $STATE1
|
|
vaesenc 64($KS), $STATE1, $STATE1
|
|
vaesenc 80($KS), $STATE1, $STATE1
|
|
vaesenc 96($KS), $STATE1, $STATE1
|
|
vaesenc 112($KS), $STATE1, $STATE1
|
|
vaesenc 128($KS), $STATE1, $STATE1
|
|
vaesenc 144($KS), $STATE1, $STATE1
|
|
vaesenc 160($KS), $STATE1, $STATE1
|
|
vaesenc 176($KS), $STATE1, $STATE1
|
|
vaesenc 192($KS), $STATE1, $STATE1
|
|
vaesenc 208($KS), $STATE1, $STATE1
|
|
vaesenclast 224($KS), $STATE1, $STATE1
|
|
|
|
# XOR with Plaintext
|
|
vpxor ($PT), $STATE1, $STATE1
|
|
|
|
vmovdqu $STATE1, ($CT)
|
|
|
|
addq \$16, $PT
|
|
addq \$16, $CT
|
|
subq \$1, %r10
|
|
jnz .L256_enc_msg_x8_loop2
|
|
|
|
.L256_enc_msg_x8_out:
|
|
ret
|
|
|
|
.cfi_endproc
|
|
.size aes256gcmsiv_enc_msg_x8,.-aes256gcmsiv_enc_msg_x8
|
|
___
|
|
}
|
|
aes256gcmsiv_enc_msg_x8();
|
|
aesgcmsiv_dec(1);
|
|
|
|
sub aes256gcmsiv_kdf {
|
|
my $ONE = "%xmm8";
|
|
my $BLOCK1 = "%xmm4";
|
|
my $BLOCK2 = "%xmm6";
|
|
my $BLOCK3 = "%xmm7";
|
|
my $BLOCK4 = "%xmm11";
|
|
my $BLOCK5 = "%xmm12";
|
|
my $BLOCK6 = "%xmm13";
|
|
|
|
my $enc_roundx6 = sub {
|
|
my ($i, $j) = @_;
|
|
return <<___;
|
|
vmovdqa ${\eval($i*16)}(%rdx), $j
|
|
vaesenc $j, $BLOCK1, $BLOCK1
|
|
vaesenc $j, $BLOCK2, $BLOCK2
|
|
vaesenc $j, $BLOCK3, $BLOCK3
|
|
vaesenc $j, $BLOCK4, $BLOCK4
|
|
vaesenc $j, $BLOCK5, $BLOCK5
|
|
vaesenc $j, $BLOCK6, $BLOCK6
|
|
___
|
|
};
|
|
|
|
my $enc_roundlastx6 = sub {
|
|
my ($i, $j) = @_;
|
|
return <<___;
|
|
vmovdqa ${\eval($i*16)}(%rdx), $j
|
|
vaesenclast $j, $BLOCK1, $BLOCK1
|
|
vaesenclast $j, $BLOCK2, $BLOCK2
|
|
vaesenclast $j, $BLOCK3, $BLOCK3
|
|
vaesenclast $j, $BLOCK4, $BLOCK4
|
|
vaesenclast $j, $BLOCK5, $BLOCK5
|
|
vaesenclast $j, $BLOCK6, $BLOCK6
|
|
___
|
|
};
|
|
|
|
# void aes256gcmsiv_kdf(const uint8_t nonce[16],
|
|
# uint8_t *out_key_material,
|
|
# const uint8_t *key_schedule);
|
|
$code.=<<___;
|
|
.globl aes256gcmsiv_kdf
|
|
.type aes256gcmsiv_kdf,\@function,3
|
|
.align 16
|
|
aes256gcmsiv_kdf:
|
|
.cfi_startproc
|
|
# parameter 1: %rdi Pointer to NONCE
|
|
# parameter 2: %rsi Pointer to CT
|
|
# parameter 4: %rdx Pointer to keys
|
|
|
|
vmovdqa (%rdx), %xmm1 # xmm1 = first 16 bytes of random key
|
|
vmovdqa 0*16(%rdi), $BLOCK1
|
|
vmovdqa and_mask(%rip), $BLOCK4
|
|
vmovdqa one(%rip), $ONE
|
|
vpshufd \$0x90, $BLOCK1, $BLOCK1
|
|
vpand $BLOCK4, $BLOCK1, $BLOCK1
|
|
vpaddd $ONE, $BLOCK1, $BLOCK2
|
|
vpaddd $ONE, $BLOCK2, $BLOCK3
|
|
vpaddd $ONE, $BLOCK3, $BLOCK4
|
|
vpaddd $ONE, $BLOCK4, $BLOCK5
|
|
vpaddd $ONE, $BLOCK5, $BLOCK6
|
|
|
|
vpxor %xmm1, $BLOCK1, $BLOCK1
|
|
vpxor %xmm1, $BLOCK2, $BLOCK2
|
|
vpxor %xmm1, $BLOCK3, $BLOCK3
|
|
vpxor %xmm1, $BLOCK4, $BLOCK4
|
|
vpxor %xmm1, $BLOCK5, $BLOCK5
|
|
vpxor %xmm1, $BLOCK6, $BLOCK6
|
|
|
|
${\$enc_roundx6->(1, "%xmm1")}
|
|
${\$enc_roundx6->(2, "%xmm2")}
|
|
${\$enc_roundx6->(3, "%xmm1")}
|
|
${\$enc_roundx6->(4, "%xmm2")}
|
|
${\$enc_roundx6->(5, "%xmm1")}
|
|
${\$enc_roundx6->(6, "%xmm2")}
|
|
${\$enc_roundx6->(7, "%xmm1")}
|
|
${\$enc_roundx6->(8, "%xmm2")}
|
|
${\$enc_roundx6->(9, "%xmm1")}
|
|
${\$enc_roundx6->(10, "%xmm2")}
|
|
${\$enc_roundx6->(11, "%xmm1")}
|
|
${\$enc_roundx6->(12, "%xmm2")}
|
|
${\$enc_roundx6->(13, "%xmm1")}
|
|
${\$enc_roundlastx6->(14, "%xmm2")}
|
|
|
|
vmovdqa $BLOCK1, 0*16(%rsi)
|
|
vmovdqa $BLOCK2, 1*16(%rsi)
|
|
vmovdqa $BLOCK3, 2*16(%rsi)
|
|
vmovdqa $BLOCK4, 3*16(%rsi)
|
|
vmovdqa $BLOCK5, 4*16(%rsi)
|
|
vmovdqa $BLOCK6, 5*16(%rsi)
|
|
ret
|
|
.cfi_endproc
|
|
.size aes256gcmsiv_kdf, .-aes256gcmsiv_kdf
|
|
___
|
|
}
|
|
aes256gcmsiv_kdf();
|
|
|
|
print $code;
|
|
|
|
close STDOUT;
|