fdd8e9c8c7
Depending on architecture, perlasm differed on which one or both of: perl foo.pl flavor output.S perl foo.pl flavor > output.S Upstream has now unified on the first form after making a number of changes to their files (the second does not even work for their x86 files anymore). Sync those portions of our perlasm scripts with upstream and update CMakeLists.txt and generate_build_files.py per the new convention. This imports various commits like this one: 184bc45f683c76531d7e065b6553ca9086564576 (this was done by taking a diff, so I don't have the full list) Confirmed that generate_build_files.py sees no change. BUG=14 Change-Id: Id2fb5b8bc2a7369d077221b5df9a6947d41f50d2 Reviewed-on: https://boringssl-review.googlesource.com/8518 Reviewed-by: Adam Langley <agl@google.com>
4049 lines
102 KiB
Perl
4049 lines
102 KiB
Perl
#!/usr/bin/env perl
|
||
#
|
||
# ====================================================================
|
||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
# ====================================================================
|
||
#
|
||
# This module implements support for Intel AES-NI extension. In
|
||
# OpenSSL context it's used with Intel engine, but can also be used as
|
||
# drop-in replacement for crypto/aes/asm/aes-x86_64.pl [see below for
|
||
# details].
|
||
#
|
||
# Performance.
|
||
#
|
||
# Given aes(enc|dec) instructions' latency asymptotic performance for
|
||
# non-parallelizable modes such as CBC encrypt is 3.75 cycles per byte
|
||
# processed with 128-bit key. And given their throughput asymptotic
|
||
# performance for parallelizable modes is 1.25 cycles per byte. Being
|
||
# asymptotic limit it's not something you commonly achieve in reality,
|
||
# but how close does one get? Below are results collected for
|
||
# different modes and block sizes. Pairs of numbers are for en-/
|
||
# decryption.
|
||
#
|
||
# 16-byte 64-byte 256-byte 1-KB 8-KB
|
||
# ECB 4.25/4.25 1.38/1.38 1.28/1.28 1.26/1.26 1.26/1.26
|
||
# CTR 5.42/5.42 1.92/1.92 1.44/1.44 1.28/1.28 1.26/1.26
|
||
# CBC 4.38/4.43 4.15/1.43 4.07/1.32 4.07/1.29 4.06/1.28
|
||
# CCM 5.66/9.42 4.42/5.41 4.16/4.40 4.09/4.15 4.06/4.07
|
||
# OFB 5.42/5.42 4.64/4.64 4.44/4.44 4.39/4.39 4.38/4.38
|
||
# CFB 5.73/5.85 5.56/5.62 5.48/5.56 5.47/5.55 5.47/5.55
|
||
#
|
||
# ECB, CTR, CBC and CCM results are free from EVP overhead. This means
|
||
# that otherwise used 'openssl speed -evp aes-128-??? -engine aesni
|
||
# [-decrypt]' will exhibit 10-15% worse results for smaller blocks.
|
||
# The results were collected with specially crafted speed.c benchmark
|
||
# in order to compare them with results reported in "Intel Advanced
|
||
# Encryption Standard (AES) New Instruction Set" White Paper Revision
|
||
# 3.0 dated May 2010. All above results are consistently better. This
|
||
# module also provides better performance for block sizes smaller than
|
||
# 128 bytes in points *not* represented in the above table.
|
||
#
|
||
# Looking at the results for 8-KB buffer.
|
||
#
|
||
# CFB and OFB results are far from the limit, because implementation
|
||
# uses "generic" CRYPTO_[c|o]fb128_encrypt interfaces relying on
|
||
# single-block aesni_encrypt, which is not the most optimal way to go.
|
||
# CBC encrypt result is unexpectedly high and there is no documented
|
||
# explanation for it. Seemingly there is a small penalty for feeding
|
||
# the result back to AES unit the way it's done in CBC mode. There is
|
||
# nothing one can do and the result appears optimal. CCM result is
|
||
# identical to CBC, because CBC-MAC is essentially CBC encrypt without
|
||
# saving output. CCM CTR "stays invisible," because it's neatly
|
||
# interleaved with CBC-MAC. This provides ~30% improvement over
|
||
# "straightforward" CCM implementation with CTR and CBC-MAC performed
|
||
# disjointly. Parallelizable modes practically achieve the theoretical
|
||
# limit.
|
||
#
|
||
# Looking at how results vary with buffer size.
|
||
#
|
||
# Curves are practically saturated at 1-KB buffer size. In most cases
|
||
# "256-byte" performance is >95%, and "64-byte" is ~90% of "8-KB" one.
|
||
# CTR curve doesn't follow this pattern and is "slowest" changing one
|
||
# with "256-byte" result being 87% of "8-KB." This is because overhead
|
||
# in CTR mode is most computationally intensive. Small-block CCM
|
||
# decrypt is slower than encrypt, because first CTR and last CBC-MAC
|
||
# iterations can't be interleaved.
|
||
#
|
||
# Results for 192- and 256-bit keys.
|
||
#
|
||
# EVP-free results were observed to scale perfectly with number of
|
||
# rounds for larger block sizes, i.e. 192-bit result being 10/12 times
|
||
# lower and 256-bit one - 10/14. Well, in CBC encrypt case differences
|
||
# are a tad smaller, because the above mentioned penalty biases all
|
||
# results by same constant value. In similar way function call
|
||
# overhead affects small-block performance, as well as OFB and CFB
|
||
# results. Differences are not large, most common coefficients are
|
||
# 10/11.7 and 10/13.4 (as opposite to 10/12.0 and 10/14.0), but one
|
||
# can observe even 10/11.2 and 10/12.4 (CTR, OFB, CFB)...
|
||
|
||
# January 2011
|
||
#
|
||
# While Westmere processor features 6 cycles latency for aes[enc|dec]
|
||
# instructions, which can be scheduled every second cycle, Sandy
|
||
# Bridge spends 8 cycles per instruction, but it can schedule them
|
||
# every cycle. This means that code targeting Westmere would perform
|
||
# suboptimally on Sandy Bridge. Therefore this update.
|
||
#
|
||
# In addition, non-parallelizable CBC encrypt (as well as CCM) is
|
||
# optimized. Relative improvement might appear modest, 8% on Westmere,
|
||
# but in absolute terms it's 3.77 cycles per byte encrypted with
|
||
# 128-bit key on Westmere, and 5.07 - on Sandy Bridge. These numbers
|
||
# should be compared to asymptotic limits of 3.75 for Westmere and
|
||
# 5.00 for Sandy Bridge. Actually, the fact that they get this close
|
||
# to asymptotic limits is quite amazing. Indeed, the limit is
|
||
# calculated as latency times number of rounds, 10 for 128-bit key,
|
||
# and divided by 16, the number of bytes in block, or in other words
|
||
# it accounts *solely* for aesenc instructions. But there are extra
|
||
# instructions, and numbers so close to the asymptotic limits mean
|
||
# that it's as if it takes as little as *one* additional cycle to
|
||
# execute all of them. How is it possible? It is possible thanks to
|
||
# out-of-order execution logic, which manages to overlap post-
|
||
# processing of previous block, things like saving the output, with
|
||
# actual encryption of current block, as well as pre-processing of
|
||
# current block, things like fetching input and xor-ing it with
|
||
# 0-round element of the key schedule, with actual encryption of
|
||
# previous block. Keep this in mind...
|
||
#
|
||
# For parallelizable modes, such as ECB, CBC decrypt, CTR, higher
|
||
# performance is achieved by interleaving instructions working on
|
||
# independent blocks. In which case asymptotic limit for such modes
|
||
# can be obtained by dividing above mentioned numbers by AES
|
||
# instructions' interleave factor. Westmere can execute at most 3
|
||
# instructions at a time, meaning that optimal interleave factor is 3,
|
||
# and that's where the "magic" number of 1.25 comes from. "Optimal
|
||
# interleave factor" means that increase of interleave factor does
|
||
# not improve performance. The formula has proven to reflect reality
|
||
# pretty well on Westmere... Sandy Bridge on the other hand can
|
||
# execute up to 8 AES instructions at a time, so how does varying
|
||
# interleave factor affect the performance? Here is table for ECB
|
||
# (numbers are cycles per byte processed with 128-bit key):
|
||
#
|
||
# instruction interleave factor 3x 6x 8x
|
||
# theoretical asymptotic limit 1.67 0.83 0.625
|
||
# measured performance for 8KB block 1.05 0.86 0.84
|
||
#
|
||
# "as if" interleave factor 4.7x 5.8x 6.0x
|
||
#
|
||
# Further data for other parallelizable modes:
|
||
#
|
||
# CBC decrypt 1.16 0.93 0.74
|
||
# CTR 1.14 0.91 0.74
|
||
#
|
||
# Well, given 3x column it's probably inappropriate to call the limit
|
||
# asymptotic, if it can be surpassed, isn't it? What happens there?
|
||
# Rewind to CBC paragraph for the answer. Yes, out-of-order execution
|
||
# magic is responsible for this. Processor overlaps not only the
|
||
# additional instructions with AES ones, but even AES instructions
|
||
# processing adjacent triplets of independent blocks. In the 6x case
|
||
# additional instructions still claim disproportionally small amount
|
||
# of additional cycles, but in 8x case number of instructions must be
|
||
# a tad too high for out-of-order logic to cope with, and AES unit
|
||
# remains underutilized... As you can see 8x interleave is hardly
|
||
# justifiable, so there is no need to feel bad that 32-bit aesni-x86.pl
|
||
# utilizes 6x interleave because of limited register bank capacity.
|
||
#
|
||
# Higher interleave factors do have negative impact on Westmere
|
||
# performance. While for ECB mode it's negligible ~1.5%, other
|
||
# parallelizables perform ~5% worse, which is outweighed by ~25%
|
||
# improvement on Sandy Bridge. To balance regression on Westmere
|
||
# CTR mode was implemented with 6x aesenc interleave factor.
|
||
|
||
# April 2011
|
||
#
|
||
# Add aesni_xts_[en|de]crypt. Westmere spends 1.25 cycles processing
|
||
# one byte out of 8KB with 128-bit key, Sandy Bridge - 0.90. Just like
|
||
# in CTR mode AES instruction interleave factor was chosen to be 6x.
|
||
|
||
######################################################################
|
||
# Current large-block performance in cycles per byte processed with
|
||
# 128-bit key (less is better).
|
||
#
|
||
# CBC en-/decrypt CTR XTS ECB
|
||
# Westmere 3.77/1.25 1.25 1.25 1.26
|
||
# * Bridge 5.07/0.74 0.75 0.90 0.85
|
||
# Haswell 4.44/0.63 0.63 0.73 0.63
|
||
# Silvermont 5.75/3.54 3.56 4.12 3.87(*)
|
||
# Bulldozer 5.77/0.70 0.72 0.90 0.70
|
||
#
|
||
# (*) Atom Silvermont ECB result is suboptimal because of penalties
|
||
# incurred by operations on %xmm8-15. As ECB is not considered
|
||
# critical, nothing was done to mitigate the problem.
|
||
|
||
$PREFIX="aesni";	# if $PREFIX is set to "AES", the script
			# generates drop-in replacement for
			# crypto/aes/asm/aes-x86_64.pl:-)

# Command line: the first argument is the assembler "flavour", the second
# is the output file name.  If the first argument contains a dot it is
# taken to be the output file name and no flavour is assumed.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 conventions are selected either by flavour (nasm/masm/mingw64)
# or by an output file ending in ".asm".
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Locate x86_64-xlate.pl: first next to this script, then under
# ../../perlasm/ relative to this script's directory.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the
# translator, which is handed the flavour and the output file name.
# Fail loudly if the translator cannot be started instead of silently
# producing no output.
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""
    or die "can't call $xlate: $!";
*STDOUT=*OUT;
|
||
|
||
# Instruction used to load round keys.  NOTE(review): both branches of
# the conditional currently yield "movups"; the selection is kept as-is.
$movkey = $PREFIX eq "aesni" ? "movups" : "movups";

# First four integer argument registers for the target ABI.
@_4args=$win64?	("%rcx","%rdx","%r8", "%r9") :	# Win64 order
		("%rdi","%rsi","%rdx","%rcx");	# Unix order

# $code accumulates the whole assembly text produced by this script.
$code=".text\n";
$code.=".extern	OPENSSL_ia32cap_P\n";

$rounds="%eax";	# input to and changed by aesni_[en|de]cryptN !!!
# this is natural Unix argument order for public $PREFIX_[ecb|cbc]_encrypt ...
$inp="%rdi";
$out="%rsi";
$len="%rdx";
$key="%rcx";	# input to and changed by aesni_[en|de]cryptN !!!
$ivp="%r8";	# cbc, ctr, ...

$rnds_="%r10d";	# backup copy for $rounds
$key_="%r11";	# backup copy for $key

# %xmm register layout
$rndkey0="%xmm0";	$rndkey1="%xmm1";
$inout0="%xmm2";	$inout1="%xmm3";
$inout2="%xmm4";	$inout3="%xmm5";
$inout4="%xmm6";	$inout5="%xmm7";
$inout6="%xmm8";	$inout7="%xmm9";

# Note that these alias $inout4..$inout7 (same physical registers).
$in2="%xmm6";		$in1="%xmm7";	# used in CBC decrypt, CTR, ...
$in0="%xmm8";		$iv="%xmm9";
|
||
|
||
# Inline version of internal aesni_[en|de]crypt1.
|
||
#
|
||
# Why folded loop? Because aes[enc|dec] is slow enough to accommodate
|
||
# cycles which take care of loop variables...
|
||
# aesni_generate1($p, $key, $rounds [, $inout [, $ivec]])
#
# Appends to $code an inline single-block AES sequence.
#   $p      - "enc" or "dec"; selects aes${p} / aes${p}last.
#   $key    - register pointing at the key schedule (advanced by the code).
#   $rounds - register holding the round count (decremented to zero).
#   $inout  - xmm register holding the block; defaults to $inout0.
#   $ivec   - optional xmm register; when given, round key 0 is xor-ed
#             into $ivec and $ivec is then xor-ed into $inout, instead
#             of xor-ing round key 0 directly into $inout.
#
# $sn is a per-script counter that keeps the emitted loop label unique
# for every expansion.
{ my $sn;
sub aesni_generate1 {
my ($p,$key,$rounds,$inout,$ivec)=@_;	$inout=$inout0 if (!defined($inout));
++$sn;
# load round keys 0 and 1
$code.=<<___;
	$movkey	($key),$rndkey0
	$movkey	16($key),$rndkey1
___
# whitening, with the optional extra vector folded in
$code.=<<___ if (defined($ivec));
	xorps	$rndkey0,$ivec
	lea	32($key),$key
	xorps	$ivec,$inout
___
$code.=<<___ if (!defined($ivec));
	lea	32($key),$key
	xorps	$rndkey0,$inout
___
# one round per iteration; "dec" sets the flags tested by "jnz"
$code.=<<___;
.Loop_${p}1_$sn:
	aes${p}	$rndkey1,$inout
	dec	$rounds
	$movkey	($key),$rndkey1
	lea	16($key),$key
	jnz	.Loop_${p}1_$sn	# loop body is 16 bytes
	aes${p}last	$rndkey1,$inout
___
}}
|
||
# void $PREFIX_[en|de]crypt (const void *inp,void *out,const AES_KEY *key);
#
# Emits the two public single-block entry points.  The arguments are
# taken from the ABI-specific register list in @_4args, so the emitted
# functions are declared "abi-omnipotent".  The round count is read from
# offset 240 of the key structure.  Before returning, both round-key
# registers and the data register are zeroed.
{ my ($inp,$out,$key) = @_4args;

$code.=<<___;
.globl	${PREFIX}_encrypt
.type	${PREFIX}_encrypt,\@abi-omnipotent
.align	16
${PREFIX}_encrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	 pxor	$rndkey0,$rndkey0	# clear register bank
	 pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	 pxor	$inout0,$inout0
	ret
.size	${PREFIX}_encrypt,.-${PREFIX}_encrypt

.globl	${PREFIX}_decrypt
.type	${PREFIX}_decrypt,\@abi-omnipotent
.align	16
${PREFIX}_decrypt:
	movups	($inp),$inout0		# load input
	mov	240($key),$rounds	# key->rounds
___
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	 pxor	$rndkey0,$rndkey0	# clear register bank
	 pxor	$rndkey1,$rndkey1
	movups	$inout0,($out)		# output
	 pxor	$inout0,$inout0
	ret
.size	${PREFIX}_decrypt, .-${PREFIX}_decrypt
___
}
|
||
|
||
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
|
||
# factor. Why were 3x subroutines originally used in loops? Even though
|
||
# aes[enc|dec] latency was originally 6, it could be scheduled only
|
||
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
|
||
# utilization, i.e. when subroutine's throughput is virtually same as
|
||
# of non-interleaved subroutine [for number of input blocks up to 3].
|
||
# This is why it originally made no sense to implement 2x subroutine.
|
||
# But times change and it became appropriate to spend extra 192 bytes
|
||
# on 2x subroutine on Atom Silvermont account. For processors that
|
||
# can schedule aes[enc|dec] every cycle optimal interleave factor
|
||
# equals to corresponding instructions latency. 8x is optimal for
|
||
# * Bridge and "super-optimal" for other Intel CPUs...
|
||
|
||
# aesni_generate2($dir)
#
# Appends the private 2-block subroutine _aesni_${dir}rypt2 to $code,
# where $dir is "enc" or "dec".
#
# The emitted routine scales $rounds by 16 (shl 4), points $key past the
# schedule (32+$key+16*rounds) and then walks the round keys with a
# negative offset in %rax that counts up in steps of 32.  Two rounds are
# processed per iteration, alternating $rndkey1 and $rndkey0, and the
# loop exits when the "add" brings %rax to zero.
sub aesni_generate2 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-1] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt2,\@abi-omnipotent
.align	16
_aesni_${dir}rypt2:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop2:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop2

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	ret
.size	_aesni_${dir}rypt2,.-_aesni_${dir}rypt2
___
}
|
||
# aesni_generate3($dir)
#
# Appends the private 3-block subroutine _aesni_${dir}rypt3 to $code,
# where $dir is "enc" or "dec".  The structure is the same as the
# 2-block variant: round keys are fetched through a negative %rax offset
# that is stepped by 32, two rounds per loop iteration, with $inout0..2
# processed in lock-step.
sub aesni_generate3 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-2] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt3,\@abi-omnipotent
.align	16
_aesni_${dir}rypt3:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	add	\$16,%rax

.L${dir}_loop3:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop3

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	ret
.size	_aesni_${dir}rypt3,.-_aesni_${dir}rypt3
___
}
|
||
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
#
# aesni_generate4($dir)
#
# Appends the private 4-block subroutine _aesni_${dir}rypt4 to $code,
# where $dir is "enc" or "dec".  Same key-walking scheme as the 2x/3x
# variants (negative %rax offset stepped by 32, two rounds per loop
# iteration).  The ".byte 0x0f,0x1f,0x00" in the prologue is a raw
# 3-byte instruction encoding (a NOP form); NOTE(review): presumably it
# is there to pad the loop entry - confirm before touching it.
sub aesni_generate4 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-3] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt4,\@abi-omnipotent
.align	16
_aesni_${dir}rypt4:
	$movkey	($key),$rndkey0
	shl	\$4,$rounds
	$movkey	16($key),$rndkey1
	xorps	$rndkey0,$inout0
	xorps	$rndkey0,$inout1
	xorps	$rndkey0,$inout2
	xorps	$rndkey0,$inout3
	$movkey	32($key),$rndkey0
	lea	32($key,$rounds),$key
	neg	%rax				# $rounds
	.byte	0x0f,0x1f,0x00
	add	\$16,%rax

.L${dir}_loop4:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop4

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	ret
.size	_aesni_${dir}rypt4,.-_aesni_${dir}rypt4
___
}
|
||
# aesni_generate6($dir)
#
# Appends the private 6-block subroutine _aesni_${dir}rypt6 to $code,
# where $dir is "enc" or "dec".
#
# Unlike the 2x/3x/4x variants, the prologue interleaves the round-0
# whitening of $inout1..5 with the first aes${dir} round on $inout0..2,
# and then jumps into the middle of the loop (.L${dir}_loop6_enter) so
# that the first round is completed for $inout3..5 there.  Key walking
# is the same negative-%rax scheme, stepped by 32 per iteration.
sub aesni_generate6 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-5] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt6,\@abi-omnipotent
.align	16
_aesni_${dir}rypt6:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	pxor		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	aes${dir}	$rndkey1,$inout0
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	aes${dir}	$rndkey1,$inout2
	pxor		$rndkey0,$inout5
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop6_enter
.align	16
.L${dir}_loop6:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
.L${dir}_loop6_enter:
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop6

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	ret
.size	_aesni_${dir}rypt6,.-_aesni_${dir}rypt6
___
}
|
||
# aesni_generate8($dir)
#
# Appends the private 8-block subroutine _aesni_${dir}rypt8 to $code,
# where $dir is "enc" or "dec".
#
# The prologue whitens $inout0..7 with round key 0 while already issuing
# the first aes${dir} round on $inout0 and $inout1, then jumps to
# .L${dir}_loop8_inner where that first round is completed for
# $inout2..7.  The label .L${dir}_loop8_enter is also emitted inside the
# loop body; it is not targeted from within this routine.  Key walking
# uses the negative-%rax scheme, stepped by 32 per iteration.
sub aesni_generate8 {
my $dir=shift;
# As already mentioned it takes in $key and $rounds, which are *not*
# preserved. $inout[0-7] is cipher/clear text...
$code.=<<___;
.type	_aesni_${dir}rypt8,\@abi-omnipotent
.align	16
_aesni_${dir}rypt8:
	$movkey		($key),$rndkey0
	shl		\$4,$rounds
	$movkey		16($key),$rndkey1
	xorps		$rndkey0,$inout0
	xorps		$rndkey0,$inout1
	pxor		$rndkey0,$inout2
	pxor		$rndkey0,$inout3
	pxor		$rndkey0,$inout4
	lea		32($key,$rounds),$key
	neg		%rax			# $rounds
	aes${dir}	$rndkey1,$inout0
	pxor		$rndkey0,$inout5
	pxor		$rndkey0,$inout6
	aes${dir}	$rndkey1,$inout1
	pxor		$rndkey0,$inout7
	$movkey		($key,%rax),$rndkey0
	add		\$16,%rax
	jmp		.L${dir}_loop8_inner
.align	16
.L${dir}_loop8:
	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
.L${dir}_loop8_inner:
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
.L${dir}_loop8_enter:
	$movkey		($key,%rax),$rndkey1
	add		\$32,%rax
	aes${dir}	$rndkey0,$inout0
	aes${dir}	$rndkey0,$inout1
	aes${dir}	$rndkey0,$inout2
	aes${dir}	$rndkey0,$inout3
	aes${dir}	$rndkey0,$inout4
	aes${dir}	$rndkey0,$inout5
	aes${dir}	$rndkey0,$inout6
	aes${dir}	$rndkey0,$inout7
	$movkey		-16($key,%rax),$rndkey0
	jnz		.L${dir}_loop8

	aes${dir}	$rndkey1,$inout0
	aes${dir}	$rndkey1,$inout1
	aes${dir}	$rndkey1,$inout2
	aes${dir}	$rndkey1,$inout3
	aes${dir}	$rndkey1,$inout4
	aes${dir}	$rndkey1,$inout5
	aes${dir}	$rndkey1,$inout6
	aes${dir}	$rndkey1,$inout7
	aes${dir}last	$rndkey0,$inout0
	aes${dir}last	$rndkey0,$inout1
	aes${dir}last	$rndkey0,$inout2
	aes${dir}last	$rndkey0,$inout3
	aes${dir}last	$rndkey0,$inout4
	aes${dir}last	$rndkey0,$inout5
	aes${dir}last	$rndkey0,$inout6
	aes${dir}last	$rndkey0,$inout7
	ret
.size	_aesni_${dir}rypt8,.-_aesni_${dir}rypt8
___
}
|
||
# Emit the private N-block helpers.  The decrypt variants are always
# generated; the encrypt variants only when the script runs under its
# native "aesni" prefix.
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
&aesni_generate8("enc") if ($PREFIX eq "aesni");
&aesni_generate8("dec");
|
||
|
||
if ($PREFIX eq "aesni") {
|
||
########################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
#			  size_t length, const AES_KEY *key,
#			  int enc);
#
# Emits the public ECB routine.  Outline of the generated code:
#   * $len is rounded down to a multiple of 16; nothing is done if the
#     result is zero.
#   * The 5th argument (tested in %r8d) selects encryption (non-zero)
#     or decryption (zero).
#   * Inputs of at least 8 blocks run through an 8-blocks-per-iteration
#     loop around _aesni_[en|de]crypt8; because those helpers modify
#     $key and $rounds, both are restored from $key_ / $rnds_ each time.
#   * The remaining 1..7 blocks are dispatched to the 1x (inline), 2x,
#     3x, 4x, 6x or 8x helper; for 5 and 7 blocks the unused last
#     register is zeroed and its result is not stored.
#   * The decrypt paths additionally zero the data registers after
#     storing; both paths zero the round-key registers at .Lecb_ret.
$code.=<<___;
.globl	aesni_ecb_encrypt
.type	aesni_ecb_encrypt,\@function,5
.align	16
aesni_ecb_encrypt:
___
# Win64 only: reserve 0x58 bytes of stack and save %xmm6-%xmm9 there.
$code.=<<___ if ($win64);
	lea	-0x58(%rsp),%rsp
	movaps	%xmm6,(%rsp)		# offload $inout4..7
	movaps	%xmm7,0x10(%rsp)
	movaps	%xmm8,0x20(%rsp)
	movaps	%xmm9,0x30(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
	and	\$-16,$len		# if ($len<16)
	jz	.Lecb_ret		# return

	mov	240($key),$rounds	# key->rounds
	$movkey	($key),$rndkey0
	mov	$key,$key_		# backup $key
	mov	$rounds,$rnds_		# backup $rounds
	test	%r8d,%r8d		# 5th argument
	jz	.Lecb_decrypt
#--------------------------- ECB ENCRYPT ------------------------------#
	cmp	\$0x80,$len		# if ($len<8*16)
	jb	.Lecb_enc_tail		# short input

	movdqu	($inp),$inout0		# load 8 input blocks
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
	sub	\$0x80,$len		# $len-=8*16 (can be zero)
	jmp	.Lecb_enc_loop8_enter
.align 16
.Lecb_enc_loop8:
	movups	$inout0,($out)		# store 8 output blocks
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0		# load 8 input blocks
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
.Lecb_enc_loop8_enter:

	call	_aesni_encrypt8

	sub	\$0x80,$len
	jnc	.Lecb_enc_loop8		# loop if $len-=8*16 didn't borrow

	movups	$inout0,($out)		# store 8 output blocks
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16
	add	\$0x80,$len		# restore real remaining $len
	jz	.Lecb_ret		# done if ($len==0)

.Lecb_enc_tail:				# $len is less than 8*16
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_enc_one
	movups	0x10($inp),$inout1
	je	.Lecb_enc_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_enc_three
	movups	0x30($inp),$inout3
	je	.Lecb_enc_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_enc_five
	movups	0x50($inp),$inout5
	je	.Lecb_enc_six
	movdqu	0x60($inp),$inout6
	xorps	$inout7,$inout7
	call	_aesni_encrypt8
	movups	$inout0,($out)		# store 7 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	movups	$inout6,0x60($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_one:
___
	# single remaining block: inline 1x sequence
	&aesni_generate1("enc",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# store one output block
	jmp	.Lecb_ret
.align	16
.Lecb_enc_two:
	call	_aesni_encrypt2
	movups	$inout0,($out)		# store 2 output blocks
	movups	$inout1,0x10($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_three:
	call	_aesni_encrypt3
	movups	$inout0,($out)		# store 3 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_four:
	call	_aesni_encrypt4
	movups	$inout0,($out)		# store 4 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_five:
	xorps	$inout5,$inout5
	call	_aesni_encrypt6
	movups	$inout0,($out)		# store 5 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	jmp	.Lecb_ret
.align	16
.Lecb_enc_six:
	call	_aesni_encrypt6
	movups	$inout0,($out)		# store 6 output blocks
	movups	$inout1,0x10($out)
	movups	$inout2,0x20($out)
	movups	$inout3,0x30($out)
	movups	$inout4,0x40($out)
	movups	$inout5,0x50($out)
	jmp	.Lecb_ret
#--------------------------- ECB DECRYPT ------------------------------#
.align	16
.Lecb_decrypt:
	cmp	\$0x80,$len		# if ($len<8*16)
	jb	.Lecb_dec_tail		# short input

	movdqu	($inp),$inout0		# load 8 input blocks
	movdqu	0x10($inp),$inout1
	movdqu	0x20($inp),$inout2
	movdqu	0x30($inp),$inout3
	movdqu	0x40($inp),$inout4
	movdqu	0x50($inp),$inout5
	movdqu	0x60($inp),$inout6
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
	sub	\$0x80,$len		# $len-=8*16 (can be zero)
	jmp	.Lecb_dec_loop8_enter
.align 16
.Lecb_dec_loop8:
	movups	$inout0,($out)		# store 8 output blocks
	mov	$key_,$key		# restore $key
	movdqu	($inp),$inout0		# load 8 input blocks
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout1,0x10($out)
	movdqu	0x10($inp),$inout1
	movups	$inout2,0x20($out)
	movdqu	0x20($inp),$inout2
	movups	$inout3,0x30($out)
	movdqu	0x30($inp),$inout3
	movups	$inout4,0x40($out)
	movdqu	0x40($inp),$inout4
	movups	$inout5,0x50($out)
	movdqu	0x50($inp),$inout5
	movups	$inout6,0x60($out)
	movdqu	0x60($inp),$inout6
	movups	$inout7,0x70($out)
	lea	0x80($out),$out		# $out+=8*16
	movdqu	0x70($inp),$inout7
	lea	0x80($inp),$inp		# $inp+=8*16
.Lecb_dec_loop8_enter:

	call	_aesni_decrypt8

	$movkey	($key_),$rndkey0
	sub	\$0x80,$len
	jnc	.Lecb_dec_loop8		# loop if $len-=8*16 didn't borrow

	movups	$inout0,($out)		# store 8 output blocks
	 pxor	$inout0,$inout0		# clear register bank
	mov	$key_,$key		# restore $key
	movups	$inout1,0x10($out)
	 pxor	$inout1,$inout1
	mov	$rnds_,$rounds		# restore $rounds
	movups	$inout2,0x20($out)
	 pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	 pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	 pxor	$inout4,$inout4
	movups	$inout5,0x50($out)
	 pxor	$inout5,$inout5
	movups	$inout6,0x60($out)
	 pxor	$inout6,$inout6
	movups	$inout7,0x70($out)
	 pxor	$inout7,$inout7
	lea	0x80($out),$out		# $out+=8*16
	add	\$0x80,$len		# restore real remaining $len
	jz	.Lecb_ret		# done if ($len==0)

.Lecb_dec_tail:
	movups	($inp),$inout0
	cmp	\$0x20,$len
	jb	.Lecb_dec_one
	movups	0x10($inp),$inout1
	je	.Lecb_dec_two
	movups	0x20($inp),$inout2
	cmp	\$0x40,$len
	jb	.Lecb_dec_three
	movups	0x30($inp),$inout3
	je	.Lecb_dec_four
	movups	0x40($inp),$inout4
	cmp	\$0x60,$len
	jb	.Lecb_dec_five
	movups	0x50($inp),$inout5
	je	.Lecb_dec_six
	movups	0x60($inp),$inout6
	$movkey	($key),$rndkey0
	xorps	$inout7,$inout7
	call	_aesni_decrypt8
	movups	$inout0,($out)		# store 7 output blocks
	 pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	 pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	 pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	 pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	 pxor	$inout4,$inout4
	movups	$inout5,0x50($out)
	 pxor	$inout5,$inout5
	movups	$inout6,0x60($out)
	 pxor	$inout6,$inout6
	 pxor	$inout7,$inout7
	jmp	.Lecb_ret
.align	16
.Lecb_dec_one:
___
	# single remaining block: inline 1x sequence
	&aesni_generate1("dec",$key,$rounds);
$code.=<<___;
	movups	$inout0,($out)		# store one output block
	 pxor	$inout0,$inout0		# clear register bank
	jmp	.Lecb_ret
.align	16
.Lecb_dec_two:
	call	_aesni_decrypt2
	movups	$inout0,($out)		# store 2 output blocks
	 pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	 pxor	$inout1,$inout1
	jmp	.Lecb_ret
.align	16
.Lecb_dec_three:
	call	_aesni_decrypt3
	movups	$inout0,($out)		# store 3 output blocks
	 pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	 pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	 pxor	$inout2,$inout2
	jmp	.Lecb_ret
.align	16
.Lecb_dec_four:
	call	_aesni_decrypt4
	movups	$inout0,($out)		# store 4 output blocks
	 pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	 pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	 pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	 pxor	$inout3,$inout3
	jmp	.Lecb_ret
.align	16
.Lecb_dec_five:
	xorps	$inout5,$inout5
	call	_aesni_decrypt6
	movups	$inout0,($out)		# store 5 output blocks
	 pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	 pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	 pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	 pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	 pxor	$inout4,$inout4
	 pxor	$inout5,$inout5
	jmp	.Lecb_ret
.align	16
.Lecb_dec_six:
	call	_aesni_decrypt6
	movups	$inout0,($out)		# store 6 output blocks
	 pxor	$inout0,$inout0		# clear register bank
	movups	$inout1,0x10($out)
	 pxor	$inout1,$inout1
	movups	$inout2,0x20($out)
	 pxor	$inout2,$inout2
	movups	$inout3,0x30($out)
	 pxor	$inout3,$inout3
	movups	$inout4,0x40($out)
	 pxor	$inout4,$inout4
	movups	$inout5,0x50($out)
	 pxor	$inout5,$inout5

.Lecb_ret:
	xorps	$rndkey0,$rndkey0	# %xmm0
	pxor	$rndkey1,$rndkey1
___
# Win64 only: restore %xmm6-%xmm9 and overwrite each save slot with the
# just-zeroed %xmm0 before releasing the stack area.
$code.=<<___ if ($win64);
	movaps	(%rsp),%xmm6
	movaps	%xmm0,(%rsp)		# clear stack
	movaps	0x10(%rsp),%xmm7
	movaps	%xmm0,0x10(%rsp)
	movaps	0x20(%rsp),%xmm8
	movaps	%xmm0,0x20(%rsp)
	movaps	0x30(%rsp),%xmm9
	movaps	%xmm0,0x30(%rsp)
	lea	0x58(%rsp),%rsp
.Lecb_enc_ret:
___
$code.=<<___;
	ret
.size	aesni_ecb_encrypt,.-aesni_ecb_encrypt
___
|
||
|
||
{
|
||
######################################################################
|
||
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
|
||
# size_t blocks, const AES_KEY *key,
|
||
# const char *ivec,char *cmac);
|
||
#
|
||
# Handles only complete blocks, operates on 64-bit counter and
|
||
# does not update *ivec! Nor does it finalize CMAC value
|
||
# (see engine/eng_aesni.c for details)
|
||
#
|
||
{
|
||
# Register assignments shared by aesni_ccm64_encrypt_blocks and
# aesni_ccm64_decrypt_blocks below.
my $cmac="%r9"; # 6th argument
|
||
|
||
# Holds the .Lincrement64 constant; added to $iv with paddq once per block.
my $increment="%xmm9";
|
||
# Counter block: loaded from ($ivp) and shuffled with $bswap_mask so that the
# paddq with $increment can advance it.
my $iv="%xmm6";
|
||
# Holds the .Lbswap_mask constant used by the pshufb on the counter block.
my $bswap_mask="%xmm7";
|
||
|
||
# Symbol header: export aesni_ccm64_encrypt_blocks as a 16-byte-aligned
# function; the ",6" matches the six parameters in the prototype comment above.
$code.=<<___;
|
||
.globl aesni_ccm64_encrypt_blocks
|
||
.type aesni_ccm64_encrypt_blocks,\@function,6
|
||
.align 16
|
||
aesni_ccm64_encrypt_blocks:
|
||
___
|
||
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9, which this
# function uses for $iv, $bswap_mask, $in0 and $increment and which that ABI
# expects a callee to preserve.
$code.=<<___ if ($win64);
|
||
lea -0x58(%rsp),%rsp
|
||
movaps %xmm6,(%rsp) # $iv
|
||
movaps %xmm7,0x10(%rsp) # $bswap_mask
|
||
movaps %xmm8,0x20(%rsp) # $in0
|
||
movaps %xmm9,0x30(%rsp) # $increment
|
||
.Lccm64_enc_body:
|
||
___
|
||
# Function body.  As the inline comments show, $inout0 carries the counter
# block and $inout1 the running CMAC (loaded from ($cmac)).  Each pass of
# .Lccm64_enc_outer folds one input block into the CMAC, runs both values
# through the same AES rounds (.Lccm64_enc2_loop), advances the counter with
# paddq and writes inp ^ E(counter) to ($out).  On exit the MAC is stored back
# to ($cmac) and the working xmm registers are zeroed.
$code.=<<___;
|
||
mov 240($key),$rounds # key->rounds
|
||
movdqu ($ivp),$iv
|
||
movdqa .Lincrement64(%rip),$increment
|
||
movdqa .Lbswap_mask(%rip),$bswap_mask
|
||
|
||
shl \$4,$rounds
|
||
mov \$16,$rnds_
|
||
lea 0($key),$key_
|
||
movdqu ($cmac),$inout1
|
||
movdqa $iv,$inout0
|
||
lea 32($key,$rounds),$key # end of key schedule
|
||
pshufb $bswap_mask,$iv
|
||
sub %rax,%r10 # twisted $rounds
|
||
jmp .Lccm64_enc_outer
|
||
.align 16
|
||
.Lccm64_enc_outer:
|
||
$movkey ($key_),$rndkey0
|
||
mov %r10,%rax
|
||
movups ($inp),$in0 # load inp
|
||
|
||
xorps $rndkey0,$inout0 # counter
|
||
$movkey 16($key_),$rndkey1
|
||
xorps $in0,$rndkey0
|
||
xorps $rndkey0,$inout1 # cmac^=inp
|
||
$movkey 32($key_),$rndkey0
|
||
|
||
.Lccm64_enc2_loop:
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
$movkey ($key,%rax),$rndkey1
|
||
add \$32,%rax
|
||
aesenc $rndkey0,$inout0
|
||
aesenc $rndkey0,$inout1
|
||
$movkey -16($key,%rax),$rndkey0
|
||
jnz .Lccm64_enc2_loop
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
paddq $increment,$iv
|
||
dec $len # $len-- ($len is in blocks)
|
||
aesenclast $rndkey0,$inout0
|
||
aesenclast $rndkey0,$inout1
|
||
|
||
lea 16($inp),$inp
|
||
xorps $inout0,$in0 # inp ^= E(iv)
|
||
movdqa $iv,$inout0
|
||
movups $in0,($out) # save output
|
||
pshufb $bswap_mask,$inout0
|
||
lea 16($out),$out # $out+=16
|
||
jnz .Lccm64_enc_outer # loop if ($len!=0)
|
||
|
||
pxor $rndkey0,$rndkey0 # clear register bank
|
||
pxor $rndkey1,$rndkey1
|
||
pxor $inout0,$inout0
|
||
movups $inout1,($cmac) # store resulting mac
|
||
pxor $inout1,$inout1
|
||
pxor $in0,$in0
|
||
pxor $iv,$iv
|
||
___
|
||
# Win64 only: reload %xmm6-%xmm9 from the spill area, overwrite each slot with
# %xmm0 (the "clear stack" stores; $rndkey0 was zeroed just above), then
# release the 0x58-byte frame.
$code.=<<___ if ($win64);
|
||
movaps (%rsp),%xmm6
|
||
movaps %xmm0,(%rsp) # clear stack
|
||
movaps 0x10(%rsp),%xmm7
|
||
movaps %xmm0,0x10(%rsp)
|
||
movaps 0x20(%rsp),%xmm8
|
||
movaps %xmm0,0x20(%rsp)
|
||
movaps 0x30(%rsp),%xmm9
|
||
movaps %xmm0,0x30(%rsp)
|
||
lea 0x58(%rsp),%rsp
|
||
.Lccm64_enc_ret:
|
||
___
|
||
# Return and record the symbol size.
$code.=<<___;
|
||
ret
|
||
.size aesni_ccm64_encrypt_blocks,.-aesni_ccm64_encrypt_blocks
|
||
___
|
||
######################################################################
|
||
# Symbol header: export aesni_ccm64_decrypt_blocks as a 16-byte-aligned
# function; the ",6" matches the six parameters in the prototype comment above.
$code.=<<___;
|
||
.globl aesni_ccm64_decrypt_blocks
|
||
.type aesni_ccm64_decrypt_blocks,\@function,6
|
||
.align 16
|
||
aesni_ccm64_decrypt_blocks:
|
||
___
|
||
# Win64 only: reserve 0x58 bytes of stack and spill %xmm6-%xmm9, which this
# function uses for $iv, $bswap_mask, $in0 and $increment and which that ABI
# expects a callee to preserve.
$code.=<<___ if ($win64);
|
||
lea -0x58(%rsp),%rsp
|
||
movaps %xmm6,(%rsp) # $iv
|
||
movaps %xmm7,0x10(%rsp) # $bswap_mask
|
||
movaps %xmm8,0x20(%rsp) # $in0
|
||
movaps %xmm9,0x30(%rsp) # $increment
|
||
.Lccm64_dec_body:
|
||
___
|
||
# Setup: load the round count, the counter block and the current MAC, copy the
# counter into $inout0, and keep the original $rounds/$key in $rnds_/$key_.
$code.=<<___;
|
||
mov 240($key),$rounds # key->rounds
|
||
movups ($ivp),$iv
|
||
movdqu ($cmac),$inout1
|
||
movdqa .Lincrement64(%rip),$increment
|
||
movdqa .Lbswap_mask(%rip),$bswap_mask
|
||
|
||
movaps $iv,$inout0
|
||
mov $rounds,$rnds_
|
||
mov $key,$key_
|
||
pshufb $bswap_mask,$iv
|
||
___
|
||
# Encrypt the first counter block up front.  aesni_generate1 is defined
# elsewhere in this file; presumably it operates on $inout0, which the loop
# below xors into the input as E(iv) -- confirm against the helper.
&aesni_generate1("enc",$key,$rounds);
|
||
# Main loop.  Each pass of .Lccm64_dec_outer xors the pending keystream into
# the loaded input block and stores the result; unless that was the last
# block, it then runs the next counter block and (MAC ^ output) through the
# AES rounds together (.Lccm64_dec2_loop) while loading the next input block.
$code.=<<___;
|
||
shl \$4,$rnds_
|
||
mov \$16,$rounds
|
||
movups ($inp),$in0 # load inp
|
||
paddq $increment,$iv
|
||
lea 16($inp),$inp # $inp+=16
|
||
sub %r10,%rax # twisted $rounds
|
||
lea 32($key_,$rnds_),$key # end of key schedule
|
||
mov %rax,%r10
|
||
jmp .Lccm64_dec_outer
|
||
.align 16
|
||
.Lccm64_dec_outer:
|
||
xorps $inout0,$in0 # inp ^= E(iv)
|
||
movdqa $iv,$inout0
|
||
movups $in0,($out) # save output
|
||
lea 16($out),$out # $out+=16
|
||
pshufb $bswap_mask,$inout0
|
||
|
||
sub \$1,$len # $len-- ($len is in blocks)
|
||
jz .Lccm64_dec_break # if ($len==0) break
|
||
|
||
$movkey ($key_),$rndkey0
|
||
mov %r10,%rax
|
||
$movkey 16($key_),$rndkey1
|
||
xorps $rndkey0,$in0
|
||
xorps $rndkey0,$inout0
|
||
xorps $in0,$inout1 # cmac^=out
|
||
$movkey 32($key_),$rndkey0
|
||
jmp .Lccm64_dec2_loop
|
||
.align 16
|
||
.Lccm64_dec2_loop:
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
$movkey ($key,%rax),$rndkey1
|
||
add \$32,%rax
|
||
aesenc $rndkey0,$inout0
|
||
aesenc $rndkey0,$inout1
|
||
$movkey -16($key,%rax),$rndkey0
|
||
jnz .Lccm64_dec2_loop
|
||
movups ($inp),$in0 # load input
|
||
paddq $increment,$iv
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
aesenclast $rndkey0,$inout0
|
||
aesenclast $rndkey0,$inout1
|
||
lea 16($inp),$inp # $inp+=16
|
||
jmp .Lccm64_dec_outer
|
||
|
||
.align 16
|
||
.Lccm64_dec_break:
|
||
#xorps $in0,$inout1 # cmac^=out
|
||
mov 240($key_),$rounds
|
||
___
|
||
# Last block: its output has not been folded into the MAC yet (the xorps just
# above is commented out).  The extra $inout1,$in0 arguments presumably make
# aesni_generate1 absorb $in0 into $inout1 as part of the final MAC
# encryption -- confirm against the helper.
&aesni_generate1("enc",$key_,$rounds,$inout1,$in0);
|
||
# Store the MAC back to ($cmac) and zero the working xmm registers.
$code.=<<___;
|
||
pxor $rndkey0,$rndkey0 # clear register bank
|
||
pxor $rndkey1,$rndkey1
|
||
pxor $inout0,$inout0
|
||
movups $inout1,($cmac) # store resulting mac
|
||
pxor $inout1,$inout1
|
||
pxor $in0,$in0
|
||
pxor $iv,$iv
|
||
___
|
||
# Win64 only: reload %xmm6-%xmm9 from the spill area, overwrite each slot with
# %xmm0 (the "clear stack" stores; $rndkey0 was zeroed just above), then
# release the 0x58-byte frame.
$code.=<<___ if ($win64);
|
||
movaps (%rsp),%xmm6
|
||
movaps %xmm0,(%rsp) # clear stack
|
||
movaps 0x10(%rsp),%xmm7
|
||
movaps %xmm0,0x10(%rsp)
|
||
movaps 0x20(%rsp),%xmm8
|
||
movaps %xmm0,0x20(%rsp)
|
||
movaps 0x30(%rsp),%xmm9
|
||
movaps %xmm0,0x30(%rsp)
|
||
lea 0x58(%rsp),%rsp
|
||
.Lccm64_dec_ret:
|
||
___
|
||
# Return and record the symbol size.
$code.=<<___;
|
||
ret
|
||
.size aesni_ccm64_decrypt_blocks,.-aesni_ccm64_decrypt_blocks
|
||
___
|
||
}
|
||
######################################################################
|
||
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
|
||
# size_t blocks, const AES_KEY *key,
|
||
# const char *ivec);
|
||
#
|
||
# Handles only complete blocks, operates on 32-bit counter and
|
||
# does not update *ivec! (see crypto/modes/ctr128.c for details)
|
||
#
|
||
# Overhaul based on suggestions from Shay Gueron and Vlad Krasnov,
|
||
# http://rt.openssl.org/Ticket/Display.html?id=3021&user=guest&pass=guest.
|
||
# Keywords are full unroll and modulo-schedule counter calculations
|
||
# with zero-round key xor.
|
||
{
|
||
# Six additional xmm registers (%xmm10-%xmm15) used to hold input blocks.
my ($in0,$in1,$in2,$in3,$in4,$in5)=map("%xmm$_",(10..15));
|
||
# 32-bit ("d"-suffixed) names of the $key_ and $ivp registers: $key0 receives
# the last dword of the round-0 key and $ctr the last dword of the IV (the
# counter), so $ivp itself is no longer a pointer once $ctr has been loaded.
my ($key0,$ctr)=("${key_}d","${ivp}d");
|
||
# 0x80 bytes hold the eight 16-byte counter blocks kept on the stack; Win64
# adds 160 bytes (10 x 16) for saving %xmm6-%xmm15.
my $frame_size = 0x80 + ($win64?160:0);
|
||
|
||
# Symbol header and dispatch: a request for exactly one block takes the
# frame-less fast path below; everything else jumps to .Lctr32_bulk.
$code.=<<___;
|
||
.globl aesni_ctr32_encrypt_blocks
|
||
.type aesni_ctr32_encrypt_blocks,\@function,5
|
||
.align 16
|
||
aesni_ctr32_encrypt_blocks:
|
||
cmp \$1,$len
|
||
jne .Lctr32_bulk
|
||
|
||
# handle single block without allocating stack frame,
|
||
# useful when handling edges
|
||
movups ($ivp),$inout0
|
||
movups ($inp),$inout1
|
||
mov 240($key),%edx # key->rounds
|
||
___
|
||
# Emit a one-block encryption with the round count in %edx.  aesni_generate1
# is defined elsewhere in this file; the code that follows treats $inout0 as
# the encrypted counter block.
&aesni_generate1("enc",$key,"%edx");
|
||
# Finish the fast path (xor the keystream with the input block, store it, wipe
# the registers, jump straight to the final ret), then start the bulk path:
# keep the entry %rsp in %rax, push %rbp, carve out $frame_size bytes and
# align %rsp down to 16 bytes.
$code.=<<___;
|
||
pxor $rndkey0,$rndkey0 # clear register bank
|
||
pxor $rndkey1,$rndkey1
|
||
xorps $inout1,$inout0
|
||
pxor $inout1,$inout1
|
||
movups $inout0,($out)
|
||
xorps $inout0,$inout0
|
||
jmp .Lctr32_epilogue
|
||
|
||
.align 16
|
||
.Lctr32_bulk:
|
||
lea (%rsp),%rax
|
||
push %rbp
|
||
sub \$$frame_size,%rsp
|
||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||
___
|
||
# Win64 only: save %xmm6-%xmm15 in the 160 extra frame bytes, addressed
# relative to the entry stack pointer still held in %rax.
$code.=<<___ if ($win64);
|
||
movaps %xmm6,-0xa8(%rax) # offload everything
|
||
movaps %xmm7,-0x98(%rax)
|
||
movaps %xmm8,-0x88(%rax)
|
||
movaps %xmm9,-0x78(%rax)
|
||
movaps %xmm10,-0x68(%rax)
|
||
movaps %xmm11,-0x58(%rax)
|
||
movaps %xmm12,-0x48(%rax)
|
||
movaps %xmm13,-0x38(%rax)
|
||
movaps %xmm14,-0x28(%rax)
|
||
movaps %xmm15,-0x18(%rax)
|
||
.Lctr32_body:
|
||
___
# The heredoc that follows sets %rbp to %rax-8, fills the eight stack slots
# with counter blocks xor-ed with the round-0 key, and then selects between
# the 8-block loop, the 6-block loop (taken when the capability word shows
# MOVBE without XSAVE) and the short-input tail.
|
||
$code.=<<___;
|
||
lea -8(%rax),%rbp
|
||
|
||
# 8 16-byte words on top of stack are counter values
|
||
# xor-ed with zero-round key
|
||
|
||
movdqu ($ivp),$inout0
|
||
movdqu ($key),$rndkey0
|
||
mov 12($ivp),$ctr # counter LSB
|
||
pxor $rndkey0,$inout0
|
||
mov 12($key),$key0 # 0-round key LSB
|
||
movdqa $inout0,0x00(%rsp) # populate counter block
|
||
bswap $ctr
|
||
movdqa $inout0,$inout1
|
||
movdqa $inout0,$inout2
|
||
movdqa $inout0,$inout3
|
||
movdqa $inout0,0x40(%rsp)
|
||
movdqa $inout0,0x50(%rsp)
|
||
movdqa $inout0,0x60(%rsp)
|
||
mov %rdx,%r10 # about to borrow %rdx
|
||
movdqa $inout0,0x70(%rsp)
|
||
|
||
lea 1($ctr),%rax
|
||
lea 2($ctr),%rdx
|
||
bswap %eax
|
||
bswap %edx
|
||
xor $key0,%eax
|
||
xor $key0,%edx
|
||
pinsrd \$3,%eax,$inout1
|
||
lea 3($ctr),%rax
|
||
movdqa $inout1,0x10(%rsp)
|
||
pinsrd \$3,%edx,$inout2
|
||
bswap %eax
|
||
mov %r10,%rdx # restore %rdx
|
||
lea 4($ctr),%r10
|
||
movdqa $inout2,0x20(%rsp)
|
||
xor $key0,%eax
|
||
bswap %r10d
|
||
pinsrd \$3,%eax,$inout3
|
||
xor $key0,%r10d
|
||
movdqa $inout3,0x30(%rsp)
|
||
lea 5($ctr),%r9
|
||
mov %r10d,0x40+12(%rsp)
|
||
bswap %r9d
|
||
lea 6($ctr),%r10
|
||
mov 240($key),$rounds # key->rounds
|
||
xor $key0,%r9d
|
||
bswap %r10d
|
||
mov %r9d,0x50+12(%rsp)
|
||
xor $key0,%r10d
|
||
lea 7($ctr),%r9
|
||
mov %r10d,0x60+12(%rsp)
|
||
bswap %r9d
|
||
mov OPENSSL_ia32cap_P+4(%rip),%r10d
|
||
xor $key0,%r9d
|
||
and \$`1<<26|1<<22`,%r10d # isolate XSAVE+MOVBE
|
||
mov %r9d,0x70+12(%rsp)
|
||
|
||
$movkey 0x10($key),$rndkey1
|
||
|
||
movdqa 0x40(%rsp),$inout4
|
||
movdqa 0x50(%rsp),$inout5
|
||
|
||
cmp \$8,$len # $len is in blocks
|
||
jb .Lctr32_tail # short input if ($len<8)
|
||
|
||
sub \$6,$len # $len is biased by -6
|
||
cmp \$`1<<22`,%r10d # check for MOVBE without XSAVE
|
||
je .Lctr32_6x # [which denotes Atom Silvermont]
|
||
|
||
lea 0x80($key),$key # size optimization
|
||
sub \$2,$len # $len is biased by -8
|
||
jmp .Lctr32_loop8
|
||
|
||
.align 16
|
||
.Lctr32_6x:
|
||
shl \$4,$rounds
|
||
mov \$48,$rnds_
|
||
bswap $key0
|
||
lea 32($key,$rounds),$key # end of key schedule
|
||
sub %rax,%r10 # twisted $rounds
|
||
jmp .Lctr32_loop6
|
||
|
||
.align 16
|
||
.Lctr32_loop6:
|
||
add \$6,$ctr # next counter value
|
||
$movkey -48($key,$rnds_),$rndkey0
|
||
aesenc $rndkey1,$inout0
|
||
mov $ctr,%eax
|
||
xor $key0,%eax
|
||
aesenc $rndkey1,$inout1
|
||
movbe %eax,`0x00+12`(%rsp) # store next counter value
|
||
lea 1($ctr),%eax
|
||
aesenc $rndkey1,$inout2
|
||
xor $key0,%eax
|
||
movbe %eax,`0x10+12`(%rsp)
|
||
aesenc $rndkey1,$inout3
|
||
lea 2($ctr),%eax
|
||
xor $key0,%eax
|
||
aesenc $rndkey1,$inout4
|
||
movbe %eax,`0x20+12`(%rsp)
|
||
lea 3($ctr),%eax
|
||
aesenc $rndkey1,$inout5
|
||
$movkey -32($key,$rnds_),$rndkey1
|
||
xor $key0,%eax
|
||
|
||
aesenc $rndkey0,$inout0
|
||
movbe %eax,`0x30+12`(%rsp)
|
||
lea 4($ctr),%eax
|
||
aesenc $rndkey0,$inout1
|
||
xor $key0,%eax
|
||
movbe %eax,`0x40+12`(%rsp)
|
||
aesenc $rndkey0,$inout2
|
||
lea 5($ctr),%eax
|
||
xor $key0,%eax
|
||
aesenc $rndkey0,$inout3
|
||
movbe %eax,`0x50+12`(%rsp)
|
||
mov %r10,%rax # mov $rnds_,$rounds
|
||
aesenc $rndkey0,$inout4
|
||
aesenc $rndkey0,$inout5
|
||
$movkey -16($key,$rnds_),$rndkey0
|
||
|
||
call .Lenc_loop6
|
||
|
||
movdqu ($inp),$inout6 # load 6 input blocks
|
||
movdqu 0x10($inp),$inout7
|
||
movdqu 0x20($inp),$in0
|
||
movdqu 0x30($inp),$in1
|
||
movdqu 0x40($inp),$in2
|
||
movdqu 0x50($inp),$in3
|
||
lea 0x60($inp),$inp # $inp+=6*16
|
||
$movkey -64($key,$rnds_),$rndkey1
|
||
pxor $inout0,$inout6 # inp^=E(ctr)
|
||
movaps 0x00(%rsp),$inout0 # load next counter [xor-ed with 0 round]
|
||
pxor $inout1,$inout7
|
||
movaps 0x10(%rsp),$inout1
|
||
pxor $inout2,$in0
|
||
movaps 0x20(%rsp),$inout2
|
||
pxor $inout3,$in1
|
||
movaps 0x30(%rsp),$inout3
|
||
pxor $inout4,$in2
|
||
movaps 0x40(%rsp),$inout4
|
||
pxor $inout5,$in3
|
||
movaps 0x50(%rsp),$inout5
|
||
movdqu $inout6,($out) # store 6 output blocks
|
||
movdqu $inout7,0x10($out)
|
||
movdqu $in0,0x20($out)
|
||
movdqu $in1,0x30($out)
|
||
movdqu $in2,0x40($out)
|
||
movdqu $in3,0x50($out)
|
||
lea 0x60($out),$out # $out+=6*16
|
||
|
||
sub \$6,$len
|
||
jnc .Lctr32_loop6 # loop if $len-=6 didn't borrow
|
||
|
||
add \$6,$len # restore real remaining $len
|
||
jz .Lctr32_done # done if ($len==0)
|
||
|
||
lea -48($rnds_),$rounds
|
||
lea -80($key,$rnds_),$key # restore $key
|
||
neg $rounds
|
||
shr \$4,$rounds # restore $rounds
|
||
jmp .Lctr32_tail
|
||
|
||
.align 32
|
||
.Lctr32_loop8:
|
||
add \$8,$ctr # next counter value
|
||
movdqa 0x60(%rsp),$inout6
|
||
aesenc $rndkey1,$inout0
|
||
mov $ctr,%r9d
|
||
movdqa 0x70(%rsp),$inout7
|
||
aesenc $rndkey1,$inout1
|
||
bswap %r9d
|
||
$movkey 0x20-0x80($key),$rndkey0
|
||
aesenc $rndkey1,$inout2
|
||
xor $key0,%r9d
|
||
nop
|
||
aesenc $rndkey1,$inout3
|
||
mov %r9d,0x00+12(%rsp) # store next counter value
|
||
lea 1($ctr),%r9
|
||
aesenc $rndkey1,$inout4
|
||
aesenc $rndkey1,$inout5
|
||
aesenc $rndkey1,$inout6
|
||
aesenc $rndkey1,$inout7
|
||
$movkey 0x30-0x80($key),$rndkey1
|
||
___
|
||
# Emit AES rounds 2..7 of .Lctr32_loop8, one heredoc per round.  Each round
# applies its round key to all eight blocks and, interleaved with that,
# finishes the counter word begun by the previous chunk (bswap, xor with
# $key0, store into stack slot $i-1) and begins the next one (lea $i($ctr)).
# The backtick expressions are arithmetic on $i; they are presumably evaluated
# when $code is post-processed, which is not visible in this part of the file.
for($i=2;$i<8;$i++) {
|
||
# Round keys alternate between the two key registers: odd rounds use
# $rndkey1, even rounds $rndkey0.
my $rndkeyx = ($i&1)?$rndkey1:$rndkey0;
|
||
# ".byte 0x66,0x90" is a two-byte NOP encoding (the first round, emitted
# before this loop, has a plain nop at the same point).  The closing $movkey
# reloads the register just used with the key at 0x20+0x10*$i, i.e. the key
# for two rounds later.
$code.=<<___;
|
||
bswap %r9d
|
||
aesenc $rndkeyx,$inout0
|
||
aesenc $rndkeyx,$inout1
|
||
xor $key0,%r9d
|
||
.byte 0x66,0x90
|
||
aesenc $rndkeyx,$inout2
|
||
aesenc $rndkeyx,$inout3
|
||
mov %r9d,`0x10*($i-1)`+12(%rsp)
|
||
lea $i($ctr),%r9
|
||
aesenc $rndkeyx,$inout4
|
||
aesenc $rndkeyx,$inout5
|
||
aesenc $rndkeyx,$inout6
|
||
aesenc $rndkeyx,$inout7
|
||
$movkey `0x20+0x10*$i`-0x80($key),$rndkeyx
|
||
___
|
||
}
# The heredoc that follows completes the 8-block iteration: it stores the
# eighth counter word, runs the remaining rounds (how many is decided by
# comparing $rounds with 11 and branching to .Lctr32_enc_done), applies the
# last round with the input already xor-ed into the last round key, stores the
# output, and then continues with the short-input tails and .Lctr32_done.
|
||
$code.=<<___;
|
||
bswap %r9d
|
||
aesenc $rndkey0,$inout0
|
||
aesenc $rndkey0,$inout1
|
||
aesenc $rndkey0,$inout2
|
||
xor $key0,%r9d
|
||
movdqu 0x00($inp),$in0 # start loading input
|
||
aesenc $rndkey0,$inout3
|
||
mov %r9d,0x70+12(%rsp)
|
||
cmp \$11,$rounds
|
||
aesenc $rndkey0,$inout4
|
||
aesenc $rndkey0,$inout5
|
||
aesenc $rndkey0,$inout6
|
||
aesenc $rndkey0,$inout7
|
||
$movkey 0xa0-0x80($key),$rndkey0
|
||
|
||
jb .Lctr32_enc_done
|
||
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
aesenc $rndkey1,$inout4
|
||
aesenc $rndkey1,$inout5
|
||
aesenc $rndkey1,$inout6
|
||
aesenc $rndkey1,$inout7
|
||
$movkey 0xb0-0x80($key),$rndkey1
|
||
|
||
aesenc $rndkey0,$inout0
|
||
aesenc $rndkey0,$inout1
|
||
aesenc $rndkey0,$inout2
|
||
aesenc $rndkey0,$inout3
|
||
aesenc $rndkey0,$inout4
|
||
aesenc $rndkey0,$inout5
|
||
aesenc $rndkey0,$inout6
|
||
aesenc $rndkey0,$inout7
|
||
$movkey 0xc0-0x80($key),$rndkey0
|
||
je .Lctr32_enc_done
|
||
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
aesenc $rndkey1,$inout4
|
||
aesenc $rndkey1,$inout5
|
||
aesenc $rndkey1,$inout6
|
||
aesenc $rndkey1,$inout7
|
||
$movkey 0xd0-0x80($key),$rndkey1
|
||
|
||
aesenc $rndkey0,$inout0
|
||
aesenc $rndkey0,$inout1
|
||
aesenc $rndkey0,$inout2
|
||
aesenc $rndkey0,$inout3
|
||
aesenc $rndkey0,$inout4
|
||
aesenc $rndkey0,$inout5
|
||
aesenc $rndkey0,$inout6
|
||
aesenc $rndkey0,$inout7
|
||
$movkey 0xe0-0x80($key),$rndkey0
|
||
jmp .Lctr32_enc_done
|
||
|
||
.align 16
|
||
.Lctr32_enc_done:
|
||
movdqu 0x10($inp),$in1
|
||
pxor $rndkey0,$in0 # input^=round[last]
|
||
movdqu 0x20($inp),$in2
|
||
pxor $rndkey0,$in1
|
||
movdqu 0x30($inp),$in3
|
||
pxor $rndkey0,$in2
|
||
movdqu 0x40($inp),$in4
|
||
pxor $rndkey0,$in3
|
||
movdqu 0x50($inp),$in5
|
||
pxor $rndkey0,$in4
|
||
pxor $rndkey0,$in5
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
aesenc $rndkey1,$inout4
|
||
aesenc $rndkey1,$inout5
|
||
aesenc $rndkey1,$inout6
|
||
aesenc $rndkey1,$inout7
|
||
movdqu 0x60($inp),$rndkey1 # borrow $rndkey1 for inp[6]
|
||
lea 0x80($inp),$inp # $inp+=8*16
|
||
|
||
aesenclast $in0,$inout0 # $inN is inp[N]^round[last]
|
||
pxor $rndkey0,$rndkey1 # borrowed $rndkey
|
||
movdqu 0x70-0x80($inp),$in0
|
||
aesenclast $in1,$inout1
|
||
pxor $rndkey0,$in0
|
||
movdqa 0x00(%rsp),$in1 # load next counter block
|
||
aesenclast $in2,$inout2
|
||
aesenclast $in3,$inout3
|
||
movdqa 0x10(%rsp),$in2
|
||
movdqa 0x20(%rsp),$in3
|
||
aesenclast $in4,$inout4
|
||
aesenclast $in5,$inout5
|
||
movdqa 0x30(%rsp),$in4
|
||
movdqa 0x40(%rsp),$in5
|
||
aesenclast $rndkey1,$inout6
|
||
movdqa 0x50(%rsp),$rndkey0
|
||
$movkey 0x10-0x80($key),$rndkey1#real 1st-round key
|
||
aesenclast $in0,$inout7
|
||
|
||
movups $inout0,($out) # store 8 output blocks
|
||
movdqa $in1,$inout0
|
||
movups $inout1,0x10($out)
|
||
movdqa $in2,$inout1
|
||
movups $inout2,0x20($out)
|
||
movdqa $in3,$inout2
|
||
movups $inout3,0x30($out)
|
||
movdqa $in4,$inout3
|
||
movups $inout4,0x40($out)
|
||
movdqa $in5,$inout4
|
||
movups $inout5,0x50($out)
|
||
movdqa $rndkey0,$inout5
|
||
movups $inout6,0x60($out)
|
||
movups $inout7,0x70($out)
|
||
lea 0x80($out),$out # $out+=8*16
|
||
|
||
sub \$8,$len
|
||
jnc .Lctr32_loop8 # loop if $len-=8 didn't borrow
|
||
|
||
add \$8,$len # restore real remaining $len
|
||
jz .Lctr32_done # done if ($len==0)
|
||
lea -0x80($key),$key
|
||
|
||
.Lctr32_tail:
|
||
# note that at this point $inout0..5 are populated with
|
||
# counter values xor-ed with 0-round key
|
||
lea 16($key),$key
|
||
cmp \$4,$len
|
||
jb .Lctr32_loop3
|
||
je .Lctr32_loop4
|
||
|
||
# if ($len>4) compute 7 E(counter)
|
||
shl \$4,$rounds
|
||
movdqa 0x60(%rsp),$inout6
|
||
pxor $inout7,$inout7
|
||
|
||
$movkey 16($key),$rndkey0
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
lea 32-16($key,$rounds),$key# prepare for .Lenc_loop8_enter
|
||
neg %rax
|
||
aesenc $rndkey1,$inout2
|
||
add \$16,%rax # prepare for .Lenc_loop8_enter
|
||
movups ($inp),$in0
|
||
aesenc $rndkey1,$inout3
|
||
aesenc $rndkey1,$inout4
|
||
movups 0x10($inp),$in1 # pre-load input
|
||
movups 0x20($inp),$in2
|
||
aesenc $rndkey1,$inout5
|
||
aesenc $rndkey1,$inout6
|
||
|
||
call .Lenc_loop8_enter
|
||
|
||
movdqu 0x30($inp),$in3
|
||
pxor $in0,$inout0
|
||
movdqu 0x40($inp),$in0
|
||
pxor $in1,$inout1
|
||
movdqu $inout0,($out) # store output
|
||
pxor $in2,$inout2
|
||
movdqu $inout1,0x10($out)
|
||
pxor $in3,$inout3
|
||
movdqu $inout2,0x20($out)
|
||
pxor $in0,$inout4
|
||
movdqu $inout3,0x30($out)
|
||
movdqu $inout4,0x40($out)
|
||
cmp \$6,$len
|
||
jb .Lctr32_done # $len was 5, stop store
|
||
|
||
movups 0x50($inp),$in1
|
||
xorps $in1,$inout5
|
||
movups $inout5,0x50($out)
|
||
je .Lctr32_done # $len was 6, stop store
|
||
|
||
movups 0x60($inp),$in2
|
||
xorps $in2,$inout6
|
||
movups $inout6,0x60($out)
|
||
jmp .Lctr32_done # $len was 7, stop store
|
||
|
||
.align 32
|
||
.Lctr32_loop4:
|
||
aesenc $rndkey1,$inout0
|
||
lea 16($key),$key
|
||
dec $rounds
|
||
aesenc $rndkey1,$inout1
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
$movkey ($key),$rndkey1
|
||
jnz .Lctr32_loop4
|
||
aesenclast $rndkey1,$inout0
|
||
aesenclast $rndkey1,$inout1
|
||
movups ($inp),$in0 # load input
|
||
movups 0x10($inp),$in1
|
||
aesenclast $rndkey1,$inout2
|
||
aesenclast $rndkey1,$inout3
|
||
movups 0x20($inp),$in2
|
||
movups 0x30($inp),$in3
|
||
|
||
xorps $in0,$inout0
|
||
movups $inout0,($out) # store output
|
||
xorps $in1,$inout1
|
||
movups $inout1,0x10($out)
|
||
pxor $in2,$inout2
|
||
movdqu $inout2,0x20($out)
|
||
pxor $in3,$inout3
|
||
movdqu $inout3,0x30($out)
|
||
jmp .Lctr32_done # $len was 4, stop store
|
||
|
||
.align 32
|
||
.Lctr32_loop3:
|
||
aesenc $rndkey1,$inout0
|
||
lea 16($key),$key
|
||
dec $rounds
|
||
aesenc $rndkey1,$inout1
|
||
aesenc $rndkey1,$inout2
|
||
$movkey ($key),$rndkey1
|
||
jnz .Lctr32_loop3
|
||
aesenclast $rndkey1,$inout0
|
||
aesenclast $rndkey1,$inout1
|
||
aesenclast $rndkey1,$inout2
|
||
|
||
movups ($inp),$in0 # load input
|
||
xorps $in0,$inout0
|
||
movups $inout0,($out) # store output
|
||
cmp \$2,$len
|
||
jb .Lctr32_done # $len was 1, stop store
|
||
|
||
movups 0x10($inp),$in1
|
||
xorps $in1,$inout1
|
||
movups $inout1,0x10($out)
|
||
je .Lctr32_done # $len was 2, stop store
|
||
|
||
movups 0x20($inp),$in2
|
||
xorps $in2,$inout2
|
||
movups $inout2,0x20($out) # $len was 3, stop store
|
||
|
||
.Lctr32_done:
|
||
xorps %xmm0,%xmm0 # clear register bank
|
||
xor $key0,$key0
|
||
pxor %xmm1,%xmm1
|
||
pxor %xmm2,%xmm2
|
||
pxor %xmm3,%xmm3
|
||
pxor %xmm4,%xmm4
|
||
pxor %xmm5,%xmm5
|
||
___
|
||
# Everywhere except Win64: zero %xmm6-%xmm15 and overwrite the eight 16-byte
# counter slots at 0x00..0x70(%rsp) with %xmm0, which .Lctr32_done has just
# zeroed.
$code.=<<___ if (!$win64);
|
||
pxor %xmm6,%xmm6
|
||
pxor %xmm7,%xmm7
|
||
movaps %xmm0,0x00(%rsp) # clear stack
|
||
pxor %xmm8,%xmm8
|
||
movaps %xmm0,0x10(%rsp)
|
||
pxor %xmm9,%xmm9
|
||
movaps %xmm0,0x20(%rsp)
|
||
pxor %xmm10,%xmm10
|
||
movaps %xmm0,0x30(%rsp)
|
||
pxor %xmm11,%xmm11
|
||
movaps %xmm0,0x40(%rsp)
|
||
pxor %xmm12,%xmm12
|
||
movaps %xmm0,0x50(%rsp)
|
||
pxor %xmm13,%xmm13
|
||
movaps %xmm0,0x60(%rsp)
|
||
pxor %xmm14,%xmm14
|
||
movaps %xmm0,0x70(%rsp)
|
||
pxor %xmm15,%xmm15
|
||
___
|
||
# Win64: reload %xmm6-%xmm15 from where the prologue saved them.  The prologue
# stored at -0xa8..-0x18(%rax) and %rbp was then set to %rax-8, hence the
# -0xa0..-0x10(%rbp) offsets here.  Each save slot is overwritten with %xmm0
# after it is read, and the eight counter slots are wiped as well.
$code.=<<___ if ($win64);
|
||
movaps -0xa0(%rbp),%xmm6
|
||
movaps %xmm0,-0xa0(%rbp) # clear stack
|
||
movaps -0x90(%rbp),%xmm7
|
||
movaps %xmm0,-0x90(%rbp)
|
||
movaps -0x80(%rbp),%xmm8
|
||
movaps %xmm0,-0x80(%rbp)
|
||
movaps -0x70(%rbp),%xmm9
|
||
movaps %xmm0,-0x70(%rbp)
|
||
movaps -0x60(%rbp),%xmm10
|
||
movaps %xmm0,-0x60(%rbp)
|
||
movaps -0x50(%rbp),%xmm11
|
||
movaps %xmm0,-0x50(%rbp)
|
||
movaps -0x40(%rbp),%xmm12
|
||
movaps %xmm0,-0x40(%rbp)
|
||
movaps -0x30(%rbp),%xmm13
|
||
movaps %xmm0,-0x30(%rbp)
|
||
movaps -0x20(%rbp),%xmm14
|
||
movaps %xmm0,-0x20(%rbp)
|
||
movaps -0x10(%rbp),%xmm15
|
||
movaps %xmm0,-0x10(%rbp)
|
||
movaps %xmm0,0x00(%rsp)
|
||
movaps %xmm0,0x10(%rsp)
|
||
movaps %xmm0,0x20(%rsp)
|
||
movaps %xmm0,0x30(%rsp)
|
||
movaps %xmm0,0x40(%rsp)
|
||
movaps %xmm0,0x50(%rsp)
|
||
movaps %xmm0,0x60(%rsp)
|
||
movaps %xmm0,0x70(%rsp)
|
||
___
|
||
# Common exit: restore %rsp from %rbp and pop the saved %rbp.
# .Lctr32_epilogue is also the jump target of the single-block fast path,
# which never built a frame and therefore lands directly on the ret.
$code.=<<___;
|
||
lea (%rbp),%rsp
|
||
pop %rbp
|
||
.Lctr32_epilogue:
|
||
ret
|
||
.size aesni_ctr32_encrypt_blocks,.-aesni_ctr32_encrypt_blocks
|
||
___
|
||
}
|
||
|
||
######################################################################
|
||
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
|
||
# const AES_KEY *key1, const AES_KEY *key2,
|
||
# const unsigned char iv[16]);
|
||
#
|
||
{
|
||
my @tweak=map("%xmm$_",(10..15));
|
||
my ($twmask,$twres,$twtmp)=("%xmm8","%xmm9",@tweak[4]);
|
||
my ($key2,$ivp,$len_)=("%r8","%r9","%r9");
|
||
my $frame_size = 0x70 + ($win64?160:0);
|
||
|
||
$code.=<<___;
|
||
.globl aesni_xts_encrypt
|
||
.type aesni_xts_encrypt,\@function,6
|
||
.align 16
|
||
aesni_xts_encrypt:
|
||
lea (%rsp),%rax
|
||
push %rbp
|
||
sub \$$frame_size,%rsp
|
||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||
___
|
||
$code.=<<___ if ($win64);
|
||
movaps %xmm6,-0xa8(%rax) # offload everything
|
||
movaps %xmm7,-0x98(%rax)
|
||
movaps %xmm8,-0x88(%rax)
|
||
movaps %xmm9,-0x78(%rax)
|
||
movaps %xmm10,-0x68(%rax)
|
||
movaps %xmm11,-0x58(%rax)
|
||
movaps %xmm12,-0x48(%rax)
|
||
movaps %xmm13,-0x38(%rax)
|
||
movaps %xmm14,-0x28(%rax)
|
||
movaps %xmm15,-0x18(%rax)
|
||
.Lxts_enc_body:
|
||
___
|
||
$code.=<<___;
|
||
lea -8(%rax),%rbp
|
||
movups ($ivp),$inout0 # load clear-text tweak
|
||
mov 240(%r8),$rounds # key2->rounds
|
||
mov 240($key),$rnds_ # key1->rounds
|
||
___
|
||
# generate the tweak
|
||
&aesni_generate1("enc",$key2,$rounds,$inout0);
|
||
$code.=<<___;
|
||
$movkey ($key),$rndkey0 # zero round key
|
||
mov $key,$key_ # backup $key
|
||
mov $rnds_,$rounds # backup $rounds
|
||
shl \$4,$rnds_
|
||
mov $len,$len_ # backup $len
|
||
and \$-16,$len
|
||
|
||
$movkey 16($key,$rnds_),$rndkey1 # last round key
|
||
|
||
movdqa .Lxts_magic(%rip),$twmask
|
||
movdqa $inout0,@tweak[5]
|
||
pshufd \$0x5f,$inout0,$twres
|
||
pxor $rndkey0,$rndkey1
|
||
___
|
||
# alternative tweak calculation algorithm is based on suggestions
|
||
# by Shay Gueron. psrad doesn't conflict with AES-NI instructions
|
||
# and should help in the future...
|
||
for ($i=0;$i<4;$i++) {
|
||
$code.=<<___;
|
||
movdqa $twres,$twtmp
|
||
paddd $twres,$twres
|
||
movdqa @tweak[5],@tweak[$i]
|
||
psrad \$31,$twtmp # broadcast upper bits
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twtmp
|
||
pxor $rndkey0,@tweak[$i]
|
||
pxor $twtmp,@tweak[5]
|
||
___
|
||
}
|
||
$code.=<<___;
|
||
movdqa @tweak[5],@tweak[4]
|
||
psrad \$31,$twres
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twres
|
||
pxor $rndkey0,@tweak[4]
|
||
pxor $twres,@tweak[5]
|
||
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
|
||
|
||
sub \$16*6,$len
|
||
jc .Lxts_enc_short # if $len-=6*16 borrowed
|
||
|
||
mov \$16+96,$rounds
|
||
lea 32($key_,$rnds_),$key # end of key schedule
|
||
sub %r10,%rax # twisted $rounds
|
||
$movkey 16($key_),$rndkey1
|
||
mov %rax,%r10 # backup twisted $rounds
|
||
lea .Lxts_magic(%rip),%r8
|
||
jmp .Lxts_enc_grandloop
|
||
|
||
.align 32
|
||
.Lxts_enc_grandloop:
|
||
movdqu `16*0`($inp),$inout0 # load input
|
||
movdqa $rndkey0,$twmask
|
||
movdqu `16*1`($inp),$inout1
|
||
pxor @tweak[0],$inout0 # input^=tweak^round[0]
|
||
movdqu `16*2`($inp),$inout2
|
||
pxor @tweak[1],$inout1
|
||
aesenc $rndkey1,$inout0
|
||
movdqu `16*3`($inp),$inout3
|
||
pxor @tweak[2],$inout2
|
||
aesenc $rndkey1,$inout1
|
||
movdqu `16*4`($inp),$inout4
|
||
pxor @tweak[3],$inout3
|
||
aesenc $rndkey1,$inout2
|
||
movdqu `16*5`($inp),$inout5
|
||
pxor @tweak[5],$twmask # round[0]^=tweak[5]
|
||
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
|
||
pxor @tweak[4],$inout4
|
||
aesenc $rndkey1,$inout3
|
||
$movkey 32($key_),$rndkey0
|
||
lea `16*6`($inp),$inp
|
||
pxor $twmask,$inout5
|
||
|
||
pxor $twres,@tweak[0] # calculate tweaks^round[last]
|
||
aesenc $rndkey1,$inout4
|
||
pxor $twres,@tweak[1]
|
||
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^round[last]
|
||
aesenc $rndkey1,$inout5
|
||
$movkey 48($key_),$rndkey1
|
||
pxor $twres,@tweak[2]
|
||
|
||
aesenc $rndkey0,$inout0
|
||
pxor $twres,@tweak[3]
|
||
movdqa @tweak[1],`16*1`(%rsp)
|
||
aesenc $rndkey0,$inout1
|
||
pxor $twres,@tweak[4]
|
||
movdqa @tweak[2],`16*2`(%rsp)
|
||
aesenc $rndkey0,$inout2
|
||
aesenc $rndkey0,$inout3
|
||
pxor $twres,$twmask
|
||
movdqa @tweak[4],`16*4`(%rsp)
|
||
aesenc $rndkey0,$inout4
|
||
aesenc $rndkey0,$inout5
|
||
$movkey 64($key_),$rndkey0
|
||
movdqa $twmask,`16*5`(%rsp)
|
||
pshufd \$0x5f,@tweak[5],$twres
|
||
jmp .Lxts_enc_loop6
|
||
.align 32
|
||
.Lxts_enc_loop6:
|
||
aesenc $rndkey1,$inout0
|
||
aesenc $rndkey1,$inout1
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
aesenc $rndkey1,$inout4
|
||
aesenc $rndkey1,$inout5
|
||
$movkey -64($key,%rax),$rndkey1
|
||
add \$32,%rax
|
||
|
||
aesenc $rndkey0,$inout0
|
||
aesenc $rndkey0,$inout1
|
||
aesenc $rndkey0,$inout2
|
||
aesenc $rndkey0,$inout3
|
||
aesenc $rndkey0,$inout4
|
||
aesenc $rndkey0,$inout5
|
||
$movkey -80($key,%rax),$rndkey0
|
||
jnz .Lxts_enc_loop6
|
||
|
||
movdqa (%r8),$twmask # start calculating next tweak
|
||
movdqa $twres,$twtmp
|
||
paddd $twres,$twres
|
||
aesenc $rndkey1,$inout0
|
||
paddq @tweak[5],@tweak[5]
|
||
psrad \$31,$twtmp
|
||
aesenc $rndkey1,$inout1
|
||
pand $twmask,$twtmp
|
||
$movkey ($key_),@tweak[0] # load round[0]
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
aesenc $rndkey1,$inout4
|
||
pxor $twtmp,@tweak[5]
|
||
movaps @tweak[0],@tweak[1] # copy round[0]
|
||
aesenc $rndkey1,$inout5
|
||
$movkey -64($key),$rndkey1
|
||
|
||
movdqa $twres,$twtmp
|
||
aesenc $rndkey0,$inout0
|
||
paddd $twres,$twres
|
||
pxor @tweak[5],@tweak[0]
|
||
aesenc $rndkey0,$inout1
|
||
psrad \$31,$twtmp
|
||
paddq @tweak[5],@tweak[5]
|
||
aesenc $rndkey0,$inout2
|
||
aesenc $rndkey0,$inout3
|
||
pand $twmask,$twtmp
|
||
movaps @tweak[1],@tweak[2]
|
||
aesenc $rndkey0,$inout4
|
||
pxor $twtmp,@tweak[5]
|
||
movdqa $twres,$twtmp
|
||
aesenc $rndkey0,$inout5
|
||
$movkey -48($key),$rndkey0
|
||
|
||
paddd $twres,$twres
|
||
aesenc $rndkey1,$inout0
|
||
pxor @tweak[5],@tweak[1]
|
||
psrad \$31,$twtmp
|
||
aesenc $rndkey1,$inout1
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twtmp
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
movdqa @tweak[3],`16*3`(%rsp)
|
||
pxor $twtmp,@tweak[5]
|
||
aesenc $rndkey1,$inout4
|
||
movaps @tweak[2],@tweak[3]
|
||
movdqa $twres,$twtmp
|
||
aesenc $rndkey1,$inout5
|
||
$movkey -32($key),$rndkey1
|
||
|
||
paddd $twres,$twres
|
||
aesenc $rndkey0,$inout0
|
||
pxor @tweak[5],@tweak[2]
|
||
psrad \$31,$twtmp
|
||
aesenc $rndkey0,$inout1
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twtmp
|
||
aesenc $rndkey0,$inout2
|
||
aesenc $rndkey0,$inout3
|
||
aesenc $rndkey0,$inout4
|
||
pxor $twtmp,@tweak[5]
|
||
movaps @tweak[3],@tweak[4]
|
||
aesenc $rndkey0,$inout5
|
||
|
||
movdqa $twres,$rndkey0
|
||
paddd $twres,$twres
|
||
aesenc $rndkey1,$inout0
|
||
pxor @tweak[5],@tweak[3]
|
||
psrad \$31,$rndkey0
|
||
aesenc $rndkey1,$inout1
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$rndkey0
|
||
aesenc $rndkey1,$inout2
|
||
aesenc $rndkey1,$inout3
|
||
pxor $rndkey0,@tweak[5]
|
||
$movkey ($key_),$rndkey0
|
||
aesenc $rndkey1,$inout4
|
||
aesenc $rndkey1,$inout5
|
||
$movkey 16($key_),$rndkey1
|
||
|
||
pxor @tweak[5],@tweak[4]
|
||
aesenclast `16*0`(%rsp),$inout0
|
||
psrad \$31,$twres
|
||
paddq @tweak[5],@tweak[5]
|
||
aesenclast `16*1`(%rsp),$inout1
|
||
aesenclast `16*2`(%rsp),$inout2
|
||
pand $twmask,$twres
|
||
mov %r10,%rax # restore $rounds
|
||
aesenclast `16*3`(%rsp),$inout3
|
||
aesenclast `16*4`(%rsp),$inout4
|
||
aesenclast `16*5`(%rsp),$inout5
|
||
pxor $twres,@tweak[5]
|
||
|
||
lea `16*6`($out),$out # $out+=6*16
|
||
movups $inout0,`-16*6`($out) # store 6 output blocks
|
||
movups $inout1,`-16*5`($out)
|
||
movups $inout2,`-16*4`($out)
|
||
movups $inout3,`-16*3`($out)
|
||
movups $inout4,`-16*2`($out)
|
||
movups $inout5,`-16*1`($out)
|
||
sub \$16*6,$len
|
||
jnc .Lxts_enc_grandloop # loop if $len-=6*16 didn't borrow
|
||
|
||
mov \$16+96,$rounds
|
||
sub $rnds_,$rounds
|
||
mov $key_,$key # restore $key
|
||
shr \$4,$rounds # restore original value
|
||
|
||
.Lxts_enc_short:
|
||
# at the point @tweak[0..5] are populated with tweak values
|
||
mov $rounds,$rnds_ # backup $rounds
|
||
pxor $rndkey0,@tweak[0]
|
||
add \$16*6,$len # restore real remaining $len
|
||
jz .Lxts_enc_done # done if ($len==0)
|
||
|
||
pxor $rndkey0,@tweak[1]
|
||
cmp \$0x20,$len
|
||
jb .Lxts_enc_one # $len is 1*16
|
||
pxor $rndkey0,@tweak[2]
|
||
je .Lxts_enc_two # $len is 2*16
|
||
|
||
pxor $rndkey0,@tweak[3]
|
||
cmp \$0x40,$len
|
||
jb .Lxts_enc_three # $len is 3*16
|
||
pxor $rndkey0,@tweak[4]
|
||
je .Lxts_enc_four # $len is 4*16
|
||
|
||
movdqu ($inp),$inout0 # $len is 5*16
|
||
movdqu 16*1($inp),$inout1
|
||
movdqu 16*2($inp),$inout2
|
||
pxor @tweak[0],$inout0
|
||
movdqu 16*3($inp),$inout3
|
||
pxor @tweak[1],$inout1
|
||
movdqu 16*4($inp),$inout4
|
||
lea 16*5($inp),$inp # $inp+=5*16
|
||
pxor @tweak[2],$inout2
|
||
pxor @tweak[3],$inout3
|
||
pxor @tweak[4],$inout4
|
||
pxor $inout5,$inout5
|
||
|
||
call _aesni_encrypt6
|
||
|
||
xorps @tweak[0],$inout0
|
||
movdqa @tweak[5],@tweak[0]
|
||
xorps @tweak[1],$inout1
|
||
xorps @tweak[2],$inout2
|
||
movdqu $inout0,($out) # store 5 output blocks
|
||
xorps @tweak[3],$inout3
|
||
movdqu $inout1,16*1($out)
|
||
xorps @tweak[4],$inout4
|
||
movdqu $inout2,16*2($out)
|
||
movdqu $inout3,16*3($out)
|
||
movdqu $inout4,16*4($out)
|
||
lea 16*5($out),$out # $out+=5*16
|
||
jmp .Lxts_enc_done
|
||
|
||
.align 16
|
||
.Lxts_enc_one:
|
||
movups ($inp),$inout0
|
||
lea 16*1($inp),$inp # $inp+=1*16
|
||
xorps @tweak[0],$inout0
|
||
___
|
||
&aesni_generate1("enc",$key,$rounds);
|
||
$code.=<<___;
|
||
xorps @tweak[0],$inout0
|
||
movdqa @tweak[1],@tweak[0]
|
||
movups $inout0,($out) # store one output block
|
||
lea 16*1($out),$out # $out+=1*16
|
||
jmp .Lxts_enc_done
|
||
|
||
.align 16
|
||
.Lxts_enc_two:
|
||
movups ($inp),$inout0
|
||
movups 16($inp),$inout1
|
||
lea 32($inp),$inp # $inp+=2*16
|
||
xorps @tweak[0],$inout0
|
||
xorps @tweak[1],$inout1
|
||
|
||
call _aesni_encrypt2
|
||
|
||
xorps @tweak[0],$inout0
|
||
movdqa @tweak[2],@tweak[0]
|
||
xorps @tweak[1],$inout1
|
||
movups $inout0,($out) # store 2 output blocks
|
||
movups $inout1,16*1($out)
|
||
lea 16*2($out),$out # $out+=2*16
|
||
jmp .Lxts_enc_done
|
||
|
||
.align 16
|
||
.Lxts_enc_three:
|
||
movups ($inp),$inout0
|
||
movups 16*1($inp),$inout1
|
||
movups 16*2($inp),$inout2
|
||
lea 16*3($inp),$inp # $inp+=3*16
|
||
xorps @tweak[0],$inout0
|
||
xorps @tweak[1],$inout1
|
||
xorps @tweak[2],$inout2
|
||
|
||
call _aesni_encrypt3
|
||
|
||
xorps @tweak[0],$inout0
|
||
movdqa @tweak[3],@tweak[0]
|
||
xorps @tweak[1],$inout1
|
||
xorps @tweak[2],$inout2
|
||
movups $inout0,($out) # store 3 output blocks
|
||
movups $inout1,16*1($out)
|
||
movups $inout2,16*2($out)
|
||
lea 16*3($out),$out # $out+=3*16
|
||
jmp .Lxts_enc_done
|
||
|
||
.align 16
|
||
.Lxts_enc_four:
|
||
movups ($inp),$inout0
|
||
movups 16*1($inp),$inout1
|
||
movups 16*2($inp),$inout2
|
||
xorps @tweak[0],$inout0
|
||
movups 16*3($inp),$inout3
|
||
lea 16*4($inp),$inp # $inp+=4*16
|
||
xorps @tweak[1],$inout1
|
||
xorps @tweak[2],$inout2
|
||
xorps @tweak[3],$inout3
|
||
|
||
call _aesni_encrypt4
|
||
|
||
pxor @tweak[0],$inout0
|
||
movdqa @tweak[4],@tweak[0]
|
||
pxor @tweak[1],$inout1
|
||
pxor @tweak[2],$inout2
|
||
movdqu $inout0,($out) # store 4 output blocks
|
||
pxor @tweak[3],$inout3
|
||
movdqu $inout1,16*1($out)
|
||
movdqu $inout2,16*2($out)
|
||
movdqu $inout3,16*3($out)
|
||
lea 16*4($out),$out # $out+=4*16
|
||
jmp .Lxts_enc_done
|
||
|
||
.align 16
|
||
.Lxts_enc_done:
|
||
and \$15,$len_ # see if $len%16 is 0
|
||
jz .Lxts_enc_ret
|
||
mov $len_,$len
|
||
|
||
.Lxts_enc_steal:
|
||
movzb ($inp),%eax # borrow $rounds ...
|
||
movzb -16($out),%ecx # ... and $key
|
||
lea 1($inp),$inp
|
||
mov %al,-16($out)
|
||
mov %cl,0($out)
|
||
lea 1($out),$out
|
||
sub \$1,$len
|
||
jnz .Lxts_enc_steal
|
||
|
||
sub $len_,$out # rewind $out
|
||
mov $key_,$key # restore $key
|
||
mov $rnds_,$rounds # restore $rounds
|
||
|
||
movups -16($out),$inout0
|
||
xorps @tweak[0],$inout0
|
||
___
|
||
&aesni_generate1("enc",$key,$rounds);
|
||
$code.=<<___;
|
||
xorps @tweak[0],$inout0
|
||
movups $inout0,-16($out)
|
||
|
||
.Lxts_enc_ret:
|
||
xorps %xmm0,%xmm0 # clear register bank
|
||
pxor %xmm1,%xmm1
|
||
pxor %xmm2,%xmm2
|
||
pxor %xmm3,%xmm3
|
||
pxor %xmm4,%xmm4
|
||
pxor %xmm5,%xmm5
|
||
___
|
||
$code.=<<___ if (!$win64);
|
||
pxor %xmm6,%xmm6
|
||
pxor %xmm7,%xmm7
|
||
movaps %xmm0,0x00(%rsp) # clear stack
|
||
pxor %xmm8,%xmm8
|
||
movaps %xmm0,0x10(%rsp)
|
||
pxor %xmm9,%xmm9
|
||
movaps %xmm0,0x20(%rsp)
|
||
pxor %xmm10,%xmm10
|
||
movaps %xmm0,0x30(%rsp)
|
||
pxor %xmm11,%xmm11
|
||
movaps %xmm0,0x40(%rsp)
|
||
pxor %xmm12,%xmm12
|
||
movaps %xmm0,0x50(%rsp)
|
||
pxor %xmm13,%xmm13
|
||
movaps %xmm0,0x60(%rsp)
|
||
pxor %xmm14,%xmm14
|
||
pxor %xmm15,%xmm15
|
||
___
|
||
$code.=<<___ if ($win64);
|
||
movaps -0xa0(%rbp),%xmm6
|
||
movaps %xmm0,-0xa0(%rbp) # clear stack
|
||
movaps -0x90(%rbp),%xmm7
|
||
movaps %xmm0,-0x90(%rbp)
|
||
movaps -0x80(%rbp),%xmm8
|
||
movaps %xmm0,-0x80(%rbp)
|
||
movaps -0x70(%rbp),%xmm9
|
||
movaps %xmm0,-0x70(%rbp)
|
||
movaps -0x60(%rbp),%xmm10
|
||
movaps %xmm0,-0x60(%rbp)
|
||
movaps -0x50(%rbp),%xmm11
|
||
movaps %xmm0,-0x50(%rbp)
|
||
movaps -0x40(%rbp),%xmm12
|
||
movaps %xmm0,-0x40(%rbp)
|
||
movaps -0x30(%rbp),%xmm13
|
||
movaps %xmm0,-0x30(%rbp)
|
||
movaps -0x20(%rbp),%xmm14
|
||
movaps %xmm0,-0x20(%rbp)
|
||
movaps -0x10(%rbp),%xmm15
|
||
movaps %xmm0,-0x10(%rbp)
|
||
movaps %xmm0,0x00(%rsp)
|
||
movaps %xmm0,0x10(%rsp)
|
||
movaps %xmm0,0x20(%rsp)
|
||
movaps %xmm0,0x30(%rsp)
|
||
movaps %xmm0,0x40(%rsp)
|
||
movaps %xmm0,0x50(%rsp)
|
||
movaps %xmm0,0x60(%rsp)
|
||
___
|
||
$code.=<<___;
|
||
lea (%rbp),%rsp
|
||
pop %rbp
|
||
.Lxts_enc_epilogue:
|
||
ret
|
||
.size aesni_xts_encrypt,.-aesni_xts_encrypt
|
||
___
|
||
|
||
# aesni_xts_decrypt: entry point and stack-frame setup.
# Emits the global symbol (declared to the translator as a 6-argument
# function) and builds the frame: %rax keeps the %rsp value seen on entry,
# %rbp is pushed, $frame_size bytes are reserved, and %rsp is then rounded
# down to a 16-byte boundary. The rest of the function stores to (%rsp)
# with movaps/movdqa, which require that alignment.
$code.=<<___;
|
||
.globl aesni_xts_decrypt
|
||
.type aesni_xts_decrypt,\@function,6
|
||
.align 16
|
||
aesni_xts_decrypt:
|
||
lea (%rsp),%rax
|
||
push %rbp
|
||
sub \$$frame_size,%rsp
|
||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||
___
|
||
# Win64 only: spill %xmm6-%xmm15 into ten 16-byte slots at fixed offsets
# below the entry %rsp (still held in %rax). The Win64 branch of the
# epilogue reloads them from the same slots, addressed there relative to
# %rbp (= %rax-8), hence the 8-byte difference between the offsets.
# .Lxts_dec_body marks the end of the register spill.
# NOTE(review): presumably referenced by unwind/SEH tables elsewhere in
# the file -- confirm there.
$code.=<<___ if ($win64);
|
||
movaps %xmm6,-0xa8(%rax) # offload everything
|
||
movaps %xmm7,-0x98(%rax)
|
||
movaps %xmm8,-0x88(%rax)
|
||
movaps %xmm9,-0x78(%rax)
|
||
movaps %xmm10,-0x68(%rax)
|
||
movaps %xmm11,-0x58(%rax)
|
||
movaps %xmm12,-0x48(%rax)
|
||
movaps %xmm13,-0x38(%rax)
|
||
movaps %xmm14,-0x28(%rax)
|
||
movaps %xmm15,-0x18(%rax)
|
||
.Lxts_dec_body:
|
||
___
|
||
# Establish the frame pointer and fetch what is needed to derive the tweak:
#  - %rbp = entry %rsp - 8, i.e. the slot holding the pushed %rbp;
#  - the 16-byte clear-text tweak at ($ivp) is loaded into $inout0;
#  - the round counts are read from offset 240 of each key schedule
#    (key2 into $rounds, key1 into $rnds_).
$code.=<<___;
|
||
lea -8(%rax),%rbp
|
||
movups ($ivp),$inout0 # load clear-text tweak
|
||
mov 240($key2),$rounds # key2->rounds
|
||
mov 240($key),$rnds_ # key1->rounds
|
||
___
|
||
# generate the tweak: run the single-block "enc" sequence over $inout0
# (the clear-text tweak loaded above) with $key2/$rounds. Note that this
# is "enc" even though we are in the decrypt entry point.
# NOTE(review): aesni_generate1 is defined outside this excerpt; the result
# is assumed to be left in $inout0, which the following code reads.
|
||
&aesni_generate1("enc",$key2,$rounds,$inout0);
|
||
# Length bookkeeping and key/tweak staging.
#  - If $len is not a multiple of 16, 16 is subtracted from it up front, so
#    one full block is held back from the bulk paths (it is handled together
#    with the partial tail at .Lxts_dec_done2).
#  - $key is backed up in $key_; $rnds_ is copied to $rounds and then scaled
#    by 16 so it can index the key schedule in bytes.
#  - The adjusted $len is backed up in $len_ and then rounded down to a
#    multiple of 16.
#  - $rndkey0 = round[0]; $rndkey1 = last round key, which the final pxor
#    turns into round[0]^round[last].
#  - $twmask is loaded from .Lxts_magic, the tweak in $inout0 is copied to
#    @tweak[5], and pshufd 0x5f fills $twres with copies of its dwords 3
#    and 1 (the words whose top bits drive the tweak update).
$code.=<<___;
|
||
xor %eax,%eax # if ($len%16) len-=16;
|
||
test \$15,$len
|
||
setnz %al
|
||
shl \$4,%rax
|
||
sub %rax,$len
|
||
|
||
$movkey ($key),$rndkey0 # zero round key
|
||
mov $key,$key_ # backup $key
|
||
mov $rnds_,$rounds # backup $rounds
|
||
shl \$4,$rnds_
|
||
mov $len,$len_ # backup $len
|
||
and \$-16,$len
|
||
|
||
$movkey 16($key,$rnds_),$rndkey1 # last round key
|
||
|
||
movdqa .Lxts_magic(%rip),$twmask
|
||
movdqa $inout0,@tweak[5]
|
||
pshufd \$0x5f,$inout0,$twres
|
||
pxor $rndkey0,$rndkey1
|
||
___
|
||
# Emit four unrolled copies of the tweak-advance sequence, producing
# @tweak[0..3]. Each copy:
#  - snapshots the running tweak (@tweak[5]) into @tweak[$i] and XORs
#    round[0] ($rndkey0) into that snapshot;
#  - doubles @tweak[5] with paddq and XORs in a correction term: the sign
#    bits tracked in $twres are broadcast (psrad 31) and masked by $twmask;
#  - shifts $twres left by one (paddd) ready for the next copy.
for ($i=0;$i<4;$i++) {
|
||
$code.=<<___;
|
||
movdqa $twres,$twtmp
|
||
paddd $twres,$twres
|
||
movdqa @tweak[5],@tweak[$i]
|
||
psrad \$31,$twtmp # broadcast upper bits
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twtmp
|
||
pxor $rndkey0,@tweak[$i]
|
||
pxor $twtmp,@tweak[5]
|
||
___
|
||
}
|
||
$code.=<<___;
|
||
movdqa @tweak[5],@tweak[4]
|
||
psrad \$31,$twres
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twres
|
||
pxor $rndkey0,@tweak[4]
|
||
pxor $twres,@tweak[5]
|
||
movaps $rndkey1,0x60(%rsp) # save round[0]^round[last]
|
||
|
||
sub \$16*6,$len
|
||
jc .Lxts_dec_short # if $len-=6*16 borrowed
|
||
|
||
mov \$16+96,$rounds
|
||
lea 32($key_,$rnds_),$key # end of key schedule
|
||
sub %r10,%rax # twisted $rounds
|
||
$movkey 16($key_),$rndkey1
|
||
mov %rax,%r10 # backup twisted $rounds
|
||
lea .Lxts_magic(%rip),%r8
|
||
jmp .Lxts_dec_grandloop
|
||
|
||
.align 32
|
||
.Lxts_dec_grandloop:
|
||
movdqu `16*0`($inp),$inout0 # load input
|
||
movdqa $rndkey0,$twmask
|
||
movdqu `16*1`($inp),$inout1
|
||
pxor @tweak[0],$inout0 # input^=tweak^round[0]
|
||
movdqu `16*2`($inp),$inout2
|
||
pxor @tweak[1],$inout1
|
||
aesdec $rndkey1,$inout0
|
||
movdqu `16*3`($inp),$inout3
|
||
pxor @tweak[2],$inout2
|
||
aesdec $rndkey1,$inout1
|
||
movdqu `16*4`($inp),$inout4
|
||
pxor @tweak[3],$inout3
|
||
aesdec $rndkey1,$inout2
|
||
movdqu `16*5`($inp),$inout5
|
||
pxor @tweak[5],$twmask # round[0]^=tweak[5]
|
||
movdqa 0x60(%rsp),$twres # load round[0]^round[last]
|
||
pxor @tweak[4],$inout4
|
||
aesdec $rndkey1,$inout3
|
||
$movkey 32($key_),$rndkey0
|
||
lea `16*6`($inp),$inp
|
||
pxor $twmask,$inout5
|
||
|
||
pxor $twres,@tweak[0] # calculate tweaks^round[last]
|
||
aesdec $rndkey1,$inout4
|
||
pxor $twres,@tweak[1]
|
||
movdqa @tweak[0],`16*0`(%rsp) # put aside tweaks^last round key
|
||
aesdec $rndkey1,$inout5
|
||
$movkey 48($key_),$rndkey1
|
||
pxor $twres,@tweak[2]
|
||
|
||
aesdec $rndkey0,$inout0
|
||
pxor $twres,@tweak[3]
|
||
movdqa @tweak[1],`16*1`(%rsp)
|
||
aesdec $rndkey0,$inout1
|
||
pxor $twres,@tweak[4]
|
||
movdqa @tweak[2],`16*2`(%rsp)
|
||
aesdec $rndkey0,$inout2
|
||
aesdec $rndkey0,$inout3
|
||
pxor $twres,$twmask
|
||
movdqa @tweak[4],`16*4`(%rsp)
|
||
aesdec $rndkey0,$inout4
|
||
aesdec $rndkey0,$inout5
|
||
$movkey 64($key_),$rndkey0
|
||
movdqa $twmask,`16*5`(%rsp)
|
||
pshufd \$0x5f,@tweak[5],$twres
|
||
jmp .Lxts_dec_loop6
|
||
.align 32
|
||
.Lxts_dec_loop6:
|
||
aesdec $rndkey1,$inout0
|
||
aesdec $rndkey1,$inout1
|
||
aesdec $rndkey1,$inout2
|
||
aesdec $rndkey1,$inout3
|
||
aesdec $rndkey1,$inout4
|
||
aesdec $rndkey1,$inout5
|
||
$movkey -64($key,%rax),$rndkey1
|
||
add \$32,%rax
|
||
|
||
aesdec $rndkey0,$inout0
|
||
aesdec $rndkey0,$inout1
|
||
aesdec $rndkey0,$inout2
|
||
aesdec $rndkey0,$inout3
|
||
aesdec $rndkey0,$inout4
|
||
aesdec $rndkey0,$inout5
|
||
$movkey -80($key,%rax),$rndkey0
|
||
jnz .Lxts_dec_loop6
|
||
|
||
movdqa (%r8),$twmask # start calculating next tweak
|
||
movdqa $twres,$twtmp
|
||
paddd $twres,$twres
|
||
aesdec $rndkey1,$inout0
|
||
paddq @tweak[5],@tweak[5]
|
||
psrad \$31,$twtmp
|
||
aesdec $rndkey1,$inout1
|
||
pand $twmask,$twtmp
|
||
$movkey ($key_),@tweak[0] # load round[0]
|
||
aesdec $rndkey1,$inout2
|
||
aesdec $rndkey1,$inout3
|
||
aesdec $rndkey1,$inout4
|
||
pxor $twtmp,@tweak[5]
|
||
movaps @tweak[0],@tweak[1] # copy round[0]
|
||
aesdec $rndkey1,$inout5
|
||
$movkey -64($key),$rndkey1
|
||
|
||
movdqa $twres,$twtmp
|
||
aesdec $rndkey0,$inout0
|
||
paddd $twres,$twres
|
||
pxor @tweak[5],@tweak[0]
|
||
aesdec $rndkey0,$inout1
|
||
psrad \$31,$twtmp
|
||
paddq @tweak[5],@tweak[5]
|
||
aesdec $rndkey0,$inout2
|
||
aesdec $rndkey0,$inout3
|
||
pand $twmask,$twtmp
|
||
movaps @tweak[1],@tweak[2]
|
||
aesdec $rndkey0,$inout4
|
||
pxor $twtmp,@tweak[5]
|
||
movdqa $twres,$twtmp
|
||
aesdec $rndkey0,$inout5
|
||
$movkey -48($key),$rndkey0
|
||
|
||
paddd $twres,$twres
|
||
aesdec $rndkey1,$inout0
|
||
pxor @tweak[5],@tweak[1]
|
||
psrad \$31,$twtmp
|
||
aesdec $rndkey1,$inout1
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twtmp
|
||
aesdec $rndkey1,$inout2
|
||
aesdec $rndkey1,$inout3
|
||
movdqa @tweak[3],`16*3`(%rsp)
|
||
pxor $twtmp,@tweak[5]
|
||
aesdec $rndkey1,$inout4
|
||
movaps @tweak[2],@tweak[3]
|
||
movdqa $twres,$twtmp
|
||
aesdec $rndkey1,$inout5
|
||
$movkey -32($key),$rndkey1
|
||
|
||
paddd $twres,$twres
|
||
aesdec $rndkey0,$inout0
|
||
pxor @tweak[5],@tweak[2]
|
||
psrad \$31,$twtmp
|
||
aesdec $rndkey0,$inout1
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$twtmp
|
||
aesdec $rndkey0,$inout2
|
||
aesdec $rndkey0,$inout3
|
||
aesdec $rndkey0,$inout4
|
||
pxor $twtmp,@tweak[5]
|
||
movaps @tweak[3],@tweak[4]
|
||
aesdec $rndkey0,$inout5
|
||
|
||
movdqa $twres,$rndkey0
|
||
paddd $twres,$twres
|
||
aesdec $rndkey1,$inout0
|
||
pxor @tweak[5],@tweak[3]
|
||
psrad \$31,$rndkey0
|
||
aesdec $rndkey1,$inout1
|
||
paddq @tweak[5],@tweak[5]
|
||
pand $twmask,$rndkey0
|
||
aesdec $rndkey1,$inout2
|
||
aesdec $rndkey1,$inout3
|
||
pxor $rndkey0,@tweak[5]
|
||
$movkey ($key_),$rndkey0
|
||
aesdec $rndkey1,$inout4
|
||
aesdec $rndkey1,$inout5
|
||
$movkey 16($key_),$rndkey1
|
||
|
||
pxor @tweak[5],@tweak[4]
|
||
aesdeclast `16*0`(%rsp),$inout0
|
||
psrad \$31,$twres
|
||
paddq @tweak[5],@tweak[5]
|
||
aesdeclast `16*1`(%rsp),$inout1
|
||
aesdeclast `16*2`(%rsp),$inout2
|
||
pand $twmask,$twres
|
||
mov %r10,%rax # restore $rounds
|
||
aesdeclast `16*3`(%rsp),$inout3
|
||
aesdeclast `16*4`(%rsp),$inout4
|
||
aesdeclast `16*5`(%rsp),$inout5
|
||
pxor $twres,@tweak[5]
|
||
|
||
lea `16*6`($out),$out # $out+=6*16
|
||
movups $inout0,`-16*6`($out) # store 6 output blocks
|
||
movups $inout1,`-16*5`($out)
|
||
movups $inout2,`-16*4`($out)
|
||
movups $inout3,`-16*3`($out)
|
||
movups $inout4,`-16*2`($out)
|
||
movups $inout5,`-16*1`($out)
|
||
sub \$16*6,$len
|
||
jnc .Lxts_dec_grandloop # loop if $len-=6*16 didn't borrow
|
||
|
||
mov \$16+96,$rounds
|
||
sub $rnds_,$rounds
|
||
mov $key_,$key # restore $key
|
||
shr \$4,$rounds # restore original value
|
||
|
||
.Lxts_dec_short:
|
||
# at this point @tweak[0..5] are populated with tweak values
|
||
mov $rounds,$rnds_ # backup $rounds
|
||
pxor $rndkey0,@tweak[0]
|
||
pxor $rndkey0,@tweak[1]
|
||
add \$16*6,$len # restore real remaining $len
|
||
jz .Lxts_dec_done # done if ($len==0)
|
||
|
||
pxor $rndkey0,@tweak[2]
|
||
cmp \$0x20,$len
|
||
jb .Lxts_dec_one # $len is 1*16
|
||
pxor $rndkey0,@tweak[3]
|
||
je .Lxts_dec_two # $len is 2*16
|
||
|
||
pxor $rndkey0,@tweak[4]
|
||
cmp \$0x40,$len
|
||
jb .Lxts_dec_three # $len is 3*16
|
||
je .Lxts_dec_four # $len is 4*16
|
||
|
||
movdqu ($inp),$inout0 # $len is 5*16
|
||
movdqu 16*1($inp),$inout1
|
||
movdqu 16*2($inp),$inout2
|
||
pxor @tweak[0],$inout0
|
||
movdqu 16*3($inp),$inout3
|
||
pxor @tweak[1],$inout1
|
||
movdqu 16*4($inp),$inout4
|
||
lea 16*5($inp),$inp # $inp+=5*16
|
||
pxor @tweak[2],$inout2
|
||
pxor @tweak[3],$inout3
|
||
pxor @tweak[4],$inout4
|
||
|
||
call _aesni_decrypt6
|
||
|
||
xorps @tweak[0],$inout0
|
||
xorps @tweak[1],$inout1
|
||
xorps @tweak[2],$inout2
|
||
movdqu $inout0,($out) # store 5 output blocks
|
||
xorps @tweak[3],$inout3
|
||
movdqu $inout1,16*1($out)
|
||
xorps @tweak[4],$inout4
|
||
movdqu $inout2,16*2($out)
|
||
pxor $twtmp,$twtmp
|
||
movdqu $inout3,16*3($out)
|
||
pcmpgtd @tweak[5],$twtmp
|
||
movdqu $inout4,16*4($out)
|
||
lea 16*5($out),$out # $out+=5*16
|
||
pshufd \$0x13,$twtmp,@tweak[1] # $twres
|
||
and \$15,$len_
|
||
jz .Lxts_dec_ret
|
||
|
||
movdqa @tweak[5],@tweak[0]
|
||
paddq @tweak[5],@tweak[5] # psllq 1,$tweak
|
||
pand $twmask,@tweak[1] # isolate carry and residue
|
||
pxor @tweak[5],@tweak[1]
|
||
jmp .Lxts_dec_done2
|
||
|
||
.align 16
|
||
.Lxts_dec_one:
|
||
movups ($inp),$inout0
|
||
lea 16*1($inp),$inp # $inp+=1*16
|
||
xorps @tweak[0],$inout0
|
||
___
|
||
&aesni_generate1("dec",$key,$rounds);
|
||
$code.=<<___;
|
||
xorps @tweak[0],$inout0
|
||
movdqa @tweak[1],@tweak[0]
|
||
movups $inout0,($out) # store one output block
|
||
movdqa @tweak[2],@tweak[1]
|
||
lea 16*1($out),$out # $out+=1*16
|
||
jmp .Lxts_dec_done
|
||
|
||
.align 16
|
||
.Lxts_dec_two:
|
||
movups ($inp),$inout0
|
||
movups 16($inp),$inout1
|
||
lea 32($inp),$inp # $inp+=2*16
|
||
xorps @tweak[0],$inout0
|
||
xorps @tweak[1],$inout1
|
||
|
||
call _aesni_decrypt2
|
||
|
||
xorps @tweak[0],$inout0
|
||
movdqa @tweak[2],@tweak[0]
|
||
xorps @tweak[1],$inout1
|
||
movdqa @tweak[3],@tweak[1]
|
||
movups $inout0,($out) # store 2 output blocks
|
||
movups $inout1,16*1($out)
|
||
lea 16*2($out),$out # $out+=2*16
|
||
jmp .Lxts_dec_done
|
||
|
||
.align 16
|
||
.Lxts_dec_three:
|
||
movups ($inp),$inout0
|
||
movups 16*1($inp),$inout1
|
||
movups 16*2($inp),$inout2
|
||
lea 16*3($inp),$inp # $inp+=3*16
|
||
xorps @tweak[0],$inout0
|
||
xorps @tweak[1],$inout1
|
||
xorps @tweak[2],$inout2
|
||
|
||
call _aesni_decrypt3
|
||
|
||
xorps @tweak[0],$inout0
|
||
movdqa @tweak[3],@tweak[0]
|
||
xorps @tweak[1],$inout1
|
||
movdqa @tweak[4],@tweak[1]
|
||
xorps @tweak[2],$inout2
|
||
movups $inout0,($out) # store 3 output blocks
|
||
movups $inout1,16*1($out)
|
||
movups $inout2,16*2($out)
|
||
lea 16*3($out),$out # $out+=3*16
|
||
jmp .Lxts_dec_done
|
||
|
||
.align 16
|
||
.Lxts_dec_four:
|
||
movups ($inp),$inout0
|
||
movups 16*1($inp),$inout1
|
||
movups 16*2($inp),$inout2
|
||
xorps @tweak[0],$inout0
|
||
movups 16*3($inp),$inout3
|
||
lea 16*4($inp),$inp # $inp+=4*16
|
||
xorps @tweak[1],$inout1
|
||
xorps @tweak[2],$inout2
|
||
xorps @tweak[3],$inout3
|
||
|
||
call _aesni_decrypt4
|
||
|
||
pxor @tweak[0],$inout0
|
||
movdqa @tweak[4],@tweak[0]
|
||
pxor @tweak[1],$inout1
|
||
movdqa @tweak[5],@tweak[1]
|
||
pxor @tweak[2],$inout2
|
||
movdqu $inout0,($out) # store 4 output blocks
|
||
pxor @tweak[3],$inout3
|
||
movdqu $inout1,16*1($out)
|
||
movdqu $inout2,16*2($out)
|
||
movdqu $inout3,16*3($out)
|
||
lea 16*4($out),$out # $out+=4*16
|
||
jmp .Lxts_dec_done
|
||
|
||
.align 16
|
||
.Lxts_dec_done:
|
||
and \$15,$len_ # see if $len%16 is 0
|
||
jz .Lxts_dec_ret
|
||
.Lxts_dec_done2:
|
||
mov $len_,$len
|
||
mov $key_,$key # restore $key
|
||
mov $rnds_,$rounds # restore $rounds
|
||
|
||
movups ($inp),$inout0
|
||
xorps @tweak[1],$inout0
|
||
___
|
||
&aesni_generate1("dec",$key,$rounds);
|
||
# Ciphertext stealing for a trailing partial block.
#  - The block just processed under @tweak[1] is unmasked and stored at
#    ($out).
#  - .Lxts_dec_steal then runs $len times, one byte per iteration: the input
#    byte at 16($inp) replaces the byte at ($out), and the byte that was at
#    ($out) is moved to 16($out).
#  - %eax and %ecx serve as scratch (they alias $rounds and $key per the
#    inline comments), so both are reloaded from their backups afterwards.
#  - $out is rewound by $len_, and the reassembled block is loaded and masked
#    with @tweak[0] for the single-block decryption emitted next.
$code.=<<___;
|
||
xorps @tweak[1],$inout0
|
||
movups $inout0,($out)
|
||
|
||
.Lxts_dec_steal:
|
||
movzb 16($inp),%eax # borrow $rounds ...
|
||
movzb ($out),%ecx # ... and $key
|
||
lea 1($inp),$inp
|
||
mov %al,($out)
|
||
mov %cl,16($out)
|
||
lea 1($out),$out
|
||
sub \$1,$len
|
||
jnz .Lxts_dec_steal
|
||
|
||
sub $len_,$out # rewind $out
|
||
mov $key_,$key # restore $key
|
||
mov $rnds_,$rounds # restore $rounds
|
||
|
||
movups ($out),$inout0
|
||
xorps @tweak[0],$inout0
|
||
___
|
||
&aesni_generate1("dec",$key,$rounds);
|
||
# Finish the stolen block and start the common exit path.
# The @tweak[0] mask is removed from $inout0 and the block is stored at
# ($out). .Lxts_dec_ret (also the target when there is no partial tail)
# then begins scrubbing key-dependent state by zeroing %xmm0-%xmm5; the
# zeroed %xmm0 is reused by the following code to wipe the stack.
$code.=<<___;
|
||
xorps @tweak[0],$inout0
|
||
movups $inout0,($out)
|
||
|
||
.Lxts_dec_ret:
|
||
xorps %xmm0,%xmm0 # clear register bank
|
||
pxor %xmm1,%xmm1
|
||
pxor %xmm2,%xmm2
|
||
pxor %xmm3,%xmm3
|
||
pxor %xmm4,%xmm4
|
||
pxor %xmm5,%xmm5
|
||
___
|
||
# Non-Win64 exit scrub: zero %xmm6-%xmm15 as well, and overwrite the seven
# 16-byte scratch slots at 0x00..0x60(%rsp) with the already-zeroed %xmm0.
# Register clears and stack stores are interleaved; they are independent of
# each other.
$code.=<<___ if (!$win64);
|
||
pxor %xmm6,%xmm6
|
||
pxor %xmm7,%xmm7
|
||
movaps %xmm0,0x00(%rsp) # clear stack
|
||
pxor %xmm8,%xmm8
|
||
movaps %xmm0,0x10(%rsp)
|
||
pxor %xmm9,%xmm9
|
||
movaps %xmm0,0x20(%rsp)
|
||
pxor %xmm10,%xmm10
|
||
movaps %xmm0,0x30(%rsp)
|
||
pxor %xmm11,%xmm11
|
||
movaps %xmm0,0x40(%rsp)
|
||
pxor %xmm12,%xmm12
|
||
movaps %xmm0,0x50(%rsp)
|
||
pxor %xmm13,%xmm13
|
||
movaps %xmm0,0x60(%rsp)
|
||
pxor %xmm14,%xmm14
|
||
pxor %xmm15,%xmm15
|
||
___
|
||
# Win64 exit path: reload %xmm6-%xmm15 from the ten 16-byte save slots at
# -0xa0..-0x10(%rbp), overwriting each slot with the zeroed %xmm0 right
# after it has been read. The seven 16-byte scratch slots at
# 0x00..0x60(%rsp) are then wiped the same way.
$code.=<<___ if ($win64);
|
||
movaps -0xa0(%rbp),%xmm6
|
||
movaps %xmm0,-0xa0(%rbp) # clear stack
|
||
movaps -0x90(%rbp),%xmm7
|
||
movaps %xmm0,-0x90(%rbp)
|
||
movaps -0x80(%rbp),%xmm8
|
||
movaps %xmm0,-0x80(%rbp)
|
||
movaps -0x70(%rbp),%xmm9
|
||
movaps %xmm0,-0x70(%rbp)
|
||
movaps -0x60(%rbp),%xmm10
|
||
movaps %xmm0,-0x60(%rbp)
|
||
movaps -0x50(%rbp),%xmm11
|
||
movaps %xmm0,-0x50(%rbp)
|
||
movaps -0x40(%rbp),%xmm12
|
||
movaps %xmm0,-0x40(%rbp)
|
||
movaps -0x30(%rbp),%xmm13
|
||
movaps %xmm0,-0x30(%rbp)
|
||
movaps -0x20(%rbp),%xmm14
|
||
movaps %xmm0,-0x20(%rbp)
|
||
movaps -0x10(%rbp),%xmm15
|
||
movaps %xmm0,-0x10(%rbp)
|
||
movaps %xmm0,0x00(%rsp)
|
||
movaps %xmm0,0x10(%rsp)
|
||
movaps %xmm0,0x20(%rsp)
|
||
movaps %xmm0,0x30(%rsp)
|
||
movaps %xmm0,0x40(%rsp)
|
||
movaps %xmm0,0x50(%rsp)
|
||
movaps %xmm0,0x60(%rsp)
|
||
___
|
||
# Epilogue: point %rsp back at the saved-%rbp slot (%rbp was set to entry
# %rsp - 8 in the prologue), pop the caller's %rbp and return.
# .Lxts_dec_epilogue labels the ret instruction.
# NOTE(review): presumably referenced by unwind/SEH tables elsewhere in
# the file -- confirm there.
# .size closes the aesni_xts_decrypt symbol.
$code.=<<___;
|
||
lea (%rbp),%rsp
|
||
pop %rbp
|
||
.Lxts_dec_epilogue:
|
||
ret
|
||
.size aesni_xts_decrypt,.-aesni_xts_decrypt
|
||
___
|
||
} }}
|
||
|
||
########################################################################
|
||
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
|
||
# size_t length, const AES_KEY *key,
|
||
# unsigned char *ivp,const int enc);
|
||
{
|
||
my $frame_size = 0x10 + ($win64?0xa0:0); # used in decrypt
|
||
my ($iv,$in0,$in1,$in2,$in3,$in4)=map("%xmm$_",(10..15));
|
||
my $inp_=$key_;
|
||
|
||
$code.=<<___;
|
||
.globl ${PREFIX}_cbc_encrypt
|
||
.type ${PREFIX}_cbc_encrypt,\@function,6
|
||
.align 16
|
||
${PREFIX}_cbc_encrypt:
|
||
test $len,$len # check length
|
||
jz .Lcbc_ret
|
||
|
||
mov 240($key),$rnds_ # key->rounds
|
||
mov $key,$key_ # backup $key
|
||
test %r9d,%r9d # 6th argument
|
||
jz .Lcbc_decrypt
|
||
#--------------------------- CBC ENCRYPT ------------------------------#
|
||
movups ($ivp),$inout0 # load iv as initial state
|
||
mov $rnds_,$rounds
|
||
cmp \$16,$len
|
||
jb .Lcbc_enc_tail
|
||
sub \$16,$len
|
||
jmp .Lcbc_enc_loop
|
||
.align 16
|
||
.Lcbc_enc_loop:
|
||
movups ($inp),$inout1 # load input
|
||
lea 16($inp),$inp
|
||
#xorps $inout1,$inout0
|
||
___
|
||
&aesni_generate1("enc",$key,$rounds,$inout0,$inout1);
|
||
$code.=<<___;
|
||
mov $rnds_,$rounds # restore $rounds
|
||
mov $key_,$key # restore $key
|
||
movups $inout0,0($out) # store output
|
||
lea 16($out),$out
|
||
sub \$16,$len
|
||
jnc .Lcbc_enc_loop
|
||
add \$16,$len
|
||
jnz .Lcbc_enc_tail
|
||
pxor $rndkey0,$rndkey0 # clear register bank
|
||
pxor $rndkey1,$rndkey1
|
||
movups $inout0,($ivp)
|
||
pxor $inout0,$inout0
|
||
pxor $inout1,$inout1
|
||
jmp .Lcbc_ret
|
||
|
||
.Lcbc_enc_tail:
|
||
mov $len,%rcx # zaps $key
|
||
xchg $inp,$out # $inp is %rsi and $out is %rdi now
|
||
.long 0x9066A4F3 # rep movsb
|
||
mov \$16,%ecx # zero tail
|
||
sub $len,%rcx
|
||
xor %eax,%eax
|
||
.long 0x9066AAF3 # rep stosb
|
||
lea -16(%rdi),%rdi # rewind $out by 1 block
|
||
mov $rnds_,$rounds # restore $rounds
|
||
mov %rdi,%rsi # $inp and $out are the same
|
||
mov $key_,$key # restore $key
|
||
xor $len,$len # $len=0, biased by -16: exactly one block (16 bytes) left
|
||
jmp .Lcbc_enc_loop # one more spin
|
||
#--------------------------- CBC DECRYPT ------------------------------#
|
||
.align 16
|
||
.Lcbc_decrypt:
|
||
cmp \$16,$len
|
||
jne .Lcbc_decrypt_bulk
|
||
|
||
# handle single block without allocating stack frame,
|
||
# useful in ciphertext stealing mode
|
||
movdqu ($inp),$inout0 # load input
|
||
movdqu ($ivp),$inout1 # load iv
|
||
movdqa $inout0,$inout2 # future iv
|
||
___
|
||
&aesni_generate1("dec",$key,$rnds_);
|
||
$code.=<<___;
|
||
pxor $rndkey0,$rndkey0 # clear register bank
|
||
pxor $rndkey1,$rndkey1
|
||
movdqu $inout2,($ivp) # store iv
|
||
xorps $inout1,$inout0 # ^=iv
|
||
pxor $inout1,$inout1
|
||
movups $inout0,($out) # store output
|
||
pxor $inout0,$inout0
|
||
jmp .Lcbc_ret
|
||
.align 16
|
||
.Lcbc_decrypt_bulk:
|
||
lea (%rsp),%rax
|
||
push %rbp
|
||
sub \$$frame_size,%rsp
|
||
and \$-16,%rsp # Linux kernel stack can be incorrectly seeded
|
||
___
|
||
$code.=<<___ if ($win64);
|
||
movaps %xmm6,0x10(%rsp)
|
||
movaps %xmm7,0x20(%rsp)
|
||
movaps %xmm8,0x30(%rsp)
|
||
movaps %xmm9,0x40(%rsp)
|
||
movaps %xmm10,0x50(%rsp)
|
||
movaps %xmm11,0x60(%rsp)
|
||
movaps %xmm12,0x70(%rsp)
|
||
movaps %xmm13,0x80(%rsp)
|
||
movaps %xmm14,0x90(%rsp)
|
||
movaps %xmm15,0xa0(%rsp)
|
||
.Lcbc_decrypt_body:
|
||
___
|
||
$code.=<<___;
|
||
lea -8(%rax),%rbp
|
||
movups ($ivp),$iv
|
||
mov $rnds_,$rounds
|
||
cmp \$0x50,$len
|
||
jbe .Lcbc_dec_tail
|
||
|
||
$movkey ($key),$rndkey0
|
||
movdqu 0x00($inp),$inout0 # load input
|
||
movdqu 0x10($inp),$inout1
|
||
movdqa $inout0,$in0
|
||
movdqu 0x20($inp),$inout2
|
||
movdqa $inout1,$in1
|
||
movdqu 0x30($inp),$inout3
|
||
movdqa $inout2,$in2
|
||
movdqu 0x40($inp),$inout4
|
||
movdqa $inout3,$in3
|
||
movdqu 0x50($inp),$inout5
|
||
movdqa $inout4,$in4
|
||
mov OPENSSL_ia32cap_P+4(%rip),%r9d
|
||
cmp \$0x70,$len
|
||
jbe .Lcbc_dec_six_or_seven
|
||
|
||
and \$`1<<26|1<<22`,%r9d # isolate XSAVE+MOVBE
|
||
sub \$0x50,$len # $len is biased by -5*16
|
||
cmp \$`1<<22`,%r9d # check for MOVBE without XSAVE
|
||
je .Lcbc_dec_loop6_enter # [which denotes Atom Silvermont]
|
||
sub \$0x20,$len # $len is biased by -7*16
|
||
lea 0x70($key),$key # size optimization
|
||
jmp .Lcbc_dec_loop8_enter
|
||
.align 16
|
||
.Lcbc_dec_loop8:
|
||
movups $inout7,($out)
|
||
lea 0x10($out),$out
|
||
.Lcbc_dec_loop8_enter:
|
||
movdqu 0x60($inp),$inout6
|
||
pxor $rndkey0,$inout0
|
||
movdqu 0x70($inp),$inout7
|
||
pxor $rndkey0,$inout1
|
||
$movkey 0x10-0x70($key),$rndkey1
|
||
pxor $rndkey0,$inout2
|
||
xor $inp_,$inp_
|
||
cmp \$0x70,$len # are there at least 0x60 bytes ahead?
|
||
pxor $rndkey0,$inout3
|
||
pxor $rndkey0,$inout4
|
||
pxor $rndkey0,$inout5
|
||
pxor $rndkey0,$inout6
|
||
|
||
aesdec $rndkey1,$inout0
|
||
pxor $rndkey0,$inout7
|
||
$movkey 0x20-0x70($key),$rndkey0
|
||
aesdec $rndkey1,$inout1
|
||
aesdec $rndkey1,$inout2
|
||
aesdec $rndkey1,$inout3
|
||
aesdec $rndkey1,$inout4
|
||
aesdec $rndkey1,$inout5
|
||
aesdec $rndkey1,$inout6
|
||
setnc ${inp_}b
|
||
shl \$7,$inp_
|
||
aesdec $rndkey1,$inout7
|
||
add $inp,$inp_
|
||
$movkey 0x30-0x70($key),$rndkey1
|
||
___
|
||
for($i=1;$i<12;$i++) {
|
||
my $rndkeyx = ($i&1)?$rndkey0:$rndkey1;
|
||
$code.=<<___ if ($i==7);
|
||
cmp \$11,$rounds
|
||
___
|
||
$code.=<<___;
|
||
aesdec $rndkeyx,$inout0
|
||
aesdec $rndkeyx,$inout1
|
||
aesdec $rndkeyx,$inout2
|
||
aesdec $rndkeyx,$inout3
|
||
aesdec $rndkeyx,$inout4
|
||
aesdec $rndkeyx,$inout5
|
||
aesdec $rndkeyx,$inout6
|
||
aesdec $rndkeyx,$inout7
|
||
$movkey `0x30+0x10*$i`-0x70($key),$rndkeyx
|
||
___
|
||
$code.=<<___ if ($i<6 || (!($i&1) && $i>7));
|
||
nop
|
||
___
|
||
$code.=<<___ if ($i==7);
|
||
jb .Lcbc_dec_done
|
||
___
|
||
$code.=<<___ if ($i==9);
|
||
je .Lcbc_dec_done
|
||
___
|
||
$code.=<<___ if ($i==11);
|
||
jmp .Lcbc_dec_done
|
||
___
|
||
}
|
||
$code.=<<___;
|
||
.align 16
|
||
.Lcbc_dec_done:
|
||
aesdec $rndkey1,$inout0
|
||
aesdec $rndkey1,$inout1
|
||
pxor $rndkey0,$iv
|
||
pxor $rndkey0,$in0
|
||
aesdec $rndkey1,$inout2
|
||
aesdec $rndkey1,$inout3
|
||
pxor $rndkey0,$in1
|
||
pxor $rndkey0,$in2
|
||
aesdec $rndkey1,$inout4
|
||
aesdec $rndkey1,$inout5
|
||
pxor $rndkey0,$in3
|
||
pxor $rndkey0,$in4
|
||
aesdec $rndkey1,$inout6
|
||
aesdec $rndkey1,$inout7
|
||
movdqu 0x50($inp),$rndkey1
|
||
|
||
aesdeclast $iv,$inout0
|
||
movdqu 0x60($inp),$iv # borrow $iv
|
||
pxor $rndkey0,$rndkey1
|
||
aesdeclast $in0,$inout1
|
||
pxor $rndkey0,$iv
|
||
movdqu 0x70($inp),$rndkey0 # next IV
|
||
aesdeclast $in1,$inout2
|
||
lea 0x80($inp),$inp
|
||
movdqu 0x00($inp_),$in0
|
||
aesdeclast $in2,$inout3
|
||
aesdeclast $in3,$inout4
|
||
movdqu 0x10($inp_),$in1
|
||
movdqu 0x20($inp_),$in2
|
||
aesdeclast $in4,$inout5
|
||
aesdeclast $rndkey1,$inout6
|
||
movdqu 0x30($inp_),$in3
|
||
movdqu 0x40($inp_),$in4
|
||
aesdeclast $iv,$inout7
|
||
movdqa $rndkey0,$iv # return $iv
|
||
movdqu 0x50($inp_),$rndkey1
|
||
$movkey -0x70($key),$rndkey0
|
||
|
||
movups $inout0,($out) # store output
|
||
movdqa $in0,$inout0
|
||
movups $inout1,0x10($out)
|
||
movdqa $in1,$inout1
|
||
movups $inout2,0x20($out)
|
||
movdqa $in2,$inout2
|
||
movups $inout3,0x30($out)
|
||
movdqa $in3,$inout3
|
||
movups $inout4,0x40($out)
|
||
movdqa $in4,$inout4
|
||
movups $inout5,0x50($out)
|
||
movdqa $rndkey1,$inout5
|
||
movups $inout6,0x60($out)
|
||
lea 0x70($out),$out
|
||
|
||
sub \$0x80,$len
|
||
ja .Lcbc_dec_loop8
|
||
|
||
movaps $inout7,$inout0
|
||
lea -0x70($key),$key
|
||
add \$0x70,$len
|
||
jle .Lcbc_dec_clear_tail_collected
|
||
movups $inout7,($out)
|
||
lea 0x10($out),$out
|
||
cmp \$0x50,$len
|
||
jbe .Lcbc_dec_tail
|
||
|
||
movaps $in0,$inout0
|
||
.Lcbc_dec_six_or_seven:
|
||
cmp \$0x60,$len
|
||
ja .Lcbc_dec_seven
|
||
|
||
movaps $inout5,$inout6
|
||
call _aesni_decrypt6
|
||
pxor $iv,$inout0 # ^= IV
|
||
movaps $inout6,$iv
|
||
pxor $in0,$inout1
|
||
movdqu $inout0,($out)
|
||
pxor $in1,$inout2
|
||
movdqu $inout1,0x10($out)
|
||
pxor $inout1,$inout1 # clear register bank
|
||
pxor $in2,$inout3
|
||
movdqu $inout2,0x20($out)
|
||
pxor $inout2,$inout2
|
||
pxor $in3,$inout4
|
||
movdqu $inout3,0x30($out)
|
||
pxor $inout3,$inout3
|
||
pxor $in4,$inout5
|
||
movdqu $inout4,0x40($out)
|
||
pxor $inout4,$inout4
|
||
lea 0x50($out),$out
|
||
movdqa $inout5,$inout0
|
||
pxor $inout5,$inout5
|
||
jmp .Lcbc_dec_tail_collected
|
||
|
||
.align 16
|
||
.Lcbc_dec_seven:
|
||
movups 0x60($inp),$inout6
|
||
xorps $inout7,$inout7
|
||
call _aesni_decrypt8
|
||
movups 0x50($inp),$inout7
|
||
pxor $iv,$inout0 # ^= IV
|
||
movups 0x60($inp),$iv
|
||
pxor $in0,$inout1
|
||
movdqu $inout0,($out)
|
||
pxor $in1,$inout2
|
||
movdqu $inout1,0x10($out)
|
||
pxor $inout1,$inout1 # clear register bank
|
||
pxor $in2,$inout3
|
||
movdqu $inout2,0x20($out)
|
||
pxor $inout2,$inout2
|
||
pxor $in3,$inout4
|
||
movdqu $inout3,0x30($out)
|
||
pxor $inout3,$inout3
|
||
pxor $in4,$inout5
|
||
movdqu $inout4,0x40($out)
|
||
pxor $inout4,$inout4
|
||
pxor $inout7,$inout6
|
||
movdqu $inout5,0x50($out)
|
||
pxor $inout5,$inout5
|
||
lea 0x60($out),$out
|
||
movdqa $inout6,$inout0
|
||
pxor $inout6,$inout6
|
||
pxor $inout7,$inout7
|
||
jmp .Lcbc_dec_tail_collected
|
||
|
||
.align 16
|
||
.Lcbc_dec_loop6:
|
||
movups $inout5,($out)
|
||
lea 0x10($out),$out
|
||
movdqu 0x00($inp),$inout0 # load input
|
||
movdqu 0x10($inp),$inout1
|
||
movdqa $inout0,$in0
|
||
movdqu 0x20($inp),$inout2
|
||
movdqa $inout1,$in1
|
||
movdqu 0x30($inp),$inout3
|
||
movdqa $inout2,$in2
|
||
movdqu 0x40($inp),$inout4
|
||
movdqa $inout3,$in3
|
||
movdqu 0x50($inp),$inout5
|
||
movdqa $inout4,$in4
|
||
.Lcbc_dec_loop6_enter:
|
||
lea 0x60($inp),$inp
|
||
movdqa $inout5,$inout6
|
||
|
||
call _aesni_decrypt6
|
||
|
||
pxor $iv,$inout0 # ^= IV
|
||
movdqa $inout6,$iv
|
||
pxor $in0,$inout1
|
||
movdqu $inout0,($out)
|
||
pxor $in1,$inout2
|
||
movdqu $inout1,0x10($out)
|
||
pxor $in2,$inout3
|
||
movdqu $inout2,0x20($out)
|
||
pxor $in3,$inout4
|
||
mov $key_,$key
|
||
movdqu $inout3,0x30($out)
|
||
pxor $in4,$inout5
|
||
mov $rnds_,$rounds
|
||
movdqu $inout4,0x40($out)
|
||
lea 0x50($out),$out
|
||
sub \$0x60,$len
|
||
ja .Lcbc_dec_loop6
|
||
|
||
movdqa $inout5,$inout0
|
||
add \$0x50,$len
|
||
jle .Lcbc_dec_clear_tail_collected
|
||
movups $inout5,($out)
|
||
lea 0x10($out),$out
|
||
|
||
.Lcbc_dec_tail:
|
||
movups ($inp),$inout0
|
||
sub \$0x10,$len
|
||
jbe .Lcbc_dec_one # $len is 1*16 or less
|
||
|
||
movups 0x10($inp),$inout1
|
||
movaps $inout0,$in0
|
||
sub \$0x10,$len
|
||
jbe .Lcbc_dec_two # $len is 2*16 or less
|
||
|
||
movups 0x20($inp),$inout2
|
||
movaps $inout1,$in1
|
||
sub \$0x10,$len
|
||
jbe .Lcbc_dec_three # $len is 3*16 or less
|
||
|
||
movups 0x30($inp),$inout3
|
||
movaps $inout2,$in2
|
||
sub \$0x10,$len
|
||
jbe .Lcbc_dec_four # $len is 4*16 or less
|
||
|
||
movups 0x40($inp),$inout4 # $len is 5*16 or less
|
||
movaps $inout3,$in3
|
||
movaps $inout4,$in4
|
||
xorps $inout5,$inout5
|
||
call _aesni_decrypt6
|
||
pxor $iv,$inout0
|
||
movaps $in4,$iv
|
||
pxor $in0,$inout1
|
||
movdqu $inout0,($out)
|
||
pxor $in1,$inout2
|
||
movdqu $inout1,0x10($out)
|
||
pxor $inout1,$inout1 # clear register bank
|
||
pxor $in2,$inout3
|
||
movdqu $inout2,0x20($out)
|
||
pxor $inout2,$inout2
|
||
pxor $in3,$inout4
|
||
movdqu $inout3,0x30($out)
|
||
pxor $inout3,$inout3
|
||
lea 0x40($out),$out
|
||
movdqa $inout4,$inout0
|
||
pxor $inout4,$inout4
|
||
pxor $inout5,$inout5
|
||
sub \$0x10,$len
|
||
jmp .Lcbc_dec_tail_collected
|
||
|
||
.align 16
|
||
.Lcbc_dec_one:
|
||
movaps $inout0,$in0
|
||
___
|
||
&aesni_generate1("dec",$key,$rounds);
|
||
$code.=<<___;
|
||
xorps $iv,$inout0
|
||
movaps $in0,$iv
|
||
jmp .Lcbc_dec_tail_collected
|
||
.align 16
|
||
.Lcbc_dec_two:
|
||
movaps $inout1,$in1
|
||
call _aesni_decrypt2
|
||
pxor $iv,$inout0
|
||
movaps $in1,$iv
|
||
pxor $in0,$inout1
|
||
movdqu $inout0,($out)
|
||
movdqa $inout1,$inout0
|
||
pxor $inout1,$inout1 # clear register bank
|
||
lea 0x10($out),$out
|
||
jmp .Lcbc_dec_tail_collected
|
||
.align 16
|
||
.Lcbc_dec_three:
|
||
movaps $inout2,$in2
|
||
call _aesni_decrypt3
|
||
pxor $iv,$inout0
|
||
movaps $in2,$iv
|
||
pxor $in0,$inout1
|
||
movdqu $inout0,($out)
|
||
pxor $in1,$inout2
|
||
movdqu $inout1,0x10($out)
|
||
pxor $inout1,$inout1 # clear register bank
|
||
movdqa $inout2,$inout0
|
||
pxor $inout2,$inout2
|
||
lea 0x20($out),$out
|
||
jmp .Lcbc_dec_tail_collected
|
||
.align 16
|
||
.Lcbc_dec_four:
|
||
movaps $inout3,$in3
|
||
call _aesni_decrypt4
|
||
pxor $iv,$inout0
|
||
movaps $in3,$iv
|
||
pxor $in0,$inout1
|
||
movdqu $inout0,($out)
|
||
pxor $in1,$inout2
|
||
movdqu $inout1,0x10($out)
|
||
pxor $inout1,$inout1 # clear register bank
|
||
pxor $in2,$inout3
|
||
movdqu $inout2,0x20($out)
|
||
pxor $inout2,$inout2
|
||
movdqa $inout3,$inout0
|
||
pxor $inout3,$inout3
|
||
lea 0x30($out),$out
|
||
jmp .Lcbc_dec_tail_collected
|
||
|
||
.align 16
|
||
.Lcbc_dec_clear_tail_collected:
|
||
pxor $inout1,$inout1 # clear register bank
|
||
pxor $inout2,$inout2
|
||
pxor $inout3,$inout3
|
||
___
|
||
$code.=<<___ if (!$win64);
|
||
pxor $inout4,$inout4 # %xmm6..9
|
||
pxor $inout5,$inout5
|
||
pxor $inout6,$inout6
|
||
pxor $inout7,$inout7
|
||
___
|
||
$code.=<<___;
|
||
.Lcbc_dec_tail_collected:
|
||
movups $iv,($ivp)
|
||
and \$15,$len
|
||
jnz .Lcbc_dec_tail_partial
|
||
movups $inout0,($out)
|
||
pxor $inout0,$inout0
|
||
jmp .Lcbc_dec_ret
|
||
.align 16
|
||
.Lcbc_dec_tail_partial:
|
||
movaps $inout0,(%rsp)
|
||
pxor $inout0,$inout0
|
||
mov \$16,%rcx
|
||
mov $out,%rdi
|
||
sub $len,%rcx
|
||
lea (%rsp),%rsi
|
||
.long 0x9066A4F3 # rep movsb
|
||
movdqa $inout0,(%rsp)
|
||
|
||
.Lcbc_dec_ret:
|
||
xorps $rndkey0,$rndkey0 # %xmm0
|
||
pxor $rndkey1,$rndkey1
|
||
___
|
||
$code.=<<___ if ($win64);
|
||
movaps 0x10(%rsp),%xmm6
|
||
movaps %xmm0,0x10(%rsp) # clear stack
|
||
movaps 0x20(%rsp),%xmm7
|
||
movaps %xmm0,0x20(%rsp)
|
||
movaps 0x30(%rsp),%xmm8
|
||
movaps %xmm0,0x30(%rsp)
|
||
movaps 0x40(%rsp),%xmm9
|
||
movaps %xmm0,0x40(%rsp)
|
||
movaps 0x50(%rsp),%xmm10
|
||
movaps %xmm0,0x50(%rsp)
|
||
movaps 0x60(%rsp),%xmm11
|
||
movaps %xmm0,0x60(%rsp)
|
||
movaps 0x70(%rsp),%xmm12
|
||
movaps %xmm0,0x70(%rsp)
|
||
movaps 0x80(%rsp),%xmm13
|
||
movaps %xmm0,0x80(%rsp)
|
||
movaps 0x90(%rsp),%xmm14
|
||
movaps %xmm0,0x90(%rsp)
|
||
movaps 0xa0(%rsp),%xmm15
|
||
movaps %xmm0,0xa0(%rsp)
|
||
___
|
||
$code.=<<___;
|
||
lea (%rbp),%rsp
|
||
pop %rbp
|
||
.Lcbc_ret:
|
||
ret
|
||
.size ${PREFIX}_cbc_encrypt,.-${PREFIX}_cbc_encrypt
|
||
___
|
||
}
|
||
# Key-schedule setup. The block below emits ${PREFIX}_set_decrypt_key and
# ${PREFIX}_set_encrypt_key; the latter shares its entry point with the
# internal label __aesni_set_encrypt_key.
#
# ${PREFIX}_set_decrypt_key calls __aesni_set_encrypt_key and, when that
# left 0 in %eax, converts the schedule in place: the first and last round
# keys are swapped unchanged, the remaining pairs are swapped after each
# has been passed through aesimc, and the middle round key is passed
# through aesimc on its own. A non-zero %eax is returned to the caller
# without touching the schedule further.
#
# int ${PREFIX}_set_decrypt_key(const unsigned char *inp,
|
||
# int bits, AES_KEY *key)
|
||
#
|
||
# input: $inp user-supplied key
|
||
# $bits $inp length in bits
|
||
# $key pointer to key schedule
|
||
# output: %eax 0 denoting success, -1 or -2 - failure (see C)
|
||
# *$key key schedule
|
||
#
|
||
{ my ($inp,$bits,$key) = @_4args;
|
||
# Rewrite the first "%r" of the register name to "%e", so that $bits
# names the 32-bit form of its register in the code emitted below.
$bits =~ s/%r/%e/;
|
||
|
||
$code.=<<___;
|
||
.globl ${PREFIX}_set_decrypt_key
|
||
.type ${PREFIX}_set_decrypt_key,\@abi-omnipotent
|
||
.align 16
|
||
${PREFIX}_set_decrypt_key:
|
||
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
|
||
call __aesni_set_encrypt_key
|
||
shl \$4,$bits # rounds-1 after _aesni_set_encrypt_key
|
||
test %eax,%eax
|
||
jnz .Ldec_key_ret
|
||
lea 16($key,$bits),$inp # points at the end of key schedule
|
||
|
||
$movkey ($key),%xmm0 # just swap
|
||
$movkey ($inp),%xmm1
|
||
$movkey %xmm0,($inp)
|
||
$movkey %xmm1,($key)
|
||
lea 16($key),$key
|
||
lea -16($inp),$inp
|
||
|
||
.Ldec_key_inverse:
|
||
$movkey ($key),%xmm0 # swap and inverse
|
||
$movkey ($inp),%xmm1
|
||
aesimc %xmm0,%xmm0
|
||
aesimc %xmm1,%xmm1
|
||
lea 16($key),$key
|
||
lea -16($inp),$inp
|
||
$movkey %xmm0,16($inp)
|
||
$movkey %xmm1,-16($key)
|
||
cmp $key,$inp
|
||
ja .Ldec_key_inverse
|
||
|
||
$movkey ($key),%xmm0 # inverse middle
|
||
aesimc %xmm0,%xmm0
|
||
pxor %xmm1,%xmm1
|
||
$movkey %xmm0,($inp)
|
||
pxor %xmm0,%xmm0
|
||
.Ldec_key_ret:
|
||
add \$8,%rsp
|
||
ret
|
||
.LSEH_end_set_decrypt_key:
|
||
.size ${PREFIX}_set_decrypt_key,.-${PREFIX}_set_decrypt_key
|
||
___
|
||
|
||
# This is based on submission by
|
||
#
|
||
# Huang Ying <ying.huang@intel.com>
|
||
# Vinodh Gopal <vinodh.gopal@intel.com>
|
||
# Kahraman Akdemir
|
||
#
|
||
# Aggressively optimized in respect to aeskeygenassist's critical path
|
||
# and is contained in %xmm0-5 to meet Win64 ABI requirement.
|
||
#
|
||
# int ${PREFIX}_set_encrypt_key(const unsigned char *inp,
|
||
# int bits, AES_KEY * const key);
|
||
#
|
||
# input: $inp user-supplied key
|
||
# $bits $inp length in bits
|
||
# $key pointer to key schedule
|
||
# output: %eax 0 denoting success, -1 or -2 - failure (see C)
|
||
# $bits rounds-1 (used in aesni_set_decrypt_key)
|
||
# *$key key schedule
|
||
# $key pointer to key schedule (used in
|
||
# aesni_set_decrypt_key)
|
||
#
|
||
# Subroutine is frame-less, which means that only volatile registers
|
||
# are used. Note that it's declared "abi-omnipotent", which means that
|
||
# amount of volatile registers is smaller on Windows.
|
||
#
|
||
# Failure codes as emitted below: %rax is -1 when $inp or $key is NULL
# and -2 when $bits is not 128, 192 or 256.
#
# Two code paths exist per key size. The capability word read from
# OPENSSL_ia32cap_P+4 is masked with the bits the code labels "AVX and
# XOP"; when only the AVX bit remains set the .L10rounds_alt,
# .L12rounds_alt or .L14rounds_alt path is taken, which derives round
# keys with pshufb/aesenclast and the .Lkey_rotate*/.Lkey_rcon* tables.
# Otherwise aeskeygenassist and the .Lkey_expansion_* helpers are used.
# Every path leaves through .Lenc_key_ret, which zeroes %xmm0-5.
$code.=<<___;
|
||
.globl ${PREFIX}_set_encrypt_key
|
||
.type ${PREFIX}_set_encrypt_key,\@abi-omnipotent
|
||
.align 16
|
||
${PREFIX}_set_encrypt_key:
|
||
__aesni_set_encrypt_key:
|
||
.byte 0x48,0x83,0xEC,0x08 # sub rsp,8
|
||
mov \$-1,%rax
|
||
test $inp,$inp
|
||
jz .Lenc_key_ret
|
||
test $key,$key
|
||
jz .Lenc_key_ret
|
||
|
||
mov \$`1<<28|1<<11`,%r10d # AVX and XOP bits
|
||
movups ($inp),%xmm0 # pull first 128 bits of *userKey
|
||
xorps %xmm4,%xmm4 # low dword of xmm4 is assumed 0
|
||
and OPENSSL_ia32cap_P+4(%rip),%r10d
|
||
lea 16($key),%rax # %rax is used as modifiable copy of $key
|
||
cmp \$256,$bits
|
||
je .L14rounds
|
||
cmp \$192,$bits
|
||
je .L12rounds
|
||
cmp \$128,$bits
|
||
jne .Lbad_keybits
|
||
|
||
.L10rounds:
|
||
mov \$9,$bits # 10 rounds for 128-bit key
|
||
cmp \$`1<<28`,%r10d # AVX, bit no XOP
|
||
je .L10rounds_alt
|
||
|
||
$movkey %xmm0,($key) # round 0
|
||
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 1
|
||
call .Lkey_expansion_128_cold
|
||
aeskeygenassist \$0x2,%xmm0,%xmm1 # round 2
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x4,%xmm0,%xmm1 # round 3
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x8,%xmm0,%xmm1 # round 4
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x10,%xmm0,%xmm1 # round 5
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x20,%xmm0,%xmm1 # round 6
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x40,%xmm0,%xmm1 # round 7
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x80,%xmm0,%xmm1 # round 8
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x1b,%xmm0,%xmm1 # round 9
|
||
call .Lkey_expansion_128
|
||
aeskeygenassist \$0x36,%xmm0,%xmm1 # round 10
|
||
call .Lkey_expansion_128
|
||
$movkey %xmm0,(%rax)
|
||
mov $bits,80(%rax) # 240(%rdx)
|
||
xor %eax,%eax
|
||
jmp .Lenc_key_ret
|
||
|
||
.align 16
|
||
.L10rounds_alt:
|
||
movdqa .Lkey_rotate(%rip),%xmm5
|
||
mov \$8,%r10d
|
||
movdqa .Lkey_rcon1(%rip),%xmm4
|
||
movdqa %xmm0,%xmm2
|
||
movdqu %xmm0,($key)
|
||
jmp .Loop_key128
|
||
|
||
.align 16
|
||
.Loop_key128:
|
||
pshufb %xmm5,%xmm0
|
||
aesenclast %xmm4,%xmm0
|
||
pslld \$1,%xmm4
|
||
lea 16(%rax),%rax
|
||
|
||
movdqa %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm3,%xmm2
|
||
|
||
pxor %xmm2,%xmm0
|
||
movdqu %xmm0,-16(%rax)
|
||
movdqa %xmm0,%xmm2
|
||
|
||
dec %r10d
|
||
jnz .Loop_key128
|
||
|
||
movdqa .Lkey_rcon1b(%rip),%xmm4
|
||
|
||
pshufb %xmm5,%xmm0
|
||
aesenclast %xmm4,%xmm0
|
||
pslld \$1,%xmm4
|
||
|
||
movdqa %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm3,%xmm2
|
||
|
||
pxor %xmm2,%xmm0
|
||
movdqu %xmm0,(%rax)
|
||
|
||
movdqa %xmm0,%xmm2
|
||
pshufb %xmm5,%xmm0
|
||
aesenclast %xmm4,%xmm0
|
||
|
||
movdqa %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm2,%xmm3
|
||
pslldq \$4,%xmm2
|
||
pxor %xmm3,%xmm2
|
||
|
||
pxor %xmm2,%xmm0
|
||
movdqu %xmm0,16(%rax)
|
||
|
||
mov $bits,96(%rax) # 240($key)
|
||
xor %eax,%eax
|
||
jmp .Lenc_key_ret
|
||
|
||
.align 16
|
||
.L12rounds:
|
||
movq 16($inp),%xmm2 # remaining 1/3 of *userKey
|
||
mov \$11,$bits # 12 rounds for 192
|
||
cmp \$`1<<28`,%r10d # AVX, but no XOP
|
||
je .L12rounds_alt
|
||
|
||
$movkey %xmm0,($key) # round 0
|
||
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 1,2
|
||
call .Lkey_expansion_192a_cold
|
||
aeskeygenassist \$0x2,%xmm2,%xmm1 # round 2,3
|
||
call .Lkey_expansion_192b
|
||
aeskeygenassist \$0x4,%xmm2,%xmm1 # round 4,5
|
||
call .Lkey_expansion_192a
|
||
aeskeygenassist \$0x8,%xmm2,%xmm1 # round 5,6
|
||
call .Lkey_expansion_192b
|
||
aeskeygenassist \$0x10,%xmm2,%xmm1 # round 7,8
|
||
call .Lkey_expansion_192a
|
||
aeskeygenassist \$0x20,%xmm2,%xmm1 # round 8,9
|
||
call .Lkey_expansion_192b
|
||
aeskeygenassist \$0x40,%xmm2,%xmm1 # round 10,11
|
||
call .Lkey_expansion_192a
|
||
aeskeygenassist \$0x80,%xmm2,%xmm1 # round 11,12
|
||
call .Lkey_expansion_192b
|
||
$movkey %xmm0,(%rax)
|
||
mov $bits,48(%rax) # 240(%rdx)
|
||
xor %rax, %rax
|
||
jmp .Lenc_key_ret
|
||
|
||
.align 16
|
||
.L12rounds_alt:
|
||
movdqa .Lkey_rotate192(%rip),%xmm5
|
||
movdqa .Lkey_rcon1(%rip),%xmm4
|
||
mov \$8,%r10d
|
||
movdqu %xmm0,($key)
|
||
jmp .Loop_key192
|
||
|
||
.align 16
|
||
.Loop_key192:
|
||
movq %xmm2,0(%rax)
|
||
movdqa %xmm2,%xmm1
|
||
pshufb %xmm5,%xmm2
|
||
aesenclast %xmm4,%xmm2
|
||
pslld \$1, %xmm4
|
||
lea 24(%rax),%rax
|
||
|
||
movdqa %xmm0,%xmm3
|
||
pslldq \$4,%xmm0
|
||
pxor %xmm0,%xmm3
|
||
pslldq \$4,%xmm0
|
||
pxor %xmm0,%xmm3
|
||
pslldq \$4,%xmm0
|
||
pxor %xmm3,%xmm0
|
||
|
||
pshufd \$0xff,%xmm0,%xmm3
|
||
pxor %xmm1,%xmm3
|
||
pslldq \$4,%xmm1
|
||
pxor %xmm1,%xmm3
|
||
|
||
pxor %xmm2,%xmm0
|
||
pxor %xmm3,%xmm2
|
||
movdqu %xmm0,-16(%rax)
|
||
|
||
dec %r10d
|
||
jnz .Loop_key192
|
||
|
||
mov $bits,32(%rax) # 240($key)
|
||
xor %eax,%eax
|
||
jmp .Lenc_key_ret
|
||
|
||
.align 16
|
||
.L14rounds:
|
||
movups 16($inp),%xmm2 # remaning half of *userKey
|
||
mov \$13,$bits # 14 rounds for 256
|
||
lea 16(%rax),%rax
|
||
cmp \$`1<<28`,%r10d # AVX, but no XOP
|
||
je .L14rounds_alt
|
||
|
||
$movkey %xmm0,($key) # round 0
|
||
$movkey %xmm2,16($key) # round 1
|
||
aeskeygenassist \$0x1,%xmm2,%xmm1 # round 2
|
||
call .Lkey_expansion_256a_cold
|
||
aeskeygenassist \$0x1,%xmm0,%xmm1 # round 3
|
||
call .Lkey_expansion_256b
|
||
aeskeygenassist \$0x2,%xmm2,%xmm1 # round 4
|
||
call .Lkey_expansion_256a
|
||
aeskeygenassist \$0x2,%xmm0,%xmm1 # round 5
|
||
call .Lkey_expansion_256b
|
||
aeskeygenassist \$0x4,%xmm2,%xmm1 # round 6
|
||
call .Lkey_expansion_256a
|
||
aeskeygenassist \$0x4,%xmm0,%xmm1 # round 7
|
||
call .Lkey_expansion_256b
|
||
aeskeygenassist \$0x8,%xmm2,%xmm1 # round 8
|
||
call .Lkey_expansion_256a
|
||
aeskeygenassist \$0x8,%xmm0,%xmm1 # round 9
|
||
call .Lkey_expansion_256b
|
||
aeskeygenassist \$0x10,%xmm2,%xmm1 # round 10
|
||
call .Lkey_expansion_256a
|
||
aeskeygenassist \$0x10,%xmm0,%xmm1 # round 11
|
||
call .Lkey_expansion_256b
|
||
aeskeygenassist \$0x20,%xmm2,%xmm1 # round 12
|
||
call .Lkey_expansion_256a
|
||
aeskeygenassist \$0x20,%xmm0,%xmm1 # round 13
|
||
call .Lkey_expansion_256b
|
||
aeskeygenassist \$0x40,%xmm2,%xmm1 # round 14
|
||
call .Lkey_expansion_256a
|
||
$movkey %xmm0,(%rax)
|
||
mov $bits,16(%rax) # 240(%rdx)
|
||
xor %rax,%rax
|
||
jmp .Lenc_key_ret
|
||
|
||
.align 16
|
||
.L14rounds_alt:
|
||
movdqa .Lkey_rotate(%rip),%xmm5
|
||
movdqa .Lkey_rcon1(%rip),%xmm4
|
||
mov \$7,%r10d
|
||
movdqu %xmm0,0($key)
|
||
movdqa %xmm2,%xmm1
|
||
movdqu %xmm2,16($key)
|
||
jmp .Loop_key256
|
||
|
||
.align 16
|
||
.Loop_key256:
|
||
pshufb %xmm5,%xmm2
|
||
aesenclast %xmm4,%xmm2
|
||
|
||
movdqa %xmm0,%xmm3
|
||
pslldq \$4,%xmm0
|
||
pxor %xmm0,%xmm3
|
||
pslldq \$4,%xmm0
|
||
pxor %xmm0,%xmm3
|
||
pslldq \$4,%xmm0
|
||
pxor %xmm3,%xmm0
|
||
pslld \$1,%xmm4
|
||
|
||
pxor %xmm2,%xmm0
|
||
movdqu %xmm0,(%rax)
|
||
|
||
dec %r10d
|
||
jz .Ldone_key256
|
||
|
||
pshufd \$0xff,%xmm0,%xmm2
|
||
pxor %xmm3,%xmm3
|
||
aesenclast %xmm3,%xmm2
|
||
|
||
movdqa %xmm1,%xmm3
|
||
pslldq \$4,%xmm1
|
||
pxor %xmm1,%xmm3
|
||
pslldq \$4,%xmm1
|
||
pxor %xmm1,%xmm3
|
||
pslldq \$4,%xmm1
|
||
pxor %xmm3,%xmm1
|
||
|
||
pxor %xmm1,%xmm2
|
||
movdqu %xmm2,16(%rax)
|
||
lea 32(%rax),%rax
|
||
movdqa %xmm2,%xmm1
|
||
|
||
jmp .Loop_key256
|
||
|
||
.Ldone_key256:
|
||
mov $bits,16(%rax) # 240($key)
|
||
xor %eax,%eax
|
||
jmp .Lenc_key_ret
|
||
|
||
.align 16
|
||
.Lbad_keybits:
|
||
mov \$-2,%rax
|
||
.Lenc_key_ret:
|
||
pxor %xmm0,%xmm0
|
||
pxor %xmm1,%xmm1
|
||
pxor %xmm2,%xmm2
|
||
pxor %xmm3,%xmm3
|
||
pxor %xmm4,%xmm4
|
||
pxor %xmm5,%xmm5
|
||
add \$8,%rsp
|
||
ret
|
||
.LSEH_end_set_encrypt_key:
|
||
|
||
.align 16
|
||
.Lkey_expansion_128:
|
||
$movkey %xmm0,(%rax)
|
||
lea 16(%rax),%rax
|
||
.Lkey_expansion_128_cold:
|
||
shufps \$0b00010000,%xmm0,%xmm4
|
||
xorps %xmm4, %xmm0
|
||
shufps \$0b10001100,%xmm0,%xmm4
|
||
xorps %xmm4, %xmm0
|
||
shufps \$0b11111111,%xmm1,%xmm1 # critical path
|
||
xorps %xmm1,%xmm0
|
||
ret
|
||
|
||
.align 16
|
||
.Lkey_expansion_192a:
|
||
$movkey %xmm0,(%rax)
|
||
lea 16(%rax),%rax
|
||
.Lkey_expansion_192a_cold:
|
||
movaps %xmm2, %xmm5
|
||
.Lkey_expansion_192b_warm:
|
||
shufps \$0b00010000,%xmm0,%xmm4
|
||
movdqa %xmm2,%xmm3
|
||
xorps %xmm4,%xmm0
|
||
shufps \$0b10001100,%xmm0,%xmm4
|
||
pslldq \$4,%xmm3
|
||
xorps %xmm4,%xmm0
|
||
pshufd \$0b01010101,%xmm1,%xmm1 # critical path
|
||
pxor %xmm3,%xmm2
|
||
pxor %xmm1,%xmm0
|
||
pshufd \$0b11111111,%xmm0,%xmm3
|
||
pxor %xmm3,%xmm2
|
||
ret
|
||
|
||
.align 16
|
||
.Lkey_expansion_192b:
|
||
movaps %xmm0,%xmm3
|
||
shufps \$0b01000100,%xmm0,%xmm5
|
||
$movkey %xmm5,(%rax)
|
||
shufps \$0b01001110,%xmm2,%xmm3
|
||
$movkey %xmm3,16(%rax)
|
||
lea 32(%rax),%rax
|
||
jmp .Lkey_expansion_192b_warm
|
||
|
||
.align 16
|
||
.Lkey_expansion_256a:
|
||
$movkey %xmm2,(%rax)
|
||
lea 16(%rax),%rax
|
||
.Lkey_expansion_256a_cold:
|
||
shufps \$0b00010000,%xmm0,%xmm4
|
||
xorps %xmm4,%xmm0
|
||
shufps \$0b10001100,%xmm0,%xmm4
|
||
xorps %xmm4,%xmm0
|
||
shufps \$0b11111111,%xmm1,%xmm1 # critical path
|
||
xorps %xmm1,%xmm0
|
||
ret
|
||
|
||
.align 16
|
||
.Lkey_expansion_256b:
|
||
$movkey %xmm0,(%rax)
|
||
lea 16(%rax),%rax
|
||
|
||
shufps \$0b00010000,%xmm2,%xmm4
|
||
xorps %xmm4,%xmm2
|
||
shufps \$0b10001100,%xmm2,%xmm4
|
||
xorps %xmm4,%xmm2
|
||
shufps \$0b10101010,%xmm1,%xmm1 # critical path
|
||
xorps %xmm1,%xmm2
|
||
ret
|
||
.size ${PREFIX}_set_encrypt_key,.-${PREFIX}_set_encrypt_key
|
||
.size __aesni_set_encrypt_key,.-__aesni_set_encrypt_key
|
||
___
|
||
}
|
||
|
||
# Constant tables, 64-byte aligned, followed by the module's
# identification string. .Lkey_rotate, .Lkey_rotate192, .Lkey_rcon1 and
# .Lkey_rcon1b are loaded by the pshufb/aesenclast key-schedule paths of
# ${PREFIX}_set_encrypt_key; the remaining tables are not referenced in
# this part of the file (presumably used by the cipher-mode routines
# emitted earlier).
$code.=<<___;
|
||
.align 64
|
||
.Lbswap_mask:
|
||
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
|
||
.Lincrement32:
|
||
.long 6,6,6,0
|
||
.Lincrement64:
|
||
.long 1,0,0,0
|
||
.Lxts_magic:
|
||
.long 0x87,0,1,0
|
||
.Lincrement1:
|
||
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
|
||
.Lkey_rotate:
|
||
.long 0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d
|
||
.Lkey_rotate192:
|
||
.long 0x04070605,0x04070605,0x04070605,0x04070605
|
||
.Lkey_rcon1:
|
||
.long 1,1,1,1
|
||
.Lkey_rcon1b:
|
||
.long 0x1b,0x1b,0x1b,0x1b
|
||
|
||
.asciz "AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>"
|
||
.align 64
|
||
___
|
||
|
||
# Win64 only: emit the structured-exception handlers together with the
# .pdata/.xdata records that bind them to the routines generated above.
# $rec, $frame, $context and $disp name the registers that carry the
# four handler arguments listed in the prototype below.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
|
||
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
|
||
if ($win64) {
|
||
$rec="%rcx";
|
||
$frame="%rdx";
|
||
$context="%r8";
|
||
$disp="%r9";
|
||
|
||
$code.=<<___;
|
||
.extern __imp_RtlVirtualUnwind
|
||
___
|
||
# Handlers emitted only when $PREFIX is "aesni". Both read their
# prologue and epilogue label offsets from HandlerData[0] and [1].
# ecb_ccm64_se_handler copies 4 saved %xmm registers found at
# context->Rsp into the CONTEXT starting at Xmm6 and then steps the
# stack pointer by 0x58; ctr_xts_se_handler copies 10 saved registers
# found 0xa0 bytes below context->Rbp and continues at .Lcommon_rbp_tail.
$code.=<<___ if ($PREFIX eq "aesni");
|
||
.type ecb_ccm64_se_handler,\@abi-omnipotent
|
||
.align 16
|
||
ecb_ccm64_se_handler:
|
||
push %rsi
|
||
push %rdi
|
||
push %rbx
|
||
push %rbp
|
||
push %r12
|
||
push %r13
|
||
push %r14
|
||
push %r15
|
||
pushfq
|
||
sub \$64,%rsp
|
||
|
||
mov 120($context),%rax # pull context->Rax
|
||
mov 248($context),%rbx # pull context->Rip
|
||
|
||
mov 8($disp),%rsi # disp->ImageBase
|
||
mov 56($disp),%r11 # disp->HandlerData
|
||
|
||
mov 0(%r11),%r10d # HandlerData[0]
|
||
lea (%rsi,%r10),%r10 # prologue label
|
||
cmp %r10,%rbx # context->Rip<prologue label
|
||
jb .Lcommon_seh_tail
|
||
|
||
mov 152($context),%rax # pull context->Rsp
|
||
|
||
mov 4(%r11),%r10d # HandlerData[1]
|
||
lea (%rsi,%r10),%r10 # epilogue label
|
||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||
jae .Lcommon_seh_tail
|
||
|
||
lea 0(%rax),%rsi # %xmm save area
|
||
lea 512($context),%rdi # &context.Xmm6
|
||
mov \$8,%ecx # 4*sizeof(%xmm0)/sizeof(%rax)
|
||
.long 0xa548f3fc # cld; rep movsq
|
||
lea 0x58(%rax),%rax # adjust stack pointer
|
||
|
||
jmp .Lcommon_seh_tail
|
||
.size ecb_ccm64_se_handler,.-ecb_ccm64_se_handler
|
||
|
||
.type ctr_xts_se_handler,\@abi-omnipotent
|
||
.align 16
|
||
ctr_xts_se_handler:
|
||
push %rsi
|
||
push %rdi
|
||
push %rbx
|
||
push %rbp
|
||
push %r12
|
||
push %r13
|
||
push %r14
|
||
push %r15
|
||
pushfq
|
||
sub \$64,%rsp
|
||
|
||
mov 120($context),%rax # pull context->Rax
|
||
mov 248($context),%rbx # pull context->Rip
|
||
|
||
mov 8($disp),%rsi # disp->ImageBase
|
||
mov 56($disp),%r11 # disp->HandlerData
|
||
|
||
mov 0(%r11),%r10d # HandlerData[0]
|
||
lea (%rsi,%r10),%r10 # prologue lable
|
||
cmp %r10,%rbx # context->Rip<prologue label
|
||
jb .Lcommon_seh_tail
|
||
|
||
mov 152($context),%rax # pull context->Rsp
|
||
|
||
mov 4(%r11),%r10d # HandlerData[1]
|
||
lea (%rsi,%r10),%r10 # epilogue label
|
||
cmp %r10,%rbx # context->Rip>=epilogue label
|
||
jae .Lcommon_seh_tail
|
||
|
||
mov 160($context),%rax # pull context->Rbp
|
||
lea -0xa0(%rax),%rsi # %xmm save area
|
||
lea 512($context),%rdi # & context.Xmm6
|
||
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
||
.long 0xa548f3fc # cld; rep movsq
|
||
|
||
jmp .Lcommon_rbp_tail
|
||
.size ctr_xts_se_handler,.-ctr_xts_se_handler
|
||
___
|
||
# cbc_se_handler is emitted for every $PREFIX. Instead of HandlerData it
# compares context->Rip against the labels .Lcbc_decrypt_bulk,
# .Lcbc_decrypt_body and .Lcbc_ret directly. This heredoc also holds the
# shared tails the handlers above jump to: .Lcommon_rbp_tail reloads the
# saved %rbp, and .Lcommon_seh_tail writes Rsp/Rsi/Rdi back into the
# CONTEXT, copies it to disp->ContextRecord, calls RtlVirtualUnwind and
# returns 1 (ExceptionContinueSearch). The .pdata section is opened at
# the end of the heredoc.
$code.=<<___;
|
||
.type cbc_se_handler,\@abi-omnipotent
|
||
.align 16
|
||
cbc_se_handler:
|
||
push %rsi
|
||
push %rdi
|
||
push %rbx
|
||
push %rbp
|
||
push %r12
|
||
push %r13
|
||
push %r14
|
||
push %r15
|
||
pushfq
|
||
sub \$64,%rsp
|
||
|
||
mov 152($context),%rax # pull context->Rsp
|
||
mov 248($context),%rbx # pull context->Rip
|
||
|
||
lea .Lcbc_decrypt_bulk(%rip),%r10
|
||
cmp %r10,%rbx # context->Rip<"prologue" label
|
||
jb .Lcommon_seh_tail
|
||
|
||
lea .Lcbc_decrypt_body(%rip),%r10
|
||
cmp %r10,%rbx # context->Rip<cbc_decrypt_body
|
||
jb .Lrestore_cbc_rax
|
||
|
||
lea .Lcbc_ret(%rip),%r10
|
||
cmp %r10,%rbx # context->Rip>="epilogue" label
|
||
jae .Lcommon_seh_tail
|
||
|
||
lea 16(%rax),%rsi # %xmm save area
|
||
lea 512($context),%rdi # &context.Xmm6
|
||
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
||
.long 0xa548f3fc # cld; rep movsq
|
||
|
||
.Lcommon_rbp_tail:
|
||
mov 160($context),%rax # pull context->Rbp
|
||
mov (%rax),%rbp # restore saved %rbp
|
||
lea 8(%rax),%rax # adjust stack pointer
|
||
mov %rbp,160($context) # restore context->Rbp
|
||
jmp .Lcommon_seh_tail
|
||
|
||
.Lrestore_cbc_rax:
|
||
mov 120($context),%rax
|
||
|
||
.Lcommon_seh_tail:
|
||
mov 8(%rax),%rdi
|
||
mov 16(%rax),%rsi
|
||
mov %rax,152($context) # restore context->Rsp
|
||
mov %rsi,168($context) # restore context->Rsi
|
||
mov %rdi,176($context) # restore context->Rdi
|
||
|
||
mov 40($disp),%rdi # disp->ContextRecord
|
||
mov $context,%rsi # context
|
||
mov \$154,%ecx # sizeof(CONTEXT)
|
||
.long 0xa548f3fc # cld; rep movsq
|
||
|
||
mov $disp,%rsi
|
||
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
||
mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
||
mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
||
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
||
mov 40(%rsi),%r10 # disp->ContextRecord
|
||
lea 56(%rsi),%r11 # &disp->HandlerData
|
||
lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
||
mov %r10,32(%rsp) # arg5
|
||
mov %r11,40(%rsp) # arg6
|
||
mov %r12,48(%rsp) # arg7
|
||
mov %rcx,56(%rsp) # arg8, (NULL)
|
||
call *__imp_RtlVirtualUnwind(%rip)
|
||
|
||
mov \$1,%eax # ExceptionContinueSearch
|
||
add \$64,%rsp
|
||
popfq
|
||
pop %r15
|
||
pop %r14
|
||
pop %r13
|
||
pop %r12
|
||
pop %rbp
|
||
pop %rbx
|
||
pop %rdi
|
||
pop %rsi
|
||
ret
|
||
.size cbc_se_handler,.-cbc_se_handler
|
||
|
||
.section .pdata
|
||
.align 4
|
||
___
|
||
# .pdata triples (begin, end, unwind info) for the routines that exist
# only when $PREFIX is "aesni".
$code.=<<___ if ($PREFIX eq "aesni");
|
||
.rva .LSEH_begin_aesni_ecb_encrypt
|
||
.rva .LSEH_end_aesni_ecb_encrypt
|
||
.rva .LSEH_info_ecb
|
||
|
||
.rva .LSEH_begin_aesni_ccm64_encrypt_blocks
|
||
.rva .LSEH_end_aesni_ccm64_encrypt_blocks
|
||
.rva .LSEH_info_ccm64_enc
|
||
|
||
.rva .LSEH_begin_aesni_ccm64_decrypt_blocks
|
||
.rva .LSEH_end_aesni_ccm64_decrypt_blocks
|
||
.rva .LSEH_info_ccm64_dec
|
||
|
||
.rva .LSEH_begin_aesni_ctr32_encrypt_blocks
|
||
.rva .LSEH_end_aesni_ctr32_encrypt_blocks
|
||
.rva .LSEH_info_ctr32
|
||
|
||
.rva .LSEH_begin_aesni_xts_encrypt
|
||
.rva .LSEH_end_aesni_xts_encrypt
|
||
.rva .LSEH_info_xts_enc
|
||
|
||
.rva .LSEH_begin_aesni_xts_decrypt
|
||
.rva .LSEH_end_aesni_xts_decrypt
|
||
.rva .LSEH_info_xts_dec
|
||
___
|
||
# .pdata triples for the CBC routine and the two key-setup routines
# (both key-setup entries point at .LSEH_info_key), after which the
# .xdata section is opened.
$code.=<<___;
|
||
.rva .LSEH_begin_${PREFIX}_cbc_encrypt
|
||
.rva .LSEH_end_${PREFIX}_cbc_encrypt
|
||
.rva .LSEH_info_cbc
|
||
|
||
.rva ${PREFIX}_set_decrypt_key
|
||
.rva .LSEH_end_set_decrypt_key
|
||
.rva .LSEH_info_key
|
||
|
||
.rva ${PREFIX}_set_encrypt_key
|
||
.rva .LSEH_end_set_encrypt_key
|
||
.rva .LSEH_info_key
|
||
.section .xdata
|
||
.align 8
|
||
___
|
||
# .xdata records for the "aesni"-only routines: each names its handler
# and supplies the two labels the handler reads as HandlerData[].
$code.=<<___ if ($PREFIX eq "aesni");
|
||
.LSEH_info_ecb:
|
||
.byte 9,0,0,0
|
||
.rva ecb_ccm64_se_handler
|
||
.rva .Lecb_enc_body,.Lecb_enc_ret # HandlerData[]
|
||
.LSEH_info_ccm64_enc:
|
||
.byte 9,0,0,0
|
||
.rva ecb_ccm64_se_handler
|
||
.rva .Lccm64_enc_body,.Lccm64_enc_ret # HandlerData[]
|
||
.LSEH_info_ccm64_dec:
|
||
.byte 9,0,0,0
|
||
.rva ecb_ccm64_se_handler
|
||
.rva .Lccm64_dec_body,.Lccm64_dec_ret # HandlerData[]
|
||
.LSEH_info_ctr32:
|
||
.byte 9,0,0,0
|
||
.rva ctr_xts_se_handler
|
||
.rva .Lctr32_body,.Lctr32_epilogue # HandlerData[]
|
||
.LSEH_info_xts_enc:
|
||
.byte 9,0,0,0
|
||
.rva ctr_xts_se_handler
|
||
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
|
||
.LSEH_info_xts_dec:
|
||
.byte 9,0,0,0
|
||
.rva ctr_xts_se_handler
|
||
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
|
||
___
|
||
# .xdata records emitted for every $PREFIX: the CBC record names
# cbc_se_handler and carries no HandlerData; .LSEH_info_key names no
# handler and describes the "sub rsp,8" prologue of the key-setup
# routines.
$code.=<<___;
|
||
.LSEH_info_cbc:
|
||
.byte 9,0,0,0
|
||
.rva cbc_se_handler
|
||
.LSEH_info_key:
|
||
.byte 0x01,0x04,0x01,0x00
|
||
.byte 0x04,0x02,0x00,0x00 # sub rsp,8
|
||
___
|
||
}
|
||
|
||
# rex(\@bytes, $dst, $src)
#
# Appends a REX prefix byte to the opcode array referenced by the first
# argument when at least one of the two register numbers is 8 or above.
# Nothing is appended when both registers are below 8.
sub rex {
    my ($bytes, $dst, $src) = @_;
    my $prefix = 0;

    $prefix |= 0x04 if ($dst >= 8);	# REX.R, high bit of ModR/M.reg
    $prefix |= 0x01 if ($src >= 8);	# REX.B, high bit of ModR/M.rm
    push @$bytes, $prefix | 0x40 if ($prefix);
}
|
||
|
||
# aesni($line)
#
# Encodes one AES-NI instruction as a ".byte" directive. Three operand
# forms are recognised:
#
#   aeskeygenassist $imm,%xmmS,%xmmD
#   aes{imc,enc,enclast,dec,declast} %xmmS,%xmmD
#   aes{enc,enclast,dec,declast} disp(%rsp),%xmmD
#
# Returns the ".byte" text for a recognised instruction, undef when the
# operands match one of the last two forms but the mnemonic is not in
# that form's table, and the unmodified line when nothing matches.
sub aesni {
    my ($insn) = @_;
    my @bytes = (0x66);		# every encoding starts with the 0x66 prefix

    # Immediate + register/register form.
    if (my ($imm, $from, $to) =
	$insn =~ /aeskeygenassist\s+\$([x0-9a-f]+),\s*%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	rex(\@bytes, $to, $from);
	push @bytes, 0x0f, 0x3a, 0xdf;
	push @bytes, 0xc0 | ($from & 7) | (($to & 7) << 3);	# ModR/M
	# A leading 0 marks hex/octal notation; anything else is passed on verbatim.
	push @bytes, $imm =~ /^0/ ? oct($imm) : $imm;
	return ".byte\t" . join(',', @bytes);
    }

    # Register/register form.
    if (my ($name, $from, $to) =
	$insn =~ /(aes[a-z]+)\s+%xmm([0-9]+),\s*%xmm([0-9]+)/) {
	my %last_byte_of = (
	    "aesimc"     => 0xdb,
	    "aesenc"     => 0xdc,
	    "aesenclast" => 0xdd,
	    "aesdec"     => 0xde,
	    "aesdeclast" => 0xdf,
	);
	my $op = $last_byte_of{$name};
	return undef unless defined($op);

	rex(\@bytes, $to, $from);
	push @bytes, 0x0f, 0x38, $op;
	push @bytes, 0xc0 | ($from & 7) | (($to & 7) << 3);	# ModR/M
	return ".byte\t" . join(',', @bytes);
    }

    # %rsp-relative memory source with an 8-bit displacement.
    if (my ($name, $disp, $to) =
	$insn =~ /(aes[a-z]+)\s+([0x1-9a-fA-F]*)\(%rsp\),\s*%xmm([0-9]+)/) {
	my %last_byte_of = (
	    "aesenc"     => 0xdc,
	    "aesenclast" => 0xdd,
	    "aesdec"     => 0xde,
	    "aesdeclast" => 0xdf,
	);
	my $op = $last_byte_of{$name};
	return undef unless defined($op);

	push @bytes, 0x44 if ($to >= 8);			# REX.R
	push @bytes, 0x0f, 0x38, $op;
	push @bytes, 0x44 | (($to & 7) << 3), 0x24;		# ModR/M, SIB
	push @bytes, ($disp =~ /^0/ ? oct($disp) : $disp) & 0xff;
	return ".byte\t" . join(',', @bytes);
    }

    return $insn;
}
|
||
|
||
# movbe($disp)
#
# Returns a ".byte" directive made of a fixed five-byte opcode sequence
# followed by the argument, which is appended as the final byte value.
# The caller below passes the displacement taken from
# "movbe %eax,N(%rsp)".
sub movbe {
    my ($disp) = @_;
    return ".byte 0x0f,0x38,0xf1,0x44,0x24," . $disp;
}
|
||
|
||
# Post-process the accumulated assembly text and write it out:
#   1. every `...` span is replaced by the result of evaluating its
#      contents as a Perl expression (constant folding);
#   2. every line fragment from an "aes..." mnemonic up to its last %xmm
#      operand is handed to aesni(); the rest of that line, including any
#      trailing comment, is dropped;
#   3. "movbe %eax,N(%rsp)" is replaced by the encoding built by movbe().
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
$code =~ s/\b(aes.*%xmm[0-9]+).*$/aesni($1)/gem;
#$code =~ s/\bmovbe\s+%eax/bswap %eax; mov %eax/gm;	# debugging artefact
$code =~ s/\bmovbe\s+%eax,\s*([0-9]+)\(%rsp\)/movbe($1)/gem;

print $code;

# STDOUT is buffered, so a failed write (full disk, closed pipe) may only
# be reported by close. Die instead of leaving truncated output behind
# with a zero exit status.
close STDOUT or die "error closing STDOUT: $!";
|