
#!/usr/bin/env perl
###################################################################
### AES-128 [originally in CTR mode]                            ###
### bitsliced implementation for Intel Core 2 processors        ###
### requires support of SSE extensions up to SSSE3              ###
### Author: Emilia Käsper and Peter Schwabe                     ###
### Date: 2009-03-19                                            ###
### Public domain                                               ###
###                                                             ###
### See http://homes.esat.kuleuven.be/~ekasper/#software for    ###
### further information.                                        ###
###################################################################
#
# September 2011.
#
# Started as a transliteration to "perlasm", the original code has
# undergone the following changes:
#
# - code was made position-independent;
# - rounds were folded into a loop resulting in >5x size reduction
#   from 12.5KB to 2.2KB;
# - the above was possible thanks to a mixcolumns() modification that
#   allows feeding its output back to aesenc[last]; this was achieved
#   at the cost of two additional inter-register moves;
# - some instruction reordering and interleaving;
# - this module doesn't implement a key setup subroutine; instead it
#   relies on conversion of the "conventional" key schedule as
#   returned by AES_set_encrypt_key (see discussion below);
# - first and last round keys are treated differently, which allows
#   skipping one shiftrows(), reducing the bit-sliced key schedule and
#   speeding up conversion by 22%;
# - support for 192- and 256-bit keys was added;
#
# Resulting performance in CPU cycles spent to encrypt one byte out
# of 4096-byte buffer with 128-bit key is:
#
#               Emilia's   this(*)   difference
#
# Core 2        9.30       8.69      +7%
# Nehalem(**)   7.63       6.88      +11%
# Atom          17.1       16.4      +4%
# Silvermont    -          12.9
#
# (*)  Comparison is not completely fair, because "this" is ECB,
#      i.e. no extra processing such as counter value calculation
#      and xor-ing of input, as in Emilia's CTR implementation, is
#      performed. However, the CTR calculations account for not more
#      than 1% of total time, so the comparison is *rather* fair.
#
# (**) Results were collected on Westmere, which is considered to
#      be equivalent to Nehalem for this code.
#
# As for the key schedule conversion subroutine: the interface to
# OpenSSL relies on per-invocation on-the-fly conversion. This
# naturally has an impact on performance, especially on short inputs.
# Conversion time in CPU cycles, and its ratio to the CPU cycles
# spent in the 8x block function, is:
#
#               conversion  conversion/8x block
# Core 2        240         0.22
# Nehalem       180         0.20
# Atom          430         0.20
#
# The ratio values mean that 128-byte blocks will be processed
# 16-18% slower, 256-byte blocks - 9-10%, 384-byte blocks - 6-7%,
# etc. Then keep in mind that input sizes not divisible by 128 are
# *effectively* slower, especially shortest ones, e.g. consecutive
# 144-byte blocks are processed 44% slower than one would expect,
# 272 - 29%, 400 - 22%, etc. Yet, despite all these "shortcomings"
# it's still faster than ["hyper-threading-safe" code path in]
# aes-x86_64.pl on all lengths above 64 bytes...
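#
# (A quick sanity check on the ratios above; illustrative arithmetic,
# not a measurement: an N*128-byte input costs one conversion plus N
# 8x-block calls, so the relative slowdown is ratio/(N+ratio). On
# Core 2 with ratio 0.22 that gives 0.22/1.22 = 18% for N=1,
# 0.22/2.22 = 10% for N=2 and 0.22/3.22 = 7% for N=3, matching the
# 16-18%/9-10%/6-7% figures quoted.)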
#
# October 2011.
#
# Add decryption procedure. Performance in CPU cycles spent to decrypt
# one byte out of 4096-byte buffer with 128-bit key is:
#
# Core 2        9.98
# Nehalem       7.80
# Atom          17.9
# Silvermont    14.0
#
# November 2011.
#
# Add bsaes_xts_[en|de]crypt. Less-than-80-bytes-block performance is
# suboptimal, but XTS is meant to be used with larger blocks...
#
# <appro@openssl.org>
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
my ($inp,$out,$len,$key,$ivp)=("%rdi","%rsi","%rdx","%rcx");
my @XMM=map("%xmm$_",(15,0..14)); # best on Atom, +10% over (0..15)
my $ecb=0; # suppress unreferenced ECB subroutines, spare some space...
{
my ($key,$rounds,$const)=("%rax","%r10d","%r11");
sub Sbox {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b4, b6, b3, b7, b2, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
&InBasisChange (@b);
&Inv_GF256 (@b[6,5,0,3,7,1,4,2],@t,@s);
&OutBasisChange (@b[7,1,4,2,6,5,0,3]);
}
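# (The structure above, in brief: a linear change of basis into a
# tower-field representation of GF(2^8), inversion there via
# Inv_GF256, and a change of basis back, following the compact
# hardware-style S-box circuit of the paper referenced in the header;
# a rough summary, see the authors' write-up for the exact algebra.)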
sub InBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b5, b0, b3, b7, b1, b4, b2] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[5]
pxor @b[1], @b[2]
pxor @b[0], @b[3]
pxor @b[2], @b[6]
pxor @b[0], @b[5]
pxor @b[3], @b[6]
pxor @b[7], @b[3]
pxor @b[5], @b[7]
pxor @b[4], @b[3]
pxor @b[5], @b[4]
pxor @b[1], @b[3]
pxor @b[7], @b[2]
pxor @b[5], @b[1]
___
}
sub OutBasisChange {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b6, b1, b2, b4, b7, b0, b3, b5] < msb
my @b=@_[0..7];
$code.=<<___;
pxor @b[6], @b[0]
pxor @b[4], @b[1]
pxor @b[0], @b[2]
pxor @b[6], @b[4]
pxor @b[1], @b[6]
pxor @b[5], @b[1]
pxor @b[3], @b[5]
pxor @b[7], @b[3]
pxor @b[5], @b[7]
pxor @b[5], @b[2]
pxor @b[7], @b[4]
___
}
sub InvSbox {
# input in lsb > [b0, b1, b2, b3, b4, b5, b6, b7] < msb
# output in lsb > [b0, b1, b6, b4, b2, b7, b3, b5] < msb
my @b=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
&InvInBasisChange (@b);
&Inv_GF256 (@b[5,1,2,6,3,7,0,4],@t,@s);
&InvOutBasisChange (@b[3,7,0,4,5,1,2,6]);
}
sub InvInBasisChange { # OutBasisChange in reverse
my @b=@_[5,1,2,6,3,7,0,4];
$code.=<<___
pxor @b[7], @b[4]
pxor @b[5], @b[7]
pxor @b[5], @b[2]
pxor @b[7], @b[3]
pxor @b[3], @b[5]
pxor @b[5], @b[1]
pxor @b[1], @b[6]
pxor @b[0], @b[2]
pxor @b[6], @b[4]
pxor @b[6], @b[0]
pxor @b[4], @b[1]
___
}
sub InvOutBasisChange { # InBasisChange in reverse
my @b=@_[2,5,7,3,6,1,0,4];
$code.=<<___;
pxor @b[5], @b[1]
pxor @b[7], @b[2]
pxor @b[1], @b[3]
pxor @b[5], @b[4]
pxor @b[5], @b[7]
pxor @b[4], @b[3]
pxor @b[0], @b[5]
pxor @b[7], @b[3]
pxor @b[2], @b[6]
pxor @b[1], @b[2]
pxor @b[3], @b[6]
pxor @b[0], @b[3]
pxor @b[6], @b[5]
___
}
sub Mul_GF4 {
#;*************************************************************
#;* Mul_GF4: Input x0-x1,y0-y1 Output x0-x1 Temp t0 (8) *
#;*************************************************************
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
movdqa $y0, $t0
pxor $y1, $t0
pand $x0, $t0
pxor $x1, $x0
pand $y0, $x1
pand $y1, $x0
pxor $x1, $x0
pxor $t0, $x1
___
}
sub Mul_GF4_N { # not used, see next subroutine
# multiply and scale by N
my ($x0,$x1,$y0,$y1,$t0)=@_;
$code.=<<___;
movdqa $y0, $t0
pxor $y1, $t0
pand $x0, $t0
pxor $x1, $x0
pand $y0, $x1
pand $y1, $x0
pxor $x0, $x1
pxor $t0, $x0
___
}
sub Mul_GF4_N_GF4 {
# interleaved Mul_GF4_N and Mul_GF4
my ($x0,$x1,$y0,$y1,$t0,
$x2,$x3,$y2,$y3,$t1)=@_;
$code.=<<___;
movdqa $y0, $t0
movdqa $y2, $t1
pxor $y1, $t0
pxor $y3, $t1
pand $x0, $t0
pand $x2, $t1
pxor $x1, $x0
pxor $x3, $x2
pand $y0, $x1
pand $y2, $x3
pand $y1, $x0
pand $y3, $x2
pxor $x0, $x1
pxor $x3, $x2
pxor $t0, $x0
pxor $t1, $x3
___
}
sub Mul_GF16_2 {
my @x=@_[0..7];
my @y=@_[8..11];
my @t=@_[12..15];
$code.=<<___;
movdqa @x[0], @t[0]
movdqa @x[1], @t[1]
___
&Mul_GF4 (@x[0], @x[1], @y[0], @y[1], @t[2]);
$code.=<<___;
pxor @x[2], @t[0]
pxor @x[3], @t[1]
pxor @y[2], @y[0]
pxor @y[3], @y[1]
___
&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
@x[2], @x[3], @y[2], @y[3], @t[2]);
$code.=<<___;
pxor @t[0], @x[0]
pxor @t[0], @x[2]
pxor @t[1], @x[1]
pxor @t[1], @x[3]
movdqa @x[4], @t[0]
movdqa @x[5], @t[1]
pxor @x[6], @t[0]
pxor @x[7], @t[1]
___
&Mul_GF4_N_GF4 (@t[0], @t[1], @y[0], @y[1], @t[3],
@x[6], @x[7], @y[2], @y[3], @t[2]);
$code.=<<___;
pxor @y[2], @y[0]
pxor @y[3], @y[1]
___
&Mul_GF4 (@x[4], @x[5], @y[0], @y[1], @t[3]);
$code.=<<___;
pxor @t[0], @x[4]
pxor @t[0], @x[6]
pxor @t[1], @x[5]
pxor @t[1], @x[7]
___
}
sub Inv_GF256 {
#;********************************************************************
#;* Inv_GF256: Input x0-x7 Output x0-x7 Temp t0-t3,s0-s3 (144) *
#;********************************************************************
my @x=@_[0..7];
my @t=@_[8..11];
my @s=@_[12..15];
# direct optimizations from hardware
$code.=<<___;
movdqa @x[4], @t[3]
movdqa @x[5], @t[2]
movdqa @x[1], @t[1]
movdqa @x[7], @s[1]
movdqa @x[0], @s[0]
pxor @x[6], @t[3]
pxor @x[7], @t[2]
pxor @x[3], @t[1]
movdqa @t[3], @s[2]
pxor @x[6], @s[1]
movdqa @t[2], @t[0]
pxor @x[2], @s[0]
movdqa @t[3], @s[3]
por @t[1], @t[2]
por @s[0], @t[3]
pxor @t[0], @s[3]
pand @s[0], @s[2]
pxor @t[1], @s[0]
pand @t[1], @t[0]
pand @s[0], @s[3]
movdqa @x[3], @s[0]
pxor @x[2], @s[0]
pand @s[0], @s[1]
pxor @s[1], @t[3]
pxor @s[1], @t[2]
movdqa @x[4], @s[1]
movdqa @x[1], @s[0]
pxor @x[5], @s[1]
pxor @x[0], @s[0]
movdqa @s[1], @t[1]
pand @s[0], @s[1]
por @s[0], @t[1]
pxor @s[1], @t[0]
pxor @s[3], @t[3]
pxor @s[2], @t[2]
pxor @s[3], @t[1]
movdqa @x[7], @s[0]
pxor @s[2], @t[0]
movdqa @x[6], @s[1]
pxor @s[2], @t[1]
movdqa @x[5], @s[2]
pand @x[3], @s[0]
movdqa @x[4], @s[3]
pand @x[2], @s[1]
pand @x[1], @s[2]
por @x[0], @s[3]
pxor @s[0], @t[3]
pxor @s[1], @t[2]
pxor @s[2], @t[1]
pxor @s[3], @t[0]
#Inv_GF16 \t0, \t1, \t2, \t3, \s0, \s1, \s2, \s3
# new smaller inversion
movdqa @t[3], @s[0]
pand @t[1], @t[3]
pxor @t[2], @s[0]
movdqa @t[0], @s[2]
movdqa @s[0], @s[3]
pxor @t[3], @s[2]
pand @s[2], @s[3]
movdqa @t[1], @s[1]
pxor @t[2], @s[3]
pxor @t[0], @s[1]
pxor @t[2], @t[3]
pand @t[3], @s[1]
movdqa @s[2], @t[2]
pxor @t[0], @s[1]
pxor @s[1], @t[2]
pxor @s[1], @t[1]
pand @t[0], @t[2]
pxor @t[2], @s[2]
pxor @t[2], @t[1]
pand @s[3], @s[2]
pxor @s[0], @s[2]
___
# output in s3, s2, s1, t1
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \t2, \t3, \t0, \t1, \s0, \s1, \s2, \s3
# Mul_GF16_2 \x0, \x1, \x2, \x3, \x4, \x5, \x6, \x7, \s3, \s2, \s1, \t1, \s0, \t0, \t2, \t3
&Mul_GF16_2(@x,@s[3,2,1],@t[1],@s[0],@t[0,2,3]);
### output msb > [x3,x2,x1,x0,x7,x6,x5,x4] < lsb
}
# AES linear components
sub ShiftRows {
my @x=@_[0..7];
my $mask=pop;
$code.=<<___;
pxor 0x00($key),@x[0]
pxor 0x10($key),@x[1]
pxor 0x20($key),@x[2]
pxor 0x30($key),@x[3]
pshufb $mask,@x[0]
pshufb $mask,@x[1]
pxor 0x40($key),@x[4]
pxor 0x50($key),@x[5]
pshufb $mask,@x[2]
pshufb $mask,@x[3]
pxor 0x60($key),@x[6]
pxor 0x70($key),@x[7]
pshufb $mask,@x[4]
pshufb $mask,@x[5]
pshufb $mask,@x[6]
pshufb $mask,@x[7]
lea 0x80($key),$key
___
}
sub MixColumns {
# modified to emit output in order suitable for feeding back to aesenc[last]
my @x=@_[0..7];
my @t=@_[8..15];
my $inv=@_[16]; # optional
$code.=<<___;
pshufd \$0x93, @x[0], @t[0] # x0 <<< 32
pshufd \$0x93, @x[1], @t[1]
pxor @t[0], @x[0] # x0 ^ (x0 <<< 32)
pshufd \$0x93, @x[2], @t[2]
pxor @t[1], @x[1]
pshufd \$0x93, @x[3], @t[3]
pxor @t[2], @x[2]
pshufd \$0x93, @x[4], @t[4]
pxor @t[3], @x[3]
pshufd \$0x93, @x[5], @t[5]
pxor @t[4], @x[4]
pshufd \$0x93, @x[6], @t[6]
pxor @t[5], @x[5]
pshufd \$0x93, @x[7], @t[7]
pxor @t[6], @x[6]
pxor @t[7], @x[7]
pxor @x[0], @t[1]
pxor @x[7], @t[0]
pxor @x[7], @t[1]
pshufd \$0x4E, @x[0], @x[0] # (x0 ^ (x0 <<< 32)) <<< 64)
pxor @x[1], @t[2]
pshufd \$0x4E, @x[1], @x[1]
pxor @x[4], @t[5]
pxor @t[0], @x[0]
pxor @x[5], @t[6]
pxor @t[1], @x[1]
pxor @x[3], @t[4]
pshufd \$0x4E, @x[4], @t[0]
pxor @x[6], @t[7]
pshufd \$0x4E, @x[5], @t[1]
pxor @x[2], @t[3]
pshufd \$0x4E, @x[3], @x[4]
pxor @x[7], @t[3]
pshufd \$0x4E, @x[7], @x[5]
pxor @x[7], @t[4]
pshufd \$0x4E, @x[6], @x[3]
pxor @t[4], @t[0]
pshufd \$0x4E, @x[2], @x[6]
pxor @t[5], @t[1]
___
$code.=<<___ if (!$inv);
pxor @t[3], @x[4]
pxor @t[7], @x[5]
pxor @t[6], @x[3]
movdqa @t[0], @x[2]
pxor @t[2], @x[6]
movdqa @t[1], @x[7]
___
$code.=<<___ if ($inv);
pxor @x[4], @t[3]
pxor @t[7], @x[5]
pxor @x[3], @t[6]
movdqa @t[0], @x[3]
pxor @t[2], @x[6]
movdqa @t[6], @x[2]
movdqa @t[1], @x[7]
movdqa @x[6], @x[4]
movdqa @t[3], @x[6]
___
}
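# (Why dword rotations suffice, in rough terms: in the bit-sliced
# domain xtime() is free, since multiplying a state byte by x just
# renames slices -- slice i feeds slice i+1, and slice 7 folds into
# slices 0, 1, 3 and 4 per the 0x1b reduction polynomial. What remains
# of MixColumns is a byte permutation within each column group, which
# is exactly what the pshufd 0x93 ("<<< 32") and pshufd 0x4E
# ("<<< 64") shuffles annotated above provide.)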
sub InvMixColumns_orig {
my @x=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
# multiplication by 0x0e
pshufd \$0x93, @x[7], @t[7]
movdqa @x[2], @t[2]
pxor @x[5], @x[7] # 7 5
pxor @x[5], @x[2] # 2 5
pshufd \$0x93, @x[0], @t[0]
movdqa @x[5], @t[5]
pxor @x[0], @x[5] # 5 0 [1]
pxor @x[1], @x[0] # 0 1
pshufd \$0x93, @x[1], @t[1]
pxor @x[2], @x[1] # 1 25
pxor @x[6], @x[0] # 01 6 [2]
pxor @x[3], @x[1] # 125 3 [4]
pshufd \$0x93, @x[3], @t[3]
pxor @x[0], @x[2] # 25 016 [3]
pxor @x[7], @x[3] # 3 75
pxor @x[6], @x[7] # 75 6 [0]
pshufd \$0x93, @x[6], @t[6]
movdqa @x[4], @t[4]
pxor @x[4], @x[6] # 6 4
pxor @x[3], @x[4] # 4 375 [6]
pxor @x[7], @x[3] # 375 756=36
pxor @t[5], @x[6] # 64 5 [7]
pxor @t[2], @x[3] # 36 2
pxor @t[4], @x[3] # 362 4 [5]
pshufd \$0x93, @t[5], @t[5]
___
my @y = @x[7,5,0,2,1,3,4,6];
$code.=<<___;
# multiplication by 0x0b
pxor @y[0], @y[1]
pxor @t[0], @y[0]
pxor @t[1], @y[1]
pshufd \$0x93, @t[2], @t[2]
pxor @t[5], @y[0]
pxor @t[6], @y[1]
pxor @t[7], @y[0]
pshufd \$0x93, @t[4], @t[4]
pxor @t[6], @t[7] # clobber t[7]
pxor @y[0], @y[1]
pxor @t[0], @y[3]
pshufd \$0x93, @t[0], @t[0]
pxor @t[1], @y[2]
pxor @t[1], @y[4]
pxor @t[2], @y[2]
pshufd \$0x93, @t[1], @t[1]
pxor @t[2], @y[3]
pxor @t[2], @y[5]
pxor @t[7], @y[2]
pshufd \$0x93, @t[2], @t[2]
pxor @t[3], @y[3]
pxor @t[3], @y[6]
pxor @t[3], @y[4]
pshufd \$0x93, @t[3], @t[3]
pxor @t[4], @y[7]
pxor @t[4], @y[5]
pxor @t[7], @y[7]
pxor @t[5], @y[3]
pxor @t[4], @y[4]
pxor @t[5], @t[7] # clobber t[7] even more
pxor @t[7], @y[5]
pshufd \$0x93, @t[4], @t[4]
pxor @t[7], @y[6]
pxor @t[7], @y[4]
pxor @t[5], @t[7]
pshufd \$0x93, @t[5], @t[5]
pxor @t[6], @t[7] # restore t[7]
# multiplication by 0x0d
pxor @y[7], @y[4]
pxor @t[4], @y[7]
pshufd \$0x93, @t[6], @t[6]
pxor @t[0], @y[2]
pxor @t[5], @y[7]
pxor @t[2], @y[2]
pshufd \$0x93, @t[7], @t[7]
pxor @y[1], @y[3]
pxor @t[1], @y[1]
pxor @t[0], @y[0]
pxor @t[0], @y[3]
pxor @t[5], @y[1]
pxor @t[5], @y[0]
pxor @t[7], @y[1]
pshufd \$0x93, @t[0], @t[0]
pxor @t[6], @y[0]
pxor @y[1], @y[3]
pxor @t[1], @y[4]
pshufd \$0x93, @t[1], @t[1]
pxor @t[7], @y[7]
pxor @t[2], @y[4]
pxor @t[2], @y[5]
pshufd \$0x93, @t[2], @t[2]
pxor @t[6], @y[2]
pxor @t[3], @t[6] # clobber t[6]
pxor @y[7], @y[4]
pxor @t[6], @y[3]
pxor @t[6], @y[6]
pxor @t[5], @y[5]
pxor @t[4], @y[6]
pshufd \$0x93, @t[4], @t[4]
pxor @t[6], @y[5]
pxor @t[7], @y[6]
pxor @t[3], @t[6] # restore t[6]
pshufd \$0x93, @t[5], @t[5]
pshufd \$0x93, @t[6], @t[6]
pshufd \$0x93, @t[7], @t[7]
pshufd \$0x93, @t[3], @t[3]
# multiplication by 0x09
pxor @y[1], @y[4]
pxor @y[1], @t[1] # t[1]=y[1]
pxor @t[5], @t[0] # clobber t[0]
pxor @t[5], @t[1]
pxor @t[0], @y[3]
pxor @y[0], @t[0] # t[0]=y[0]
pxor @t[6], @t[1]
pxor @t[7], @t[6] # clobber t[6]
pxor @t[1], @y[4]
pxor @t[4], @y[7]
pxor @y[4], @t[4] # t[4]=y[4]
pxor @t[3], @y[6]
pxor @y[3], @t[3] # t[3]=y[3]
pxor @t[2], @y[5]
pxor @y[2], @t[2] # t[2]=y[2]
pxor @t[7], @t[3]
pxor @y[5], @t[5] # t[5]=y[5]
pxor @t[6], @t[2]
pxor @t[6], @t[5]
pxor @y[6], @t[6] # t[6]=y[6]
pxor @y[7], @t[7] # t[7]=y[7]
movdqa @t[0],@XMM[0]
movdqa @t[1],@XMM[1]
movdqa @t[2],@XMM[2]
movdqa @t[3],@XMM[3]
movdqa @t[4],@XMM[4]
movdqa @t[5],@XMM[5]
movdqa @t[6],@XMM[6]
movdqa @t[7],@XMM[7]
___
}
sub InvMixColumns {
my @x=@_[0..7];
my @t=@_[8..15];
# Thanks to Jussi Kivilinna for providing pointer to
#
# | 0e 0b 0d 09 |   | 02 03 01 01 |   | 05 00 04 00 |
# | 09 0e 0b 0d | = | 01 02 03 01 | x | 00 05 00 04 |
# | 0d 09 0e 0b |   | 01 01 02 03 |   | 04 00 05 00 |
# | 0b 0d 09 0e |   | 03 01 01 02 |   | 00 04 00 05 |
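#
# A quick way to double-check one entry of the factorization in
# GF(2^8) with the AES polynomial 0x11b (illustrative helper, not
# part of the build):
#
#   sub gf_mul { my ($a,$b)=@_; my $p=0;
#       for (0..7) { $p ^= $a if $b & 1;
#           $a = (($a << 1) ^ (($a & 0x80) ? 0x11b : 0)) & 0xff;
#           $b >>= 1; }
#       return $p; }
#   # top-left entry: gf_mul(0x02,0x05) ^ gf_mul(0x01,0x04) == 0x0e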
$code.=<<___;
# multiplication by 0x05-0x00-0x04-0x00
pshufd \$0x4E, @x[0], @t[0]
pshufd \$0x4E, @x[6], @t[6]
pxor @x[0], @t[0]
pshufd \$0x4E, @x[7], @t[7]
pxor @x[6], @t[6]
pshufd \$0x4E, @x[1], @t[1]
pxor @x[7], @t[7]
pshufd \$0x4E, @x[2], @t[2]
pxor @x[1], @t[1]
pshufd \$0x4E, @x[3], @t[3]
pxor @x[2], @t[2]
pxor @t[6], @x[0]
pxor @t[6], @x[1]
pshufd \$0x4E, @x[4], @t[4]
pxor @x[3], @t[3]
pxor @t[0], @x[2]
pxor @t[1], @x[3]
pshufd \$0x4E, @x[5], @t[5]
pxor @x[4], @t[4]
pxor @t[7], @x[1]
pxor @t[2], @x[4]
pxor @x[5], @t[5]
pxor @t[7], @x[2]
pxor @t[6], @x[3]
pxor @t[6], @x[4]
pxor @t[3], @x[5]
pxor @t[4], @x[6]
pxor @t[7], @x[4]
pxor @t[7], @x[5]
pxor @t[5], @x[7]
___
&MixColumns (@x,@t,1); # flipped 2<->3 and 4<->6
}
sub aesenc { # not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
movdqa 0x30($const),@t[0] # .LSR
___
&ShiftRows (@b,@t[0]);
&Sbox (@b,@t);
&MixColumns (@b[0,1,4,6,3,7,2,5],@t);
}
sub aesenclast { # not used
my @b=@_[0..7];
my @t=@_[8..15];
$code.=<<___;
movdqa 0x40($const),@t[0] # .LSRM0
___
&ShiftRows (@b,@t[0]);
&Sbox (@b,@t);
$code.=<<___
pxor 0x00($key),@b[0]
pxor 0x10($key),@b[1]
pxor 0x20($key),@b[4]
pxor 0x30($key),@b[6]
pxor 0x40($key),@b[3]
pxor 0x50($key),@b[7]
pxor 0x60($key),@b[2]
pxor 0x70($key),@b[5]
___
}
sub swapmove {
my ($a,$b,$n,$mask,$t)=@_;
$code.=<<___;
movdqa $b,$t
psrlq \$$n,$b
pxor $a,$b
pand $mask,$b
pxor $b,$a
psllq \$$n,$b
pxor $t,$b
___
}
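# swapmove() above is the classic "delta swap": it exchanges the bits
# of $a selected by $mask with the bits of $b selected by $mask<<$n,
# with $t preserving the untouched bits of $b. A rough scalar model
# of the same sequence (illustrative only, names hypothetical):
#
#   sub swapmove_scalar {
#       my ($a, $b, $n, $mask) = @_;
#       my $d = (($b >> $n) ^ $a) & $mask;
#       return ($a ^ $d, $b ^ ($d << $n));
#   }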
sub swapmove2x {
my ($a0,$b0,$a1,$b1,$n,$mask,$t0,$t1)=@_;
$code.=<<___;
movdqa $b0,$t0
psrlq \$$n,$b0
movdqa $b1,$t1
psrlq \$$n,$b1
pxor $a0,$b0
pxor $a1,$b1
pand $mask,$b0
pand $mask,$b1
pxor $b0,$a0
psllq \$$n,$b0
pxor $b1,$a1
psllq \$$n,$b1
pxor $t0,$b0
pxor $t1,$b1
___
}
sub bitslice {
my @x=reverse(@_[0..7]);
my ($t0,$t1,$t2,$t3)=@_[8..11];
$code.=<<___;
movdqa 0x00($const),$t0 # .LBS0
movdqa 0x10($const),$t1 # .LBS1
___
&swapmove2x(@x[0,1,2,3],1,$t0,$t2,$t3);
&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
$code.=<<___;
movdqa 0x20($const),$t0 # .LBS2
___
&swapmove2x(@x[0,2,1,3],2,$t1,$t2,$t3);
&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
&swapmove2x(@x[0,4,1,5],4,$t0,$t2,$t3);
&swapmove2x(@x[2,6,3,7],4,$t0,$t2,$t3);
}
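# The three swapmove2x passes above, at strides 1, 2 and 4 with the
# .LBS0/.LBS1/.LBS2 masks, form the textbook 8x8 bit-matrix transpose
# network (delta swaps at doubling distances), applied to every byte
# column of the eight registers at once: afterwards register i holds
# bit plane i of the input bytes. A rough scalar model on eight 8-bit
# rows (illustrative only, up to the bit-ordering convention):
#
#   sub transpose8 {
#       my @x = @_;                       # eight 8-bit rows
#       for ([1,0x55], [2,0x33], [4,0x0f]) {
#           my ($n, $m) = @$_;
#           for my $i (grep { !($_ & $n) } 0..7) {
#               my $d = (($x[$i+$n] >> $n) ^ $x[$i]) & $m;
#               $x[$i] ^= $d;
#               $x[$i+$n] = ($x[$i+$n] ^ ($d << $n)) & 0xff;
#           }
#       }
#       return @x;
#   }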
$code.=<<___;
.text
.extern asm_AES_encrypt
.extern asm_AES_decrypt
.type _bsaes_encrypt8,\@abi-omnipotent
.align 64
_bsaes_encrypt8:
lea .LBS0(%rip), $const # constants table
movdqa ($key), @XMM[9] # round 0 key
lea 0x10($key), $key
movdqa 0x50($const), @XMM[8] # .LM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[3]
pshufb @XMM[8], @XMM[0]
pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[5]
pshufb @XMM[8], @XMM[2]
pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[7]
pshufb @XMM[8], @XMM[4]
pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
_bsaes_encrypt8_bitslice:
___
&bitslice (@XMM[0..7, 8..11]);
$code.=<<___;
dec $rounds
jmp .Lenc_sbox
.align 16
.Lenc_loop:
___
&ShiftRows (@XMM[0..7, 8]);
$code.=".Lenc_sbox:\n";
&Sbox (@XMM[0..7, 8..15]);
$code.=<<___;
dec $rounds
jl .Lenc_done
___
&MixColumns (@XMM[0,1,4,6,3,7,2,5, 8..15]);
$code.=<<___;
movdqa 0x30($const), @XMM[8] # .LSR
jnz .Lenc_loop
movdqa 0x40($const), @XMM[8] # .LSRM0
jmp .Lenc_loop
.align 16
.Lenc_done:
___
# output in lsb > [t0, t1, t4, t6, t3, t7, t2, t5] < msb
&bitslice (@XMM[0,1,4,6,3,7,2,5, 8..11]);
$code.=<<___;
movdqa ($key), @XMM[8] # last round key
pxor @XMM[8], @XMM[4]
pxor @XMM[8], @XMM[6]
pxor @XMM[8], @XMM[3]
pxor @XMM[8], @XMM[7]
pxor @XMM[8], @XMM[2]
pxor @XMM[8], @XMM[5]
pxor @XMM[8], @XMM[0]
pxor @XMM[8], @XMM[1]
ret
.size _bsaes_encrypt8,.-_bsaes_encrypt8
.type _bsaes_decrypt8,\@abi-omnipotent
.align 64
_bsaes_decrypt8:
lea .LBS0(%rip), $const # constants table
movdqa ($key), @XMM[9] # round 0 key
lea 0x10($key), $key
movdqa -0x30($const), @XMM[8] # .LM0ISR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[3]
pshufb @XMM[8], @XMM[0]
pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[5]
pshufb @XMM[8], @XMM[2]
pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[7]
pshufb @XMM[8], @XMM[4]
pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
___
&bitslice (@XMM[0..7, 8..11]);
$code.=<<___;
dec $rounds
jmp .Ldec_sbox
.align 16
.Ldec_loop:
___
&ShiftRows (@XMM[0..7, 8]);
$code.=".Ldec_sbox:\n";
&InvSbox (@XMM[0..7, 8..15]);
$code.=<<___;
dec $rounds
jl .Ldec_done
___
&InvMixColumns (@XMM[0,1,6,4,2,7,3,5, 8..15]);
$code.=<<___;
movdqa -0x10($const), @XMM[8] # .LISR
jnz .Ldec_loop
movdqa -0x20($const), @XMM[8] # .LISRM0
jmp .Ldec_loop
.align 16
.Ldec_done:
___
&bitslice (@XMM[0,1,6,4,2,7,3,5, 8..11]);
$code.=<<___;
movdqa ($key), @XMM[8] # last round key
pxor @XMM[8], @XMM[6]
pxor @XMM[8], @XMM[4]
pxor @XMM[8], @XMM[2]
pxor @XMM[8], @XMM[7]
pxor @XMM[8], @XMM[3]
pxor @XMM[8], @XMM[5]
pxor @XMM[8], @XMM[0]
pxor @XMM[8], @XMM[1]
ret
.size _bsaes_decrypt8,.-_bsaes_decrypt8
___
}
{
my ($out,$inp,$rounds,$const)=("%rax","%rcx","%r10d","%r11");
sub bitslice_key {
my @x=reverse(@_[0..7]);
my ($bs0,$bs1,$bs2,$t2,$t3)=@_[8..12];
&swapmove (@x[0,1],1,$bs0,$t2,$t3);
$code.=<<___;
#&swapmove(@x[2,3],1,$t0,$t2,$t3);
movdqa @x[0], @x[2]
movdqa @x[1], @x[3]
___
#&swapmove2x(@x[4,5,6,7],1,$t0,$t2,$t3);
&swapmove2x (@x[0,2,1,3],2,$bs1,$t2,$t3);
$code.=<<___;
#&swapmove2x(@x[4,6,5,7],2,$t1,$t2,$t3);
movdqa @x[0], @x[4]
movdqa @x[2], @x[6]
movdqa @x[1], @x[5]
movdqa @x[3], @x[7]
___
&swapmove2x (@x[0,4,1,5],4,$bs2,$t2,$t3);
&swapmove2x (@x[2,6,3,7],4,$bs2,$t2,$t3);
}
$code.=<<___;
.type _bsaes_key_convert,\@abi-omnipotent
.align 16
_bsaes_key_convert:
lea .Lmasks(%rip), $const
movdqu ($inp), %xmm7 # load round 0 key
lea 0x10($inp), $inp
movdqa 0x00($const), %xmm0 # 0x01...
movdqa 0x10($const), %xmm1 # 0x02...
movdqa 0x20($const), %xmm2 # 0x04...
movdqa 0x30($const), %xmm3 # 0x08...
movdqa 0x40($const), %xmm4 # .LM0
pcmpeqd %xmm5, %xmm5 # .LNOT
movdqu ($inp), %xmm6 # load round 1 key
movdqa %xmm7, ($out) # save round 0 key
lea 0x10($out), $out
dec $rounds
jmp .Lkey_loop
.align 16
.Lkey_loop:
pshufb %xmm4, %xmm6 # .LM0
movdqa %xmm0, %xmm8
movdqa %xmm1, %xmm9
pand %xmm6, %xmm8
pand %xmm6, %xmm9
movdqa %xmm2, %xmm10
pcmpeqb %xmm0, %xmm8
psllq \$4, %xmm0 # 0x10...
movdqa %xmm3, %xmm11
pcmpeqb %xmm1, %xmm9
psllq \$4, %xmm1 # 0x20...
pand %xmm6, %xmm10
pand %xmm6, %xmm11
movdqa %xmm0, %xmm12
pcmpeqb %xmm2, %xmm10
psllq \$4, %xmm2 # 0x40...
movdqa %xmm1, %xmm13
pcmpeqb %xmm3, %xmm11
psllq \$4, %xmm3 # 0x80...
movdqa %xmm2, %xmm14
movdqa %xmm3, %xmm15
pxor %xmm5, %xmm8 # "pnot"
pxor %xmm5, %xmm9
pand %xmm6, %xmm12
pand %xmm6, %xmm13
movdqa %xmm8, 0x00($out) # write bit-sliced round key
pcmpeqb %xmm0, %xmm12
psrlq \$4, %xmm0 # 0x01...
movdqa %xmm9, 0x10($out)
pcmpeqb %xmm1, %xmm13
psrlq \$4, %xmm1 # 0x02...
lea 0x10($inp), $inp
pand %xmm6, %xmm14
pand %xmm6, %xmm15
movdqa %xmm10, 0x20($out)
pcmpeqb %xmm2, %xmm14
psrlq \$4, %xmm2 # 0x04...
movdqa %xmm11, 0x30($out)
pcmpeqb %xmm3, %xmm15
psrlq \$4, %xmm3 # 0x08...
movdqu ($inp), %xmm6 # load next round key
pxor %xmm5, %xmm13 # "pnot"
pxor %xmm5, %xmm14
movdqa %xmm12, 0x40($out)
movdqa %xmm13, 0x50($out)
movdqa %xmm14, 0x60($out)
movdqa %xmm15, 0x70($out)
lea 0x80($out),$out
dec $rounds
jnz .Lkey_loop
movdqa 0x50($const), %xmm7 # .L63
#movdqa %xmm6, ($out) # don't save last round key
ret
.size _bsaes_key_convert,.-_bsaes_key_convert
___
}
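# A note on the layout _bsaes_key_convert produces (rough sketch):
# each converted round key occupies 128 bytes holding eight bit
# planes, where byte j of plane i is 0xff iff bit i of round-key
# byte j is set. Planes 0, 1, 5 and 6 are stored inverted, which
# folds the S-box's affine constant 0x63 (bits 0, 1, 5, 6) into the
# schedule; the .L63 constant returned in %xmm7 lets the callers
# compensate when fixing up the unconverted first/last round keys.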
if (0 && !$win64) { # following four functions are unsupported interface
# used for benchmarking...
$code.=<<___;
.globl bsaes_enc_key_convert
.type bsaes_enc_key_convert,\@function,2
.align 16
bsaes_enc_key_convert:
mov 240($inp),%r10d # pass rounds
mov $inp,%rcx # pass key
mov $out,%rax # pass key schedule
call _bsaes_key_convert
pxor %xmm6,%xmm7 # fix up last round key
movdqa %xmm7,(%rax) # save last round key
ret
.size bsaes_enc_key_convert,.-bsaes_enc_key_convert
.globl bsaes_encrypt_128
.type bsaes_encrypt_128,\@function,4
.align 16
bsaes_encrypt_128:
.Lenc128_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
movdqu 0x60($inp), @XMM[6]
movdqu 0x70($inp), @XMM[7]
mov $key, %rax # pass the $key
lea 0x80($inp), $inp
mov \$10,%r10d
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[2], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$0x80,$len
ja .Lenc128_loop
ret
.size bsaes_encrypt_128,.-bsaes_encrypt_128
.globl bsaes_dec_key_convert
.type bsaes_dec_key_convert,\@function,2
.align 16
bsaes_dec_key_convert:
mov 240($inp),%r10d # pass rounds
mov $inp,%rcx # pass key
mov $out,%rax # pass key schedule
call _bsaes_key_convert
pxor ($out),%xmm7 # fix up round 0 key
movdqa %xmm6,(%rax) # save last round key
movdqa %xmm7,($out)
ret
.size bsaes_dec_key_convert,.-bsaes_dec_key_convert
.globl bsaes_decrypt_128
.type bsaes_decrypt_128,\@function,4
.align 16
bsaes_decrypt_128:
.Ldec128_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
movdqu 0x60($inp), @XMM[6]
movdqu 0x70($inp), @XMM[7]
mov $key, %rax # pass the $key
lea 0x80($inp), $inp
mov \$10,%r10d
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$0x80,$len
ja .Ldec128_loop
ret
.size bsaes_decrypt_128,.-bsaes_decrypt_128
___
}
{
######################################################################
#
# OpenSSL interface
#
my ($arg1,$arg2,$arg3,$arg4,$arg5,$arg6) = $win64 ? ("%rcx","%rdx","%r8","%r9","%r10","%r11d")
                                                  : ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
my ($inp,$out,$len,$key)=("%r12","%r13","%r14","%r15");
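# Under Win64 only the first four arguments arrive in registers, so
# $arg5/$arg6 above name scratch registers and the actual values are
# pulled off the stack in the Win64-only prologues below (e.g.
# "mov 0xa0(%rsp),$arg5"); on SysV all six arguments arrive in
# registers, per the usual calling conventions.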
if ($ecb) {
$code.=<<___;
.globl bsaes_ecb_encrypt_blocks
.type bsaes_ecb_encrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ecb_encrypt_blocks:
mov %rsp, %rax
.Lecb_enc_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lecb_enc_body:
___
$code.=<<___;
mov %rsp,%rbp # backup %rsp
mov 240($arg4),%eax # rounds
mov $arg1,$inp # backup arguments
mov $arg2,$out
mov $arg3,$len
mov $arg4,$key
cmp \$8,$arg3
jb .Lecb_enc_short
mov %eax,%ebx # backup rounds
shl \$7,%rax # 128 bytes per inner round key
sub \$`128-32`,%rax # size of bit-sliced key schedule
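# (for 10 rounds this reserves 10*128 - 96 = 1184 bytes: nine
# bit-sliced round keys of 128 bytes each, plus 16 bytes apiece
# for the round-0 and last round keys; illustrative arithmetic)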
sub %rax,%rsp
mov %rsp,%rax # pass key schedule
mov $key,%rcx # pass key
mov %ebx,%r10d # pass rounds
call _bsaes_key_convert
pxor %xmm6,%xmm7 # fix up last round key
movdqa %xmm7,(%rax) # save last round key
sub \$8,$len
.Lecb_enc_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
mov %rsp, %rax # pass key schedule
movdqu 0x60($inp), @XMM[6]
mov %ebx,%r10d # pass rounds
movdqu 0x70($inp), @XMM[7]
lea 0x80($inp), $inp
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[2], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$8,$len
jnc .Lecb_enc_loop
add \$8,$len
jz .Lecb_enc_done
movdqu 0x00($inp), @XMM[0] # load input
mov %rsp, %rax # pass key schedule
mov %ebx,%r10d # pass rounds
cmp \$2,$len
jb .Lecb_enc_one
movdqu 0x10($inp), @XMM[1]
je .Lecb_enc_two
movdqu 0x20($inp), @XMM[2]
cmp \$4,$len
jb .Lecb_enc_three
movdqu 0x30($inp), @XMM[3]
je .Lecb_enc_four
movdqu 0x40($inp), @XMM[4]
cmp \$6,$len
jb .Lecb_enc_five
movdqu 0x50($inp), @XMM[5]
je .Lecb_enc_six
movdqu 0x60($inp), @XMM[6]
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[2], 0x60($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_six:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_five:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_four:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_three:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_two:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
jmp .Lecb_enc_done
.align 16
.Lecb_enc_one:
call _bsaes_encrypt8
movdqu @XMM[0], 0x00($out) # write output
jmp .Lecb_enc_done
.align 16
.Lecb_enc_short:
lea ($inp), $arg1
lea ($out), $arg2
lea ($key), $arg3
call asm_AES_encrypt
lea 16($inp), $inp
lea 16($out), $out
dec $len
jnz .Lecb_enc_short
.Lecb_enc_done:
lea (%rsp),%rax
pxor %xmm0, %xmm0
.Lecb_enc_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
jb .Lecb_enc_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lecb_enc_epilogue:
ret
.size bsaes_ecb_encrypt_blocks,.-bsaes_ecb_encrypt_blocks
.globl bsaes_ecb_decrypt_blocks
.type bsaes_ecb_decrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ecb_decrypt_blocks:
mov %rsp, %rax
.Lecb_dec_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp),%rsp
___
$code.=<<___ if ($win64);
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lecb_dec_body:
___
$code.=<<___;
mov %rsp,%rbp # backup %rsp
mov 240($arg4),%eax # rounds
mov $arg1,$inp # backup arguments
mov $arg2,$out
mov $arg3,$len
mov $arg4,$key
cmp \$8,$arg3
jb .Lecb_dec_short
mov %eax,%ebx # backup rounds
shl \$7,%rax # 128 bytes per inner round key
sub \$`128-32`,%rax # size of bit-sliced key schedule
sub %rax,%rsp
mov %rsp,%rax # pass key schedule
mov $key,%rcx # pass key
mov %ebx,%r10d # pass rounds
call _bsaes_key_convert
pxor (%rsp),%xmm7 # fix up 0 round key
movdqa %xmm6,(%rax) # save last round key
movdqa %xmm7,(%rsp)
sub \$8,$len
.Lecb_dec_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
mov %rsp, %rax # pass key schedule
movdqu 0x60($inp), @XMM[6]
mov %ebx,%r10d # pass rounds
movdqu 0x70($inp), @XMM[7]
lea 0x80($inp), $inp
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$8,$len
jnc .Lecb_dec_loop
add \$8,$len
jz .Lecb_dec_done
movdqu 0x00($inp), @XMM[0] # load input
mov %rsp, %rax # pass key schedule
mov %ebx,%r10d # pass rounds
cmp \$2,$len
jb .Lecb_dec_one
movdqu 0x10($inp), @XMM[1]
je .Lecb_dec_two
movdqu 0x20($inp), @XMM[2]
cmp \$4,$len
jb .Lecb_dec_three
movdqu 0x30($inp), @XMM[3]
je .Lecb_dec_four
movdqu 0x40($inp), @XMM[4]
cmp \$6,$len
jb .Lecb_dec_five
movdqu 0x50($inp), @XMM[5]
je .Lecb_dec_six
movdqu 0x60($inp), @XMM[6]
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_six:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_five:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_four:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_three:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_two:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
jmp .Lecb_dec_done
.align 16
.Lecb_dec_one:
call _bsaes_decrypt8
movdqu @XMM[0], 0x00($out) # write output
jmp .Lecb_dec_done
.align 16
.Lecb_dec_short:
lea ($inp), $arg1
lea ($out), $arg2
lea ($key), $arg3
call asm_AES_decrypt
lea 16($inp), $inp
lea 16($out), $out
dec $len
jnz .Lecb_dec_short
.Lecb_dec_done:
lea (%rsp),%rax
pxor %xmm0, %xmm0
.Lecb_dec_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
jb .Lecb_dec_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lecb_dec_epilogue:
ret
.size bsaes_ecb_decrypt_blocks,.-bsaes_ecb_decrypt_blocks
___
}
$code.=<<___;
.extern asm_AES_cbc_encrypt
.globl bsaes_cbc_encrypt
.type bsaes_cbc_encrypt,\@abi-omnipotent
.align 16
bsaes_cbc_encrypt:
___
$code.=<<___ if ($win64);
mov 48(%rsp),$arg6 # pull direction flag
___
$code.=<<___;
cmp \$0,$arg6
jne asm_AES_cbc_encrypt
cmp \$128,$arg3
jb asm_AES_cbc_encrypt
mov %rsp, %rax
.Lcbc_dec_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull ivp
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lcbc_dec_body:
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
mov 240($arg4), %eax # rounds
mov $arg1, $inp # backup arguments
mov $arg2, $out
mov $arg3, $len
mov $arg4, $key
mov $arg5, %rbx
shr \$4, $len # bytes to blocks
mov %eax, %edx # rounds
shl \$7, %rax # 128 bytes per inner round key
sub \$`128-32`, %rax # size of bit-sliced key schedule
sub %rax, %rsp
mov %rsp, %rax # pass key schedule
mov $key, %rcx # pass key
mov %edx, %r10d # pass rounds
call _bsaes_key_convert
pxor (%rsp),%xmm7 # fix up 0 round key
movdqa %xmm6,(%rax) # save last round key
movdqa %xmm7,(%rsp)
movdqu (%rbx), @XMM[15] # load IV
sub \$8,$len
.Lcbc_dec_loop:
movdqu 0x00($inp), @XMM[0] # load input
movdqu 0x10($inp), @XMM[1]
movdqu 0x20($inp), @XMM[2]
movdqu 0x30($inp), @XMM[3]
movdqu 0x40($inp), @XMM[4]
movdqu 0x50($inp), @XMM[5]
mov %rsp, %rax # pass key schedule
movdqu 0x60($inp), @XMM[6]
mov %edx,%r10d # pass rounds
movdqu 0x70($inp), @XMM[7]
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[9]
pxor @XMM[8], @XMM[1]
movdqu 0x20($inp), @XMM[10]
pxor @XMM[9], @XMM[6]
movdqu 0x30($inp), @XMM[11]
pxor @XMM[10], @XMM[4]
movdqu 0x40($inp), @XMM[12]
pxor @XMM[11], @XMM[2]
movdqu 0x50($inp), @XMM[13]
pxor @XMM[12], @XMM[7]
movdqu 0x60($inp), @XMM[14]
pxor @XMM[13], @XMM[3]
movdqu 0x70($inp), @XMM[15] # IV
pxor @XMM[14], @XMM[5]
movdqu @XMM[0], 0x00($out) # write output
lea 0x80($inp), $inp
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
sub \$8,$len
jnc .Lcbc_dec_loop
add \$8,$len
jz .Lcbc_dec_done
movdqu 0x00($inp), @XMM[0] # load input
mov %rsp, %rax # pass key schedule
mov %edx, %r10d # pass rounds
cmp \$2,$len
jb .Lcbc_dec_one
movdqu 0x10($inp), @XMM[1]
je .Lcbc_dec_two
movdqu 0x20($inp), @XMM[2]
cmp \$4,$len
jb .Lcbc_dec_three
movdqu 0x30($inp), @XMM[3]
je .Lcbc_dec_four
movdqu 0x40($inp), @XMM[4]
cmp \$6,$len
jb .Lcbc_dec_five
movdqu 0x50($inp), @XMM[5]
je .Lcbc_dec_six
movdqu 0x60($inp), @XMM[6]
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[9]
pxor @XMM[8], @XMM[1]
movdqu 0x20($inp), @XMM[10]
pxor @XMM[9], @XMM[6]
movdqu 0x30($inp), @XMM[11]
pxor @XMM[10], @XMM[4]
movdqu 0x40($inp), @XMM[12]
pxor @XMM[11], @XMM[2]
movdqu 0x50($inp), @XMM[13]
pxor @XMM[12], @XMM[7]
movdqu 0x60($inp), @XMM[15] # IV
pxor @XMM[13], @XMM[3]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_six:
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[9]
pxor @XMM[8], @XMM[1]
movdqu 0x20($inp), @XMM[10]
pxor @XMM[9], @XMM[6]
movdqu 0x30($inp), @XMM[11]
pxor @XMM[10], @XMM[4]
movdqu 0x40($inp), @XMM[12]
pxor @XMM[11], @XMM[2]
movdqu 0x50($inp), @XMM[15] # IV
pxor @XMM[12], @XMM[7]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_five:
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[9]
pxor @XMM[8], @XMM[1]
movdqu 0x20($inp), @XMM[10]
pxor @XMM[9], @XMM[6]
movdqu 0x30($inp), @XMM[11]
pxor @XMM[10], @XMM[4]
movdqu 0x40($inp), @XMM[15] # IV
pxor @XMM[11], @XMM[2]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_four:
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[9]
pxor @XMM[8], @XMM[1]
movdqu 0x20($inp), @XMM[10]
pxor @XMM[9], @XMM[6]
movdqu 0x30($inp), @XMM[15] # IV
pxor @XMM[10], @XMM[4]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_three:
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[9]
pxor @XMM[8], @XMM[1]
movdqu 0x20($inp), @XMM[15] # IV
pxor @XMM[9], @XMM[6]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_two:
movdqa @XMM[15], 0x20(%rbp) # put aside IV
call _bsaes_decrypt8
pxor 0x20(%rbp), @XMM[0] # ^= IV
movdqu 0x00($inp), @XMM[8] # re-load input
movdqu 0x10($inp), @XMM[15] # IV
pxor @XMM[8], @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
jmp .Lcbc_dec_done
.align 16
.Lcbc_dec_one:
lea ($inp), $arg1
lea 0x20(%rbp), $arg2 # buffer output
lea ($key), $arg3
call asm_AES_decrypt # doesn't touch %xmm
pxor 0x20(%rbp), @XMM[15] # ^= IV
movdqu @XMM[15], ($out) # write output
movdqa @XMM[0], @XMM[15] # IV
.Lcbc_dec_done:
movdqu @XMM[15], (%rbx) # return IV
lea (%rsp), %rax
pxor %xmm0, %xmm0
.Lcbc_dec_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
ja .Lcbc_dec_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lcbc_dec_epilogue:
ret
.size bsaes_cbc_encrypt,.-bsaes_cbc_encrypt
.globl bsaes_ctr32_encrypt_blocks
.type bsaes_ctr32_encrypt_blocks,\@abi-omnipotent
.align 16
bsaes_ctr32_encrypt_blocks:
mov %rsp, %rax
.Lctr_enc_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull ivp
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lctr_enc_body:
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
movdqu ($arg5), %xmm0 # load counter
mov 240($arg4), %eax # rounds
mov $arg1, $inp # backup arguments
mov $arg2, $out
mov $arg3, $len
mov $arg4, $key
movdqa %xmm0, 0x20(%rbp) # copy counter
cmp \$8, $arg3
jb .Lctr_enc_short
mov %eax, %ebx # rounds
shl \$7, %rax # 128 bytes per inner round key
sub \$`128-32`, %rax # size of bit-sliced key schedule
sub %rax, %rsp
mov %rsp, %rax # pass key schedule
mov $key, %rcx # pass key
mov %ebx, %r10d # pass rounds
call _bsaes_key_convert
pxor %xmm6,%xmm7 # fix up last round key
movdqa %xmm7,(%rax) # save last round key
movdqa (%rsp), @XMM[9] # load round0 key
lea .LADD1(%rip), %r11
movdqa 0x20(%rbp), @XMM[0] # counter copy
movdqa -0x20(%r11), @XMM[8] # .LSWPUP
pshufb @XMM[8], @XMM[9] # byte swap upper part
pshufb @XMM[8], @XMM[0]
movdqa @XMM[9], (%rsp) # save adjusted round0 key
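# The counter block is kept with its upper half byte-reversed
# (.LSWPUP above) so that the big-endian 32-bit AES counter lands
# where plain paddd against .LADD1..LADD7 can increment it; the
# .LSWPUPM0SR shuffle in the loop undoes that swap while applying
# the usual bitslice input shuffle (rough summary of the trick).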
jmp .Lctr_enc_loop
.align 16
.Lctr_enc_loop:
movdqa @XMM[0], 0x20(%rbp) # save counter
movdqa @XMM[0], @XMM[1] # prepare 8 counter values
movdqa @XMM[0], @XMM[2]
paddd 0x00(%r11), @XMM[1] # .LADD1
movdqa @XMM[0], @XMM[3]
paddd 0x10(%r11), @XMM[2] # .LADD2
movdqa @XMM[0], @XMM[4]
paddd 0x20(%r11), @XMM[3] # .LADD3
movdqa @XMM[0], @XMM[5]
paddd 0x30(%r11), @XMM[4] # .LADD4
movdqa @XMM[0], @XMM[6]
paddd 0x40(%r11), @XMM[5] # .LADD5
movdqa @XMM[0], @XMM[7]
paddd 0x50(%r11), @XMM[6] # .LADD6
paddd 0x60(%r11), @XMM[7] # .LADD7
# Borrow prologue from _bsaes_encrypt8 to use the opportunity
# to flip byte order in 32-bit counter
movdqa (%rsp), @XMM[9] # round 0 key
lea 0x10(%rsp), %rax # pass key schedule
movdqa -0x10(%r11), @XMM[8] # .LSWPUPM0SR
pxor @XMM[9], @XMM[0] # xor with round0 key
pxor @XMM[9], @XMM[1]
pxor @XMM[9], @XMM[2]
pxor @XMM[9], @XMM[3]
pshufb @XMM[8], @XMM[0]
pshufb @XMM[8], @XMM[1]
pxor @XMM[9], @XMM[4]
pxor @XMM[9], @XMM[5]
pshufb @XMM[8], @XMM[2]
pshufb @XMM[8], @XMM[3]
pxor @XMM[9], @XMM[6]
pxor @XMM[9], @XMM[7]
pshufb @XMM[8], @XMM[4]
pshufb @XMM[8], @XMM[5]
pshufb @XMM[8], @XMM[6]
pshufb @XMM[8], @XMM[7]
lea .LBS0(%rip), %r11 # constants table
mov %ebx,%r10d # pass rounds
call _bsaes_encrypt8_bitslice
sub \$8,$len
jc .Lctr_enc_loop_done
movdqu 0x00($inp), @XMM[8] # load input
movdqu 0x10($inp), @XMM[9]
movdqu 0x20($inp), @XMM[10]
movdqu 0x30($inp), @XMM[11]
movdqu 0x40($inp), @XMM[12]
movdqu 0x50($inp), @XMM[13]
movdqu 0x60($inp), @XMM[14]
movdqu 0x70($inp), @XMM[15]
lea 0x80($inp),$inp
pxor @XMM[0], @XMM[8]
movdqa 0x20(%rbp), @XMM[0] # load counter
pxor @XMM[9], @XMM[1]
movdqu @XMM[8], 0x00($out) # write output
pxor @XMM[10], @XMM[4]
movdqu @XMM[1], 0x10($out)
pxor @XMM[11], @XMM[6]
movdqu @XMM[4], 0x20($out)
pxor @XMM[12], @XMM[3]
movdqu @XMM[6], 0x30($out)
pxor @XMM[13], @XMM[7]
movdqu @XMM[3], 0x40($out)
pxor @XMM[14], @XMM[2]
movdqu @XMM[7], 0x50($out)
pxor @XMM[15], @XMM[5]
movdqu @XMM[2], 0x60($out)
lea .LADD1(%rip), %r11
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
paddd 0x70(%r11), @XMM[0] # .LADD8
jnz .Lctr_enc_loop
jmp .Lctr_enc_done
.align 16
.Lctr_enc_loop_done:
add \$8, $len
movdqu 0x00($inp), @XMM[8] # load input
pxor @XMM[8], @XMM[0]
movdqu @XMM[0], 0x00($out) # write output
cmp \$2,$len
jb .Lctr_enc_done
movdqu 0x10($inp), @XMM[9]
pxor @XMM[9], @XMM[1]
movdqu @XMM[1], 0x10($out)
je .Lctr_enc_done
movdqu 0x20($inp), @XMM[10]
pxor @XMM[10], @XMM[4]
movdqu @XMM[4], 0x20($out)
cmp \$4,$len
jb .Lctr_enc_done
movdqu 0x30($inp), @XMM[11]
pxor @XMM[11], @XMM[6]
movdqu @XMM[6], 0x30($out)
je .Lctr_enc_done
movdqu 0x40($inp), @XMM[12]
pxor @XMM[12], @XMM[3]
movdqu @XMM[3], 0x40($out)
cmp \$6,$len
jb .Lctr_enc_done
movdqu 0x50($inp), @XMM[13]
pxor @XMM[13], @XMM[7]
movdqu @XMM[7], 0x50($out)
je .Lctr_enc_done
movdqu 0x60($inp), @XMM[14]
pxor @XMM[14], @XMM[2]
movdqu @XMM[2], 0x60($out)
jmp .Lctr_enc_done
.align 16
.Lctr_enc_short:
lea 0x20(%rbp), $arg1
lea 0x30(%rbp), $arg2
lea ($key), $arg3
call asm_AES_encrypt
movdqu ($inp), @XMM[1]
lea 16($inp), $inp
mov 0x2c(%rbp), %eax # load 32-bit counter
bswap %eax
pxor 0x30(%rbp), @XMM[1]
inc %eax # increment
movdqu @XMM[1], ($out)
bswap %eax
lea 16($out), $out
mov %eax, 0x2c(%rsp) # save 32-bit counter
dec $len
jnz .Lctr_enc_short
.Lctr_enc_done:
lea (%rsp), %rax
pxor %xmm0, %xmm0
.Lctr_enc_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
ja .Lctr_enc_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lctr_enc_epilogue:
ret
.size bsaes_ctr32_encrypt_blocks,.-bsaes_ctr32_encrypt_blocks
___
######################################################################
# void bsaes_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2,
# const unsigned char iv[16]);
#
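# Both directions derive the initial tweak by encrypting iv[] under
# key2 (the asm_AES_encrypt call below), as XTS specifies even for
# decryption; key1 then drives the bulk bit-sliced pass.
#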
my ($twmask,$twres,$twtmp)=@XMM[13..15];
$arg6=~s/d$//;
$code.=<<___;
.globl bsaes_xts_encrypt
.type bsaes_xts_encrypt,\@abi-omnipotent
.align 16
bsaes_xts_encrypt:
mov %rsp, %rax
.Lxts_enc_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull key2
mov 0xa8(%rsp),$arg6 # pull ivp
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lxts_enc_body:
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
mov $arg1, $inp # backup arguments
mov $arg2, $out
mov $arg3, $len
mov $arg4, $key
lea ($arg6), $arg1
lea 0x20(%rbp), $arg2
lea ($arg5), $arg3
call asm_AES_encrypt # generate initial tweak
mov 240($key), %eax # rounds
mov $len, %rbx # backup $len
mov %eax, %edx # rounds
shl \$7, %rax # 128 bytes per inner round key
sub \$`128-32`, %rax # size of bit-sliced key schedule
sub %rax, %rsp
mov %rsp, %rax # pass key schedule
mov $key, %rcx # pass key
mov %edx, %r10d # pass rounds
call _bsaes_key_convert
pxor %xmm6, %xmm7 # fix up last round key
movdqa %xmm7, (%rax) # save last round key
and \$-16, $len
sub \$0x80, %rsp # place for tweak[8]
movdqa 0x20(%rbp), @XMM[7] # initial tweak
pxor $twtmp, $twtmp
movdqa .Lxts_magic(%rip), $twmask
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
sub \$0x80, $len
jc .Lxts_enc_short
jmp .Lxts_enc_loop
.align 16
.Lxts_enc_loop:
___
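# Each pass below multiplies the tweak by x in GF(2^128) with SSE2
# only: pcmpgtd against zero turns each dword's sign bit into a full
# mask (the top dword of each 64-bit half is what matters), pshufd
# 0x13 moves those carry masks in front of the bits they feed, paddq
# doubles both halves, and pand with .Lxts_magic (0x87, the x^128
# reduction) plus pxor folds the carries back in. A rough scalar
# model of one doubling ($hi/$lo are hypothetical, illustrative only):
#
#   my $carry = $hi >> 63;
#   $hi = (($hi << 1) | ($lo >> 63)) & 0xffff_ffff_ffff_ffff;
#   $lo = (($lo << 1) & 0xffff_ffff_ffff_ffff) ^ ($carry ? 0x87 : 0);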
for ($i=0;$i<7;$i++) {
$code.=<<___;
pshufd \$0x13, $twtmp, $twres
pxor $twtmp, $twtmp
movdqa @XMM[7], @XMM[$i]
movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
paddq @XMM[7], @XMM[7] # psllq 1,$tweak
pand $twmask, $twres # isolate carry and residue
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
movdqu 0x60($inp), @XMM[8+6]
pxor @XMM[8+5], @XMM[5]
movdqu 0x70($inp), @XMM[8+7]
lea 0x80($inp), $inp
movdqa @XMM[7], 0x70(%rsp)
pxor @XMM[8+6], @XMM[6]
lea 0x80(%rsp), %rax # pass key schedule
pxor @XMM[8+7], @XMM[7]
mov %edx, %r10d # pass rounds
call _bsaes_encrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[4]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[6]
movdqu @XMM[4], 0x20($out)
pxor 0x40(%rsp), @XMM[3]
movdqu @XMM[6], 0x30($out)
pxor 0x50(%rsp), @XMM[7]
movdqu @XMM[3], 0x40($out)
pxor 0x60(%rsp), @XMM[2]
movdqu @XMM[7], 0x50($out)
pxor 0x70(%rsp), @XMM[5]
movdqu @XMM[2], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
pxor $twtmp, $twtmp
movdqa .Lxts_magic(%rip), $twmask
pcmpgtd @XMM[7], $twtmp
pshufd \$0x13, $twtmp, $twres
pxor $twtmp, $twtmp
paddq @XMM[7], @XMM[7] # psllq 1,$tweak
pand $twmask, $twres # isolate carry and residue
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
pxor $twres, @XMM[7]
sub \$0x80,$len
jnc .Lxts_enc_loop
.Lxts_enc_short:
add \$0x80, $len
jz .Lxts_enc_done
___
for ($i=0;$i<7;$i++) {
$code.=<<___;
pshufd \$0x13, $twtmp, $twres
pxor $twtmp, $twtmp
movdqa @XMM[7], @XMM[$i]
movdqa @XMM[7], `0x10*$i`(%rsp)# save tweak[$i]
paddq @XMM[7], @XMM[7] # psllq 1,$tweak
pand $twmask, $twres # isolate carry and residue
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
cmp \$`0x10*$i`,$len
je .Lxts_enc_$i
___
$code.=<<___ if ($i>=2);
pxor @XMM[8+$i-2], @XMM[$i-2]# input[] ^ tweak[]
___
}
$code.=<<___;
movdqu 0x60($inp), @XMM[8+6]
pxor @XMM[8+5], @XMM[5]
movdqa @XMM[7], 0x70(%rsp)
lea 0x70($inp), $inp
pxor @XMM[8+6], @XMM[6]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_encrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[4]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[6]
movdqu @XMM[4], 0x20($out)
pxor 0x40(%rsp), @XMM[3]
movdqu @XMM[6], 0x30($out)
pxor 0x50(%rsp), @XMM[7]
movdqu @XMM[3], 0x40($out)
pxor 0x60(%rsp), @XMM[2]
movdqu @XMM[7], 0x50($out)
movdqu @XMM[2], 0x60($out)
lea 0x70($out), $out
movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_enc_done
.align 16
.Lxts_enc_6:
pxor @XMM[8+4], @XMM[4]
lea 0x60($inp), $inp
pxor @XMM[8+5], @XMM[5]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_encrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[4]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[6]
movdqu @XMM[4], 0x20($out)
pxor 0x40(%rsp), @XMM[3]
movdqu @XMM[6], 0x30($out)
pxor 0x50(%rsp), @XMM[7]
movdqu @XMM[3], 0x40($out)
movdqu @XMM[7], 0x50($out)
lea 0x60($out), $out
movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_enc_done
.align 16
.Lxts_enc_5:
pxor @XMM[8+3], @XMM[3]
lea 0x50($inp), $inp
pxor @XMM[8+4], @XMM[4]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_encrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[4]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[6]
movdqu @XMM[4], 0x20($out)
pxor 0x40(%rsp), @XMM[3]
movdqu @XMM[6], 0x30($out)
movdqu @XMM[3], 0x40($out)
lea 0x50($out), $out
movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_enc_done
.align 16
.Lxts_enc_4:
pxor @XMM[8+2], @XMM[2]
lea 0x40($inp), $inp
pxor @XMM[8+3], @XMM[3]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_encrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[4]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[6]
movdqu @XMM[4], 0x20($out)
movdqu @XMM[6], 0x30($out)
lea 0x40($out), $out
movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_enc_done
.align 16
.Lxts_enc_3:
pxor @XMM[8+1], @XMM[1]
lea 0x30($inp), $inp
pxor @XMM[8+2], @XMM[2]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_encrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[4]
movdqu @XMM[1], 0x10($out)
movdqu @XMM[4], 0x20($out)
lea 0x30($out), $out
movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_enc_done
.align 16
.Lxts_enc_2:
pxor @XMM[8+0], @XMM[0]
lea 0x20($inp), $inp
pxor @XMM[8+1], @XMM[1]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_encrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
lea 0x20($out), $out
movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_enc_done
.align 16
.Lxts_enc_1:
pxor @XMM[0], @XMM[8]
lea 0x10($inp), $inp
movdqa @XMM[8], 0x20(%rbp)
lea 0x20(%rbp), $arg1
lea 0x20(%rbp), $arg2
lea ($key), $arg3
call asm_AES_encrypt # doesn't touch %xmm
pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
#pxor @XMM[8], @XMM[0]
#lea 0x80(%rsp), %rax # pass key schedule
#mov %edx, %r10d # pass rounds
#call _bsaes_encrypt8
#pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
movdqu @XMM[0], 0x00($out) # write output
lea 0x10($out), $out
movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
.Lxts_enc_done:
and \$15, %ebx
jz .Lxts_enc_ret
mov $out, %rdx
.Lxts_enc_steal:
movzb ($inp), %eax
movzb -16(%rdx), %ecx
lea 1($inp), $inp
mov %al, -16(%rdx)
mov %cl, 0(%rdx)
lea 1(%rdx), %rdx
sub \$1,%ebx
jnz .Lxts_enc_steal
movdqu -16($out), @XMM[0]
lea 0x20(%rbp), $arg1
pxor @XMM[7], @XMM[0]
lea 0x20(%rbp), $arg2
movdqa @XMM[0], 0x20(%rbp)
lea ($key), $arg3
call asm_AES_encrypt # doesn't touch %xmm
pxor 0x20(%rbp), @XMM[7]
movdqu @XMM[7], -16($out)
.Lxts_enc_ret:
lea (%rsp), %rax
pxor %xmm0, %xmm0
.Lxts_enc_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
ja .Lxts_enc_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lxts_enc_epilogue:
ret
.size bsaes_xts_encrypt,.-bsaes_xts_encrypt
.globl bsaes_xts_decrypt
.type bsaes_xts_decrypt,\@abi-omnipotent
.align 16
bsaes_xts_decrypt:
mov %rsp, %rax
.Lxts_dec_prologue:
push %rbp
push %rbx
push %r12
push %r13
push %r14
push %r15
lea -0x48(%rsp), %rsp
___
$code.=<<___ if ($win64);
mov 0xa0(%rsp),$arg5 # pull key2
mov 0xa8(%rsp),$arg6 # pull ivp
lea -0xa0(%rsp), %rsp
movaps %xmm6, 0x40(%rsp)
movaps %xmm7, 0x50(%rsp)
movaps %xmm8, 0x60(%rsp)
movaps %xmm9, 0x70(%rsp)
movaps %xmm10, 0x80(%rsp)
movaps %xmm11, 0x90(%rsp)
movaps %xmm12, 0xa0(%rsp)
movaps %xmm13, 0xb0(%rsp)
movaps %xmm14, 0xc0(%rsp)
movaps %xmm15, 0xd0(%rsp)
.Lxts_dec_body:
___
$code.=<<___;
mov %rsp, %rbp # backup %rsp
mov $arg1, $inp # backup arguments
mov $arg2, $out
mov $arg3, $len
mov $arg4, $key
lea ($arg6), $arg1
lea 0x20(%rbp), $arg2
lea ($arg5), $arg3
call asm_AES_encrypt # generate initial tweak
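# note: XTS derives the tweak by *encrypting* the IV under key2, so
# asm_AES_encrypt is used here even on the decrypt path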
mov 240($key), %eax # rounds
mov $len, %rbx # backup $len
mov %eax, %edx # rounds
shl \$7, %rax # 128 bytes per inner round key
sub \$`128-32`, %rax # size of bit-sliced key schedule
sub %rax, %rsp
mov %rsp, %rax # pass key schedule
mov $key, %rcx # pass key
mov %edx, %r10d # pass rounds
call _bsaes_key_convert
pxor (%rsp), %xmm7 # fix up round 0 key
movdqa %xmm6, (%rax) # save last round key
movdqa %xmm7, (%rsp)
xor %eax, %eax # if ($len%16) len-=16;
and \$-16, $len
test \$15, %ebx
setnz %al
shl \$4, %rax
sub %rax, $len
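# eax is now 16 if the input length was not a multiple of 16 and 0
# otherwise, so the subtraction above reserves the last complete block
# for the ciphertext-stealing tail without a conditional branch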
sub \$0x80, %rsp # place for tweak[8]
movdqa 0x20(%rbp), @XMM[7] # initial tweak
pxor $twtmp, $twtmp
movdqa .Lxts_magic(%rip), $twmask
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
sub \$0x80, $len
jc .Lxts_dec_short
jmp .Lxts_dec_loop
.align 16
.Lxts_dec_loop:
___
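# The loop below emits seven unrolled copies of the XTS tweak update.
# Each update multiplies the 128-bit tweak by x in GF(2^128) modulo
# x^128 + x^7 + x^2 + x + 1: pcmpgtd broadcasts the sign bit of each
# 64-bit half across its dword, pshufd 0x13 moves those masks into
# carry position, pand with .Lxts_magic reduces them to 0x87 (bit-127
# wrap-around) and bit 64 (bit-63 carry), and paddq/pxor complete the
# shift.  A scalar sketch of one update:
#
#	carry    = tweak[1] >> 63;
#	tweak[1] = (tweak[1] << 1) | (tweak[0] >> 63);
#	tweak[0] = (tweak[0] << 1) ^ (carry ? 0x87 : 0);
#
# Tweak i is also saved at 0x10*i(%rsp) for the post-decryption xor,
# and input block i-1 is loaded and xored on the fly.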
for ($i=0;$i<7;$i++) {
$code.=<<___;
pshufd \$0x13, $twtmp, $twres
pxor $twtmp, $twtmp
movdqa @XMM[7], @XMM[$i]
movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
paddq @XMM[7], @XMM[7] # psllq 1,$tweak
pand $twmask, $twres # isolate carry and residue
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
___
$code.=<<___ if ($i>=2);
pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
___
}
$code.=<<___;
movdqu 0x60($inp), @XMM[8+6]
pxor @XMM[8+5], @XMM[5]
movdqu 0x70($inp), @XMM[8+7]
lea 0x80($inp), $inp
movdqa @XMM[7], 0x70(%rsp)
pxor @XMM[8+6], @XMM[6]
lea 0x80(%rsp), %rax # pass key schedule
pxor @XMM[8+7], @XMM[7]
mov %edx, %r10d # pass rounds
call _bsaes_decrypt8
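# the bit-sliced core returns the eight blocks permuted as
# 0,1,6,4,2,7,3,5; the interleaved xors and stores below apply the
# per-block tweaks and undo that permutation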
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[6]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[4]
movdqu @XMM[6], 0x20($out)
pxor 0x40(%rsp), @XMM[2]
movdqu @XMM[4], 0x30($out)
pxor 0x50(%rsp), @XMM[7]
movdqu @XMM[2], 0x40($out)
pxor 0x60(%rsp), @XMM[3]
movdqu @XMM[7], 0x50($out)
pxor 0x70(%rsp), @XMM[5]
movdqu @XMM[3], 0x60($out)
movdqu @XMM[5], 0x70($out)
lea 0x80($out), $out
movdqa 0x70(%rsp), @XMM[7] # prepare next iteration tweak
pxor $twtmp, $twtmp
movdqa .Lxts_magic(%rip), $twmask
pcmpgtd @XMM[7], $twtmp
pshufd \$0x13, $twtmp, $twres
pxor $twtmp, $twtmp
paddq @XMM[7], @XMM[7] # psllq 1,$tweak
pand $twmask, $twres # isolate carry and residue
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
pxor $twres, @XMM[7]
sub \$0x80,$len
jnc .Lxts_dec_loop
.Lxts_dec_short:
add \$0x80, $len
jz .Lxts_dec_done
___
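# Short-input dispatch: the loop below computes and saves one tweak per
# remaining block, loading ciphertext blocks as it goes; once the
# remaining length equals 0x10*i bytes it branches to .Lxts_dec_i, the
# tail that decrypts exactly i blocks (falling through to the
# seven-block path when none of the compares hit).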
for ($i=0;$i<7;$i++) {
$code.=<<___;
pshufd \$0x13, $twtmp, $twres
pxor $twtmp, $twtmp
movdqa @XMM[7], @XMM[$i]
movdqa @XMM[7], `0x10*$i`(%rsp) # save tweak[$i]
paddq @XMM[7], @XMM[7] # psllq 1,$tweak
pand $twmask, $twres # isolate carry and residue
pcmpgtd @XMM[7], $twtmp # broadcast upper bits
pxor $twres, @XMM[7]
___
$code.=<<___ if ($i>=1);
movdqu `0x10*($i-1)`($inp), @XMM[8+$i-1]
cmp \$`0x10*$i`,$len
je .Lxts_dec_$i
___
$code.=<<___ if ($i>=2);
pxor @XMM[8+$i-2], @XMM[$i-2] # input[] ^ tweak[]
___
}
$code.=<<___;
movdqu 0x60($inp), @XMM[8+6]
pxor @XMM[8+5], @XMM[5]
movdqa @XMM[7], 0x70(%rsp)
lea 0x70($inp), $inp
pxor @XMM[8+6], @XMM[6]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_decrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[6]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[4]
movdqu @XMM[6], 0x20($out)
pxor 0x40(%rsp), @XMM[2]
movdqu @XMM[4], 0x30($out)
pxor 0x50(%rsp), @XMM[7]
movdqu @XMM[2], 0x40($out)
pxor 0x60(%rsp), @XMM[3]
movdqu @XMM[7], 0x50($out)
movdqu @XMM[3], 0x60($out)
lea 0x70($out), $out
movdqa 0x70(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_dec_done
.align 16
.Lxts_dec_6:
pxor @XMM[8+4], @XMM[4]
lea 0x60($inp), $inp
pxor @XMM[8+5], @XMM[5]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_decrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[6]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[4]
movdqu @XMM[6], 0x20($out)
pxor 0x40(%rsp), @XMM[2]
movdqu @XMM[4], 0x30($out)
pxor 0x50(%rsp), @XMM[7]
movdqu @XMM[2], 0x40($out)
movdqu @XMM[7], 0x50($out)
lea 0x60($out), $out
movdqa 0x60(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_dec_done
.align 16
.Lxts_dec_5:
pxor @XMM[8+3], @XMM[3]
lea 0x50($inp), $inp
pxor @XMM[8+4], @XMM[4]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_decrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[6]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[4]
movdqu @XMM[6], 0x20($out)
pxor 0x40(%rsp), @XMM[2]
movdqu @XMM[4], 0x30($out)
movdqu @XMM[2], 0x40($out)
lea 0x50($out), $out
movdqa 0x50(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_dec_done
.align 16
.Lxts_dec_4:
pxor @XMM[8+2], @XMM[2]
lea 0x40($inp), $inp
pxor @XMM[8+3], @XMM[3]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_decrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[6]
movdqu @XMM[1], 0x10($out)
pxor 0x30(%rsp), @XMM[4]
movdqu @XMM[6], 0x20($out)
movdqu @XMM[4], 0x30($out)
lea 0x40($out), $out
movdqa 0x40(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_dec_done
.align 16
.Lxts_dec_3:
pxor @XMM[8+1], @XMM[1]
lea 0x30($inp), $inp
pxor @XMM[8+2], @XMM[2]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_decrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
pxor 0x20(%rsp), @XMM[6]
movdqu @XMM[1], 0x10($out)
movdqu @XMM[6], 0x20($out)
lea 0x30($out), $out
movdqa 0x30(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_dec_done
.align 16
.Lxts_dec_2:
pxor @XMM[8+0], @XMM[0]
lea 0x20($inp), $inp
pxor @XMM[8+1], @XMM[1]
lea 0x80(%rsp), %rax # pass key schedule
mov %edx, %r10d # pass rounds
call _bsaes_decrypt8
pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
pxor 0x10(%rsp), @XMM[1]
movdqu @XMM[0], 0x00($out) # write output
movdqu @XMM[1], 0x10($out)
lea 0x20($out), $out
movdqa 0x20(%rsp), @XMM[7] # next iteration tweak
jmp .Lxts_dec_done
.align 16
.Lxts_dec_1:
pxor @XMM[0], @XMM[8]
lea 0x10($inp), $inp
movdqa @XMM[8], 0x20(%rbp)
lea 0x20(%rbp), $arg1
lea 0x20(%rbp), $arg2
lea ($key), $arg3
call asm_AES_decrypt # doesn't touch %xmm
pxor 0x20(%rbp), @XMM[0] # ^= tweak[]
#pxor @XMM[8], @XMM[0]
#lea 0x80(%rsp), %rax # pass key schedule
#mov %edx, %r10d # pass rounds
#call _bsaes_decrypt8
#pxor 0x00(%rsp), @XMM[0] # ^= tweak[]
movdqu @XMM[0], 0x00($out) # write output
lea 0x10($out), $out
movdqa 0x10(%rsp), @XMM[7] # next iteration tweak
.Lxts_dec_done:
and \$15, %ebx
jz .Lxts_dec_ret
pxor $twtmp, $twtmp
movdqa .Lxts_magic(%rip), $twmask
pcmpgtd @XMM[7], $twtmp
pshufd \$0x13, $twtmp, $twres
movdqa @XMM[7], @XMM[6]
paddq @XMM[7], @XMM[7] # psllq 1,$tweak
pand $twmask, $twres # isolate carry and residue
movdqu ($inp), @XMM[0]
pxor $twres, @XMM[7]
lea 0x20(%rbp), $arg1
pxor @XMM[7], @XMM[0]
lea 0x20(%rbp), $arg2
movdqa @XMM[0], 0x20(%rbp)
lea ($key), $arg3
call asm_AES_decrypt # doesn't touch %xmm
pxor 0x20(%rbp), @XMM[7]
mov $out, %rdx
movdqu @XMM[7], ($out)
.Lxts_dec_steal:
movzb 16($inp), %eax
movzb (%rdx), %ecx
lea 1($inp), $inp
mov %al, (%rdx)
mov %cl, 16(%rdx)
lea 1(%rdx), %rdx
sub \$1,%ebx
jnz .Lxts_dec_steal
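# Decryption-side stealing: one extra tweak (tweak m) was computed
# above, with tweak m-1 parked in xmm6.  The last full ciphertext block
# was decrypted with tweak m, so its leading bytes are the final
# partial plaintext.  The loop above swaps the partial ciphertext tail
# into those leading positions and moves the recovered plaintext bytes
# out to offset 16; the re-assembled block is then decrypted below with
# tweak m-1 to yield the last full plaintext block.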
movdqu ($out), @XMM[0]
lea 0x20(%rbp), $arg1
pxor @XMM[6], @XMM[0]
lea 0x20(%rbp), $arg2
movdqa @XMM[0], 0x20(%rbp)
lea ($key), $arg3
call asm_AES_decrypt # doesn't touch %xmm
pxor 0x20(%rbp), @XMM[6]
movdqu @XMM[6], ($out)
.Lxts_dec_ret:
lea (%rsp), %rax
pxor %xmm0, %xmm0
.Lxts_dec_bzero: # wipe key schedule [if any]
movdqa %xmm0, 0x00(%rax)
movdqa %xmm0, 0x10(%rax)
lea 0x20(%rax), %rax
cmp %rax, %rbp
ja .Lxts_dec_bzero
lea (%rbp),%rsp # restore %rsp
___
$code.=<<___ if ($win64);
movaps 0x40(%rbp), %xmm6
movaps 0x50(%rbp), %xmm7
movaps 0x60(%rbp), %xmm8
movaps 0x70(%rbp), %xmm9
movaps 0x80(%rbp), %xmm10
movaps 0x90(%rbp), %xmm11
movaps 0xa0(%rbp), %xmm12
movaps 0xb0(%rbp), %xmm13
movaps 0xc0(%rbp), %xmm14
movaps 0xd0(%rbp), %xmm15
lea 0xa0(%rbp), %rsp
___
$code.=<<___;
mov 0x48(%rsp), %r15
mov 0x50(%rsp), %r14
mov 0x58(%rsp), %r13
mov 0x60(%rsp), %r12
mov 0x68(%rsp), %rbx
mov 0x70(%rsp), %rax
lea 0x78(%rsp), %rsp
mov %rax, %rbp
.Lxts_dec_epilogue:
ret
.size bsaes_xts_decrypt,.-bsaes_xts_decrypt
___
}
$code.=<<___;
.type _bsaes_const,\@object
.align 64
_bsaes_const:
.LM0ISR: # InvShiftRows constants
.quad 0x0a0e0206070b0f03, 0x0004080c0d010509
.LISRM0:
.quad 0x01040b0e0205080f, 0x0306090c00070a0d
.LISR:
.quad 0x0504070602010003, 0x0f0e0d0c080b0a09
.LBS0: # bit-slice constants
.quad 0x5555555555555555, 0x5555555555555555
.LBS1:
.quad 0x3333333333333333, 0x3333333333333333
.LBS2:
.quad 0x0f0f0f0f0f0f0f0f, 0x0f0f0f0f0f0f0f0f
.LSR: # shiftrows constants
.quad 0x0504070600030201, 0x0f0e0d0c0a09080b
.LSRM0:
.quad 0x0304090e00050a0f, 0x01060b0c0207080d
.LM0SR:
.quad 0x0a0e02060f03070b, 0x0004080c05090d01
.LSWPUP: # byte-swap upper dword
.quad 0x0706050403020100, 0x0c0d0e0f0b0a0908
.LSWPUPM0SR:
.quad 0x0a0d02060c03070b, 0x0004080f05090e01
.LADD1: # counter increment constants
.quad 0x0000000000000000, 0x0000000100000000
.LADD2:
.quad 0x0000000000000000, 0x0000000200000000
.LADD3:
.quad 0x0000000000000000, 0x0000000300000000
.LADD4:
.quad 0x0000000000000000, 0x0000000400000000
.LADD5:
.quad 0x0000000000000000, 0x0000000500000000
.LADD6:
.quad 0x0000000000000000, 0x0000000600000000
.LADD7:
.quad 0x0000000000000000, 0x0000000700000000
.LADD8:
.quad 0x0000000000000000, 0x0000000800000000
.Lxts_magic:
.long 0x87,0,1,0
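# 0x87 is the XTS reduction constant for x^128 + x^7 + x^2 + x + 1;
# combined with pshufd 0x13 and pand in the tweak update, dword 0 folds
# the bit-127 carry back into the low byte while dword 2 carries bit 63
# into the high qword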
.Lmasks: # bit-extraction masks for key schedule conversion
.quad 0x0101010101010101, 0x0101010101010101
.quad 0x0202020202020202, 0x0202020202020202
.quad 0x0404040404040404, 0x0404040404040404
.quad 0x0808080808080808, 0x0808080808080808
.LM0: # bit-slice input permutation
.quad 0x02060a0e03070b0f, 0x0004080c0105090d
.L63: # 0x63, the SubBytes affine constant
.quad 0x6363636363636363, 0x6363636363636363
.asciz "Bit-sliced AES for x86_64/SSSE3, Emilia Käsper, Peter Schwabe, Andy Polyakov"
.align 64
.size _bsaes_const,.-_bsaes_const
___
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
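# se_handler implements Win64 structured-exception unwinding for the
# routines above.  Each routine's first instruction copies %rsp to
# %rax, so if the fault RIP is still inside the prologue (or already at
# the epilogue label) the caller's stack pointer can be taken straight
# from CONTEXT.Rax/Rsp.  Otherwise the frame is live: the handler
# reloads the frame base from CONTEXT.Rbp, copies the ten saved xmm
# registers back into the CONTEXT, restores the six non-volatile GPRs
# and the caller's %rsp, and finally chains to RtlVirtualUnwind.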
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lin_prologue
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lin_prologue
mov 160($context),%rax # pull context->Rbp
lea 0x40(%rax),%rsi # %xmm save area
lea 512($context),%rdi # &context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
lea 0xa0(%rax),%rax # adjust stack pointer
mov 0x70(%rax),%rbp
mov 0x68(%rax),%rbx
mov 0x60(%rax),%r12
mov 0x58(%rax),%r13
mov 0x50(%rax),%r14
mov 0x48(%rax),%r15
lea 0x78(%rax),%rax # adjust stack pointer
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lin_prologue:
mov %rax,152($context) # restore context->Rsp
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$`1232/8`,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
___
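# Each .pdata entry is a RUNTIME_FUNCTION: the RVAs of a routine's
# prologue and epilogue plus the RVA of its UNWIND_INFO record in
# .xdata.  The .byte 9,0,0,0 header below encodes version 1 with
# UNW_FLAG_EHANDLER, so Windows dispatches to se_handler with the
# body/epilogue label pair as HandlerData.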
$code.=<<___ if ($ecb);
.rva .Lecb_enc_prologue
.rva .Lecb_enc_epilogue
.rva .Lecb_enc_info
.rva .Lecb_dec_prologue
.rva .Lecb_dec_epilogue
.rva .Lecb_dec_info
___
$code.=<<___;
.rva .Lcbc_dec_prologue
.rva .Lcbc_dec_epilogue
.rva .Lcbc_dec_info
.rva .Lctr_enc_prologue
.rva .Lctr_enc_epilogue
.rva .Lctr_enc_info
.rva .Lxts_enc_prologue
.rva .Lxts_enc_epilogue
.rva .Lxts_enc_info
.rva .Lxts_dec_prologue
.rva .Lxts_dec_epilogue
.rva .Lxts_dec_info
.section .xdata
.align 8
___
$code.=<<___ if ($ecb);
.Lecb_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_enc_body,.Lecb_enc_epilogue # HandlerData[]
.Lecb_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lecb_dec_body,.Lecb_dec_epilogue # HandlerData[]
___
$code.=<<___;
.Lcbc_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lcbc_dec_body,.Lcbc_dec_epilogue # HandlerData[]
.Lctr_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lctr_enc_body,.Lctr_enc_epilogue # HandlerData[]
.Lxts_enc_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_enc_body,.Lxts_enc_epilogue # HandlerData[]
.Lxts_dec_info:
.byte 9,0,0,0
.rva se_handler
.rva .Lxts_dec_body,.Lxts_dec_epilogue # HandlerData[]
___
}
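# Constant-fold the remaining `...` spans: arithmetic such as `128-32`
# that survived heredoc interpolation is evaluated as Perl and replaced
# with its numeric value, e.g.
#
#	sub	\$`128-32`, %rax   ->   sub	$96, %rax
#
# before the finished assembly is written to stdout.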
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
close STDOUT;