boringssl/crypto/aes/asm/aesni-x86.pl
Adam Langley cf9a98cc0c x86 assembly pack: update performance results.
(Imports upstream's a30b0522cb937be54e172c68b0e9f5fa6ec30bf3.)

Change-Id: I6b9e67f97de935ecaaa9524943c6bdbe3540c0d0
Reviewed-on: https://boringssl-review.googlesource.com/13780
Commit-Queue: Adam Langley <agl@google.com>
Reviewed-by: David Benjamin <davidben@google.com>
2017-02-14 00:44:17 +00:00

2534 lines
74 KiB
Perl
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env perl
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
# This module implements support for Intel AES-NI extension. In
# OpenSSL context it's used with Intel engine, but can also be used as
# drop-in replacement for crypto/aes/asm/aes-586.pl [see below for
# details].
#
# Performance.
#
# To start with see corresponding paragraph in aesni-x86_64.pl...
# Instead of filling table similar to one found there I've chosen to
# summarize *comparison* results for raw ECB, CTR and CBC benchmarks.
# The simplified table below represents 32-bit performance relative
# to 64-bit one in every given point. Ratios vary for different
# encryption modes, therefore interval values.
#
# 16-byte 64-byte 256-byte 1-KB 8-KB
# 53-67% 67-84% 91-94% 95-98% 97-99.5%
#
# Lower ratios for smaller block sizes are perfectly understandable,
# because function call overhead is higher in 32-bit mode. Largest
# 8-KB block performance is virtually same: 32-bit code is less than
# 1% slower for ECB, CBC and CCM, and ~3% slower otherwise.
# January 2011
#
# See aesni-x86_64.pl for details. Unlike x86_64 version this module
# interleaves at most 6 aes[enc|dec] instructions, because there are
# not enough registers for 8x interleave [which should be optimal for
# Sandy Bridge]. Actually, performance results for 6x interleave
# factor presented in aesni-x86_64.pl (except for CTR) are for this
# module.
# April 2011
#
# Add aesni_xts_[en|de]crypt. Westmere spends 1.50 cycles processing
# one byte out of 8KB with 128-bit key, Sandy Bridge - 1.09.
######################################################################
# Current large-block performance in cycles per byte processed with
# 128-bit key (less is better).
#
# CBC en-/decrypt CTR XTS ECB
# Westmere 3.77/1.37 1.37 1.52 1.27
# * Bridge 5.07/0.98 0.99 1.09 0.91
# Haswell 4.44/0.80 0.97 1.03 0.72
# Skylake 2.68/0.65 0.65 0.66 0.64
# Silvermont 5.77/3.56 3.67 4.03 3.46
# Goldmont 3.84/1.39 1.39 1.63 1.31
# Bulldozer 5.80/0.98 1.05 1.24 0.93
$PREFIX="aesni"; # if $PREFIX is set to "AES", the script
# generates drop-in replacement for
# crypto/aes/asm/aes-586.pl:-)
$inline=1; # inline _aesni_[en|de]crypt
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
push(@INC,"${dir}","${dir}../../perlasm");
require "x86asm.pl";
$output = pop;
open OUT,">$output";
*STDOUT=*OUT;
&asm_init($ARGV[0],$0);
&external_label("OPENSSL_ia32cap_P");
&static_label("key_const");
if ($PREFIX eq "aesni") { $movekey=\&movups; }
else { $movekey=\&movups; }
$len="eax";
$rounds="ecx";
$key="edx";
$inp="esi";
$out="edi";
$rounds_="ebx"; # backup copy for $rounds
$key_="ebp"; # backup copy for $key
$rndkey0="xmm0";
$rndkey1="xmm1";
$inout0="xmm2";
$inout1="xmm3";
$inout2="xmm4";
$inout3="xmm5"; $in1="xmm5";
$inout4="xmm6"; $in0="xmm6";
$inout5="xmm7"; $ivec="xmm7";
# AESNI extension
sub aeskeygenassist
{ my($dst,$src,$imm)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x3a,0xdf,0xc0|($1<<3)|$2,$imm); }
}
sub aescommon
{ my($opcodelet,$dst,$src)=@_;
if ("$dst:$src" =~ /xmm([0-7]):xmm([0-7])/)
{ &data_byte(0x66,0x0f,0x38,$opcodelet,0xc0|($1<<3)|$2);}
}
sub aesimc { aescommon(0xdb,@_); }
sub aesenc { aescommon(0xdc,@_); }
sub aesenclast { aescommon(0xdd,@_); }
sub aesdec { aescommon(0xde,@_); }
sub aesdeclast { aescommon(0xdf,@_); }
# Inline version of internal aesni_[en|de]crypt1
{ my $sn;
sub aesni_inline_generate1
{ my ($p,$inout,$ivec)=@_; $inout=$inout0 if (!defined($inout));
$sn++;
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($ivec,$rndkey0) if (defined($ivec));
&lea ($key,&DWP(32,$key));
&xorps ($inout,$ivec) if (defined($ivec));
&xorps ($inout,$rndkey0) if (!defined($ivec));
&set_label("${p}1_loop_$sn");
eval"&aes${p} ($inout,$rndkey1)";
&dec ($rounds);
&$movekey ($rndkey1,&QWP(0,$key));
&lea ($key,&DWP(16,$key));
&jnz (&label("${p}1_loop_$sn"));
eval"&aes${p}last ($inout,$rndkey1)";
}}
sub aesni_generate1 # fully unrolled loop
{ my ($p,$inout)=@_; $inout=$inout0 if (!defined($inout));
&function_begin_B("_aesni_${p}rypt1");
&movups ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(0x10,$key));
&xorps ($inout,$rndkey0);
&$movekey ($rndkey0,&QWP(0x20,$key));
&lea ($key,&DWP(0x30,$key));
&cmp ($rounds,11);
&jb (&label("${p}128"));
&lea ($key,&DWP(0x20,$key));
&je (&label("${p}192"));
&lea ($key,&DWP(0x20,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x30,$key));
&set_label("${p}192");
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(-0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(-0x10,$key));
&set_label("${p}128");
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x10,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x20,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x30,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x40,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x50,$key));
eval"&aes${p} ($inout,$rndkey1)";
&$movekey ($rndkey1,&QWP(0x60,$key));
eval"&aes${p} ($inout,$rndkey0)";
&$movekey ($rndkey0,&QWP(0x70,$key));
eval"&aes${p} ($inout,$rndkey1)";
eval"&aes${p}last ($inout,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt1");
}
# void $PREFIX_encrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("enc") if (!$inline);
&function_begin_B("${PREFIX}_encrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
&movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&pxor ($rndkey0,$rndkey0); # clear register bank
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
&pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_encrypt");
# void $PREFIX_decrypt (const void *inp,void *out,const AES_KEY *key);
&aesni_generate1("dec") if(!$inline);
&function_begin_B("${PREFIX}_decrypt");
&mov ("eax",&wparam(0));
&mov ($key,&wparam(2));
&movups ($inout0,&QWP(0,"eax"));
&mov ($rounds,&DWP(240,$key));
&mov ("eax",&wparam(1));
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&pxor ($rndkey0,$rndkey0); # clear register bank
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,"eax"),$inout0);
&pxor ($inout0,$inout0);
&ret ();
&function_end_B("${PREFIX}_decrypt");
# _aesni_[en|de]cryptN are private interfaces, N denotes interleave
# factor. Why 3x subroutine were originally used in loops? Even though
# aes[enc|dec] latency was originally 6, it could be scheduled only
# every *2nd* cycle. Thus 3x interleave was the one providing optimal
# utilization, i.e. when subroutine's throughput is virtually same as
# of non-interleaved subroutine [for number of input blocks up to 3].
# This is why it originally made no sense to implement 2x subroutine.
# But times change and it became appropriate to spend extra 192 bytes
# on 2x subroutine on Atom Silvermont account. For processors that
# can schedule aes[enc|dec] every cycle optimal interleave factor
# equals to corresponding instructions latency. 8x is optimal for
# * Bridge, but it's unfeasible to accommodate such implementation
# in XMM registers addreassable in 32-bit mode and therefore maximum
# of 6x is used instead...
sub aesni_generate2
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt2");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}2_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}2_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt2");
}
sub aesni_generate3
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt3");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&add ($rounds,16);
&set_label("${p}3_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}3_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt3");
}
# 4x interleave is implemented to improve small block performance,
# most notably [and naturally] 4 block by ~30%. One can argue that one
# should have implemented 5x as well, but improvement would be <20%,
# so it's not worth it...
sub aesni_generate4
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt4");
&$movekey ($rndkey0,&QWP(0,$key));
&$movekey ($rndkey1,&QWP(16,$key));
&shl ($rounds,4);
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0);
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key));
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
&data_byte (0x0f,0x1f,0x40,0x00);
&add ($rounds,16);
&set_label("${p}4_loop");
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}4_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt4");
}
sub aesni_generate6
{ my $p=shift;
&function_begin_B("_aesni_${p}rypt6");
&static_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey0,&QWP(0,$key));
&shl ($rounds,4);
&$movekey ($rndkey1,&QWP(16,$key));
&xorps ($inout0,$rndkey0);
&pxor ($inout1,$rndkey0); # pxor does better here
&pxor ($inout2,$rndkey0);
eval"&aes${p} ($inout0,$rndkey1)";
&pxor ($inout3,$rndkey0);
&pxor ($inout4,$rndkey0);
eval"&aes${p} ($inout1,$rndkey1)";
&lea ($key,&DWP(32,$key,$rounds));
&neg ($rounds);
eval"&aes${p} ($inout2,$rndkey1)";
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey0,&QWP(0,$key,$rounds));
&add ($rounds,16);
&jmp (&label("_aesni_${p}rypt6_inner"));
&set_label("${p}6_loop",16);
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
&set_label("_aesni_${p}rypt6_inner");
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
&set_label("_aesni_${p}rypt6_enter");
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
eval"&aes${p} ($inout0,$rndkey0)";
eval"&aes${p} ($inout1,$rndkey0)";
eval"&aes${p} ($inout2,$rndkey0)";
eval"&aes${p} ($inout3,$rndkey0)";
eval"&aes${p} ($inout4,$rndkey0)";
eval"&aes${p} ($inout5,$rndkey0)";
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("${p}6_loop"));
eval"&aes${p} ($inout0,$rndkey1)";
eval"&aes${p} ($inout1,$rndkey1)";
eval"&aes${p} ($inout2,$rndkey1)";
eval"&aes${p} ($inout3,$rndkey1)";
eval"&aes${p} ($inout4,$rndkey1)";
eval"&aes${p} ($inout5,$rndkey1)";
eval"&aes${p}last ($inout0,$rndkey0)";
eval"&aes${p}last ($inout1,$rndkey0)";
eval"&aes${p}last ($inout2,$rndkey0)";
eval"&aes${p}last ($inout3,$rndkey0)";
eval"&aes${p}last ($inout4,$rndkey0)";
eval"&aes${p}last ($inout5,$rndkey0)";
&ret();
&function_end_B("_aesni_${p}rypt6");
}
&aesni_generate2("enc") if ($PREFIX eq "aesni");
&aesni_generate2("dec");
&aesni_generate3("enc") if ($PREFIX eq "aesni");
&aesni_generate3("dec");
&aesni_generate4("enc") if ($PREFIX eq "aesni");
&aesni_generate4("dec");
&aesni_generate6("enc") if ($PREFIX eq "aesni");
&aesni_generate6("dec");
if ($PREFIX eq "aesni") {
######################################################################
# void aesni_ecb_encrypt (const void *in, void *out,
# size_t length, const AES_KEY *key,
# int enc);
&function_begin("aesni_ecb_encrypt");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&and ($len,-16);
&jz (&label("ecb_ret"));
&mov ($rounds,&DWP(240,$key));
&test ($rounds_,$rounds_);
&jz (&label("ecb_decrypt"));
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&cmp ($len,0x60);
&jb (&label("ecb_enc_tail"));
&movdqu ($inout0,&QWP(0,$inp));
&movdqu ($inout1,&QWP(0x10,$inp));
&movdqu ($inout2,&QWP(0x20,$inp));
&movdqu ($inout3,&QWP(0x30,$inp));
&movdqu ($inout4,&QWP(0x40,$inp));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&sub ($len,0x60);
&jmp (&label("ecb_enc_loop6_enter"));
&set_label("ecb_enc_loop6",16);
&movups (&QWP(0,$out),$inout0);
&movdqu ($inout0,&QWP(0,$inp));
&movups (&QWP(0x10,$out),$inout1);
&movdqu ($inout1,&QWP(0x10,$inp));
&movups (&QWP(0x20,$out),$inout2);
&movdqu ($inout2,&QWP(0x20,$inp));
&movups (&QWP(0x30,$out),$inout3);
&movdqu ($inout3,&QWP(0x30,$inp));
&movups (&QWP(0x40,$out),$inout4);
&movdqu ($inout4,&QWP(0x40,$inp));
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&set_label("ecb_enc_loop6_enter");
&call ("_aesni_encrypt6");
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&sub ($len,0x60);
&jnc (&label("ecb_enc_loop6"));
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&add ($len,0x60);
&jz (&label("ecb_ret"));
&set_label("ecb_enc_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x20);
&jb (&label("ecb_enc_one"));
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_enc_two"));
&movups ($inout2,&QWP(0x20,$inp));
&cmp ($len,0x40);
&jb (&label("ecb_enc_three"));
&movups ($inout3,&QWP(0x30,$inp));
&je (&label("ecb_enc_four"));
&movups ($inout4,&QWP(0x40,$inp));
&xorps ($inout5,$inout5);
&call ("_aesni_encrypt6");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
jmp (&label("ecb_ret"));
&set_label("ecb_enc_one",16);
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&movups (&QWP(0,$out),$inout0);
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_two",16);
&call ("_aesni_encrypt2");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_three",16);
&call ("_aesni_encrypt3");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ecb_ret"));
&set_label("ecb_enc_four",16);
&call ("_aesni_encrypt4");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&jmp (&label("ecb_ret"));
######################################################################
&set_label("ecb_decrypt",16);
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&cmp ($len,0x60);
&jb (&label("ecb_dec_tail"));
&movdqu ($inout0,&QWP(0,$inp));
&movdqu ($inout1,&QWP(0x10,$inp));
&movdqu ($inout2,&QWP(0x20,$inp));
&movdqu ($inout3,&QWP(0x30,$inp));
&movdqu ($inout4,&QWP(0x40,$inp));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&sub ($len,0x60);
&jmp (&label("ecb_dec_loop6_enter"));
&set_label("ecb_dec_loop6",16);
&movups (&QWP(0,$out),$inout0);
&movdqu ($inout0,&QWP(0,$inp));
&movups (&QWP(0x10,$out),$inout1);
&movdqu ($inout1,&QWP(0x10,$inp));
&movups (&QWP(0x20,$out),$inout2);
&movdqu ($inout2,&QWP(0x20,$inp));
&movups (&QWP(0x30,$out),$inout3);
&movdqu ($inout3,&QWP(0x30,$inp));
&movups (&QWP(0x40,$out),$inout4);
&movdqu ($inout4,&QWP(0x40,$inp));
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&movdqu ($inout5,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&set_label("ecb_dec_loop6_enter");
&call ("_aesni_decrypt6");
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&sub ($len,0x60);
&jnc (&label("ecb_dec_loop6"));
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&add ($len,0x60);
&jz (&label("ecb_ret"));
&set_label("ecb_dec_tail");
&movups ($inout0,&QWP(0,$inp));
&cmp ($len,0x20);
&jb (&label("ecb_dec_one"));
&movups ($inout1,&QWP(0x10,$inp));
&je (&label("ecb_dec_two"));
&movups ($inout2,&QWP(0x20,$inp));
&cmp ($len,0x40);
&jb (&label("ecb_dec_three"));
&movups ($inout3,&QWP(0x30,$inp));
&je (&label("ecb_dec_four"));
&movups ($inout4,&QWP(0x40,$inp));
&xorps ($inout5,$inout5);
&call ("_aesni_decrypt6");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_one",16);
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&movups (&QWP(0,$out),$inout0);
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_two",16);
&call ("_aesni_decrypt2");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_three",16);
&call ("_aesni_decrypt3");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ecb_ret"));
&set_label("ecb_dec_four",16);
&call ("_aesni_decrypt4");
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&set_label("ecb_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&function_end("aesni_ecb_encrypt");
######################################################################
# void aesni_ccm64_[en|de]crypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec,char *cmac);
#
# Handles only complete blocks, operates on 64-bit counter and
# does not update *ivec! Nor does it finalize CMAC value
# (see engine/eng_aesni.c for details)
#
{ my $cmac=$inout1;
&function_begin("aesni_ccm64_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($rounds,&wparam(5));
&mov ($key_,"esp");
&sub ("esp",60);
&and ("esp",-16); # align stack
&mov (&DWP(48,"esp"),$key_);
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac
&mov ($rounds,&DWP(240,$key));
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
&mov (&DWP(4,"esp"),0x08090a0b);
&mov (&DWP(8,"esp"),0x04050607);
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
&mov ($rounds_,1);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds_);
&mov (&DWP(20,"esp"),$key_);
&mov (&DWP(24,"esp"),$key_);
&mov (&DWP(28,"esp"),$key_);
&shl ($rounds,4);
&mov ($rounds_,16);
&lea ($key_,&DWP(0,$key));
&movdqa ($inout3,&QWP(0,"esp"));
&movdqa ($inout0,$ivec);
&lea ($key,&DWP(32,$key,$rounds));
&sub ($rounds_,$rounds);
&pshufb ($ivec,$inout3);
&set_label("ccm64_enc_outer");
&$movekey ($rndkey0,&QWP(0,$key_));
&mov ($rounds,$rounds_);
&movups ($in0,&QWP(0,$inp));
&xorps ($inout0,$rndkey0);
&$movekey ($rndkey1,&QWP(16,$key_));
&xorps ($rndkey0,$in0);
&xorps ($cmac,$rndkey0); # cmac^=inp
&$movekey ($rndkey0,&QWP(32,$key_));
&set_label("ccm64_enc2_loop");
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
&aesenc ($inout0,$rndkey0);
&aesenc ($cmac,$rndkey0);
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("ccm64_enc2_loop"));
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
&paddq ($ivec,&QWP(16,"esp"));
&dec ($len);
&aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0);
&lea ($inp,&DWP(16,$inp));
&xorps ($in0,$inout0); # inp^=E(ivec)
&movdqa ($inout0,$ivec);
&movups (&QWP(0,$out),$in0); # save output
&pshufb ($inout0,$inout3);
&lea ($out,&DWP(16,$out));
&jnz (&label("ccm64_enc_outer"));
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_encrypt_blocks");
&function_begin("aesni_ccm64_decrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($rounds,&wparam(5));
&mov ($key_,"esp");
&sub ("esp",60);
&and ("esp",-16); # align stack
&mov (&DWP(48,"esp"),$key_);
&movdqu ($ivec,&QWP(0,$rounds_)); # load ivec
&movdqu ($cmac,&QWP(0,$rounds)); # load cmac
&mov ($rounds,&DWP(240,$key));
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
&mov (&DWP(4,"esp"),0x08090a0b);
&mov (&DWP(8,"esp"),0x04050607);
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
&mov ($rounds_,1);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds_);
&mov (&DWP(20,"esp"),$key_);
&mov (&DWP(24,"esp"),$key_);
&mov (&DWP(28,"esp"),$key_);
&movdqa ($inout3,&QWP(0,"esp")); # bswap mask
&movdqa ($inout0,$ivec);
&mov ($key_,$key);
&mov ($rounds_,$rounds);
&pshufb ($ivec,$inout3);
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&shl ($rounds_,4);
&mov ($rounds,16);
&movups ($in0,&QWP(0,$inp)); # load inp
&paddq ($ivec,&QWP(16,"esp"));
&lea ($inp,&QWP(16,$inp));
&sub ($rounds,$rounds_);
&lea ($key,&DWP(32,$key_,$rounds_));
&mov ($rounds_,$rounds);
&jmp (&label("ccm64_dec_outer"));
&set_label("ccm64_dec_outer",16);
&xorps ($in0,$inout0); # inp ^= E(ivec)
&movdqa ($inout0,$ivec);
&movups (&QWP(0,$out),$in0); # save output
&lea ($out,&DWP(16,$out));
&pshufb ($inout0,$inout3);
&sub ($len,1);
&jz (&label("ccm64_dec_break"));
&$movekey ($rndkey0,&QWP(0,$key_));
&mov ($rounds,$rounds_);
&$movekey ($rndkey1,&QWP(16,$key_));
&xorps ($in0,$rndkey0);
&xorps ($inout0,$rndkey0);
&xorps ($cmac,$in0); # cmac^=out
&$movekey ($rndkey0,&QWP(32,$key_));
&set_label("ccm64_dec2_loop");
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
&$movekey ($rndkey1,&QWP(0,$key,$rounds));
&add ($rounds,32);
&aesenc ($inout0,$rndkey0);
&aesenc ($cmac,$rndkey0);
&$movekey ($rndkey0,&QWP(-16,$key,$rounds));
&jnz (&label("ccm64_dec2_loop"));
&movups ($in0,&QWP(0,$inp)); # load inp
&paddq ($ivec,&QWP(16,"esp"));
&aesenc ($inout0,$rndkey1);
&aesenc ($cmac,$rndkey1);
&aesenclast ($inout0,$rndkey0);
&aesenclast ($cmac,$rndkey0);
&lea ($inp,&QWP(16,$inp));
&jmp (&label("ccm64_dec_outer"));
&set_label("ccm64_dec_break",16);
&mov ($rounds,&DWP(240,$key_));
&mov ($key,$key_);
if ($inline)
{ &aesni_inline_generate1("enc",$cmac,$in0); }
else
{ &call ("_aesni_encrypt1",$cmac); }
&mov ("esp",&DWP(48,"esp"));
&mov ($out,&wparam(5));
&movups (&QWP(0,$out),$cmac);
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&pxor ("xmm6","xmm6");
&pxor ("xmm7","xmm7");
&function_end("aesni_ccm64_decrypt_blocks");
}
######################################################################
# void aesni_ctr32_encrypt_blocks (const void *in, void *out,
# size_t blocks, const AES_KEY *key,
# const char *ivec);
#
# Handles only complete blocks, operates on 32-bit counter and
# does not update *ivec! (see crypto/modes/ctr128.c for details)
#
# stack layout:
# 0 pshufb mask
# 16 vector addend: 0,6,6,6
# 32 counter-less ivec
# 48 1st triplet of counter vector
# 64 2nd triplet of counter vector
# 80 saved %esp
&function_begin("aesni_ctr32_encrypt_blocks");
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3));
&mov ($rounds_,&wparam(4));
&mov ($key_,"esp");
&sub ("esp",88);
&and ("esp",-16); # align stack
&mov (&DWP(80,"esp"),$key_);
&cmp ($len,1);
&je (&label("ctr32_one_shortcut"));
&movdqu ($inout5,&QWP(0,$rounds_)); # load ivec
# compose byte-swap control mask for pshufb on stack
&mov (&DWP(0,"esp"),0x0c0d0e0f);
&mov (&DWP(4,"esp"),0x08090a0b);
&mov (&DWP(8,"esp"),0x04050607);
&mov (&DWP(12,"esp"),0x00010203);
# compose counter increment vector on stack
&mov ($rounds,6);
&xor ($key_,$key_);
&mov (&DWP(16,"esp"),$rounds);
&mov (&DWP(20,"esp"),$rounds);
&mov (&DWP(24,"esp"),$rounds);
&mov (&DWP(28,"esp"),$key_);
&pextrd ($rounds_,$inout5,3); # pull 32-bit counter
&pinsrd ($inout5,$key_,3); # wipe 32-bit counter
&mov ($rounds,&DWP(240,$key)); # key->rounds
# compose 2 vectors of 3x32-bit counters
&bswap ($rounds_);
&pxor ($rndkey0,$rndkey0);
&pxor ($rndkey1,$rndkey1);
&movdqa ($inout0,&QWP(0,"esp")); # load byte-swap mask
&pinsrd ($rndkey0,$rounds_,0);
&lea ($key_,&DWP(3,$rounds_));
&pinsrd ($rndkey1,$key_,0);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,1);
&inc ($key_);
&pinsrd ($rndkey1,$key_,1);
&inc ($rounds_);
&pinsrd ($rndkey0,$rounds_,2);
&inc ($key_);
&pinsrd ($rndkey1,$key_,2);
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&movdqu ($inout4,&QWP(0,$key)); # key[0]
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&pshufd ($inout0,$rndkey0,3<<6); # place counter to upper dword
&pshufd ($inout1,$rndkey0,2<<6);
&cmp ($len,6);
&jb (&label("ctr32_tail"));
&pxor ($inout5,$inout4); # counter-less ivec^key[0]
&shl ($rounds,4);
&mov ($rounds_,16);
&movdqa (&QWP(32,"esp"),$inout5); # save counter-less ivec^key[0]
&mov ($key_,$key); # backup $key
&sub ($rounds_,$rounds); # backup twisted $rounds
&lea ($key,&DWP(32,$key,$rounds));
&sub ($len,6);
&jmp (&label("ctr32_loop6"));
&set_label("ctr32_loop6",16);
# inlining _aesni_encrypt6's prologue gives ~6% improvement...
&pshufd ($inout2,$rndkey0,1<<6);
&movdqa ($rndkey0,&QWP(32,"esp")); # pull counter-less ivec
&pshufd ($inout3,$rndkey1,3<<6);
&pxor ($inout0,$rndkey0); # merge counter-less ivec
&pshufd ($inout4,$rndkey1,2<<6);
&pxor ($inout1,$rndkey0);
&pshufd ($inout5,$rndkey1,1<<6);
&$movekey ($rndkey1,&QWP(16,$key_));
&pxor ($inout2,$rndkey0);
&pxor ($inout3,$rndkey0);
&aesenc ($inout0,$rndkey1);
&pxor ($inout4,$rndkey0);
&pxor ($inout5,$rndkey0);
&aesenc ($inout1,$rndkey1);
&$movekey ($rndkey0,&QWP(32,$key_));
&mov ($rounds,$rounds_);
&aesenc ($inout2,$rndkey1);
&aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movdqa ($rndkey0,&QWP(16,"esp")); # load increment
&xorps ($inout2,$rndkey1);
&movdqa ($rndkey1,&QWP(64,"esp")); # load 2nd triplet
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&paddd ($rndkey1,$rndkey0); # 2nd triplet increment
&paddd ($rndkey0,&QWP(48,"esp")); # 1st triplet increment
&movdqa ($inout0,&QWP(0,"esp")); # load byte swap mask
&movups ($inout1,&QWP(0x30,$inp));
&movups ($inout2,&QWP(0x40,$inp));
&xorps ($inout3,$inout1);
&movups ($inout1,&QWP(0x50,$inp));
&lea ($inp,&DWP(0x60,$inp));
&movdqa (&QWP(48,"esp"),$rndkey0); # save 1st triplet
&pshufb ($rndkey0,$inout0); # byte swap
&xorps ($inout4,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&xorps ($inout5,$inout1);
&movdqa (&QWP(64,"esp"),$rndkey1); # save 2nd triplet
&pshufb ($rndkey1,$inout0); # byte swap
&movups (&QWP(0x40,$out),$inout4);
&pshufd ($inout0,$rndkey0,3<<6);
&movups (&QWP(0x50,$out),$inout5);
&lea ($out,&DWP(0x60,$out));
&pshufd ($inout1,$rndkey0,2<<6);
&sub ($len,6);
&jnc (&label("ctr32_loop6"));
&add ($len,6);
&jz (&label("ctr32_ret"));
&movdqu ($inout5,&QWP(0,$key_));
&mov ($key,$key_);
&pxor ($inout5,&QWP(32,"esp")); # restore count-less ivec
&mov ($rounds,&DWP(240,$key_)); # restore $rounds
&set_label("ctr32_tail");
&por ($inout0,$inout5);
&cmp ($len,2);
&jb (&label("ctr32_one"));
&pshufd ($inout2,$rndkey0,1<<6);
&por ($inout1,$inout5);
&je (&label("ctr32_two"));
&pshufd ($inout3,$rndkey1,3<<6);
&por ($inout2,$inout5);
&cmp ($len,4);
&jb (&label("ctr32_three"));
&pshufd ($inout4,$rndkey1,2<<6);
&por ($inout3,$inout5);
&je (&label("ctr32_four"));
&por ($inout4,$inout5);
&call ("_aesni_encrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout1,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout2,$rndkey1);
&movups ($rndkey1,&QWP(0x40,$inp));
&xorps ($inout3,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout4,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&movups (&QWP(0x40,$out),$inout4);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_one_shortcut",16);
&movups ($inout0,&QWP(0,$rounds_)); # load ivec
&mov ($rounds,&DWP(240,$key));
&set_label("ctr32_one");
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&movups ($in0,&QWP(0,$inp));
&xorps ($in0,$inout0);
&movups (&QWP(0,$out),$in0);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_two",16);
&call ("_aesni_encrypt2");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_three",16);
&call ("_aesni_encrypt3");
&movups ($inout3,&QWP(0,$inp));
&movups ($inout4,&QWP(0x10,$inp));
&xorps ($inout0,$inout3);
&movups ($inout5,&QWP(0x20,$inp));
&xorps ($inout1,$inout4);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$inout5);
&movups (&QWP(0x10,$out),$inout1);
&movups (&QWP(0x20,$out),$inout2);
&jmp (&label("ctr32_ret"));
&set_label("ctr32_four",16);
&call ("_aesni_encrypt4");
&movups ($inout4,&QWP(0,$inp));
&movups ($inout5,&QWP(0x10,$inp));
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout0,$inout4);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout1,$inout5);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&movups (&QWP(0x30,$out),$inout3);
&set_label("ctr32_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(32,"esp"),"xmm0"); # clear stack
&pxor ("xmm5","xmm5");
&movdqa (&QWP(48,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(64,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&mov ("esp",&DWP(80,"esp"));
&function_end("aesni_ctr32_encrypt_blocks");
######################################################################
# void aesni_xts_[en|de]crypt(const char *inp,char *out,size_t len,
# const AES_KEY *key1, const AES_KEY *key2
# const unsigned char iv[16]);
#
{ my ($tweak,$twtmp,$twres,$twmask)=($rndkey1,$rndkey0,$inout0,$inout1);
&function_begin("aesni_xts_encrypt");
&mov ($key,&wparam(4)); # key2
&mov ($inp,&wparam(5)); # clear-text tweak
&mov ($rounds,&DWP(240,$key)); # key2->rounds
&movups ($inout0,&QWP(0,$inp));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3)); # key1
&mov ($key_,"esp");
&sub ("esp",16*7+8);
&mov ($rounds,&DWP(240,$key)); # key1->rounds
&and ("esp",-16); # align stack
&mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
&mov (&DWP(16*6+4,"esp"),0);
&mov (&DWP(16*6+8,"esp"),1);
&mov (&DWP(16*6+12,"esp"),0);
&mov (&DWP(16*7+0,"esp"),$len); # save original $len
&mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
&movdqa ($tweak,$inout0);
&pxor ($twtmp,$twtmp);
&movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&and ($len,-16);
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&sub ($len,16*6);
&jc (&label("xts_enc_short"));
&shl ($rounds,4);
&mov ($rounds_,16);
&sub ($rounds_,$rounds);
&lea ($key,&DWP(32,$key,$rounds));
&jmp (&label("xts_enc_loop6"));
&set_label("xts_enc_loop6",16);
for ($i=0;$i<4;$i++) {
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa (&QWP(16*$i,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
}
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*$i++,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&$movekey ($rndkey0,&QWP(0,$key_));
&pand ($inout5,$twmask); # isolate carry and residue
&movups ($inout0,&QWP(0,$inp)); # load input
&pxor ($inout5,$tweak);
# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
&mov ($rounds,$rounds_); # restore $rounds
&movdqu ($inout1,&QWP(16*1,$inp));
&xorps ($inout0,$rndkey0); # input^=rndkey[0]
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout1,$rndkey0);
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout2,$rndkey0);
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout3,$rndkey0);
&movdqu ($rndkey1,&QWP(16*5,$inp));
&pxor ($inout4,$rndkey0);
&lea ($inp,&DWP(16*6,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
&pxor ($inout5,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key_));
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&aesenc ($inout0,$rndkey1);
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&aesenc ($inout1,$rndkey1);
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key_));
&aesenc ($inout2,$rndkey1);
&aesenc ($inout3,$rndkey1);
&aesenc ($inout4,$rndkey1);
&aesenc ($inout5,$rndkey1);
&call (&label("_aesni_encrypt6_enter"));
&movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
&pxor ($twtmp,$twtmp);
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&xorps ($inout1,&QWP(16*1,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*2,$out),$inout2);
&xorps ($inout4,&QWP(16*4,"esp"));
&movups (&QWP(16*3,$out),$inout3);
&xorps ($inout5,$tweak);
&movups (&QWP(16*4,$out),$inout4);
&pshufd ($twres,$twtmp,0x13);
&movups (&QWP(16*5,$out),$inout5);
&lea ($out,&DWP(16*6,$out));
&movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&sub ($len,16*6);
&jnc (&label("xts_enc_loop6"));
&mov ($rounds,&DWP(240,$key_)); # restore $rounds
&mov ($key,$key_); # restore $key
&mov ($rounds_,$rounds);
&set_label("xts_enc_short");
&add ($len,16*6);
&jz (&label("xts_enc_done6x"));
&movdqa ($inout3,$tweak); # put aside previous tweak
&cmp ($len,0x20);
&jb (&label("xts_enc_one"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&je (&label("xts_enc_two"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout4,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&cmp ($len,0x40);
&jb (&label("xts_enc_three"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout5,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&movdqa (&QWP(16*0,"esp"),$inout3);
&movdqa (&QWP(16*1,"esp"),$inout4);
&je (&label("xts_enc_four"));
&movdqa (&QWP(16*2,"esp"),$inout5);
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*3,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($inout0,1);
&pand ($inout5,$twmask); # isolate carry and residue
&pxor ($inout5,$tweak);
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout1,&QWP(16*1,"esp"));
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout2,&QWP(16*2,"esp"));
&lea ($inp,&DWP(16*5,$inp));
&pxor ($inout3,&QWP(16*3,"esp"));
&movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
&pxor ($inout4,$inout5);
&call ("_aesni_encrypt6");
&movaps ($tweak,&QWP(16*4,"esp")); # last tweak
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout4,$tweak);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&movups (&QWP(16*4,$out),$inout4);
&lea ($out,&DWP(16*5,$out));
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_one",16);
&movups ($inout0,&QWP(16*0,$inp)); # load input
&lea ($inp,&DWP(16*1,$inp));
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(16*0,$out),$inout0); # write output
&lea ($out,&DWP(16*1,$out));
&movdqa ($tweak,$inout3); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_two",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&lea ($inp,&DWP(16*2,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&call ("_aesni_encrypt2");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&lea ($out,&DWP(16*2,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_three",16);
&movaps ($inout5,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&lea ($inp,&DWP(16*3,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&call ("_aesni_encrypt3");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&lea ($out,&DWP(16*3,$out));
&movdqa ($tweak,$inout5); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_four",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movups ($inout3,&QWP(16*3,$inp));
&lea ($inp,&DWP(16*4,$inp));
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&xorps ($inout3,$inout4);
&call ("_aesni_encrypt4");
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,$inout4);
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&lea ($out,&DWP(16*4,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_enc_done"));
&set_label("xts_enc_done6x",16); # $tweak is pre-calculated
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&and ($len,15);
&jz (&label("xts_enc_ret"));
&movdqa ($inout3,$tweak);
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&jmp (&label("xts_enc_steal"));
&set_label("xts_enc_done",16);
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&pxor ($twtmp,$twtmp);
&and ($len,15);
&jz (&label("xts_enc_ret"));
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&pshufd ($inout3,$twtmp,0x13);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($inout3,&QWP(16*6,"esp")); # isolate carry and residue
&pxor ($inout3,$tweak);
&set_label("xts_enc_steal");
&movz ($rounds,&BP(0,$inp));
&movz ($key,&BP(-16,$out));
&lea ($inp,&DWP(1,$inp));
&mov (&BP(-16,$out),&LB($rounds));
&mov (&BP(0,$out),&LB($key));
&lea ($out,&DWP(1,$out));
&sub ($len,1);
&jnz (&label("xts_enc_steal"));
&sub ($out,&DWP(16*7+0,"esp")); # rewind $out
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&movups ($inout0,&QWP(-16,$out)); # load input
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(-16,$out),$inout0); # write output
&set_label("xts_enc_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
&pxor ("xmm3","xmm3");
&movdqa (&QWP(16*1,"esp"),"xmm0");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(16*2,"esp"),"xmm0");
&pxor ("xmm5","xmm5");
&movdqa (&QWP(16*3,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(16*4,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_encrypt");
&function_begin("aesni_xts_decrypt");
&mov ($key,&wparam(4)); # key2
&mov ($inp,&wparam(5)); # clear-text tweak
&mov ($rounds,&DWP(240,$key)); # key2->rounds
&movups ($inout0,&QWP(0,$inp));
if ($inline)
{ &aesni_inline_generate1("enc"); }
else
{ &call ("_aesni_encrypt1"); }
&mov ($inp,&wparam(0));
&mov ($out,&wparam(1));
&mov ($len,&wparam(2));
&mov ($key,&wparam(3)); # key1
&mov ($key_,"esp");
&sub ("esp",16*7+8);
&and ("esp",-16); # align stack
&xor ($rounds_,$rounds_); # if(len%16) len-=16;
&test ($len,15);
&setnz (&LB($rounds_));
&shl ($rounds_,4);
&sub ($len,$rounds_);
&mov (&DWP(16*6+0,"esp"),0x87); # compose the magic constant
&mov (&DWP(16*6+4,"esp"),0);
&mov (&DWP(16*6+8,"esp"),1);
&mov (&DWP(16*6+12,"esp"),0);
&mov (&DWP(16*7+0,"esp"),$len); # save original $len
&mov (&DWP(16*7+4,"esp"),$key_); # save original %esp
&mov ($rounds,&DWP(240,$key)); # key1->rounds
&mov ($key_,$key); # backup $key
&mov ($rounds_,$rounds); # backup $rounds
&movdqa ($tweak,$inout0);
&pxor ($twtmp,$twtmp);
&movdqa ($twmask,&QWP(6*16,"esp")); # 0x0...010...87
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&and ($len,-16);
&sub ($len,16*6);
&jc (&label("xts_dec_short"));
&shl ($rounds,4);
&mov ($rounds_,16);
&sub ($rounds_,$rounds);
&lea ($key,&DWP(32,$key,$rounds));
&jmp (&label("xts_dec_loop6"));
&set_label("xts_dec_loop6",16);
for ($i=0;$i<4;$i++) {
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa (&QWP(16*$i,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
}
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*$i++,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&$movekey ($rndkey0,&QWP(0,$key_));
&pand ($inout5,$twmask); # isolate carry and residue
&movups ($inout0,&QWP(0,$inp)); # load input
&pxor ($inout5,$tweak);
# inline _aesni_encrypt6 prologue and flip xor with tweak and key[0]
&mov ($rounds,$rounds_);
&movdqu ($inout1,&QWP(16*1,$inp));
&xorps ($inout0,$rndkey0); # input^=rndkey[0]
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout1,$rndkey0);
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout2,$rndkey0);
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout3,$rndkey0);
&movdqu ($rndkey1,&QWP(16*5,$inp));
&pxor ($inout4,$rndkey0);
&lea ($inp,&DWP(16*6,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqa (&QWP(16*$i,"esp"),$inout5); # save last tweak
&pxor ($inout5,$rndkey1);
&$movekey ($rndkey1,&QWP(16,$key_));
&pxor ($inout1,&QWP(16*1,"esp"));
&pxor ($inout2,&QWP(16*2,"esp"));
&aesdec ($inout0,$rndkey1);
&pxor ($inout3,&QWP(16*3,"esp"));
&pxor ($inout4,&QWP(16*4,"esp"));
&aesdec ($inout1,$rndkey1);
&pxor ($inout5,$rndkey0);
&$movekey ($rndkey0,&QWP(32,$key_));
&aesdec ($inout2,$rndkey1);
&aesdec ($inout3,$rndkey1);
&aesdec ($inout4,$rndkey1);
&aesdec ($inout5,$rndkey1);
&call (&label("_aesni_decrypt6_enter"));
&movdqa ($tweak,&QWP(16*5,"esp")); # last tweak
&pxor ($twtmp,$twtmp);
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&pcmpgtd ($twtmp,$tweak); # broadcast upper bits
&xorps ($inout1,&QWP(16*1,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*2,$out),$inout2);
&xorps ($inout4,&QWP(16*4,"esp"));
&movups (&QWP(16*3,$out),$inout3);
&xorps ($inout5,$tweak);
&movups (&QWP(16*4,$out),$inout4);
&pshufd ($twres,$twtmp,0x13);
&movups (&QWP(16*5,$out),$inout5);
&lea ($out,&DWP(16*6,$out));
&movdqa ($twmask,&QWP(16*6,"esp")); # 0x0...010...87
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&sub ($len,16*6);
&jnc (&label("xts_dec_loop6"));
&mov ($rounds,&DWP(240,$key_)); # restore $rounds
&mov ($key,$key_); # restore $key
&mov ($rounds_,$rounds);
&set_label("xts_dec_short");
&add ($len,16*6);
&jz (&label("xts_dec_done6x"));
&movdqa ($inout3,$tweak); # put aside previous tweak
&cmp ($len,0x20);
&jb (&label("xts_dec_one"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&je (&label("xts_dec_two"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout4,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&cmp ($len,0x40);
&jb (&label("xts_dec_three"));
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($inout5,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&movdqa (&QWP(16*0,"esp"),$inout3);
&movdqa (&QWP(16*1,"esp"),$inout4);
&je (&label("xts_dec_four"));
&movdqa (&QWP(16*2,"esp"),$inout5);
&pshufd ($inout5,$twtmp,0x13);
&movdqa (&QWP(16*3,"esp"),$tweak);
&paddq ($tweak,$tweak); # &psllq($inout0,1);
&pand ($inout5,$twmask); # isolate carry and residue
&pxor ($inout5,$tweak);
&movdqu ($inout0,&QWP(16*0,$inp)); # load input
&movdqu ($inout1,&QWP(16*1,$inp));
&movdqu ($inout2,&QWP(16*2,$inp));
&pxor ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movdqu ($inout3,&QWP(16*3,$inp));
&pxor ($inout1,&QWP(16*1,"esp"));
&movdqu ($inout4,&QWP(16*4,$inp));
&pxor ($inout2,&QWP(16*2,"esp"));
&lea ($inp,&DWP(16*5,$inp));
&pxor ($inout3,&QWP(16*3,"esp"));
&movdqa (&QWP(16*4,"esp"),$inout5); # save last tweak
&pxor ($inout4,$inout5);
&call ("_aesni_decrypt6");
&movaps ($tweak,&QWP(16*4,"esp")); # last tweak
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,&QWP(16*2,"esp"));
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,&QWP(16*3,"esp"));
&movups (&QWP(16*1,$out),$inout1);
&xorps ($inout4,$tweak);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&movups (&QWP(16*4,$out),$inout4);
&lea ($out,&DWP(16*5,$out));
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_one",16);
&movups ($inout0,&QWP(16*0,$inp)); # load input
&lea ($inp,&DWP(16*1,$inp));
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(16*0,$out),$inout0); # write output
&lea ($out,&DWP(16*1,$out));
&movdqa ($tweak,$inout3); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_two",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&lea ($inp,&DWP(16*2,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&call ("_aesni_decrypt2");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&lea ($out,&DWP(16*2,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_three",16);
&movaps ($inout5,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&lea ($inp,&DWP(16*3,$inp));
&xorps ($inout0,$inout3); # input^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&call ("_aesni_decrypt3");
&xorps ($inout0,$inout3); # output^=tweak
&xorps ($inout1,$inout4);
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&lea ($out,&DWP(16*3,$out));
&movdqa ($tweak,$inout5); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_four",16);
&movaps ($inout4,$tweak); # put aside last tweak
&movups ($inout0,&QWP(16*0,$inp)); # load input
&movups ($inout1,&QWP(16*1,$inp));
&movups ($inout2,&QWP(16*2,$inp));
&xorps ($inout0,&QWP(16*0,"esp")); # input^=tweak
&movups ($inout3,&QWP(16*3,$inp));
&lea ($inp,&DWP(16*4,$inp));
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&xorps ($inout3,$inout4);
&call ("_aesni_decrypt4");
&xorps ($inout0,&QWP(16*0,"esp")); # output^=tweak
&xorps ($inout1,&QWP(16*1,"esp"));
&xorps ($inout2,$inout5);
&movups (&QWP(16*0,$out),$inout0); # write output
&xorps ($inout3,$inout4);
&movups (&QWP(16*1,$out),$inout1);
&movups (&QWP(16*2,$out),$inout2);
&movups (&QWP(16*3,$out),$inout3);
&lea ($out,&DWP(16*4,$out));
&movdqa ($tweak,$inout4); # last tweak
&jmp (&label("xts_dec_done"));
&set_label("xts_dec_done6x",16); # $tweak is pre-calculated
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&and ($len,15);
&jz (&label("xts_dec_ret"));
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&jmp (&label("xts_dec_only_one_more"));
&set_label("xts_dec_done",16);
&mov ($len,&DWP(16*7+0,"esp")); # restore original $len
&pxor ($twtmp,$twtmp);
&and ($len,15);
&jz (&label("xts_dec_ret"));
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&mov (&DWP(16*7+0,"esp"),$len); # save $len%16
&pshufd ($twres,$twtmp,0x13);
&pxor ($twtmp,$twtmp);
&movdqa ($twmask,&QWP(16*6,"esp"));
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($twres,$twmask); # isolate carry and residue
&pcmpgtd($twtmp,$tweak); # broadcast upper bits
&pxor ($tweak,$twres);
&set_label("xts_dec_only_one_more");
&pshufd ($inout3,$twtmp,0x13);
&movdqa ($inout4,$tweak); # put aside previous tweak
&paddq ($tweak,$tweak); # &psllq($tweak,1);
&pand ($inout3,$twmask); # isolate carry and residue
&pxor ($inout3,$tweak);
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&movups ($inout0,&QWP(0,$inp)); # load input
&xorps ($inout0,$inout3); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout3); # output^=tweak
&movups (&QWP(0,$out),$inout0); # write output
&set_label("xts_dec_steal");
&movz ($rounds,&BP(16,$inp));
&movz ($key,&BP(0,$out));
&lea ($inp,&DWP(1,$inp));
&mov (&BP(0,$out),&LB($rounds));
&mov (&BP(16,$out),&LB($key));
&lea ($out,&DWP(1,$out));
&sub ($len,1);
&jnz (&label("xts_dec_steal"));
&sub ($out,&DWP(16*7+0,"esp")); # rewind $out
&mov ($key,$key_); # restore $key
&mov ($rounds,$rounds_); # restore $rounds
&movups ($inout0,&QWP(0,$out)); # load input
&xorps ($inout0,$inout4); # input^=tweak
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$inout4); # output^=tweak
&movups (&QWP(0,$out),$inout0); # write output
&set_label("xts_dec_ret");
&pxor ("xmm0","xmm0"); # clear register bank
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&movdqa (&QWP(16*0,"esp"),"xmm0"); # clear stack
&pxor ("xmm3","xmm3");
&movdqa (&QWP(16*1,"esp"),"xmm0");
&pxor ("xmm4","xmm4");
&movdqa (&QWP(16*2,"esp"),"xmm0");
&pxor ("xmm5","xmm5");
&movdqa (&QWP(16*3,"esp"),"xmm0");
&pxor ("xmm6","xmm6");
&movdqa (&QWP(16*4,"esp"),"xmm0");
&pxor ("xmm7","xmm7");
&movdqa (&QWP(16*5,"esp"),"xmm0");
&mov ("esp",&DWP(16*7+4,"esp")); # restore %esp
&function_end("aesni_xts_decrypt");
}
}
######################################################################
# void $PREFIX_cbc_encrypt (const void *inp, void *out,
# size_t length, const AES_KEY *key,
# unsigned char *ivp,const int enc);
&function_begin("${PREFIX}_cbc_encrypt");
&mov ($inp,&wparam(0));
&mov ($rounds_,"esp");
&mov ($out,&wparam(1));
&sub ($rounds_,24);
&mov ($len,&wparam(2));
&and ($rounds_,-16);
&mov ($key,&wparam(3));
&mov ($key_,&wparam(4));
&test ($len,$len);
&jz (&label("cbc_abort"));
&cmp (&wparam(5),0);
&xchg ($rounds_,"esp"); # alloca
&movups ($ivec,&QWP(0,$key_)); # load IV
&mov ($rounds,&DWP(240,$key));
&mov ($key_,$key); # backup $key
&mov (&DWP(16,"esp"),$rounds_); # save original %esp
&mov ($rounds_,$rounds); # backup $rounds
&je (&label("cbc_decrypt"));
&movaps ($inout0,$ivec);
&cmp ($len,16);
&jb (&label("cbc_enc_tail"));
&sub ($len,16);
&jmp (&label("cbc_enc_loop"));
&set_label("cbc_enc_loop",16);
&movups ($ivec,&QWP(0,$inp)); # input actually
&lea ($inp,&DWP(16,$inp));
if ($inline)
{ &aesni_inline_generate1("enc",$inout0,$ivec); }
else
{ &xorps($inout0,$ivec); &call("_aesni_encrypt1"); }
&mov ($rounds,$rounds_); # restore $rounds
&mov ($key,$key_); # restore $key
&movups (&QWP(0,$out),$inout0); # store output
&lea ($out,&DWP(16,$out));
&sub ($len,16);
&jnc (&label("cbc_enc_loop"));
&add ($len,16);
&jnz (&label("cbc_enc_tail"));
&movaps ($ivec,$inout0);
&pxor ($inout0,$inout0);
&jmp (&label("cbc_ret"));
&set_label("cbc_enc_tail");
&mov ("ecx",$len); # zaps $rounds
&data_word(0xA4F3F689); # rep movsb
&mov ("ecx",16); # zero tail
&sub ("ecx",$len);
&xor ("eax","eax"); # zaps $len
&data_word(0xAAF3F689); # rep stosb
&lea ($out,&DWP(-16,$out)); # rewind $out by 1 block
&mov ($rounds,$rounds_); # restore $rounds
&mov ($inp,$out); # $inp and $out are the same
&mov ($key,$key_); # restore $key
&jmp (&label("cbc_enc_loop"));
######################################################################
&set_label("cbc_decrypt",16);
&cmp ($len,0x50);
&jbe (&label("cbc_dec_tail"));
&movaps (&QWP(0,"esp"),$ivec); # save IV
&sub ($len,0x50);
&jmp (&label("cbc_dec_loop6_enter"));
&set_label("cbc_dec_loop6",16);
&movaps (&QWP(0,"esp"),$rndkey0); # save IV
&movups (&QWP(0,$out),$inout5);
&lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_loop6_enter");
&movdqu ($inout0,&QWP(0,$inp));
&movdqu ($inout1,&QWP(0x10,$inp));
&movdqu ($inout2,&QWP(0x20,$inp));
&movdqu ($inout3,&QWP(0x30,$inp));
&movdqu ($inout4,&QWP(0x40,$inp));
&movdqu ($inout5,&QWP(0x50,$inp));
&call ("_aesni_decrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,&QWP(0,"esp")); # ^=IV
&xorps ($inout1,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout2,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout3,$rndkey1);
&movups ($rndkey1,&QWP(0x40,$inp));
&xorps ($inout4,$rndkey0);
&movups ($rndkey0,&QWP(0x50,$inp)); # IV
&xorps ($inout5,$rndkey1);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&lea ($inp,&DWP(0x60,$inp));
&movups (&QWP(0x20,$out),$inout2);
&mov ($rounds,$rounds_); # restore $rounds
&movups (&QWP(0x30,$out),$inout3);
&mov ($key,$key_); # restore $key
&movups (&QWP(0x40,$out),$inout4);
&lea ($out,&DWP(0x50,$out));
&sub ($len,0x60);
&ja (&label("cbc_dec_loop6"));
&movaps ($inout0,$inout5);
&movaps ($ivec,$rndkey0);
&add ($len,0x50);
&jle (&label("cbc_dec_clear_tail_collected"));
&movups (&QWP(0,$out),$inout0);
&lea ($out,&DWP(0x10,$out));
&set_label("cbc_dec_tail");
&movups ($inout0,&QWP(0,$inp));
&movaps ($in0,$inout0);
&cmp ($len,0x10);
&jbe (&label("cbc_dec_one"));
&movups ($inout1,&QWP(0x10,$inp));
&movaps ($in1,$inout1);
&cmp ($len,0x20);
&jbe (&label("cbc_dec_two"));
&movups ($inout2,&QWP(0x20,$inp));
&cmp ($len,0x30);
&jbe (&label("cbc_dec_three"));
&movups ($inout3,&QWP(0x30,$inp));
&cmp ($len,0x40);
&jbe (&label("cbc_dec_four"));
&movups ($inout4,&QWP(0x40,$inp));
&movaps (&QWP(0,"esp"),$ivec); # save IV
&movups ($inout0,&QWP(0,$inp));
&xorps ($inout5,$inout5);
&call ("_aesni_decrypt6");
&movups ($rndkey1,&QWP(0,$inp));
&movups ($rndkey0,&QWP(0x10,$inp));
&xorps ($inout0,&QWP(0,"esp")); # ^= IV
&xorps ($inout1,$rndkey1);
&movups ($rndkey1,&QWP(0x20,$inp));
&xorps ($inout2,$rndkey0);
&movups ($rndkey0,&QWP(0x30,$inp));
&xorps ($inout3,$rndkey1);
&movups ($ivec,&QWP(0x40,$inp)); # IV
&xorps ($inout4,$rndkey0);
&movups (&QWP(0,$out),$inout0);
&movups (&QWP(0x10,$out),$inout1);
&pxor ($inout1,$inout1);
&movups (&QWP(0x20,$out),$inout2);
&pxor ($inout2,$inout2);
&movups (&QWP(0x30,$out),$inout3);
&pxor ($inout3,$inout3);
&lea ($out,&DWP(0x40,$out));
&movaps ($inout0,$inout4);
&pxor ($inout4,$inout4);
&sub ($len,0x50);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_one",16);
if ($inline)
{ &aesni_inline_generate1("dec"); }
else
{ &call ("_aesni_decrypt1"); }
&xorps ($inout0,$ivec);
&movaps ($ivec,$in0);
&sub ($len,0x10);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_two",16);
&call ("_aesni_decrypt2");
&xorps ($inout0,$ivec);
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout1);
&pxor ($inout1,$inout1);
&lea ($out,&DWP(0x10,$out));
&movaps ($ivec,$in1);
&sub ($len,0x20);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_three",16);
&call ("_aesni_decrypt3");
&xorps ($inout0,$ivec);
&xorps ($inout1,$in0);
&xorps ($inout2,$in1);
&movups (&QWP(0,$out),$inout0);
&movaps ($inout0,$inout2);
&pxor ($inout2,$inout2);
&movups (&QWP(0x10,$out),$inout1);
&pxor ($inout1,$inout1);
&lea ($out,&DWP(0x20,$out));
&movups ($ivec,&QWP(0x20,$inp));
&sub ($len,0x30);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_four",16);
&call ("_aesni_decrypt4");
&movups ($rndkey1,&QWP(0x10,$inp));
&movups ($rndkey0,&QWP(0x20,$inp));
&xorps ($inout0,$ivec);
&movups ($ivec,&QWP(0x30,$inp));
&xorps ($inout1,$in0);
&movups (&QWP(0,$out),$inout0);
&xorps ($inout2,$rndkey1);
&movups (&QWP(0x10,$out),$inout1);
&pxor ($inout1,$inout1);
&xorps ($inout3,$rndkey0);
&movups (&QWP(0x20,$out),$inout2);
&pxor ($inout2,$inout2);
&lea ($out,&DWP(0x30,$out));
&movaps ($inout0,$inout3);
&pxor ($inout3,$inout3);
&sub ($len,0x40);
&jmp (&label("cbc_dec_tail_collected"));
&set_label("cbc_dec_clear_tail_collected",16);
&pxor ($inout1,$inout1);
&pxor ($inout2,$inout2);
&pxor ($inout3,$inout3);
&pxor ($inout4,$inout4);
&set_label("cbc_dec_tail_collected");
&and ($len,15);
&jnz (&label("cbc_dec_tail_partial"));
&movups (&QWP(0,$out),$inout0);
&pxor ($rndkey0,$rndkey0);
&jmp (&label("cbc_ret"));
&set_label("cbc_dec_tail_partial",16);
&movaps (&QWP(0,"esp"),$inout0);
&pxor ($rndkey0,$rndkey0);
&mov ("ecx",16);
&mov ($inp,"esp");
&sub ("ecx",$len);
&data_word(0xA4F3F689); # rep movsb
&movdqa (&QWP(0,"esp"),$inout0);
&set_label("cbc_ret");
&mov ("esp",&DWP(16,"esp")); # pull original %esp
&mov ($key_,&wparam(4));
&pxor ($inout0,$inout0);
&pxor ($rndkey1,$rndkey1);
&movups (&QWP(0,$key_),$ivec); # output IV
&pxor ($ivec,$ivec);
&set_label("cbc_abort");
&function_end("${PREFIX}_cbc_encrypt");
######################################################################
# Mechanical port from aesni-x86_64.pl.
#
# _aesni_set_encrypt_key is private interface,
# input:
# "eax" const unsigned char *userKey
# $rounds int bits
# $key AES_KEY *key
# output:
# "eax" return code
# $round rounds
&function_begin_B("_aesni_set_encrypt_key");
&push ("ebp");
&push ("ebx");
&test ("eax","eax");
&jz (&label("bad_pointer"));
&test ($key,$key);
&jz (&label("bad_pointer"));
&call (&label("pic"));
&set_label("pic");
&blindpop("ebx");
&lea ("ebx",&DWP(&label("key_const")."-".&label("pic"),"ebx"));
&picmeup("ebp","OPENSSL_ia32cap_P","ebx",&label("key_const"));
&movups ("xmm0",&QWP(0,"eax")); # pull first 128 bits of *userKey
&xorps ("xmm4","xmm4"); # low dword of xmm4 is assumed 0
&mov ("ebp",&DWP(4,"ebp"));
&lea ($key,&DWP(16,$key));
&and ("ebp",1<<28|1<<11); # AVX and XOP bits
&cmp ($rounds,256);
&je (&label("14rounds"));
&cmp ($rounds,192);
&je (&label("12rounds"));
&cmp ($rounds,128);
&jne (&label("bad_keybits"));
&set_label("10rounds",16);
&cmp ("ebp",1<<28);
&je (&label("10rounds_alt"));
&mov ($rounds,9);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm0",0x01); # round 1
&call (&label("key_128_cold"));
&aeskeygenassist("xmm1","xmm0",0x2); # round 2
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 3
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 4
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 5
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 6
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x40); # round 7
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x80); # round 8
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x1b); # round 9
&call (&label("key_128"));
&aeskeygenassist("xmm1","xmm0",0x36); # round 10
&call (&label("key_128"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(80,$key),$rounds);
&jmp (&label("good_key"));
&set_label("key_128",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&set_label("key_128_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("10rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&mov ($rounds,8);
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&movdqa ("xmm2","xmm0");
&movdqu (&QWP(-16,$key),"xmm0");
&set_label("loop_key128");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&lea ($key,&DWP(16,$key));
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(-16,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&dec ($rounds);
&jnz (&label("loop_key128"));
&movdqa ("xmm4",&QWP(0x30,"ebx"));
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&pslld ("xmm4",1);
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&movdqa ("xmm2","xmm0");
&pshufb ("xmm0","xmm5");
&aesenclast ("xmm0","xmm4");
&movdqa ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm3","xmm2");
&pslldq ("xmm2",4);
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm2");
&movdqu (&QWP(16,$key),"xmm0");
&mov ($rounds,9);
&mov (&DWP(96,$key),$rounds);
&jmp (&label("good_key"));
&set_label("12rounds",16);
&movq ("xmm2",&QWP(16,"eax")); # remaining 1/3 of *userKey
&cmp ("ebp",1<<28);
&je (&label("12rounds_alt"));
&mov ($rounds,11);
&$movekey (&QWP(-16,$key),"xmm0"); # round 0
&aeskeygenassist("xmm1","xmm2",0x01); # round 1,2
&call (&label("key_192a_cold"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 2,3
&call (&label("key_192b"));
&aeskeygenassist("xmm1","xmm2",0x04); # round 4,5
&call (&label("key_192a"));
&aeskeygenassist("xmm1","xmm2",0x08); # round 5,6
&call (&label("key_192b"));
&aeskeygenassist("xmm1","xmm2",0x10); # round 7,8
&call (&label("key_192a"));
&aeskeygenassist("xmm1","xmm2",0x20); # round 8,9
&call (&label("key_192b"));
&aeskeygenassist("xmm1","xmm2",0x40); # round 10,11
&call (&label("key_192a"));
&aeskeygenassist("xmm1","xmm2",0x80); # round 11,12
&call (&label("key_192b"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(48,$key),$rounds);
&jmp (&label("good_key"));
&set_label("key_192a",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&set_label("key_192a_cold",16);
&movaps ("xmm5","xmm2");
&set_label("key_192b_warm");
&shufps ("xmm4","xmm0",0b00010000);
&movdqa ("xmm3","xmm2");
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&pslldq ("xmm3",4);
&xorps ("xmm0","xmm4");
&pshufd ("xmm1","xmm1",0b01010101); # critical path
&pxor ("xmm2","xmm3");
&pxor ("xmm0","xmm1");
&pshufd ("xmm3","xmm0",0b11111111);
&pxor ("xmm2","xmm3");
&ret();
&set_label("key_192b",16);
&movaps ("xmm3","xmm0");
&shufps ("xmm5","xmm0",0b01000100);
&$movekey (&QWP(0,$key),"xmm5");
&shufps ("xmm3","xmm2",0b01001110);
&$movekey (&QWP(16,$key),"xmm3");
&lea ($key,&DWP(32,$key));
&jmp (&label("key_192b_warm"));
&set_label("12rounds_alt",16);
&movdqa ("xmm5",&QWP(0x10,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,8);
&movdqu (&QWP(-16,$key),"xmm0");
&set_label("loop_key192");
&movq (&QWP(0,$key),"xmm2");
&movdqa ("xmm1","xmm2");
&pshufb ("xmm2","xmm5");
&aesenclast ("xmm2","xmm4");
&pslld ("xmm4",1);
&lea ($key,&DWP(24,$key));
&movdqa ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm0","xmm3");
&pshufd ("xmm3","xmm0",0xff);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pxor ("xmm0","xmm2");
&pxor ("xmm2","xmm3");
&movdqu (&QWP(-16,$key),"xmm0");
&dec ($rounds);
&jnz (&label("loop_key192"));
&mov ($rounds,11);
&mov (&DWP(32,$key),$rounds);
&jmp (&label("good_key"));
&set_label("14rounds",16);
&movups ("xmm2",&QWP(16,"eax")); # remaining half of *userKey
&lea ($key,&DWP(16,$key));
&cmp ("ebp",1<<28);
&je (&label("14rounds_alt"));
&mov ($rounds,13);
&$movekey (&QWP(-32,$key),"xmm0"); # round 0
&$movekey (&QWP(-16,$key),"xmm2"); # round 1
&aeskeygenassist("xmm1","xmm2",0x01); # round 2
&call (&label("key_256a_cold"));
&aeskeygenassist("xmm1","xmm0",0x01); # round 3
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x02); # round 4
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x02); # round 5
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x04); # round 6
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x04); # round 7
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x08); # round 8
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x08); # round 9
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x10); # round 10
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x10); # round 11
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x20); # round 12
&call (&label("key_256a"));
&aeskeygenassist("xmm1","xmm0",0x20); # round 13
&call (&label("key_256b"));
&aeskeygenassist("xmm1","xmm2",0x40); # round 14
&call (&label("key_256a"));
&$movekey (&QWP(0,$key),"xmm0");
&mov (&DWP(16,$key),$rounds);
&xor ("eax","eax");
&jmp (&label("good_key"));
&set_label("key_256a",16);
&$movekey (&QWP(0,$key),"xmm2");
&lea ($key,&DWP(16,$key));
&set_label("key_256a_cold");
&shufps ("xmm4","xmm0",0b00010000);
&xorps ("xmm0","xmm4");
&shufps ("xmm4","xmm0",0b10001100);
&xorps ("xmm0","xmm4");
&shufps ("xmm1","xmm1",0b11111111); # critical path
&xorps ("xmm0","xmm1");
&ret();
&set_label("key_256b",16);
&$movekey (&QWP(0,$key),"xmm0");
&lea ($key,&DWP(16,$key));
&shufps ("xmm4","xmm2",0b00010000);
&xorps ("xmm2","xmm4");
&shufps ("xmm4","xmm2",0b10001100);
&xorps ("xmm2","xmm4");
&shufps ("xmm1","xmm1",0b10101010); # critical path
&xorps ("xmm2","xmm1");
&ret();
&set_label("14rounds_alt",16);
&movdqa ("xmm5",&QWP(0x00,"ebx"));
&movdqa ("xmm4",&QWP(0x20,"ebx"));
&mov ($rounds,7);
&movdqu (&QWP(-32,$key),"xmm0");
&movdqa ("xmm1","xmm2");
&movdqu (&QWP(-16,$key),"xmm2");
&set_label("loop_key256");
&pshufb ("xmm2","xmm5");
&aesenclast ("xmm2","xmm4");
&movdqa ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm3","xmm0");
&pslldq ("xmm0",4);
&pxor ("xmm0","xmm3");
&pslld ("xmm4",1);
&pxor ("xmm0","xmm2");
&movdqu (&QWP(0,$key),"xmm0");
&dec ($rounds);
&jz (&label("done_key256"));
&pshufd ("xmm2","xmm0",0xff);
&pxor ("xmm3","xmm3");
&aesenclast ("xmm2","xmm3");
&movdqa ("xmm3","xmm1")
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm3","xmm1");
&pslldq ("xmm1",4);
&pxor ("xmm1","xmm3");
&pxor ("xmm2","xmm1");
&movdqu (&QWP(16,$key),"xmm2");
&lea ($key,&DWP(32,$key));
&movdqa ("xmm1","xmm2");
&jmp (&label("loop_key256"));
&set_label("done_key256");
&mov ($rounds,13);
&mov (&DWP(16,$key),$rounds);
&set_label("good_key");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&pxor ("xmm2","xmm2");
&pxor ("xmm3","xmm3");
&pxor ("xmm4","xmm4");
&pxor ("xmm5","xmm5");
&xor ("eax","eax");
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_pointer",4);
&mov ("eax",-1);
&pop ("ebx");
&pop ("ebp");
&ret ();
&set_label("bad_keybits",4);
&pxor ("xmm0","xmm0");
&mov ("eax",-2);
&pop ("ebx");
&pop ("ebp");
&ret ();
&function_end_B("_aesni_set_encrypt_key");
# int $PREFIX_set_encrypt_key (const unsigned char *userKey, int bits,
# AES_KEY *key)
&function_begin_B("${PREFIX}_set_encrypt_key");
&mov ("eax",&wparam(0));
&mov ($rounds,&wparam(1));
&mov ($key,&wparam(2));
&call ("_aesni_set_encrypt_key");
&ret ();
&function_end_B("${PREFIX}_set_encrypt_key");
# int $PREFIX_set_decrypt_key (const unsigned char *userKey, int bits,
# AES_KEY *key)
&function_begin_B("${PREFIX}_set_decrypt_key");
&mov ("eax",&wparam(0));
&mov ($rounds,&wparam(1));
&mov ($key,&wparam(2));
&call ("_aesni_set_encrypt_key");
&mov ($key,&wparam(2));
&shl ($rounds,4); # rounds-1 after _aesni_set_encrypt_key
&test ("eax","eax");
&jnz (&label("dec_key_ret"));
&lea ("eax",&DWP(16,$key,$rounds)); # end of key schedule
&$movekey ("xmm0",&QWP(0,$key)); # just swap
&$movekey ("xmm1",&QWP(0,"eax"));
&$movekey (&QWP(0,"eax"),"xmm0");
&$movekey (&QWP(0,$key),"xmm1");
&lea ($key,&DWP(16,$key));
&lea ("eax",&DWP(-16,"eax"));
&set_label("dec_key_inverse");
&$movekey ("xmm0",&QWP(0,$key)); # swap and inverse
&$movekey ("xmm1",&QWP(0,"eax"));
&aesimc ("xmm0","xmm0");
&aesimc ("xmm1","xmm1");
&lea ($key,&DWP(16,$key));
&lea ("eax",&DWP(-16,"eax"));
&$movekey (&QWP(16,"eax"),"xmm0");
&$movekey (&QWP(-16,$key),"xmm1");
&cmp ("eax",$key);
&ja (&label("dec_key_inverse"));
&$movekey ("xmm0",&QWP(0,$key)); # inverse middle
&aesimc ("xmm0","xmm0");
&$movekey (&QWP(0,$key),"xmm0");
&pxor ("xmm0","xmm0");
&pxor ("xmm1","xmm1");
&xor ("eax","eax"); # return success
&set_label("dec_key_ret");
&ret ();
&function_end_B("${PREFIX}_set_decrypt_key");
&set_label("key_const",64);
&data_word(0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d);
&data_word(0x04070605,0x04070605,0x04070605,0x04070605);
&data_word(1,1,1,1);
&data_word(0x1b,0x1b,0x1b,0x1b);
&asciz("AES for Intel AES-NI, CRYPTOGAMS by <appro\@openssl.org>");
&asm_finish();
close STDOUT;