boringssl/crypto/modes/asm/aesni-gcm-x86_64.pl

1044 lines
27 KiB
Perl
Raw Normal View History

#!/usr/bin/env perl
#
# ====================================================================
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
# project. The module is, however, dual licensed under OpenSSL and
# CRYPTOGAMS licenses depending on where you obtain it. For further
# details see http://www.openssl.org/~appro/cryptogams/.
# ====================================================================
#
#
# AES-NI-CTR+GHASH stitch.
#
# February 2013
#
# OpenSSL GCM implementation is organized in such a way that its
# performance is rather close to the sum of its streamed components,
# in this context parallelized AES-NI CTR and modulo-scheduled
# PCLMULQDQ-enabled GHASH. Unfortunately, as no stitch implementation
# was observed to perform significantly better than the sum of the
# components on contemporary CPUs, the effort was deemed impossible to
# justify. This module is based on a combination of Intel submissions,
# [1] and [2], with MOVBE twist suggested by Ilya Albrekht and Max
# Locktyukhin of Intel Corp. who verified that it reduces shuffles
# pressure with notable relative improvement, achieving 1.0 cycle per
# byte processed with 128-bit key on Haswell processor, and 0.74 -
# on Broadwell. [Mentioned results are raw profiled measurements for
# favourable packet size, one divisible by 96. Applications using the
# EVP interface will observe a few percent worse performance.]
#
# [1] http://rt.openssl.org/Ticket/Display.html?id=2900&user=guest&pass=guest
# [2] http://www.intel.com/content/dam/www/public/us/en/documents/software-support/enabling-high-performance-gcm.pdf
# Command line: "<flavour> <output>", or just an output file name.
$flavour = shift;
$output  = shift;
if ($flavour =~ /\./) {
	# The first argument contains a dot, so treat it as the output file
	# name and leave the flavour undefined.
	$output = $flavour;
	undef $flavour;
}

# Win64 code paths are selected by a nasm/masm/mingw64 flavour or by an
# output file name ending in ".asm".
$win64 = ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/) ? 1 : 0;

# Locate the x86_64-xlate.pl translator: first in this script's own
# directory, then in ../../perlasm/ relative to it.
$0 =~ m/(.*[\/\\])[^\/\\]+$/;
$dir = $1;
$xlate = "${dir}x86_64-xlate.pl";
unless (-f $xlate) {
	$xlate = "${dir}../../perlasm/x86_64-xlate.pl";
	die "can't locate x86_64-xlate.pl" unless -f $xlate;
}
Get rid of all compiler version checks in perlasm files. Since we pre-generate our perlasm, having the output of these files be sensitive to the environment they run in is unhelpful. It would be bad to suddenly change what features we do or don't compile in whenever workstations' toolchains change. Enable all compiler-version-gated features as they should all be runtime-gated anyway. This should align with what upstream's files would have produced on modern toolchains. We should assume our assemblers can take whatever we'd like to throw at them. (If it turns out some can't, we'd rather find out and probably switch the problematic instructions to explicit byte sequences.) This actually results in a fairly significant change to the assembly we generate. I'm guessing upstream's buildsystem sets the CC environment variable, while ours doesn't and so the version checks were all coming out conservative. diffstat of generated files: linux-x86/crypto/sha/sha1-586.S | 1176 ++++++++++++ linux-x86/crypto/sha/sha256-586.S | 2248 ++++++++++++++++++++++++ linux-x86_64/crypto/bn/rsaz-avx2.S | 1644 +++++++++++++++++ linux-x86_64/crypto/bn/rsaz-x86_64.S | 638 ++++++ linux-x86_64/crypto/bn/x86_64-mont.S | 332 +++ linux-x86_64/crypto/bn/x86_64-mont5.S | 1130 ++++++++++++ linux-x86_64/crypto/modes/aesni-gcm-x86_64.S | 754 ++++++++ linux-x86_64/crypto/modes/ghash-x86_64.S | 475 +++++ linux-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++++++ linux-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++++++ linux-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++++++++++ mac-x86/crypto/sha/sha1-586.S | 1174 ++++++++++++ mac-x86/crypto/sha/sha256-586.S | 2248 ++++++++++++++++++++++++ mac-x86_64/crypto/bn/rsaz-avx2.S | 1637 +++++++++++++++++ mac-x86_64/crypto/bn/rsaz-x86_64.S | 638 ++++++ mac-x86_64/crypto/bn/x86_64-mont.S | 331 +++ mac-x86_64/crypto/bn/x86_64-mont5.S | 1130 ++++++++++++ mac-x86_64/crypto/modes/aesni-gcm-x86_64.S | 750 ++++++++ mac-x86_64/crypto/modes/ghash-x86_64.S | 475 
+++++ mac-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++++++ mac-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++++++ mac-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++++++++++ win-x86/crypto/sha/sha1-586.asm | 1173 ++++++++++++ win-x86/crypto/sha/sha256-586.asm | 2248 ++++++++++++++++++++++++ win-x86_64/crypto/bn/rsaz-avx2.asm | 1858 +++++++++++++++++++- win-x86_64/crypto/bn/rsaz-x86_64.asm | 638 ++++++ win-x86_64/crypto/bn/x86_64-mont.asm | 352 +++ win-x86_64/crypto/bn/x86_64-mont5.asm | 1184 ++++++++++++ win-x86_64/crypto/modes/aesni-gcm-x86_64.asm | 933 ++++++++++ win-x86_64/crypto/modes/ghash-x86_64.asm | 515 +++++ win-x86_64/crypto/sha/sha1-x86_64.asm | 1152 ++++++++++++ win-x86_64/crypto/sha/sha256-x86_64.asm | 1088 +++++++++++ win-x86_64/crypto/sha/sha512-x86_64.asm | 2499 ++++++ SHA* gets faster. RSA and AES-GCM seem to be more of a wash and even slower sometimes! This is a little concerning. Though when I repeated the latter two, it's definitely noisy (RSA in particular), so we may wish to repeat in a more controlled environment. We could also flip some of these toggles to something other than the highest setting if it seems some of the variants aren't desirable. We just shouldn't have them enabled or disabled on accident. This aligns us closer to upstream though. 
$ /tmp/bssl.old speed SHA- Did 5028000 SHA-1 (16 bytes) operations in 1000048us (5027758.7 ops/sec): 80.4 MB/s Did 1708000 SHA-1 (256 bytes) operations in 1000257us (1707561.2 ops/sec): 437.1 MB/s Did 73000 SHA-1 (8192 bytes) operations in 1008406us (72391.5 ops/sec): 593.0 MB/s Did 3041000 SHA-256 (16 bytes) operations in 1000311us (3040054.5 ops/sec): 48.6 MB/s Did 779000 SHA-256 (256 bytes) operations in 1000820us (778361.7 ops/sec): 199.3 MB/s Did 26000 SHA-256 (8192 bytes) operations in 1009875us (25745.8 ops/sec): 210.9 MB/s Did 1837000 SHA-512 (16 bytes) operations in 1000251us (1836539.0 ops/sec): 29.4 MB/s Did 803000 SHA-512 (256 bytes) operations in 1000969us (802222.6 ops/sec): 205.4 MB/s Did 41000 SHA-512 (8192 bytes) operations in 1016768us (40323.8 ops/sec): 330.3 MB/s $ /tmp/bssl.new speed SHA- Did 5354000 SHA-1 (16 bytes) operations in 1000104us (5353443.2 ops/sec): 85.7 MB/s Did 1779000 SHA-1 (256 bytes) operations in 1000121us (1778784.8 ops/sec): 455.4 MB/s Did 87000 SHA-1 (8192 bytes) operations in 1012641us (85914.0 ops/sec): 703.8 MB/s Did 3517000 SHA-256 (16 bytes) operations in 1000114us (3516599.1 ops/sec): 56.3 MB/s Did 935000 SHA-256 (256 bytes) operations in 1000096us (934910.2 ops/sec): 239.3 MB/s Did 38000 SHA-256 (8192 bytes) operations in 1004476us (37830.7 ops/sec): 309.9 MB/s Did 2930000 SHA-512 (16 bytes) operations in 1000259us (2929241.3 ops/sec): 46.9 MB/s Did 1008000 SHA-512 (256 bytes) operations in 1000509us (1007487.2 ops/sec): 257.9 MB/s Did 45000 SHA-512 (8192 bytes) operations in 1000593us (44973.3 ops/sec): 368.4 MB/s $ /tmp/bssl.old speed RSA Did 820 RSA 2048 signing operations in 1017008us (806.3 ops/sec) Did 27000 RSA 2048 verify operations in 1015400us (26590.5 ops/sec) Did 1292 RSA 2048 (3 prime, e=3) signing operations in 1008185us (1281.5 ops/sec) Did 65000 RSA 2048 (3 prime, e=3) verify operations in 1011388us (64268.1 ops/sec) Did 120 RSA 4096 signing operations in 1061027us (113.1 ops/sec) Did 8208 RSA 4096 
verify operations in 1002717us (8185.8 ops/sec) $ /tmp/bssl.new speed RSA Did 760 RSA 2048 signing operations in 1003351us (757.5 ops/sec) Did 25900 RSA 2048 verify operations in 1028931us (25171.8 ops/sec) Did 1320 RSA 2048 (3 prime, e=3) signing operations in 1040806us (1268.2 ops/sec) Did 63000 RSA 2048 (3 prime, e=3) verify operations in 1016042us (62005.3 ops/sec) Did 104 RSA 4096 signing operations in 1008718us (103.1 ops/sec) Did 6875 RSA 4096 verify operations in 1093441us (6287.5 ops/sec) $ /tmp/bssl.old speed GCM Did 5316000 AES-128-GCM (16 bytes) seal operations in 1000082us (5315564.1 ops/sec): 85.0 MB/s Did 712000 AES-128-GCM (1350 bytes) seal operations in 1000252us (711820.6 ops/sec): 961.0 MB/s Did 149000 AES-128-GCM (8192 bytes) seal operations in 1003182us (148527.4 ops/sec): 1216.7 MB/s Did 5919750 AES-256-GCM (16 bytes) seal operations in 1000016us (5919655.3 ops/sec): 94.7 MB/s Did 800000 AES-256-GCM (1350 bytes) seal operations in 1000951us (799239.9 ops/sec): 1079.0 MB/s Did 152000 AES-256-GCM (8192 bytes) seal operations in 1000765us (151883.8 ops/sec): 1244.2 MB/s $ /tmp/bssl.new speed GCM Did 5315000 AES-128-GCM (16 bytes) seal operations in 1000125us (5314335.7 ops/sec): 85.0 MB/s Did 755000 AES-128-GCM (1350 bytes) seal operations in 1000878us (754337.7 ops/sec): 1018.4 MB/s Did 151000 AES-128-GCM (8192 bytes) seal operations in 1005655us (150150.9 ops/sec): 1230.0 MB/s Did 5913500 AES-256-GCM (16 bytes) seal operations in 1000041us (5913257.6 ops/sec): 94.6 MB/s Did 782000 AES-256-GCM (1350 bytes) seal operations in 1001484us (780841.2 ops/sec): 1054.1 MB/s Did 121000 AES-256-GCM (8192 bytes) seal operations in 1006389us (120231.8 ops/sec): 984.9 MB/s Change-Id: I0efb32f896c597abc7d7e55c31d038528a5c72a1 Reviewed-on: https://boringssl-review.googlesource.com/6260 Reviewed-by: Adam Langley <alangley@gmail.com>
2015-10-14 19:03:23 +01:00
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 2;

# Everything printed to STDOUT from here on is piped through the
# x86_64-xlate.pl translator, run with the same perl binary ($^X).
# Fail loudly if the translator cannot be started instead of silently
# printing the generated code to a dead filehandle.
open OUT,"| \"$^X\" $xlate $flavour $output"
	or die "can't call $xlate: $!";
*STDOUT=*OUT;
if ($avx>1) {{{

# General-purpose registers used for the six arguments of
# aesni_gcm_[en|de]crypt (inp, out, len, key, ivp, Xip).
($inp,$out,$len,$key,$ivp,$Xip) = qw(%rdi %rsi %rdx %rcx %r8 %r9);

# %xmm0-%xmm8: names used by the GHASH side of the code.
($Ii,$T1,$T2,$Hkey,$Z0,$Z1,$Z2,$Z3,$Xi) = map { "%xmm$_" } 0..8;

# %xmm9-%xmm14: the six AES-CTR blocks in flight; %xmm15: current round key.
($inout0,$inout1,$inout2,$inout3,$inout4,$inout5,$rndkey) = map { "%xmm$_" } 9..15;

# Remaining scratch/general-purpose register names.
($counter,$rounds,$ret,$const,$in0,$end0) = qw(%ebx %ebp %r10 %r11 %r14 %r15);
# _aesni_ctr32_ghash_6x: internal stitched AES-CTR + GHASH loop.
#
# Each pass through .Loop6x pushes six counter blocks ($inout0-$inout5)
# through the AES rounds while, interleaved with those rounds, six
# previously stored byte-swapped blocks (I[5] in $Z3, I[4]..I[0] read from
# the stack) are multiplied by $Hkey^1..$Hkey^6 and folded into $Xi.
#
# Register/stack conventions visible in the code below:
#  - $len is counted in 16-byte blocks and decremented by 6 per pass;
#    $inp/$out advance by 0x60 bytes and $ret grows by 0x60 per pass.
#  - $counter holds the last 32-bit word of the counter block as loaded from
#    memory; 6<<24 is added per pass and a carry out diverts to
#    .Lhandle_ctr32, which byte-swaps the counter and uses 32-bit adds.
#  - $rounds is compared against 11 to choose the 128/192/256-bit key path.
#  - $in0/$end0 bound the pointer from which the next six hash inputs are
#    fetched with movbe into the stack scratch area.
#  - The next counter value is parked at ($ivp) between .Lresume_ctr32 and
#    .Lenc_tail.
#  - On exit through .L6x_done the final six output blocks are still in
#    $inout0-$inout5; they are not stored by this routine.
$code=<<___;
.text
.type _aesni_ctr32_ghash_6x,\@abi-omnipotent
.align 32
_aesni_ctr32_ghash_6x:
vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
sub \$6,$len
vpxor $Z0,$Z0,$Z0 # $Z0 = 0
vmovdqu 0x00-0x80($key),$rndkey
vpaddb $T2,$T1,$inout1
vpaddb $T2,$inout1,$inout2
vpaddb $T2,$inout2,$inout3
vpaddb $T2,$inout3,$inout4
vpaddb $T2,$inout4,$inout5
vpxor $rndkey,$T1,$inout0
vmovdqu $Z0,16+8(%rsp) # "$Z3" = 0
jmp .Loop6x
.align 32
.Loop6x:
add \$`6<<24`,$counter
jc .Lhandle_ctr32 # discard $inout[1-5]?
vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
vpaddb $T2,$inout5,$T1 # next counter value
vpxor $rndkey,$inout1,$inout1
vpxor $rndkey,$inout2,$inout2
.Lresume_ctr32:
vmovdqu $T1,($ivp) # save next counter value
vpclmulqdq \$0x10,$Hkey,$Z3,$Z1
vpxor $rndkey,$inout3,$inout3
vmovups 0x10-0x80($key),$T2 # borrow $T2 for $rndkey
vpclmulqdq \$0x01,$Hkey,$Z3,$Z2
xor %r12,%r12
cmp $in0,$end0
vaesenc $T2,$inout0,$inout0
vmovdqu 0x30+8(%rsp),$Ii # I[4]
vpxor $rndkey,$inout4,$inout4
vpclmulqdq \$0x00,$Hkey,$Z3,$T1
vaesenc $T2,$inout1,$inout1
vpxor $rndkey,$inout5,$inout5
setnc %r12b
vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
vaesenc $T2,$inout2,$inout2
vmovdqu 0x10-0x20($Xip),$Hkey # $Hkey^2
neg %r12
vaesenc $T2,$inout3,$inout3
vpxor $Z1,$Z2,$Z2
vpclmulqdq \$0x00,$Hkey,$Ii,$Z1
vpxor $Z0,$Xi,$Xi # modulo-scheduled
vaesenc $T2,$inout4,$inout4
vpxor $Z1,$T1,$Z0
and \$0x60,%r12
vmovups 0x20-0x80($key),$rndkey
vpclmulqdq \$0x10,$Hkey,$Ii,$T1
vaesenc $T2,$inout5,$inout5
vpclmulqdq \$0x01,$Hkey,$Ii,$T2
lea ($in0,%r12),$in0
vaesenc $rndkey,$inout0,$inout0
vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled [vpxor $Z3,$Xi,$Xi]
vpclmulqdq \$0x11,$Hkey,$Ii,$Hkey
vmovdqu 0x40+8(%rsp),$Ii # I[3]
vaesenc $rndkey,$inout1,$inout1
movbe 0x58($in0),%r13
vaesenc $rndkey,$inout2,$inout2
movbe 0x50($in0),%r12
vaesenc $rndkey,$inout3,$inout3
mov %r13,0x20+8(%rsp)
vaesenc $rndkey,$inout4,$inout4
mov %r12,0x28+8(%rsp)
vmovdqu 0x30-0x20($Xip),$Z1 # borrow $Z1 for $Hkey^3
vaesenc $rndkey,$inout5,$inout5
vmovups 0x30-0x80($key),$rndkey
vpxor $T1,$Z2,$Z2
vpclmulqdq \$0x00,$Z1,$Ii,$T1
vaesenc $rndkey,$inout0,$inout0
vpxor $T2,$Z2,$Z2
vpclmulqdq \$0x10,$Z1,$Ii,$T2
vaesenc $rndkey,$inout1,$inout1
vpxor $Hkey,$Z3,$Z3
vpclmulqdq \$0x01,$Z1,$Ii,$Hkey
vaesenc $rndkey,$inout2,$inout2
vpclmulqdq \$0x11,$Z1,$Ii,$Z1
vmovdqu 0x50+8(%rsp),$Ii # I[2]
vaesenc $rndkey,$inout3,$inout3
vaesenc $rndkey,$inout4,$inout4
vpxor $T1,$Z0,$Z0
vmovdqu 0x40-0x20($Xip),$T1 # borrow $T1 for $Hkey^4
vaesenc $rndkey,$inout5,$inout5
vmovups 0x40-0x80($key),$rndkey
vpxor $T2,$Z2,$Z2
vpclmulqdq \$0x00,$T1,$Ii,$T2
vaesenc $rndkey,$inout0,$inout0
vpxor $Hkey,$Z2,$Z2
vpclmulqdq \$0x10,$T1,$Ii,$Hkey
vaesenc $rndkey,$inout1,$inout1
movbe 0x48($in0),%r13
vpxor $Z1,$Z3,$Z3
vpclmulqdq \$0x01,$T1,$Ii,$Z1
vaesenc $rndkey,$inout2,$inout2
movbe 0x40($in0),%r12
vpclmulqdq \$0x11,$T1,$Ii,$T1
vmovdqu 0x60+8(%rsp),$Ii # I[1]
vaesenc $rndkey,$inout3,$inout3
mov %r13,0x30+8(%rsp)
vaesenc $rndkey,$inout4,$inout4
mov %r12,0x38+8(%rsp)
vpxor $T2,$Z0,$Z0
vmovdqu 0x60-0x20($Xip),$T2 # borrow $T2 for $Hkey^5
vaesenc $rndkey,$inout5,$inout5
vmovups 0x50-0x80($key),$rndkey
vpxor $Hkey,$Z2,$Z2
vpclmulqdq \$0x00,$T2,$Ii,$Hkey
vaesenc $rndkey,$inout0,$inout0
vpxor $Z1,$Z2,$Z2
vpclmulqdq \$0x10,$T2,$Ii,$Z1
vaesenc $rndkey,$inout1,$inout1
movbe 0x38($in0),%r13
vpxor $T1,$Z3,$Z3
vpclmulqdq \$0x01,$T2,$Ii,$T1
vpxor 0x70+8(%rsp),$Xi,$Xi # accumulate I[0]
vaesenc $rndkey,$inout2,$inout2
movbe 0x30($in0),%r12
vpclmulqdq \$0x11,$T2,$Ii,$T2
vaesenc $rndkey,$inout3,$inout3
mov %r13,0x40+8(%rsp)
vaesenc $rndkey,$inout4,$inout4
mov %r12,0x48+8(%rsp)
vpxor $Hkey,$Z0,$Z0
vmovdqu 0x70-0x20($Xip),$Hkey # $Hkey^6
vaesenc $rndkey,$inout5,$inout5
vmovups 0x60-0x80($key),$rndkey
vpxor $Z1,$Z2,$Z2
vpclmulqdq \$0x10,$Hkey,$Xi,$Z1
vaesenc $rndkey,$inout0,$inout0
vpxor $T1,$Z2,$Z2
vpclmulqdq \$0x01,$Hkey,$Xi,$T1
vaesenc $rndkey,$inout1,$inout1
movbe 0x28($in0),%r13
vpxor $T2,$Z3,$Z3
vpclmulqdq \$0x00,$Hkey,$Xi,$T2
vaesenc $rndkey,$inout2,$inout2
movbe 0x20($in0),%r12
vpclmulqdq \$0x11,$Hkey,$Xi,$Xi
vaesenc $rndkey,$inout3,$inout3
mov %r13,0x50+8(%rsp)
vaesenc $rndkey,$inout4,$inout4
mov %r12,0x58+8(%rsp)
vpxor $Z1,$Z2,$Z2
vaesenc $rndkey,$inout5,$inout5
vpxor $T1,$Z2,$Z2
vmovups 0x70-0x80($key),$rndkey
vpslldq \$8,$Z2,$Z1
vpxor $T2,$Z0,$Z0
vmovdqu 0x10($const),$Hkey # .Lpoly
vaesenc $rndkey,$inout0,$inout0
vpxor $Xi,$Z3,$Z3
vaesenc $rndkey,$inout1,$inout1
vpxor $Z1,$Z0,$Z0
movbe 0x18($in0),%r13
vaesenc $rndkey,$inout2,$inout2
movbe 0x10($in0),%r12
vpalignr \$8,$Z0,$Z0,$Ii # 1st phase
vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
mov %r13,0x60+8(%rsp)
vaesenc $rndkey,$inout3,$inout3
mov %r12,0x68+8(%rsp)
vaesenc $rndkey,$inout4,$inout4
vmovups 0x80-0x80($key),$T1 # borrow $T1 for $rndkey
vaesenc $rndkey,$inout5,$inout5
vaesenc $T1,$inout0,$inout0
vmovups 0x90-0x80($key),$rndkey
vaesenc $T1,$inout1,$inout1
vpsrldq \$8,$Z2,$Z2
vaesenc $T1,$inout2,$inout2
vpxor $Z2,$Z3,$Z3
vaesenc $T1,$inout3,$inout3
vpxor $Ii,$Z0,$Z0
movbe 0x08($in0),%r13
vaesenc $T1,$inout4,$inout4
movbe 0x00($in0),%r12
vaesenc $T1,$inout5,$inout5
vmovups 0xa0-0x80($key),$T1
cmp \$11,$rounds
jb .Lenc_tail # 128-bit key
vaesenc $rndkey,$inout0,$inout0
vaesenc $rndkey,$inout1,$inout1
vaesenc $rndkey,$inout2,$inout2
vaesenc $rndkey,$inout3,$inout3
vaesenc $rndkey,$inout4,$inout4
vaesenc $rndkey,$inout5,$inout5
vaesenc $T1,$inout0,$inout0
vaesenc $T1,$inout1,$inout1
vaesenc $T1,$inout2,$inout2
vaesenc $T1,$inout3,$inout3
vaesenc $T1,$inout4,$inout4
vmovups 0xb0-0x80($key),$rndkey
vaesenc $T1,$inout5,$inout5
vmovups 0xc0-0x80($key),$T1
je .Lenc_tail # 192-bit key
vaesenc $rndkey,$inout0,$inout0
vaesenc $rndkey,$inout1,$inout1
vaesenc $rndkey,$inout2,$inout2
vaesenc $rndkey,$inout3,$inout3
vaesenc $rndkey,$inout4,$inout4
vaesenc $rndkey,$inout5,$inout5
vaesenc $T1,$inout0,$inout0
vaesenc $T1,$inout1,$inout1
vaesenc $T1,$inout2,$inout2
vaesenc $T1,$inout3,$inout3
vaesenc $T1,$inout4,$inout4
vmovups 0xd0-0x80($key),$rndkey
vaesenc $T1,$inout5,$inout5
vmovups 0xe0-0x80($key),$T1
jmp .Lenc_tail # 256-bit key
.align 32
.Lhandle_ctr32:
vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
vpshufb $Ii,$T1,$Z2 # byte-swap counter
vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
vpaddd $Z1,$Z2,$inout2
vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
vpaddd $Z1,$inout1,$inout3
vpshufb $Ii,$inout1,$inout1
vpaddd $Z1,$inout2,$inout4
vpshufb $Ii,$inout2,$inout2
vpxor $rndkey,$inout1,$inout1
vpaddd $Z1,$inout3,$inout5
vpshufb $Ii,$inout3,$inout3
vpxor $rndkey,$inout2,$inout2
vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
vpshufb $Ii,$inout4,$inout4
vpshufb $Ii,$inout5,$inout5
vpshufb $Ii,$T1,$T1 # next counter value
jmp .Lresume_ctr32
.align 32
.Lenc_tail:
vaesenc $rndkey,$inout0,$inout0
vmovdqu $Z3,16+8(%rsp) # postpone vpxor $Z3,$Xi,$Xi
vpalignr \$8,$Z0,$Z0,$Xi # 2nd phase
vaesenc $rndkey,$inout1,$inout1
vpclmulqdq \$0x10,$Hkey,$Z0,$Z0
vpxor 0x00($inp),$T1,$T2
vaesenc $rndkey,$inout2,$inout2
vpxor 0x10($inp),$T1,$Ii
vaesenc $rndkey,$inout3,$inout3
vpxor 0x20($inp),$T1,$Z1
vaesenc $rndkey,$inout4,$inout4
vpxor 0x30($inp),$T1,$Z2
vaesenc $rndkey,$inout5,$inout5
vpxor 0x40($inp),$T1,$Z3
vpxor 0x50($inp),$T1,$Hkey
vmovdqu ($ivp),$T1 # load next counter value
vaesenclast $T2,$inout0,$inout0
vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
vaesenclast $Ii,$inout1,$inout1
vpaddb $T2,$T1,$Ii
mov %r13,0x70+8(%rsp)
lea 0x60($inp),$inp
vaesenclast $Z1,$inout2,$inout2
vpaddb $T2,$Ii,$Z1
mov %r12,0x78+8(%rsp)
lea 0x60($out),$out
vmovdqu 0x00-0x80($key),$rndkey
vaesenclast $Z2,$inout3,$inout3
vpaddb $T2,$Z1,$Z2
vaesenclast $Z3, $inout4,$inout4
vpaddb $T2,$Z2,$Z3
vaesenclast $Hkey,$inout5,$inout5
vpaddb $T2,$Z3,$Hkey
add \$0x60,$ret
sub \$0x6,$len
jc .L6x_done
vmovups $inout0,-0x60($out) # save output
vpxor $rndkey,$T1,$inout0
vmovups $inout1,-0x50($out)
vmovdqa $Ii,$inout1 # 0 latency
vmovups $inout2,-0x40($out)
vmovdqa $Z1,$inout2 # 0 latency
vmovups $inout3,-0x30($out)
vmovdqa $Z2,$inout3 # 0 latency
vmovups $inout4,-0x20($out)
vmovdqa $Z3,$inout4 # 0 latency
vmovups $inout5,-0x10($out)
vmovdqa $Hkey,$inout5 # 0 latency
vmovdqu 0x20+8(%rsp),$Z3 # I[5]
jmp .Loop6x
.L6x_done:
vpxor 16+8(%rsp),$Xi,$Xi # modulo-scheduled
vpxor $Z0,$Xi,$Xi # modulo-scheduled
ret
.size _aesni_ctr32_ghash_6x,.-_aesni_ctr32_ghash_6x
___
######################################################################
#
# size_t aesni_gcm_[en|de]crypt(const void *inp, void *out, size_t len,
# const AES_KEY *key, unsigned char iv[16],
# struct { u128 Xi,H,Htbl[9]; } *Xip);
#
# aesni_gcm_decrypt entry: the return register $ret is zeroed first and the
# routine bails out to .Lgcm_dec_abort (returning 0) when len is below 0x60
# bytes, i.e. fewer than six 16-byte blocks. Otherwise the incoming stack
# pointer is kept in %rax and the six callee-saved general-purpose registers
# are pushed.
$code.=<<___;
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,\@function,6
.align 32
aesni_gcm_decrypt:
xor $ret,$ret
cmp \$0x60,$len # minimal accepted length
jb .Lgcm_dec_abort
lea (%rsp),%rax # save stack pointer
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
___
# Win64 only: reserve 0xa8 bytes below the pushed registers and save
# %xmm6-%xmm15 there, addressed relative to the saved stack pointer in %rax
# (%xmm6 at -0xd8, %xmm7 at -0xc8, ... %xmm15 at -0x48).
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,-0xd8(%rax)
movaps %xmm7,-0xc8(%rax)
movaps %xmm8,-0xb8(%rax)
movaps %xmm9,-0xa8(%rax)
movaps %xmm10,-0x98(%rax)
movaps %xmm11,-0x88(%rax)
movaps %xmm12,-0x78(%rax)
movaps %xmm13,-0x68(%rax)
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
.Lgcm_dec_body:
___
# Main body: load the counter block and Xi, carve out a 128-byte-aligned
# stack frame (shifted if necessary so it does not alias the key schedule),
# bias $key by +0x80 and $Xip by +0x40 for shorter displacements, then load
# the first six input blocks, byte-swap them and park them on the stack
# (I[5] stays in $Z3) before calling the stitched loop. $len is converted to
# a block count and $in0/$end0 are set from the input buffer. After the
# call the last six output blocks are stored and the byte-swapped $Xi is
# written back at -0x40($Xip), which undoes the +0x40 bias.
$code.=<<___;
vzeroupper
vmovdqu ($ivp),$T1 # input counter value
add \$-128,%rsp
mov 12($ivp),$counter
lea .Lbswap_mask(%rip),$const
lea -0x80($key),$in0 # borrow $in0
mov \$0xf80,$end0 # borrow $end0
vmovdqu ($Xip),$Xi # load Xi
and \$-128,%rsp # ensure stack alignment
vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
lea 0x80($key),$key # size optimization
lea 0x20+0x20($Xip),$Xip # size optimization
mov 0xf0-0x80($key),$rounds
vpshufb $Ii,$Xi,$Xi
and $end0,$in0
and %rsp,$end0
sub $in0,$end0
jc .Ldec_no_key_aliasing
cmp \$768,$end0
jnc .Ldec_no_key_aliasing
sub $end0,%rsp # avoid aliasing with key
.Ldec_no_key_aliasing:
vmovdqu 0x50($inp),$Z3 # I[5]
lea ($inp),$in0
vmovdqu 0x40($inp),$Z0
lea -0xc0($inp,$len),$end0
vmovdqu 0x30($inp),$Z1
shr \$4,$len
xor $ret,$ret
vmovdqu 0x20($inp),$Z2
vpshufb $Ii,$Z3,$Z3 # passed to _aesni_ctr32_ghash_6x
vmovdqu 0x10($inp),$T2
vpshufb $Ii,$Z0,$Z0
vmovdqu ($inp),$Hkey
vpshufb $Ii,$Z1,$Z1
vmovdqu $Z0,0x30(%rsp)
vpshufb $Ii,$Z2,$Z2
vmovdqu $Z1,0x40(%rsp)
vpshufb $Ii,$T2,$T2
vmovdqu $Z2,0x50(%rsp)
vpshufb $Ii,$Hkey,$Hkey
vmovdqu $T2,0x60(%rsp)
vmovdqu $Hkey,0x70(%rsp)
call _aesni_ctr32_ghash_6x
vmovups $inout0,-0x60($out) # save output
vmovups $inout1,-0x50($out)
vmovups $inout2,-0x40($out)
vmovups $inout3,-0x30($out)
vmovups $inout4,-0x20($out)
vmovups $inout5,-0x10($out)
vpshufb ($const),$Xi,$Xi # .Lbswap_mask
vmovdqu $Xi,-0x40($Xip) # output Xi
vzeroupper
___
# Win64 only: reload the callee-saved %xmm6-%xmm15 from the slots the
# aesni_gcm_decrypt prologue stored them in, addressed relative to the
# original stack pointer kept in %rax. Each register must come back from
# its own slot: %xmm6 from -0xd8, %xmm7 from -0xc8, and so on in 0x10
# steps up to %xmm15 at -0x48. (Previously %xmm7 was reloaded from the
# %xmm6 slot, corrupting the caller's %xmm7.)
$code.=<<___ if ($win64);
movaps -0xd8(%rax),%xmm6
movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
movaps -0x88(%rax),%xmm11
movaps -0x78(%rax),%xmm12
movaps -0x68(%rax),%xmm13
movaps -0x58(%rax),%xmm14
movaps -0x48(%rax),%xmm15
___
# aesni_gcm_decrypt epilogue: reload the six pushed general-purpose
# registers through %rax (which still holds the stack pointer value from
# function entry), restore %rsp from it, and return $ret in %rax.
# .Lgcm_dec_abort is also the target of the early length check, where $ret
# is still zero.
$code.=<<___;
mov -48(%rax),%r15
mov -40(%rax),%r14
mov -32(%rax),%r13
mov -24(%rax),%r12
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp # restore %rsp
.Lgcm_dec_abort:
mov $ret,%rax # return value
ret
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
# _aesni_ctr32_6x: internal helper that encrypts six counter blocks without
# any hashing. It derives $inout0-$inout5 from the counter in $T1 (advancing
# $counter by 6<<24 and diverting to .Lhandle_ctr32_2 on carry, where the
# counter is byte-swapped and incremented with 32-bit adds), runs
# $rounds-1 passes of .Loop_ctr32 over the key schedule, XORs the last
# round key with 0x60 bytes of input and applies it with vaesenclast,
# stores the 0x60 bytes of output and advances $inp and $out by 0x60.
# The next counter value is left in $T1. .Lhandle_ctr32_2 uses $Ii as the
# byte-swap mask without loading it itself.
#
# The same heredoc then opens the public aesni_gcm_encrypt entry point: $ret
# is zeroed, lengths below 0x60*3 bytes return 0 through .Lgcm_enc_abort,
# and otherwise the incoming stack pointer is kept in %rax and the six
# callee-saved general-purpose registers are pushed.
$code.=<<___;
.type _aesni_ctr32_6x,\@abi-omnipotent
.align 32
_aesni_ctr32_6x:
vmovdqu 0x00-0x80($key),$Z0 # borrow $Z0 for $rndkey
vmovdqu 0x20($const),$T2 # borrow $T2, .Lone_msb
lea -1($rounds),%r13
vmovups 0x10-0x80($key),$rndkey
lea 0x20-0x80($key),%r12
vpxor $Z0,$T1,$inout0
add \$`6<<24`,$counter
jc .Lhandle_ctr32_2
vpaddb $T2,$T1,$inout1
vpaddb $T2,$inout1,$inout2
vpxor $Z0,$inout1,$inout1
vpaddb $T2,$inout2,$inout3
vpxor $Z0,$inout2,$inout2
vpaddb $T2,$inout3,$inout4
vpxor $Z0,$inout3,$inout3
vpaddb $T2,$inout4,$inout5
vpxor $Z0,$inout4,$inout4
vpaddb $T2,$inout5,$T1
vpxor $Z0,$inout5,$inout5
jmp .Loop_ctr32
.align 16
.Loop_ctr32:
vaesenc $rndkey,$inout0,$inout0
vaesenc $rndkey,$inout1,$inout1
vaesenc $rndkey,$inout2,$inout2
vaesenc $rndkey,$inout3,$inout3
vaesenc $rndkey,$inout4,$inout4
vaesenc $rndkey,$inout5,$inout5
vmovups (%r12),$rndkey
lea 0x10(%r12),%r12
dec %r13d
jnz .Loop_ctr32
vmovdqu (%r12),$Hkey # last round key
vaesenc $rndkey,$inout0,$inout0
vpxor 0x00($inp),$Hkey,$Z0
vaesenc $rndkey,$inout1,$inout1
vpxor 0x10($inp),$Hkey,$Z1
vaesenc $rndkey,$inout2,$inout2
vpxor 0x20($inp),$Hkey,$Z2
vaesenc $rndkey,$inout3,$inout3
vpxor 0x30($inp),$Hkey,$Xi
vaesenc $rndkey,$inout4,$inout4
vpxor 0x40($inp),$Hkey,$T2
vaesenc $rndkey,$inout5,$inout5
vpxor 0x50($inp),$Hkey,$Hkey
lea 0x60($inp),$inp
vaesenclast $Z0,$inout0,$inout0
vaesenclast $Z1,$inout1,$inout1
vaesenclast $Z2,$inout2,$inout2
vaesenclast $Xi,$inout3,$inout3
vaesenclast $T2,$inout4,$inout4
vaesenclast $Hkey,$inout5,$inout5
vmovups $inout0,0x00($out)
vmovups $inout1,0x10($out)
vmovups $inout2,0x20($out)
vmovups $inout3,0x30($out)
vmovups $inout4,0x40($out)
vmovups $inout5,0x50($out)
lea 0x60($out),$out
ret
.align 32
.Lhandle_ctr32_2:
vpshufb $Ii,$T1,$Z2 # byte-swap counter
vmovdqu 0x30($const),$Z1 # borrow $Z1, .Ltwo_lsb
vpaddd 0x40($const),$Z2,$inout1 # .Lone_lsb
vpaddd $Z1,$Z2,$inout2
vpaddd $Z1,$inout1,$inout3
vpshufb $Ii,$inout1,$inout1
vpaddd $Z1,$inout2,$inout4
vpshufb $Ii,$inout2,$inout2
vpxor $Z0,$inout1,$inout1
vpaddd $Z1,$inout3,$inout5
vpshufb $Ii,$inout3,$inout3
vpxor $Z0,$inout2,$inout2
vpaddd $Z1,$inout4,$T1 # byte-swapped next counter value
vpshufb $Ii,$inout4,$inout4
vpxor $Z0,$inout3,$inout3
vpshufb $Ii,$inout5,$inout5
vpxor $Z0,$inout4,$inout4
vpshufb $Ii,$T1,$T1 # next counter value
vpxor $Z0,$inout5,$inout5
jmp .Loop_ctr32
.size _aesni_ctr32_6x,.-_aesni_ctr32_6x
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,\@function,6
.align 32
aesni_gcm_encrypt:
xor $ret,$ret
cmp \$0x60*3,$len # minimal accepted length
jb .Lgcm_enc_abort
lea (%rsp),%rax # save stack pointer
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
___
# Win64 only: reserve 0xa8 bytes below the pushed registers and save
# %xmm6-%xmm15 there, addressed relative to the saved stack pointer in %rax
# (%xmm6 at -0xd8, %xmm7 at -0xc8, ... %xmm15 at -0x48).
$code.=<<___ if ($win64);
lea -0xa8(%rsp),%rsp
movaps %xmm6,-0xd8(%rax)
movaps %xmm7,-0xc8(%rax)
movaps %xmm8,-0xb8(%rax)
movaps %xmm9,-0xa8(%rax)
movaps %xmm10,-0x98(%rax)
movaps %xmm11,-0x88(%rax)
movaps %xmm12,-0x78(%rax)
movaps %xmm13,-0x68(%rax)
movaps %xmm14,-0x58(%rax)
movaps %xmm15,-0x48(%rax)
.Lgcm_enc_body:
___
# Main encrypt body. After the same counter/stack/key setup as in the
# decrypt path, $in0 is pointed at the output buffer and $end0 at
# out+len-0xc0, and $len is converted to a block count. The first six
# blocks are encrypted with _aesni_ctr32_6x and their byte-swapped
# ciphertext is parked on the stack (the sixth stays in $Z3) as hash input
# for the stitched loop; a second _aesni_ctr32_6x call encrypts the next
# six blocks. $len is then reduced by 12 blocks, $ret is preset to 0x60*2
# bytes, Xi is loaded and byte-swapped, and _aesni_ctr32_ghash_6x runs.
# On return the last six output blocks are stored, byte-swapped copies
# are kept in $inout0-$inout5 (with $inout0 spilled to 0x10(%rsp)), and the
# first operands for the trailing GHASH are loaded.
$code.=<<___;
vzeroupper
vmovdqu ($ivp),$T1 # input counter value
add \$-128,%rsp
mov 12($ivp),$counter
lea .Lbswap_mask(%rip),$const
lea -0x80($key),$in0 # borrow $in0
mov \$0xf80,$end0 # borrow $end0
lea 0x80($key),$key # size optimization
vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
and \$-128,%rsp # ensure stack alignment
mov 0xf0-0x80($key),$rounds
and $end0,$in0
and %rsp,$end0
sub $in0,$end0
jc .Lenc_no_key_aliasing
cmp \$768,$end0
jnc .Lenc_no_key_aliasing
sub $end0,%rsp # avoid aliasing with key
.Lenc_no_key_aliasing:
lea ($out),$in0
lea -0xc0($out,$len),$end0
shr \$4,$len
call _aesni_ctr32_6x
vpshufb $Ii,$inout0,$Xi # save bswapped output on stack
vpshufb $Ii,$inout1,$T2
vmovdqu $Xi,0x70(%rsp)
vpshufb $Ii,$inout2,$Z0
vmovdqu $T2,0x60(%rsp)
vpshufb $Ii,$inout3,$Z1
vmovdqu $Z0,0x50(%rsp)
vpshufb $Ii,$inout4,$Z2
vmovdqu $Z1,0x40(%rsp)
vpshufb $Ii,$inout5,$Z3 # passed to _aesni_ctr32_ghash_6x
vmovdqu $Z2,0x30(%rsp)
call _aesni_ctr32_6x
vmovdqu ($Xip),$Xi # load Xi
lea 0x20+0x20($Xip),$Xip # size optimization
sub \$12,$len
mov \$0x60*2,$ret
vpshufb $Ii,$Xi,$Xi
call _aesni_ctr32_ghash_6x
vmovdqu 0x20(%rsp),$Z3 # I[5]
vmovdqu ($const),$Ii # borrow $Ii for .Lbswap_mask
vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
vpunpckhqdq $Z3,$Z3,$T1
vmovdqu 0x20-0x20($Xip),$rndkey # borrow $rndkey for $HK
vmovups $inout0,-0x60($out) # save output
vpshufb $Ii,$inout0,$inout0 # but keep bswapped copy
vpxor $Z3,$T1,$T1
vmovups $inout1,-0x50($out)
vpshufb $Ii,$inout1,$inout1
vmovups $inout2,-0x40($out)
vpshufb $Ii,$inout2,$inout2
vmovups $inout3,-0x30($out)
vpshufb $Ii,$inout3,$inout3
vmovups $inout4,-0x20($out)
vpshufb $Ii,$inout4,$inout4
vmovups $inout5,-0x10($out)
vpshufb $Ii,$inout5,$inout5
vmovdqu $inout0,0x10(%rsp) # free $inout0
___
# Trailing GHASH for aesni_gcm_encrypt. Two back-to-back six-block
# Karatsuba multiply-and-reduce passes fold the remaining hash inputs into
# $Xi: first the six byte-swapped blocks still parked on the stack
# (0x20..0x70(%rsp), with Xi accumulated into I[0]), then the six
# byte-swapped output blocks kept in $inout5..$inout1 plus the copy of
# $inout0 spilled at 0x10(%rsp). The reduction of the first pass is
# interleaved with the multiplications of the second.
# Within this scope $rndkey is reused as $HK and $inout0 as the scratch
# register $T3.
{ my ($HK,$T3)=($rndkey,$inout0);
$code.=<<___;
vmovdqu 0x30(%rsp),$Z2 # I[4]
vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
vpunpckhqdq $Z2,$Z2,$T2
vpclmulqdq \$0x00,$Hkey,$Z3,$Z1
vpxor $Z2,$T2,$T2
vpclmulqdq \$0x11,$Hkey,$Z3,$Z3
vpclmulqdq \$0x00,$HK,$T1,$T1
vmovdqu 0x40(%rsp),$T3 # I[3]
vpclmulqdq \$0x00,$Ii,$Z2,$Z0
vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
vpxor $Z1,$Z0,$Z0
vpunpckhqdq $T3,$T3,$Z1
vpclmulqdq \$0x11,$Ii,$Z2,$Z2
vpxor $T3,$Z1,$Z1
vpxor $Z3,$Z2,$Z2
vpclmulqdq \$0x10,$HK,$T2,$T2
vmovdqu 0x50-0x20($Xip),$HK
vpxor $T1,$T2,$T2
vmovdqu 0x50(%rsp),$T1 # I[2]
vpclmulqdq \$0x00,$Hkey,$T3,$Z3
vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
vpxor $Z0,$Z3,$Z3
vpunpckhqdq $T1,$T1,$Z0
vpclmulqdq \$0x11,$Hkey,$T3,$T3
vpxor $T1,$Z0,$Z0
vpxor $Z2,$T3,$T3
vpclmulqdq \$0x00,$HK,$Z1,$Z1
vpxor $T2,$Z1,$Z1
vmovdqu 0x60(%rsp),$T2 # I[1]
vpclmulqdq \$0x00,$Ii,$T1,$Z2
vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
vpxor $Z3,$Z2,$Z2
vpunpckhqdq $T2,$T2,$Z3
vpclmulqdq \$0x11,$Ii,$T1,$T1
vpxor $T2,$Z3,$Z3
vpxor $T3,$T1,$T1
vpclmulqdq \$0x10,$HK,$Z0,$Z0
vmovdqu 0x80-0x20($Xip),$HK
vpxor $Z1,$Z0,$Z0
vpxor 0x70(%rsp),$Xi,$Xi # accumulate I[0]
vpclmulqdq \$0x00,$Hkey,$T2,$Z1
vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
vpunpckhqdq $Xi,$Xi,$T3
vpxor $Z2,$Z1,$Z1
vpclmulqdq \$0x11,$Hkey,$T2,$T2
vpxor $Xi,$T3,$T3
vpxor $T1,$T2,$T2
vpclmulqdq \$0x00,$HK,$Z3,$Z3
vpxor $Z0,$Z3,$Z0
vpclmulqdq \$0x00,$Ii,$Xi,$Z2
vmovdqu 0x00-0x20($Xip),$Hkey # $Hkey^1
vpunpckhqdq $inout5,$inout5,$T1
vpclmulqdq \$0x11,$Ii,$Xi,$Xi
vpxor $inout5,$T1,$T1
vpxor $Z1,$Z2,$Z1
vpclmulqdq \$0x10,$HK,$T3,$T3
vmovdqu 0x20-0x20($Xip),$HK
vpxor $T2,$Xi,$Z3
vpxor $Z0,$T3,$Z2
vmovdqu 0x10-0x20($Xip),$Ii # borrow $Ii for $Hkey^2
vpxor $Z1,$Z3,$T3 # aggregated Karatsuba post-processing
vpclmulqdq \$0x00,$Hkey,$inout5,$Z0
vpxor $T3,$Z2,$Z2
vpunpckhqdq $inout4,$inout4,$T2
vpclmulqdq \$0x11,$Hkey,$inout5,$inout5
vpxor $inout4,$T2,$T2
vpslldq \$8,$Z2,$T3
vpclmulqdq \$0x00,$HK,$T1,$T1
vpxor $T3,$Z1,$Xi
vpsrldq \$8,$Z2,$Z2
vpxor $Z2,$Z3,$Z3
vpclmulqdq \$0x00,$Ii,$inout4,$Z1
vmovdqu 0x30-0x20($Xip),$Hkey # $Hkey^3
vpxor $Z0,$Z1,$Z1
vpunpckhqdq $inout3,$inout3,$T3
vpclmulqdq \$0x11,$Ii,$inout4,$inout4
vpxor $inout3,$T3,$T3
vpxor $inout5,$inout4,$inout4
vpalignr \$8,$Xi,$Xi,$inout5 # 1st phase
vpclmulqdq \$0x10,$HK,$T2,$T2
vmovdqu 0x50-0x20($Xip),$HK
vpxor $T1,$T2,$T2
vpclmulqdq \$0x00,$Hkey,$inout3,$Z0
vmovdqu 0x40-0x20($Xip),$Ii # borrow $Ii for $Hkey^4
vpxor $Z1,$Z0,$Z0
vpunpckhqdq $inout2,$inout2,$T1
vpclmulqdq \$0x11,$Hkey,$inout3,$inout3
vpxor $inout2,$T1,$T1
vpxor $inout4,$inout3,$inout3
vxorps 0x10(%rsp),$Z3,$Z3 # accumulate $inout0
vpclmulqdq \$0x00,$HK,$T3,$T3
vpxor $T2,$T3,$T3
vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
vxorps $inout5,$Xi,$Xi
vpclmulqdq \$0x00,$Ii,$inout2,$Z1
vmovdqu 0x60-0x20($Xip),$Hkey # $Hkey^5
vpxor $Z0,$Z1,$Z1
vpunpckhqdq $inout1,$inout1,$T2
vpclmulqdq \$0x11,$Ii,$inout2,$inout2
vpxor $inout1,$T2,$T2
vpalignr \$8,$Xi,$Xi,$inout5 # 2nd phase
vpxor $inout3,$inout2,$inout2
vpclmulqdq \$0x10,$HK,$T1,$T1
vmovdqu 0x80-0x20($Xip),$HK
vpxor $T3,$T1,$T1
vxorps $Z3,$inout5,$inout5
vpclmulqdq \$0x10,0x10($const),$Xi,$Xi
vxorps $inout5,$Xi,$Xi
vpclmulqdq \$0x00,$Hkey,$inout1,$Z0
vmovdqu 0x70-0x20($Xip),$Ii # borrow $Ii for $Hkey^6
vpxor $Z1,$Z0,$Z0
vpunpckhqdq $Xi,$Xi,$T3
vpclmulqdq \$0x11,$Hkey,$inout1,$inout1
vpxor $Xi,$T3,$T3
vpxor $inout2,$inout1,$inout1
vpclmulqdq \$0x00,$HK,$T2,$T2
vpxor $T1,$T2,$T2
vpclmulqdq \$0x00,$Ii,$Xi,$Z1
vpclmulqdq \$0x11,$Ii,$Xi,$Z3
vpxor $Z0,$Z1,$Z1
vpclmulqdq \$0x10,$HK,$T3,$Z2
vpxor $inout1,$Z3,$Z3
vpxor $T2,$Z2,$Z2
vpxor $Z1,$Z3,$Z0 # aggregated Karatsuba post-processing
vpxor $Z0,$Z2,$Z2
vpslldq \$8,$Z2,$T1
vmovdqu 0x10($const),$Hkey # .Lpoly
vpsrldq \$8,$Z2,$Z2
vpxor $T1,$Z1,$Xi
vpxor $Z2,$Z3,$Z3
vpalignr \$8,$Xi,$Xi,$T2 # 1st phase
vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
vpxor $T2,$Xi,$Xi
vpalignr \$8,$Xi,$Xi,$T2 # 2nd phase
vpclmulqdq \$0x10,$Hkey,$Xi,$Xi
vpxor $Z3,$T2,$T2
vpxor $T2,$Xi,$Xi
___
}
# aesni_gcm_encrypt epilogue: byte-swap the final $Xi and store it at
# -0x40($Xip) (undoing the +0x40 bias applied to $Xip earlier).
$code.=<<___;
vpshufb ($const),$Xi,$Xi # .Lbswap_mask
vmovdqu $Xi,-0x40($Xip) # output Xi
vzeroupper
___
# Win64 only: reload callee-saved %xmm6-%xmm15 from the slots below the
# original stack pointer (still in %rax) where the prologue saved them.
$code.=<<___ if ($win64);
movaps -0xd8(%rax),%xmm6
movaps -0xc8(%rax),%xmm7
movaps -0xb8(%rax),%xmm8
movaps -0xa8(%rax),%xmm9
movaps -0x98(%rax),%xmm10
movaps -0x88(%rax),%xmm11
movaps -0x78(%rax),%xmm12
movaps -0x68(%rax),%xmm13
movaps -0x58(%rax),%xmm14
movaps -0x48(%rax),%xmm15
___
# Reload the six pushed general-purpose registers through %rax, restore
# %rsp, and return $ret. .Lgcm_enc_abort is also the target of the early
# length check, where $ret is still zero.
$code.=<<___;
mov -48(%rax),%r15
mov -40(%rax),%r14
mov -32(%rax),%r13
mov -24(%rax),%r12
mov -16(%rax),%rbp
mov -8(%rax),%rbx
lea (%rax),%rsp # restore %rsp
.Lgcm_enc_abort:
mov $ret,%rax # return value
ret
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
___
# Constant table. The code keeps the address of .Lbswap_mask in $const and
# reaches the other 16-byte entries by fixed displacement, so the order
# and size of the entries must not change:
#   0x00 .Lbswap_mask   0x10 .Lpoly   0x20 .Lone_msb
#   0x30 .Ltwo_lsb      0x40 .Lone_lsb
$code.=<<___;
.align 64
.Lbswap_mask:
.byte 15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0
.Lpoly:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0xc2
.Lone_msb:
.byte 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1
.Ltwo_lsb:
.byte 2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.Lone_lsb:
.byte 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
.asciz "AES-NI GCM module for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align 64
___
# Win64 only: structured-exception handler shared by aesni_gcm_encrypt and
# aesni_gcm_decrypt, plus the .pdata/.xdata records that register it.
#
# HandlerData holds two RVAs per function: the "body" label that follows
# the prologue and the "abort" label that follows the epilogue. When the
# faulting Rip lies between the two, the handler reads the original stack
# pointer from context->Rax, copies the six saved general-purpose registers
# and the ten saved xmm registers (starting at -0xd8 from that pointer) back
# into the CONTEXT record, and then falls into the common tail, which
# restores Rsp/Rsi/Rdi in the context, copies the context to
# disp->ContextRecord, calls RtlVirtualUnwind and returns
# ExceptionContinueSearch.
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___
.extern __imp_RtlVirtualUnwind
.type gcm_se_handler,\@abi-omnipotent
.align 16
gcm_se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # prologue label
cmp %r10,%rbx # context->Rip<prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
mov 120($context),%rax # pull context->Rax
mov -48(%rax),%r15
mov -40(%rax),%r14
mov -32(%rax),%r13
mov -24(%rax),%r12
mov -16(%rax),%rbp
mov -8(%rax),%rbx
mov %r15,240($context)
mov %r14,232($context)
mov %r13,224($context)
mov %r12,216($context)
mov %rbp,160($context)
mov %rbx,144($context)
lea -0xd8(%rax),%rsi # %xmm save area
lea 512($context),%rdi # & context.Xmm6
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
.long 0xa548f3fc # cld; rep movsq
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size gcm_se_handler,.-gcm_se_handler
.section .pdata
.align 4
.rva .LSEH_begin_aesni_gcm_decrypt
.rva .LSEH_end_aesni_gcm_decrypt
.rva .LSEH_gcm_dec_info
.rva .LSEH_begin_aesni_gcm_encrypt
.rva .LSEH_end_aesni_gcm_encrypt
.rva .LSEH_gcm_enc_info
.section .xdata
.align 8
.LSEH_gcm_dec_info:
.byte 9,0,0,0
.rva gcm_se_handler
.rva .Lgcm_dec_body,.Lgcm_dec_abort
.LSEH_gcm_enc_info:
.byte 9,0,0,0
.rva gcm_se_handler
.rva .Lgcm_enc_body,.Lgcm_enc_abort
___
}
}}} else {{{
# Fallback for $avx <= 1: emit stub versions of both entry points that
# simply return 0, which is the same value the real routines return when
# they process nothing. With $avx set to 2 earlier in this file, this
# branch is not taken.
$code=<<___; # assembler is too old
.text
.globl aesni_gcm_encrypt
.type aesni_gcm_encrypt,\@abi-omnipotent
aesni_gcm_encrypt:
xor %eax,%eax
ret
.size aesni_gcm_encrypt,.-aesni_gcm_encrypt
.globl aesni_gcm_decrypt
.type aesni_gcm_decrypt,\@abi-omnipotent
aesni_gcm_decrypt:
xor %eax,%eax
ret
.size aesni_gcm_decrypt,.-aesni_gcm_decrypt
___
}}}
# Evaluate every `...` expression embedded in the generated text (for
# example `6<<24`) and substitute its value, then send the result to the
# translator.
$code =~ s/\`([^\`]*)\`/eval($1)/gem;
print $code;
# STDOUT is the pipe to x86_64-xlate.pl; closing it waits for the child.
# Report a write error or a failing translator instead of exiting
# successfully with truncated or missing output.
close STDOUT or die "error closing STDOUT: $! (child status $?)";