boringssl/crypto/bn/asm/rsaz-x86_64.pl

#!/usr/bin/env perl
##############################################################################
# #
# Copyright (c) 2012, Intel Corporation #
# #
# All rights reserved. #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions are #
# met: #
# #
# * Redistributions of source code must retain the above copyright #
# notice, this list of conditions and the following disclaimer. #
# #
# * Redistributions in binary form must reproduce the above copyright #
# notice, this list of conditions and the following disclaimer in the #
# documentation and/or other materials provided with the #
# distribution. #
# #
# * Neither the name of the Intel Corporation nor the names of its #
# contributors may be used to endorse or promote products derived from #
# this software without specific prior written permission. #
# #
# #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION "AS IS" AND ANY #
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
##############################################################################
# Developers and authors: #
# Shay Gueron (1, 2), and Vlad Krasnov (1) #
# (1) Intel Architecture Group, Microprocessor and Chipset Development, #
# Israel Development Center, Haifa, Israel #
# (2) University of Haifa #
##############################################################################
# Reference: #
# [1] S. Gueron, "Efficient Software Implementations of Modular #
# Exponentiation", http://eprint.iacr.org/2011/239 #
# [2] S. Gueron, V. Krasnov. "Speeding up Big-Numbers Squaring". #
# IEEE Proceedings of 9th International Conference on Information #
# Technology: New Generations (ITNG 2012), 821-823 (2012). #
# [3] S. Gueron, Efficient Software Implementations of Modular Exponentiation#
# Journal of Cryptographic Engineering 2:31-43 (2012). #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
# resistant 512-bit and 1024-bit modular exponentiation for optimizing #
# RSA1024 and RSA2048 on x86_64 platforms", #
# http://rt.openssl.org/Ticket/Display.html?id=2582&user=guest&pass=guest#
##############################################################################
# While the original submission covers both 512- and 1024-bit
# exponentiation, this module is limited to the 512-bit version only (and
# as such accelerates RSA1024 signing). This is because the improvement
# for longer keys is not high enough to justify the effort; the highest
# measured was ~5% on Westmere. [This is relative to OpenSSL 1.0.2,
# upcoming at the time of this writing.] Nor does this module implement
# a "monolithic" all-in-one exponentiation subroutine; it adheres to a
# more modular mixture of C and assembly. It is also optimized for
# processors other than the Intel Core family (see the table below for
# improvement coefficients).
# <appro@openssl.org>
#
# RSA1024 sign/sec     this/original  |this/rsax(*)  this/fips(*)
# --------------------+---------------------------------------------
# Opteron                  +13%       |    +5%           +20%
# Bulldozer                 -0%       |    -1%           +10%
# P4                       +11%       |    +7%            +8%
# Westmere                  +5%       |   +14%           +17%
# Sandy Bridge              +2%       |   +12%           +29%
# Ivy Bridge                +1%       |   +11%           +35%
# Haswell(**)               -0%       |   +12%           +39%
# Atom                     +13%       |   +11%            +4%
# VIA Nano                 +70%       |    +9%           +25%
#
# (*) rsax engine and fips numbers are presented for reference
# purposes;
# (**) MULX was attempted, but found to give only marginal improvement;
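#
# The entry points below are meant to be driven from C (this is the
# "modular mixture of C and assembly" mentioned above; the driver lives in
# the rsaz_exp.c wrapper in OpenSSL/BoringSSL trees of this vintage).
# Purely as an illustration -- the function name mod_exp_512, the 4-bit
# window, the exponent layout and the handling of the Montgomery constants
# are assumptions made for this comment, not a copy of that wrapper -- a
# fixed-window modular exponentiation could be composed from these routines
# roughly like so (needs <stdint.h>, <string.h> and the prototypes sketched
# next to each entry point below):
#
#	/* one_mont = R mod m (1 in Montgomery form), RR = R^2 mod m,
#	 * n0 = -m^-1 mod 2^64; all big numbers are 8x64-bit limb arrays. */
#	static void mod_exp_512(uint64_t out[8], const uint64_t base[8],
#	                        const uint8_t exp[64], const uint64_t m[8],
#	                        uint64_t n0, const uint64_t RR[8],
#	                        const uint64_t one_mont[8]) {
#		uint64_t table[16 * 8], a[8], r[8];
#
#		rsaz_512_mul(a, base, RR, m, n0);      /* a = base*R mod m      */
#		memcpy(r, one_mont, sizeof(r));
#		rsaz_512_scatter4(table, r, 0);        /* table[0] = 1*R        */
#		for (int i = 1; i < 16; i++)           /* table[i] = base^i * R */
#			rsaz_512_mul_scatter4(r, a, m, n0, table, i);
#
#		rsaz_512_gather4(r, table, exp[63] >> 4);    /* top 4-bit digit */
#		for (int i = 126; i >= 0; i--) {
#			int d = (exp[i / 2] >> (4 * (i & 1))) & 0xf;
#			rsaz_512_sqr(r, r, m, n0, 4);    /* 4 Montgomery squarings */
#			rsaz_512_mul_gather4(r, r, table, m, n0, d);
#		}
#		rsaz_512_mul_by_one(out, r, m, n0);    /* drop the R factor     */
#	}
#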
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";
open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT=*OUT;
# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$addx = 1;
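# Even with $addx hard-coded, the MULX/ADCX/ADOX paths remain runtime-gated:
# each entry point below tests OPENSSL_ia32cap_P[2] (the CPUID.(EAX=7,ECX=0).EBX
# word) against the mask 0x80100, i.e. bit 8 (BMI2, which provides MULX) and
# bit 19 (ADX, which provides ADCX/ADOX), and only takes the *x code path when
# both are set.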
($out, $inp, $mod) = ("%rdi", "%rsi", "%rbp"); # common internal API
{
my ($out,$inp,$mod,$n0,$times) = ("%rdi","%rsi","%rdx","%rcx","%r8d");
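# A rough C-level view of this entry point (parameter names and exact types
# are illustrative, not lifted from the C header):
#
#	void rsaz_512_sqr(uint64_t out[8], const uint64_t inp[8],
#	                  const uint64_t mod[8], uint64_t n0, int times);
#
# It performs |times| consecutive Montgomery squarings of |inp| modulo |mod|
# (with n0 = -mod^-1 mod 2^64); a value in Montgomery form stays in
# Montgomery form.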
$code.=<<___;
.text
.extern OPENSSL_ia32cap_P
.globl rsaz_512_sqr
.type rsaz_512_sqr,\@function,5
.align 32
rsaz_512_sqr: # 25-29% faster than rsaz_512_mul
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
subq \$128+24, %rsp
.Lsqr_body:
movq $mod, %rbp # common argument
movq ($inp), %rdx
movq 8($inp), %rax
movq $n0, 128(%rsp)
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Loop_sqrx
___
$code.=<<___;
jmp .Loop_sqr
.align 32
.Loop_sqr:
movl $times,128+8(%rsp)
#first iteration
movq %rdx, %rbx
mulq %rdx
movq %rax, %r8
movq 16($inp), %rax
movq %rdx, %r9
mulq %rbx
addq %rax, %r9
movq 24($inp), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($inp), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($inp), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($inp), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($inp), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq %rbx, %rax
movq %rdx, %r15
adcq \$0, %r15
addq %r8, %r8 #shlq \$1, %r8
movq %r9, %rcx
adcq %r9, %r9 #shld \$1, %r8, %r9
mulq %rax
movq %rax, (%rsp)
addq %rdx, %r8
adcq \$0, %r9
movq %r8, 8(%rsp)
shrq \$63, %rcx
#second iteration
movq 8($inp), %r8
movq 16($inp), %rax
mulq %r8
addq %rax, %r10
movq 24($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r11
movq 32($inp), %rax
adcq \$0, %rdx
addq %rbx, %r11
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r12
movq 40($inp), %rax
adcq \$0, %rdx
addq %rbx, %r12
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r13
movq 48($inp), %rax
adcq \$0, %rdx
addq %rbx, %r13
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r14
movq 56($inp), %rax
adcq \$0, %rdx
addq %rbx, %r14
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r8
addq %rax, %r15
movq %r8, %rax
adcq \$0, %rdx
addq %rbx, %r15
movq %rdx, %r8
movq %r10, %rdx
adcq \$0, %r8
add %rdx, %rdx
lea (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
movq %r11, %rbx
adcq %r11, %r11 #shld \$1, %r10, %r11
mulq %rax
addq %rax, %r9
adcq %rdx, %r10
adcq \$0, %r11
movq %r9, 16(%rsp)
movq %r10, 24(%rsp)
shrq \$63, %rbx
#third iteration
movq 16($inp), %r9
movq 24($inp), %rax
mulq %r9
addq %rax, %r12
movq 32($inp), %rax
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
addq %rax, %r13
movq 40($inp), %rax
adcq \$0, %rdx
addq %rcx, %r13
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
addq %rax, %r14
movq 48($inp), %rax
adcq \$0, %rdx
addq %rcx, %r14
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
movq %r12, %r10
lea (%rbx,%r12,2), %r12 #shld \$1, %rbx, %r12
addq %rax, %r15
movq 56($inp), %rax
adcq \$0, %rdx
addq %rcx, %r15
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r9
shrq \$63, %r10
addq %rax, %r8
movq %r9, %rax
adcq \$0, %rdx
addq %rcx, %r8
movq %rdx, %r9
adcq \$0, %r9
movq %r13, %rcx
leaq (%r10,%r13,2), %r13 #shld \$1, %r12, %r13
mulq %rax
addq %rax, %r11
adcq %rdx, %r12
adcq \$0, %r13
movq %r11, 32(%rsp)
movq %r12, 40(%rsp)
shrq \$63, %rcx
#fourth iteration
movq 24($inp), %r10
movq 32($inp), %rax
mulq %r10
addq %rax, %r14
movq 40($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r10
addq %rax, %r15
movq 48($inp), %rax
adcq \$0, %rdx
addq %rbx, %r15
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r10
movq %r14, %r12
leaq (%rcx,%r14,2), %r14 #shld \$1, %rcx, %r14
addq %rax, %r8
movq 56($inp), %rax
adcq \$0, %rdx
addq %rbx, %r8
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r10
shrq \$63, %r12
addq %rax, %r9
movq %r10, %rax
adcq \$0, %rdx
addq %rbx, %r9
movq %rdx, %r10
adcq \$0, %r10
movq %r15, %rbx
leaq (%r12,%r15,2),%r15 #shld \$1, %r14, %r15
mulq %rax
addq %rax, %r13
adcq %rdx, %r14
adcq \$0, %r15
movq %r13, 48(%rsp)
movq %r14, 56(%rsp)
shrq \$63, %rbx
#fifth iteration
movq 32($inp), %r11
movq 40($inp), %rax
mulq %r11
addq %rax, %r8
movq 48($inp), %rax
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r11
addq %rax, %r9
movq 56($inp), %rax
adcq \$0, %rdx
movq %r8, %r12
leaq (%rbx,%r8,2), %r8 #shld \$1, %rbx, %r8
addq %rcx, %r9
movq %rdx, %rcx
adcq \$0, %rcx
mulq %r11
shrq \$63, %r12
addq %rax, %r10
movq %r11, %rax
adcq \$0, %rdx
addq %rcx, %r10
movq %rdx, %r11
adcq \$0, %r11
movq %r9, %rcx
leaq (%r12,%r9,2), %r9 #shld \$1, %r8, %r9
mulq %rax
addq %rax, %r15
adcq %rdx, %r8
adcq \$0, %r9
movq %r15, 64(%rsp)
movq %r8, 72(%rsp)
shrq \$63, %rcx
#sixth iteration
movq 40($inp), %r12
movq 48($inp), %rax
mulq %r12
addq %rax, %r10
movq 56($inp), %rax
movq %rdx, %rbx
adcq \$0, %rbx
mulq %r12
addq %rax, %r11
movq %r12, %rax
movq %r10, %r15
leaq (%rcx,%r10,2), %r10 #shld \$1, %rcx, %r10
adcq \$0, %rdx
shrq \$63, %r15
addq %rbx, %r11
movq %rdx, %r12
adcq \$0, %r12
movq %r11, %rbx
leaq (%r15,%r11,2), %r11 #shld \$1, %r10, %r11
mulq %rax
addq %rax, %r9
adcq %rdx, %r10
adcq \$0, %r11
movq %r9, 80(%rsp)
movq %r10, 88(%rsp)
#seventh iteration
movq 48($inp), %r13
movq 56($inp), %rax
mulq %r13
addq %rax, %r12
movq %r13, %rax
movq %rdx, %r13
adcq \$0, %r13
xorq %r14, %r14
shlq \$1, %rbx
adcq %r12, %r12 #shld \$1, %rbx, %r12
adcq %r13, %r13 #shld \$1, %r12, %r13
adcq %r14, %r14 #shld \$1, %r13, %r14
mulq %rax
addq %rax, %r11
adcq %rdx, %r12
adcq \$0, %r13
movq %r11, 96(%rsp)
movq %r12, 104(%rsp)
#eighth iteration
movq 56($inp), %rax
mulq %rax
addq %rax, %r13
adcq \$0, %rdx
addq %rdx, %r14
movq %r13, 112(%rsp)
movq %r14, 120(%rsp)
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, %rdx
movq %r9, %rax
movl 128+8(%rsp), $times
movq $out, $inp
decl $times
jnz .Loop_sqr
___
if ($addx) {
$code.=<<___;
jmp .Lsqr_tail
.align 32
.Loop_sqrx:
movl $times,128+8(%rsp)
movq $out, %xmm0 # off-load
movq %rbp, %xmm1 # off-load
#first iteration
mulx %rax, %r8, %r9
mulx 16($inp), %rcx, %r10
xor %rbp, %rbp # cf=0, of=0
mulx 24($inp), %rax, %r11
adcx %rcx, %r9
mulx 32($inp), %rcx, %r12
adcx %rax, %r10
mulx 40($inp), %rax, %r13
adcx %rcx, %r11
.byte 0xc4,0x62,0xf3,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($inp), %rcx, %r14
adcx %rax, %r12
adcx %rcx, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r15
adcx %rax, %r14
adcx %rbp, %r15 # %rbp is 0
mov %r9, %rcx
shld \$1, %r8, %r9
shl \$1, %r8
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rdx, %r8
mov 8($inp), %rdx
adcx %rbp, %r9
mov %rax, (%rsp)
mov %r8, 8(%rsp)
#second iteration
mulx 16($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r8
adox $out, %r11
adcx %r8, %r12
mulx 32($inp), %rax, %rbx
adox %rax, %r12
adcx %rbx, %r13
mulx 40($inp), $out, %r8
adox $out, %r13
adcx %r8, %r14
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
.byte 0xc4,0x62,0xc3,0xf6,0x86,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r8
adox $out, %r15
adcx %rbp, %r8
adox %rbp, %r8
mov %r11, %rbx
shld \$1, %r10, %r11
shld \$1, %rcx, %r10
xor %ebp,%ebp
mulx %rdx, %rax, %rcx
mov 16($inp), %rdx
adcx %rax, %r9
adcx %rcx, %r10
adcx %rbp, %r11
mov %r9, 16(%rsp)
.byte 0x4c,0x89,0x94,0x24,0x18,0x00,0x00,0x00 # mov %r10, 24(%rsp)
#third iteration
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x18,0x00,0x00,0x00 # mulx 24($inp), $out, %r9
adox $out, %r12
adcx %r9, %r13
mulx 32($inp), %rax, %rcx
adox %rax, %r13
adcx %rcx, %r14
mulx 40($inp), $out, %r9
adox $out, %r14
adcx %r9, %r15
.byte 0xc4,0xe2,0xfb,0xf6,0x8e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rcx
adox %rax, %r15
adcx %rcx, %r8
.byte 0xc4,0x62,0xc3,0xf6,0x8e,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r9
adox $out, %r8
adcx %rbp, %r9
adox %rbp, %r9
mov %r13, %rcx
shld \$1, %r12, %r13
shld \$1, %rbx, %r12
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r11
adcx %rdx, %r12
mov 24($inp), %rdx
adcx %rbp, %r13
mov %r11, 32(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x28,0x00,0x00,0x00 # mov %r12, 40(%rsp)
#fourth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x20,0x00,0x00,0x00 # mulx 32($inp), %rax, %rbx
adox %rax, %r14
adcx %rbx, %r15
mulx 40($inp), $out, %r10
adox $out, %r15
adcx %r10, %r8
mulx 48($inp), %rax, %rbx
adox %rax, %r8
adcx %rbx, %r9
mulx 56($inp), $out, %r10
adox $out, %r9
adcx %rbp, %r10
adox %rbp, %r10
.byte 0x66
mov %r15, %rbx
shld \$1, %r14, %r15
shld \$1, %rcx, %r14
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r13
adcx %rdx, %r14
mov 32($inp), %rdx
adcx %rbp, %r15
mov %r13, 48(%rsp)
mov %r14, 56(%rsp)
#fifth iteration
.byte 0xc4,0x62,0xc3,0xf6,0x9e,0x28,0x00,0x00,0x00 # mulx 40($inp), $out, %r11
adox $out, %r8
adcx %r11, %r9
mulx 48($inp), %rax, %rcx
adox %rax, %r9
adcx %rcx, %r10
mulx 56($inp), $out, %r11
adox $out, %r10
adcx %rbp, %r11
adox %rbp, %r11
mov %r9, %rcx
shld \$1, %r8, %r9
shld \$1, %rbx, %r8
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r15
adcx %rdx, %r8
mov 40($inp), %rdx
adcx %rbp, %r9
mov %r15, 64(%rsp)
mov %r8, 72(%rsp)
#sixth iteration
.byte 0xc4,0xe2,0xfb,0xf6,0x9e,0x30,0x00,0x00,0x00 # mulx 48($inp), %rax, %rbx
adox %rax, %r10
adcx %rbx, %r11
.byte 0xc4,0x62,0xc3,0xf6,0xa6,0x38,0x00,0x00,0x00 # mulx 56($inp), $out, %r12
adox $out, %r11
adcx %rbp, %r12
adox %rbp, %r12
mov %r11, %rbx
shld \$1, %r10, %r11
shld \$1, %rcx, %r10
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r9
adcx %rdx, %r10
mov 48($inp), %rdx
adcx %rbp, %r11
mov %r9, 80(%rsp)
mov %r10, 88(%rsp)
#seventh iteration
.byte 0xc4,0x62,0xfb,0xf6,0xae,0x38,0x00,0x00,0x00 # mulx 56($inp), %rax, %r13
adox %rax, %r12
adox %rbp, %r13
xor %r14, %r14
shld \$1, %r13, %r14
shld \$1, %r12, %r13
shld \$1, %rbx, %r12
xor %ebp, %ebp
mulx %rdx, %rax, %rdx
adcx %rax, %r11
adcx %rdx, %r12
mov 56($inp), %rdx
adcx %rbp, %r13
.byte 0x4c,0x89,0x9c,0x24,0x60,0x00,0x00,0x00 # mov %r11, 96(%rsp)
.byte 0x4c,0x89,0xa4,0x24,0x68,0x00,0x00,0x00 # mov %r12, 104(%rsp)
#eighth iteration
mulx %rdx, %rax, %rdx
adox %rax, %r13
adox %rbp, %rdx
.byte 0x66
add %rdx, %r14
movq %r13, 112(%rsp)
movq %r14, 120(%rsp)
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
movq %r8, %rdx
movq %r9, %rax
movl 128+8(%rsp), $times
movq $out, $inp
decl $times
jnz .Loop_sqrx
.Lsqr_tail:
___
}
$code.=<<___;
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lsqr_epilogue:
ret
.size rsaz_512_sqr,.-rsaz_512_sqr
___
}
{
my ($out,$ap,$bp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx","%r8");
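# Rough C-level view (names/types illustrative, not lifted from the header):
#
#	void rsaz_512_mul(uint64_t out[8], const uint64_t ap[8],
#	                  const uint64_t bp[8], const uint64_t mod[8],
#	                  uint64_t n0);
#
# One Montgomery multiplication: out = ap * bp * 2^-512 mod mod.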
$code.=<<___;
.globl rsaz_512_mul
.type rsaz_512_mul,\@function,5
.align 32
rsaz_512_mul:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
subq \$128+24, %rsp
.Lmul_body:
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $n0, 128(%rsp)
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx
___
$code.=<<___;
movq ($bp), %rbx # pass b[0]
movq $bp, %rbp # pass argument
call __rsaz_512_mul
movq %xmm0, $out
movq %xmm1, %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_tail
.align 32
.Lmulx:
movq $bp, %rbp # pass argument
movq ($bp), %rdx # pass b[0]
call __rsaz_512_mulx
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_epilogue:
ret
.size rsaz_512_mul,.-rsaz_512_mul
___
}
{
my ($out,$ap,$bp,$mod,$n0,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
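# Rough C-level view (names/types illustrative, not lifted from the header):
#
#	void rsaz_512_mul_gather4(uint64_t out[8], const uint64_t ap[8],
#	                          const void *table, const uint64_t mod[8],
#	                          uint64_t n0, int power);
#
# Gathers entry |power| (0..15) from the dword-interleaved table laid out by
# rsaz_512_scatter4/rsaz_512_mul_scatter4 and Montgomery-multiplies |ap| by
# it: out = ap * table[power] * 2^-512 mod mod.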
$code.=<<___;
.globl rsaz_512_mul_gather4
.type rsaz_512_mul_gather4,\@function,6
.align 32
rsaz_512_mul_gather4:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov $pwr, $pwr # zero-extend $pwr
subq \$128+24, %rsp
.Lmul_gather4_body:
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx_gather
___
$code.=<<___;
movl 64($bp,$pwr,4), %eax
movq $out, %xmm0 # off-load arguments
movl ($bp,$pwr,4), %ebx
movq $mod, %xmm1
movq $n0, 128(%rsp)
shlq \$32, %rax
or %rax, %rbx
movq ($ap), %rax
movq 8($ap), %rcx
leaq 128($bp,$pwr,4), %rbp
mulq %rbx # 0 iteration
movq %rax, (%rsp)
movq %rcx, %rax
movq %rdx, %r8
mulq %rbx
movd (%rbp), %xmm4
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
movd 64(%rbp), %xmm5
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
pslldq \$4, %xmm5
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
por %xmm5, %xmm4
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($ap), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
leaq 128(%rbp), %rbp
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
movq %xmm4, %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rsp), %rdi
movl \$7, %ecx
jmp .Loop_mul_gather
.align 32
.Loop_mul_gather:
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
movq %r8, (%rdi)
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
movd (%rbp), %xmm4
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
movd 64(%rbp), %xmm5
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
pslldq \$4, %xmm5
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
addq %r11, %r10
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
por %xmm5, %xmm4
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48($ap), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56($ap), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
movq %xmm4, %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
leaq 128(%rbp), %rbp
leaq 8(%rdi), %rdi
decl %ecx
jnz .Loop_mul_gather
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
movq %xmm0, $out
movq %xmm1, %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_gather_tail
.align 32
.Lmulx_gather:
mov 64($bp,$pwr,4), %eax
movq $out, %xmm0 # off-load arguments
lea 128($bp,$pwr,4), %rbp
mov ($bp,$pwr,4), %edx
movq $mod, %xmm1
mov $n0, 128(%rsp)
shl \$32, %rax
or %rax, %rdx
mulx ($ap), %rbx, %r8 # 0 iteration
mov %rbx, (%rsp)
xor %edi, %edi # cf=0, of=0
mulx 8($ap), %rax, %r9
movd (%rbp), %xmm4
mulx 16($ap), %rbx, %r10
movd 64(%rbp), %xmm5
adcx %rax, %r8
mulx 24($ap), %rax, %r11
pslldq \$4, %xmm5
adcx %rbx, %r9
mulx 32($ap), %rbx, %r12
por %xmm5, %xmm4
adcx %rax, %r10
mulx 40($ap), %rax, %r13
adcx %rbx, %r11
mulx 48($ap), %rbx, %r14
lea 128(%rbp), %rbp
adcx %rax, %r12
mulx 56($ap), %rax, %r15
movq %xmm4, %rdx
adcx %rbx, %r13
adcx %rax, %r14
mov %r8, %rbx
adcx %rdi, %r15 # %rdi is 0
mov \$-7, %rcx
jmp .Loop_mulx_gather
.align 32
.Loop_mulx_gather:
mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
mulx 8($ap), %rax, %r9
.byte 0x66,0x0f,0x6e,0xa5,0x00,0x00,0x00,0x00 # movd (%rbp), %xmm4
adcx %rax, %r8
adox %r10, %r9
mulx 16($ap), %rax, %r10
movd 64(%rbp), %xmm5
lea 128(%rbp), %rbp
adcx %rax, %r9
adox %r11, %r10
.byte 0xc4,0x62,0xfb,0xf6,0x9e,0x18,0x00,0x00,0x00 # mulx 24($ap), %rax, %r11
pslldq \$4, %xmm5
por %xmm5, %xmm4
adcx %rax, %r10
adox %r12, %r11
mulx 32($ap), %rax, %r12
adcx %rax, %r11
adox %r13, %r12
mulx 40($ap), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
adcx %rax, %r13
adox %r15, %r14
mulx 56($ap), %rax, %r15
movq %xmm4, %rdx
mov %rbx, 64(%rsp,%rcx,8)
adcx %rax, %r14
adox %rdi, %r15
mov %r8, %rbx
adcx %rdi, %r15 # cf=0
inc %rcx # of=0
jnz .Loop_mulx_gather
mov %r8, 64(%rsp)
mov %r9, 64+8(%rsp)
mov %r10, 64+16(%rsp)
mov %r11, 64+24(%rsp)
mov %r12, 64+32(%rsp)
mov %r13, 64+40(%rsp)
mov %r14, 64+48(%rsp)
mov %r15, 64+56(%rsp)
movq %xmm0, $out
movq %xmm1, %rbp
mov 128(%rsp), %rdx # pull $n0
mov (%rsp), %r8
mov 8(%rsp), %r9
mov 16(%rsp), %r10
mov 24(%rsp), %r11
mov 32(%rsp), %r12
mov 40(%rsp), %r13
mov 48(%rsp), %r14
mov 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_gather_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
sbbq %rcx, %rcx
call __rsaz_512_subtract
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_gather4_epilogue:
ret
.size rsaz_512_mul_gather4,.-rsaz_512_mul_gather4
___
}
{
my ($out,$ap,$mod,$n0,$tbl,$pwr) = ("%rdi","%rsi","%rdx","%rcx","%r8","%r9d");
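# Rough C-level view (names/types illustrative, not lifted from the header):
#
#	void rsaz_512_mul_scatter4(uint64_t out[8], const uint64_t ap[8],
#	                           const uint64_t mod[8], uint64_t n0,
#	                           void *table, int power);
#
# Note that |out| doubles as the second multiplicand: out = ap * out *
# 2^-512 mod mod, and the result is additionally scattered into table
# entry |power|.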
$code.=<<___;
.globl rsaz_512_mul_scatter4
.type rsaz_512_mul_scatter4,\@function,6
.align 32
rsaz_512_mul_scatter4:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
mov $pwr, $pwr # zero-extend $pwr
subq \$128+24, %rsp
.Lmul_scatter4_body:
leaq ($tbl,$pwr,4), $tbl
movq $out, %xmm0 # off-load arguments
movq $mod, %xmm1
movq $tbl, %xmm2
movq $n0, 128(%rsp)
movq $out, %rbp
___
$code.=<<___ if ($addx);
movl \$0x80100,%r11d
andl OPENSSL_ia32cap_P+8(%rip),%r11d
cmpl \$0x80100,%r11d # check for MULX and ADO/CX
je .Lmulx_scatter
___
$code.=<<___;
movq ($out),%rbx # pass b[0]
call __rsaz_512_mul
movq %xmm0, $out
movq %xmm1, %rbp
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lmul_scatter_tail
.align 32
.Lmulx_scatter:
movq ($out), %rdx # pass b[0]
call __rsaz_512_mulx
movq %xmm0, $out
movq %xmm1, %rbp
movq 128(%rsp), %rdx # pull $n0
movq (%rsp), %r8
movq 8(%rsp), %r9
movq 16(%rsp), %r10
movq 24(%rsp), %r11
movq 32(%rsp), %r12
movq 40(%rsp), %r13
movq 48(%rsp), %r14
movq 56(%rsp), %r15
call __rsaz_512_reducex
.Lmul_scatter_tail:
___
$code.=<<___;
addq 64(%rsp), %r8
adcq 72(%rsp), %r9
adcq 80(%rsp), %r10
adcq 88(%rsp), %r11
adcq 96(%rsp), %r12
adcq 104(%rsp), %r13
adcq 112(%rsp), %r14
adcq 120(%rsp), %r15
movq %xmm2, $inp
sbbq %rcx, %rcx
call __rsaz_512_subtract
movl %r8d, 64*0($inp) # scatter
shrq \$32, %r8
movl %r9d, 64*2($inp)
shrq \$32, %r9
movl %r10d, 64*4($inp)
shrq \$32, %r10
movl %r11d, 64*6($inp)
shrq \$32, %r11
movl %r12d, 64*8($inp)
shrq \$32, %r12
movl %r13d, 64*10($inp)
shrq \$32, %r13
movl %r14d, 64*12($inp)
shrq \$32, %r14
movl %r15d, 64*14($inp)
shrq \$32, %r15
movl %r8d, 64*1($inp)
movl %r9d, 64*3($inp)
movl %r10d, 64*5($inp)
movl %r11d, 64*7($inp)
movl %r12d, 64*9($inp)
movl %r13d, 64*11($inp)
movl %r14d, 64*13($inp)
movl %r15d, 64*15($inp)
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_scatter4_epilogue:
ret
.size rsaz_512_mul_scatter4,.-rsaz_512_mul_scatter4
___
}
{
my ($out,$inp,$mod,$n0) = ("%rdi","%rsi","%rdx","%rcx");
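# Rough C-level view (names/types illustrative, not lifted from the header):
#
#	void rsaz_512_mul_by_one(uint64_t out[8], const uint64_t inp[8],
#	                         const uint64_t mod[8], uint64_t n0);
#
# A single Montgomery reduction of |inp| (i.e. a Montgomery multiplication
# by 1), used to convert a value out of Montgomery representation.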
$code.=<<___;
.globl rsaz_512_mul_by_one
.type rsaz_512_mul_by_one,\@function,4
.align 32
rsaz_512_mul_by_one:
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
subq \$128+24, %rsp
.Lmul_by_one_body:
___
$code.=<<___ if ($addx);
movl OPENSSL_ia32cap_P+8(%rip),%eax
___
$code.=<<___;
movq $mod, %rbp # reassign argument
movq $n0, 128(%rsp)
movq ($inp), %r8
pxor %xmm0, %xmm0
movq 8($inp), %r9
movq 16($inp), %r10
movq 24($inp), %r11
movq 32($inp), %r12
movq 40($inp), %r13
movq 48($inp), %r14
movq 56($inp), %r15
movdqa %xmm0, (%rsp)
movdqa %xmm0, 16(%rsp)
movdqa %xmm0, 32(%rsp)
movdqa %xmm0, 48(%rsp)
movdqa %xmm0, 64(%rsp)
movdqa %xmm0, 80(%rsp)
movdqa %xmm0, 96(%rsp)
___
$code.=<<___ if ($addx);
andl \$0x80100,%eax
cmpl \$0x80100,%eax # check for MULX and ADO/CX
je .Lby_one_callx
___
$code.=<<___;
call __rsaz_512_reduce
___
$code.=<<___ if ($addx);
jmp .Lby_one_tail
.align 32
.Lby_one_callx:
movq 128(%rsp), %rdx # pull $n0
call __rsaz_512_reducex
.Lby_one_tail:
___
$code.=<<___;
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
leaq 128+24+48(%rsp), %rax
movq -48(%rax), %r15
movq -40(%rax), %r14
movq -32(%rax), %r13
movq -24(%rax), %r12
movq -16(%rax), %rbp
movq -8(%rax), %rbx
leaq (%rax), %rsp
.Lmul_by_one_epilogue:
ret
.size rsaz_512_mul_by_one,.-rsaz_512_mul_by_one
___
}
{ # __rsaz_512_reduce
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
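#
# Word-by-word Montgomery reduction of the 16-limb product sitting in
# %r8-%r15 (low half) and 64-120(%rsp) (high half, added back by the
# caller). Each of the 8 iterations computes the quotient
# m = n0 * %r8 mod 2^64 into %rbx, adds m*mod to the 8-limb window so that
# the lowest limb cancels, and shifts the window down by one limb; the
# next iteration's quotient is pre-computed into %rsi mid-iteration to
# hide the imulq latency.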
$code.=<<___;
.type __rsaz_512_reduce,\@abi-omnipotent
.align 32
__rsaz_512_reduce:
movq %r8, %rbx
imulq 128+8(%rsp), %rbx
movq 0(%rbp), %rax
movl \$8, %ecx
jmp .Lreduction_loop
.align 32
.Lreduction_loop:
mulq %rbx
movq 8(%rbp), %rax
negq %r8
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16(%rbp), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24(%rbp), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32(%rbp), %rax
adcq \$0, %rdx
addq %r11, %r10
movq 128+8(%rsp), %rsi
#movq %rdx, %r11
#adcq \$0, %r11
adcq \$0, %rdx
movq %rdx, %r11
mulq %rbx
addq %rax, %r12
movq 40(%rbp), %rax
adcq \$0, %rdx
imulq %r8, %rsi
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48(%rbp), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56(%rbp), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
movq %rsi, %rbx
addq %rax, %r15
movq 0(%rbp), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
decl %ecx
jne .Lreduction_loop
ret
.size __rsaz_512_reduce,.-__rsaz_512_reduce
___
}
if ($addx) {
# __rsaz_512_reducex
#
# input: %r8-%r15, %rbp - mod, 128(%rsp) - n0
# output: %r8-%r15
# clobbers: everything except %rbp and %rdi
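#
# Same word-by-word Montgomery reduction as __rsaz_512_reduce, but written
# with MULX/ADCX/ADOX so that two independent carry chains run in parallel.
# On entry %rdx holds n0 (the callers pull it from 128(%rsp)); the imulq
# below turns it into the first quotient, and each iteration computes the
# next quotient via mulx against n0 at 128+8(%rsp) before restoring %rdx
# for the remaining limbs.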
$code.=<<___;
.type __rsaz_512_reducex,\@abi-omnipotent
.align 32
__rsaz_512_reducex:
#movq 128+8(%rsp), %rdx # pull $n0
imulq %r8, %rdx
xorq %rsi, %rsi # cf=0,of=0
movl \$8, %ecx
jmp .Lreduction_loopx
.align 32
.Lreduction_loopx:
mov %r8, %rbx
mulx 0(%rbp), %rax, %r8
adcx %rbx, %rax
adox %r9, %r8
mulx 8(%rbp), %rax, %r9
adcx %rax, %r8
adox %r10, %r9
mulx 16(%rbp), %rbx, %r10
adcx %rbx, %r9
adox %r11, %r10
mulx 24(%rbp), %rbx, %r11
adcx %rbx, %r10
adox %r12, %r11
.byte 0xc4,0x62,0xe3,0xf6,0xa5,0x20,0x00,0x00,0x00 # mulx 32(%rbp), %rbx, %r12
mov %rdx, %rax
mov %r8, %rdx
adcx %rbx, %r11
adox %r13, %r12
mulx 128+8(%rsp), %rbx, %rdx
mov %rax, %rdx
mulx 40(%rbp), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xb5,0x30,0x00,0x00,0x00 # mulx 48(%rbp), %rax, %r14
adcx %rax, %r13
adox %r15, %r14
mulx 56(%rbp), %rax, %r15
mov %rbx, %rdx
adcx %rax, %r14
adox %rsi, %r15 # %rsi is 0
adcx %rsi, %r15 # cf=0
decl %ecx # of=0
jne .Lreduction_loopx
ret
.size __rsaz_512_reducex,.-__rsaz_512_reducex
___
}
{ # __rsaz_512_subtract
# input: %r8-%r15, %rdi - $out, %rbp - $mod, %rcx - mask
# output:
# clobbers: everything but %rdi, %rsi and %rbp
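#
# Constant-time conditional subtraction: the result in %r8-%r15 is stored
# to |out|, then (-mod) & mask is added to it, where the mask in %rcx is
# either 0 or all-ones (the callers produce it with sbbq from the final
# carry). Since mod is odd, negating the lowest limb and complementing the
# rest yields exactly -mod, so the net effect is "subtract mod if and only
# if the mask is set", without a data-dependent branch.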
$code.=<<___;
.type __rsaz_512_subtract,\@abi-omnipotent
.align 32
__rsaz_512_subtract:
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
movq 0($mod), %r8
movq 8($mod), %r9
negq %r8
notq %r9
andq %rcx, %r8
movq 16($mod), %r10
andq %rcx, %r9
notq %r10
movq 24($mod), %r11
andq %rcx, %r10
notq %r11
movq 32($mod), %r12
andq %rcx, %r11
notq %r12
movq 40($mod), %r13
andq %rcx, %r12
notq %r13
movq 48($mod), %r14
andq %rcx, %r13
notq %r14
movq 56($mod), %r15
andq %rcx, %r14
notq %r15
andq %rcx, %r15
addq ($out), %r8
adcq 8($out), %r9
adcq 16($out), %r10
adcq 24($out), %r11
adcq 32($out), %r12
adcq 40($out), %r13
adcq 48($out), %r14
adcq 56($out), %r15
movq %r8, ($out)
movq %r9, 8($out)
movq %r10, 16($out)
movq %r11, 24($out)
movq %r12, 32($out)
movq %r13, 40($out)
movq %r14, 48($out)
movq %r15, 56($out)
ret
.size __rsaz_512_subtract,.-__rsaz_512_subtract
___
}
{ # __rsaz_512_mul
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
my ($ap,$bp) = ("%rsi","%rbp");
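# Schoolbook 512x512->1024-bit multiply. The caller preloads b[0] into %rbx
# and points %rbp at b; the 16-limb product is left on the stack starting
# at the caller's (%rsp) (which is 8(%rsp) from inside this subroutine).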
$code.=<<___;
.type __rsaz_512_mul,\@abi-omnipotent
.align 32
__rsaz_512_mul:
leaq 8(%rsp), %rdi
movq ($ap), %rax
mulq %rbx
movq %rax, (%rdi)
movq 8($ap), %rax
movq %rdx, %r8
mulq %rbx
addq %rax, %r8
movq 16($ap), %rax
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r9
movq 24($ap), %rax
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r10
movq 32($ap), %rax
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r11
movq 40($ap), %rax
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r12
movq 48($ap), %rax
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r13
movq 56($ap), %rax
movq %rdx, %r14
adcq \$0, %r14
mulq %rbx
addq %rax, %r14
movq ($ap), %rax
movq %rdx, %r15
adcq \$0, %r15
leaq 8($bp), $bp
leaq 8(%rdi), %rdi
movl \$7, %ecx
jmp .Loop_mul
.align 32
.Loop_mul:
movq ($bp), %rbx
mulq %rbx
addq %rax, %r8
movq 8($ap), %rax
movq %r8, (%rdi)
movq %rdx, %r8
adcq \$0, %r8
mulq %rbx
addq %rax, %r9
movq 16($ap), %rax
adcq \$0, %rdx
addq %r9, %r8
movq %rdx, %r9
adcq \$0, %r9
mulq %rbx
addq %rax, %r10
movq 24($ap), %rax
adcq \$0, %rdx
addq %r10, %r9
movq %rdx, %r10
adcq \$0, %r10
mulq %rbx
addq %rax, %r11
movq 32($ap), %rax
adcq \$0, %rdx
addq %r11, %r10
movq %rdx, %r11
adcq \$0, %r11
mulq %rbx
addq %rax, %r12
movq 40($ap), %rax
adcq \$0, %rdx
addq %r12, %r11
movq %rdx, %r12
adcq \$0, %r12
mulq %rbx
addq %rax, %r13
movq 48($ap), %rax
adcq \$0, %rdx
addq %r13, %r12
movq %rdx, %r13
adcq \$0, %r13
mulq %rbx
addq %rax, %r14
movq 56($ap), %rax
adcq \$0, %rdx
addq %r14, %r13
movq %rdx, %r14
leaq 8($bp), $bp
adcq \$0, %r14
mulq %rbx
addq %rax, %r15
movq ($ap), %rax
adcq \$0, %rdx
addq %r15, %r14
movq %rdx, %r15
adcq \$0, %r15
leaq 8(%rdi), %rdi
decl %ecx
jnz .Loop_mul
movq %r8, (%rdi)
movq %r9, 8(%rdi)
movq %r10, 16(%rdi)
movq %r11, 24(%rdi)
movq %r12, 32(%rdi)
movq %r13, 40(%rdi)
movq %r14, 48(%rdi)
movq %r15, 56(%rdi)
ret
.size __rsaz_512_mul,.-__rsaz_512_mul
___
}
if ($addx) {
# __rsaz_512_mulx
#
# input: %rsi - ap, %rbp - bp
# output:
# clobbers: everything
my ($ap,$bp,$zero) = ("%rsi","%rbp","%rdi");
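# MULX/ADCX/ADOX counterpart of __rsaz_512_mul: the same 512x512->1024-bit
# schoolbook multiply, with b[0] preloaded into %rdx by the caller and the
# 16-limb product likewise left at the caller's (%rsp).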
$code.=<<___;
.type __rsaz_512_mulx,\@abi-omnipotent
.align 32
__rsaz_512_mulx:
mulx ($ap), %rbx, %r8 # initial %rdx preloaded by caller
mov \$-6, %rcx
mulx 8($ap), %rax, %r9
movq %rbx, 8(%rsp)
mulx 16($ap), %rbx, %r10
adc %rax, %r8
mulx 24($ap), %rax, %r11
adc %rbx, %r9
mulx 32($ap), %rbx, %r12
adc %rax, %r10
mulx 40($ap), %rax, %r13
adc %rbx, %r11
mulx 48($ap), %rbx, %r14
adc %rax, %r12
mulx 56($ap), %rax, %r15
mov 8($bp), %rdx
adc %rbx, %r13
adc %rax, %r14
adc \$0, %r15
xor $zero, $zero # cf=0,of=0
jmp .Loop_mulx
.align 32
.Loop_mulx:
movq %r8, %rbx
mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
mulx 8($ap), %rax, %r9
adcx %rax, %r8
adox %r10, %r9
mulx 16($ap), %rax, %r10
adcx %rax, %r9
adox %r11, %r10
mulx 24($ap), %rax, %r11
adcx %rax, %r10
adox %r12, %r11
.byte 0x3e,0xc4,0x62,0xfb,0xf6,0xa6,0x20,0x00,0x00,0x00 # mulx 32($ap), %rax, %r12
adcx %rax, %r11
adox %r13, %r12
mulx 40($ap), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
mulx 48($ap), %rax, %r14
adcx %rax, %r13
adox %r15, %r14
mulx 56($ap), %rax, %r15
movq 64($bp,%rcx,8), %rdx
movq %rbx, 8+64-8(%rsp,%rcx,8)
adcx %rax, %r14
adox $zero, %r15
adcx $zero, %r15 # cf=0
inc %rcx # of=0
jnz .Loop_mulx
movq %r8, %rbx
mulx ($ap), %rax, %r8
adcx %rax, %rbx
adox %r9, %r8
.byte 0xc4,0x62,0xfb,0xf6,0x8e,0x08,0x00,0x00,0x00 # mulx 8($ap), %rax, %r9
adcx %rax, %r8
adox %r10, %r9
.byte 0xc4,0x62,0xfb,0xf6,0x96,0x10,0x00,0x00,0x00 # mulx 16($ap), %rax, %r10
adcx %rax, %r9
adox %r11, %r10
mulx 24($ap), %rax, %r11
adcx %rax, %r10
adox %r12, %r11
mulx 32($ap), %rax, %r12
adcx %rax, %r11
adox %r13, %r12
mulx 40($ap), %rax, %r13
adcx %rax, %r12
adox %r14, %r13
.byte 0xc4,0x62,0xfb,0xf6,0xb6,0x30,0x00,0x00,0x00 # mulx 48($ap), %rax, %r14
adcx %rax, %r13
adox %r15, %r14
.byte 0xc4,0x62,0xfb,0xf6,0xbe,0x38,0x00,0x00,0x00 # mulx 56($ap), %rax, %r15
adcx %rax, %r14
adox $zero, %r15
adcx $zero, %r15
mov %rbx, 8+64-8(%rsp)
mov %r8, 8+64(%rsp)
mov %r9, 8+64+8(%rsp)
mov %r10, 8+64+16(%rsp)
mov %r11, 8+64+24(%rsp)
mov %r12, 8+64+32(%rsp)
mov %r13, 8+64+40(%rsp)
mov %r14, 8+64+48(%rsp)
mov %r15, 8+64+56(%rsp)
ret
.size __rsaz_512_mulx,.-__rsaz_512_mulx
___
}
{
my ($out,$inp,$power)= $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
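# Table layout used by the scatter/gather helpers (and by
# rsaz_512_mul_gather4 above): the table holds 16 values of 512 bits each,
# dword-interleaved so that limb i of entry |power| is split into the
# 32-bit words at byte offsets 128*i + 4*power (low half) and
# 128*i + 64 + 4*power (high half), 1024 bytes in total.
# rsaz_512_scatter4 writes one entry into that layout and rsaz_512_gather4
# reads one back as eight contiguous 64-bit limbs.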
$code.=<<___;
.globl rsaz_512_scatter4
.type rsaz_512_scatter4,\@abi-omnipotent
.align 16
rsaz_512_scatter4:
leaq ($out,$power,4), $out
movl \$8, %r9d
jmp .Loop_scatter
.align 16
.Loop_scatter:
movq ($inp), %rax
leaq 8($inp), $inp
movl %eax, ($out)
shrq \$32, %rax
movl %eax, 64($out)
leaq 128($out), $out
decl %r9d
jnz .Loop_scatter
ret
.size rsaz_512_scatter4,.-rsaz_512_scatter4
.globl rsaz_512_gather4
.type rsaz_512_gather4,\@abi-omnipotent
.align 16
rsaz_512_gather4:
leaq ($inp,$power,4), $inp
movl \$8, %r9d
jmp .Loop_gather
.align 16
.Loop_gather:
movl ($inp), %eax
movl 64($inp), %r8d
leaq 128($inp), $inp
shlq \$32, %r8
or %r8, %rax
movq %rax, ($out)
leaq 8($out), $out
decl %r9d
jnz .Loop_gather
ret
.size rsaz_512_gather4,.-rsaz_512_gather4
___
}
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
# CONTEXT *context,DISPATCHER_CONTEXT *disp)
if ($win64) {
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";
$code.=<<___;
.extern __imp_RtlVirtualUnwind
.type se_handler,\@abi-omnipotent
.align 16
se_handler:
push %rsi
push %rdi
push %rbx
push %rbp
push %r12
push %r13
push %r14
push %r15
pushfq
sub \$64,%rsp
mov 120($context),%rax # pull context->Rax
mov 248($context),%rbx # pull context->Rip
mov 8($disp),%rsi # disp->ImageBase
mov 56($disp),%r11 # disp->HandlerData
mov 0(%r11),%r10d # HandlerData[0]
lea (%rsi,%r10),%r10 # end of prologue label
cmp %r10,%rbx # context->Rip<end of prologue label
jb .Lcommon_seh_tail
mov 152($context),%rax # pull context->Rsp
mov 4(%r11),%r10d # HandlerData[1]
lea (%rsi,%r10),%r10 # epilogue label
cmp %r10,%rbx # context->Rip>=epilogue label
jae .Lcommon_seh_tail
lea 128+24+48(%rax),%rax
mov -8(%rax),%rbx
mov -16(%rax),%rbp
mov -24(%rax),%r12
mov -32(%rax),%r13
mov -40(%rax),%r14
mov -48(%rax),%r15
mov %rbx,144($context) # restore context->Rbx
mov %rbp,160($context) # restore context->Rbp
mov %r12,216($context) # restore context->R12
mov %r13,224($context) # restore context->R13
mov %r14,232($context) # restore context->R14
mov %r15,240($context) # restore context->R15
.Lcommon_seh_tail:
mov 8(%rax),%rdi
mov 16(%rax),%rsi
mov %rax,152($context) # restore context->Rsp
mov %rsi,168($context) # restore context->Rsi
mov %rdi,176($context) # restore context->Rdi
mov 40($disp),%rdi # disp->ContextRecord
mov $context,%rsi # context
mov \$154,%ecx # sizeof(CONTEXT)
.long 0xa548f3fc # cld; rep movsq
mov $disp,%rsi
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
mov 8(%rsi),%rdx # arg2, disp->ImageBase
mov 0(%rsi),%r8 # arg3, disp->ControlPc
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
mov 40(%rsi),%r10 # disp->ContextRecord
lea 56(%rsi),%r11 # &disp->HandlerData
lea 24(%rsi),%r12 # &disp->EstablisherFrame
mov %r10,32(%rsp) # arg5
mov %r11,40(%rsp) # arg6
mov %r12,48(%rsp) # arg7
mov %rcx,56(%rsp) # arg8, (NULL)
call *__imp_RtlVirtualUnwind(%rip)
mov \$1,%eax # ExceptionContinueSearch
add \$64,%rsp
popfq
pop %r15
pop %r14
pop %r13
pop %r12
pop %rbp
pop %rbx
pop %rdi
pop %rsi
ret
.size se_handler,.-se_handler
.section .pdata
.align 4
.rva .LSEH_begin_rsaz_512_sqr
.rva .LSEH_end_rsaz_512_sqr
.rva .LSEH_info_rsaz_512_sqr
.rva .LSEH_begin_rsaz_512_mul
.rva .LSEH_end_rsaz_512_mul
.rva .LSEH_info_rsaz_512_mul
.rva .LSEH_begin_rsaz_512_mul_gather4
.rva .LSEH_end_rsaz_512_mul_gather4
.rva .LSEH_info_rsaz_512_mul_gather4
.rva .LSEH_begin_rsaz_512_mul_scatter4
.rva .LSEH_end_rsaz_512_mul_scatter4
.rva .LSEH_info_rsaz_512_mul_scatter4
.rva .LSEH_begin_rsaz_512_mul_by_one
.rva .LSEH_end_rsaz_512_mul_by_one
.rva .LSEH_info_rsaz_512_mul_by_one
.section .xdata
.align 8
.LSEH_info_rsaz_512_sqr:
.byte 9,0,0,0
.rva se_handler
.rva .Lsqr_body,.Lsqr_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul:
.byte 9,0,0,0
.rva se_handler
.rva .Lmul_body,.Lmul_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul_gather4:
.byte 9,0,0,0
.rva se_handler
.rva .Lmul_gather4_body,.Lmul_gather4_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul_scatter4:
.byte 9,0,0,0
.rva se_handler
.rva .Lmul_scatter4_body,.Lmul_scatter4_epilogue # HandlerData[]
.LSEH_info_rsaz_512_mul_by_one:
.byte 9,0,0,0
.rva se_handler
.rva .Lmul_by_one_body,.Lmul_by_one_epilogue # HandlerData[]
___
}
$code =~ s/\`([^\`]*)\`/eval $1/gem;
print $code;
close STDOUT;