b9c26014de
Since we pre-generate our perlasm, having the output of these files be
sensitive to the environment they run in is unhelpful. It would be bad to
suddenly change what features we do or don't compile in whenever
workstations' toolchains change. Enable all compiler-version-gated features,
since they should all be runtime-gated anyway. This should align with what
upstream's files would have produced on modern toolchains. We should assume
our assemblers can take whatever we'd like to throw at them. (If it turns
out some can't, we'd rather find out and probably switch the problematic
instructions to explicit byte sequences.)

This actually results in a fairly significant change to the assembly we
generate. I'm guessing upstream's build system sets the CC environment
variable, while ours doesn't, so the version checks were all coming out
conservative.

diffstat of generated files:

 linux-x86/crypto/sha/sha1-586.S | 1176 ++++++++++++
 linux-x86/crypto/sha/sha256-586.S | 2248 ++++++++++++++++++++++++
 linux-x86_64/crypto/bn/rsaz-avx2.S | 1644 +++++++++++++++++
 linux-x86_64/crypto/bn/rsaz-x86_64.S | 638 ++++++
 linux-x86_64/crypto/bn/x86_64-mont.S | 332 +++
 linux-x86_64/crypto/bn/x86_64-mont5.S | 1130 ++++++++++++
 linux-x86_64/crypto/modes/aesni-gcm-x86_64.S | 754 ++++++++
 linux-x86_64/crypto/modes/ghash-x86_64.S | 475 +++++
 linux-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++++++
 linux-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++++++
 linux-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++++++++++
 mac-x86/crypto/sha/sha1-586.S | 1174 ++++++++++++
 mac-x86/crypto/sha/sha256-586.S | 2248 ++++++++++++++++++++++++
 mac-x86_64/crypto/bn/rsaz-avx2.S | 1637 +++++++++++++++++
 mac-x86_64/crypto/bn/rsaz-x86_64.S | 638 ++++++
 mac-x86_64/crypto/bn/x86_64-mont.S | 331 +++
 mac-x86_64/crypto/bn/x86_64-mont5.S | 1130 ++++++++++++
 mac-x86_64/crypto/modes/aesni-gcm-x86_64.S | 750 ++++++++
 mac-x86_64/crypto/modes/ghash-x86_64.S | 475 +++++
 mac-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++++++
 mac-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++++++
 mac-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++++++++++
 win-x86/crypto/sha/sha1-586.asm | 1173 ++++++++++++
 win-x86/crypto/sha/sha256-586.asm | 2248 ++++++++++++++++++++++++
 win-x86_64/crypto/bn/rsaz-avx2.asm | 1858 +++++++++++++++++++-
 win-x86_64/crypto/bn/rsaz-x86_64.asm | 638 ++++++
 win-x86_64/crypto/bn/x86_64-mont.asm | 352 +++
 win-x86_64/crypto/bn/x86_64-mont5.asm | 1184 ++++++++++++
 win-x86_64/crypto/modes/aesni-gcm-x86_64.asm | 933 ++++++++++
 win-x86_64/crypto/modes/ghash-x86_64.asm | 515 +++++
 win-x86_64/crypto/sha/sha1-x86_64.asm | 1152 ++++++++++++
 win-x86_64/crypto/sha/sha256-x86_64.asm | 1088 +++++++++++
 win-x86_64/crypto/sha/sha512-x86_64.asm | 2499 ++++

SHA* gets faster. RSA and AES-GCM seem to be more of a wash, and even
slower sometimes! This is a little concerning, though when I repeated the
latter two, the numbers were definitely noisy (RSA in particular), so we may
wish to repeat the comparison in a more controlled environment. We could
also flip some of these toggles to something other than the highest setting
if some of the variants turn out not to be desirable. We just shouldn't have
them enabled or disabled by accident. This aligns us closer to upstream,
though.
$ /tmp/bssl.old speed SHA-
Did 5028000 SHA-1 (16 bytes) operations in 1000048us (5027758.7 ops/sec): 80.4 MB/s
Did 1708000 SHA-1 (256 bytes) operations in 1000257us (1707561.2 ops/sec): 437.1 MB/s
Did 73000 SHA-1 (8192 bytes) operations in 1008406us (72391.5 ops/sec): 593.0 MB/s
Did 3041000 SHA-256 (16 bytes) operations in 1000311us (3040054.5 ops/sec): 48.6 MB/s
Did 779000 SHA-256 (256 bytes) operations in 1000820us (778361.7 ops/sec): 199.3 MB/s
Did 26000 SHA-256 (8192 bytes) operations in 1009875us (25745.8 ops/sec): 210.9 MB/s
Did 1837000 SHA-512 (16 bytes) operations in 1000251us (1836539.0 ops/sec): 29.4 MB/s
Did 803000 SHA-512 (256 bytes) operations in 1000969us (802222.6 ops/sec): 205.4 MB/s
Did 41000 SHA-512 (8192 bytes) operations in 1016768us (40323.8 ops/sec): 330.3 MB/s

$ /tmp/bssl.new speed SHA-
Did 5354000 SHA-1 (16 bytes) operations in 1000104us (5353443.2 ops/sec): 85.7 MB/s
Did 1779000 SHA-1 (256 bytes) operations in 1000121us (1778784.8 ops/sec): 455.4 MB/s
Did 87000 SHA-1 (8192 bytes) operations in 1012641us (85914.0 ops/sec): 703.8 MB/s
Did 3517000 SHA-256 (16 bytes) operations in 1000114us (3516599.1 ops/sec): 56.3 MB/s
Did 935000 SHA-256 (256 bytes) operations in 1000096us (934910.2 ops/sec): 239.3 MB/s
Did 38000 SHA-256 (8192 bytes) operations in 1004476us (37830.7 ops/sec): 309.9 MB/s
Did 2930000 SHA-512 (16 bytes) operations in 1000259us (2929241.3 ops/sec): 46.9 MB/s
Did 1008000 SHA-512 (256 bytes) operations in 1000509us (1007487.2 ops/sec): 257.9 MB/s
Did 45000 SHA-512 (8192 bytes) operations in 1000593us (44973.3 ops/sec): 368.4 MB/s

$ /tmp/bssl.old speed RSA
Did 820 RSA 2048 signing operations in 1017008us (806.3 ops/sec)
Did 27000 RSA 2048 verify operations in 1015400us (26590.5 ops/sec)
Did 1292 RSA 2048 (3 prime, e=3) signing operations in 1008185us (1281.5 ops/sec)
Did 65000 RSA 2048 (3 prime, e=3) verify operations in 1011388us (64268.1 ops/sec)
Did 120 RSA 4096 signing operations in 1061027us (113.1 ops/sec)
Did 8208 RSA 4096 verify operations in 1002717us (8185.8 ops/sec)

$ /tmp/bssl.new speed RSA
Did 760 RSA 2048 signing operations in 1003351us (757.5 ops/sec)
Did 25900 RSA 2048 verify operations in 1028931us (25171.8 ops/sec)
Did 1320 RSA 2048 (3 prime, e=3) signing operations in 1040806us (1268.2 ops/sec)
Did 63000 RSA 2048 (3 prime, e=3) verify operations in 1016042us (62005.3 ops/sec)
Did 104 RSA 4096 signing operations in 1008718us (103.1 ops/sec)
Did 6875 RSA 4096 verify operations in 1093441us (6287.5 ops/sec)

$ /tmp/bssl.old speed GCM
Did 5316000 AES-128-GCM (16 bytes) seal operations in 1000082us (5315564.1 ops/sec): 85.0 MB/s
Did 712000 AES-128-GCM (1350 bytes) seal operations in 1000252us (711820.6 ops/sec): 961.0 MB/s
Did 149000 AES-128-GCM (8192 bytes) seal operations in 1003182us (148527.4 ops/sec): 1216.7 MB/s
Did 5919750 AES-256-GCM (16 bytes) seal operations in 1000016us (5919655.3 ops/sec): 94.7 MB/s
Did 800000 AES-256-GCM (1350 bytes) seal operations in 1000951us (799239.9 ops/sec): 1079.0 MB/s
Did 152000 AES-256-GCM (8192 bytes) seal operations in 1000765us (151883.8 ops/sec): 1244.2 MB/s

$ /tmp/bssl.new speed GCM
Did 5315000 AES-128-GCM (16 bytes) seal operations in 1000125us (5314335.7 ops/sec): 85.0 MB/s
Did 755000 AES-128-GCM (1350 bytes) seal operations in 1000878us (754337.7 ops/sec): 1018.4 MB/s
Did 151000 AES-128-GCM (8192 bytes) seal operations in 1005655us (150150.9 ops/sec): 1230.0 MB/s
Did 5913500 AES-256-GCM (16 bytes) seal operations in 1000041us (5913257.6 ops/sec): 94.6 MB/s
Did 782000 AES-256-GCM (1350 bytes) seal operations in 1001484us (780841.2 ops/sec): 1054.1 MB/s
Did 121000 AES-256-GCM (8192 bytes) seal operations in 1006389us (120231.8 ops/sec): 984.9 MB/s

Change-Id: I0efb32f896c597abc7d7e55c31d038528a5c72a1
Reviewed-on: https://boringssl-review.googlesource.com/6260
Reviewed-by: Adam Langley <alangley@gmail.com>
#!/usr/bin/env perl

##############################################################################
# #
# Copyright (c) 2012, Intel Corporation #
# #
# All rights reserved. #
# #
# Redistribution and use in source and binary forms, with or without #
# modification, are permitted provided that the following conditions are #
# met: #
# #
# * Redistributions of source code must retain the above copyright #
# notice, this list of conditions and the following disclaimer. #
# #
# * Redistributions in binary form must reproduce the above copyright #
# notice, this list of conditions and the following disclaimer in the #
# documentation and/or other materials provided with the #
# distribution. #
# #
# * Neither the name of the Intel Corporation nor the names of its #
# contributors may be used to endorse or promote products derived from #
# this software without specific prior written permission. #
# #
# #
# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY #
# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE #
# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR #
# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR #
# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, #
# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, #
# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR #
# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF #
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING #
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS #
# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. #
# #
##############################################################################
# Developers and authors: #
# Shay Gueron (1, 2), and Vlad Krasnov (1) #
# (1) Intel Corporation, Israel Development Center, Haifa, Israel #
# (2) University of Haifa, Israel #
##############################################################################
# Reference: #
# [1] S. Gueron, V. Krasnov: "Software Implementation of Modular #
# Exponentiation, Using Advanced Vector Instructions Architectures", #
# F. Ozbudak and F. Rodriguez-Henriquez (Eds.): WAIFI 2012, LNCS 7369, #
# pp. 119-135, 2012. Springer-Verlag Berlin Heidelberg 2012 #
# [2] S. Gueron: "Efficient Software Implementations of Modular #
# Exponentiation", Journal of Cryptographic Engineering 2:31-43 (2012). #
# [3] S. Gueron, V. Krasnov: "Speeding up Big-numbers Squaring",IEEE #
# Proceedings of 9th International Conference on Information Technology: #
# New Generations (ITNG 2012), pp.821-823 (2012) #
# [4] S. Gueron, V. Krasnov: "[PATCH] Efficient and side channel analysis #
# resistant 1024-bit modular exponentiation, for optimizing RSA2048 #
# on AVX2 capable x86_64 platforms", #
# http://rt.openssl.org/Ticket/Display.html?id=2850&user=guest&pass=guest#
##############################################################################
#
# +13% improvement over original submission by <appro@openssl.org>
#
# rsa2048 sign/sec      OpenSSL 1.0.1   scalar(*)       this
# 2.3GHz Haswell        621             765/+23%        1113/+79%
# 2.3GHz Broadwell(**)  688             1200(***)/+74%  1120/+63%
#
# (*) if system doesn't support AVX2, for reference purposes;
# (**) scaled to 2.3GHz to simplify comparison;
# (***) scalar AD*X code is faster than AVX2 and is preferred code
# path for Broadwell;

$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$avx = 2;
$addx = 1;
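# For reference, upstream derives these values by probing the toolchain at
# build time. Roughly (paraphrased, not the exact upstream code -- see
# upstream's rsaz-avx2.pl):
#
#   if (`$ENV{CC} -Wa,-v -c -o /dev/null -x assembler /dev/null 2>&1`
#           =~ /GNU assembler version ([2-9]\.[0-9]+)/) {
#       $avx  = ($1>=2.19) + ($1>=2.22);
#       $addx = ($1>=2.23);
#   }
#
# plus similar clauses for clang, nasm and masm. With CC unset those probes
# fail and everything falls back to the most conservative setting, which is
# why pinning the values here changes the pre-generated assembly.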

open OUT,"| \"$^X\" $xlate $flavour $output";
*STDOUT = *OUT;

if ($avx>1) {{{
|
|
{ # void AMS_WW(
|
|
my $rp="%rdi"; # BN_ULONG *rp,
|
|
my $ap="%rsi"; # const BN_ULONG *ap,
|
|
my $np="%rdx"; # const BN_ULONG *np,
|
|
my $n0="%ecx"; # const BN_ULONG n0,
|
|
my $rep="%r8d"; # int repeat);
|
|
|
|
# The registers that hold the accumulated redundant result
|
|
# The AMM works on 1024 bit operands, and redundant word size is 29
|
|
# Therefore: ceil(1024/29)/4 = 9
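# i.e. a 1024-bit operand is split into ceil(1024/29) = 36 digits of 29 bits
# each, held one per 64-bit lane, four lanes per 256-bit ymm register, hence
# the nine accumulator registers below.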
|
|
my $ACC0="%ymm0";
|
|
my $ACC1="%ymm1";
|
|
my $ACC2="%ymm2";
|
|
my $ACC3="%ymm3";
|
|
my $ACC4="%ymm4";
|
|
my $ACC5="%ymm5";
|
|
my $ACC6="%ymm6";
|
|
my $ACC7="%ymm7";
|
|
my $ACC8="%ymm8";
|
|
my $ACC9="%ymm9";
|
|
# Registers that hold the broadcasted words of bp, currently used
|
|
my $B1="%ymm10";
|
|
my $B2="%ymm11";
|
|
# Registers that hold the broadcasted words of Y, currently used
|
|
my $Y1="%ymm12";
|
|
my $Y2="%ymm13";
|
|
# Helper registers
|
|
my $TEMP1="%ymm14";
|
|
my $AND_MASK="%ymm15";
|
|
# alu registers that hold the first words of the ACC
|
|
my $r0="%r9";
|
|
my $r1="%r10";
|
|
my $r2="%r11";
|
|
my $r3="%r12";
|
|
|
|
my $i="%r14d"; # loop counter
|
|
my $tmp = "%r15";
|
|
|
|
my $FrameSize=32*18+32*8; # place for A^2 and 2*A
|
|
|
|
my $aap=$r0;
|
|
my $tp0="%rbx";
|
|
my $tp1=$r3;
|
|
my $tpa=$tmp;
|
|
|
|
$np="%r13"; # reassigned argument
|
|
|
|
$code.=<<___;
|
|
.text
|
|
|
|
.globl rsaz_1024_sqr_avx2
|
|
.type rsaz_1024_sqr_avx2,\@function,5
|
|
.align 64
|
|
rsaz_1024_sqr_avx2: # 702 cycles, 14% faster than rsaz_1024_mul_avx2
|
|
lea (%rsp), %rax
|
|
push %rbx
|
|
push %rbp
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
vzeroupper
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
lea -0xa8(%rsp),%rsp
|
|
vmovaps %xmm6,-0xd8(%rax)
|
|
vmovaps %xmm7,-0xc8(%rax)
|
|
vmovaps %xmm8,-0xb8(%rax)
|
|
vmovaps %xmm9,-0xa8(%rax)
|
|
vmovaps %xmm10,-0x98(%rax)
|
|
vmovaps %xmm11,-0x88(%rax)
|
|
vmovaps %xmm12,-0x78(%rax)
|
|
vmovaps %xmm13,-0x68(%rax)
|
|
vmovaps %xmm14,-0x58(%rax)
|
|
vmovaps %xmm15,-0x48(%rax)
|
|
.Lsqr_1024_body:
|
|
___
|
|
$code.=<<___;
|
|
mov %rax,%rbp
|
|
mov %rdx, $np # reassigned argument
|
|
sub \$$FrameSize, %rsp
|
|
mov $np, $tmp
|
|
sub \$-128, $rp # size optimization
|
|
sub \$-128, $ap
|
|
sub \$-128, $np
|
|
|
|
and \$4095, $tmp # see if $np crosses page
|
|
add \$32*10, $tmp
|
|
shr \$12, $tmp
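	# non-zero iff ($np & 4095) + 32*10 >= 4096, i.e. the ten 32-byte
	# vectors loaded from $np may straddle a 4KB page (conservative test)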
|
|
vpxor $ACC9,$ACC9,$ACC9
|
|
jz .Lsqr_1024_no_n_copy
|
|
|
|
# unaligned 256-bit load that crosses page boundary can
|
|
# cause >2x performance degradation here, so if $np does
|
|
# cross page boundary, copy it to stack and make sure stack
|
|
# frame doesn't...
|
|
sub \$32*10,%rsp
|
|
vmovdqu 32*0-128($np), $ACC0
|
|
and \$-2048, %rsp
|
|
vmovdqu 32*1-128($np), $ACC1
|
|
vmovdqu 32*2-128($np), $ACC2
|
|
vmovdqu 32*3-128($np), $ACC3
|
|
vmovdqu 32*4-128($np), $ACC4
|
|
vmovdqu 32*5-128($np), $ACC5
|
|
vmovdqu 32*6-128($np), $ACC6
|
|
vmovdqu 32*7-128($np), $ACC7
|
|
vmovdqu 32*8-128($np), $ACC8
|
|
lea $FrameSize+128(%rsp),$np
|
|
vmovdqu $ACC0, 32*0-128($np)
|
|
vmovdqu $ACC1, 32*1-128($np)
|
|
vmovdqu $ACC2, 32*2-128($np)
|
|
vmovdqu $ACC3, 32*3-128($np)
|
|
vmovdqu $ACC4, 32*4-128($np)
|
|
vmovdqu $ACC5, 32*5-128($np)
|
|
vmovdqu $ACC6, 32*6-128($np)
|
|
vmovdqu $ACC7, 32*7-128($np)
|
|
vmovdqu $ACC8, 32*8-128($np)
|
|
vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero
|
|
|
|
.Lsqr_1024_no_n_copy:
|
|
and \$-1024, %rsp
|
|
|
|
vmovdqu 32*1-128($ap), $ACC1
|
|
vmovdqu 32*2-128($ap), $ACC2
|
|
vmovdqu 32*3-128($ap), $ACC3
|
|
vmovdqu 32*4-128($ap), $ACC4
|
|
vmovdqu 32*5-128($ap), $ACC5
|
|
vmovdqu 32*6-128($ap), $ACC6
|
|
vmovdqu 32*7-128($ap), $ACC7
|
|
vmovdqu 32*8-128($ap), $ACC8
|
|
|
|
lea 192(%rsp), $tp0 # 64+128=192
|
|
vpbroadcastq .Land_mask(%rip), $AND_MASK
|
|
jmp .LOOP_GRANDE_SQR_1024
|
|
|
|
.align 32
|
|
.LOOP_GRANDE_SQR_1024:
|
|
lea 32*18+128(%rsp), $aap # size optimization
|
|
lea 448(%rsp), $tp1 # 64+128+256=448
|
|
|
|
# the squaring is performed as described in Variant B of
|
|
# "Speeding up Big-Number Squaring", so start by calculating
|
|
# the A*2=A+A vector
|
|
vpaddq $ACC1, $ACC1, $ACC1
|
|
vpbroadcastq 32*0-128($ap), $B1
|
|
vpaddq $ACC2, $ACC2, $ACC2
|
|
vmovdqa $ACC1, 32*0-128($aap)
|
|
vpaddq $ACC3, $ACC3, $ACC3
|
|
vmovdqa $ACC2, 32*1-128($aap)
|
|
vpaddq $ACC4, $ACC4, $ACC4
|
|
vmovdqa $ACC3, 32*2-128($aap)
|
|
vpaddq $ACC5, $ACC5, $ACC5
|
|
vmovdqa $ACC4, 32*3-128($aap)
|
|
vpaddq $ACC6, $ACC6, $ACC6
|
|
vmovdqa $ACC5, 32*4-128($aap)
|
|
vpaddq $ACC7, $ACC7, $ACC7
|
|
vmovdqa $ACC6, 32*5-128($aap)
|
|
vpaddq $ACC8, $ACC8, $ACC8
|
|
vmovdqa $ACC7, 32*6-128($aap)
|
|
vpxor $ACC9, $ACC9, $ACC9
|
|
vmovdqa $ACC8, 32*7-128($aap)
|
|
|
|
vpmuludq 32*0-128($ap), $B1, $ACC0
|
|
vpbroadcastq 32*1-128($ap), $B2
|
|
vmovdqu $ACC9, 32*9-192($tp0) # zero upper half
|
|
vpmuludq $B1, $ACC1, $ACC1
|
|
vmovdqu $ACC9, 32*10-448($tp1)
|
|
vpmuludq $B1, $ACC2, $ACC2
|
|
vmovdqu $ACC9, 32*11-448($tp1)
|
|
vpmuludq $B1, $ACC3, $ACC3
|
|
vmovdqu $ACC9, 32*12-448($tp1)
|
|
vpmuludq $B1, $ACC4, $ACC4
|
|
vmovdqu $ACC9, 32*13-448($tp1)
|
|
vpmuludq $B1, $ACC5, $ACC5
|
|
vmovdqu $ACC9, 32*14-448($tp1)
|
|
vpmuludq $B1, $ACC6, $ACC6
|
|
vmovdqu $ACC9, 32*15-448($tp1)
|
|
vpmuludq $B1, $ACC7, $ACC7
|
|
vmovdqu $ACC9, 32*16-448($tp1)
|
|
vpmuludq $B1, $ACC8, $ACC8
|
|
vpbroadcastq 32*2-128($ap), $B1
|
|
vmovdqu $ACC9, 32*17-448($tp1)
|
|
|
|
mov $ap, $tpa
|
|
mov \$4, $i
|
|
jmp .Lsqr_entry_1024
|
|
___
|
|
$TEMP0=$Y1;
|
|
$TEMP2=$Y2;
|
|
$code.=<<___;
|
|
.align 32
|
|
.LOOP_SQR_1024:
|
|
vpbroadcastq 32*1-128($tpa), $B2
|
|
vpmuludq 32*0-128($ap), $B1, $ACC0
|
|
vpaddq 32*0-192($tp0), $ACC0, $ACC0
|
|
vpmuludq 32*0-128($aap), $B1, $ACC1
|
|
vpaddq 32*1-192($tp0), $ACC1, $ACC1
|
|
vpmuludq 32*1-128($aap), $B1, $ACC2
|
|
vpaddq 32*2-192($tp0), $ACC2, $ACC2
|
|
vpmuludq 32*2-128($aap), $B1, $ACC3
|
|
vpaddq 32*3-192($tp0), $ACC3, $ACC3
|
|
vpmuludq 32*3-128($aap), $B1, $ACC4
|
|
vpaddq 32*4-192($tp0), $ACC4, $ACC4
|
|
vpmuludq 32*4-128($aap), $B1, $ACC5
|
|
vpaddq 32*5-192($tp0), $ACC5, $ACC5
|
|
vpmuludq 32*5-128($aap), $B1, $ACC6
|
|
vpaddq 32*6-192($tp0), $ACC6, $ACC6
|
|
vpmuludq 32*6-128($aap), $B1, $ACC7
|
|
vpaddq 32*7-192($tp0), $ACC7, $ACC7
|
|
vpmuludq 32*7-128($aap), $B1, $ACC8
|
|
vpbroadcastq 32*2-128($tpa), $B1
|
|
vpaddq 32*8-192($tp0), $ACC8, $ACC8
|
|
.Lsqr_entry_1024:
|
|
vmovdqu $ACC0, 32*0-192($tp0)
|
|
vmovdqu $ACC1, 32*1-192($tp0)
|
|
|
|
vpmuludq 32*1-128($ap), $B2, $TEMP0
|
|
vpaddq $TEMP0, $ACC2, $ACC2
|
|
vpmuludq 32*1-128($aap), $B2, $TEMP1
|
|
vpaddq $TEMP1, $ACC3, $ACC3
|
|
vpmuludq 32*2-128($aap), $B2, $TEMP2
|
|
vpaddq $TEMP2, $ACC4, $ACC4
|
|
vpmuludq 32*3-128($aap), $B2, $TEMP0
|
|
vpaddq $TEMP0, $ACC5, $ACC5
|
|
vpmuludq 32*4-128($aap), $B2, $TEMP1
|
|
vpaddq $TEMP1, $ACC6, $ACC6
|
|
vpmuludq 32*5-128($aap), $B2, $TEMP2
|
|
vpaddq $TEMP2, $ACC7, $ACC7
|
|
vpmuludq 32*6-128($aap), $B2, $TEMP0
|
|
vpaddq $TEMP0, $ACC8, $ACC8
|
|
vpmuludq 32*7-128($aap), $B2, $ACC0
|
|
vpbroadcastq 32*3-128($tpa), $B2
|
|
vpaddq 32*9-192($tp0), $ACC0, $ACC0
|
|
|
|
vmovdqu $ACC2, 32*2-192($tp0)
|
|
vmovdqu $ACC3, 32*3-192($tp0)
|
|
|
|
vpmuludq 32*2-128($ap), $B1, $TEMP2
|
|
vpaddq $TEMP2, $ACC4, $ACC4
|
|
vpmuludq 32*2-128($aap), $B1, $TEMP0
|
|
vpaddq $TEMP0, $ACC5, $ACC5
|
|
vpmuludq 32*3-128($aap), $B1, $TEMP1
|
|
vpaddq $TEMP1, $ACC6, $ACC6
|
|
vpmuludq 32*4-128($aap), $B1, $TEMP2
|
|
vpaddq $TEMP2, $ACC7, $ACC7
|
|
vpmuludq 32*5-128($aap), $B1, $TEMP0
|
|
vpaddq $TEMP0, $ACC8, $ACC8
|
|
vpmuludq 32*6-128($aap), $B1, $TEMP1
|
|
vpaddq $TEMP1, $ACC0, $ACC0
|
|
vpmuludq 32*7-128($aap), $B1, $ACC1
|
|
vpbroadcastq 32*4-128($tpa), $B1
|
|
vpaddq 32*10-448($tp1), $ACC1, $ACC1
|
|
|
|
vmovdqu $ACC4, 32*4-192($tp0)
|
|
vmovdqu $ACC5, 32*5-192($tp0)
|
|
|
|
vpmuludq 32*3-128($ap), $B2, $TEMP0
|
|
vpaddq $TEMP0, $ACC6, $ACC6
|
|
vpmuludq 32*3-128($aap), $B2, $TEMP1
|
|
vpaddq $TEMP1, $ACC7, $ACC7
|
|
vpmuludq 32*4-128($aap), $B2, $TEMP2
|
|
vpaddq $TEMP2, $ACC8, $ACC8
|
|
vpmuludq 32*5-128($aap), $B2, $TEMP0
|
|
vpaddq $TEMP0, $ACC0, $ACC0
|
|
vpmuludq 32*6-128($aap), $B2, $TEMP1
|
|
vpaddq $TEMP1, $ACC1, $ACC1
|
|
vpmuludq 32*7-128($aap), $B2, $ACC2
|
|
vpbroadcastq 32*5-128($tpa), $B2
|
|
vpaddq 32*11-448($tp1), $ACC2, $ACC2
|
|
|
|
vmovdqu $ACC6, 32*6-192($tp0)
|
|
vmovdqu $ACC7, 32*7-192($tp0)
|
|
|
|
vpmuludq 32*4-128($ap), $B1, $TEMP0
|
|
vpaddq $TEMP0, $ACC8, $ACC8
|
|
vpmuludq 32*4-128($aap), $B1, $TEMP1
|
|
vpaddq $TEMP1, $ACC0, $ACC0
|
|
vpmuludq 32*5-128($aap), $B1, $TEMP2
|
|
vpaddq $TEMP2, $ACC1, $ACC1
|
|
vpmuludq 32*6-128($aap), $B1, $TEMP0
|
|
vpaddq $TEMP0, $ACC2, $ACC2
|
|
vpmuludq 32*7-128($aap), $B1, $ACC3
|
|
vpbroadcastq 32*6-128($tpa), $B1
|
|
vpaddq 32*12-448($tp1), $ACC3, $ACC3
|
|
|
|
vmovdqu $ACC8, 32*8-192($tp0)
|
|
vmovdqu $ACC0, 32*9-192($tp0)
|
|
lea 8($tp0), $tp0
|
|
|
|
vpmuludq 32*5-128($ap), $B2, $TEMP2
|
|
vpaddq $TEMP2, $ACC1, $ACC1
|
|
vpmuludq 32*5-128($aap), $B2, $TEMP0
|
|
vpaddq $TEMP0, $ACC2, $ACC2
|
|
vpmuludq 32*6-128($aap), $B2, $TEMP1
|
|
vpaddq $TEMP1, $ACC3, $ACC3
|
|
vpmuludq 32*7-128($aap), $B2, $ACC4
|
|
vpbroadcastq 32*7-128($tpa), $B2
|
|
vpaddq 32*13-448($tp1), $ACC4, $ACC4
|
|
|
|
vmovdqu $ACC1, 32*10-448($tp1)
|
|
vmovdqu $ACC2, 32*11-448($tp1)
|
|
|
|
vpmuludq 32*6-128($ap), $B1, $TEMP0
|
|
vpaddq $TEMP0, $ACC3, $ACC3
|
|
vpmuludq 32*6-128($aap), $B1, $TEMP1
|
|
vpbroadcastq 32*8-128($tpa), $ACC0 # borrow $ACC0 for $B1
|
|
vpaddq $TEMP1, $ACC4, $ACC4
|
|
vpmuludq 32*7-128($aap), $B1, $ACC5
|
|
vpbroadcastq 32*0+8-128($tpa), $B1 # for next iteration
|
|
vpaddq 32*14-448($tp1), $ACC5, $ACC5
|
|
|
|
vmovdqu $ACC3, 32*12-448($tp1)
|
|
vmovdqu $ACC4, 32*13-448($tp1)
|
|
lea 8($tpa), $tpa
|
|
|
|
vpmuludq 32*7-128($ap), $B2, $TEMP0
|
|
vpaddq $TEMP0, $ACC5, $ACC5
|
|
vpmuludq 32*7-128($aap), $B2, $ACC6
|
|
vpaddq 32*15-448($tp1), $ACC6, $ACC6
|
|
|
|
vpmuludq 32*8-128($ap), $ACC0, $ACC7
|
|
vmovdqu $ACC5, 32*14-448($tp1)
|
|
vpaddq 32*16-448($tp1), $ACC7, $ACC7
|
|
vmovdqu $ACC6, 32*15-448($tp1)
|
|
vmovdqu $ACC7, 32*16-448($tp1)
|
|
lea 8($tp1), $tp1
|
|
|
|
dec $i
|
|
jnz .LOOP_SQR_1024
|
|
___
|
|
$ZERO = $ACC9;
|
|
$TEMP0 = $B1;
|
|
$TEMP2 = $B2;
|
|
$TEMP3 = $Y1;
|
|
$TEMP4 = $Y2;
|
|
$code.=<<___;
|
|
#we need to fix indexes 32-39 to avoid overflow
|
|
vmovdqu 32*8(%rsp), $ACC8 # 32*8-192($tp0),
|
|
vmovdqu 32*9(%rsp), $ACC1 # 32*9-192($tp0)
|
|
vmovdqu 32*10(%rsp), $ACC2 # 32*10-192($tp0)
|
|
lea 192(%rsp), $tp0 # 64+128=192
|
|
|
|
vpsrlq \$29, $ACC8, $TEMP1
|
|
vpand $AND_MASK, $ACC8, $ACC8
|
|
vpsrlq \$29, $ACC1, $TEMP2
|
|
vpand $AND_MASK, $ACC1, $ACC1
|
|
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpxor $ZERO, $ZERO, $ZERO
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC8, $ACC8
|
|
vpblendd \$3, $TEMP2, $ZERO, $TEMP2
|
|
vpaddq $TEMP1, $ACC1, $ACC1
|
|
vpaddq $TEMP2, $ACC2, $ACC2
|
|
vmovdqu $ACC1, 32*9-192($tp0)
|
|
vmovdqu $ACC2, 32*10-192($tp0)
|
|
|
|
mov (%rsp), %rax
|
|
mov 8(%rsp), $r1
|
|
mov 16(%rsp), $r2
|
|
mov 24(%rsp), $r3
|
|
vmovdqu 32*1(%rsp), $ACC1
|
|
vmovdqu 32*2-192($tp0), $ACC2
|
|
vmovdqu 32*3-192($tp0), $ACC3
|
|
vmovdqu 32*4-192($tp0), $ACC4
|
|
vmovdqu 32*5-192($tp0), $ACC5
|
|
vmovdqu 32*6-192($tp0), $ACC6
|
|
vmovdqu 32*7-192($tp0), $ACC7
|
|
|
|
mov %rax, $r0
|
|
imull $n0, %eax
|
|
and \$0x1fffffff, %eax
|
|
vmovd %eax, $Y1
|
|
|
|
mov %rax, %rdx
|
|
imulq -128($np), %rax
|
|
vpbroadcastq $Y1, $Y1
|
|
add %rax, $r0
|
|
mov %rdx, %rax
|
|
imulq 8-128($np), %rax
|
|
shr \$29, $r0
|
|
add %rax, $r1
|
|
mov %rdx, %rax
|
|
imulq 16-128($np), %rax
|
|
add $r0, $r1
|
|
add %rax, $r2
|
|
imulq 24-128($np), %rdx
|
|
add %rdx, $r3
|
|
|
|
mov $r1, %rax
|
|
imull $n0, %eax
|
|
and \$0x1fffffff, %eax
|
|
|
|
mov \$9, $i
|
|
jmp .LOOP_REDUCE_1024
|
|
|
|
.align 32
|
|
.LOOP_REDUCE_1024:
|
|
vmovd %eax, $Y2
|
|
vpbroadcastq $Y2, $Y2
|
|
|
|
vpmuludq 32*1-128($np), $Y1, $TEMP0
|
|
mov %rax, %rdx
|
|
imulq -128($np), %rax
|
|
vpaddq $TEMP0, $ACC1, $ACC1
|
|
add %rax, $r1
|
|
vpmuludq 32*2-128($np), $Y1, $TEMP1
|
|
mov %rdx, %rax
|
|
imulq 8-128($np), %rax
|
|
vpaddq $TEMP1, $ACC2, $ACC2
|
|
vpmuludq 32*3-128($np), $Y1, $TEMP2
|
|
.byte 0x67
|
|
add %rax, $r2
|
|
.byte 0x67
|
|
mov %rdx, %rax
|
|
imulq 16-128($np), %rax
|
|
shr \$29, $r1
|
|
vpaddq $TEMP2, $ACC3, $ACC3
|
|
vpmuludq 32*4-128($np), $Y1, $TEMP0
|
|
add %rax, $r3
|
|
add $r1, $r2
|
|
vpaddq $TEMP0, $ACC4, $ACC4
|
|
vpmuludq 32*5-128($np), $Y1, $TEMP1
|
|
mov $r2, %rax
|
|
imull $n0, %eax
|
|
vpaddq $TEMP1, $ACC5, $ACC5
|
|
vpmuludq 32*6-128($np), $Y1, $TEMP2
|
|
and \$0x1fffffff, %eax
|
|
vpaddq $TEMP2, $ACC6, $ACC6
|
|
vpmuludq 32*7-128($np), $Y1, $TEMP0
|
|
vpaddq $TEMP0, $ACC7, $ACC7
|
|
vpmuludq 32*8-128($np), $Y1, $TEMP1
|
|
vmovd %eax, $Y1
|
|
#vmovdqu 32*1-8-128($np), $TEMP2 # moved below
|
|
vpaddq $TEMP1, $ACC8, $ACC8
|
|
#vmovdqu 32*2-8-128($np), $TEMP0 # moved below
|
|
vpbroadcastq $Y1, $Y1
|
|
|
|
vpmuludq 32*1-8-128($np), $Y2, $TEMP2 # see above
|
|
vmovdqu 32*3-8-128($np), $TEMP1
|
|
mov %rax, %rdx
|
|
imulq -128($np), %rax
|
|
vpaddq $TEMP2, $ACC1, $ACC1
|
|
vpmuludq 32*2-8-128($np), $Y2, $TEMP0 # see above
|
|
vmovdqu 32*4-8-128($np), $TEMP2
|
|
add %rax, $r2
|
|
mov %rdx, %rax
|
|
imulq 8-128($np), %rax
|
|
vpaddq $TEMP0, $ACC2, $ACC2
|
|
add $r3, %rax
|
|
shr \$29, $r2
|
|
vpmuludq $Y2, $TEMP1, $TEMP1
|
|
vmovdqu 32*5-8-128($np), $TEMP0
|
|
add $r2, %rax
|
|
vpaddq $TEMP1, $ACC3, $ACC3
|
|
vpmuludq $Y2, $TEMP2, $TEMP2
|
|
vmovdqu 32*6-8-128($np), $TEMP1
|
|
.byte 0x67
|
|
mov %rax, $r3
|
|
imull $n0, %eax
|
|
vpaddq $TEMP2, $ACC4, $ACC4
|
|
vpmuludq $Y2, $TEMP0, $TEMP0
|
|
.byte 0xc4,0x41,0x7e,0x6f,0x9d,0x58,0x00,0x00,0x00 # vmovdqu 32*7-8-128($np), $TEMP2
|
|
and \$0x1fffffff, %eax
|
|
vpaddq $TEMP0, $ACC5, $ACC5
|
|
vpmuludq $Y2, $TEMP1, $TEMP1
|
|
vmovdqu 32*8-8-128($np), $TEMP0
|
|
vpaddq $TEMP1, $ACC6, $ACC6
|
|
vpmuludq $Y2, $TEMP2, $TEMP2
|
|
vmovdqu 32*9-8-128($np), $ACC9
|
|
vmovd %eax, $ACC0 # borrow ACC0 for Y2
|
|
imulq -128($np), %rax
|
|
vpaddq $TEMP2, $ACC7, $ACC7
|
|
vpmuludq $Y2, $TEMP0, $TEMP0
|
|
vmovdqu 32*1-16-128($np), $TEMP1
|
|
vpbroadcastq $ACC0, $ACC0
|
|
vpaddq $TEMP0, $ACC8, $ACC8
|
|
vpmuludq $Y2, $ACC9, $ACC9
|
|
vmovdqu 32*2-16-128($np), $TEMP2
|
|
add %rax, $r3
|
|
|
|
___
|
|
($ACC0,$Y2)=($Y2,$ACC0);
|
|
$code.=<<___;
|
|
vmovdqu 32*1-24-128($np), $ACC0
|
|
vpmuludq $Y1, $TEMP1, $TEMP1
|
|
vmovdqu 32*3-16-128($np), $TEMP0
|
|
vpaddq $TEMP1, $ACC1, $ACC1
|
|
vpmuludq $Y2, $ACC0, $ACC0
|
|
vpmuludq $Y1, $TEMP2, $TEMP2
|
|
.byte 0xc4,0x41,0x7e,0x6f,0xb5,0xf0,0xff,0xff,0xff # vmovdqu 32*4-16-128($np), $TEMP1
|
|
vpaddq $ACC1, $ACC0, $ACC0
|
|
vpaddq $TEMP2, $ACC2, $ACC2
|
|
vpmuludq $Y1, $TEMP0, $TEMP0
|
|
vmovdqu 32*5-16-128($np), $TEMP2
|
|
.byte 0x67
|
|
vmovq $ACC0, %rax
|
|
vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
|
|
vpaddq $TEMP0, $ACC3, $ACC3
|
|
vpmuludq $Y1, $TEMP1, $TEMP1
|
|
vmovdqu 32*6-16-128($np), $TEMP0
|
|
vpaddq $TEMP1, $ACC4, $ACC4
|
|
vpmuludq $Y1, $TEMP2, $TEMP2
|
|
vmovdqu 32*7-16-128($np), $TEMP1
|
|
vpaddq $TEMP2, $ACC5, $ACC5
|
|
vpmuludq $Y1, $TEMP0, $TEMP0
|
|
vmovdqu 32*8-16-128($np), $TEMP2
|
|
vpaddq $TEMP0, $ACC6, $ACC6
|
|
vpmuludq $Y1, $TEMP1, $TEMP1
|
|
shr \$29, $r3
|
|
vmovdqu 32*9-16-128($np), $TEMP0
|
|
add $r3, %rax
|
|
vpaddq $TEMP1, $ACC7, $ACC7
|
|
vpmuludq $Y1, $TEMP2, $TEMP2
|
|
#vmovdqu 32*2-24-128($np), $TEMP1 # moved below
|
|
mov %rax, $r0
|
|
imull $n0, %eax
|
|
vpaddq $TEMP2, $ACC8, $ACC8
|
|
vpmuludq $Y1, $TEMP0, $TEMP0
|
|
and \$0x1fffffff, %eax
|
|
vmovd %eax, $Y1
|
|
vmovdqu 32*3-24-128($np), $TEMP2
|
|
.byte 0x67
|
|
vpaddq $TEMP0, $ACC9, $ACC9
|
|
vpbroadcastq $Y1, $Y1
|
|
|
|
vpmuludq 32*2-24-128($np), $Y2, $TEMP1 # see above
|
|
vmovdqu 32*4-24-128($np), $TEMP0
|
|
mov %rax, %rdx
|
|
imulq -128($np), %rax
|
|
mov 8(%rsp), $r1
|
|
vpaddq $TEMP1, $ACC2, $ACC1
|
|
vpmuludq $Y2, $TEMP2, $TEMP2
|
|
vmovdqu 32*5-24-128($np), $TEMP1
|
|
add %rax, $r0
|
|
mov %rdx, %rax
|
|
imulq 8-128($np), %rax
|
|
.byte 0x67
|
|
shr \$29, $r0
|
|
mov 16(%rsp), $r2
|
|
vpaddq $TEMP2, $ACC3, $ACC2
|
|
vpmuludq $Y2, $TEMP0, $TEMP0
|
|
vmovdqu 32*6-24-128($np), $TEMP2
|
|
add %rax, $r1
|
|
mov %rdx, %rax
|
|
imulq 16-128($np), %rax
|
|
vpaddq $TEMP0, $ACC4, $ACC3
|
|
vpmuludq $Y2, $TEMP1, $TEMP1
|
|
vmovdqu 32*7-24-128($np), $TEMP0
|
|
imulq 24-128($np), %rdx # future $r3
|
|
add %rax, $r2
|
|
lea ($r0,$r1), %rax
|
|
vpaddq $TEMP1, $ACC5, $ACC4
|
|
vpmuludq $Y2, $TEMP2, $TEMP2
|
|
vmovdqu 32*8-24-128($np), $TEMP1
|
|
mov %rax, $r1
|
|
imull $n0, %eax
|
|
vpmuludq $Y2, $TEMP0, $TEMP0
|
|
vpaddq $TEMP2, $ACC6, $ACC5
|
|
vmovdqu 32*9-24-128($np), $TEMP2
|
|
and \$0x1fffffff, %eax
|
|
vpaddq $TEMP0, $ACC7, $ACC6
|
|
vpmuludq $Y2, $TEMP1, $TEMP1
|
|
add 24(%rsp), %rdx
|
|
vpaddq $TEMP1, $ACC8, $ACC7
|
|
vpmuludq $Y2, $TEMP2, $TEMP2
|
|
vpaddq $TEMP2, $ACC9, $ACC8
|
|
vmovq $r3, $ACC9
|
|
mov %rdx, $r3
|
|
|
|
dec $i
|
|
jnz .LOOP_REDUCE_1024
|
|
___
|
|
($ACC0,$Y2)=($Y2,$ACC0);
|
|
$code.=<<___;
|
|
lea 448(%rsp), $tp1 # size optimization
|
|
vpaddq $ACC9, $Y2, $ACC0
|
|
vpxor $ZERO, $ZERO, $ZERO
|
|
|
|
vpaddq 32*9-192($tp0), $ACC0, $ACC0
|
|
vpaddq 32*10-448($tp1), $ACC1, $ACC1
|
|
vpaddq 32*11-448($tp1), $ACC2, $ACC2
|
|
vpaddq 32*12-448($tp1), $ACC3, $ACC3
|
|
vpaddq 32*13-448($tp1), $ACC4, $ACC4
|
|
vpaddq 32*14-448($tp1), $ACC5, $ACC5
|
|
vpaddq 32*15-448($tp1), $ACC6, $ACC6
|
|
vpaddq 32*16-448($tp1), $ACC7, $ACC7
|
|
vpaddq 32*17-448($tp1), $ACC8, $ACC8
|
|
|
|
vpsrlq \$29, $ACC0, $TEMP1
|
|
vpand $AND_MASK, $ACC0, $ACC0
|
|
vpsrlq \$29, $ACC1, $TEMP2
|
|
vpand $AND_MASK, $ACC1, $ACC1
|
|
vpsrlq \$29, $ACC2, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC2, $ACC2
|
|
vpsrlq \$29, $ACC3, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC3, $ACC3
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC0, $ACC0
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC1, $ACC1
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC2, $ACC2
|
|
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
|
|
vpaddq $TEMP3, $ACC3, $ACC3
|
|
vpaddq $TEMP4, $ACC4, $ACC4
|
|
|
|
vpsrlq \$29, $ACC0, $TEMP1
|
|
vpand $AND_MASK, $ACC0, $ACC0
|
|
vpsrlq \$29, $ACC1, $TEMP2
|
|
vpand $AND_MASK, $ACC1, $ACC1
|
|
vpsrlq \$29, $ACC2, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC2, $ACC2
|
|
vpsrlq \$29, $ACC3, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC3, $ACC3
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC0, $ACC0
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC1, $ACC1
|
|
vmovdqu $ACC0, 32*0-128($rp)
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC2, $ACC2
|
|
vmovdqu $ACC1, 32*1-128($rp)
|
|
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
|
|
vpaddq $TEMP3, $ACC3, $ACC3
|
|
vmovdqu $ACC2, 32*2-128($rp)
|
|
vpaddq $TEMP4, $ACC4, $ACC4
|
|
vmovdqu $ACC3, 32*3-128($rp)
|
|
___
|
|
$TEMP5=$ACC0;
|
|
$code.=<<___;
|
|
vpsrlq \$29, $ACC4, $TEMP1
|
|
vpand $AND_MASK, $ACC4, $ACC4
|
|
vpsrlq \$29, $ACC5, $TEMP2
|
|
vpand $AND_MASK, $ACC5, $ACC5
|
|
vpsrlq \$29, $ACC6, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC6, $ACC6
|
|
vpsrlq \$29, $ACC7, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC7, $ACC7
|
|
vpsrlq \$29, $ACC8, $TEMP5
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
vpand $AND_MASK, $ACC8, $ACC8
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP5, $TEMP5
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC4, $ACC4
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC5, $ACC5
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC6, $ACC6
|
|
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
|
|
vpaddq $TEMP3, $ACC7, $ACC7
|
|
vpaddq $TEMP4, $ACC8, $ACC8
|
|
|
|
vpsrlq \$29, $ACC4, $TEMP1
|
|
vpand $AND_MASK, $ACC4, $ACC4
|
|
vpsrlq \$29, $ACC5, $TEMP2
|
|
vpand $AND_MASK, $ACC5, $ACC5
|
|
vpsrlq \$29, $ACC6, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC6, $ACC6
|
|
vpsrlq \$29, $ACC7, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC7, $ACC7
|
|
vpsrlq \$29, $ACC8, $TEMP5
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
vpand $AND_MASK, $ACC8, $ACC8
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP5, $TEMP5
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC4, $ACC4
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC5, $ACC5
|
|
vmovdqu $ACC4, 32*4-128($rp)
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC6, $ACC6
|
|
vmovdqu $ACC5, 32*5-128($rp)
|
|
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
|
|
vpaddq $TEMP3, $ACC7, $ACC7
|
|
vmovdqu $ACC6, 32*6-128($rp)
|
|
vpaddq $TEMP4, $ACC8, $ACC8
|
|
vmovdqu $ACC7, 32*7-128($rp)
|
|
vmovdqu $ACC8, 32*8-128($rp)
|
|
|
|
mov $rp, $ap
|
|
dec $rep
|
|
jne .LOOP_GRANDE_SQR_1024
|
|
|
|
vzeroall
|
|
mov %rbp, %rax
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
movaps -0xd8(%rax),%xmm6
|
|
movaps -0xc8(%rax),%xmm7
|
|
movaps -0xb8(%rax),%xmm8
|
|
movaps -0xa8(%rax),%xmm9
|
|
movaps -0x98(%rax),%xmm10
|
|
movaps -0x88(%rax),%xmm11
|
|
movaps -0x78(%rax),%xmm12
|
|
movaps -0x68(%rax),%xmm13
|
|
movaps -0x58(%rax),%xmm14
|
|
movaps -0x48(%rax),%xmm15
|
|
___
|
|
$code.=<<___;
|
|
mov -48(%rax),%r15
|
|
mov -40(%rax),%r14
|
|
mov -32(%rax),%r13
|
|
mov -24(%rax),%r12
|
|
mov -16(%rax),%rbp
|
|
mov -8(%rax),%rbx
|
|
lea (%rax),%rsp # restore %rsp
|
|
.Lsqr_1024_epilogue:
|
|
ret
|
|
.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
|
|
___
|
|
}
|
|
|
|
{ # void AMM_WW(
|
|
my $rp="%rdi"; # BN_ULONG *rp,
|
|
my $ap="%rsi"; # const BN_ULONG *ap,
|
|
my $bp="%rdx"; # const BN_ULONG *bp,
|
|
my $np="%rcx"; # const BN_ULONG *np,
|
|
my $n0="%r8d"; # unsigned int n0);
|
|
|
|
# The registers that hold the accumulated redundant result
|
|
# The AMM works on 1024 bit operands, and redundant word size is 29
|
|
# Therefore: ceil(1024/29)/4 = 9
|
|
my $ACC0="%ymm0";
|
|
my $ACC1="%ymm1";
|
|
my $ACC2="%ymm2";
|
|
my $ACC3="%ymm3";
|
|
my $ACC4="%ymm4";
|
|
my $ACC5="%ymm5";
|
|
my $ACC6="%ymm6";
|
|
my $ACC7="%ymm7";
|
|
my $ACC8="%ymm8";
|
|
my $ACC9="%ymm9";
|
|
|
|
# Registers that hold the broadcasted words of multiplier, currently used
|
|
my $Bi="%ymm10";
|
|
my $Yi="%ymm11";
|
|
|
|
# Helper registers
|
|
my $TEMP0=$ACC0;
|
|
my $TEMP1="%ymm12";
|
|
my $TEMP2="%ymm13";
|
|
my $ZERO="%ymm14";
|
|
my $AND_MASK="%ymm15";
|
|
|
|
# alu registers that hold the first words of the ACC
|
|
my $r0="%r9";
|
|
my $r1="%r10";
|
|
my $r2="%r11";
|
|
my $r3="%r12";
|
|
|
|
my $i="%r14d";
|
|
my $tmp="%r15";
|
|
|
|
$bp="%r13"; # reassigned argument
|
|
|
|
$code.=<<___;
|
|
.globl rsaz_1024_mul_avx2
|
|
.type rsaz_1024_mul_avx2,\@function,5
|
|
.align 64
|
|
rsaz_1024_mul_avx2:
|
|
lea (%rsp), %rax
|
|
push %rbx
|
|
push %rbp
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
vzeroupper
|
|
lea -0xa8(%rsp),%rsp
|
|
vmovaps %xmm6,-0xd8(%rax)
|
|
vmovaps %xmm7,-0xc8(%rax)
|
|
vmovaps %xmm8,-0xb8(%rax)
|
|
vmovaps %xmm9,-0xa8(%rax)
|
|
vmovaps %xmm10,-0x98(%rax)
|
|
vmovaps %xmm11,-0x88(%rax)
|
|
vmovaps %xmm12,-0x78(%rax)
|
|
vmovaps %xmm13,-0x68(%rax)
|
|
vmovaps %xmm14,-0x58(%rax)
|
|
vmovaps %xmm15,-0x48(%rax)
|
|
.Lmul_1024_body:
|
|
___
|
|
$code.=<<___;
|
|
mov %rax,%rbp
|
|
vzeroall
|
|
mov %rdx, $bp # reassigned argument
|
|
sub \$64,%rsp
|
|
|
|
# unaligned 256-bit load that crosses page boundary can
|
|
# cause severe performance degradation here, so if $ap does
|
|
# cross page boundary, swap it with $bp [meaning that caller
|
|
# is advised to lay down $ap and $bp next to each other, so
|
|
# that only one can cross page boundary].
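# (Swapping is possible here, unlike in the squaring routine above, because
# the Montgomery product is symmetric in $ap and $bp; $np still gets the
# copy-to-stack treatment below if it crosses a page.)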
|
|
.byte 0x67,0x67
|
|
mov $ap, $tmp
|
|
and \$4095, $tmp
|
|
add \$32*10, $tmp
|
|
shr \$12, $tmp
|
|
mov $ap, $tmp
|
|
cmovnz $bp, $ap
|
|
cmovnz $tmp, $bp
|
|
|
|
mov $np, $tmp
|
|
sub \$-128,$ap # size optimization
|
|
sub \$-128,$np
|
|
sub \$-128,$rp
|
|
|
|
and \$4095, $tmp # see if $np crosses page
|
|
add \$32*10, $tmp
|
|
.byte 0x67,0x67
|
|
shr \$12, $tmp
|
|
jz .Lmul_1024_no_n_copy
|
|
|
|
# unaligned 256-bit load that crosses page boundary can
|
|
# cause severe performance degradation here, so if $np does
|
|
# cross page boundary, copy it to stack and make sure stack
|
|
# frame doesn't...
|
|
sub \$32*10,%rsp
|
|
vmovdqu 32*0-128($np), $ACC0
|
|
and \$-512, %rsp
|
|
vmovdqu 32*1-128($np), $ACC1
|
|
vmovdqu 32*2-128($np), $ACC2
|
|
vmovdqu 32*3-128($np), $ACC3
|
|
vmovdqu 32*4-128($np), $ACC4
|
|
vmovdqu 32*5-128($np), $ACC5
|
|
vmovdqu 32*6-128($np), $ACC6
|
|
vmovdqu 32*7-128($np), $ACC7
|
|
vmovdqu 32*8-128($np), $ACC8
|
|
lea 64+128(%rsp),$np
|
|
vmovdqu $ACC0, 32*0-128($np)
|
|
vpxor $ACC0, $ACC0, $ACC0
|
|
vmovdqu $ACC1, 32*1-128($np)
|
|
vpxor $ACC1, $ACC1, $ACC1
|
|
vmovdqu $ACC2, 32*2-128($np)
|
|
vpxor $ACC2, $ACC2, $ACC2
|
|
vmovdqu $ACC3, 32*3-128($np)
|
|
vpxor $ACC3, $ACC3, $ACC3
|
|
vmovdqu $ACC4, 32*4-128($np)
|
|
vpxor $ACC4, $ACC4, $ACC4
|
|
vmovdqu $ACC5, 32*5-128($np)
|
|
vpxor $ACC5, $ACC5, $ACC5
|
|
vmovdqu $ACC6, 32*6-128($np)
|
|
vpxor $ACC6, $ACC6, $ACC6
|
|
vmovdqu $ACC7, 32*7-128($np)
|
|
vpxor $ACC7, $ACC7, $ACC7
|
|
vmovdqu $ACC8, 32*8-128($np)
|
|
vmovdqa $ACC0, $ACC8
|
|
vmovdqu $ACC9, 32*9-128($np) # $ACC9 is zero after vzeroall
|
|
.Lmul_1024_no_n_copy:
|
|
and \$-64,%rsp
|
|
|
|
mov ($bp), %rbx
|
|
vpbroadcastq ($bp), $Bi
|
|
vmovdqu $ACC0, (%rsp) # clear top of stack
|
|
xor $r0, $r0
|
|
.byte 0x67
|
|
xor $r1, $r1
|
|
xor $r2, $r2
|
|
xor $r3, $r3
|
|
|
|
vmovdqu .Land_mask(%rip), $AND_MASK
|
|
mov \$9, $i
|
|
vmovdqu $ACC9, 32*9-128($rp) # $ACC9 is zero after vzeroall
|
|
jmp .Loop_mul_1024
|
|
|
|
.align 32
|
|
.Loop_mul_1024:
|
|
vpsrlq \$29, $ACC3, $ACC9 # correct $ACC3(*)
|
|
mov %rbx, %rax
|
|
imulq -128($ap), %rax
|
|
add $r0, %rax
|
|
mov %rbx, $r1
|
|
imulq 8-128($ap), $r1
|
|
add 8(%rsp), $r1
|
|
|
|
mov %rax, $r0
|
|
imull $n0, %eax
|
|
and \$0x1fffffff, %eax
|
|
|
|
mov %rbx, $r2
|
|
imulq 16-128($ap), $r2
|
|
add 16(%rsp), $r2
|
|
|
|
mov %rbx, $r3
|
|
imulq 24-128($ap), $r3
|
|
add 24(%rsp), $r3
|
|
vpmuludq 32*1-128($ap),$Bi,$TEMP0
|
|
vmovd %eax, $Yi
|
|
vpaddq $TEMP0,$ACC1,$ACC1
|
|
vpmuludq 32*2-128($ap),$Bi,$TEMP1
|
|
vpbroadcastq $Yi, $Yi
|
|
vpaddq $TEMP1,$ACC2,$ACC2
|
|
vpmuludq 32*3-128($ap),$Bi,$TEMP2
|
|
vpand $AND_MASK, $ACC3, $ACC3 # correct $ACC3
|
|
vpaddq $TEMP2,$ACC3,$ACC3
|
|
vpmuludq 32*4-128($ap),$Bi,$TEMP0
|
|
vpaddq $TEMP0,$ACC4,$ACC4
|
|
vpmuludq 32*5-128($ap),$Bi,$TEMP1
|
|
vpaddq $TEMP1,$ACC5,$ACC5
|
|
vpmuludq 32*6-128($ap),$Bi,$TEMP2
|
|
vpaddq $TEMP2,$ACC6,$ACC6
|
|
vpmuludq 32*7-128($ap),$Bi,$TEMP0
|
|
vpermq \$0x93, $ACC9, $ACC9 # correct $ACC3
|
|
vpaddq $TEMP0,$ACC7,$ACC7
|
|
vpmuludq 32*8-128($ap),$Bi,$TEMP1
|
|
vpbroadcastq 8($bp), $Bi
|
|
vpaddq $TEMP1,$ACC8,$ACC8
|
|
|
|
mov %rax,%rdx
|
|
imulq -128($np),%rax
|
|
add %rax,$r0
|
|
mov %rdx,%rax
|
|
imulq 8-128($np),%rax
|
|
add %rax,$r1
|
|
mov %rdx,%rax
|
|
imulq 16-128($np),%rax
|
|
add %rax,$r2
|
|
shr \$29, $r0
|
|
imulq 24-128($np),%rdx
|
|
add %rdx,$r3
|
|
add $r0, $r1
|
|
|
|
vpmuludq 32*1-128($np),$Yi,$TEMP2
|
|
vmovq $Bi, %rbx
|
|
vpaddq $TEMP2,$ACC1,$ACC1
|
|
vpmuludq 32*2-128($np),$Yi,$TEMP0
|
|
vpaddq $TEMP0,$ACC2,$ACC2
|
|
vpmuludq 32*3-128($np),$Yi,$TEMP1
|
|
vpaddq $TEMP1,$ACC3,$ACC3
|
|
vpmuludq 32*4-128($np),$Yi,$TEMP2
|
|
vpaddq $TEMP2,$ACC4,$ACC4
|
|
vpmuludq 32*5-128($np),$Yi,$TEMP0
|
|
vpaddq $TEMP0,$ACC5,$ACC5
|
|
vpmuludq 32*6-128($np),$Yi,$TEMP1
|
|
vpaddq $TEMP1,$ACC6,$ACC6
|
|
vpmuludq 32*7-128($np),$Yi,$TEMP2
|
|
vpblendd \$3, $ZERO, $ACC9, $ACC9 # correct $ACC3
|
|
vpaddq $TEMP2,$ACC7,$ACC7
|
|
vpmuludq 32*8-128($np),$Yi,$TEMP0
|
|
vpaddq $ACC9, $ACC3, $ACC3 # correct $ACC3
|
|
vpaddq $TEMP0,$ACC8,$ACC8
|
|
|
|
mov %rbx, %rax
|
|
imulq -128($ap),%rax
|
|
add %rax,$r1
|
|
vmovdqu -8+32*1-128($ap),$TEMP1
|
|
mov %rbx, %rax
|
|
imulq 8-128($ap),%rax
|
|
add %rax,$r2
|
|
vmovdqu -8+32*2-128($ap),$TEMP2
|
|
|
|
mov $r1, %rax
|
|
imull $n0, %eax
|
|
and \$0x1fffffff, %eax
|
|
|
|
imulq 16-128($ap),%rbx
|
|
add %rbx,$r3
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vmovd %eax, $Yi
|
|
vmovdqu -8+32*3-128($ap),$TEMP0
|
|
vpaddq $TEMP1,$ACC1,$ACC1
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vpbroadcastq $Yi, $Yi
|
|
vmovdqu -8+32*4-128($ap),$TEMP1
|
|
vpaddq $TEMP2,$ACC2,$ACC2
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovdqu -8+32*5-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC3,$ACC3
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vmovdqu -8+32*6-128($ap),$TEMP0
|
|
vpaddq $TEMP1,$ACC4,$ACC4
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vmovdqu -8+32*7-128($ap),$TEMP1
|
|
vpaddq $TEMP2,$ACC5,$ACC5
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovdqu -8+32*8-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC6,$ACC6
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vmovdqu -8+32*9-128($ap),$ACC9
|
|
vpaddq $TEMP1,$ACC7,$ACC7
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vpaddq $TEMP2,$ACC8,$ACC8
|
|
vpmuludq $Bi,$ACC9,$ACC9
|
|
vpbroadcastq 16($bp), $Bi
|
|
|
|
mov %rax,%rdx
|
|
imulq -128($np),%rax
|
|
add %rax,$r1
|
|
vmovdqu -8+32*1-128($np),$TEMP0
|
|
mov %rdx,%rax
|
|
imulq 8-128($np),%rax
|
|
add %rax,$r2
|
|
vmovdqu -8+32*2-128($np),$TEMP1
|
|
shr \$29, $r1
|
|
imulq 16-128($np),%rdx
|
|
add %rdx,$r3
|
|
add $r1, $r2
|
|
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovq $Bi, %rbx
|
|
vmovdqu -8+32*3-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC1,$ACC1
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vmovdqu -8+32*4-128($np),$TEMP0
|
|
vpaddq $TEMP1,$ACC2,$ACC2
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovdqu -8+32*5-128($np),$TEMP1
|
|
vpaddq $TEMP2,$ACC3,$ACC3
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovdqu -8+32*6-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC4,$ACC4
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vmovdqu -8+32*7-128($np),$TEMP0
|
|
vpaddq $TEMP1,$ACC5,$ACC5
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovdqu -8+32*8-128($np),$TEMP1
|
|
vpaddq $TEMP2,$ACC6,$ACC6
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovdqu -8+32*9-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC7,$ACC7
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vpaddq $TEMP1,$ACC8,$ACC8
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vpaddq $TEMP2,$ACC9,$ACC9
|
|
|
|
vmovdqu -16+32*1-128($ap),$TEMP0
|
|
mov %rbx,%rax
|
|
imulq -128($ap),%rax
|
|
add $r2,%rax
|
|
|
|
vmovdqu -16+32*2-128($ap),$TEMP1
|
|
mov %rax,$r2
|
|
imull $n0, %eax
|
|
and \$0x1fffffff, %eax
|
|
|
|
imulq 8-128($ap),%rbx
|
|
add %rbx,$r3
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovd %eax, $Yi
|
|
vmovdqu -16+32*3-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC1,$ACC1
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vpbroadcastq $Yi, $Yi
|
|
vmovdqu -16+32*4-128($ap),$TEMP0
|
|
vpaddq $TEMP1,$ACC2,$ACC2
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vmovdqu -16+32*5-128($ap),$TEMP1
|
|
vpaddq $TEMP2,$ACC3,$ACC3
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovdqu -16+32*6-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC4,$ACC4
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vmovdqu -16+32*7-128($ap),$TEMP0
|
|
vpaddq $TEMP1,$ACC5,$ACC5
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vmovdqu -16+32*8-128($ap),$TEMP1
|
|
vpaddq $TEMP2,$ACC6,$ACC6
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovdqu -16+32*9-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC7,$ACC7
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vpaddq $TEMP1,$ACC8,$ACC8
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vpbroadcastq 24($bp), $Bi
|
|
vpaddq $TEMP2,$ACC9,$ACC9
|
|
|
|
vmovdqu -16+32*1-128($np),$TEMP0
|
|
mov %rax,%rdx
|
|
imulq -128($np),%rax
|
|
add %rax,$r2
|
|
vmovdqu -16+32*2-128($np),$TEMP1
|
|
imulq 8-128($np),%rdx
|
|
add %rdx,$r3
|
|
shr \$29, $r2
|
|
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovq $Bi, %rbx
|
|
vmovdqu -16+32*3-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC1,$ACC1
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vmovdqu -16+32*4-128($np),$TEMP0
|
|
vpaddq $TEMP1,$ACC2,$ACC2
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovdqu -16+32*5-128($np),$TEMP1
|
|
vpaddq $TEMP2,$ACC3,$ACC3
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovdqu -16+32*6-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC4,$ACC4
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vmovdqu -16+32*7-128($np),$TEMP0
|
|
vpaddq $TEMP1,$ACC5,$ACC5
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovdqu -16+32*8-128($np),$TEMP1
|
|
vpaddq $TEMP2,$ACC6,$ACC6
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovdqu -16+32*9-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC7,$ACC7
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vmovdqu -24+32*1-128($ap),$TEMP0
|
|
vpaddq $TEMP1,$ACC8,$ACC8
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovdqu -24+32*2-128($ap),$TEMP1
|
|
vpaddq $TEMP2,$ACC9,$ACC9
|
|
|
|
add $r2, $r3
|
|
imulq -128($ap),%rbx
|
|
add %rbx,$r3
|
|
|
|
mov $r3, %rax
|
|
imull $n0, %eax
|
|
and \$0x1fffffff, %eax
|
|
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovd %eax, $Yi
|
|
vmovdqu -24+32*3-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC1,$ACC1
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vpbroadcastq $Yi, $Yi
|
|
vmovdqu -24+32*4-128($ap),$TEMP0
|
|
vpaddq $TEMP1,$ACC2,$ACC2
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vmovdqu -24+32*5-128($ap),$TEMP1
|
|
vpaddq $TEMP2,$ACC3,$ACC3
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovdqu -24+32*6-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC4,$ACC4
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vmovdqu -24+32*7-128($ap),$TEMP0
|
|
vpaddq $TEMP1,$ACC5,$ACC5
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vmovdqu -24+32*8-128($ap),$TEMP1
|
|
vpaddq $TEMP2,$ACC6,$ACC6
|
|
vpmuludq $Bi,$TEMP0,$TEMP0
|
|
vmovdqu -24+32*9-128($ap),$TEMP2
|
|
vpaddq $TEMP0,$ACC7,$ACC7
|
|
vpmuludq $Bi,$TEMP1,$TEMP1
|
|
vpaddq $TEMP1,$ACC8,$ACC8
|
|
vpmuludq $Bi,$TEMP2,$TEMP2
|
|
vpbroadcastq 32($bp), $Bi
|
|
vpaddq $TEMP2,$ACC9,$ACC9
|
|
add \$32, $bp # $bp++
|
|
|
|
vmovdqu -24+32*1-128($np),$TEMP0
|
|
imulq -128($np),%rax
|
|
add %rax,$r3
|
|
shr \$29, $r3
|
|
|
|
vmovdqu -24+32*2-128($np),$TEMP1
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovq $Bi, %rbx
|
|
vmovdqu -24+32*3-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC1,$ACC0 # $ACC0==$TEMP0
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vmovdqu $ACC0, (%rsp) # transfer $r0-$r3
|
|
vpaddq $TEMP1,$ACC2,$ACC1
|
|
vmovdqu -24+32*4-128($np),$TEMP0
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovdqu -24+32*5-128($np),$TEMP1
|
|
vpaddq $TEMP2,$ACC3,$ACC2
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovdqu -24+32*6-128($np),$TEMP2
|
|
vpaddq $TEMP0,$ACC4,$ACC3
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
vmovdqu -24+32*7-128($np),$TEMP0
|
|
vpaddq $TEMP1,$ACC5,$ACC4
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovdqu -24+32*8-128($np),$TEMP1
|
|
vpaddq $TEMP2,$ACC6,$ACC5
|
|
vpmuludq $Yi,$TEMP0,$TEMP0
|
|
vmovdqu -24+32*9-128($np),$TEMP2
|
|
mov $r3, $r0
|
|
vpaddq $TEMP0,$ACC7,$ACC6
|
|
vpmuludq $Yi,$TEMP1,$TEMP1
|
|
add (%rsp), $r0
|
|
vpaddq $TEMP1,$ACC8,$ACC7
|
|
vpmuludq $Yi,$TEMP2,$TEMP2
|
|
vmovq $r3, $TEMP1
|
|
vpaddq $TEMP2,$ACC9,$ACC8
|
|
|
|
dec $i
|
|
jnz .Loop_mul_1024
|
|
___
|
|
|
|
# (*) Original implementation was correcting ACC1-ACC3 for overflow
|
|
# after 7 loop runs, or after 28 iterations, or 56 additions.
|
|
# But as we underutilize resources, it's possible to correct in
|
|
# each iteration with marginal performance loss. But then, as
|
|
# we do it in each iteration, we can correct less digits, and
|
|
# avoid performance penalties completely. Also note that we
|
|
# correct only three digits out of four. This works because
|
|
# most significant digit is subjected to less additions.
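# Why delayed correction is safe at all: each 29x29-bit partial product is
# below 2^58, so a 64-bit lane has roughly 6 bits of headroom and can absorb
# on the order of 2^6 = 64 such products before it can overflow; 56 additions
# is comfortably within that bound.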
|
|
|
|
$TEMP0 = $ACC9;
|
|
$TEMP3 = $Bi;
|
|
$TEMP4 = $Yi;
|
|
$code.=<<___;
|
|
vpermq \$0, $AND_MASK, $AND_MASK
|
|
vpaddq (%rsp), $TEMP1, $ACC0
|
|
|
|
vpsrlq \$29, $ACC0, $TEMP1
|
|
vpand $AND_MASK, $ACC0, $ACC0
|
|
vpsrlq \$29, $ACC1, $TEMP2
|
|
vpand $AND_MASK, $ACC1, $ACC1
|
|
vpsrlq \$29, $ACC2, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC2, $ACC2
|
|
vpsrlq \$29, $ACC3, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC3, $ACC3
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
vpaddq $TEMP0, $ACC0, $ACC0
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC1, $ACC1
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC2, $ACC2
|
|
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
|
|
vpaddq $TEMP3, $ACC3, $ACC3
|
|
vpaddq $TEMP4, $ACC4, $ACC4
|
|
|
|
vpsrlq \$29, $ACC0, $TEMP1
|
|
vpand $AND_MASK, $ACC0, $ACC0
|
|
vpsrlq \$29, $ACC1, $TEMP2
|
|
vpand $AND_MASK, $ACC1, $ACC1
|
|
vpsrlq \$29, $ACC2, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC2, $ACC2
|
|
vpsrlq \$29, $ACC3, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC3, $ACC3
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC0, $ACC0
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC1, $ACC1
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC2, $ACC2
|
|
vpblendd \$3, $TEMP4, $ZERO, $TEMP4
|
|
vpaddq $TEMP3, $ACC3, $ACC3
|
|
vpaddq $TEMP4, $ACC4, $ACC4
|
|
|
|
vmovdqu $ACC0, 0-128($rp)
|
|
vmovdqu $ACC1, 32-128($rp)
|
|
vmovdqu $ACC2, 64-128($rp)
|
|
vmovdqu $ACC3, 96-128($rp)
|
|
___
|
|
|
|
$TEMP5=$ACC0;
|
|
$code.=<<___;
|
|
vpsrlq \$29, $ACC4, $TEMP1
|
|
vpand $AND_MASK, $ACC4, $ACC4
|
|
vpsrlq \$29, $ACC5, $TEMP2
|
|
vpand $AND_MASK, $ACC5, $ACC5
|
|
vpsrlq \$29, $ACC6, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC6, $ACC6
|
|
vpsrlq \$29, $ACC7, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC7, $ACC7
|
|
vpsrlq \$29, $ACC8, $TEMP5
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
vpand $AND_MASK, $ACC8, $ACC8
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP5, $TEMP5
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC4, $ACC4
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC5, $ACC5
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC6, $ACC6
|
|
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
|
|
vpaddq $TEMP3, $ACC7, $ACC7
|
|
vpaddq $TEMP4, $ACC8, $ACC8
|
|
|
|
vpsrlq \$29, $ACC4, $TEMP1
|
|
vpand $AND_MASK, $ACC4, $ACC4
|
|
vpsrlq \$29, $ACC5, $TEMP2
|
|
vpand $AND_MASK, $ACC5, $ACC5
|
|
vpsrlq \$29, $ACC6, $TEMP3
|
|
vpermq \$0x93, $TEMP1, $TEMP1
|
|
vpand $AND_MASK, $ACC6, $ACC6
|
|
vpsrlq \$29, $ACC7, $TEMP4
|
|
vpermq \$0x93, $TEMP2, $TEMP2
|
|
vpand $AND_MASK, $ACC7, $ACC7
|
|
vpsrlq \$29, $ACC8, $TEMP5
|
|
vpermq \$0x93, $TEMP3, $TEMP3
|
|
vpand $AND_MASK, $ACC8, $ACC8
|
|
vpermq \$0x93, $TEMP4, $TEMP4
|
|
|
|
vpblendd \$3, $ZERO, $TEMP1, $TEMP0
|
|
vpermq \$0x93, $TEMP5, $TEMP5
|
|
vpblendd \$3, $TEMP1, $TEMP2, $TEMP1
|
|
vpaddq $TEMP0, $ACC4, $ACC4
|
|
vpblendd \$3, $TEMP2, $TEMP3, $TEMP2
|
|
vpaddq $TEMP1, $ACC5, $ACC5
|
|
vpblendd \$3, $TEMP3, $TEMP4, $TEMP3
|
|
vpaddq $TEMP2, $ACC6, $ACC6
|
|
vpblendd \$3, $TEMP4, $TEMP5, $TEMP4
|
|
vpaddq $TEMP3, $ACC7, $ACC7
|
|
vpaddq $TEMP4, $ACC8, $ACC8
|
|
|
|
vmovdqu $ACC4, 128-128($rp)
|
|
vmovdqu $ACC5, 160-128($rp)
|
|
vmovdqu $ACC6, 192-128($rp)
|
|
vmovdqu $ACC7, 224-128($rp)
|
|
vmovdqu $ACC8, 256-128($rp)
|
|
vzeroupper
|
|
|
|
mov %rbp, %rax
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
movaps -0xd8(%rax),%xmm6
|
|
movaps -0xc8(%rax),%xmm7
|
|
movaps -0xb8(%rax),%xmm8
|
|
movaps -0xa8(%rax),%xmm9
|
|
movaps -0x98(%rax),%xmm10
|
|
movaps -0x88(%rax),%xmm11
|
|
movaps -0x78(%rax),%xmm12
|
|
movaps -0x68(%rax),%xmm13
|
|
movaps -0x58(%rax),%xmm14
|
|
movaps -0x48(%rax),%xmm15
|
|
___
|
|
$code.=<<___;
|
|
mov -48(%rax),%r15
|
|
mov -40(%rax),%r14
|
|
mov -32(%rax),%r13
|
|
mov -24(%rax),%r12
|
|
mov -16(%rax),%rbp
|
|
mov -8(%rax),%rbx
|
|
lea (%rax),%rsp # restore %rsp
|
|
.Lmul_1024_epilogue:
|
|
ret
|
|
.size rsaz_1024_mul_avx2,.-rsaz_1024_mul_avx2
|
|
___
|
|
}
|
|
{
|
|
my ($out,$inp) = $win64 ? ("%rcx","%rdx") : ("%rdi","%rsi");
|
|
my @T = map("%r$_",(8..11));
|
|
|
|
$code.=<<___;
|
|
.globl rsaz_1024_red2norm_avx2
|
|
.type rsaz_1024_red2norm_avx2,\@abi-omnipotent
|
|
.align 32
|
|
rsaz_1024_red2norm_avx2:
|
|
sub \$-128,$inp # size optimization
|
|
xor %rax,%rax
|
|
___
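# Conversion from the 29-bit redundant representation back to 64-bit words:
# for each output word, load every 29-bit digit that overlaps its bit range,
# shift each digit into position (the symbolic shift counts are reduced
# mod 64 by the post-processing loop at the bottom of this file), sum them,
# and carry the top digit's spill-over into the next word via %rax.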
|
|
|
|
for ($j=0,$i=0; $i<16; $i++) {
|
|
my $k=0;
|
|
while (29*$j<64*($i+1)) { # load data till boundary
|
|
$code.=" mov `8*$j-128`($inp), @T[0]\n";
|
|
$j++; $k++; push(@T,shift(@T));
|
|
}
|
|
$l=$k;
|
|
while ($k>1) { # shift loaded data but last value
|
|
$code.=" shl \$`29*($j-$k)`,@T[-$k]\n";
|
|
$k--;
|
|
}
|
|
$code.=<<___; # shift last value
|
|
mov @T[-1], @T[0]
|
|
shl \$`29*($j-1)`, @T[-1]
|
|
shr \$`-29*($j-1)`, @T[0]
|
|
___
|
|
while ($l) { # accumulate all values
|
|
$code.=" add @T[-$l], %rax\n";
|
|
$l--;
|
|
}
|
|
$code.=<<___;
|
|
adc \$0, @T[0] # consume eventual carry
|
|
mov %rax, 8*$i($out)
|
|
mov @T[0], %rax
|
|
___
|
|
push(@T,shift(@T));
|
|
}
|
|
$code.=<<___;
|
|
ret
|
|
.size rsaz_1024_red2norm_avx2,.-rsaz_1024_red2norm_avx2
|
|
|
|
.globl rsaz_1024_norm2red_avx2
|
|
.type rsaz_1024_norm2red_avx2,\@abi-omnipotent
|
|
.align 32
|
|
rsaz_1024_norm2red_avx2:
|
|
sub \$-128,$out # size optimization
|
|
mov ($inp),@T[0]
|
|
mov \$0x1fffffff,%eax
|
|
___
|
|
for ($j=0,$i=0; $i<16; $i++) {
|
|
$code.=" mov `8*($i+1)`($inp),@T[1]\n" if ($i<15);
|
|
$code.=" xor @T[1],@T[1]\n" if ($i==15);
|
|
my $k=1;
|
|
while (29*($j+1)<64*($i+1)) {
|
|
$code.=<<___;
|
|
mov @T[0],@T[-$k]
|
|
shr \$`29*$j`,@T[-$k]
|
|
and %rax,@T[-$k] # &0x1fffffff
|
|
mov @T[-$k],`8*$j-128`($out)
|
|
___
|
|
$j++; $k++;
|
|
}
|
|
$code.=<<___;
|
|
shrd \$`29*$j`,@T[1],@T[0]
|
|
and %rax,@T[0]
|
|
mov @T[0],`8*$j-128`($out)
|
|
___
|
|
$j++;
|
|
push(@T,shift(@T));
|
|
}
|
|
$code.=<<___;
|
|
mov @T[0],`8*$j-128`($out) # zero
|
|
mov @T[0],`8*($j+1)-128`($out)
|
|
mov @T[0],`8*($j+2)-128`($out)
|
|
mov @T[0],`8*($j+3)-128`($out)
|
|
ret
|
|
.size rsaz_1024_norm2red_avx2,.-rsaz_1024_norm2red_avx2
|
|
___
|
|
}
|
|
{
|
|
my ($out,$inp,$power) = $win64 ? ("%rcx","%rdx","%r8d") : ("%rdi","%rsi","%edx");
|
|
|
|
$code.=<<___;
|
|
.globl rsaz_1024_scatter5_avx2
|
|
.type rsaz_1024_scatter5_avx2,\@abi-omnipotent
|
|
.align 32
|
|
rsaz_1024_scatter5_avx2:
|
|
vzeroupper
|
|
vmovdqu .Lscatter_permd(%rip),%ymm5
|
|
shl \$4,$power
|
|
lea ($out,$power),$out
|
|
mov \$9,%eax
|
|
jmp .Loop_scatter_1024
|
|
|
|
.align 32
|
|
.Loop_scatter_1024:
|
|
vmovdqu ($inp),%ymm0
|
|
lea 32($inp),$inp
|
|
vpermd %ymm0,%ymm5,%ymm0
|
|
vmovdqu %xmm0,($out)
|
|
lea 16*32($out),$out
|
|
dec %eax
|
|
jnz .Loop_scatter_1024
|
|
|
|
vzeroupper
|
|
ret
|
|
.size rsaz_1024_scatter5_avx2,.-rsaz_1024_scatter5_avx2
|
|
|
|
.globl rsaz_1024_gather5_avx2
|
|
.type rsaz_1024_gather5_avx2,\@abi-omnipotent
|
|
.align 32
|
|
rsaz_1024_gather5_avx2:
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
lea -0x88(%rsp),%rax
|
|
vzeroupper
|
|
.LSEH_begin_rsaz_1024_gather5:
|
|
# I can't trust assembler to use specific encoding:-(
|
|
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax),%rsp
|
|
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6,-0x20(%rax)
|
|
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7,-0x10(%rax)
|
|
.byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8,0(%rax)
|
|
.byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9,0x10(%rax)
|
|
.byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10,0x20(%rax)
|
|
.byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11,0x30(%rax)
|
|
.byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12,0x40(%rax)
|
|
.byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13,0x50(%rax)
|
|
.byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14,0x60(%rax)
|
|
.byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15,0x70(%rax)
|
|
___
|
|
$code.=<<___;
|
|
lea .Lgather_table(%rip),%r11
|
|
mov $power,%eax
|
|
and \$3,$power
|
|
shr \$2,%eax # cache line number
|
|
shl \$4,$power # offset within cache line
|
|
|
|
vmovdqu -32(%r11),%ymm7 # .Lgather_permd
|
|
vpbroadcastb 8(%r11,%rax), %xmm8
|
|
vpbroadcastb 7(%r11,%rax), %xmm9
|
|
vpbroadcastb 6(%r11,%rax), %xmm10
|
|
vpbroadcastb 5(%r11,%rax), %xmm11
|
|
vpbroadcastb 4(%r11,%rax), %xmm12
|
|
vpbroadcastb 3(%r11,%rax), %xmm13
|
|
vpbroadcastb 2(%r11,%rax), %xmm14
|
|
vpbroadcastb 1(%r11,%rax), %xmm15
|
|
|
|
lea 64($inp,$power),$inp
|
|
mov \$64,%r11 # size optimization
|
|
mov \$9,%eax
|
|
jmp .Loop_gather_1024
|
|
|
|
.align 32
|
|
.Loop_gather_1024:
|
|
vpand -64($inp), %xmm8,%xmm0
|
|
vpand ($inp), %xmm9,%xmm1
|
|
vpand 64($inp), %xmm10,%xmm2
|
|
vpand ($inp,%r11,2), %xmm11,%xmm3
|
|
vpor %xmm0,%xmm1,%xmm1
|
|
vpand 64($inp,%r11,2), %xmm12,%xmm4
|
|
vpor %xmm2,%xmm3,%xmm3
|
|
vpand ($inp,%r11,4), %xmm13,%xmm5
|
|
vpor %xmm1,%xmm3,%xmm3
|
|
vpand 64($inp,%r11,4), %xmm14,%xmm6
|
|
vpor %xmm4,%xmm5,%xmm5
|
|
vpand -128($inp,%r11,8), %xmm15,%xmm2
|
|
lea ($inp,%r11,8),$inp
|
|
vpor %xmm3,%xmm5,%xmm5
|
|
vpor %xmm2,%xmm6,%xmm6
|
|
vpor %xmm5,%xmm6,%xmm6
|
|
vpermd %ymm6,%ymm7,%ymm6
|
|
vmovdqu %ymm6,($out)
|
|
lea 32($out),$out
|
|
dec %eax
|
|
jnz .Loop_gather_1024
|
|
|
|
vpxor %ymm0,%ymm0,%ymm0
|
|
vmovdqu %ymm0,($out)
|
|
vzeroupper
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
movaps (%rsp),%xmm6
|
|
movaps 0x10(%rsp),%xmm7
|
|
movaps 0x20(%rsp),%xmm8
|
|
movaps 0x30(%rsp),%xmm9
|
|
movaps 0x40(%rsp),%xmm10
|
|
movaps 0x50(%rsp),%xmm11
|
|
movaps 0x60(%rsp),%xmm12
|
|
movaps 0x70(%rsp),%xmm13
|
|
movaps 0x80(%rsp),%xmm14
|
|
movaps 0x90(%rsp),%xmm15
|
|
lea 0xa8(%rsp),%rsp
|
|
.LSEH_end_rsaz_1024_gather5:
|
|
___
|
|
$code.=<<___;
|
|
ret
|
|
.size rsaz_1024_gather5_avx2,.-rsaz_1024_gather5_avx2
|
|
___
|
|
}
|
|
|
|
$code.=<<___;
|
|
.extern OPENSSL_ia32cap_P
|
|
.globl rsaz_avx2_eligible
|
|
.type rsaz_avx2_eligible,\@abi-omnipotent
|
|
.align 32
|
|
rsaz_avx2_eligible:
|
|
mov OPENSSL_ia32cap_P+8(%rip),%eax
|
|
___
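# If both BMI2 and ADX are advertised (bits 8 and 19 of the OPENSSL_ia32cap_P
# word loaded above), report "not eligible" so the scalar AD*X code path is
# used instead -- see the Broadwell note in the header comments.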
|
|
$code.=<<___ if ($addx);
|
|
mov \$`1<<8|1<<19`,%ecx
|
|
mov \$0,%edx
|
|
and %eax,%ecx
|
|
cmp \$`1<<8|1<<19`,%ecx # check for BMI2+AD*X
|
|
cmove %edx,%eax
|
|
___
|
|
$code.=<<___;
|
|
and \$`1<<5`,%eax
|
|
shr \$5,%eax
|
|
ret
|
|
.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
|
|
|
|
.align 64
|
|
.Land_mask:
|
|
.quad 0x1fffffff,0x1fffffff,0x1fffffff,-1
|
|
.Lscatter_permd:
|
|
.long 0,2,4,6,7,7,7,7
|
|
.Lgather_permd:
|
|
.long 0,7,1,7,2,7,3,7
|
|
.Lgather_table:
|
|
.byte 0,0,0,0,0,0,0,0, 0xff,0,0,0,0,0,0,0
|
|
.align 64
|
|
___
|
|
|
|
if ($win64) {
|
|
$rec="%rcx";
|
|
$frame="%rdx";
|
|
$context="%r8";
|
|
$disp="%r9";
|
|
|
|
$code.=<<___
|
|
.extern __imp_RtlVirtualUnwind
|
|
.type rsaz_se_handler,\@abi-omnipotent
|
|
.align 16
|
|
rsaz_se_handler:
|
|
push %rsi
|
|
push %rdi
|
|
push %rbx
|
|
push %rbp
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
pushfq
|
|
sub \$64,%rsp
|
|
|
|
mov 120($context),%rax # pull context->Rax
|
|
mov 248($context),%rbx # pull context->Rip
|
|
|
|
mov 8($disp),%rsi # disp->ImageBase
|
|
mov 56($disp),%r11 # disp->HandlerData
|
|
|
|
mov 0(%r11),%r10d # HandlerData[0]
|
|
lea (%rsi,%r10),%r10 # prologue label
|
|
cmp %r10,%rbx # context->Rip<prologue label
|
|
jb .Lcommon_seh_tail
|
|
|
|
mov 152($context),%rax # pull context->Rsp
|
|
|
|
mov 4(%r11),%r10d # HandlerData[1]
|
|
lea (%rsi,%r10),%r10 # epilogue label
|
|
cmp %r10,%rbx # context->Rip>=epilogue label
|
|
jae .Lcommon_seh_tail
|
|
|
|
mov 160($context),%rax # pull context->Rbp
|
|
|
|
mov -48(%rax),%r15
|
|
mov -40(%rax),%r14
|
|
mov -32(%rax),%r13
|
|
mov -24(%rax),%r12
|
|
mov -16(%rax),%rbp
|
|
mov -8(%rax),%rbx
|
|
mov %r15,240($context)
|
|
mov %r14,232($context)
|
|
mov %r13,224($context)
|
|
mov %r12,216($context)
|
|
mov %rbp,160($context)
|
|
mov %rbx,144($context)
|
|
|
|
lea -0xd8(%rax),%rsi # %xmm save area
|
|
lea 512($context),%rdi # & context.Xmm6
|
|
mov \$20,%ecx # 10*sizeof(%xmm0)/sizeof(%rax)
|
|
.long 0xa548f3fc # cld; rep movsq
|
|
|
|
.Lcommon_seh_tail:
|
|
mov 8(%rax),%rdi
|
|
mov 16(%rax),%rsi
|
|
mov %rax,152($context) # restore context->Rsp
|
|
mov %rsi,168($context) # restore context->Rsi
|
|
mov %rdi,176($context) # restore context->Rdi
|
|
|
|
mov 40($disp),%rdi # disp->ContextRecord
|
|
mov $context,%rsi # context
|
|
mov \$154,%ecx # sizeof(CONTEXT)
|
|
.long 0xa548f3fc # cld; rep movsq
|
|
|
|
mov $disp,%rsi
|
|
xor %rcx,%rcx # arg1, UNW_FLAG_NHANDLER
|
|
mov 8(%rsi),%rdx # arg2, disp->ImageBase
|
|
mov 0(%rsi),%r8 # arg3, disp->ControlPc
|
|
mov 16(%rsi),%r9 # arg4, disp->FunctionEntry
|
|
mov 40(%rsi),%r10 # disp->ContextRecord
|
|
lea 56(%rsi),%r11 # &disp->HandlerData
|
|
lea 24(%rsi),%r12 # &disp->EstablisherFrame
|
|
mov %r10,32(%rsp) # arg5
|
|
mov %r11,40(%rsp) # arg6
|
|
mov %r12,48(%rsp) # arg7
|
|
mov %rcx,56(%rsp) # arg8, (NULL)
|
|
call *__imp_RtlVirtualUnwind(%rip)
|
|
|
|
mov \$1,%eax # ExceptionContinueSearch
|
|
add \$64,%rsp
|
|
popfq
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %rbp
|
|
pop %rbx
|
|
pop %rdi
|
|
pop %rsi
|
|
ret
|
|
.size rsaz_se_handler,.-rsaz_se_handler
|
|
|
|
.section .pdata
|
|
.align 4
|
|
.rva .LSEH_begin_rsaz_1024_sqr_avx2
|
|
.rva .LSEH_end_rsaz_1024_sqr_avx2
|
|
.rva .LSEH_info_rsaz_1024_sqr_avx2
|
|
|
|
.rva .LSEH_begin_rsaz_1024_mul_avx2
|
|
.rva .LSEH_end_rsaz_1024_mul_avx2
|
|
.rva .LSEH_info_rsaz_1024_mul_avx2
|
|
|
|
.rva .LSEH_begin_rsaz_1024_gather5
|
|
.rva .LSEH_end_rsaz_1024_gather5
|
|
.rva .LSEH_info_rsaz_1024_gather5
|
|
.section .xdata
|
|
.align 8
|
|
.LSEH_info_rsaz_1024_sqr_avx2:
|
|
.byte 9,0,0,0
|
|
.rva rsaz_se_handler
|
|
.rva .Lsqr_1024_body,.Lsqr_1024_epilogue
|
|
.LSEH_info_rsaz_1024_mul_avx2:
|
|
.byte 9,0,0,0
|
|
.rva rsaz_se_handler
|
|
.rva .Lmul_1024_body,.Lmul_1024_epilogue
|
|
.LSEH_info_rsaz_1024_gather5:
|
|
.byte 0x01,0x33,0x16,0x00
|
|
.byte 0x36,0xf8,0x09,0x00 #vmovaps 0x90(rsp),xmm15
|
|
.byte 0x31,0xe8,0x08,0x00 #vmovaps 0x80(rsp),xmm14
|
|
.byte 0x2c,0xd8,0x07,0x00 #vmovaps 0x70(rsp),xmm13
|
|
.byte 0x27,0xc8,0x06,0x00 #vmovaps 0x60(rsp),xmm12
|
|
.byte 0x22,0xb8,0x05,0x00 #vmovaps 0x50(rsp),xmm11
|
|
.byte 0x1d,0xa8,0x04,0x00 #vmovaps 0x40(rsp),xmm10
|
|
.byte 0x18,0x98,0x03,0x00 #vmovaps 0x30(rsp),xmm9
|
|
.byte 0x13,0x88,0x02,0x00 #vmovaps 0x20(rsp),xmm8
|
|
.byte 0x0e,0x78,0x01,0x00 #vmovaps 0x10(rsp),xmm7
|
|
.byte 0x09,0x68,0x00,0x00 #vmovaps 0x00(rsp),xmm6
|
|
.byte 0x04,0x01,0x15,0x00 #sub rsp,0xa8
|
|
___
|
|
}
|
|
|
|
foreach (split("\n",$code)) {
|
|
s/\`([^\`]*)\`/eval($1)/ge;
|
|
|
|
s/\b(sh[rl]d?\s+\$)(-?[0-9]+)/$1.$2%64/ge or
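	# the substitution above reduces symbolic shift counts, which may be
	# negative or >= 64 (e.g. the `-29*($j-1)` emitted in red2norm), into
	# the architectural 0..63 range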
|
|
|
|
s/\b(vmov[dq])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
|
|
s/\b(vmovdqu)\b(.+)%x%ymm([0-9]+)/$1$2%xmm$3/go or
|
|
s/\b(vpinsr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
|
|
s/\b(vpextr[qd])\b(.+)%ymm([0-9]+)/$1$2%xmm$3/go or
|
|
s/\b(vpbroadcast[qd]\s+)%ymm([0-9]+)/$1%xmm$2/go;
|
|
print $_,"\n";
|
|
}
|
|
|
|
}}} else {{{
|
|
print <<___; # assembler is too old
|
|
.text
|
|
|
|
.globl rsaz_avx2_eligible
|
|
.type rsaz_avx2_eligible,\@abi-omnipotent
|
|
rsaz_avx2_eligible:
|
|
xor %eax,%eax
|
|
ret
|
|
.size rsaz_avx2_eligible,.-rsaz_avx2_eligible
|
|
|
|
.globl rsaz_1024_sqr_avx2
|
|
.globl rsaz_1024_mul_avx2
|
|
.globl rsaz_1024_norm2red_avx2
|
|
.globl rsaz_1024_red2norm_avx2
|
|
.globl rsaz_1024_scatter5_avx2
|
|
.globl rsaz_1024_gather5_avx2
|
|
.type rsaz_1024_sqr_avx2,\@abi-omnipotent
|
|
rsaz_1024_sqr_avx2:
|
|
rsaz_1024_mul_avx2:
|
|
rsaz_1024_norm2red_avx2:
|
|
rsaz_1024_red2norm_avx2:
|
|
rsaz_1024_scatter5_avx2:
|
|
rsaz_1024_gather5_avx2:
|
|
.byte 0x0f,0x0b # ud2
|
|
ret
|
|
.size rsaz_1024_sqr_avx2,.-rsaz_1024_sqr_avx2
|
|
___
|
|
}}}
|
|
|
|
close STDOUT;