b9c26014de
Since we pre-generate our perlasm, having the output of these files be sensitive to the environment they run in is unhelpful. It would be bad to suddenly change what features we do or don't compile in whenever workstations' toolchains change. Enable all compiler-version-gated features as they should all be runtime-gated anyway. This should align with what upstream's files would have produced on modern toolchains. We should assume our assemblers can take whatever we'd like to throw at them. (If it turns out some can't, we'd rather find out and probably switch the problematic instructions to explicit byte sequences.) This actually results in a fairly significant change to the assembly we generate. I'm guessing upstream's buildsystem sets the CC environment variable, while ours doesn't and so the version checks were all coming out conservative. diffstat of generated files: linux-x86/crypto/sha/sha1-586.S | 1176 ++++++++++++ linux-x86/crypto/sha/sha256-586.S | 2248 ++++++++++++++++++++++++ linux-x86_64/crypto/bn/rsaz-avx2.S | 1644 +++++++++++++++++ linux-x86_64/crypto/bn/rsaz-x86_64.S | 638 ++++++ linux-x86_64/crypto/bn/x86_64-mont.S | 332 +++ linux-x86_64/crypto/bn/x86_64-mont5.S | 1130 ++++++++++++ linux-x86_64/crypto/modes/aesni-gcm-x86_64.S | 754 ++++++++ linux-x86_64/crypto/modes/ghash-x86_64.S | 475 +++++ linux-x86_64/crypto/sha/sha1-x86_64.S | 1121 ++++++++++++ linux-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++++++ linux-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++++++++++ mac-x86/crypto/sha/sha1-586.S | 1174 ++++++++++++ mac-x86/crypto/sha/sha256-586.S | 2248 ++++++++++++++++++++++++ mac-x86_64/crypto/bn/rsaz-avx2.S | 1637 +++++++++++++++++ mac-x86_64/crypto/bn/rsaz-x86_64.S | 638 ++++++ mac-x86_64/crypto/bn/x86_64-mont.S | 331 +++ mac-x86_64/crypto/bn/x86_64-mont5.S | 1130 ++++++++++++ mac-x86_64/crypto/modes/aesni-gcm-x86_64.S | 750 ++++++++ mac-x86_64/crypto/modes/ghash-x86_64.S | 475 +++++ mac-x86_64/crypto/sha/sha1-x86_64.S | 1121 
++++++++++++ mac-x86_64/crypto/sha/sha256-x86_64.S | 1062 +++++++++++ mac-x86_64/crypto/sha/sha512-x86_64.S | 2241 ++++++++++++++++++++++++ win-x86/crypto/sha/sha1-586.asm | 1173 ++++++++++++ win-x86/crypto/sha/sha256-586.asm | 2248 ++++++++++++++++++++++++ win-x86_64/crypto/bn/rsaz-avx2.asm | 1858 +++++++++++++++++++- win-x86_64/crypto/bn/rsaz-x86_64.asm | 638 ++++++ win-x86_64/crypto/bn/x86_64-mont.asm | 352 +++ win-x86_64/crypto/bn/x86_64-mont5.asm | 1184 ++++++++++++ win-x86_64/crypto/modes/aesni-gcm-x86_64.asm | 933 ++++++++++ win-x86_64/crypto/modes/ghash-x86_64.asm | 515 +++++ win-x86_64/crypto/sha/sha1-x86_64.asm | 1152 ++++++++++++ win-x86_64/crypto/sha/sha256-x86_64.asm | 1088 +++++++++++ win-x86_64/crypto/sha/sha512-x86_64.asm | 2499 ++++++ SHA* gets faster. RSA and AES-GCM seem to be more of a wash and even slower sometimes! This is a little concerning. Though when I repeated the latter two, it's definitely noisy (RSA in particular), so we may wish to repeat in a more controlled environment. We could also flip some of these toggles to something other than the highest setting if it seems some of the variants aren't desirable. We just shouldn't have them enabled or disabled on accident. This aligns us closer to upstream though. 
$ /tmp/bssl.old speed SHA- Did 5028000 SHA-1 (16 bytes) operations in 1000048us (5027758.7 ops/sec): 80.4 MB/s Did 1708000 SHA-1 (256 bytes) operations in 1000257us (1707561.2 ops/sec): 437.1 MB/s Did 73000 SHA-1 (8192 bytes) operations in 1008406us (72391.5 ops/sec): 593.0 MB/s Did 3041000 SHA-256 (16 bytes) operations in 1000311us (3040054.5 ops/sec): 48.6 MB/s Did 779000 SHA-256 (256 bytes) operations in 1000820us (778361.7 ops/sec): 199.3 MB/s Did 26000 SHA-256 (8192 bytes) operations in 1009875us (25745.8 ops/sec): 210.9 MB/s Did 1837000 SHA-512 (16 bytes) operations in 1000251us (1836539.0 ops/sec): 29.4 MB/s Did 803000 SHA-512 (256 bytes) operations in 1000969us (802222.6 ops/sec): 205.4 MB/s Did 41000 SHA-512 (8192 bytes) operations in 1016768us (40323.8 ops/sec): 330.3 MB/s $ /tmp/bssl.new speed SHA- Did 5354000 SHA-1 (16 bytes) operations in 1000104us (5353443.2 ops/sec): 85.7 MB/s Did 1779000 SHA-1 (256 bytes) operations in 1000121us (1778784.8 ops/sec): 455.4 MB/s Did 87000 SHA-1 (8192 bytes) operations in 1012641us (85914.0 ops/sec): 703.8 MB/s Did 3517000 SHA-256 (16 bytes) operations in 1000114us (3516599.1 ops/sec): 56.3 MB/s Did 935000 SHA-256 (256 bytes) operations in 1000096us (934910.2 ops/sec): 239.3 MB/s Did 38000 SHA-256 (8192 bytes) operations in 1004476us (37830.7 ops/sec): 309.9 MB/s Did 2930000 SHA-512 (16 bytes) operations in 1000259us (2929241.3 ops/sec): 46.9 MB/s Did 1008000 SHA-512 (256 bytes) operations in 1000509us (1007487.2 ops/sec): 257.9 MB/s Did 45000 SHA-512 (8192 bytes) operations in 1000593us (44973.3 ops/sec): 368.4 MB/s $ /tmp/bssl.old speed RSA Did 820 RSA 2048 signing operations in 1017008us (806.3 ops/sec) Did 27000 RSA 2048 verify operations in 1015400us (26590.5 ops/sec) Did 1292 RSA 2048 (3 prime, e=3) signing operations in 1008185us (1281.5 ops/sec) Did 65000 RSA 2048 (3 prime, e=3) verify operations in 1011388us (64268.1 ops/sec) Did 120 RSA 4096 signing operations in 1061027us (113.1 ops/sec) Did 8208 RSA 4096 
verify operations in 1002717us (8185.8 ops/sec) $ /tmp/bssl.new speed RSA Did 760 RSA 2048 signing operations in 1003351us (757.5 ops/sec) Did 25900 RSA 2048 verify operations in 1028931us (25171.8 ops/sec) Did 1320 RSA 2048 (3 prime, e=3) signing operations in 1040806us (1268.2 ops/sec) Did 63000 RSA 2048 (3 prime, e=3) verify operations in 1016042us (62005.3 ops/sec) Did 104 RSA 4096 signing operations in 1008718us (103.1 ops/sec) Did 6875 RSA 4096 verify operations in 1093441us (6287.5 ops/sec) $ /tmp/bssl.old speed GCM Did 5316000 AES-128-GCM (16 bytes) seal operations in 1000082us (5315564.1 ops/sec): 85.0 MB/s Did 712000 AES-128-GCM (1350 bytes) seal operations in 1000252us (711820.6 ops/sec): 961.0 MB/s Did 149000 AES-128-GCM (8192 bytes) seal operations in 1003182us (148527.4 ops/sec): 1216.7 MB/s Did 5919750 AES-256-GCM (16 bytes) seal operations in 1000016us (5919655.3 ops/sec): 94.7 MB/s Did 800000 AES-256-GCM (1350 bytes) seal operations in 1000951us (799239.9 ops/sec): 1079.0 MB/s Did 152000 AES-256-GCM (8192 bytes) seal operations in 1000765us (151883.8 ops/sec): 1244.2 MB/s $ /tmp/bssl.new speed GCM Did 5315000 AES-128-GCM (16 bytes) seal operations in 1000125us (5314335.7 ops/sec): 85.0 MB/s Did 755000 AES-128-GCM (1350 bytes) seal operations in 1000878us (754337.7 ops/sec): 1018.4 MB/s Did 151000 AES-128-GCM (8192 bytes) seal operations in 1005655us (150150.9 ops/sec): 1230.0 MB/s Did 5913500 AES-256-GCM (16 bytes) seal operations in 1000041us (5913257.6 ops/sec): 94.6 MB/s Did 782000 AES-256-GCM (1350 bytes) seal operations in 1001484us (780841.2 ops/sec): 1054.1 MB/s Did 121000 AES-256-GCM (8192 bytes) seal operations in 1006389us (120231.8 ops/sec): 984.9 MB/s Change-Id: I0efb32f896c597abc7d7e55c31d038528a5c72a1 Reviewed-on: https://boringssl-review.googlesource.com/6260 Reviewed-by: Adam Langley <alangley@gmail.com>
1392 lines
28 KiB
Perl
1392 lines
28 KiB
Perl
#!/usr/bin/env perl
|
||
|
||
# ====================================================================
|
||
# Written by Andy Polyakov <appro@openssl.org> for the OpenSSL
|
||
# project. The module is, however, dual licensed under OpenSSL and
|
||
# CRYPTOGAMS licenses depending on where you obtain it. For further
|
||
# details see http://www.openssl.org/~appro/cryptogams/.
|
||
# ====================================================================
|
||
|
||
# October 2005.
|
||
#
|
||
# Montgomery multiplication routine for x86_64. While it gives modest
|
||
# 9% improvement of rsa4096 sign on Opteron, rsa512 sign runs more
|
||
# than twice, >2x, as fast. Most common rsa1024 sign is improved by
|
||
# respectable 50%. It remains to be seen if loop unrolling and
|
||
# dedicated squaring routine can provide further improvement...
|
||
|
||
# July 2011.
|
||
#
|
||
# Add dedicated squaring procedure. Performance improvement varies
|
||
# from platform to platform, but on average it's ~5%/15%/25%/33%
|
||
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
|
||
|
||
# August 2011.
|
||
#
|
||
# Unroll and modulo-schedule inner loops in such manner that they
|
||
# are "fallen through" for input lengths of 8, which is critical for
|
||
# 1024-bit RSA *sign*. Average performance improvement in comparison
|
||
# to *initial* version of this module from 2005 is ~0%/30%/40%/45%
|
||
# for 512-/1024-/2048-/4096-bit RSA *sign* benchmarks respectively.
|
||
|
||
# June 2013.
|
||
#
|
||
# Optimize reduction in squaring procedure and improve 1024+-bit RSA
|
||
# sign performance by 10-16% on Intel Sandy Bridge and later
|
||
# (virtually same on non-Intel processors).
|
||
|
||
# August 2013.
|
||
#
|
||
# Add MULX/ADOX/ADCX code path.
|
||
|
||
# Command line: [flavour] output.  When the first argument contains a dot it
# is taken to be the output file name and no flavour was supplied.
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }

# Win64 output is selected by an nasm/masm/mingw64 flavour or a .asm file name.
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);

# Look for the translator next to this script, then in ../../perlasm/.
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
die "can't locate x86_64-xlate.pl";

# Everything printed to STDOUT from here on is piped through the translator.
# The translator path is quoted so that directories containing spaces work,
# and a failure to start the pipe is fatal instead of being silently ignored.
open OUT,"| \"$^X\" \"$xlate\" $flavour $output"
    or die "can't call $xlate: $!";
*STDOUT=*OUT;

# In upstream, this is controlled by shelling out to the compiler to check
# versions, but BoringSSL is intended to be used with pre-generated perlasm
# output, so this isn't useful anyway.
$addx = 1;
|
||
|
||
# Register allocation for
#   int bn_mul_mont(BN_ULONG *rp, const BN_ULONG *ap, const BN_ULONG *bp,
#                   const BN_ULONG *np, const BN_ULONG *n0, int num);
# The six arguments are bound, in order, to %rdi, %rsi, %rdx, %rcx, %r8, %r9.
($rp,$ap,$bp,$np,$n0,$num)=("%rdi","%rsi","%rdx","%rcx","%r8","%r9");

# Working registers for the multiplication loops.
($lo0,$hi0,$hi1)=("%r10","%r11","%r13");	# partial-product limbs
($i,$j)=("%r14","%r15");			# outer / inner loop counters
($m0,$m1)=("%rbx","%rbp");			# m0=bp[i], m1=tp[0]*n0
|
||
|
||
# bn_mul_mont entry point.  Dispatch on num (%r9d): when num is not a
# multiple of 4, or is below 8, the generic one-limb-at-a-time loop at
# .Lmul_enter is used.  Otherwise a call with ap != bp goes to .Lmul4x_enter,
# a call with ap == bp and num divisible by 8 goes to .Lsqr8x_enter, and any
# remaining case goes to .Lmul4x_enter.
$code=<<___;
|
||
.text
|
||
|
||
.extern OPENSSL_ia32cap_P
|
||
|
||
.globl bn_mul_mont
|
||
.type bn_mul_mont,\@function,6
|
||
.align 16
|
||
bn_mul_mont:
|
||
test \$3,${num}d
|
||
jnz .Lmul_enter
|
||
cmp \$8,${num}d
|
||
jb .Lmul_enter
|
||
___
|
||
# Preload the dword at OPENSSL_ia32cap_P+8 into %r11d; the code at
# .Lmul4x_enter tests it against 0x80100 to pick the MULX code path.
$code.=<<___ if ($addx);
|
||
mov OPENSSL_ia32cap_P+8(%rip),%r11d
|
||
___
|
||
# Rest of the dispatch, then the generic path: allocate tp[num+2] on the
# stack, stash the caller's %rsp in tp[num+1], and move bp out of %rdx.
$code.=<<___;
|
||
cmp $ap,$bp
|
||
jne .Lmul4x_enter
|
||
test \$7,${num}d
|
||
jz .Lsqr8x_enter
|
||
jmp .Lmul4x_enter
|
||
|
||
.align 16
|
||
.Lmul_enter:
|
||
push %rbx
|
||
push %rbp
|
||
push %r12
|
||
push %r13
|
||
push %r14
|
||
push %r15
|
||
|
||
mov ${num}d,${num}d
|
||
lea 2($num),%r10
|
||
mov %rsp,%r11
|
||
neg %r10
|
||
lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+2))
|
||
and \$-1024,%rsp # minimize TLB usage
|
||
|
||
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
|
||
.Lmul_body:
|
||
mov $bp,%r12 # reassign $bp
|
||
___
|
||
# From here on $bp names %r12, where the emitted code has just copied the
# b pointer; the following code reads mulq results out of %rdx.
$bp="%r12";
|
||
# First pass (.L1st, i=0) fills tp; .Louter/.Linner accumulate the remaining
# bp[i]; .Lsub computes rp=tp-np; .Lcopy selects between tp and rp with the
# mask left in %rax, wipes tp, then the epilogue restores registers and
# returns 1.
$code.=<<___;
|
||
mov ($n0),$n0 # pull n0[0] value
|
||
mov ($bp),$m0 # m0=bp[0]
|
||
mov ($ap),%rax
|
||
|
||
xor $i,$i # i=0
|
||
xor $j,$j # j=0
|
||
|
||
mov $n0,$m1
|
||
mulq $m0 # ap[0]*bp[0]
|
||
mov %rax,$lo0
|
||
mov ($np),%rax
|
||
|
||
imulq $lo0,$m1 # "tp[0]"*n0
|
||
mov %rdx,$hi0
|
||
|
||
mulq $m1 # np[0]*m1
|
||
add %rax,$lo0 # discarded
|
||
mov 8($ap),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$hi1
|
||
|
||
lea 1($j),$j # j++
|
||
jmp .L1st_enter
|
||
|
||
.align 16
|
||
.L1st:
|
||
add %rax,$hi1
|
||
mov ($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
|
||
mov $lo0,$hi0
|
||
adc \$0,%rdx
|
||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$hi1
|
||
|
||
.L1st_enter:
|
||
mulq $m0 # ap[j]*bp[0]
|
||
add %rax,$hi0
|
||
mov ($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
lea 1($j),$j # j++
|
||
mov %rdx,$lo0
|
||
|
||
mulq $m1 # np[j]*m1
|
||
cmp $num,$j
|
||
jne .L1st
|
||
|
||
add %rax,$hi1
|
||
mov ($ap),%rax # ap[0]
|
||
adc \$0,%rdx
|
||
add $hi0,$hi1 # np[j]*m1+ap[j]*bp[0]
|
||
adc \$0,%rdx
|
||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$hi1
|
||
mov $lo0,$hi0
|
||
|
||
xor %rdx,%rdx
|
||
add $hi0,$hi1
|
||
adc \$0,%rdx
|
||
mov $hi1,-8(%rsp,$num,8)
|
||
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
|
||
|
||
lea 1($i),$i # i++
|
||
jmp .Louter
|
||
.align 16
|
||
.Louter:
|
||
mov ($bp,$i,8),$m0 # m0=bp[i]
|
||
xor $j,$j # j=0
|
||
mov $n0,$m1
|
||
mov (%rsp),$lo0
|
||
mulq $m0 # ap[0]*bp[i]
|
||
add %rax,$lo0 # ap[0]*bp[i]+tp[0]
|
||
mov ($np),%rax
|
||
adc \$0,%rdx
|
||
|
||
imulq $lo0,$m1 # tp[0]*n0
|
||
mov %rdx,$hi0
|
||
|
||
mulq $m1 # np[0]*m1
|
||
add %rax,$lo0 # discarded
|
||
mov 8($ap),%rax
|
||
adc \$0,%rdx
|
||
mov 8(%rsp),$lo0 # tp[1]
|
||
mov %rdx,$hi1
|
||
|
||
lea 1($j),$j # j++
|
||
jmp .Linner_enter
|
||
|
||
.align 16
|
||
.Linner:
|
||
add %rax,$hi1
|
||
mov ($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||
mov (%rsp,$j,8),$lo0
|
||
adc \$0,%rdx
|
||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$hi1
|
||
|
||
.Linner_enter:
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$hi0
|
||
mov ($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $hi0,$lo0 # ap[j]*bp[i]+tp[j]
|
||
mov %rdx,$hi0
|
||
adc \$0,$hi0
|
||
lea 1($j),$j # j++
|
||
|
||
mulq $m1 # np[j]*m1
|
||
cmp $num,$j
|
||
jne .Linner
|
||
|
||
add %rax,$hi1
|
||
mov ($ap),%rax # ap[0]
|
||
adc \$0,%rdx
|
||
add $lo0,$hi1 # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||
mov (%rsp,$j,8),$lo0
|
||
adc \$0,%rdx
|
||
mov $hi1,-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$hi1
|
||
|
||
xor %rdx,%rdx
|
||
add $hi0,$hi1
|
||
adc \$0,%rdx
|
||
add $lo0,$hi1 # pull upmost overflow bit
|
||
adc \$0,%rdx
|
||
mov $hi1,-8(%rsp,$num,8)
|
||
mov %rdx,(%rsp,$num,8) # store upmost overflow bit
|
||
|
||
lea 1($i),$i # i++
|
||
cmp $num,$i
|
||
jb .Louter
|
||
|
||
xor $i,$i # i=0 and clear CF!
|
||
mov (%rsp),%rax # tp[0]
|
||
lea (%rsp),$ap # borrow ap for tp
|
||
mov $num,$j # j=num
|
||
jmp .Lsub
|
||
.align 16
|
||
.Lsub: sbb ($np,$i,8),%rax
|
||
mov %rax,($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
mov 8($ap,$i,8),%rax # tp[i+1]
|
||
lea 1($i),$i # i++
|
||
dec $j # doesn't affect CF!
|
||
jnz .Lsub
|
||
|
||
sbb \$0,%rax # handle upmost overflow bit
|
||
xor $i,$i
|
||
mov $num,$j # j=num
|
||
.align 16
|
||
.Lcopy: # copy or in-place refresh
|
||
mov (%rsp,$i,8),$ap
|
||
mov ($rp,$i,8),$np
|
||
xor $np,$ap # conditional select:
|
||
and %rax,$ap # ((ap ^ np) & %rax) ^ np
|
||
xor $np,$ap # ap = borrow?tp:rp
|
||
mov $i,(%rsp,$i,8) # zap temporary vector
|
||
mov $ap,($rp,$i,8) # rp[i]=tp[i]
|
||
lea 1($i),$i
|
||
sub \$1,$j
|
||
jnz .Lcopy
|
||
|
||
mov 8(%rsp,$num,8),%rsi # restore %rsp
|
||
mov \$1,%rax
|
||
mov (%rsi),%r15
|
||
mov 8(%rsi),%r14
|
||
mov 16(%rsi),%r13
|
||
mov 24(%rsi),%r12
|
||
mov 32(%rsi),%rbp
|
||
mov 40(%rsi),%rbx
|
||
lea 48(%rsi),%rsp
|
||
.Lmul_epilogue:
|
||
ret
|
||
.size bn_mul_mont,.-bn_mul_mont
|
||
___
|
||
{{{
|
||
# bn_mul4x_mont: variant of the multiplication with the inner loops unrolled
# four limbs per pass; reached through .Lmul4x_enter from bn_mul_mont.
# @A holds the running ap[j]*bp[i] limbs and @N the np[j]*m1 limbs; the two
# elements of each pair alternate roles from one step to the next.
# @N reuses %rdi, which is why the emitted code spills $rp to tp[num+2].
# NOTE(review): the index registers advance with "lea 4(...)", i.e. by four
# limbs per pass; the in-line remarks "j++", "j+=2" and "i++" on those lines
# are shorthand, and the remark on "dec $j" in .Lsub4x contains a typo
# ("doesnn't").  They are left untouched here because they are part of the
# emitted text.
my @A=("%r10","%r11");
|
||
my @N=("%r13","%rdi");
|
||
$code.=<<___;
|
||
.type bn_mul4x_mont,\@function,6
|
||
.align 16
|
||
bn_mul4x_mont:
|
||
.Lmul4x_enter:
|
||
___
|
||
# %r11d is expected to hold the capability word loaded by bn_mul_mont; when
# both bits of mask 0x80100 are set, branch to .Lmulx4x_enter (presumably the
# BMI2 and ADX feature bits -- confirm against the OPENSSL_ia32cap_P layout).
$code.=<<___ if ($addx);
|
||
and \$0x80100,%r11d
|
||
cmp \$0x80100,%r11d
|
||
je .Lmulx4x_enter
|
||
___
|
||
# Prologue: save callee-saved registers, allocate tp[num+4] on the stack,
# stash the caller's %rsp in tp[num+1] and $rp in tp[num+2].
$code.=<<___;
|
||
push %rbx
|
||
push %rbp
|
||
push %r12
|
||
push %r13
|
||
push %r14
|
||
push %r15
|
||
|
||
mov ${num}d,${num}d
|
||
lea 4($num),%r10
|
||
mov %rsp,%r11
|
||
neg %r10
|
||
lea (%rsp,%r10,8),%rsp # tp=alloca(8*(num+4))
|
||
and \$-1024,%rsp # minimize TLB usage
|
||
|
||
mov %r11,8(%rsp,$num,8) # tp[num+1]=%rsp
|
||
.Lmul4x_body:
|
||
mov $rp,16(%rsp,$num,8) # tp[num+2]=$rp
|
||
mov %rdx,%r12 # reassign $bp
|
||
___
|
||
# From here on $bp names %r12, where the emitted code has just copied %rdx.
$bp="%r12";
|
||
# First pass (.L1st4x, i=0) fills tp; .Louter4x/.Linner4x accumulate the
# remaining bp[i] on top of tp.
$code.=<<___;
|
||
mov ($n0),$n0 # pull n0[0] value
|
||
mov ($bp),$m0 # m0=bp[0]
|
||
mov ($ap),%rax
|
||
|
||
xor $i,$i # i=0
|
||
xor $j,$j # j=0
|
||
|
||
mov $n0,$m1
|
||
mulq $m0 # ap[0]*bp[0]
|
||
mov %rax,$A[0]
|
||
mov ($np),%rax
|
||
|
||
imulq $A[0],$m1 # "tp[0]"*n0
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[0]*m1
|
||
add %rax,$A[0] # discarded
|
||
mov 8($ap),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0
|
||
add %rax,$A[1]
|
||
mov 8($np),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1
|
||
add %rax,$N[1]
|
||
mov 16($ap),%rax
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1]
|
||
lea 4($j),$j # j++
|
||
adc \$0,%rdx
|
||
mov $N[1],(%rsp)
|
||
mov %rdx,$N[0]
|
||
jmp .L1st4x
|
||
.align 16
|
||
.L1st4x:
|
||
mulq $m0 # ap[j]*bp[0]
|
||
add %rax,$A[0]
|
||
mov -16($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[0]
|
||
mov -8($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
|
||
adc \$0,%rdx
|
||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0 # ap[j]*bp[0]
|
||
add %rax,$A[1]
|
||
mov -8($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[1]
|
||
mov ($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
|
||
adc \$0,%rdx
|
||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[0]
|
||
|
||
mulq $m0 # ap[j]*bp[0]
|
||
add %rax,$A[0]
|
||
mov ($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[0]
|
||
mov 8($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
|
||
adc \$0,%rdx
|
||
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0 # ap[j]*bp[0]
|
||
add %rax,$A[1]
|
||
mov 8($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
lea 4($j),$j # j++
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[1]
|
||
mov -16($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
|
||
adc \$0,%rdx
|
||
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[0]
|
||
cmp $num,$j
|
||
jb .L1st4x
|
||
|
||
mulq $m0 # ap[j]*bp[0]
|
||
add %rax,$A[0]
|
||
mov -16($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[0]
|
||
mov -8($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[0],$N[0] # np[j]*m1+ap[j]*bp[0]
|
||
adc \$0,%rdx
|
||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0 # ap[j]*bp[0]
|
||
add %rax,$A[1]
|
||
mov -8($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[1]
|
||
mov ($ap),%rax # ap[0]
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[0]
|
||
adc \$0,%rdx
|
||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[0]
|
||
|
||
xor $N[1],$N[1]
|
||
add $A[0],$N[0]
|
||
adc \$0,$N[1]
|
||
mov $N[0],-8(%rsp,$j,8)
|
||
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
|
||
|
||
lea 1($i),$i # i++
|
||
.align 4
|
||
.Louter4x:
|
||
mov ($bp,$i,8),$m0 # m0=bp[i]
|
||
xor $j,$j # j=0
|
||
mov (%rsp),$A[0]
|
||
mov $n0,$m1
|
||
mulq $m0 # ap[0]*bp[i]
|
||
add %rax,$A[0] # ap[0]*bp[i]+tp[0]
|
||
mov ($np),%rax
|
||
adc \$0,%rdx
|
||
|
||
imulq $A[0],$m1 # tp[0]*n0
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[0]*m1
|
||
add %rax,$A[0] # "$N[0]", discarded
|
||
mov 8($ap),%rax
|
||
adc \$0,%rdx
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$A[1]
|
||
mov 8($np),%rax
|
||
adc \$0,%rdx
|
||
add 8(%rsp),$A[1] # +tp[1]
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[1]
|
||
mov 16($ap),%rax
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1] # np[j]*m1+ap[j]*bp[i]+tp[j]
|
||
lea 4($j),$j # j+=2
|
||
adc \$0,%rdx
|
||
mov $N[1],(%rsp) # tp[j-1]
|
||
mov %rdx,$N[0]
|
||
jmp .Linner4x
|
||
.align 16
|
||
.Linner4x:
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$A[0]
|
||
mov -16($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[0]
|
||
mov -8($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[0],$N[0]
|
||
adc \$0,%rdx
|
||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$A[1]
|
||
mov -8($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add -8(%rsp,$j,8),$A[1]
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[1]
|
||
mov ($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1]
|
||
adc \$0,%rdx
|
||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[0]
|
||
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$A[0]
|
||
mov ($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add (%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[0]
|
||
mov 8($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[0],$N[0]
|
||
adc \$0,%rdx
|
||
mov $N[0],-8(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$A[1]
|
||
mov 8($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add 8(%rsp,$j,8),$A[1]
|
||
adc \$0,%rdx
|
||
lea 4($j),$j # j++
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[1]
|
||
mov -16($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1]
|
||
adc \$0,%rdx
|
||
mov $N[1],-32(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[0]
|
||
cmp $num,$j
|
||
jb .Linner4x
|
||
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$A[0]
|
||
mov -16($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add -16(%rsp,$j,8),$A[0] # ap[j]*bp[i]+tp[j]
|
||
adc \$0,%rdx
|
||
mov %rdx,$A[1]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[0]
|
||
mov -8($ap,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add $A[0],$N[0]
|
||
adc \$0,%rdx
|
||
mov $N[0],-24(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[1]
|
||
|
||
mulq $m0 # ap[j]*bp[i]
|
||
add %rax,$A[1]
|
||
mov -8($np,$j,8),%rax
|
||
adc \$0,%rdx
|
||
add -8(%rsp,$j,8),$A[1]
|
||
adc \$0,%rdx
|
||
lea 1($i),$i # i++
|
||
mov %rdx,$A[0]
|
||
|
||
mulq $m1 # np[j]*m1
|
||
add %rax,$N[1]
|
||
mov ($ap),%rax # ap[0]
|
||
adc \$0,%rdx
|
||
add $A[1],$N[1]
|
||
adc \$0,%rdx
|
||
mov $N[1],-16(%rsp,$j,8) # tp[j-1]
|
||
mov %rdx,$N[0]
|
||
|
||
xor $N[1],$N[1]
|
||
add $A[0],$N[0]
|
||
adc \$0,$N[1]
|
||
add (%rsp,$num,8),$N[0] # pull upmost overflow bit
|
||
adc \$0,$N[1]
|
||
mov $N[0],-8(%rsp,$j,8)
|
||
mov $N[1],(%rsp,$j,8) # store upmost overflow bit
|
||
|
||
cmp $num,$i
|
||
jb .Louter4x
|
||
___
|
||
# Final reduction: .Lsub4x computes rp = tp - np four limbs per pass using
# the four registers in @ri; .Lcopy4x then does a masked select between tp
# and rp (mask broadcast into %xmm0) and wipes tp as it goes.  $num is
# divided by 4 for the loop counts and restored by the final shl.
{
|
||
my @ri=("%rax","%rdx",$m0,$m1);
|
||
$code.=<<___;
|
||
mov 16(%rsp,$num,8),$rp # restore $rp
|
||
mov 0(%rsp),@ri[0] # tp[0]
|
||
mov 8(%rsp),@ri[1] # tp[1]
|
||
shr \$2,$num # num/=4
|
||
lea (%rsp),$ap # borrow ap for tp
|
||
xor $i,$i # i=0 and clear CF!
|
||
|
||
sub 0($np),@ri[0]
|
||
mov 16($ap),@ri[2] # tp[2]
|
||
mov 24($ap),@ri[3] # tp[3]
|
||
sbb 8($np),@ri[1]
|
||
lea -1($num),$j # j=num/4-1
|
||
jmp .Lsub4x
|
||
.align 16
|
||
.Lsub4x:
|
||
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
sbb 16($np,$i,8),@ri[2]
|
||
mov 32($ap,$i,8),@ri[0] # tp[i+1]
|
||
mov 40($ap,$i,8),@ri[1]
|
||
sbb 24($np,$i,8),@ri[3]
|
||
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
sbb 32($np,$i,8),@ri[0]
|
||
mov 48($ap,$i,8),@ri[2]
|
||
mov 56($ap,$i,8),@ri[3]
|
||
sbb 40($np,$i,8),@ri[1]
|
||
lea 4($i),$i # i++
|
||
dec $j # doesnn't affect CF!
|
||
jnz .Lsub4x
|
||
|
||
mov @ri[0],0($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
mov 32($ap,$i,8),@ri[0] # load overflow bit
|
||
sbb 16($np,$i,8),@ri[2]
|
||
mov @ri[1],8($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
sbb 24($np,$i,8),@ri[3]
|
||
mov @ri[2],16($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
|
||
sbb \$0,@ri[0] # handle upmost overflow bit
|
||
mov @ri[0],%xmm0
|
||
punpcklqdq %xmm0,%xmm0 # extend mask to 128 bits
|
||
mov @ri[3],24($rp,$i,8) # rp[i]=tp[i]-np[i]
|
||
xor $i,$i # i=0
|
||
|
||
mov $num,$j
|
||
pxor %xmm5,%xmm5
|
||
jmp .Lcopy4x
|
||
.align 16
|
||
.Lcopy4x: # copy or in-place refresh
|
||
movdqu (%rsp,$i),%xmm2
|
||
movdqu 16(%rsp,$i),%xmm4
|
||
movdqu ($rp,$i),%xmm1
|
||
movdqu 16($rp,$i),%xmm3
|
||
pxor %xmm1,%xmm2 # conditional select
|
||
pxor %xmm3,%xmm4
|
||
pand %xmm0,%xmm2
|
||
pand %xmm0,%xmm4
|
||
pxor %xmm1,%xmm2
|
||
pxor %xmm3,%xmm4
|
||
movdqu %xmm2,($rp,$i)
|
||
movdqu %xmm4,16($rp,$i)
|
||
movdqa %xmm5,(%rsp,$i) # zap temporary vectors
|
||
movdqa %xmm5,16(%rsp,$i)
|
||
|
||
lea 32($i),$i
|
||
dec $j
|
||
jnz .Lcopy4x
|
||
|
||
shl \$2,$num
|
||
___
|
||
}
|
||
# Epilogue: reload the caller's %rsp saved at tp[num+1], restore the
# callee-saved registers from it, and return 1.
$code.=<<___;
|
||
mov 8(%rsp,$num,8),%rsi # restore %rsp
|
||
mov \$1,%rax
|
||
mov (%rsi),%r15
|
||
mov 8(%rsi),%r14
|
||
mov 16(%rsi),%r13
|
||
mov 24(%rsi),%r12
|
||
mov 32(%rsi),%rbp
|
||
mov 40(%rsi),%rbx
|
||
lea 48(%rsi),%rsp
|
||
.Lmul4x_epilogue:
|
||
ret
|
||
.size bn_mul4x_mont,.-bn_mul4x_mont
|
||
___
|
||
}}}
|
||
{{{
|
||
######################################################################
|
||
# void bn_sqr8x_mont(
|
||
my $rptr="%rdi"; # const BN_ULONG *rptr,
|
||
my $aptr="%rsi"; # const BN_ULONG *aptr,
|
||
my $bptr="%rdx"; # not used
|
||
my $nptr="%rcx"; # const BN_ULONG *nptr,
|
||
my $n0 ="%r8"; # const BN_ULONG *n0,
|
||
my $num ="%r9"; # int num); num has to be divisible by 8
|
||
|
||
# Local register names for this scope; of these only $i is referenced in the
# code emitted below.
my ($i,$j,$tptr)=("%rbp","%rcx",$rptr);
|
||
my @A0=("%r10","%r11");
|
||
my @A1=("%r12","%r13");
|
||
my ($a0,$a1,$ai)=("%r14","%r15","%rbx");
|
||
|
||
# The squaring cores are only declared here; they are defined in another
# module (see the remarks on the .extern lines).
$code.=<<___ if ($addx);
|
||
.extern bn_sqrx8x_internal # see x86_64-mont5 module
|
||
___
|
||
# Wrapper reached through .Lsqr8x_enter from bn_mul_mont: build a stack
# frame placed so that it does not alias $aptr modulo 4096, save n0 at
# 32(%rsp) and the caller's %rsp at 40(%rsp), and copy the modulus into the
# frame with each 64-bit limb stored in its own 16-byte slot
# (.Lsqr8x_copy_n).
$code.=<<___;
|
||
.extern bn_sqr8x_internal # see x86_64-mont5 module
|
||
|
||
.type bn_sqr8x_mont,\@function,6
|
||
.align 32
|
||
bn_sqr8x_mont:
|
||
.Lsqr8x_enter:
|
||
mov %rsp,%rax
|
||
push %rbx
|
||
push %rbp
|
||
push %r12
|
||
push %r13
|
||
push %r14
|
||
push %r15
|
||
|
||
mov ${num}d,%r10d
|
||
shl \$3,${num}d # convert $num to bytes
|
||
shl \$3+2,%r10 # 4*$num
|
||
neg $num
|
||
|
||
##############################################################
|
||
# ensure that stack frame doesn't alias with $aptr modulo
|
||
# 4096. this is done to allow memory disambiguation logic
|
||
# do its job.
|
||
#
|
||
lea -64(%rsp,$num,4),%r11
|
||
mov ($n0),$n0 # *n0
|
||
sub $aptr,%r11
|
||
and \$4095,%r11
|
||
cmp %r11,%r10
|
||
jb .Lsqr8x_sp_alt
|
||
sub %r11,%rsp # align with $aptr
|
||
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
|
||
jmp .Lsqr8x_sp_done
|
||
|
||
.align 32
|
||
.Lsqr8x_sp_alt:
|
||
lea 4096-64(,$num,4),%r10 # 4096-frame-4*$num
|
||
lea -64(%rsp,$num,4),%rsp # alloca(frame+4*$num)
|
||
sub %r10,%r11
|
||
mov \$0,%r10
|
||
cmovc %r10,%r11
|
||
sub %r11,%rsp
|
||
.Lsqr8x_sp_done:
|
||
and \$-64,%rsp
|
||
mov $num,%r10
|
||
neg $num
|
||
|
||
lea 64(%rsp,$num,2),%r11 # copy of modulus
|
||
mov $n0, 32(%rsp)
|
||
mov %rax, 40(%rsp) # save original %rsp
|
||
.Lsqr8x_body:
|
||
|
||
mov $num,$i
|
||
movq %r11, %xmm2 # save pointer to modulus copy
|
||
shr \$3+2,$i
|
||
mov OPENSSL_ia32cap_P+8(%rip),%eax
|
||
jmp .Lsqr8x_copy_n
|
||
|
||
.align 32
|
||
.Lsqr8x_copy_n:
|
||
movq 8*0($nptr),%xmm0
|
||
movq 8*1($nptr),%xmm1
|
||
movq 8*2($nptr),%xmm3
|
||
movq 8*3($nptr),%xmm4
|
||
lea 8*4($nptr),$nptr
|
||
movdqa %xmm0,16*0(%r11)
|
||
movdqa %xmm1,16*1(%r11)
|
||
movdqa %xmm3,16*2(%r11)
|
||
movdqa %xmm4,16*3(%r11)
|
||
lea 16*4(%r11),%r11
|
||
dec $i
|
||
jnz .Lsqr8x_copy_n
|
||
|
||
pxor %xmm0,%xmm0
|
||
movq $rptr,%xmm1 # save $rptr
|
||
movq %r10, %xmm3 # -$num
|
||
___
|
||
# When both bits of mask 0x80100 are set in the capability word held in %eax,
# call bn_sqrx8x_internal and jump to the shared wipe loop at .Lsqr8x_zero;
# otherwise branch to .Lsqr8x_nox.
$code.=<<___ if ($addx);
|
||
and \$0x80100,%eax
|
||
cmp \$0x80100,%eax
|
||
jne .Lsqr8x_nox
|
||
|
||
call bn_sqrx8x_internal # see x86_64-mont5 module
|
||
|
||
pxor %xmm0,%xmm0
|
||
lea 48(%rsp),%rax
|
||
lea 64(%rsp,$num,2),%rdx
|
||
shr \$3+2,$num
|
||
mov 40(%rsp),%rsi # restore %rsp
|
||
jmp .Lsqr8x_zero
|
||
|
||
.align 32
|
||
.Lsqr8x_nox:
|
||
___
|
||
# Non-MULX path, followed by the shared tail: wipe the temporary area and the
# modulus copy with zeros, restore the callee-saved registers relative to the
# saved %rsp, and return 1.
$code.=<<___;
|
||
call bn_sqr8x_internal # see x86_64-mont5 module
|
||
|
||
pxor %xmm0,%xmm0
|
||
lea 48(%rsp),%rax
|
||
lea 64(%rsp,$num,2),%rdx
|
||
shr \$3+2,$num
|
||
mov 40(%rsp),%rsi # restore %rsp
|
||
jmp .Lsqr8x_zero
|
||
|
||
.align 32
|
||
.Lsqr8x_zero:
|
||
movdqa %xmm0,16*0(%rax) # wipe t
|
||
movdqa %xmm0,16*1(%rax)
|
||
movdqa %xmm0,16*2(%rax)
|
||
movdqa %xmm0,16*3(%rax)
|
||
lea 16*4(%rax),%rax
|
||
movdqa %xmm0,16*0(%rdx) # wipe n
|
||
movdqa %xmm0,16*1(%rdx)
|
||
movdqa %xmm0,16*2(%rdx)
|
||
movdqa %xmm0,16*3(%rdx)
|
||
lea 16*4(%rdx),%rdx
|
||
dec $num
|
||
jnz .Lsqr8x_zero
|
||
|
||
mov \$1,%rax
|
||
mov -48(%rsi),%r15
|
||
mov -40(%rsi),%r14
|
||
mov -32(%rsi),%r13
|
||
mov -24(%rsi),%r12
|
||
mov -16(%rsi),%rbp
|
||
mov -8(%rsi),%rbx
|
||
lea (%rsi),%rsp
|
||
.Lsqr8x_epilogue:
|
||
ret
|
||
.size bn_sqr8x_mont,.-bn_sqr8x_mont
|
||
___
|
||
}}}
|
||
|
||
if ($addx) {{{
|
||
my $bp="%rdx"; # original value
|
||
|
||
$code.=<<___;
|
||
.type bn_mulx4x_mont,\@function,6
|
||
.align 32
|
||
bn_mulx4x_mont:
|
||
.Lmulx4x_enter:
|
||
mov %rsp,%rax
|
||
push %rbx
|
||
push %rbp
|
||
push %r12
|
||
push %r13
|
||
push %r14
|
||
push %r15
|
||
|
||
shl \$3,${num}d # convert $num to bytes
|
||
.byte 0x67
|
||
xor %r10,%r10
|
||
sub $num,%r10 # -$num
|
||
mov ($n0),$n0 # *n0
|
||
lea -72(%rsp,%r10),%rsp # alloca(frame+$num+8)
|
||
lea ($bp,$num),%r10
|
||
and \$-128,%rsp
|
||
##############################################################
|
||
# Stack layout
|
||
# +0 num
|
||
# +8 off-loaded &b[i]
|
||
# +16 end of b[num]
|
||
# +24 saved n0
|
||
# +32 saved rp
|
||
# +40 saved %rsp
|
||
# +48 inner counter
|
||
# +56
|
||
# +64 tmp[num+1]
|
||
#
|
||
mov $num,0(%rsp) # save $num
|
||
shr \$5,$num
|
||
mov %r10,16(%rsp) # end of b[num]
|
||
sub \$1,$num
|
||
mov $n0, 24(%rsp) # save *n0
|
||
mov $rp, 32(%rsp) # save $rp
|
||
mov %rax,40(%rsp) # save original %rsp
|
||
mov $num,48(%rsp) # inner counter
|
||
jmp .Lmulx4x_body
|
||
|
||
.align 32
|
||
.Lmulx4x_body:
|
||
___
|
||
my ($aptr, $bptr, $nptr, $tptr, $mi, $bi, $zero, $num)=
|
||
("%rsi","%rdi","%rcx","%rbx","%r8","%r9","%rbp","%rax");
|
||
my $rptr=$bptr;
|
||
$code.=<<___;
|
||
lea 8($bp),$bptr
|
||
mov ($bp),%rdx # b[0], $bp==%rdx actually
|
||
lea 64+32(%rsp),$tptr
|
||
mov %rdx,$bi
|
||
|
||
mulx 0*8($aptr),$mi,%rax # a[0]*b[0]
|
||
mulx 1*8($aptr),%r11,%r14 # a[1]*b[0]
|
||
add %rax,%r11
|
||
mov $bptr,8(%rsp) # off-load &b[i]
|
||
mulx 2*8($aptr),%r12,%r13 # ...
|
||
adc %r14,%r12
|
||
adc \$0,%r13
|
||
|
||
mov $mi,$bptr # borrow $bptr
|
||
imulq 24(%rsp),$mi # "t[0]"*n0
|
||
xor $zero,$zero # cf=0, of=0
|
||
|
||
mulx 3*8($aptr),%rax,%r14
|
||
mov $mi,%rdx
|
||
lea 4*8($aptr),$aptr
|
||
adcx %rax,%r13
|
||
adcx $zero,%r14 # cf=0
|
||
|
||
mulx 0*8($nptr),%rax,%r10
|
||
adcx %rax,$bptr # discarded
|
||
adox %r11,%r10
|
||
mulx 1*8($nptr),%rax,%r11
|
||
adcx %rax,%r10
|
||
adox %r12,%r11
|
||
.byte 0xc4,0x62,0xfb,0xf6,0xa1,0x10,0x00,0x00,0x00 # mulx 2*8($nptr),%rax,%r12
|
||
mov 48(%rsp),$bptr # counter value
|
||
mov %r10,-4*8($tptr)
|
||
adcx %rax,%r11
|
||
adox %r13,%r12
|
||
mulx 3*8($nptr),%rax,%r15
|
||
mov $bi,%rdx
|
||
mov %r11,-3*8($tptr)
|
||
adcx %rax,%r12
|
||
adox $zero,%r15 # of=0
|
||
lea 4*8($nptr),$nptr
|
||
mov %r12,-2*8($tptr)
|
||
|
||
jmp .Lmulx4x_1st
|
||
|
||
.align 32
|
||
.Lmulx4x_1st:
|
||
adcx $zero,%r15 # cf=0, modulo-scheduled
|
||
mulx 0*8($aptr),%r10,%rax # a[4]*b[0]
|
||
adcx %r14,%r10
|
||
mulx 1*8($aptr),%r11,%r14 # a[5]*b[0]
|
||
adcx %rax,%r11
|
||
mulx 2*8($aptr),%r12,%rax # ...
|
||
adcx %r14,%r12
|
||
mulx 3*8($aptr),%r13,%r14
|
||
.byte 0x67,0x67
|
||
mov $mi,%rdx
|
||
adcx %rax,%r13
|
||
adcx $zero,%r14 # cf=0
|
||
lea 4*8($aptr),$aptr
|
||
lea 4*8($tptr),$tptr
|
||
|
||
adox %r15,%r10
|
||
mulx 0*8($nptr),%rax,%r15
|
||
adcx %rax,%r10
|
||
adox %r15,%r11
|
||
mulx 1*8($nptr),%rax,%r15
|
||
adcx %rax,%r11
|
||
adox %r15,%r12
|
||
mulx 2*8($nptr),%rax,%r15
|
||
mov %r10,-5*8($tptr)
|
||
adcx %rax,%r12
|
||
mov %r11,-4*8($tptr)
|
||
adox %r15,%r13
|
||
mulx 3*8($nptr),%rax,%r15
|
||
mov $bi,%rdx
|
||
mov %r12,-3*8($tptr)
|
||
adcx %rax,%r13
|
||
adox $zero,%r15
|
||
lea 4*8($nptr),$nptr
|
||
mov %r13,-2*8($tptr)
|
||
|
||
dec $bptr # of=0, pass cf
|
||
jnz .Lmulx4x_1st
|
||
|
||
mov 0(%rsp),$num # load num
|
||
mov 8(%rsp),$bptr # re-load &b[i]
|
||
adc $zero,%r15 # modulo-scheduled
|
||
add %r15,%r14
|
||
sbb %r15,%r15 # top-most carry
|
||
mov %r14,-1*8($tptr)
|
||
jmp .Lmulx4x_outer
|
||
|
||
.align 32
|
||
.Lmulx4x_outer:
|
||
mov ($bptr),%rdx # b[i]
|
||
lea 8($bptr),$bptr # b++
|
||
sub $num,$aptr # rewind $aptr
|
||
mov %r15,($tptr) # save top-most carry
|
||
lea 64+4*8(%rsp),$tptr
|
||
sub $num,$nptr # rewind $nptr
|
||
|
||
mulx 0*8($aptr),$mi,%r11 # a[0]*b[i]
|
||
xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
|
||
mov %rdx,$bi
|
||
mulx 1*8($aptr),%r14,%r12 # a[1]*b[i]
|
||
adox -4*8($tptr),$mi
|
||
adcx %r14,%r11
|
||
mulx 2*8($aptr),%r15,%r13 # ...
|
||
adox -3*8($tptr),%r11
|
||
adcx %r15,%r12
|
||
adox $zero,%r12
|
||
adcx $zero,%r13
|
||
|
||
mov $bptr,8(%rsp) # off-load &b[i]
|
||
.byte 0x67
|
||
mov $mi,%r15
|
||
imulq 24(%rsp),$mi # "t[0]"*n0
|
||
xor %ebp,%ebp # xor $zero,$zero # cf=0, of=0
|
||
|
||
mulx 3*8($aptr),%rax,%r14
|
||
mov $mi,%rdx
|
||
adox -2*8($tptr),%r12
|
||
adcx %rax,%r13
|
||
adox -1*8($tptr),%r13
|
||
adcx $zero,%r14
|
||
lea 4*8($aptr),$aptr
|
||
adox $zero,%r14
|
||
|
||
mulx 0*8($nptr),%rax,%r10
|
||
adcx %rax,%r15 # discarded
|
||
adox %r11,%r10
|
||
mulx 1*8($nptr),%rax,%r11
|
||
adcx %rax,%r10
|
||
adox %r12,%r11
|
||
mulx 2*8($nptr),%rax,%r12
|
||
mov %r10,-4*8($tptr)
|
||
adcx %rax,%r11
|
||
adox %r13,%r12
|
||
mulx 3*8($nptr),%rax,%r15
|
||
mov $bi,%rdx
|
||
mov %r11,-3*8($tptr)
|
||
lea 4*8($nptr),$nptr
|
||
adcx %rax,%r12
|
||
adox $zero,%r15 # of=0
|
||
mov 48(%rsp),$bptr # counter value
|
||
mov %r12,-2*8($tptr)
|
||
|
||
jmp .Lmulx4x_inner
|
||
|
||
.align 32
|
||
.Lmulx4x_inner:
|
||
mulx 0*8($aptr),%r10,%rax # a[4]*b[i]
|
||
adcx $zero,%r15 # cf=0, modulo-scheduled
|
||
adox %r14,%r10
|
||
mulx 1*8($aptr),%r11,%r14 # a[5]*b[i]
|
||
adcx 0*8($tptr),%r10
|
||
adox %rax,%r11
|
||
mulx 2*8($aptr),%r12,%rax # ...
|
||
adcx 1*8($tptr),%r11
|
||
adox %r14,%r12
|
||
mulx 3*8($aptr),%r13,%r14
|
||
mov $mi,%rdx
|
||
adcx 2*8($tptr),%r12
|
||
adox %rax,%r13
|
||
adcx 3*8($tptr),%r13
|
||
adox $zero,%r14 # of=0
|
||
lea 4*8($aptr),$aptr
|
||
lea 4*8($tptr),$tptr
|
||
adcx $zero,%r14 # cf=0
|
||
|
||
adox %r15,%r10
|
||
mulx 0*8($nptr),%rax,%r15
|
||
adcx %rax,%r10
|
||
adox %r15,%r11
|
||
mulx 1*8($nptr),%rax,%r15
|
||
adcx %rax,%r11
|
||
adox %r15,%r12
|
||
mulx 2*8($nptr),%rax,%r15
|
||
mov %r10,-5*8($tptr)
|
||
adcx %rax,%r12
|
||
adox %r15,%r13
|
||
mulx 3*8($nptr),%rax,%r15
|
||
mov $bi,%rdx
|
||
mov %r11,-4*8($tptr)
|
||
mov %r12,-3*8($tptr)
|
||
adcx %rax,%r13
|
||
adox $zero,%r15
|
||
lea 4*8($nptr),$nptr
|
||
mov %r13,-2*8($tptr)
|
||
|
||
dec $bptr # of=0, pass cf
|
||
jnz .Lmulx4x_inner
|
||
|
||
mov 0(%rsp),$num # load num
|
||
mov 8(%rsp),$bptr # re-load &b[i]
|
||
adc $zero,%r15 # modulo-scheduled
|
||
sub 0*8($tptr),$zero # pull top-most carry
|
||
adc %r15,%r14
|
||
mov -8($nptr),$mi
|
||
sbb %r15,%r15 # top-most carry
|
||
mov %r14,-1*8($tptr)
|
||
|
||
cmp 16(%rsp),$bptr
|
||
jne .Lmulx4x_outer
|
||
|
||
sub %r14,$mi # compare top-most words
|
||
sbb $mi,$mi
|
||
or $mi,%r15
|
||
|
||
neg $num
|
||
xor %rdx,%rdx
|
||
mov 32(%rsp),$rptr # restore rp
|
||
lea 64(%rsp),$tptr
|
||
|
||
pxor %xmm0,%xmm0
|
||
mov 0*8($nptr,$num),%r8
|
||
mov 1*8($nptr,$num),%r9
|
||
neg %r8
|
||
jmp .Lmulx4x_sub_entry
|
||
|
||
.align 32
|
||
.Lmulx4x_sub:
|
||
mov 0*8($nptr,$num),%r8
|
||
mov 1*8($nptr,$num),%r9
|
||
not %r8
|
||
.Lmulx4x_sub_entry:
|
||
mov 2*8($nptr,$num),%r10
|
||
not %r9
|
||
and %r15,%r8
|
||
mov 3*8($nptr,$num),%r11
|
||
not %r10
|
||
and %r15,%r9
|
||
not %r11
|
||
and %r15,%r10
|
||
and %r15,%r11
|
||
|
||
neg %rdx # mov %rdx,%cf
|
||
adc 0*8($tptr),%r8
|
||
adc 1*8($tptr),%r9
|
||
movdqa %xmm0,($tptr)
|
||
adc 2*8($tptr),%r10
|
||
adc 3*8($tptr),%r11
|
||
movdqa %xmm0,16($tptr)
|
||
lea 4*8($tptr),$tptr
|
||
sbb %rdx,%rdx # mov %cf,%rdx
|
||
|
||
mov %r8,0*8($rptr)
|
||
mov %r9,1*8($rptr)
|
||
mov %r10,2*8($rptr)
|
||
mov %r11,3*8($rptr)
|
||
lea 4*8($rptr),$rptr
|
||
|
||
add \$32,$num
|
||
jnz .Lmulx4x_sub
|
||
|
||
mov 40(%rsp),%rsi # restore %rsp
|
||
mov \$1,%rax
|
||
mov -48(%rsi),%r15
|
||
mov -40(%rsi),%r14
|
||
mov -32(%rsi),%r13
|
||
mov -24(%rsi),%r12
|
||
mov -16(%rsi),%rbp
|
||
mov -8(%rsi),%rbx
|
||
lea (%rsi),%rsp
|
||
.Lmulx4x_epilogue:
|
||
ret
|
||
.size bn_mulx4x_mont,.-bn_mulx4x_mont
|
||
___
|
||
}}}
|
||
# Append the module identification string to the generated assembly and
# re-align whatever is emitted after it to a 16-byte boundary.
# (Stray "|" / "||" table-markup lines that had leaked into this heredoc,
# and would have been emitted as assembly, are removed.)
$code.=<<___;
.asciz	"Montgomery Multiplication for x86_64, CRYPTOGAMS by <appro\@openssl.org>"
.align	16
___

# Win64 structured-exception-handling (SEH) support.
#
# EXCEPTION_DISPOSITION handler (EXCEPTION_RECORD *rec,ULONG64 frame,
#		CONTEXT *context,DISPATCHER_CONTEXT *disp)
#
# Two language-specific handlers are emitted, followed by the .pdata/.xdata
# tables that attach them to the bn_*_mont routines.  Each handler compares
# the faulting Rip against the two labels stored in HandlerData[] (end of
# prologue, epilogue).  When Rip lies between them it locates the stack
# pointer the routine saved, reloads rbx/rbp/r12-r15 from just below it and
# writes them into the CONTEXT record.  Both handlers then share
# .Lcommon_seh_tail, which patches Rsp/Rsi/Rdi, copies the CONTEXT into
# disp->ContextRecord and calls RtlVirtualUnwind.
#
# (Stray "|" / "||" table-markup lines that had leaked between every
# statement, including inside the heredocs, are removed; the instructions
# and directives themselves are unchanged.)
if ($win64) {
# The four handler arguments arrive in the Microsoft x64 argument registers.
# Only $context and $disp are referenced by the code below.
$rec="%rcx";
$frame="%rdx";
$context="%r8";
$disp="%r9";

$code.=<<___;
.extern	__imp_RtlVirtualUnwind

# mul_handler: for routines whose saved stack pointer is stored at
# 8(%rsp,num,8), where num is read from the CONTEXT record at offset 192.
# Referenced by the bn_mul_mont and bn_mul4x_mont entries in .xdata below.
.type	mul_handler,\@abi-omnipotent
.align	16
mul_handler:
	# Preserve every register this handler touches, plus the flags, and
	# reserve 64 bytes of stack for the outgoing RtlVirtualUnwind call.
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	# HandlerData[] holds image-relative addresses; add ImageBase first.
	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<end of prologue label
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=epilogue label
	jae	.Lcommon_seh_tail

	mov	192($context),%r10	# pull $num
	mov	8(%rax,%r10,8),%rax	# pull saved stack pointer
	# Advance past the six 8-byte slots read at -8 .. -48 below.
	lea	48(%rax),%rax

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	jmp	.Lcommon_seh_tail
.size	mul_handler,.-mul_handler

# sqr_handler: for routines that keep the saved stack pointer at 40(%rsp).
# Referenced by the bn_sqr8x_mont and bn_mulx4x_mont entries in .xdata below.
# The shared tail .Lcommon_seh_tail lives inside this function.
.type	sqr_handler,\@abi-omnipotent
.align	16
sqr_handler:
	# Same register/flag save and 64-byte reservation as mul_handler, so
	# that the shared tail can undo either handler's frame.
	push	%rsi
	push	%rdi
	push	%rbx
	push	%rbp
	push	%r12
	push	%r13
	push	%r14
	push	%r15
	pushfq
	sub	\$64,%rsp

	mov	120($context),%rax	# pull context->Rax
	mov	248($context),%rbx	# pull context->Rip

	mov	8($disp),%rsi		# disp->ImageBase
	mov	56($disp),%r11		# disp->HandlerData

	mov	0(%r11),%r10d		# HandlerData[0]
	lea	(%rsi,%r10),%r10	# end of prologue label
	cmp	%r10,%rbx		# context->Rip<.Lsqr_body
	jb	.Lcommon_seh_tail

	mov	152($context),%rax	# pull context->Rsp

	mov	4(%r11),%r10d		# HandlerData[1]
	lea	(%rsi,%r10),%r10	# epilogue label
	cmp	%r10,%rbx		# context->Rip>=.Lsqr_epilogue
	jae	.Lcommon_seh_tail

	mov	40(%rax),%rax		# pull saved stack pointer

	mov	-8(%rax),%rbx
	mov	-16(%rax),%rbp
	mov	-24(%rax),%r12
	mov	-32(%rax),%r13
	mov	-40(%rax),%r14
	mov	-48(%rax),%r15
	mov	%rbx,144($context)	# restore context->Rbx
	mov	%rbp,160($context)	# restore context->Rbp
	mov	%r12,216($context)	# restore context->R12
	mov	%r13,224($context)	# restore context->R13
	mov	%r14,232($context)	# restore context->R14
	mov	%r15,240($context)	# restore context->R15

	# Shared tail.  On entry rax holds the stack pointer value to report:
	# context->Rax when Rip is before the end-of-prologue label,
	# context->Rsp when Rip is at or past the epilogue label, otherwise
	# the recovered saved stack pointer.
.Lcommon_seh_tail:
	mov	8(%rax),%rdi
	mov	16(%rax),%rsi
	mov	%rax,152($context)	# restore context->Rsp
	mov	%rsi,168($context)	# restore context->Rsi
	mov	%rdi,176($context)	# restore context->Rdi

	# Copy the patched CONTEXT into disp->ContextRecord; ecx is the
	# count of 8-byte words moved by rep movsq.
	mov	40($disp),%rdi		# disp->ContextRecord
	mov	$context,%rsi		# context
	mov	\$154,%ecx		# sizeof(CONTEXT)
	.long	0xa548f3fc		# cld; rep movsq

	# Marshal the eight RtlVirtualUnwind arguments: four in registers,
	# four in the stack area reserved on entry.
	mov	$disp,%rsi
	xor	%rcx,%rcx		# arg1, UNW_FLAG_NHANDLER
	mov	8(%rsi),%rdx		# arg2, disp->ImageBase
	mov	0(%rsi),%r8		# arg3, disp->ControlPc
	mov	16(%rsi),%r9		# arg4, disp->FunctionEntry
	mov	40(%rsi),%r10		# disp->ContextRecord
	lea	56(%rsi),%r11		# &disp->HandlerData
	lea	24(%rsi),%r12		# &disp->EstablisherFrame
	mov	%r10,32(%rsp)		# arg5
	mov	%r11,40(%rsp)		# arg6
	mov	%r12,48(%rsp)		# arg7
	mov	%rcx,56(%rsp)		# arg8, (NULL)
	call	*__imp_RtlVirtualUnwind(%rip)

	# Undo the entry sequence in reverse order and return.
	mov	\$1,%eax		# ExceptionContinueSearch
	add	\$64,%rsp
	popfq
	pop	%r15
	pop	%r14
	pop	%r13
	pop	%r12
	pop	%rbp
	pop	%rbx
	pop	%rdi
	pop	%rsi
	ret
.size	sqr_handler,.-sqr_handler

# .pdata: one (begin, end, unwind-info) triple of image-relative addresses
# per covered routine.
.section	.pdata
.align	4
	.rva	.LSEH_begin_bn_mul_mont
	.rva	.LSEH_end_bn_mul_mont
	.rva	.LSEH_info_bn_mul_mont

	.rva	.LSEH_begin_bn_mul4x_mont
	.rva	.LSEH_end_bn_mul4x_mont
	.rva	.LSEH_info_bn_mul4x_mont

	.rva	.LSEH_begin_bn_sqr8x_mont
	.rva	.LSEH_end_bn_sqr8x_mont
	.rva	.LSEH_info_bn_sqr8x_mont
___
# The MULX/ADX routine only exists when $addx is set, so its table entries
# are emitted under the same condition.
$code.=<<___ if ($addx);
	.rva	.LSEH_begin_bn_mulx4x_mont
	.rva	.LSEH_end_bn_mulx4x_mont
	.rva	.LSEH_info_bn_mulx4x_mont
___
$code.=<<___;
# .xdata: per-routine unwind info.  Each record is a 4-byte header, the
# handler address, then the two labels the handler reads as HandlerData[0]
# (end of prologue) and HandlerData[1] (epilogue).  Header byte 9 encodes
# unwind version 1 with UNW_FLAG_EHANDLER per the Microsoft x64 UNWIND_INFO
# layout.
.section	.xdata
.align	8
.LSEH_info_bn_mul_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul_body,.Lmul_epilogue	# HandlerData[]
.LSEH_info_bn_mul4x_mont:
	.byte	9,0,0,0
	.rva	mul_handler
	.rva	.Lmul4x_body,.Lmul4x_epilogue	# HandlerData[]
.LSEH_info_bn_sqr8x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lsqr8x_body,.Lsqr8x_epilogue	# HandlerData[]
___
# bn_mulx4x_mont uses sqr_handler: like the sqr8x path, it keeps its saved
# stack pointer at 40(%rsp).
$code.=<<___ if ($addx);
.LSEH_info_bn_mulx4x_mont:
	.byte	9,0,0,0
	.rva	sqr_handler
	.rva	.Lmulx4x_body,.Lmulx4x_epilogue	# HandlerData[]
___
}

# Write the accumulated assembly text to STDOUT.
# close() is where a failed flush of buffered output (disk full, broken pipe)
# is reported; check it so that a truncated assembly file makes the
# generator exit non-zero instead of silently succeeding.
print $code;
close STDOUT or die "error closing STDOUT: $!";