b4cc925c30
This function is only called twice per ECDH or ECDSA operation, and it only saves a few scalar multiplications and additions compared to the alternative, so it doesn't need to be specialized. As the TODO comment above the callers notes, the two calls can be reduced to one. Implementing |ecp_nistz256_from_mont| in terms of |ecp_nistz256_mul_mont| helps show that that change is safe. This also saves a small amount of code size and improves testing and verification efficiency. Note that this is already how the function is implemented for targets other than x86-64 in OpenSSL. Change-Id: If1404951f1a787d2618c853afd1f0e99a019e012 Reviewed-on: https://boringssl-review.googlesource.com/13021 Reviewed-by: Adam Langley <alangley@gmail.com>
2589 lines
57 KiB
Raku
Executable File
2589 lines
57 KiB
Raku
Executable File
#!/usr/bin/env perl
|
|
|
|
# Copyright (c) 2014, Intel Corporation.
|
|
#
|
|
# Permission to use, copy, modify, and/or distribute this software for any
|
|
# purpose with or without fee is hereby granted, provided that the above
|
|
# copyright notice and this permission notice appear in all copies.
|
|
#
|
|
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
|
|
# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
|
|
# MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY
|
|
# SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
|
|
# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN ACTION
|
|
# OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF OR IN
|
|
# CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
|
|
|
|
# Developers and authors:
|
|
# Shay Gueron (1, 2), and Vlad Krasnov (1)
|
|
# (1) Intel Corporation, Israel Development Center
|
|
# (2) University of Haifa
|
|
|
|
# Reference:
|
|
# S.Gueron and V.Krasnov, "Fast Prime Field Elliptic Curve Cryptography with
|
|
# 256 Bit Primes"
|
|
|
|
# Further optimization by <appro@openssl.org>:
|
|
#
|
|
# this/original
|
|
# Opteron +12-49%
|
|
# Bulldozer +14-45%
|
|
# P4 +18-46%
|
|
# Westmere +12-34%
|
|
# Sandy Bridge +9-35%
|
|
# Ivy Bridge +9-35%
|
|
# Haswell +8-37%
|
|
# Broadwell +18-58%
|
|
# Atom +15-50%
|
|
# VIA Nano +43-160%
|
|
#
|
|
# Ranges denote minimum and maximum improvement coefficients depending
|
|
# on benchmark.
|
|
|
|
$flavour = shift;
|
|
$output = shift;
|
|
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
|
|
|
|
$win64=0; $win64=1 if ($flavour =~ /[nm]asm|mingw64/ || $output =~ /\.asm$/);
|
|
|
|
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
|
|
( $xlate="${dir}x86_64-xlate.pl" and -f $xlate ) or
|
|
( $xlate="${dir}../../perlasm/x86_64-xlate.pl" and -f $xlate) or
|
|
die "can't locate x86_64-xlate.pl";
|
|
|
|
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
|
|
*STDOUT=*OUT;
|
|
|
|
# TODO: enable these after testing. $avx goes to two and $addx to one.
|
|
$avx=0;
|
|
$addx=0;
|
|
|
|
$code.=<<___;
|
|
.text
|
|
.extern OPENSSL_ia32cap_P
|
|
|
|
# The polynomial
|
|
.align 64
|
|
.Lpoly:
|
|
.quad 0xffffffffffffffff, 0x00000000ffffffff, 0x0000000000000000, 0xffffffff00000001
|
|
|
|
.LOne:
|
|
.long 1,1,1,1,1,1,1,1
|
|
.LTwo:
|
|
.long 2,2,2,2,2,2,2,2
|
|
.LThree:
|
|
.long 3,3,3,3,3,3,3,3
|
|
.LONE_mont:
|
|
.quad 0x0000000000000001, 0xffffffff00000000, 0xffffffffffffffff, 0x00000000fffffffe
|
|
___
|
|
|
|
{
|
|
my ($a0,$a1,$a2,$a3)=map("%r$_",(8..11));
|
|
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rdx","%rcx","%r12","%r13");
|
|
my ($r_ptr,$a_ptr,$b_ptr)=("%rdi","%rsi","%rdx");
|
|
|
|
$code.=<<___;
|
|
|
|
################################################################################
|
|
# void ecp_nistz256_neg(uint64_t res[4], uint64_t a[4]);
|
|
.globl ecp_nistz256_neg
|
|
.type ecp_nistz256_neg,\@function,2
|
|
.align 32
|
|
ecp_nistz256_neg:
|
|
push %r12
|
|
push %r13
|
|
|
|
xor $a0, $a0
|
|
xor $a1, $a1
|
|
xor $a2, $a2
|
|
xor $a3, $a3
|
|
xor $t4, $t4
|
|
|
|
sub 8*0($a_ptr), $a0
|
|
sbb 8*1($a_ptr), $a1
|
|
sbb 8*2($a_ptr), $a2
|
|
mov $a0, $t0
|
|
sbb 8*3($a_ptr), $a3
|
|
lea .Lpoly(%rip), $a_ptr
|
|
mov $a1, $t1
|
|
sbb \$0, $t4
|
|
|
|
add 8*0($a_ptr), $a0
|
|
mov $a2, $t2
|
|
adc 8*1($a_ptr), $a1
|
|
adc 8*2($a_ptr), $a2
|
|
mov $a3, $t3
|
|
adc 8*3($a_ptr), $a3
|
|
test $t4, $t4
|
|
|
|
cmovz $t0, $a0
|
|
cmovz $t1, $a1
|
|
mov $a0, 8*0($r_ptr)
|
|
cmovz $t2, $a2
|
|
mov $a1, 8*1($r_ptr)
|
|
cmovz $t3, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
|
|
pop %r13
|
|
pop %r12
|
|
ret
|
|
.size ecp_nistz256_neg,.-ecp_nistz256_neg
|
|
___
|
|
}
|
|
{
|
|
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
|
|
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
|
|
my ($t0,$t1,$t2,$t3,$t4)=("%rcx","%rbp","%rbx","%rdx","%rax");
|
|
my ($poly1,$poly3)=($acc6,$acc7);
|
|
|
|
$code.=<<___;
|
|
################################################################################
|
|
# void ecp_nistz256_mul_mont(
|
|
# uint64_t res[4],
|
|
# uint64_t a[4],
|
|
# uint64_t b[4]);
|
|
|
|
.globl ecp_nistz256_mul_mont
|
|
.type ecp_nistz256_mul_mont,\@function,3
|
|
.align 32
|
|
ecp_nistz256_mul_mont:
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
mov \$0x80100, %ecx
|
|
and OPENSSL_ia32cap_P+8(%rip), %ecx
|
|
___
|
|
$code.=<<___;
|
|
.Lmul_mont:
|
|
push %rbp
|
|
push %rbx
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
cmp \$0x80100, %ecx
|
|
je .Lmul_montx
|
|
___
|
|
$code.=<<___;
|
|
mov $b_org, $b_ptr
|
|
mov 8*0($b_org), %rax
|
|
mov 8*0($a_ptr), $acc1
|
|
mov 8*1($a_ptr), $acc2
|
|
mov 8*2($a_ptr), $acc3
|
|
mov 8*3($a_ptr), $acc4
|
|
|
|
call __ecp_nistz256_mul_montq
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
jmp .Lmul_mont_done
|
|
|
|
.align 32
|
|
.Lmul_montx:
|
|
mov $b_org, $b_ptr
|
|
mov 8*0($b_org), %rdx
|
|
mov 8*0($a_ptr), $acc1
|
|
mov 8*1($a_ptr), $acc2
|
|
mov 8*2($a_ptr), $acc3
|
|
mov 8*3($a_ptr), $acc4
|
|
lea -128($a_ptr), $a_ptr # control u-op density
|
|
|
|
call __ecp_nistz256_mul_montx
|
|
___
|
|
$code.=<<___;
|
|
.Lmul_mont_done:
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %rbx
|
|
pop %rbp
|
|
ret
|
|
.size ecp_nistz256_mul_mont,.-ecp_nistz256_mul_mont
|
|
|
|
.type __ecp_nistz256_mul_montq,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_mul_montq:
|
|
########################################################################
|
|
# Multiply a by b[0]
|
|
mov %rax, $t1
|
|
mulq $acc1
|
|
mov .Lpoly+8*1(%rip),$poly1
|
|
mov %rax, $acc0
|
|
mov $t1, %rax
|
|
mov %rdx, $acc1
|
|
|
|
mulq $acc2
|
|
mov .Lpoly+8*3(%rip),$poly3
|
|
add %rax, $acc1
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $acc2
|
|
|
|
mulq $acc3
|
|
add %rax, $acc2
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $acc3
|
|
|
|
mulq $acc4
|
|
add %rax, $acc3
|
|
mov $acc0, %rax
|
|
adc \$0, %rdx
|
|
xor $acc5, $acc5
|
|
mov %rdx, $acc4
|
|
|
|
########################################################################
|
|
# First reduction step
|
|
# Basically now we want to multiply acc[0] by p256,
|
|
# and add the result to the acc.
|
|
# Due to the special form of p256 we do some optimizations
|
|
#
|
|
# acc[0] x p256[0..1] = acc[0] x 2^96 - acc[0]
|
|
# then we add acc[0] and get acc[0] x 2^96
|
|
|
|
mov $acc0, $t1
|
|
shl \$32, $acc0
|
|
mulq $poly3
|
|
shr \$32, $t1
|
|
add $acc0, $acc1 # +=acc[0]<<96
|
|
adc $t1, $acc2
|
|
adc %rax, $acc3
|
|
mov 8*1($b_ptr), %rax
|
|
adc %rdx, $acc4
|
|
adc \$0, $acc5
|
|
xor $acc0, $acc0
|
|
|
|
########################################################################
|
|
# Multiply by b[1]
|
|
mov %rax, $t1
|
|
mulq 8*0($a_ptr)
|
|
add %rax, $acc1
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*1($a_ptr)
|
|
add $t0, $acc2
|
|
adc \$0, %rdx
|
|
add %rax, $acc2
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*2($a_ptr)
|
|
add $t0, $acc3
|
|
adc \$0, %rdx
|
|
add %rax, $acc3
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*3($a_ptr)
|
|
add $t0, $acc4
|
|
adc \$0, %rdx
|
|
add %rax, $acc4
|
|
mov $acc1, %rax
|
|
adc %rdx, $acc5
|
|
adc \$0, $acc0
|
|
|
|
########################################################################
|
|
# Second reduction step
|
|
mov $acc1, $t1
|
|
shl \$32, $acc1
|
|
mulq $poly3
|
|
shr \$32, $t1
|
|
add $acc1, $acc2
|
|
adc $t1, $acc3
|
|
adc %rax, $acc4
|
|
mov 8*2($b_ptr), %rax
|
|
adc %rdx, $acc5
|
|
adc \$0, $acc0
|
|
xor $acc1, $acc1
|
|
|
|
########################################################################
|
|
# Multiply by b[2]
|
|
mov %rax, $t1
|
|
mulq 8*0($a_ptr)
|
|
add %rax, $acc2
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*1($a_ptr)
|
|
add $t0, $acc3
|
|
adc \$0, %rdx
|
|
add %rax, $acc3
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*2($a_ptr)
|
|
add $t0, $acc4
|
|
adc \$0, %rdx
|
|
add %rax, $acc4
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*3($a_ptr)
|
|
add $t0, $acc5
|
|
adc \$0, %rdx
|
|
add %rax, $acc5
|
|
mov $acc2, %rax
|
|
adc %rdx, $acc0
|
|
adc \$0, $acc1
|
|
|
|
########################################################################
|
|
# Third reduction step
|
|
mov $acc2, $t1
|
|
shl \$32, $acc2
|
|
mulq $poly3
|
|
shr \$32, $t1
|
|
add $acc2, $acc3
|
|
adc $t1, $acc4
|
|
adc %rax, $acc5
|
|
mov 8*3($b_ptr), %rax
|
|
adc %rdx, $acc0
|
|
adc \$0, $acc1
|
|
xor $acc2, $acc2
|
|
|
|
########################################################################
|
|
# Multiply by b[3]
|
|
mov %rax, $t1
|
|
mulq 8*0($a_ptr)
|
|
add %rax, $acc3
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*1($a_ptr)
|
|
add $t0, $acc4
|
|
adc \$0, %rdx
|
|
add %rax, $acc4
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*2($a_ptr)
|
|
add $t0, $acc5
|
|
adc \$0, %rdx
|
|
add %rax, $acc5
|
|
mov $t1, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq 8*3($a_ptr)
|
|
add $t0, $acc0
|
|
adc \$0, %rdx
|
|
add %rax, $acc0
|
|
mov $acc3, %rax
|
|
adc %rdx, $acc1
|
|
adc \$0, $acc2
|
|
|
|
########################################################################
|
|
# Final reduction step
|
|
mov $acc3, $t1
|
|
shl \$32, $acc3
|
|
mulq $poly3
|
|
shr \$32, $t1
|
|
add $acc3, $acc4
|
|
adc $t1, $acc5
|
|
mov $acc4, $t0
|
|
adc %rax, $acc0
|
|
adc %rdx, $acc1
|
|
mov $acc5, $t1
|
|
adc \$0, $acc2
|
|
|
|
########################################################################
|
|
# Branch-less conditional subtraction of P
|
|
sub \$-1, $acc4 # .Lpoly[0]
|
|
mov $acc0, $t2
|
|
sbb $poly1, $acc5 # .Lpoly[1]
|
|
sbb \$0, $acc0 # .Lpoly[2]
|
|
mov $acc1, $t3
|
|
sbb $poly3, $acc1 # .Lpoly[3]
|
|
sbb \$0, $acc2
|
|
|
|
cmovc $t0, $acc4
|
|
cmovc $t1, $acc5
|
|
mov $acc4, 8*0($r_ptr)
|
|
cmovc $t2, $acc0
|
|
mov $acc5, 8*1($r_ptr)
|
|
cmovc $t3, $acc1
|
|
mov $acc0, 8*2($r_ptr)
|
|
mov $acc1, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_mul_montq,.-__ecp_nistz256_mul_montq
|
|
|
|
################################################################################
|
|
# void ecp_nistz256_sqr_mont(
|
|
# uint64_t res[4],
|
|
# uint64_t a[4]);
|
|
|
|
# we optimize the square according to S.Gueron and V.Krasnov,
|
|
# "Speeding up Big-Number Squaring"
|
|
.globl ecp_nistz256_sqr_mont
|
|
.type ecp_nistz256_sqr_mont,\@function,2
|
|
.align 32
|
|
ecp_nistz256_sqr_mont:
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
mov \$0x80100, %ecx
|
|
and OPENSSL_ia32cap_P+8(%rip), %ecx
|
|
___
|
|
$code.=<<___;
|
|
push %rbp
|
|
push %rbx
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
cmp \$0x80100, %ecx
|
|
je .Lsqr_montx
|
|
___
|
|
$code.=<<___;
|
|
mov 8*0($a_ptr), %rax
|
|
mov 8*1($a_ptr), $acc6
|
|
mov 8*2($a_ptr), $acc7
|
|
mov 8*3($a_ptr), $acc0
|
|
|
|
call __ecp_nistz256_sqr_montq
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
jmp .Lsqr_mont_done
|
|
|
|
.align 32
|
|
.Lsqr_montx:
|
|
mov 8*0($a_ptr), %rdx
|
|
mov 8*1($a_ptr), $acc6
|
|
mov 8*2($a_ptr), $acc7
|
|
mov 8*3($a_ptr), $acc0
|
|
lea -128($a_ptr), $a_ptr # control u-op density
|
|
|
|
call __ecp_nistz256_sqr_montx
|
|
___
|
|
$code.=<<___;
|
|
.Lsqr_mont_done:
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %rbx
|
|
pop %rbp
|
|
ret
|
|
.size ecp_nistz256_sqr_mont,.-ecp_nistz256_sqr_mont
|
|
|
|
.type __ecp_nistz256_sqr_montq,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_sqr_montq:
|
|
mov %rax, $acc5
|
|
mulq $acc6 # a[1]*a[0]
|
|
mov %rax, $acc1
|
|
mov $acc7, %rax
|
|
mov %rdx, $acc2
|
|
|
|
mulq $acc5 # a[0]*a[2]
|
|
add %rax, $acc2
|
|
mov $acc0, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $acc3
|
|
|
|
mulq $acc5 # a[0]*a[3]
|
|
add %rax, $acc3
|
|
mov $acc7, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $acc4
|
|
|
|
#################################
|
|
mulq $acc6 # a[1]*a[2]
|
|
add %rax, $acc3
|
|
mov $acc0, %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t1
|
|
|
|
mulq $acc6 # a[1]*a[3]
|
|
add %rax, $acc4
|
|
mov $acc0, %rax
|
|
adc \$0, %rdx
|
|
add $t1, $acc4
|
|
mov %rdx, $acc5
|
|
adc \$0, $acc5
|
|
|
|
#################################
|
|
mulq $acc7 # a[2]*a[3]
|
|
xor $acc7, $acc7
|
|
add %rax, $acc5
|
|
mov 8*0($a_ptr), %rax
|
|
mov %rdx, $acc6
|
|
adc \$0, $acc6
|
|
|
|
add $acc1, $acc1 # acc1:6<<1
|
|
adc $acc2, $acc2
|
|
adc $acc3, $acc3
|
|
adc $acc4, $acc4
|
|
adc $acc5, $acc5
|
|
adc $acc6, $acc6
|
|
adc \$0, $acc7
|
|
|
|
mulq %rax
|
|
mov %rax, $acc0
|
|
mov 8*1($a_ptr), %rax
|
|
mov %rdx, $t0
|
|
|
|
mulq %rax
|
|
add $t0, $acc1
|
|
adc %rax, $acc2
|
|
mov 8*2($a_ptr), %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq %rax
|
|
add $t0, $acc3
|
|
adc %rax, $acc4
|
|
mov 8*3($a_ptr), %rax
|
|
adc \$0, %rdx
|
|
mov %rdx, $t0
|
|
|
|
mulq %rax
|
|
add $t0, $acc5
|
|
adc %rax, $acc6
|
|
mov $acc0, %rax
|
|
adc %rdx, $acc7
|
|
|
|
mov .Lpoly+8*1(%rip), $a_ptr
|
|
mov .Lpoly+8*3(%rip), $t1
|
|
|
|
##########################################
|
|
# Now the reduction
|
|
# First iteration
|
|
mov $acc0, $t0
|
|
shl \$32, $acc0
|
|
mulq $t1
|
|
shr \$32, $t0
|
|
add $acc0, $acc1 # +=acc[0]<<96
|
|
adc $t0, $acc2
|
|
adc %rax, $acc3
|
|
mov $acc1, %rax
|
|
adc \$0, %rdx
|
|
|
|
##########################################
|
|
# Second iteration
|
|
mov $acc1, $t0
|
|
shl \$32, $acc1
|
|
mov %rdx, $acc0
|
|
mulq $t1
|
|
shr \$32, $t0
|
|
add $acc1, $acc2
|
|
adc $t0, $acc3
|
|
adc %rax, $acc0
|
|
mov $acc2, %rax
|
|
adc \$0, %rdx
|
|
|
|
##########################################
|
|
# Third iteration
|
|
mov $acc2, $t0
|
|
shl \$32, $acc2
|
|
mov %rdx, $acc1
|
|
mulq $t1
|
|
shr \$32, $t0
|
|
add $acc2, $acc3
|
|
adc $t0, $acc0
|
|
adc %rax, $acc1
|
|
mov $acc3, %rax
|
|
adc \$0, %rdx
|
|
|
|
###########################################
|
|
# Last iteration
|
|
mov $acc3, $t0
|
|
shl \$32, $acc3
|
|
mov %rdx, $acc2
|
|
mulq $t1
|
|
shr \$32, $t0
|
|
add $acc3, $acc0
|
|
adc $t0, $acc1
|
|
adc %rax, $acc2
|
|
adc \$0, %rdx
|
|
xor $acc3, $acc3
|
|
|
|
############################################
|
|
# Add the rest of the acc
|
|
add $acc0, $acc4
|
|
adc $acc1, $acc5
|
|
mov $acc4, $acc0
|
|
adc $acc2, $acc6
|
|
adc %rdx, $acc7
|
|
mov $acc5, $acc1
|
|
adc \$0, $acc3
|
|
|
|
sub \$-1, $acc4 # .Lpoly[0]
|
|
mov $acc6, $acc2
|
|
sbb $a_ptr, $acc5 # .Lpoly[1]
|
|
sbb \$0, $acc6 # .Lpoly[2]
|
|
mov $acc7, $t0
|
|
sbb $t1, $acc7 # .Lpoly[3]
|
|
sbb \$0, $acc3
|
|
|
|
cmovc $acc0, $acc4
|
|
cmovc $acc1, $acc5
|
|
mov $acc4, 8*0($r_ptr)
|
|
cmovc $acc2, $acc6
|
|
mov $acc5, 8*1($r_ptr)
|
|
cmovc $t0, $acc7
|
|
mov $acc6, 8*2($r_ptr)
|
|
mov $acc7, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_sqr_montq,.-__ecp_nistz256_sqr_montq
|
|
___
|
|
|
|
if ($addx) {
|
|
$code.=<<___;
|
|
.type __ecp_nistz256_mul_montx,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_mul_montx:
|
|
########################################################################
|
|
# Multiply by b[0]
|
|
mulx $acc1, $acc0, $acc1
|
|
mulx $acc2, $t0, $acc2
|
|
mov \$32, $poly1
|
|
xor $acc5, $acc5 # cf=0
|
|
mulx $acc3, $t1, $acc3
|
|
mov .Lpoly+8*3(%rip), $poly3
|
|
adc $t0, $acc1
|
|
mulx $acc4, $t0, $acc4
|
|
mov $acc0, %rdx
|
|
adc $t1, $acc2
|
|
shlx $poly1,$acc0,$t1
|
|
adc $t0, $acc3
|
|
shrx $poly1,$acc0,$t0
|
|
adc \$0, $acc4
|
|
|
|
########################################################################
|
|
# First reduction step
|
|
add $t1, $acc1
|
|
adc $t0, $acc2
|
|
|
|
mulx $poly3, $t0, $t1
|
|
mov 8*1($b_ptr), %rdx
|
|
adc $t0, $acc3
|
|
adc $t1, $acc4
|
|
adc \$0, $acc5
|
|
xor $acc0, $acc0 # $acc0=0,cf=0,of=0
|
|
|
|
########################################################################
|
|
# Multiply by b[1]
|
|
mulx 8*0+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc1
|
|
adox $t1, $acc2
|
|
|
|
mulx 8*1+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc2
|
|
adox $t1, $acc3
|
|
|
|
mulx 8*2+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc3
|
|
adox $t1, $acc4
|
|
|
|
mulx 8*3+128($a_ptr), $t0, $t1
|
|
mov $acc1, %rdx
|
|
adcx $t0, $acc4
|
|
shlx $poly1, $acc1, $t0
|
|
adox $t1, $acc5
|
|
shrx $poly1, $acc1, $t1
|
|
|
|
adcx $acc0, $acc5
|
|
adox $acc0, $acc0
|
|
adc \$0, $acc0
|
|
|
|
########################################################################
|
|
# Second reduction step
|
|
add $t0, $acc2
|
|
adc $t1, $acc3
|
|
|
|
mulx $poly3, $t0, $t1
|
|
mov 8*2($b_ptr), %rdx
|
|
adc $t0, $acc4
|
|
adc $t1, $acc5
|
|
adc \$0, $acc0
|
|
xor $acc1 ,$acc1 # $acc1=0,cf=0,of=0
|
|
|
|
########################################################################
|
|
# Multiply by b[2]
|
|
mulx 8*0+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc2
|
|
adox $t1, $acc3
|
|
|
|
mulx 8*1+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc3
|
|
adox $t1, $acc4
|
|
|
|
mulx 8*2+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc4
|
|
adox $t1, $acc5
|
|
|
|
mulx 8*3+128($a_ptr), $t0, $t1
|
|
mov $acc2, %rdx
|
|
adcx $t0, $acc5
|
|
shlx $poly1, $acc2, $t0
|
|
adox $t1, $acc0
|
|
shrx $poly1, $acc2, $t1
|
|
|
|
adcx $acc1, $acc0
|
|
adox $acc1, $acc1
|
|
adc \$0, $acc1
|
|
|
|
########################################################################
|
|
# Third reduction step
|
|
add $t0, $acc3
|
|
adc $t1, $acc4
|
|
|
|
mulx $poly3, $t0, $t1
|
|
mov 8*3($b_ptr), %rdx
|
|
adc $t0, $acc5
|
|
adc $t1, $acc0
|
|
adc \$0, $acc1
|
|
xor $acc2, $acc2 # $acc2=0,cf=0,of=0
|
|
|
|
########################################################################
|
|
# Multiply by b[3]
|
|
mulx 8*0+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc3
|
|
adox $t1, $acc4
|
|
|
|
mulx 8*1+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc4
|
|
adox $t1, $acc5
|
|
|
|
mulx 8*2+128($a_ptr), $t0, $t1
|
|
adcx $t0, $acc5
|
|
adox $t1, $acc0
|
|
|
|
mulx 8*3+128($a_ptr), $t0, $t1
|
|
mov $acc3, %rdx
|
|
adcx $t0, $acc0
|
|
shlx $poly1, $acc3, $t0
|
|
adox $t1, $acc1
|
|
shrx $poly1, $acc3, $t1
|
|
|
|
adcx $acc2, $acc1
|
|
adox $acc2, $acc2
|
|
adc \$0, $acc2
|
|
|
|
########################################################################
|
|
# Fourth reduction step
|
|
add $t0, $acc4
|
|
adc $t1, $acc5
|
|
|
|
mulx $poly3, $t0, $t1
|
|
mov $acc4, $t2
|
|
mov .Lpoly+8*1(%rip), $poly1
|
|
adc $t0, $acc0
|
|
mov $acc5, $t3
|
|
adc $t1, $acc1
|
|
adc \$0, $acc2
|
|
|
|
########################################################################
|
|
# Branch-less conditional subtraction of P
|
|
xor %eax, %eax
|
|
mov $acc0, $t0
|
|
sbb \$-1, $acc4 # .Lpoly[0]
|
|
sbb $poly1, $acc5 # .Lpoly[1]
|
|
sbb \$0, $acc0 # .Lpoly[2]
|
|
mov $acc1, $t1
|
|
sbb $poly3, $acc1 # .Lpoly[3]
|
|
sbb \$0, $acc2
|
|
|
|
cmovc $t2, $acc4
|
|
cmovc $t3, $acc5
|
|
mov $acc4, 8*0($r_ptr)
|
|
cmovc $t0, $acc0
|
|
mov $acc5, 8*1($r_ptr)
|
|
cmovc $t1, $acc1
|
|
mov $acc0, 8*2($r_ptr)
|
|
mov $acc1, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_mul_montx,.-__ecp_nistz256_mul_montx
|
|
|
|
.type __ecp_nistz256_sqr_montx,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_sqr_montx:
|
|
mulx $acc6, $acc1, $acc2 # a[0]*a[1]
|
|
mulx $acc7, $t0, $acc3 # a[0]*a[2]
|
|
xor %eax, %eax
|
|
adc $t0, $acc2
|
|
mulx $acc0, $t1, $acc4 # a[0]*a[3]
|
|
mov $acc6, %rdx
|
|
adc $t1, $acc3
|
|
adc \$0, $acc4
|
|
xor $acc5, $acc5 # $acc5=0,cf=0,of=0
|
|
|
|
#################################
|
|
mulx $acc7, $t0, $t1 # a[1]*a[2]
|
|
adcx $t0, $acc3
|
|
adox $t1, $acc4
|
|
|
|
mulx $acc0, $t0, $t1 # a[1]*a[3]
|
|
mov $acc7, %rdx
|
|
adcx $t0, $acc4
|
|
adox $t1, $acc5
|
|
adc \$0, $acc5
|
|
|
|
#################################
|
|
mulx $acc0, $t0, $acc6 # a[2]*a[3]
|
|
mov 8*0+128($a_ptr), %rdx
|
|
xor $acc7, $acc7 # $acc7=0,cf=0,of=0
|
|
adcx $acc1, $acc1 # acc1:6<<1
|
|
adox $t0, $acc5
|
|
adcx $acc2, $acc2
|
|
adox $acc7, $acc6 # of=0
|
|
|
|
mulx %rdx, $acc0, $t1
|
|
mov 8*1+128($a_ptr), %rdx
|
|
adcx $acc3, $acc3
|
|
adox $t1, $acc1
|
|
adcx $acc4, $acc4
|
|
mulx %rdx, $t0, $t4
|
|
mov 8*2+128($a_ptr), %rdx
|
|
adcx $acc5, $acc5
|
|
adox $t0, $acc2
|
|
adcx $acc6, $acc6
|
|
.byte 0x67
|
|
mulx %rdx, $t0, $t1
|
|
mov 8*3+128($a_ptr), %rdx
|
|
adox $t4, $acc3
|
|
adcx $acc7, $acc7
|
|
adox $t0, $acc4
|
|
mov \$32, $a_ptr
|
|
adox $t1, $acc5
|
|
.byte 0x67,0x67
|
|
mulx %rdx, $t0, $t4
|
|
mov $acc0, %rdx
|
|
adox $t0, $acc6
|
|
shlx $a_ptr, $acc0, $t0
|
|
adox $t4, $acc7
|
|
shrx $a_ptr, $acc0, $t4
|
|
mov .Lpoly+8*3(%rip), $t1
|
|
|
|
# reduction step 1
|
|
add $t0, $acc1
|
|
adc $t4, $acc2
|
|
|
|
mulx $t1, $t0, $acc0
|
|
mov $acc1, %rdx
|
|
adc $t0, $acc3
|
|
shlx $a_ptr, $acc1, $t0
|
|
adc \$0, $acc0
|
|
shrx $a_ptr, $acc1, $t4
|
|
|
|
# reduction step 2
|
|
add $t0, $acc2
|
|
adc $t4, $acc3
|
|
|
|
mulx $t1, $t0, $acc1
|
|
mov $acc2, %rdx
|
|
adc $t0, $acc0
|
|
shlx $a_ptr, $acc2, $t0
|
|
adc \$0, $acc1
|
|
shrx $a_ptr, $acc2, $t4
|
|
|
|
# reduction step 3
|
|
add $t0, $acc3
|
|
adc $t4, $acc0
|
|
|
|
mulx $t1, $t0, $acc2
|
|
mov $acc3, %rdx
|
|
adc $t0, $acc1
|
|
shlx $a_ptr, $acc3, $t0
|
|
adc \$0, $acc2
|
|
shrx $a_ptr, $acc3, $t4
|
|
|
|
# reduction step 4
|
|
add $t0, $acc0
|
|
adc $t4, $acc1
|
|
|
|
mulx $t1, $t0, $acc3
|
|
adc $t0, $acc2
|
|
adc \$0, $acc3
|
|
|
|
xor $t3, $t3 # cf=0
|
|
adc $acc0, $acc4 # accumulate upper half
|
|
mov .Lpoly+8*1(%rip), $a_ptr
|
|
adc $acc1, $acc5
|
|
mov $acc4, $acc0
|
|
adc $acc2, $acc6
|
|
adc $acc3, $acc7
|
|
mov $acc5, $acc1
|
|
adc \$0, $t3
|
|
|
|
xor %eax, %eax # cf=0
|
|
sbb \$-1, $acc4 # .Lpoly[0]
|
|
mov $acc6, $acc2
|
|
sbb $a_ptr, $acc5 # .Lpoly[1]
|
|
sbb \$0, $acc6 # .Lpoly[2]
|
|
mov $acc7, $acc3
|
|
sbb $t1, $acc7 # .Lpoly[3]
|
|
sbb \$0, $t3
|
|
|
|
cmovc $acc0, $acc4
|
|
cmovc $acc1, $acc5
|
|
mov $acc4, 8*0($r_ptr)
|
|
cmovc $acc2, $acc6
|
|
mov $acc5, 8*1($r_ptr)
|
|
cmovc $acc3, $acc7
|
|
mov $acc6, 8*2($r_ptr)
|
|
mov $acc7, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_sqr_montx,.-__ecp_nistz256_sqr_montx
|
|
___
|
|
}
|
|
}
|
|
{
|
|
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
|
|
my ($ONE,$INDEX,$Ra,$Rb,$Rc,$Rd,$Re,$Rf)=map("%xmm$_",(0..7));
|
|
my ($M0,$T0a,$T0b,$T0c,$T0d,$T0e,$T0f,$TMP0)=map("%xmm$_",(8..15));
|
|
my ($M1,$T2a,$T2b,$TMP2,$M2,$T2a,$T2b,$TMP2)=map("%xmm$_",(8..15));
|
|
|
|
$code.=<<___;
|
|
################################################################################
|
|
# void ecp_nistz256_select_w5(uint64_t *val, uint64_t *in_t, int index);
|
|
.globl ecp_nistz256_select_w5
|
|
.type ecp_nistz256_select_w5,\@abi-omnipotent
|
|
.align 32
|
|
ecp_nistz256_select_w5:
|
|
___
|
|
$code.=<<___ if ($avx>1);
|
|
mov OPENSSL_ia32cap_P+8(%rip), %eax
|
|
test \$`1<<5`, %eax
|
|
jnz .Lavx2_select_w5
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
lea -0x88(%rsp), %rax
|
|
.LSEH_begin_ecp_nistz256_select_w5:
|
|
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
|
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
|
|
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
|
|
.byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
|
|
.byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
|
|
.byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
|
|
.byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
|
|
.byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
|
|
.byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
|
|
.byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
|
|
.byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
|
|
___
|
|
$code.=<<___;
|
|
movdqa .LOne(%rip), $ONE
|
|
movd $index, $INDEX
|
|
|
|
pxor $Ra, $Ra
|
|
pxor $Rb, $Rb
|
|
pxor $Rc, $Rc
|
|
pxor $Rd, $Rd
|
|
pxor $Re, $Re
|
|
pxor $Rf, $Rf
|
|
|
|
movdqa $ONE, $M0
|
|
pshufd \$0, $INDEX, $INDEX
|
|
|
|
mov \$16, %rax
|
|
.Lselect_loop_sse_w5:
|
|
|
|
movdqa $M0, $TMP0
|
|
paddd $ONE, $M0
|
|
pcmpeqd $INDEX, $TMP0
|
|
|
|
movdqa 16*0($in_t), $T0a
|
|
movdqa 16*1($in_t), $T0b
|
|
movdqa 16*2($in_t), $T0c
|
|
movdqa 16*3($in_t), $T0d
|
|
movdqa 16*4($in_t), $T0e
|
|
movdqa 16*5($in_t), $T0f
|
|
lea 16*6($in_t), $in_t
|
|
|
|
pand $TMP0, $T0a
|
|
pand $TMP0, $T0b
|
|
por $T0a, $Ra
|
|
pand $TMP0, $T0c
|
|
por $T0b, $Rb
|
|
pand $TMP0, $T0d
|
|
por $T0c, $Rc
|
|
pand $TMP0, $T0e
|
|
por $T0d, $Rd
|
|
pand $TMP0, $T0f
|
|
por $T0e, $Re
|
|
por $T0f, $Rf
|
|
|
|
dec %rax
|
|
jnz .Lselect_loop_sse_w5
|
|
|
|
movdqu $Ra, 16*0($val)
|
|
movdqu $Rb, 16*1($val)
|
|
movdqu $Rc, 16*2($val)
|
|
movdqu $Rd, 16*3($val)
|
|
movdqu $Re, 16*4($val)
|
|
movdqu $Rf, 16*5($val)
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
movaps (%rsp), %xmm6
|
|
movaps 0x10(%rsp), %xmm7
|
|
movaps 0x20(%rsp), %xmm8
|
|
movaps 0x30(%rsp), %xmm9
|
|
movaps 0x40(%rsp), %xmm10
|
|
movaps 0x50(%rsp), %xmm11
|
|
movaps 0x60(%rsp), %xmm12
|
|
movaps 0x70(%rsp), %xmm13
|
|
movaps 0x80(%rsp), %xmm14
|
|
movaps 0x90(%rsp), %xmm15
|
|
lea 0xa8(%rsp), %rsp
|
|
.LSEH_end_ecp_nistz256_select_w5:
|
|
___
|
|
$code.=<<___;
|
|
ret
|
|
.size ecp_nistz256_select_w5,.-ecp_nistz256_select_w5
|
|
|
|
################################################################################
|
|
# void ecp_nistz256_select_w7(uint64_t *val, uint64_t *in_t, int index);
|
|
.globl ecp_nistz256_select_w7
|
|
.type ecp_nistz256_select_w7,\@abi-omnipotent
|
|
.align 32
|
|
ecp_nistz256_select_w7:
|
|
___
|
|
$code.=<<___ if ($avx>1);
|
|
mov OPENSSL_ia32cap_P+8(%rip), %eax
|
|
test \$`1<<5`, %eax
|
|
jnz .Lavx2_select_w7
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
lea -0x88(%rsp), %rax
|
|
.LSEH_begin_ecp_nistz256_select_w7:
|
|
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
|
.byte 0x0f,0x29,0x70,0xe0 #movaps %xmm6, -0x20(%rax)
|
|
.byte 0x0f,0x29,0x78,0xf0 #movaps %xmm7, -0x10(%rax)
|
|
.byte 0x44,0x0f,0x29,0x00 #movaps %xmm8, 0(%rax)
|
|
.byte 0x44,0x0f,0x29,0x48,0x10 #movaps %xmm9, 0x10(%rax)
|
|
.byte 0x44,0x0f,0x29,0x50,0x20 #movaps %xmm10, 0x20(%rax)
|
|
.byte 0x44,0x0f,0x29,0x58,0x30 #movaps %xmm11, 0x30(%rax)
|
|
.byte 0x44,0x0f,0x29,0x60,0x40 #movaps %xmm12, 0x40(%rax)
|
|
.byte 0x44,0x0f,0x29,0x68,0x50 #movaps %xmm13, 0x50(%rax)
|
|
.byte 0x44,0x0f,0x29,0x70,0x60 #movaps %xmm14, 0x60(%rax)
|
|
.byte 0x44,0x0f,0x29,0x78,0x70 #movaps %xmm15, 0x70(%rax)
|
|
___
|
|
$code.=<<___;
|
|
movdqa .LOne(%rip), $M0
|
|
movd $index, $INDEX
|
|
|
|
pxor $Ra, $Ra
|
|
pxor $Rb, $Rb
|
|
pxor $Rc, $Rc
|
|
pxor $Rd, $Rd
|
|
|
|
movdqa $M0, $ONE
|
|
pshufd \$0, $INDEX, $INDEX
|
|
mov \$64, %rax
|
|
|
|
.Lselect_loop_sse_w7:
|
|
movdqa $M0, $TMP0
|
|
paddd $ONE, $M0
|
|
movdqa 16*0($in_t), $T0a
|
|
movdqa 16*1($in_t), $T0b
|
|
pcmpeqd $INDEX, $TMP0
|
|
movdqa 16*2($in_t), $T0c
|
|
movdqa 16*3($in_t), $T0d
|
|
lea 16*4($in_t), $in_t
|
|
|
|
pand $TMP0, $T0a
|
|
pand $TMP0, $T0b
|
|
por $T0a, $Ra
|
|
pand $TMP0, $T0c
|
|
por $T0b, $Rb
|
|
pand $TMP0, $T0d
|
|
por $T0c, $Rc
|
|
prefetcht0 255($in_t)
|
|
por $T0d, $Rd
|
|
|
|
dec %rax
|
|
jnz .Lselect_loop_sse_w7
|
|
|
|
movdqu $Ra, 16*0($val)
|
|
movdqu $Rb, 16*1($val)
|
|
movdqu $Rc, 16*2($val)
|
|
movdqu $Rd, 16*3($val)
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
movaps (%rsp), %xmm6
|
|
movaps 0x10(%rsp), %xmm7
|
|
movaps 0x20(%rsp), %xmm8
|
|
movaps 0x30(%rsp), %xmm9
|
|
movaps 0x40(%rsp), %xmm10
|
|
movaps 0x50(%rsp), %xmm11
|
|
movaps 0x60(%rsp), %xmm12
|
|
movaps 0x70(%rsp), %xmm13
|
|
movaps 0x80(%rsp), %xmm14
|
|
movaps 0x90(%rsp), %xmm15
|
|
lea 0xa8(%rsp), %rsp
|
|
.LSEH_end_ecp_nistz256_select_w7:
|
|
___
|
|
$code.=<<___;
|
|
ret
|
|
.size ecp_nistz256_select_w7,.-ecp_nistz256_select_w7
|
|
___
|
|
}
|
|
if ($avx>1) {
|
|
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
|
|
my ($TWO,$INDEX,$Ra,$Rb,$Rc)=map("%ymm$_",(0..4));
|
|
my ($M0,$T0a,$T0b,$T0c,$TMP0)=map("%ymm$_",(5..9));
|
|
my ($M1,$T1a,$T1b,$T1c,$TMP1)=map("%ymm$_",(10..14));
|
|
|
|
$code.=<<___;
|
|
################################################################################
|
|
# void ecp_nistz256_avx2_select_w5(uint64_t *val, uint64_t *in_t, int index);
|
|
.type ecp_nistz256_avx2_select_w5,\@abi-omnipotent
|
|
.align 32
|
|
ecp_nistz256_avx2_select_w5:
|
|
.Lavx2_select_w5:
|
|
vzeroupper
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
lea -0x88(%rsp), %rax
|
|
.LSEH_begin_ecp_nistz256_avx2_select_w5:
|
|
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
|
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
|
|
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
|
|
.byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax)
|
|
.byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
|
|
.byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
|
|
.byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
|
|
.byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
|
|
.byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
|
|
.byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
|
|
.byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
|
|
___
|
|
$code.=<<___;
|
|
vmovdqa .LTwo(%rip), $TWO
|
|
|
|
vpxor $Ra, $Ra, $Ra
|
|
vpxor $Rb, $Rb, $Rb
|
|
vpxor $Rc, $Rc, $Rc
|
|
|
|
vmovdqa .LOne(%rip), $M0
|
|
vmovdqa .LTwo(%rip), $M1
|
|
|
|
vmovd $index, %xmm1
|
|
vpermd $INDEX, $Ra, $INDEX
|
|
|
|
mov \$8, %rax
|
|
.Lselect_loop_avx2_w5:
|
|
|
|
vmovdqa 32*0($in_t), $T0a
|
|
vmovdqa 32*1($in_t), $T0b
|
|
vmovdqa 32*2($in_t), $T0c
|
|
|
|
vmovdqa 32*3($in_t), $T1a
|
|
vmovdqa 32*4($in_t), $T1b
|
|
vmovdqa 32*5($in_t), $T1c
|
|
|
|
vpcmpeqd $INDEX, $M0, $TMP0
|
|
vpcmpeqd $INDEX, $M1, $TMP1
|
|
|
|
vpaddd $TWO, $M0, $M0
|
|
vpaddd $TWO, $M1, $M1
|
|
lea 32*6($in_t), $in_t
|
|
|
|
vpand $TMP0, $T0a, $T0a
|
|
vpand $TMP0, $T0b, $T0b
|
|
vpand $TMP0, $T0c, $T0c
|
|
vpand $TMP1, $T1a, $T1a
|
|
vpand $TMP1, $T1b, $T1b
|
|
vpand $TMP1, $T1c, $T1c
|
|
|
|
vpxor $T0a, $Ra, $Ra
|
|
vpxor $T0b, $Rb, $Rb
|
|
vpxor $T0c, $Rc, $Rc
|
|
vpxor $T1a, $Ra, $Ra
|
|
vpxor $T1b, $Rb, $Rb
|
|
vpxor $T1c, $Rc, $Rc
|
|
|
|
dec %rax
|
|
jnz .Lselect_loop_avx2_w5
|
|
|
|
vmovdqu $Ra, 32*0($val)
|
|
vmovdqu $Rb, 32*1($val)
|
|
vmovdqu $Rc, 32*2($val)
|
|
vzeroupper
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
movaps (%rsp), %xmm6
|
|
movaps 0x10(%rsp), %xmm7
|
|
movaps 0x20(%rsp), %xmm8
|
|
movaps 0x30(%rsp), %xmm9
|
|
movaps 0x40(%rsp), %xmm10
|
|
movaps 0x50(%rsp), %xmm11
|
|
movaps 0x60(%rsp), %xmm12
|
|
movaps 0x70(%rsp), %xmm13
|
|
movaps 0x80(%rsp), %xmm14
|
|
movaps 0x90(%rsp), %xmm15
|
|
lea 0xa8(%rsp), %rsp
|
|
.LSEH_end_ecp_nistz256_avx2_select_w5:
|
|
___
|
|
$code.=<<___;
|
|
ret
|
|
.size ecp_nistz256_avx2_select_w5,.-ecp_nistz256_avx2_select_w5
|
|
___
|
|
}
|
|
if ($avx>1) {
|
|
my ($val,$in_t,$index)=$win64?("%rcx","%rdx","%r8d"):("%rdi","%rsi","%edx");
|
|
my ($THREE,$INDEX,$Ra,$Rb)=map("%ymm$_",(0..3));
|
|
my ($M0,$T0a,$T0b,$TMP0)=map("%ymm$_",(4..7));
|
|
my ($M1,$T1a,$T1b,$TMP1)=map("%ymm$_",(8..11));
|
|
my ($M2,$T2a,$T2b,$TMP2)=map("%ymm$_",(12..15));
|
|
|
|
$code.=<<___;
|
|
|
|
################################################################################
|
|
# void ecp_nistz256_avx2_select_w7(uint64_t *val, uint64_t *in_t, int index);
|
|
.globl ecp_nistz256_avx2_select_w7
|
|
.type ecp_nistz256_avx2_select_w7,\@abi-omnipotent
|
|
.align 32
|
|
ecp_nistz256_avx2_select_w7:
|
|
.Lavx2_select_w7:
|
|
vzeroupper
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
lea -0x88(%rsp), %rax
|
|
.LSEH_begin_ecp_nistz256_avx2_select_w7:
|
|
.byte 0x48,0x8d,0x60,0xe0 #lea -0x20(%rax), %rsp
|
|
.byte 0xc5,0xf8,0x29,0x70,0xe0 #vmovaps %xmm6, -0x20(%rax)
|
|
.byte 0xc5,0xf8,0x29,0x78,0xf0 #vmovaps %xmm7, -0x10(%rax)
|
|
.byte 0xc5,0x78,0x29,0x40,0x00 #vmovaps %xmm8, 8(%rax)
|
|
.byte 0xc5,0x78,0x29,0x48,0x10 #vmovaps %xmm9, 0x10(%rax)
|
|
.byte 0xc5,0x78,0x29,0x50,0x20 #vmovaps %xmm10, 0x20(%rax)
|
|
.byte 0xc5,0x78,0x29,0x58,0x30 #vmovaps %xmm11, 0x30(%rax)
|
|
.byte 0xc5,0x78,0x29,0x60,0x40 #vmovaps %xmm12, 0x40(%rax)
|
|
.byte 0xc5,0x78,0x29,0x68,0x50 #vmovaps %xmm13, 0x50(%rax)
|
|
.byte 0xc5,0x78,0x29,0x70,0x60 #vmovaps %xmm14, 0x60(%rax)
|
|
.byte 0xc5,0x78,0x29,0x78,0x70 #vmovaps %xmm15, 0x70(%rax)
|
|
___
|
|
$code.=<<___;
|
|
vmovdqa .LThree(%rip), $THREE
|
|
|
|
vpxor $Ra, $Ra, $Ra
|
|
vpxor $Rb, $Rb, $Rb
|
|
|
|
vmovdqa .LOne(%rip), $M0
|
|
vmovdqa .LTwo(%rip), $M1
|
|
vmovdqa .LThree(%rip), $M2
|
|
|
|
vmovd $index, %xmm1
|
|
vpermd $INDEX, $Ra, $INDEX
|
|
# Skip index = 0, because it is implicitly the point at infinity
|
|
|
|
mov \$21, %rax
|
|
.Lselect_loop_avx2_w7:
|
|
|
|
vmovdqa 32*0($in_t), $T0a
|
|
vmovdqa 32*1($in_t), $T0b
|
|
|
|
vmovdqa 32*2($in_t), $T1a
|
|
vmovdqa 32*3($in_t), $T1b
|
|
|
|
vmovdqa 32*4($in_t), $T2a
|
|
vmovdqa 32*5($in_t), $T2b
|
|
|
|
vpcmpeqd $INDEX, $M0, $TMP0
|
|
vpcmpeqd $INDEX, $M1, $TMP1
|
|
vpcmpeqd $INDEX, $M2, $TMP2
|
|
|
|
vpaddd $THREE, $M0, $M0
|
|
vpaddd $THREE, $M1, $M1
|
|
vpaddd $THREE, $M2, $M2
|
|
lea 32*6($in_t), $in_t
|
|
|
|
vpand $TMP0, $T0a, $T0a
|
|
vpand $TMP0, $T0b, $T0b
|
|
vpand $TMP1, $T1a, $T1a
|
|
vpand $TMP1, $T1b, $T1b
|
|
vpand $TMP2, $T2a, $T2a
|
|
vpand $TMP2, $T2b, $T2b
|
|
|
|
vpxor $T0a, $Ra, $Ra
|
|
vpxor $T0b, $Rb, $Rb
|
|
vpxor $T1a, $Ra, $Ra
|
|
vpxor $T1b, $Rb, $Rb
|
|
vpxor $T2a, $Ra, $Ra
|
|
vpxor $T2b, $Rb, $Rb
|
|
|
|
dec %rax
|
|
jnz .Lselect_loop_avx2_w7
|
|
|
|
|
|
vmovdqa 32*0($in_t), $T0a
|
|
vmovdqa 32*1($in_t), $T0b
|
|
|
|
vpcmpeqd $INDEX, $M0, $TMP0
|
|
|
|
vpand $TMP0, $T0a, $T0a
|
|
vpand $TMP0, $T0b, $T0b
|
|
|
|
vpxor $T0a, $Ra, $Ra
|
|
vpxor $T0b, $Rb, $Rb
|
|
|
|
vmovdqu $Ra, 32*0($val)
|
|
vmovdqu $Rb, 32*1($val)
|
|
vzeroupper
|
|
___
|
|
$code.=<<___ if ($win64);
|
|
movaps (%rsp), %xmm6
|
|
movaps 0x10(%rsp), %xmm7
|
|
movaps 0x20(%rsp), %xmm8
|
|
movaps 0x30(%rsp), %xmm9
|
|
movaps 0x40(%rsp), %xmm10
|
|
movaps 0x50(%rsp), %xmm11
|
|
movaps 0x60(%rsp), %xmm12
|
|
movaps 0x70(%rsp), %xmm13
|
|
movaps 0x80(%rsp), %xmm14
|
|
movaps 0x90(%rsp), %xmm15
|
|
lea 0xa8(%rsp), %rsp
|
|
.LSEH_end_ecp_nistz256_avx2_select_w7:
|
|
___
|
|
$code.=<<___;
|
|
ret
|
|
.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
|
|
___
|
|
} else {
|
|
$code.=<<___;
|
|
.globl ecp_nistz256_avx2_select_w7
|
|
.type ecp_nistz256_avx2_select_w7,\@function,3
|
|
.align 32
|
|
ecp_nistz256_avx2_select_w7:
|
|
.byte 0x0f,0x0b # ud2
|
|
ret
|
|
.size ecp_nistz256_avx2_select_w7,.-ecp_nistz256_avx2_select_w7
|
|
___
|
|
}
|
|
{{{
|
|
########################################################################
|
|
# This block implements higher level point_double, point_add and
|
|
# point_add_affine. The key to performance in this case is to allow
|
|
# out-of-order execution logic to overlap computations from next step
|
|
# with tail processing from current step. By using tailored calling
|
|
# sequence we minimize inter-step overhead to give processor better
|
|
# shot at overlapping operations...
|
|
#
|
|
# You will notice that input data is copied to stack. Trouble is that
|
|
# there are no registers to spare for holding original pointers and
|
|
# reloading them, pointers, would create undesired dependencies on
|
|
# effective addresses calculation paths. In other words it's too done
|
|
# to favour out-of-order execution logic.
|
|
# <appro@openssl.org>
|
|
|
|
my ($r_ptr,$a_ptr,$b_org,$b_ptr)=("%rdi","%rsi","%rdx","%rbx");
|
|
my ($acc0,$acc1,$acc2,$acc3,$acc4,$acc5,$acc6,$acc7)=map("%r$_",(8..15));
|
|
my ($t0,$t1,$t2,$t3,$t4)=("%rax","%rbp","%rcx",$acc4,$acc4);
|
|
my ($poly1,$poly3)=($acc6,$acc7);
|
|
|
|
sub load_for_mul () {
|
|
my ($a,$b,$src0) = @_;
|
|
my $bias = $src0 eq "%rax" ? 0 : -128;
|
|
|
|
" mov $b, $src0
|
|
lea $b, $b_ptr
|
|
mov 8*0+$a, $acc1
|
|
mov 8*1+$a, $acc2
|
|
lea $bias+$a, $a_ptr
|
|
mov 8*2+$a, $acc3
|
|
mov 8*3+$a, $acc4"
|
|
}
|
|
|
|
sub load_for_sqr () {
|
|
my ($a,$src0) = @_;
|
|
my $bias = $src0 eq "%rax" ? 0 : -128;
|
|
|
|
" mov 8*0+$a, $src0
|
|
mov 8*1+$a, $acc6
|
|
lea $bias+$a, $a_ptr
|
|
mov 8*2+$a, $acc7
|
|
mov 8*3+$a, $acc0"
|
|
}
|
|
|
|
{
|
|
########################################################################
|
|
# operate in 4-5-0-1 "name space" that matches multiplication output
|
|
#
|
|
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
|
|
|
|
$code.=<<___;
|
|
.type __ecp_nistz256_add_toq,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_add_toq:
|
|
xor $t4,$t4
|
|
add 8*0($b_ptr), $a0
|
|
adc 8*1($b_ptr), $a1
|
|
mov $a0, $t0
|
|
adc 8*2($b_ptr), $a2
|
|
adc 8*3($b_ptr), $a3
|
|
mov $a1, $t1
|
|
adc \$0, $t4
|
|
|
|
sub \$-1, $a0
|
|
mov $a2, $t2
|
|
sbb $poly1, $a1
|
|
sbb \$0, $a2
|
|
mov $a3, $t3
|
|
sbb $poly3, $a3
|
|
sbb \$0, $t4
|
|
|
|
cmovc $t0, $a0
|
|
cmovc $t1, $a1
|
|
mov $a0, 8*0($r_ptr)
|
|
cmovc $t2, $a2
|
|
mov $a1, 8*1($r_ptr)
|
|
cmovc $t3, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_add_toq,.-__ecp_nistz256_add_toq
|
|
|
|
.type __ecp_nistz256_sub_fromq,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_sub_fromq:
|
|
sub 8*0($b_ptr), $a0
|
|
sbb 8*1($b_ptr), $a1
|
|
mov $a0, $t0
|
|
sbb 8*2($b_ptr), $a2
|
|
sbb 8*3($b_ptr), $a3
|
|
mov $a1, $t1
|
|
sbb $t4, $t4
|
|
|
|
add \$-1, $a0
|
|
mov $a2, $t2
|
|
adc $poly1, $a1
|
|
adc \$0, $a2
|
|
mov $a3, $t3
|
|
adc $poly3, $a3
|
|
test $t4, $t4
|
|
|
|
cmovz $t0, $a0
|
|
cmovz $t1, $a1
|
|
mov $a0, 8*0($r_ptr)
|
|
cmovz $t2, $a2
|
|
mov $a1, 8*1($r_ptr)
|
|
cmovz $t3, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_sub_fromq,.-__ecp_nistz256_sub_fromq
|
|
|
|
.type __ecp_nistz256_subq,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_subq:
|
|
sub $a0, $t0
|
|
sbb $a1, $t1
|
|
mov $t0, $a0
|
|
sbb $a2, $t2
|
|
sbb $a3, $t3
|
|
mov $t1, $a1
|
|
sbb $t4, $t4
|
|
|
|
add \$-1, $t0
|
|
mov $t2, $a2
|
|
adc $poly1, $t1
|
|
adc \$0, $t2
|
|
mov $t3, $a3
|
|
adc $poly3, $t3
|
|
test $t4, $t4
|
|
|
|
cmovnz $t0, $a0
|
|
cmovnz $t1, $a1
|
|
cmovnz $t2, $a2
|
|
cmovnz $t3, $a3
|
|
|
|
ret
|
|
.size __ecp_nistz256_subq,.-__ecp_nistz256_subq
|
|
|
|
.type __ecp_nistz256_mul_by_2q,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_mul_by_2q:
|
|
xor $t4, $t4
|
|
add $a0, $a0 # a0:a3+a0:a3
|
|
adc $a1, $a1
|
|
mov $a0, $t0
|
|
adc $a2, $a2
|
|
adc $a3, $a3
|
|
mov $a1, $t1
|
|
adc \$0, $t4
|
|
|
|
sub \$-1, $a0
|
|
mov $a2, $t2
|
|
sbb $poly1, $a1
|
|
sbb \$0, $a2
|
|
mov $a3, $t3
|
|
sbb $poly3, $a3
|
|
sbb \$0, $t4
|
|
|
|
cmovc $t0, $a0
|
|
cmovc $t1, $a1
|
|
mov $a0, 8*0($r_ptr)
|
|
cmovc $t2, $a2
|
|
mov $a1, 8*1($r_ptr)
|
|
cmovc $t3, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_mul_by_2q,.-__ecp_nistz256_mul_by_2q
|
|
___
|
|
}
|
|
sub gen_double () {
|
|
my $x = shift;
|
|
my ($src0,$sfx,$bias);
|
|
my ($S,$M,$Zsqr,$in_x,$tmp0)=map(32*$_,(0..4));
|
|
|
|
if ($x ne "x") {
|
|
$src0 = "%rax";
|
|
$sfx = "";
|
|
$bias = 0;
|
|
|
|
$code.=<<___;
|
|
.globl ecp_nistz256_point_double
|
|
.type ecp_nistz256_point_double,\@function,2
|
|
.align 32
|
|
ecp_nistz256_point_double:
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
mov \$0x80100, %ecx
|
|
and OPENSSL_ia32cap_P+8(%rip), %ecx
|
|
cmp \$0x80100, %ecx
|
|
je .Lpoint_doublex
|
|
___
|
|
} else {
|
|
$src0 = "%rdx";
|
|
$sfx = "x";
|
|
$bias = 128;
|
|
|
|
$code.=<<___;
|
|
.type ecp_nistz256_point_doublex,\@function,2
|
|
.align 32
|
|
ecp_nistz256_point_doublex:
|
|
.Lpoint_doublex:
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
push %rbp
|
|
push %rbx
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
sub \$32*5+8, %rsp
|
|
|
|
.Lpoint_double_shortcut$x:
|
|
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr.x
|
|
mov $a_ptr, $b_ptr # backup copy
|
|
movdqu 0x10($a_ptr), %xmm1
|
|
mov 0x20+8*0($a_ptr), $acc4 # load in_y in "5-4-0-1" order
|
|
mov 0x20+8*1($a_ptr), $acc5
|
|
mov 0x20+8*2($a_ptr), $acc0
|
|
mov 0x20+8*3($a_ptr), $acc1
|
|
mov .Lpoly+8*1(%rip), $poly1
|
|
mov .Lpoly+8*3(%rip), $poly3
|
|
movdqa %xmm0, $in_x(%rsp)
|
|
movdqa %xmm1, $in_x+0x10(%rsp)
|
|
lea 0x20($r_ptr), $acc2
|
|
lea 0x40($r_ptr), $acc3
|
|
movq $r_ptr, %xmm0
|
|
movq $acc2, %xmm1
|
|
movq $acc3, %xmm2
|
|
|
|
lea $S(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(S, in_y);
|
|
|
|
mov 0x40+8*0($a_ptr), $src0
|
|
mov 0x40+8*1($a_ptr), $acc6
|
|
mov 0x40+8*2($a_ptr), $acc7
|
|
mov 0x40+8*3($a_ptr), $acc0
|
|
lea 0x40-$bias($a_ptr), $a_ptr
|
|
lea $Zsqr(%rsp), $r_ptr
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Zsqr, in_z);
|
|
|
|
`&load_for_sqr("$S(%rsp)", "$src0")`
|
|
lea $S(%rsp), $r_ptr
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(S, S);
|
|
|
|
mov 0x20($b_ptr), $src0 # $b_ptr is still valid
|
|
mov 0x40+8*0($b_ptr), $acc1
|
|
mov 0x40+8*1($b_ptr), $acc2
|
|
mov 0x40+8*2($b_ptr), $acc3
|
|
mov 0x40+8*3($b_ptr), $acc4
|
|
lea 0x40-$bias($b_ptr), $a_ptr
|
|
lea 0x20($b_ptr), $b_ptr
|
|
movq %xmm2, $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, in_z, in_y);
|
|
call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(res_z, res_z);
|
|
|
|
mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
|
|
mov $in_x+8*1(%rsp), $acc5
|
|
lea $Zsqr(%rsp), $b_ptr
|
|
mov $in_x+8*2(%rsp), $acc0
|
|
mov $in_x+8*3(%rsp), $acc1
|
|
lea $M(%rsp), $r_ptr
|
|
call __ecp_nistz256_add_to$x # p256_add(M, in_x, Zsqr);
|
|
|
|
mov $in_x+8*0(%rsp), $acc4 # "5-4-0-1" order
|
|
mov $in_x+8*1(%rsp), $acc5
|
|
lea $Zsqr(%rsp), $b_ptr
|
|
mov $in_x+8*2(%rsp), $acc0
|
|
mov $in_x+8*3(%rsp), $acc1
|
|
lea $Zsqr(%rsp), $r_ptr
|
|
call __ecp_nistz256_sub_from$x # p256_sub(Zsqr, in_x, Zsqr);
|
|
|
|
`&load_for_sqr("$S(%rsp)", "$src0")`
|
|
movq %xmm1, $r_ptr
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_y, S);
|
|
___
|
|
{
|
|
######## ecp_nistz256_div_by_2(res_y, res_y); ##########################
|
|
# operate in 4-5-6-7 "name space" that matches squaring output
|
|
#
|
|
my ($poly1,$poly3)=($a_ptr,$t1);
|
|
my ($a0,$a1,$a2,$a3,$t3,$t4,$t1)=($acc4,$acc5,$acc6,$acc7,$acc0,$acc1,$acc2);
|
|
|
|
$code.=<<___;
|
|
xor $t4, $t4
|
|
mov $a0, $t0
|
|
add \$-1, $a0
|
|
mov $a1, $t1
|
|
adc $poly1, $a1
|
|
mov $a2, $t2
|
|
adc \$0, $a2
|
|
mov $a3, $t3
|
|
adc $poly3, $a3
|
|
adc \$0, $t4
|
|
xor $a_ptr, $a_ptr # borrow $a_ptr
|
|
test \$1, $t0
|
|
|
|
cmovz $t0, $a0
|
|
cmovz $t1, $a1
|
|
cmovz $t2, $a2
|
|
cmovz $t3, $a3
|
|
cmovz $a_ptr, $t4
|
|
|
|
mov $a1, $t0 # a0:a3>>1
|
|
shr \$1, $a0
|
|
shl \$63, $t0
|
|
mov $a2, $t1
|
|
shr \$1, $a1
|
|
or $t0, $a0
|
|
shl \$63, $t1
|
|
mov $a3, $t2
|
|
shr \$1, $a2
|
|
or $t1, $a1
|
|
shl \$63, $t2
|
|
mov $a0, 8*0($r_ptr)
|
|
shr \$1, $a3
|
|
mov $a1, 8*1($r_ptr)
|
|
shl \$63, $t4
|
|
or $t2, $a2
|
|
or $t4, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
`&load_for_mul("$M(%rsp)", "$Zsqr(%rsp)", "$src0")`
|
|
lea $M(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(M, M, Zsqr);
|
|
|
|
lea $tmp0(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_by_2$x
|
|
|
|
lea $M(%rsp), $b_ptr
|
|
lea $M(%rsp), $r_ptr
|
|
call __ecp_nistz256_add_to$x # p256_mul_by_3(M, M);
|
|
|
|
`&load_for_mul("$S(%rsp)", "$in_x(%rsp)", "$src0")`
|
|
lea $S(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, in_x);
|
|
|
|
lea $tmp0(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_by_2$x # p256_mul_by_2(tmp0, S);
|
|
|
|
`&load_for_sqr("$M(%rsp)", "$src0")`
|
|
movq %xmm0, $r_ptr
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(res_x, M);
|
|
|
|
lea $tmp0(%rsp), $b_ptr
|
|
mov $acc6, $acc0 # harmonize sqr output and sub input
|
|
mov $acc7, $acc1
|
|
mov $a_ptr, $poly1
|
|
mov $t1, $poly3
|
|
call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, tmp0);
|
|
|
|
mov $S+8*0(%rsp), $t0
|
|
mov $S+8*1(%rsp), $t1
|
|
mov $S+8*2(%rsp), $t2
|
|
mov $S+8*3(%rsp), $acc2 # "4-5-0-1" order
|
|
lea $S(%rsp), $r_ptr
|
|
call __ecp_nistz256_sub$x # p256_sub(S, S, res_x);
|
|
|
|
mov $M(%rsp), $src0
|
|
lea $M(%rsp), $b_ptr
|
|
mov $acc4, $acc6 # harmonize sub output and mul input
|
|
xor %ecx, %ecx
|
|
mov $acc4, $S+8*0(%rsp) # have to save:-(
|
|
mov $acc5, $acc2
|
|
mov $acc5, $S+8*1(%rsp)
|
|
cmovz $acc0, $acc3
|
|
mov $acc0, $S+8*2(%rsp)
|
|
lea $S-$bias(%rsp), $a_ptr
|
|
cmovz $acc1, $acc4
|
|
mov $acc1, $S+8*3(%rsp)
|
|
mov $acc6, $acc1
|
|
lea $S(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S, S, M);
|
|
|
|
movq %xmm1, $b_ptr
|
|
movq %xmm1, $r_ptr
|
|
call __ecp_nistz256_sub_from$x # p256_sub(res_y, S, res_y);
|
|
|
|
add \$32*5+8, %rsp
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %rbx
|
|
pop %rbp
|
|
ret
|
|
.size ecp_nistz256_point_double$sfx,.-ecp_nistz256_point_double$sfx
|
|
___
|
|
}
|
|
&gen_double("q");
|
|
|
|
sub gen_add () {
|
|
my $x = shift;
|
|
my ($src0,$sfx,$bias);
|
|
my ($H,$Hsqr,$R,$Rsqr,$Hcub,
|
|
$U1,$U2,$S1,$S2,
|
|
$res_x,$res_y,$res_z,
|
|
$in1_x,$in1_y,$in1_z,
|
|
$in2_x,$in2_y,$in2_z)=map(32*$_,(0..17));
|
|
my ($Z1sqr, $Z2sqr) = ($Hsqr, $Rsqr);
|
|
|
|
if ($x ne "x") {
|
|
$src0 = "%rax";
|
|
$sfx = "";
|
|
$bias = 0;
|
|
|
|
$code.=<<___;
|
|
.globl ecp_nistz256_point_add
|
|
.type ecp_nistz256_point_add,\@function,3
|
|
.align 32
|
|
ecp_nistz256_point_add:
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
mov \$0x80100, %ecx
|
|
and OPENSSL_ia32cap_P+8(%rip), %ecx
|
|
cmp \$0x80100, %ecx
|
|
je .Lpoint_addx
|
|
___
|
|
} else {
|
|
$src0 = "%rdx";
|
|
$sfx = "x";
|
|
$bias = 128;
|
|
|
|
$code.=<<___;
|
|
.type ecp_nistz256_point_addx,\@function,3
|
|
.align 32
|
|
ecp_nistz256_point_addx:
|
|
.Lpoint_addx:
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
push %rbp
|
|
push %rbx
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
sub \$32*18+8, %rsp
|
|
|
|
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
|
|
movdqu 0x10($a_ptr), %xmm1
|
|
movdqu 0x20($a_ptr), %xmm2
|
|
movdqu 0x30($a_ptr), %xmm3
|
|
movdqu 0x40($a_ptr), %xmm4
|
|
movdqu 0x50($a_ptr), %xmm5
|
|
mov $a_ptr, $b_ptr # reassign
|
|
mov $b_org, $a_ptr # reassign
|
|
movdqa %xmm0, $in1_x(%rsp)
|
|
movdqa %xmm1, $in1_x+0x10(%rsp)
|
|
movdqa %xmm2, $in1_y(%rsp)
|
|
movdqa %xmm3, $in1_y+0x10(%rsp)
|
|
movdqa %xmm4, $in1_z(%rsp)
|
|
movdqa %xmm5, $in1_z+0x10(%rsp)
|
|
por %xmm4, %xmm5
|
|
|
|
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$b_ptr
|
|
pshufd \$0xb1, %xmm5, %xmm3
|
|
movdqu 0x10($a_ptr), %xmm1
|
|
movdqu 0x20($a_ptr), %xmm2
|
|
por %xmm3, %xmm5
|
|
movdqu 0x30($a_ptr), %xmm3
|
|
mov 0x40+8*0($a_ptr), $src0 # load original in2_z
|
|
mov 0x40+8*1($a_ptr), $acc6
|
|
mov 0x40+8*2($a_ptr), $acc7
|
|
mov 0x40+8*3($a_ptr), $acc0
|
|
movdqa %xmm0, $in2_x(%rsp)
|
|
pshufd \$0x1e, %xmm5, %xmm4
|
|
movdqa %xmm1, $in2_x+0x10(%rsp)
|
|
movdqu 0x40($a_ptr),%xmm0 # in2_z again
|
|
movdqu 0x50($a_ptr),%xmm1
|
|
movdqa %xmm2, $in2_y(%rsp)
|
|
movdqa %xmm3, $in2_y+0x10(%rsp)
|
|
por %xmm4, %xmm5
|
|
pxor %xmm4, %xmm4
|
|
por %xmm0, %xmm1
|
|
movq $r_ptr, %xmm0 # save $r_ptr
|
|
|
|
lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
|
|
mov $src0, $in2_z+8*0(%rsp) # make in2_z copy
|
|
mov $acc6, $in2_z+8*1(%rsp)
|
|
mov $acc7, $in2_z+8*2(%rsp)
|
|
mov $acc0, $in2_z+8*3(%rsp)
|
|
lea $Z2sqr(%rsp), $r_ptr # Z2^2
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z2sqr, in2_z);
|
|
|
|
pcmpeqd %xmm4, %xmm5
|
|
pshufd \$0xb1, %xmm1, %xmm4
|
|
por %xmm1, %xmm4
|
|
pshufd \$0, %xmm5, %xmm5 # in1infty
|
|
pshufd \$0x1e, %xmm4, %xmm3
|
|
por %xmm3, %xmm4
|
|
pxor %xmm3, %xmm3
|
|
pcmpeqd %xmm3, %xmm4
|
|
pshufd \$0, %xmm4, %xmm4 # in2infty
|
|
mov 0x40+8*0($b_ptr), $src0 # load original in1_z
|
|
mov 0x40+8*1($b_ptr), $acc6
|
|
mov 0x40+8*2($b_ptr), $acc7
|
|
mov 0x40+8*3($b_ptr), $acc0
|
|
movq $b_ptr, %xmm1
|
|
|
|
lea 0x40-$bias($b_ptr), $a_ptr
|
|
lea $Z1sqr(%rsp), $r_ptr # Z1^2
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
|
|
|
|
`&load_for_mul("$Z2sqr(%rsp)", "$in2_z(%rsp)", "$src0")`
|
|
lea $S1(%rsp), $r_ptr # S1 = Z2^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, Z2sqr, in2_z);
|
|
|
|
`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
|
|
lea $S2(%rsp), $r_ptr # S2 = Z1^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
|
|
|
|
`&load_for_mul("$S1(%rsp)", "$in1_y(%rsp)", "$src0")`
|
|
lea $S1(%rsp), $r_ptr # S1 = Y1*Z2^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S1, S1, in1_y);
|
|
|
|
`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
|
|
lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
|
|
|
|
lea $S1(%rsp), $b_ptr
|
|
lea $R(%rsp), $r_ptr # R = S2 - S1
|
|
call __ecp_nistz256_sub_from$x # p256_sub(R, S2, S1);
|
|
|
|
or $acc5, $acc4 # see if result is zero
|
|
movdqa %xmm4, %xmm2
|
|
or $acc0, $acc4
|
|
or $acc1, $acc4
|
|
por %xmm5, %xmm2 # in1infty || in2infty
|
|
movq $acc4, %xmm3
|
|
|
|
`&load_for_mul("$Z2sqr(%rsp)", "$in1_x(%rsp)", "$src0")`
|
|
lea $U1(%rsp), $r_ptr # U1 = X1*Z2^2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U1, in1_x, Z2sqr);
|
|
|
|
`&load_for_mul("$Z1sqr(%rsp)", "$in2_x(%rsp)", "$src0")`
|
|
lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in2_x, Z1sqr);
|
|
|
|
lea $U1(%rsp), $b_ptr
|
|
lea $H(%rsp), $r_ptr # H = U2 - U1
|
|
call __ecp_nistz256_sub_from$x # p256_sub(H, U2, U1);
|
|
|
|
or $acc5, $acc4 # see if result is zero
|
|
or $acc0, $acc4
|
|
or $acc1, $acc4
|
|
|
|
.byte 0x3e # predict taken
|
|
jnz .Ladd_proceed$x # is_equal(U1,U2)?
|
|
movq %xmm2, $acc0
|
|
movq %xmm3, $acc1
|
|
test $acc0, $acc0
|
|
jnz .Ladd_proceed$x # (in1infty || in2infty)?
|
|
test $acc1, $acc1
|
|
jz .Ladd_double$x # is_equal(S1,S2)?
|
|
|
|
movq %xmm0, $r_ptr # restore $r_ptr
|
|
pxor %xmm0, %xmm0
|
|
movdqu %xmm0, 0x00($r_ptr)
|
|
movdqu %xmm0, 0x10($r_ptr)
|
|
movdqu %xmm0, 0x20($r_ptr)
|
|
movdqu %xmm0, 0x30($r_ptr)
|
|
movdqu %xmm0, 0x40($r_ptr)
|
|
movdqu %xmm0, 0x50($r_ptr)
|
|
jmp .Ladd_done$x
|
|
|
|
.align 32
|
|
.Ladd_double$x:
|
|
movq %xmm1, $a_ptr # restore $a_ptr
|
|
movq %xmm0, $r_ptr # restore $r_ptr
|
|
add \$`32*(18-5)`, %rsp # difference in frame sizes
|
|
jmp .Lpoint_double_shortcut$x
|
|
|
|
.align 32
|
|
.Ladd_proceed$x:
|
|
`&load_for_sqr("$R(%rsp)", "$src0")`
|
|
lea $Rsqr(%rsp), $r_ptr # R^2
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
|
|
|
|
`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
|
|
lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
|
|
|
|
`&load_for_sqr("$H(%rsp)", "$src0")`
|
|
lea $Hsqr(%rsp), $r_ptr # H^2
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
|
|
|
|
`&load_for_mul("$res_z(%rsp)", "$in2_z(%rsp)", "$src0")`
|
|
lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, res_z, in2_z);
|
|
|
|
`&load_for_mul("$Hsqr(%rsp)", "$H(%rsp)", "$src0")`
|
|
lea $Hcub(%rsp), $r_ptr # H^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
|
|
|
|
`&load_for_mul("$Hsqr(%rsp)", "$U1(%rsp)", "$src0")`
|
|
lea $U2(%rsp), $r_ptr # U1*H^2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, U1, Hsqr);
|
|
___
|
|
{
|
|
#######################################################################
|
|
# operate in 4-5-0-1 "name space" that matches multiplication output
|
|
#
|
|
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
|
|
my ($poly1, $poly3)=($acc6,$acc7);
|
|
|
|
$code.=<<___;
|
|
#lea $U2(%rsp), $a_ptr
|
|
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
|
|
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
|
|
|
|
xor $t4, $t4
|
|
add $acc0, $acc0 # a0:a3+a0:a3
|
|
lea $Rsqr(%rsp), $a_ptr
|
|
adc $acc1, $acc1
|
|
mov $acc0, $t0
|
|
adc $acc2, $acc2
|
|
adc $acc3, $acc3
|
|
mov $acc1, $t1
|
|
adc \$0, $t4
|
|
|
|
sub \$-1, $acc0
|
|
mov $acc2, $t2
|
|
sbb $poly1, $acc1
|
|
sbb \$0, $acc2
|
|
mov $acc3, $t3
|
|
sbb $poly3, $acc3
|
|
sbb \$0, $t4
|
|
|
|
cmovc $t0, $acc0
|
|
mov 8*0($a_ptr), $t0
|
|
cmovc $t1, $acc1
|
|
mov 8*1($a_ptr), $t1
|
|
cmovc $t2, $acc2
|
|
mov 8*2($a_ptr), $t2
|
|
cmovc $t3, $acc3
|
|
mov 8*3($a_ptr), $t3
|
|
|
|
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
|
|
|
|
lea $Hcub(%rsp), $b_ptr
|
|
lea $res_x(%rsp), $r_ptr
|
|
call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
|
|
|
|
mov $U2+8*0(%rsp), $t0
|
|
mov $U2+8*1(%rsp), $t1
|
|
mov $U2+8*2(%rsp), $t2
|
|
mov $U2+8*3(%rsp), $t3
|
|
lea $res_y(%rsp), $r_ptr
|
|
|
|
call __ecp_nistz256_sub$x # p256_sub(res_y, U2, res_x);
|
|
|
|
mov $acc0, 8*0($r_ptr) # save the result, as
|
|
mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
|
|
mov $acc2, 8*2($r_ptr)
|
|
mov $acc3, 8*3($r_ptr)
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
`&load_for_mul("$S1(%rsp)", "$Hcub(%rsp)", "$src0")`
|
|
lea $S2(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S1, Hcub);
|
|
|
|
`&load_for_mul("$R(%rsp)", "$res_y(%rsp)", "$src0")`
|
|
lea $res_y(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_y, R, res_y);
|
|
|
|
lea $S2(%rsp), $b_ptr
|
|
lea $res_y(%rsp), $r_ptr
|
|
call __ecp_nistz256_sub_from$x # p256_sub(res_y, res_y, S2);
|
|
|
|
movq %xmm0, $r_ptr # restore $r_ptr
|
|
|
|
movdqa %xmm5, %xmm0 # copy_conditional(res_z, in2_z, in1infty);
|
|
movdqa %xmm5, %xmm1
|
|
pandn $res_z(%rsp), %xmm0
|
|
movdqa %xmm5, %xmm2
|
|
pandn $res_z+0x10(%rsp), %xmm1
|
|
movdqa %xmm5, %xmm3
|
|
pand $in2_z(%rsp), %xmm2
|
|
pand $in2_z+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
|
|
movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
|
|
movdqa %xmm4, %xmm1
|
|
pandn %xmm2, %xmm0
|
|
movdqa %xmm4, %xmm2
|
|
pandn %xmm3, %xmm1
|
|
movdqa %xmm4, %xmm3
|
|
pand $in1_z(%rsp), %xmm2
|
|
pand $in1_z+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
movdqu %xmm2, 0x40($r_ptr)
|
|
movdqu %xmm3, 0x50($r_ptr)
|
|
|
|
movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
|
|
movdqa %xmm5, %xmm1
|
|
pandn $res_x(%rsp), %xmm0
|
|
movdqa %xmm5, %xmm2
|
|
pandn $res_x+0x10(%rsp), %xmm1
|
|
movdqa %xmm5, %xmm3
|
|
pand $in2_x(%rsp), %xmm2
|
|
pand $in2_x+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
|
|
movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
|
|
movdqa %xmm4, %xmm1
|
|
pandn %xmm2, %xmm0
|
|
movdqa %xmm4, %xmm2
|
|
pandn %xmm3, %xmm1
|
|
movdqa %xmm4, %xmm3
|
|
pand $in1_x(%rsp), %xmm2
|
|
pand $in1_x+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
movdqu %xmm2, 0x00($r_ptr)
|
|
movdqu %xmm3, 0x10($r_ptr)
|
|
|
|
movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
|
|
movdqa %xmm5, %xmm1
|
|
pandn $res_y(%rsp), %xmm0
|
|
movdqa %xmm5, %xmm2
|
|
pandn $res_y+0x10(%rsp), %xmm1
|
|
movdqa %xmm5, %xmm3
|
|
pand $in2_y(%rsp), %xmm2
|
|
pand $in2_y+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
|
|
movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
|
|
movdqa %xmm4, %xmm1
|
|
pandn %xmm2, %xmm0
|
|
movdqa %xmm4, %xmm2
|
|
pandn %xmm3, %xmm1
|
|
movdqa %xmm4, %xmm3
|
|
pand $in1_y(%rsp), %xmm2
|
|
pand $in1_y+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
movdqu %xmm2, 0x20($r_ptr)
|
|
movdqu %xmm3, 0x30($r_ptr)
|
|
|
|
.Ladd_done$x:
|
|
add \$32*18+8, %rsp
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %rbx
|
|
pop %rbp
|
|
ret
|
|
.size ecp_nistz256_point_add$sfx,.-ecp_nistz256_point_add$sfx
|
|
___
|
|
}
|
|
&gen_add("q");
|
|
|
|
sub gen_add_affine () {
|
|
my $x = shift;
|
|
my ($src0,$sfx,$bias);
|
|
my ($U2,$S2,$H,$R,$Hsqr,$Hcub,$Rsqr,
|
|
$res_x,$res_y,$res_z,
|
|
$in1_x,$in1_y,$in1_z,
|
|
$in2_x,$in2_y)=map(32*$_,(0..14));
|
|
my $Z1sqr = $S2;
|
|
|
|
if ($x ne "x") {
|
|
$src0 = "%rax";
|
|
$sfx = "";
|
|
$bias = 0;
|
|
|
|
$code.=<<___;
|
|
.globl ecp_nistz256_point_add_affine
|
|
.type ecp_nistz256_point_add_affine,\@function,3
|
|
.align 32
|
|
ecp_nistz256_point_add_affine:
|
|
___
|
|
$code.=<<___ if ($addx);
|
|
mov \$0x80100, %ecx
|
|
and OPENSSL_ia32cap_P+8(%rip), %ecx
|
|
cmp \$0x80100, %ecx
|
|
je .Lpoint_add_affinex
|
|
___
|
|
} else {
|
|
$src0 = "%rdx";
|
|
$sfx = "x";
|
|
$bias = 128;
|
|
|
|
$code.=<<___;
|
|
.type ecp_nistz256_point_add_affinex,\@function,3
|
|
.align 32
|
|
ecp_nistz256_point_add_affinex:
|
|
.Lpoint_add_affinex:
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
push %rbp
|
|
push %rbx
|
|
push %r12
|
|
push %r13
|
|
push %r14
|
|
push %r15
|
|
sub \$32*15+8, %rsp
|
|
|
|
movdqu 0x00($a_ptr), %xmm0 # copy *(P256_POINT *)$a_ptr
|
|
mov $b_org, $b_ptr # reassign
|
|
movdqu 0x10($a_ptr), %xmm1
|
|
movdqu 0x20($a_ptr), %xmm2
|
|
movdqu 0x30($a_ptr), %xmm3
|
|
movdqu 0x40($a_ptr), %xmm4
|
|
movdqu 0x50($a_ptr), %xmm5
|
|
mov 0x40+8*0($a_ptr), $src0 # load original in1_z
|
|
mov 0x40+8*1($a_ptr), $acc6
|
|
mov 0x40+8*2($a_ptr), $acc7
|
|
mov 0x40+8*3($a_ptr), $acc0
|
|
movdqa %xmm0, $in1_x(%rsp)
|
|
movdqa %xmm1, $in1_x+0x10(%rsp)
|
|
movdqa %xmm2, $in1_y(%rsp)
|
|
movdqa %xmm3, $in1_y+0x10(%rsp)
|
|
movdqa %xmm4, $in1_z(%rsp)
|
|
movdqa %xmm5, $in1_z+0x10(%rsp)
|
|
por %xmm4, %xmm5
|
|
|
|
movdqu 0x00($b_ptr), %xmm0 # copy *(P256_POINT_AFFINE *)$b_ptr
|
|
pshufd \$0xb1, %xmm5, %xmm3
|
|
movdqu 0x10($b_ptr), %xmm1
|
|
movdqu 0x20($b_ptr), %xmm2
|
|
por %xmm3, %xmm5
|
|
movdqu 0x30($b_ptr), %xmm3
|
|
movdqa %xmm0, $in2_x(%rsp)
|
|
pshufd \$0x1e, %xmm5, %xmm4
|
|
movdqa %xmm1, $in2_x+0x10(%rsp)
|
|
por %xmm0, %xmm1
|
|
movq $r_ptr, %xmm0 # save $r_ptr
|
|
movdqa %xmm2, $in2_y(%rsp)
|
|
movdqa %xmm3, $in2_y+0x10(%rsp)
|
|
por %xmm2, %xmm3
|
|
por %xmm4, %xmm5
|
|
pxor %xmm4, %xmm4
|
|
por %xmm1, %xmm3
|
|
|
|
lea 0x40-$bias($a_ptr), $a_ptr # $a_ptr is still valid
|
|
lea $Z1sqr(%rsp), $r_ptr # Z1^2
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Z1sqr, in1_z);
|
|
|
|
pcmpeqd %xmm4, %xmm5
|
|
pshufd \$0xb1, %xmm3, %xmm4
|
|
mov 0x00($b_ptr), $src0 # $b_ptr is still valid
|
|
#lea 0x00($b_ptr), $b_ptr
|
|
mov $acc4, $acc1 # harmonize sqr output and mul input
|
|
por %xmm3, %xmm4
|
|
pshufd \$0, %xmm5, %xmm5 # in1infty
|
|
pshufd \$0x1e, %xmm4, %xmm3
|
|
mov $acc5, $acc2
|
|
por %xmm3, %xmm4
|
|
pxor %xmm3, %xmm3
|
|
mov $acc6, $acc3
|
|
pcmpeqd %xmm3, %xmm4
|
|
pshufd \$0, %xmm4, %xmm4 # in2infty
|
|
|
|
lea $Z1sqr-$bias(%rsp), $a_ptr
|
|
mov $acc7, $acc4
|
|
lea $U2(%rsp), $r_ptr # U2 = X2*Z1^2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, Z1sqr, in2_x);
|
|
|
|
lea $in1_x(%rsp), $b_ptr
|
|
lea $H(%rsp), $r_ptr # H = U2 - U1
|
|
call __ecp_nistz256_sub_from$x # p256_sub(H, U2, in1_x);
|
|
|
|
`&load_for_mul("$Z1sqr(%rsp)", "$in1_z(%rsp)", "$src0")`
|
|
lea $S2(%rsp), $r_ptr # S2 = Z1^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Z1sqr, in1_z);
|
|
|
|
`&load_for_mul("$H(%rsp)", "$in1_z(%rsp)", "$src0")`
|
|
lea $res_z(%rsp), $r_ptr # Z3 = H*Z1*Z2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(res_z, H, in1_z);
|
|
|
|
`&load_for_mul("$S2(%rsp)", "$in2_y(%rsp)", "$src0")`
|
|
lea $S2(%rsp), $r_ptr # S2 = Y2*Z1^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, S2, in2_y);
|
|
|
|
lea $in1_y(%rsp), $b_ptr
|
|
lea $R(%rsp), $r_ptr # R = S2 - S1
|
|
call __ecp_nistz256_sub_from$x # p256_sub(R, S2, in1_y);
|
|
|
|
`&load_for_sqr("$H(%rsp)", "$src0")`
|
|
lea $Hsqr(%rsp), $r_ptr # H^2
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Hsqr, H);
|
|
|
|
`&load_for_sqr("$R(%rsp)", "$src0")`
|
|
lea $Rsqr(%rsp), $r_ptr # R^2
|
|
call __ecp_nistz256_sqr_mont$x # p256_sqr_mont(Rsqr, R);
|
|
|
|
`&load_for_mul("$H(%rsp)", "$Hsqr(%rsp)", "$src0")`
|
|
lea $Hcub(%rsp), $r_ptr # H^3
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(Hcub, Hsqr, H);
|
|
|
|
`&load_for_mul("$Hsqr(%rsp)", "$in1_x(%rsp)", "$src0")`
|
|
lea $U2(%rsp), $r_ptr # U1*H^2
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(U2, in1_x, Hsqr);
|
|
___
|
|
{
|
|
#######################################################################
|
|
# operate in 4-5-0-1 "name space" that matches multiplication output
|
|
#
|
|
my ($acc0,$acc1,$acc2,$acc3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
|
|
my ($poly1, $poly3)=($acc6,$acc7);
|
|
|
|
$code.=<<___;
|
|
#lea $U2(%rsp), $a_ptr
|
|
#lea $Hsqr(%rsp), $r_ptr # 2*U1*H^2
|
|
#call __ecp_nistz256_mul_by_2 # ecp_nistz256_mul_by_2(Hsqr, U2);
|
|
|
|
xor $t4, $t4
|
|
add $acc0, $acc0 # a0:a3+a0:a3
|
|
lea $Rsqr(%rsp), $a_ptr
|
|
adc $acc1, $acc1
|
|
mov $acc0, $t0
|
|
adc $acc2, $acc2
|
|
adc $acc3, $acc3
|
|
mov $acc1, $t1
|
|
adc \$0, $t4
|
|
|
|
sub \$-1, $acc0
|
|
mov $acc2, $t2
|
|
sbb $poly1, $acc1
|
|
sbb \$0, $acc2
|
|
mov $acc3, $t3
|
|
sbb $poly3, $acc3
|
|
sbb \$0, $t4
|
|
|
|
cmovc $t0, $acc0
|
|
mov 8*0($a_ptr), $t0
|
|
cmovc $t1, $acc1
|
|
mov 8*1($a_ptr), $t1
|
|
cmovc $t2, $acc2
|
|
mov 8*2($a_ptr), $t2
|
|
cmovc $t3, $acc3
|
|
mov 8*3($a_ptr), $t3
|
|
|
|
call __ecp_nistz256_sub$x # p256_sub(res_x, Rsqr, Hsqr);
|
|
|
|
lea $Hcub(%rsp), $b_ptr
|
|
lea $res_x(%rsp), $r_ptr
|
|
call __ecp_nistz256_sub_from$x # p256_sub(res_x, res_x, Hcub);
|
|
|
|
mov $U2+8*0(%rsp), $t0
|
|
mov $U2+8*1(%rsp), $t1
|
|
mov $U2+8*2(%rsp), $t2
|
|
mov $U2+8*3(%rsp), $t3
|
|
lea $H(%rsp), $r_ptr
|
|
|
|
call __ecp_nistz256_sub$x # p256_sub(H, U2, res_x);
|
|
|
|
mov $acc0, 8*0($r_ptr) # save the result, as
|
|
mov $acc1, 8*1($r_ptr) # __ecp_nistz256_sub doesn't
|
|
mov $acc2, 8*2($r_ptr)
|
|
mov $acc3, 8*3($r_ptr)
|
|
___
|
|
}
|
|
$code.=<<___;
|
|
`&load_for_mul("$Hcub(%rsp)", "$in1_y(%rsp)", "$src0")`
|
|
lea $S2(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(S2, Hcub, in1_y);
|
|
|
|
`&load_for_mul("$H(%rsp)", "$R(%rsp)", "$src0")`
|
|
lea $H(%rsp), $r_ptr
|
|
call __ecp_nistz256_mul_mont$x # p256_mul_mont(H, H, R);
|
|
|
|
lea $S2(%rsp), $b_ptr
|
|
lea $res_y(%rsp), $r_ptr
|
|
call __ecp_nistz256_sub_from$x # p256_sub(res_y, H, S2);
|
|
|
|
movq %xmm0, $r_ptr # restore $r_ptr
|
|
|
|
movdqa %xmm5, %xmm0 # copy_conditional(res_z, ONE, in1infty);
|
|
movdqa %xmm5, %xmm1
|
|
pandn $res_z(%rsp), %xmm0
|
|
movdqa %xmm5, %xmm2
|
|
pandn $res_z+0x10(%rsp), %xmm1
|
|
movdqa %xmm5, %xmm3
|
|
pand .LONE_mont(%rip), %xmm2
|
|
pand .LONE_mont+0x10(%rip), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
|
|
movdqa %xmm4, %xmm0 # copy_conditional(res_z, in1_z, in2infty);
|
|
movdqa %xmm4, %xmm1
|
|
pandn %xmm2, %xmm0
|
|
movdqa %xmm4, %xmm2
|
|
pandn %xmm3, %xmm1
|
|
movdqa %xmm4, %xmm3
|
|
pand $in1_z(%rsp), %xmm2
|
|
pand $in1_z+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
movdqu %xmm2, 0x40($r_ptr)
|
|
movdqu %xmm3, 0x50($r_ptr)
|
|
|
|
movdqa %xmm5, %xmm0 # copy_conditional(res_x, in2_x, in1infty);
|
|
movdqa %xmm5, %xmm1
|
|
pandn $res_x(%rsp), %xmm0
|
|
movdqa %xmm5, %xmm2
|
|
pandn $res_x+0x10(%rsp), %xmm1
|
|
movdqa %xmm5, %xmm3
|
|
pand $in2_x(%rsp), %xmm2
|
|
pand $in2_x+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
|
|
movdqa %xmm4, %xmm0 # copy_conditional(res_x, in1_x, in2infty);
|
|
movdqa %xmm4, %xmm1
|
|
pandn %xmm2, %xmm0
|
|
movdqa %xmm4, %xmm2
|
|
pandn %xmm3, %xmm1
|
|
movdqa %xmm4, %xmm3
|
|
pand $in1_x(%rsp), %xmm2
|
|
pand $in1_x+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
movdqu %xmm2, 0x00($r_ptr)
|
|
movdqu %xmm3, 0x10($r_ptr)
|
|
|
|
movdqa %xmm5, %xmm0 # copy_conditional(res_y, in2_y, in1infty);
|
|
movdqa %xmm5, %xmm1
|
|
pandn $res_y(%rsp), %xmm0
|
|
movdqa %xmm5, %xmm2
|
|
pandn $res_y+0x10(%rsp), %xmm1
|
|
movdqa %xmm5, %xmm3
|
|
pand $in2_y(%rsp), %xmm2
|
|
pand $in2_y+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
|
|
movdqa %xmm4, %xmm0 # copy_conditional(res_y, in1_y, in2infty);
|
|
movdqa %xmm4, %xmm1
|
|
pandn %xmm2, %xmm0
|
|
movdqa %xmm4, %xmm2
|
|
pandn %xmm3, %xmm1
|
|
movdqa %xmm4, %xmm3
|
|
pand $in1_y(%rsp), %xmm2
|
|
pand $in1_y+0x10(%rsp), %xmm3
|
|
por %xmm0, %xmm2
|
|
por %xmm1, %xmm3
|
|
movdqu %xmm2, 0x20($r_ptr)
|
|
movdqu %xmm3, 0x30($r_ptr)
|
|
|
|
add \$32*15+8, %rsp
|
|
pop %r15
|
|
pop %r14
|
|
pop %r13
|
|
pop %r12
|
|
pop %rbx
|
|
pop %rbp
|
|
ret
|
|
.size ecp_nistz256_point_add_affine$sfx,.-ecp_nistz256_point_add_affine$sfx
|
|
___
|
|
}
|
|
&gen_add_affine("q");
|
|
|
|
########################################################################
|
|
# AD*X magic
|
|
#
|
|
if ($addx) { {
|
|
########################################################################
|
|
# operate in 4-5-0-1 "name space" that matches multiplication output
|
|
#
|
|
my ($a0,$a1,$a2,$a3,$t3,$t4)=($acc4,$acc5,$acc0,$acc1,$acc2,$acc3);
|
|
|
|
$code.=<<___;
|
|
.type __ecp_nistz256_add_tox,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_add_tox:
|
|
xor $t4, $t4
|
|
adc 8*0($b_ptr), $a0
|
|
adc 8*1($b_ptr), $a1
|
|
mov $a0, $t0
|
|
adc 8*2($b_ptr), $a2
|
|
adc 8*3($b_ptr), $a3
|
|
mov $a1, $t1
|
|
adc \$0, $t4
|
|
|
|
xor $t3, $t3
|
|
sbb \$-1, $a0
|
|
mov $a2, $t2
|
|
sbb $poly1, $a1
|
|
sbb \$0, $a2
|
|
mov $a3, $t3
|
|
sbb $poly3, $a3
|
|
sbb \$0, $t4
|
|
|
|
cmovc $t0, $a0
|
|
cmovc $t1, $a1
|
|
mov $a0, 8*0($r_ptr)
|
|
cmovc $t2, $a2
|
|
mov $a1, 8*1($r_ptr)
|
|
cmovc $t3, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_add_tox,.-__ecp_nistz256_add_tox
|
|
|
|
.type __ecp_nistz256_sub_fromx,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_sub_fromx:
|
|
xor $t4, $t4
|
|
sbb 8*0($b_ptr), $a0
|
|
sbb 8*1($b_ptr), $a1
|
|
mov $a0, $t0
|
|
sbb 8*2($b_ptr), $a2
|
|
sbb 8*3($b_ptr), $a3
|
|
mov $a1, $t1
|
|
sbb \$0, $t4
|
|
|
|
xor $t3, $t3
|
|
adc \$-1, $a0
|
|
mov $a2, $t2
|
|
adc $poly1, $a1
|
|
adc \$0, $a2
|
|
mov $a3, $t3
|
|
adc $poly3, $a3
|
|
|
|
bt \$0, $t4
|
|
cmovnc $t0, $a0
|
|
cmovnc $t1, $a1
|
|
mov $a0, 8*0($r_ptr)
|
|
cmovnc $t2, $a2
|
|
mov $a1, 8*1($r_ptr)
|
|
cmovnc $t3, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_sub_fromx,.-__ecp_nistz256_sub_fromx
|
|
|
|
.type __ecp_nistz256_subx,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_subx:
|
|
xor $t4, $t4
|
|
sbb $a0, $t0
|
|
sbb $a1, $t1
|
|
mov $t0, $a0
|
|
sbb $a2, $t2
|
|
sbb $a3, $t3
|
|
mov $t1, $a1
|
|
sbb \$0, $t4
|
|
|
|
xor $a3 ,$a3
|
|
adc \$-1, $t0
|
|
mov $t2, $a2
|
|
adc $poly1, $t1
|
|
adc \$0, $t2
|
|
mov $t3, $a3
|
|
adc $poly3, $t3
|
|
|
|
bt \$0, $t4
|
|
cmovc $t0, $a0
|
|
cmovc $t1, $a1
|
|
cmovc $t2, $a2
|
|
cmovc $t3, $a3
|
|
|
|
ret
|
|
.size __ecp_nistz256_subx,.-__ecp_nistz256_subx
|
|
|
|
.type __ecp_nistz256_mul_by_2x,\@abi-omnipotent
|
|
.align 32
|
|
__ecp_nistz256_mul_by_2x:
|
|
xor $t4, $t4
|
|
adc $a0, $a0 # a0:a3+a0:a3
|
|
adc $a1, $a1
|
|
mov $a0, $t0
|
|
adc $a2, $a2
|
|
adc $a3, $a3
|
|
mov $a1, $t1
|
|
adc \$0, $t4
|
|
|
|
xor $t3, $t3
|
|
sbb \$-1, $a0
|
|
mov $a2, $t2
|
|
sbb $poly1, $a1
|
|
sbb \$0, $a2
|
|
mov $a3, $t3
|
|
sbb $poly3, $a3
|
|
sbb \$0, $t4
|
|
|
|
cmovc $t0, $a0
|
|
cmovc $t1, $a1
|
|
mov $a0, 8*0($r_ptr)
|
|
cmovc $t2, $a2
|
|
mov $a1, 8*1($r_ptr)
|
|
cmovc $t3, $a3
|
|
mov $a2, 8*2($r_ptr)
|
|
mov $a3, 8*3($r_ptr)
|
|
|
|
ret
|
|
.size __ecp_nistz256_mul_by_2x,.-__ecp_nistz256_mul_by_2x
|
|
___
|
|
}
|
|
&gen_double("x");
|
|
&gen_add("x");
|
|
&gen_add_affine("x");
|
|
}
|
|
}}}
|
|
|
|
$code =~ s/\`([^\`]*)\`/eval $1/gem;
|
|
print $code;
|
|
close STDOUT;
|