Change-Id: I5bd962f76a12b4253ba136c4da416d58f363512b
This commit is contained in:
Henry Case 2019-04-19 14:29:07 +01:00
parent 4733dd1c4f
commit 8cc902f4a8

View File

@ -21,6 +21,7 @@ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
$PREFIX="sike";
$addx = 1;
$bmi2_adx = 1;
# Swaps the 16 bytes pointed to by %rdi and %rsi
# in constant time
@ -1344,7 +1345,7 @@ $code.=<<___;
___
# Optimized x86 code for CPUs with ADOX/ADCX and BMI2
$code.=<<___ if ($addx);
$code.=<<___ if ($bmi2_adx);
###########################################
# Montgomery reduction
# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
@ -1364,8 +1365,8 @@ $code.=<<___ if ($addx);
___
# a[0-1] x p503p1_nz --> result: r8:r14
&MUL128x320_SCHOOL(0, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
$code.=<<___ if ($addx);
&MUL128x320_SCHOOL(0, "rdi", "p503p1_nz(%rip)", map("r$_",(8..14)), "rbx", "rcx", "r15") if($bmi2_adx);
$code.=<<___ if ($bmi2_adx);
xor %r15, %r15
add 0x18(%rdi), %r8
adc 0x20(%rdi), %r9
@ -1401,9 +1402,9 @@ $code.=<<___ if ($addx);
___
# a[2-3] x p503p1_nz --> result: r8:r14
&MUL128x320_SCHOOL(16, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
&MUL128x320_SCHOOL(16, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx);
$code.=<<___ if ($addx);
$code.=<<___ if ($bmi2_adx);
xor %r15, %r15
add 0x28(%rdi), %r8
adc 0x30(%rdi), %r9
@ -1433,9 +1434,9 @@ $code.=<<___ if ($addx);
___
# a[4-5] x p503p1_nz --> result: r8:r14
&MUL128x320_SCHOOL(32, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
&MUL128x320_SCHOOL(32, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx);
$code.=<<___ if ($addx);
$code.=<<___ if ($bmi2_adx);
xor %r15, %r15
xor %rbx, %rbx
add 0x38(%rdi), %r8
@ -1459,10 +1460,10 @@ $code.=<<___ if ($addx);
___
# a[6-7] x p503p1_nz --> result: r8:r14
&MUL128x320_SCHOOL(48, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx);
&MUL128x320_SCHOOL(48, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx);
# Final result c1:c7
$code.=<<___ if ($addx);
$code.=<<___ if ($bmi2_adx);
add 0x48(%rdi), %r8
adc 0x50(%rdi), %r9
adc 0x58(%rdi), %r10
@ -1479,7 +1480,7 @@ $code.=<<___ if ($addx);
mov %r14, 0x38(%rsi)
___
$code.=<<___ if ($addx);
$code.=<<___ if ($bmi2_adx);
pop %rbx
.cfi_adjust_cfa_offset -8
.cfi_same_value rbx
@ -1527,7 +1528,7 @@ ${PREFIX}_fprdc:
.cfi_offset rbx, -48
___
$code.=<<___ if ($addx);
$code.=<<___ if ($bmi2_adx);
lea OPENSSL_ia32cap_P(%rip), %rcx
mov 8(%rcx), %rcx
and \$0x80100, %ecx
@ -1548,7 +1549,8 @@ $code.=<<___;
adc %rdx, %r8
xor %r9, %r9
mov 8(%rbx), %rax
#mov 0x8(%rbx), %rax
mov (p503p1+8)(%rip), %rax
mul %r11
xor %r10, %r10
add %rax, %r8