From 8cc902f4a896f930d3842e3e260d708b009c95d9 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Fri, 19 Apr 2019 14:29:07 +0100 Subject: [PATCH] WIP Change-Id: I5bd962f76a12b4253ba136c4da416d58f363512b --- third_party/sike/asm/fp-x86_64.pl | 26 ++++++++++++++------------ 1 file changed, 14 insertions(+), 12 deletions(-) diff --git a/third_party/sike/asm/fp-x86_64.pl b/third_party/sike/asm/fp-x86_64.pl index d2a975a4..0e20fa4f 100755 --- a/third_party/sike/asm/fp-x86_64.pl +++ b/third_party/sike/asm/fp-x86_64.pl @@ -21,6 +21,7 @@ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\""; $PREFIX="sike"; $addx = 1; +$bmi2_adx = 1; # Swaps 16-bytes pointed by %rdi and %rsi # in constant time @@ -1344,7 +1345,7 @@ $code.=<<___; ___ # Optimized x86 code for CPUs with ADOX/ADCX and BMI2 -$code.=<<___ if ($addx); +$code.=<<___ if ($bmi2_adx); ########################################### # Montgomery reduction # Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 @@ -1364,8 +1365,8 @@ $code.=<<___ if ($addx); ___ # a[0-1] x p503p1_nz --> result: r8:r14 -&MUL128x320_SCHOOL(0, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx); -$code.=<<___ if ($addx); +&MUL128x320_SCHOOL(0, "rdi", "p503p1_nz(%rip)", map("r$_",(8..14)), "rbx", "rcx", "r15") if($bmi2_adx); +$code.=<<___ if ($bmi2_adx); xor %r15, %r15 add 0x18(%rdi), %r8 adc 0x20(%rdi), %r9 @@ -1401,9 +1402,9 @@ $code.=<<___ if ($addx); ___ # a[2-3] x p503p1_nz --> result: r8:r14 -&MUL128x320_SCHOOL(16, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx); +&MUL128x320_SCHOOL(16, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx); -$code.=<<___ if ($addx); +$code.=<<___ if ($bmi2_adx); xor %r15, %r15 add 0x28(%rdi), %r8 adc 0x30(%rdi), %r9 @@ -1433,9 +1434,9 @@ $code.=<<___ if ($addx); ___ # a[4-5] x p503p1_nz --> result: r8:r14 -&MUL128x320_SCHOOL(32, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx); +&MUL128x320_SCHOOL(32, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx); -$code.=<<___ if ($addx); +$code.=<<___ if ($bmi2_adx); xor %r15, %r15 xor %rbx, %rbx add 0x38(%rdi), %r8 @@ -1459,10 +1460,10 @@ $code.=<<___ if ($addx); ___ # a[6-7] x p503p1_nz --> result: r8:r14 -&MUL128x320_SCHOOL(48, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($addx); +&MUL128x320_SCHOOL(48, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx); # Final result c1:c7 -$code.=<<___ if ($addx); +$code.=<<___ if ($bmi2_adx); add 0x48(%rdi), %r8 adc 0x50(%rdi), %r9 adc 0x58(%rdi), %r10 @@ -1479,7 +1480,7 @@ $code.=<<___ if ($addx); mov %r14, 0x38(%rsi) ___ -$code.=<<___ if ($addx); +$code.=<<___ if ($bmi2_adx); pop %rbx .cfi_adjust_cfa_offset -8 .cfi_same_value rbx @@ -1527,7 +1528,7 @@ ${PREFIX}_fprdc: .cfi_offset rbx, -48 ___ -$code.=<<___ if ($addx); +$code.=<<___ if ($bmi2_adx); lea OPENSSL_ia32cap_P(%rip), %rcx mov 8(%rcx), %rcx and \$0x80100, %ecx @@ -1548,7 +1549,8 @@ $code.=<<___; adc %rdx, %r8 xor %r9, %r9 - mov 8(%rbx), %rax + #mov 0x8(%rbx), %rax + mov (p503p1+8)(%rip), %rax mul %r11 xor %r10, %r10 add %rax, %r8