diff --git a/third_party/sike/asm/fp-x86_64.pl b/third_party/sike/asm/fp-x86_64.pl
index 42f6ac70..6bc6e9d2 100755
--- a/third_party/sike/asm/fp-x86_64.pl
+++ b/third_party/sike/asm/fp-x86_64.pl
@@ -55,7 +55,6 @@ p503p1_nz:
 ___
 
-
 # Swaps 16-bytes pointed by %rdi and %rsi
 # in constant time
 sub CSWAP16() {
@@ -232,30 +231,28 @@ ${PREFIX}_fpadd:
     adc 0x30(%rsi), %r14
     adc 0x38(%rsi), %r15
 
-    lea p503x2(%rip), %rbx
-
-    mov 0(%rbx), %rcx;
+    mov p503x2(%rip), %rcx;
     sub %rcx, %r8
-    mov 8(%rbx), %rcx;
+    mov 8+p503x2(%rip), %rcx;
     sbb %rcx, %r9
     sbb %rcx, %r10
-    mov 16(%rbx), %rcx;
+    mov 16+p503x2(%rip), %rcx;
     sbb %rcx, %r11
-    mov 24(%rbx), %rcx;
+    mov 24+p503x2(%rip), %rcx;
     sbb %rcx, %r12
-    mov 32(%rbx), %rcx;
+    mov 32+p503x2(%rip), %rcx;
     sbb %rcx, %r13
-    mov 40(%rbx), %rcx;
+    mov 40+p503x2(%rip), %rcx;
     sbb %rcx, %r14
-    mov 48(%rbx), %rcx;
+    mov 48+p503x2(%rip), %rcx;
     sbb %rcx, %r15
     sbb \$0, %rax
 
-    mov 0(%rbx), %rdi
+    mov p503x2(%rip), %rdi
     and %rax, %rdi
-    mov 8(%rbx), %rsi
+    mov 8+p503x2(%rip), %rsi
     and %rax, %rsi
-    mov 16(%rbx), %rcx
+    mov 16+p503x2(%rip), %rcx
     and %rax, %rcx
 
     add %rdi, %r8
@@ -269,13 +266,13 @@ ${PREFIX}_fpadd:
     setc %cl
 
-    mov 24(%rbx), %r8
+    mov 24+p503x2(%rip), %r8
     and %rax, %r8
-    mov 32(%rbx), %r9
+    mov 32+p503x2(%rip), %r9
     and %rax, %r9
-    mov 40(%rbx), %r10
+    mov 40+p503x2(%rip), %r10
     and %rax, %r10
-    mov 48(%rbx), %r11
+    mov 48+p503x2(%rip), %r11
     and %rax, %r11
 
     bt \$0, %rcx
@@ -346,9 +343,6 @@ ${PREFIX}_fpsub:
     push %r15
 .cfi_adjust_cfa_offset 8
 .cfi_offset r15, -40
-    push %rbx
-.cfi_adjust_cfa_offset 8
-.cfi_offset rbx, -48
 
     xor %rax, %rax
 
@@ -371,13 +365,11 @@ ${PREFIX}_fpsub:
     sbb 0x38(%rsi), %r15
     sbb \$0x0, %rax
 
-    lea p503x2(%rip), %rbx
-
     mov p503x2(%rip), %rdi
     and %rax, %rdi
-    mov 8+p503x2(%rip), %rsi
+    mov 0x8+p503x2(%rip), %rsi
     and %rax, %rsi
-    mov 0x10(%rbx), %rcx
+    mov 0x10+p503x2(%rip), %rcx
     and %rax, %rcx
 
     add %rdi, %r8
@@ -391,13 +383,13 @@ ${PREFIX}_fpsub:
     setc %cl
 
-    mov 0x18(%rbx), %r8
+    mov 0x18+p503x2(%rip), %r8
     and %rax, %r8
-    mov 0x20(%rbx), %r9
+    mov 0x20+p503x2(%rip), %r9
     and %rax, %r9
-    mov 0x28(%rbx), %r10
+    mov 0x28+p503x2(%rip), %r10
     and %rax, %r10
-    mov 0x30(%rbx), %r11
+    mov 0x30+p503x2(%rip), %r11
     and %rax, %r11
 
     bt \$0x0, %rcx
@@ -411,8 +403,6 @@ ${PREFIX}_fpsub:
     mov %r14, 0x30(%rdx)
     mov %r15, 0x38(%rdx)
 
-    pop %rbx
-.cfi_adjust_cfa_offset -8
     pop %r15
 .cfi_adjust_cfa_offset -8
     pop %r14
@@ -1332,7 +1322,7 @@ $code.=<<___ if ($bmi2_adx);
 ___
 
 # a[0-1] x p503p1_nz --> result: r8:r14
-&MUL128x320_SCHOOL(0, "rdi", "p503p1_nz(%rip)", map("r$_",(8..14)), "rbx", "rcx", "r15") if($bmi2_adx);
+&MUL128x320_SCHOOL(0,"rdi","p503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15") if($bmi2_adx);
 $code.=<<___ if ($bmi2_adx);
     xor %r15, %r15
     add 0x18(%rdi), %r8
@@ -1369,7 +1359,7 @@ $code.=<<___ if ($bmi2_adx);
 ___
 
 # a[2-3] x p503p1_nz --> result: r8:r14
-&MUL128x320_SCHOOL(16, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx);
+&MUL128x320_SCHOOL(16,"rdi","p503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15") if($bmi2_adx);
 
 $code.=<<___ if ($bmi2_adx);
     xor %r15, %r15
@@ -1401,7 +1391,7 @@ $code.=<<___ if ($bmi2_adx);
 ___
 
 # a[4-5] x p503p1_nz --> result: r8:r14
-&MUL128x320_SCHOOL(32, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx);
+&MUL128x320_SCHOOL(32,"rdi","p503p1_nz(%rip)",map("r$_",(8..14)),"rbx","rcx","r15") if($bmi2_adx);
 
 $code.=<<___ if ($bmi2_adx);
     xor %r15, %r15
@@ -1427,7 +1417,7 @@ $code.=<<___ if ($bmi2_adx);
 ___
 
 # a[6-7] x p503p1_nz --> result: r8:r14
-&MUL128x320_SCHOOL(48, "rdi", "p503p1_nz(%rip)", "r8", "r9", "r10", "r11", "r12", "r13", "r14", "rbx", "rcx", "r15") if($bmi2_adx);
+&MUL128x320_SCHOOL(48,"rdi","p503p1_nz(%rip)",map("r$_", (8..14)),"rbx","rcx","r15") if($bmi2_adx);
 
 # Final result c1:c7
 $code.=<<___ if ($bmi2_adx);
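
The substantive change repeated through ${PREFIX}_fpadd and ${PREFIX}_fpsub is the addressing mode: loads that went through a %rbx base pointer (lea p503x2(%rip), %rbx followed by mov off(%rbx), ...) now use a direct off+p503x2(%rip) RIP-relative operand, which in ${PREFIX}_fpsub also removes the push/pop of the callee-saved %rbx and its .cfi bookkeeping. The Perl-level edits to the &MUL128x320_SCHOOL calls are cosmetic, since map("r$_",(8..14)) yields the same "r8" through "r14" argument list that was previously written out. A minimal standalone sketch of the two addressing patterns in GAS/AT&T syntax; ktable, old_style, new_style and the table values are illustrative stand-ins for p503x2, not code from this file:

# Sketch only (not part of the patch): hypothetical two-entry constant table.
.section .rodata
ktable:
    .quad 0x1111111111111111
    .quad 0x2222222222222222

.text
# Old pattern: materialize a base pointer in %rbx, then load via offsets.
# %rbx is callee-saved, so the enclosing function must also push/pop it.
old_style:
    push %rbx
    lea  ktable(%rip), %rbx
    mov  0(%rbx), %rax
    mov  8(%rbx), %rdx
    pop  %rbx
    ret

# New pattern: fold the offset into the RIP-relative operand directly.
# No lea, no base register, no push/pop of %rbx.
new_style:
    mov  ktable(%rip), %rax
    mov  8+ktable(%rip), %rdx
    ret

The constant-time masking arithmetic itself (the and/add/adc/sbb sequences) is unchanged by the patch; only the operand form of the constant loads differs, so each table word now comes from a single RIP-relative mov without tying up a callee-saved register.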