
Removes usage of x29
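
On AArch64 the procedure call standard (AAPCS64) reserves x29 as the frame pointer and x30 as the link register, with x19-x28 as the callee-saved general-purpose registers. The old code saved x29/x30 on the stack and then reused them as two extra temporaries, which can confuse frame-pointer-based unwinders and profilers that expect x29 to point at a valid frame record. This change reassigns those temporaries to other registers and sets up a conventional frame record instead. A minimal sketch of the prologue/epilogue pattern the patch adopts (the label, frame size and spill offsets below are illustrative placeholders, not the exact generated code):

    routine:                            // placeholder label
        stp   x29, x30, [sp, #-96]!     // push frame record (FP, LR) and reserve the frame
        add   x29, sp, #0               // x29 = frame pointer for this frame
        stp   x19, x20, [sp, #16]       // spill whichever callee-saved registers are used
        // ... body may clobber x19-x28 (after saving them), but leaves x29/x30 alone ...
        ldp   x19, x20, [x29, #16]      // restore callee-saved registers via the frame pointer
        ldp   x29, x30, [sp], #96       // pop frame record and deallocate the frame
        ret

The small leaf routines (fpadd, fpsub and the mp_add/mp_sub variants) get the same two-instruction prologue with a 16-byte frame, so every exported routine in this file now maintains a frame record.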

kris/for/master/CECPQ3_armX29
Henry Case, 5 years ago
parent revision 843fcd8e0e
2 changed files, 190 additions and 109 deletions:

  1. third_party/sike/asm/fp-armv8.pl (+181, -100)
  2. third_party/sike/fpx.c (+9, -9)

third_party/sike/asm/fp-armv8.pl (+181, -100)

@@ -34,22 +34,23 @@ $code.=<<___;
 .text
 ___

-sub mul128_comba_cut {
-    my ($A0,$A1,$B0,$B1,$C0,$C1,$C2,$C3,$T0)=@_;
+# C[0-2] = A[0] * B[0-1]
+sub mul64x128_comba_cut {
+    my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
     my $body=<<___;
-    mul     $A0, $A1, $B0
-    umulh   $B0, $A1, $B0
-    adds    $C1, $C1, $C3
-    adc     $C2, $C2, xzr
+    mul     $T1, $A0, $B0
+    umulh   $B0, $A0, $B0
+    adds    $C0, $C0, $C2
+    adc     $C1, $C1, xzr

-    mul     $T0, $A1, $B1
-    umulh   $B1, $A1, $B1
-    adds    $C1, $C1, $A0
-    adcs    $C2, $C2, $B0
-    adc     $C3, xzr, xzr
+    mul     $T0, $A0, $B1
+    umulh   $B1, $A0, $B1
+    adds    $C0, $C0, $T1
+    adcs    $C1, $C1, $B0
+    adc     $C2, xzr, xzr

-    adds    $C2, $C2, $T0
-    adc     $C3, $C3, $B1
+    adds    $C1, $C1, $T0
+    adc     $C2, $C2, $B1
 ___
     return $body;
 }
@@ -57,11 +58,11 @@ ___
 sub mul256_karatsuba_comba {
     my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
     # (AH+AL) x (BH+BL), low part
-    my $mul_low=&mul128_comba_cut($A0, $A1, $C6, $T1, $C2, $C3, $C4, $C5, $C7);
+    my $mul_low=&mul64x128_comba_cut($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
     # AL x BL
-    my $mul_albl=&mul128_comba_cut($A0, $A1, $B0, $B1, $C0, $C1, $T1, $C7, $C6);
+    my $mul_albl=&mul64x128_comba_cut($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
     # AH x BH
-    my $mul_ahbh=&mul128_comba_cut($A2, $A3, $B2, $B3, $A0, $A1, $C6, $B0, $B1);
+    my $mul_ahbh=&mul64x128_comba_cut($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
     my $body=<<___;
     // A0-A1 <- AH + AL, T0 <- mask
     adds    $A0, $A0, $A2
@@ -145,23 +146,38 @@ ___
 # Operation: c [x2] = a [x0] * b [x1]
 sub mul {
     # (AH+AL) x (BH+BL), low part
-    my $mul_kc_low=&mul256_karatsuba_comba("x2","x27","x28","x29","x30","x11","x12","x13","x14","x8","x9","x10","x20","x21","x22","x23","x24","x25","x26");
+    my $mul_kc_low=&mul256_karatsuba_comba(
+        "x2",                                            # M0
+        "x3","x4","x5","x6",                             # A0-A3
+        "x11","x12","x13","x14",                         # B0-B3
+        "x8","x9","x10","x20","x21","x22","x23","x24",   # C0-C7
+        "x25","x26");                                    # TMP
     # AL x BL
-    my $mul_albl=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x21","x22","x23","x24","x25","x26","x27","x28","x29","x30");
+    my $mul_albl=&mul256_karatsuba_comba(
+        "x0",                                            # M0
+        "x3","x4","x5","x6",                             # A0-A3
+        "x11","x12","x13","x14",                         # B0-B3
+        "x21","x22","x23","x24","x25","x26","x27","x28", # C0-C7
+        "x8","x9");                                      # TMP
     # AH x BH
-    my $mul_ahbh=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x21","x22","x23","x24","x25","x26","x27","x28","x29","x30");
+    my $mul_ahbh=&mul256_karatsuba_comba(
+        "x0",                                            # M0
+        "x3","x4","x5","x6",                             # A0-A3
+        "x11","x12","x13","x14",                         # B0-B3
+        "x21","x22","x23","x24","x25","x26","x27","x28", # C0-C7
+        "x8","x9");                                      # TMP

     my $body=<<___;
 .global ${PREFIX}_mpmul
 .align 4
 ${PREFIX}_mpmul:
-    sub     sp, sp, #96
-    stp     x19, x20, [sp,#0]
-    stp     x21, x22, [sp,#16]
-    stp     x23, x24, [sp,#32]
-    stp     x25, x26, [sp,#48]
-    stp     x27, x28, [sp,#64]
-    stp     x29, x30, [sp,#80]
+    stp     x29, x30, [sp,#-96]!
+    add     x29, sp, #0
+    stp     x19, x20, [sp,#16]
+    stp     x21, x22, [sp,#32]
+    stp     x23, x24, [sp,#48]
+    stp     x25, x26, [sp,#64]
+    stp     x27, x28, [sp,#80]

     ldp     x3, x4, [x0]
     ldp     x5, x6, [x0,#16]
@@ -172,44 +188,48 @@ sub mul {
     ldp     x15, x16, [x1,#32]
     ldp     x17, x19, [x1,#48]

-    // x27-x30 <- AH + AL, x7 <- mask
-    adds    x27, x3, x7
-    adcs    x28, x4, x8
-    adcs    x29, x5, x9
-    adcs    x30, x6, x10
+    // x3-x7 <- AH + AL, x7 <- carry
+    adds    x3, x3, x7
+    adcs    x4, x4, x8
+    adcs    x5, x5, x9
+    adcs    x6, x6, x10
     adc     x7, xzr, xzr

-    // x11-x14 <- BH + BL, x8 <- mask
+    // x11-x14 <- BH + BL, x8 <- carry
     adds    x11, x11, x15
     adcs    x12, x12, x16
     adcs    x13, x13, x17
     adcs    x14, x14, x19
     adc     x8, xzr, xzr

+    // x9 <- combined carry
+    and     x9, x7, x8
+    // x7-x8 <- mask
+    sub     x7, xzr, x7
+    sub     x8, xzr, x8
+
     // x15-x19 <- masked (BH + BL)
-    sub     x9, xzr, x7
-    sub     x10, xzr, x8
-    and     x15, x11, x9
-    and     x16, x12, x9
-    and     x17, x13, x9
-    and     x19, x14, x9
-
-    // x20-x23 <- masked (AH + AL), x7 <- combined carry
-    and     x20, x27, x10
-    and     x21, x28, x10
-    and     x22, x29, x10
-    and     x23, x30, x10
-    and     x7, x7, x8
+    and     x15, x11, x7
+    and     x16, x12, x7
+    and     x17, x13, x7
+    and     x19, x14, x7
+
+    // x20-x23 <- masked (AH + AL)
+    and     x20, x3, x8
+    and     x21, x4, x8
+    and     x22, x5, x8
+    and     x23, x6, x8

     // x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1
     adds    x15, x15, x20
     adcs    x16, x16, x21
     adcs    x17, x17, x22
     adcs    x19, x19, x23
-    adc     x7, x7, xzr
+    adc     x7, x9, xzr

     // x8-x10,x20-x24 <- (AH+AL) x (BH+BL), low part
-    stp     x27, x28, [x2,#0]
+    stp     x3, x4, [x2,#0]
     $mul_kc_low

     // x15-x19, x7 <- (AH+AL) x (BH+BL), final step
@@ -219,12 +239,21 @@ sub mul {
     adcs    x19, x19, x24
     adc     x7, x7, xzr

-    // x21-x28 <- AL x BL
+    // Load AL
+    ldp     x3, x4, [x0]
+    ldp     x5, x6, [x0,#16]
+    // Load BL
     ldp     x11, x12, [x1,#0]
     ldp     x13, x14, [x1,#16]

+    // Temporarily store x8,x9 in x2
+    stp     x8,x9, [x2,#0]
+    // x21-x28 <- AL x BL
     $mul_albl
+    // Restore x8,x9
+    ldp     x8,x9, [x2,#0]

-    // x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
+    // x8-x10,x20,x15-x17,x19 <- maskd (AH+AL) x (BH+BL) - ALxBL
     subs    x8, x8, x21
     sbcs    x9, x9, x22
     sbcs    x10, x10, x23
@@ -235,11 +264,14 @@ sub mul {
     sbcs    x19, x19, x28
     sbc     x7, x7, xzr

+    // Store ALxBL, low
     stp     x21, x22, [x2]
     stp     x23, x24, [x2,#16]

+    // Load AH
     ldp     x3, x4, [x0,#32]
     ldp     x5, x6, [x0,#48]
+    // Load BH
     ldp     x11, x12, [x1,#32]
     ldp     x13, x14, [x1,#48]

@@ -249,12 +281,17 @@ sub mul {
     adcs    x20, x20, x28
     adc     x1, xzr, xzr

-    // x21-x28 <- AH x BH
     add     x0, x0, #32
+    // Temporarily store x8,x9 in x2
+    stp     x8,x9, [x2,#32]
+    // x21-x28 <- AH x BH
     $mul_ahbh
+    // Restore x8,x9
+    ldp     x8,x9, [x2,#32]

     neg     x1, x1

-    // x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+    // x8-x10,x20,x15-x17,x19 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
     subs    x8, x8, x21
     sbcs    x9, x9, x22
     sbcs    x10, x10, x23
@@ -265,6 +302,7 @@ sub mul {
     sbcs    x19, x19, x28
     sbc     x7, x7, xzr

+    // Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
     stp     x8, x9, [x2,#32]
     stp     x10, x20, [x2,#48]

@@ -283,13 +321,12 @@ sub mul {
     stp     x25, x26, [x2,#96]
     stp     x27, x28, [x2,#112]

-    ldp     x19, x20, [sp,#0]
-    ldp     x21, x22, [sp,#16]
-    ldp     x23, x24, [sp,#32]
-    ldp     x25, x26, [sp,#48]
-    ldp     x27, x28, [sp,#64]
-    ldp     x29, x30, [sp,#80]
-    add     sp, sp, #96
+    ldp     x19, x20, [x29,#16]
+    ldp     x21, x22, [x29,#32]
+    ldp     x23, x24, [x29,#48]
+    ldp     x25, x26, [x29,#64]
+    ldp     x27, x28, [x29,#80]
+    ldp     x29, x30, [sp],#96
     ret
 ___
     return $body;
@@ -347,21 +384,37 @@ ___
 # Operation: mc [x1] = ma [x0]
 # NOTE: ma=mc is not allowed
 sub rdc {
-    my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
-    my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
-    my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
-    my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
+    my $mul01=&mul128x256_comba(
+        "x2","x3",                  # A0-A1
+        "x24","x25","x26","x27",    # B0-B3
+        "x5","x6","x7","x8","x9",   # C0-B4
+        "x1","x10","x11","x19");    # TMP
+    my $mul23=&mul128x256_comba(
+        "x2","x3",                  # A0-A1
+        "x24","x25","x26","x27",    # B0-B3
+        "x5","x6","x7","x8","x9",   # C0-C4
+        "x1","x10","x11","x19");    # TMP
+    my $mul45=&mul128x256_comba(
+        "x12","x13",                # A0-A1
+        "x24","x25","x26","x27",    # B0-B3
+        "x5","x6","x7","x8","x9",   # C0-C4
+        "x1","x10","x11","x19");    # TMP
+    my $mul67=&mul128x256_comba(
+        "x14","x15",                # A0-A1
+        "x24","x25","x26","x27",    # B0-B3
+        "x5","x6","x7","x8","x9",   # C0-C4
+        "x1","x10","x11","x19");    # TMP
     my $body=<<___;
 .global ${PREFIX}_fprdc
 .align 4
 ${PREFIX}_fprdc:
-    sub     sp, sp, #112
-    stp     x19, x20, [sp]
-    stp     x21, x22, [sp,#16]
-    stp     x23, x24, [sp,#32]
-    stp     x25, x26, [sp,#48]
-    stp     x27, x28, [sp,#64]
-    stp     x29, x30, [sp,#80]
+    stp     x29, x30, [sp, #-112]!
+    add     x29, sp, #0
+    stp     x19, x20, [sp,#16]
+    stp     x21, x22, [sp,#32]
+    stp     x23, x24, [sp,#48]
+    stp     x25, x26, [sp,#64]
+    stp     x27, x28, [sp,#80]
     str     x1, [sp,#96]

     ldp     x2, x3, [x0,#0]   // a[0-1]
@@ -380,7 +433,7 @@ sub rdc {

     $mul01

-    ldp     x3, x11, [x0,#16]  // a[2]
+    ldp     x2, x3, [x0,#16]   // a[2]
     ldp     x12, x13, [x0,#32]
     ldp     x14, x15, [x0,#48]

@@ -397,23 +450,23 @@ sub rdc {
     orr     x5, x5, x4, lsr #8
     lsl     x4, x4, #56

-    adds    x11, x4, x11   // a[3]
+    adds    x3, x4, x3     // a[3]
     adcs    x12, x5, x12   // a[4]
     adcs    x13, x6, x13
     adcs    x14, x7, x14
     adcs    x15, x8, x15
     ldp     x16, x17, [x0,#64]
-    ldp     x30, x19, [x0,#80]
-    mul     x4, x3, x24    // a[2] x .Lp503p1_nz_s8[0]
-    umulh   x7, x3, x24
+    ldp     x28, x30, [x0,#80]
+    mul     x4, x2, x24    // a[2] x .Lp503p1_nz_s8[0]
+    umulh   x7, x2, x24
     adcs    x16, x9, x16
     adcs    x17, x10, x17
+    adcs    x28, xzr, x28
     adcs    x30, xzr, x30
-    adcs    x19, xzr, x19
     ldp     x20, x21, [x0,#96]
     ldp     x22, x23, [x0,#112]
-    mul     x5, x3, x25    // a[2] x .Lp503p1_nz_s8[1]
-    umulh   x6, x3, x25
+    mul     x5, x2, x25    // a[2] x .Lp503p1_nz_s8[1]
+    umulh   x6, x2, x25
     adcs    x20, xzr, x20
     adcs    x21, xzr, x21
     adcs    x22, xzr, x22
@@ -442,8 +495,8 @@ sub rdc {
     mul     x4, x12, x24   // a[4] x .Lp503p1_nz_s8[0]
     umulh   x7, x12, x24
     adcs    x17, x8, x17
-    adcs    x30, x9, x30
-    adcs    x19, x10, x19
+    adcs    x28, x9, x28
+    adcs    x30, x10, x30
     adcs    x20, xzr, x20
     mul     x5, x12, x25   // a[4] x .Lp503p1_nz_s8[1]
     umulh   x6, x12, x25
@@ -470,10 +523,10 @@ sub rdc {
     adds    x15, x4, x15   // a[7]
     adcs    x16, x5, x16   // a[8]
     adcs    x17, x6, x17
-    adcs    x30, x7, x30
+    adcs    x28, x7, x28
     mul     x4, x14, x24   // a[6] x .Lp503p1_nz_s8[0]
     umulh   x7, x14, x24
-    adcs    x19, x8, x19
+    adcs    x30, x8, x30
     adcs    x20, x9, x20
     adcs    x21, x10, x21
     mul     x5, x14, x25   // a[6] x .Lp503p1_nz_s8[1]
@@ -498,25 +551,24 @@ sub rdc {
     lsl     x4, x4, #56

     adds    x17, x4, x17
-    adcs    x30, x5, x30
+    adcs    x28, x5, x28
     ldr     x1, [sp,#96]
-    adcs    x19, x6, x19
+    adcs    x30, x6, x30
     adcs    x20, x7, x20
     stp     x16, x17, [x1,#0]   // Final result
-    stp     x30, x19, [x1,#16]
+    stp     x28, x30, [x1,#16]
     adcs    x21, x8, x21
     adcs    x22, x9, x22
     adc     x23, x10, x23
     stp     x20, x21, [x1,#32]
     stp     x22, x23, [x1,#48]

-    ldp     x19, x20, [sp]
-    ldp     x21, x22, [sp, #16]
-    ldp     x23, x24, [sp, #32]
-    ldp     x25, x26, [sp, #48]
-    ldp     x27, x28, [sp, #64]
-    ldp     x29, x30, [sp, #80]
-    add     sp, sp, #112
+    ldp     x19, x20, [x29,#16]
+    ldp     x21, x22, [x29,#32]
+    ldp     x23, x24, [x29,#48]
+    ldp     x25, x26, [x29,#64]
+    ldp     x27, x28, [x29,#80]
+    ldp     x29, x30, [sp],#112
     ret

 ___
@@ -531,6 +583,9 @@ $code.=<<___;
 .global ${PREFIX}_fpadd
 .align 4
 ${PREFIX}_fpadd:
+    stp     x29,x30, [sp,#-16]!
+    add     x29, sp, #0
+
     ldp     x3, x4, [x0,#0]
     ldp     x5, x6, [x0,#16]
     ldp     x11, x12, [x1,#0]
@@ -590,6 +645,8 @@ $code.=<<___;
     stp     x5, x6, [x2,#16]
     stp     x7, x8, [x2,#32]
     stp     x9, x10, [x2,#48]
+
+    ldp     x29, x30, [sp],#16
     ret

 ___
@@ -600,6 +657,9 @@ $code.=<<___;
 .global ${PREFIX}_fpsub
 .align 4
 ${PREFIX}_fpsub:
+    stp     x29, x30, [sp,#-16]!
+    add     x29, sp, #0
+
     ldp     x3, x4, [x0,#0]
     ldp     x5, x6, [x0,#16]
     ldp     x11, x12, [x1,#0]
@@ -650,6 +710,8 @@ $code.=<<___;
     adc     x10, x10, x13
     stp     x7, x8, [x2,#32]
     stp     x9, x10, [x2,#48]
+
+    ldp     x29, x30, [sp],#16
     ret
 ___


@@ -659,6 +721,9 @@ $code.=<<___;
 .global ${PREFIX}_mpadd_asm
 .align 4
 ${PREFIX}_mpadd_asm:
+    stp     x29, x30, [sp,#-16]!
+    add     x29, sp, #0
+
     ldp     x3, x4, [x0,#0]
     ldp     x5, x6, [x0,#16]
     ldp     x11, x12, [x1,#0]
@@ -681,6 +746,8 @@ $code.=<<___;
     stp     x5, x6, [x2,#16]
     stp     x7, x8, [x2,#32]
     stp     x9, x10, [x2,#48]
+
+    ldp     x29, x30, [sp],#16
     ret
 ___


@@ -691,6 +758,9 @@ $code.=<<___;
 .global ${PREFIX}_mpadd503x2_asm
 .align 4
 ${PREFIX}_mpadd503x2_asm:
+    stp     x29, x30, [sp,#-16]!
+    add     x29, sp, #0
+
     ldp     x3, x4, [x0,#0]
     ldp     x5, x6, [x0,#16]
     ldp     x11, x12, [x1,#0]
@@ -734,6 +804,8 @@ $code.=<<___;
     stp     x5, x6, [x2,#80]
     stp     x7, x8, [x2,#96]
     stp     x9, x10, [x2,#112]
+
+    ldp     x29, x30, [sp],#16
     ret
 ___


@@ -746,6 +818,9 @@ $code.=<<___;
 .global ${PREFIX}_mpsubx2_asm
 .align 4
 ${PREFIX}_mpsubx2_asm:
+    stp     x29, x30, [sp,#-16]!
+    add     x29, sp, #0
+
     ldp     x3, x4, [x0,#0]
     ldp     x5, x6, [x0,#16]
     ldp     x11, x12, [x1,#0]
@@ -784,12 +859,14 @@ $code.=<<___;
     sbcs    x8, x8, x12
     sbcs    x9, x9, x13
     sbcs    x10, x10, x14
-    sbc     x0, xzr, xzr   // OZAPTF
+    sbc     x0, xzr, xzr

     stp     x3, x4, [x2,#64]
     stp     x5, x6, [x2,#80]
     stp     x7, x8, [x2,#96]
     stp     x9, x10, [x2,#112]
+
+    ldp     x29, x30, [sp],#16
     ret
 ___


@@ -800,10 +877,13 @@ $code.=<<___;
 .global ${PREFIX}_mpdblsubx2_asm
 .align 4
 ${PREFIX}_mpdblsubx2_asm:
-    sub     sp, sp, #48
-    stp     x20, x21, [sp, #0]
-    stp     x22, x23, [sp, #16]
-    str     x24, [sp, #32]
+    stp     x29, x30, [sp, #-64]!
+    add     x29, sp, #0
+
+    stp     x20, x21, [sp, #16]
+    stp     x22, x23, [sp, #32]
+    str     x24, [sp, #48]
+
     ldp     x3, x4, [x2,#0]
     ldp     x5, x6, [x2,#16]
     ldp     x7, x8, [x2,#32]
@@ -872,10 +952,11 @@ $code.=<<___;
     stp     x15, x16, [x2,#96]
     stp     x17, x24, [x2,#112]

-    ldp     x20, x21, [sp, #0]
-    ldp     x22, x23, [sp, #16]
-    ldr     x24, [sp, #32]
-    add     sp, sp, #48
+    ldp     x20, x21, [x29,#16]
+    ldp     x22, x23, [x29,#32]
+    ldr     x24, [x29,#48]
+
+    ldp     x29, x30, [sp],#64
     ret
 ___



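For context on the ${PREFIX}_mpmul changes above: the routine multiplies two 8-limb (512-bit) operands, splitting each into 256-bit halves exactly as the comments describe. Writing A = AH*2^256 + AL and B = BH*2^256 + BL:

    A*B = AH*BH*2^512 + ((AH+AL)*(BH+BL) - AH*BH - AL*BL)*2^256 + AL*BL

Because AH+AL and BH+BL can each carry out into a 257th bit, the carry bits are turned into all-zero/all-one masks and the masked addends are folded back in afterwards ("masked (AH+AL) + masked (BH+BL)" in the listing). The rewrite keeps that scheme but, per the updated comments, holds the sums in x3-x6 and the carries/masks in x7-x9, so x29 and x30 are no longer needed as temporaries.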

third_party/sike/fpx.c (+9, -9)

@@ -216,10 +216,10 @@ void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c) {
   felm_t t1, t2, t3;

   mp_addfast(a->c0, a->c1, t1);       // t1 = a0+a1
-  sike_fpsub(a->c0, a->c1, t2);       // t2 = a0-a1
+  sike_fpsub(a->c0, a->c1, t2);       // t2 = a0-a1
   mp_addfast(a->c0, a->c0, t3);       // t3 = 2a0
-  sike_fpmul_mont(t1, t2, c->c0);     // c0 = (a0+a1)(a0-a1)
-  sike_fpmul_mont(t3, a->c1, c->c1);  // c1 = 2a0*a1
+  sike_fpmul_mont(t1, t2, c->c0);     // c0 = (a0+a1)(a0-a1)
+  sike_fpmul_mont(t3, a->c1, c->c1);  // c1 = 2a0*a1
 }

 // Modular negation, a = -a mod p503.
@@ -276,9 +276,9 @@ void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) {

   mp_addfast(a->c0, a->c1, t1);       // t1 = a0+a1
   mp_addfast(b->c0, b->c1, t2);       // t2 = b0+b1
-  sike_mpmul(a->c0, b->c0, tt1);      // tt1 = a0*b0
-  sike_mpmul(a->c1, b->c1, tt2);      // tt2 = a1*b1
-  sike_mpmul(t1, t2, tt3);            // tt3 = (a0+a1)*(b0+b1)
+  sike_mpmul(a->c0, b->c0, tt1);      // tt1 = a0*b0
+  sike_mpmul(a->c1, b->c1, tt2);      // tt2 = a1*b1
+  sike_mpmul(t1, t2, tt3);            // tt3 = (a0+a1)*(b0+b1)
   mp_dblsubfast(tt1, tt2, tt3);       // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
   mask = mp_subfast(tt1, tt2, tt1);   // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0

@@ -297,9 +297,9 @@ void sike_fp2inv_mont(f2elm_t a) {

   fpsqr_mont(a->c0, t1->c0);          // t10 = a0^2
   fpsqr_mont(a->c1, t1->c1);          // t11 = a1^2
-  sike_fpadd(t1->c0, t1->c1, t1->c0); // t10 = a0^2+a1^2
+  sike_fpadd(t1->c0, t1->c1, t1->c0); // t10 = a0^2+a1^2
   fpinv_mont(t1->c0);                 // t10 = (a0^2+a1^2)^-1
-  sike_fpneg(a->c1);                  // a = a0-i*a1
+  sike_fpneg(a->c1);                  // a = a0-i*a1
   sike_fpmul_mont(a->c0, t1->c0, a->c0);
-  sike_fpmul_mont(a->c1, t1->c0, a->c1); // a = (a0-i*a1)*(a0^2+a1^2)^-1
+  sike_fpmul_mont(a->c1, t1->c0, a->c1); // a = (a0-i*a1)*(a0^2+a1^2)^-1
 }
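
The fpx.c hunks appear to be comment re-alignment only, but the comments they touch spell out the F(p^2) arithmetic these helpers implement. With F(p503^2) = F(p503)(i) and i^2 = -1, writing a = a0 + i*a1 and b = b0 + i*b1:

    a*b  = (a0*b0 - a1*b1) + i*((a0+a1)*(b0+b1) - a0*b0 - a1*b1)
    a^2  = (a0+a1)*(a0-a1) + i*(2*a0*a1)
    1/a  = (a0 - i*a1) / (a0^2 + a1^2)

This is why sike_fp2mul_mont issues three sike_mpmul calls followed by mp_dblsubfast, and why sike_fp2inv_mont needs only a single base-field inversion (fpinv_mont) of a0^2 + a1^2.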
