
Removes usage of x29

kris/for/master/CECPQ3_armX29
Henry Case 5 years ago
parent
commit
843fcd8e0e
2 changed files with 190 additions and 109 deletions
  1. third_party/sike/asm/fp-armv8.pl  +181 -100
  2. third_party/sike/fpx.c  +9 -9

third_party/sike/asm/fp-armv8.pl  +181 -100

@@ -34,22 +34,23 @@ $code.=<<___;
.text
___

sub mul128_comba_cut {
my ($A0,$A1,$B0,$B1,$C0,$C1,$C2,$C3,$T0)=@_;
# C[0-2] = A[0] * B[0-1]
sub mul64x128_comba_cut {
my ($A0,$B0,$B1,$C0,$C1,$C2,$T0,$T1)=@_;
my $body=<<___;
mul $A0, $A1, $B0
umulh $B0, $A1, $B0
adds $C1, $C1, $C3
adc $C2, $C2, xzr
mul $T1, $A0, $B0
umulh $B0, $A0, $B0
adds $C0, $C0, $C2
adc $C1, $C1, xzr

mul $T0, $A1, $B1
umulh $B1, $A1, $B1
adds $C1, $C1, $A0
adcs $C2, $C2, $B0
adc $C3, xzr, xzr
mul $T0, $A0, $B1
umulh $B1, $A0, $B1
adds $C0, $C0, $T1
adcs $C1, $C1, $B0
adc $C2, xzr, xzr

adds $C2, $C2, $T0
adc $C3, $C3, $B1
adds $C1, $C1, $T0
adc $C2, $C2, $B1
___
return $body;
}
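The rewritten helper multiplies a single 64-bit limb by a 128-bit value and accumulates the 192-bit result with mul/umulh pairs and adds/adcs carry chains, which is what lets it get by with one register fewer than mul128_comba_cut. A rough C model of that step, assuming 64-bit limbs; names are illustrative and the sketch omits the extra carry limb the caller folds in:

    #include <stdint.h>

    // Model of the mul64x128 comba step: acc[0..2] is a 192-bit accumulator,
    // a is one 64-bit limb, b[0..1] a 128-bit value.  The assembly also folds
    // in a carry limb handed over by the caller, which is omitted here.
    static void mul64x128_acc(uint64_t acc[3], uint64_t a, const uint64_t b[2]) {
      unsigned __int128 t = (unsigned __int128)a * b[0] + acc[0];  // mul/umulh of a*b[0]
      acc[0] = (uint64_t)t;
      t = (t >> 64) + (unsigned __int128)a * b[1] + acc[1];        // mul/umulh of a*b[1]
      acc[1] = (uint64_t)t;
      acc[2] = (uint64_t)(t >> 64);                                // fresh top limb
    }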
@@ -57,11 +58,11 @@ ___
sub mul256_karatsuba_comba {
my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
# (AH+AL) x (BH+BL), low part
my $mul_low=&mul128_comba_cut($A0, $A1, $C6, $T1, $C2, $C3, $C4, $C5, $C7);
my $mul_low=&mul64x128_comba_cut($A1, $C6, $T1, $C3, $C4, $C5, $C7, $A0);
# AL x BL
my $mul_albl=&mul128_comba_cut($A0, $A1, $B0, $B1, $C0, $C1, $T1, $C7, $C6);
my $mul_albl=&mul64x128_comba_cut($A1, $B0, $B1, $C1, $T1, $C7, $C6, $A0);
# AH x BH
my $mul_ahbh=&mul128_comba_cut($A2, $A3, $B2, $B3, $A0, $A1, $C6, $B0, $B1);
my $mul_ahbh=&mul64x128_comba_cut($A3, $B2, $B3, $A1, $C6, $B0, $B1, $A2);
my $body=<<___;
// A0-A1 <- AH + AL, T0 <- mask
adds $A0, $A0, $A2
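mul256_karatsuba_comba assembles a 512-bit product from three 256-bit half-products: AL x BL, AH x BH and (AH+AL) x (BH+BL), with the middle term corrected by subtracting the other two. A scaled-down sketch of the same split, one size down (64x64 bits from 32-bit halves) so it fits in plain C; the assembly additionally has to track the carries out of AH+AL and BH+BL, which is handled further below:

    #include <stdint.h>

    // One-level Karatsuba, scaled down: a 64x64->128-bit product built from
    // three 32x32->64-bit products.  mul256_karatsuba_comba does the same
    // with 256-bit halves made of 64-bit limbs.
    static unsigned __int128 mul64_karatsuba(uint64_t a, uint64_t b) {
      uint32_t al = (uint32_t)a, ah = (uint32_t)(a >> 32);
      uint32_t bl = (uint32_t)b, bh = (uint32_t)(b >> 32);

      uint64_t lo = (uint64_t)al * bl;                       // AL x BL
      uint64_t hi = (uint64_t)ah * bh;                       // AH x BH
      unsigned __int128 mid =
          (unsigned __int128)((uint64_t)al + ah) * ((uint64_t)bl + bh);
      mid -= lo;                                             // subtract AL x BL
      mid -= hi;                                             // subtract AH x BH
      return lo + (mid << 32) + ((unsigned __int128)hi << 64);
    }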
@@ -145,23 +146,38 @@ ___
# Operation: c [x2] = a [x0] * b [x1]
sub mul {
# (AH+AL) x (BH+BL), low part
my $mul_kc_low=&mul256_karatsuba_comba("x2","x27","x28","x29","x30","x11","x12","x13","x14","x8","x9","x10","x20","x21","x22","x23","x24","x25","x26");
my $mul_kc_low=&mul256_karatsuba_comba(
"x2", # M0
"x3","x4","x5","x6", # A0-A3
"x11","x12","x13","x14", # B0-B3
"x8","x9","x10","x20","x21","x22","x23","x24", # C0-C7
"x25","x26"); # TMP
# AL x BL
my $mul_albl=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x21","x22","x23","x24","x25","x26","x27","x28","x29","x30");
my $mul_albl=&mul256_karatsuba_comba(
"x0", # M0
"x3","x4","x5","x6", # A0-A3
"x11","x12","x13","x14", # B0-B3
"x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7
"x8","x9"); # TMP
# AH x BH
my $mul_ahbh=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x21","x22","x23","x24","x25","x26","x27","x28","x29","x30");
my $mul_ahbh=&mul256_karatsuba_comba(
"x0", # M0
"x3","x4","x5","x6", # A0-A3
"x11","x12","x13","x14", # B0-B3
"x21","x22","x23","x24","x25","x26","x27","x28",# C0-C7
"x8","x9"); # TMP

my $body=<<___;
.global ${PREFIX}_mpmul
.align 4
${PREFIX}_mpmul:
sub sp, sp, #96
stp x19, x20, [sp,#0]
stp x21, x22, [sp,#16]
stp x23, x24, [sp,#32]
stp x25, x26, [sp,#48]
stp x27, x28, [sp,#64]
stp x29, x30, [sp,#80]
stp x29, x30, [sp,#-96]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]

ldp x3, x4, [x0]
ldp x5, x6, [x0,#16]
@@ -172,44 +188,48 @@ sub mul {
ldp x15, x16, [x1,#32]
ldp x17, x19, [x1,#48]

// x27-x30 <- AH + AL, x7 <- mask
adds x27, x3, x7
adcs x28, x4, x8
adcs x29, x5, x9
adcs x30, x6, x10
// x3-x7 <- AH + AL, x7 <- carry
adds x3, x3, x7
adcs x4, x4, x8
adcs x5, x5, x9
adcs x6, x6, x10
adc x7, xzr, xzr

// x11-x14 <- BH + BL, x8 <- mask
// x11-x14 <- BH + BL, x8 <- carry
adds x11, x11, x15
adcs x12, x12, x16
adcs x13, x13, x17
adcs x14, x14, x19
adc x8, xzr, xzr

// x9 <- combined carry
and x9, x7, x8
// x7-x8 <- mask
sub x7, xzr, x7
sub x8, xzr, x8


// x15-x19 <- masked (BH + BL)
sub x9, xzr, x7
sub x10, xzr, x8
and x15, x11, x9
and x16, x12, x9
and x17, x13, x9
and x19, x14, x9

// x20-x23 <- masked (AH + AL), x7 <- combined carry
and x20, x27, x10
and x21, x28, x10
and x22, x29, x10
and x23, x30, x10
and x7, x7, x8
and x15, x11, x7
and x16, x12, x7
and x17, x13, x7
and x19, x14, x7

// x20-x23 <- masked (AH + AL)
and x20, x3, x8
and x21, x4, x8
and x22, x5, x8
and x23, x6, x8

// x15-x19, x7 <- masked (AH+AL) + masked (BH+BL), step 1
adds x15, x15, x20
adcs x16, x16, x21
adcs x17, x17, x22
adcs x19, x19, x23
adc x7, x7, xzr
adc x7, x9, xzr

// x8-x10,x20-x24 <- (AH+AL) x (BH+BL), low part
stp x27, x28, [x2,#0]
stp x3, x4, [x2,#0]
$mul_kc_low

// x15-x19, x7 <- (AH+AL) x (BH+BL), final step
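In the reworked code the sums AH+AL and BH+BL stay in x3-x6 and x11-x14, their carry bits are kept in x7 and x8, and `sub xN, xzr, carry` turns each 0/1 carry into an all-zeros or all-ones mask. That lets the cross terms carry_A x (BH+BL) and carry_B x (AH+AL) be accumulated branch-free, keeping the multiplication constant-time. A minimal C illustration of the mask trick (names are hypothetical):

    #include <stdint.h>

    // Constant-time conditional add: if carry (0 or 1) is set, add val,
    // otherwise add 0 -- no branch, mirroring `sub x, xzr, carry` + `and`.
    static uint64_t cond_add(uint64_t acc, uint64_t val, uint64_t carry) {
      uint64_t mask = 0 - carry;   // 1 -> 0xFFFF...FF, 0 -> 0
      return acc + (val & mask);   // val contributes only when carry was set
    }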
@@ -219,12 +239,21 @@ sub mul {
adcs x19, x19, x24
adc x7, x7, xzr

// x21-x28 <- AL x BL
// Load AL
ldp x3, x4, [x0]
ldp x5, x6, [x0,#16]
// Load BL
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]

// Temporarily store x8,x9 in x2
stp x8,x9, [x2,#0]
// x21-x28 <- AL x BL
$mul_albl
// Restore x8,x9
ldp x8,x9, [x2,#0]

// x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
// x8-x10,x20,x15-x17,x19 <- masked (AH+AL) x (BH+BL) - ALxBL
subs x8, x8, x21
sbcs x9, x9, x22
sbcs x10, x10, x23
@@ -235,11 +264,14 @@ sub mul {
sbcs x19, x19, x28
sbc x7, x7, xzr

// Store ALxBL, low
stp x21, x22, [x2]
stp x23, x24, [x2,#16]

// Load AH
ldp x3, x4, [x0,#32]
ldp x5, x6, [x0,#48]
// Load BH
ldp x11, x12, [x1,#32]
ldp x13, x14, [x1,#48]

@@ -249,12 +281,17 @@ sub mul {
adcs x20, x20, x28
adc x1, xzr, xzr

// x21-x28 <- AH x BH
add x0, x0, #32
// Temporarily store x8,x9 in x2
stp x8,x9, [x2,#32]
// x21-x28 <- AH x BH
$mul_ahbh
// Restore x8,x9
ldp x8,x9, [x2,#32]

neg x1, x1

// x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
// x8-x10,x20,x15-x17,x19 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs x8, x8, x21
sbcs x9, x9, x22
sbcs x10, x10, x23
@@ -265,6 +302,7 @@ sub mul {
sbcs x19, x19, x28
sbc x7, x7, xzr

// Store (AH+AL) x (BH+BL) - ALxBL - AHxBH, low
stp x8, x9, [x2,#32]
stp x10, x20, [x2,#48]

@@ -283,13 +321,12 @@ sub mul {
stp x25, x26, [x2,#96]
stp x27, x28, [x2,#112]

ldp x19, x20, [sp,#0]
ldp x21, x22, [sp,#16]
ldp x23, x24, [sp,#32]
ldp x25, x26, [sp,#48]
ldp x27, x28, [sp,#64]
ldp x29, x30, [sp,#80]
add sp, sp, #96
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldp x29, x30, [sp],#96
ret
___
return $body;
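For reference, the routine as a whole computes c [x2] = a [x0] * b [x1] as a full 512 x 512 -> 1024-bit product (fpx.c calls it as sike_mpmul). A plain schoolbook model with the 8-limb/16-limb layout implied by the loads and stores above; the assembly reaches the same result through one Karatsuba level plus the comba helpers:

    #include <stdint.h>

    // Schoolbook reference model of the multiplication: a and b are 8 limbs
    // (512 bits) each, c receives 16 limbs.  Purely illustrative.
    static void mpmul_ref(const uint64_t a[8], const uint64_t b[8], uint64_t c[16]) {
      for (int i = 0; i < 16; i++) c[i] = 0;
      for (int i = 0; i < 8; i++) {
        uint64_t carry = 0;
        for (int j = 0; j < 8; j++) {
          unsigned __int128 t = (unsigned __int128)a[i] * b[j] + c[i + j] + carry;
          c[i + j] = (uint64_t)t;
          carry = (uint64_t)(t >> 64);
        }
        c[i + 8] = carry;
      }
    }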
@@ -347,21 +384,37 @@ ___
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
sub rdc {
my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $mul01=&mul128x256_comba(
"x2","x3", # A0-A1
"x24","x25","x26","x27", # B0-B3
"x5","x6","x7","x8","x9", # C0-B4
"x1","x10","x11","x19"); # TMP
my $mul23=&mul128x256_comba(
"x2","x3", # A0-A1
"x24","x25","x26","x27", # B0-B3
"x5","x6","x7","x8","x9", # C0-C4
"x1","x10","x11","x19"); # TMP
my $mul45=&mul128x256_comba(
"x12","x13", # A0-A1
"x24","x25","x26","x27", # B0-B3
"x5","x6","x7","x8","x9", # C0-C4
"x1","x10","x11","x19"); # TMP
my $mul67=&mul128x256_comba(
"x14","x15", # A0-A1
"x24","x25","x26","x27", # B0-B3
"x5","x6","x7","x8","x9", # C0-C4
"x1","x10","x11","x19"); # TMP
my $body=<<___;
.global ${PREFIX}_fprdc
.align 4
${PREFIX}_fprdc:
sub sp, sp, #112
stp x19, x20, [sp]
stp x21, x22, [sp,#16]
stp x23, x24, [sp,#32]
stp x25, x26, [sp,#48]
stp x27, x28, [sp,#64]
stp x29, x30, [sp,#80]
stp x29, x30, [sp, #-112]!
add x29, sp, #0
stp x19, x20, [sp,#16]
stp x21, x22, [sp,#32]
stp x23, x24, [sp,#48]
stp x25, x26, [sp,#64]
stp x27, x28, [sp,#80]
str x1, [sp,#96]

ldp x2, x3, [x0,#0] // a[0-1]
@@ -380,7 +433,7 @@ sub rdc {

$mul01

ldp x3, x11, [x0,#16] // a[2]
ldp x2, x3, [x0,#16] // a[2]
ldp x12, x13, [x0,#32]
ldp x14, x15, [x0,#48]

@@ -397,23 +450,23 @@ sub rdc {
orr x5, x5, x4, lsr #8
lsl x4, x4, #56

adds x11, x4, x11 // a[3]
adds x3, x4, x3 // a[3]
adcs x12, x5, x12 // a[4]
adcs x13, x6, x13
adcs x14, x7, x14
adcs x15, x8, x15
ldp x16, x17, [x0,#64]
ldp x30, x19, [x0,#80]
mul x4, x3, x24 // a[2] x .Lp503p1_nz_s8[0]
umulh x7, x3, x24
ldp x28, x30, [x0,#80]
mul x4, x2, x24 // a[2] x .Lp503p1_nz_s8[0]
umulh x7, x2, x24
adcs x16, x9, x16
adcs x17, x10, x17
adcs x28, xzr, x28
adcs x30, xzr, x30
adcs x19, xzr, x19
ldp x20, x21, [x0,#96]
ldp x22, x23, [x0,#112]
mul x5, x3, x25 // a[2] x .Lp503p1_nz_s8[1]
umulh x6, x3, x25
mul x5, x2, x25 // a[2] x .Lp503p1_nz_s8[1]
umulh x6, x2, x25
adcs x20, xzr, x20
adcs x21, xzr, x21
adcs x22, xzr, x22
@@ -442,8 +495,8 @@ sub rdc {
mul x4, x12, x24 // a[4] x .Lp503p1_nz_s8[0]
umulh x7, x12, x24
adcs x17, x8, x17
adcs x30, x9, x30
adcs x19, x10, x19
adcs x28, x9, x28
adcs x30, x10, x30
adcs x20, xzr, x20
mul x5, x12, x25 // a[4] x .Lp503p1_nz_s8[1]
umulh x6, x12, x25
@@ -470,10 +523,10 @@ sub rdc {
adds x15, x4, x15 // a[7]
adcs x16, x5, x16 // a[8]
adcs x17, x6, x17
adcs x30, x7, x30
adcs x28, x7, x28
mul x4, x14, x24 // a[6] x .Lp503p1_nz_s8[0]
umulh x7, x14, x24
adcs x19, x8, x19
adcs x30, x8, x30
adcs x20, x9, x20
adcs x21, x10, x21
mul x5, x14, x25 // a[6] x .Lp503p1_nz_s8[1]
@@ -498,25 +551,24 @@ sub rdc {
lsl x4, x4, #56

adds x17, x4, x17
adcs x30, x5, x30
adcs x28, x5, x28
ldr x1, [sp,#96]
adcs x19, x6, x19
adcs x30, x6, x30
adcs x20, x7, x20
stp x16, x17, [x1,#0] // Final result
stp x30, x19, [x1,#16]
stp x28, x30, [x1,#16]
adcs x21, x8, x21
adcs x22, x9, x22
adc x23, x10, x23
stp x20, x21, [x1,#32]
stp x22, x23, [x1,#48]

ldp x19, x20, [sp]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp x25, x26, [sp, #48]
ldp x27, x28, [sp, #64]
ldp x29, x30, [sp, #80]
add sp, sp, #112
ldp x19, x20, [x29,#16]
ldp x21, x22, [x29,#32]
ldp x23, x24, [x29,#48]
ldp x25, x26, [x29,#64]
ldp x27, x28, [x29,#80]
ldp x29, x30, [sp],#112
ret

___
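The reduction routine implements Montgomery reduction for p503; the .Lp503p1_nz_s8 constant referenced in the comments holds the nonzero part of p503 + 1, which is what keeps the per-limb products above so short. As a hedged, generic model of what the routine computes (it ignores the p503-specific shortcuts and takes an assumed p_inv parameter):

    #include <stdint.h>

    // Generic word-by-word Montgomery reduction (REDC): r = t * 2^(-512) mod p
    // for a 16-limb input t.  p_inv = -p^(-1) mod 2^64 is an assumed parameter;
    // the real code hard-codes p503 and multiplies only by the nonzero limbs of
    // p503 + 1.  Results stay below 2*p503, so no final subtraction is needed.
    static void fprdc_model(uint64_t t[16], const uint64_t p[8], uint64_t p_inv,
                            uint64_t r[8]) {
      for (int i = 0; i < 8; i++) {
        uint64_t q = t[i] * p_inv;                  // Montgomery quotient for limb i
        uint64_t carry = 0;
        for (int j = 0; j < 8; j++) {               // t += q * p, shifted by i limbs
          unsigned __int128 acc = (unsigned __int128)q * p[j] + t[i + j] + carry;
          t[i + j] = (uint64_t)acc;
          carry = (uint64_t)(acc >> 64);
        }
        for (int j = i + 8; j < 16 && carry; j++) { // propagate the last carry
          unsigned __int128 acc = (unsigned __int128)t[j] + carry;
          t[j] = (uint64_t)acc;
          carry = (uint64_t)(acc >> 64);
        }
      }
      for (int i = 0; i < 8; i++) r[i] = t[i + 8];  // low 512 bits are now zero
    }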
@@ -531,6 +583,9 @@ $code.=<<___;
.global ${PREFIX}_fpadd
.align 4
${PREFIX}_fpadd:
stp x29,x30, [sp,#-16]!
add x29, sp, #0

ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
@@ -590,6 +645,8 @@ $code.=<<___;
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]

ldp x29, x30, [sp],#16
ret

___
@@ -600,6 +657,9 @@ $code.=<<___;
.global ${PREFIX}_fpsub
.align 4
${PREFIX}_fpsub:
stp x29, x30, [sp,#-16]!
add x29, sp, #0

ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
@@ -650,6 +710,8 @@ $code.=<<___;
adc x10, x10, x13
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]

ldp x29, x30, [sp],#16
ret
___

@@ -659,6 +721,9 @@ $code.=<<___;
.global ${PREFIX}_mpadd_asm
.align 4
${PREFIX}_mpadd_asm:
stp x29, x30, [sp,#-16]!
add x29, sp, #0

ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
@@ -681,6 +746,8 @@ $code.=<<___;
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]

ldp x29, x30, [sp],#16
ret
___

@@ -691,6 +758,9 @@ $code.=<<___;
.global ${PREFIX}_mpadd503x2_asm
.align 4
${PREFIX}_mpadd503x2_asm:
stp x29, x30, [sp,#-16]!
add x29, sp, #0

ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
@@ -734,6 +804,8 @@ $code.=<<___;
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
stp x9, x10, [x2,#112]

ldp x29, x30, [sp],#16
ret
___

@@ -746,6 +818,9 @@ $code.=<<___;
.global ${PREFIX}_mpsubx2_asm
.align 4
${PREFIX}_mpsubx2_asm:
stp x29, x30, [sp,#-16]!
add x29, sp, #0

ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
@@ -784,12 +859,14 @@ $code.=<<___;
sbcs x8, x8, x12
sbcs x9, x9, x13
sbcs x10, x10, x14
sbc x0, xzr, xzr // OZAPTF
sbc x0, xzr, xzr

stp x3, x4, [x2,#64]
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
stp x9, x10, [x2,#112]

ldp x29, x30, [sp],#16
ret
___

@@ -800,10 +877,13 @@ $code.=<<___;
.global ${PREFIX}_mpdblsubx2_asm
.align 4
${PREFIX}_mpdblsubx2_asm:
sub sp, sp, #48
stp x20, x21, [sp, #0]
stp x22, x23, [sp, #16]
str x24, [sp, #32]
stp x29, x30, [sp, #-64]!
add x29, sp, #0

stp x20, x21, [sp, #16]
stp x22, x23, [sp, #32]
str x24, [sp, #48]

ldp x3, x4, [x2,#0]
ldp x5, x6, [x2,#16]
ldp x7, x8, [x2,#32]
@@ -872,10 +952,11 @@ $code.=<<___;
stp x15, x16, [x2,#96]
stp x17, x24, [x2,#112]

ldp x20, x21, [sp, #0]
ldp x22, x23, [sp, #16]
ldr x24, [sp, #32]
add sp, sp, #48
ldp x20, x21, [x29,#16]
ldp x22, x23, [x29,#32]
ldr x24, [x29,#48]

ldp x29, x30, [sp],#64
ret
___



third_party/sike/fpx.c  +9 -9

@@ -216,10 +216,10 @@ void sike_fp2sqr_mont(const f2elm_t a, f2elm_t c) {
felm_t t1, t2, t3;

mp_addfast(a->c0, a->c1, t1); // t1 = a0+a1
sike_fpsub(a->c0, a->c1, t2); // t2 = a0-a1
sike_fpsub(a->c0, a->c1, t2); // t2 = a0-a1
mp_addfast(a->c0, a->c0, t3); // t3 = 2a0
sike_fpmul_mont(t1, t2, c->c0); // c0 = (a0+a1)(a0-a1)
sike_fpmul_mont(t3, a->c1, c->c1); // c1 = 2a0*a1
sike_fpmul_mont(t1, t2, c->c0); // c0 = (a0+a1)(a0-a1)
sike_fpmul_mont(t3, a->c1, c->c1); // c1 = 2a0*a1
}
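sike_fp2sqr_mont squares an element of Fp2 = Fp(i), with i^2 = -1, via (a0 + i*a1)^2 = (a0+a1)(a0-a1) + i*2*a0*a1, so the real part costs one multiplication instead of two squarings. A toy single-word sketch with a placeholder prime P (the real code operates on multi-limb Montgomery-form values):

    #include <stdint.h>

    #define P 1000003u  // placeholder prime, not p503

    // Identity used by sike_fp2sqr_mont over Fp2 = Fp(i), i^2 = -1:
    //   (a0 + i*a1)^2 = (a0+a1)(a0-a1) + i*(2*a0*a1)
    static void fp2sqr_ref(uint64_t a0, uint64_t a1, uint64_t *c0, uint64_t *c1) {
      uint64_t t1 = (a0 + a1) % P;           // t1 = a0+a1
      uint64_t t2 = (a0 + P - a1) % P;       // t2 = a0-a1
      uint64_t t3 = (2 * a0) % P;            // t3 = 2a0
      *c0 = (t1 * t2) % P;                   // c0 = (a0+a1)(a0-a1)
      *c1 = (t3 * a1) % P;                   // c1 = 2a0*a1
    }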

// Modular negation, a = -a mod p503.
@@ -276,9 +276,9 @@ void sike_fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) {

mp_addfast(a->c0, a->c1, t1); // t1 = a0+a1
mp_addfast(b->c0, b->c1, t2); // t2 = b0+b1
sike_mpmul(a->c0, b->c0, tt1); // tt1 = a0*b0
sike_mpmul(a->c1, b->c1, tt2); // tt2 = a1*b1
sike_mpmul(t1, t2, tt3); // tt3 = (a0+a1)*(b0+b1)
sike_mpmul(a->c0, b->c0, tt1); // tt1 = a0*b0
sike_mpmul(a->c1, b->c1, tt2); // tt2 = a1*b1
sike_mpmul(t1, t2, tt3); // tt3 = (a0+a1)*(b0+b1)
mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
mask = mp_subfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0
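sike_fp2mul_mont applies Karatsuba over the quadratic extension: c0 = a0*b0 - a1*b1 and c1 = (a0+a1)(b0+b1) - a0*b0 - a1*b1, three big multiplications instead of four, with the c0 subtraction carried out through the mask returned by mp_subfast. The same identity in a toy single-word sketch (placeholder prime P, no Montgomery form):

    #include <stdint.h>

    #define P 1000003u  // placeholder prime, not p503

    // Karatsuba-style multiplication in Fp2 = Fp(i):
    //   c0 = a0*b0 - a1*b1,  c1 = (a0+a1)(b0+b1) - a0*b0 - a1*b1
    static void fp2mul_ref(uint64_t a0, uint64_t a1, uint64_t b0, uint64_t b1,
                           uint64_t *c0, uint64_t *c1) {
      uint64_t tt1 = (a0 * b0) % P;                          // tt1 = a0*b0
      uint64_t tt2 = (a1 * b1) % P;                          // tt2 = a1*b1
      uint64_t tt3 = ((a0 + a1) % P) * ((b0 + b1) % P) % P;  // (a0+a1)*(b0+b1)
      *c0 = (tt1 + P - tt2) % P;                             // a0*b0 - a1*b1
      *c1 = (tt3 + 2 * P - tt1 - tt2) % P;                   // cross terms
    }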

@@ -297,9 +297,9 @@ void sike_fp2inv_mont(f2elm_t a) {

fpsqr_mont(a->c0, t1->c0); // t10 = a0^2
fpsqr_mont(a->c1, t1->c1); // t11 = a1^2
sike_fpadd(t1->c0, t1->c1, t1->c0); // t10 = a0^2+a1^2
sike_fpadd(t1->c0, t1->c1, t1->c0); // t10 = a0^2+a1^2
fpinv_mont(t1->c0); // t10 = (a0^2+a1^2)^-1
sike_fpneg(a->c1); // a = a0-i*a1
sike_fpneg(a->c1); // a = a0-i*a1
sike_fpmul_mont(a->c0, t1->c0, a->c0);
sike_fpmul_mont(a->c1, t1->c0, a->c1); // a = (a0-i*a1)*(a0^2+a1^2)^-1
sike_fpmul_mont(a->c1, t1->c0, a->c1); // a = (a0-i*a1)*(a0^2+a1^2)^-1
}
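sike_fp2inv_mont relies on (a0 + i*a1)^(-1) = (a0 - i*a1) / (a0^2 + a1^2): one inversion in Fp plus a conjugation and two multiplications. A toy sketch with a placeholder prime P and Fermat inversion; the real fpinv_mont works on Montgomery-form p503 elements:

    #include <stdint.h>

    #define P 1000003u  // placeholder prime, not p503

    static uint64_t fp_pow(uint64_t x, uint64_t e) {   // x^e mod P, square-and-multiply
      uint64_t r = 1;
      for (; e; e >>= 1, x = (x * x) % P)
        if (e & 1) r = (r * x) % P;
      return r;
    }

    // (a0 + i*a1)^-1 = (a0 - i*a1) * (a0^2 + a1^2)^-1 in Fp2 = Fp(i)
    static void fp2inv_ref(uint64_t *a0, uint64_t *a1) {
      uint64_t norm = ((*a0 * *a0) % P + (*a1 * *a1) % P) % P;  // a0^2 + a1^2
      uint64_t inv  = fp_pow(norm, P - 2);                      // Fermat inversion
      *a0 = (*a0 * inv) % P;
      *a1 = (P - *a1) % P * inv % P;                            // conjugate, then scale
    }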
