WIP: Armv8.1

Change-Id: Idd9443014d9ba35c053b93d60ced414a05a0654f
This commit is contained in:
Henry Case 2019-04-22 21:42:15 +01:00
parent 7e299e4c39
commit a2dbb55cc5

View File

@@ -64,41 +64,41 @@ sub mul256_karatsuba_comba {
# AH x BH
my $mul_ahbh=&mul128_comba_cut($A2, $A3, $B2, $B3, $A0, $A1, $C6, $B0, $B1);
my $body=<<___;
# A0-A1 <- AH + AL, T0 <- mask
// A0-A1 <- AH + AL, T0 <- mask
adds $A0, $A0, $A2
adcs $A1, $A1, $A3
adc $T0, xzr, xzr
# C6, T1 <- BH + BL, C7 <- mask
// C6, T1 <- BH + BL, C7 <- mask
adds $C6, $B0, $B2
adcs $T1, $B1, $B3
adc $C7, xzr, xzr
# C0-C1 <- masked (BH + BL)
// C0-C1 <- masked (BH + BL)
sub $C2, xzr, $T0
sub $C3, xzr, $C7
and $C0, $C6, $C2
and $C1, $T1, $C2
# C4-C5 <- masked (AH + AL), T0 <- combined carry
// C4-C5 <- masked (AH + AL), T0 <- combined carry
and $C4, $A0, $C3
and $C5, $A1, $C3
mul $C2, $A0, $C6
mul $C3, $A0, $T1
and $T0, $T0, $C7
# C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
// C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
adds $C0, $C4, $C0
umulh $C4, $A0, $T1
adcs $C1, $C5, $C1
umulh $C5, $A0, $C6
adc $T0, $T0, xzr
# C2-C5 <- (AH+AL) x (BH+BL), low part
// C2-C5 <- (AH+AL) x (BH+BL), low part
$mul_low
ldp $A0, $A1, [$M,#0]
# C2-C5, T0 <- (AH+AL) x (BH+BL), final part
// C2-C5, T0 <- (AH+AL) x (BH+BL), final part
adds $C4, $C0, $C4
umulh $C7, $A0, $B0
umulh $T1, $A0, $B1
@@ -107,10 +107,10 @@ sub mul256_karatsuba_comba {
mul $C1, $A0, $B1
adc $T0, $T0, xzr
# C0-C1, T1, C7 <- AL x BL
// C0-C1, T1, C7 <- AL x BL
$mul_albl
# C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
// C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
mul $A0, $A2, $B2
umulh $B0, $A2, $B2
subs $C2, $C2, $C0
@@ -121,10 +121,10 @@ sub mul256_karatsuba_comba {
sbcs $C5, $C5, $C7
sbc $T0, $T0, xzr
# A0, A1, C6, B0 <- AH x BH
// A0, A1, C6, B0 <- AH x BH
$mul_ahbh
# C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
// C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs $C2, $C2, $A0
sbcs $C3, $C3, $A1
sbcs $C4, $C4, $C6
@@ -171,21 +171,21 @@ sub mul {
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
# x26-x29 <- AH + AL, x7 <- mask
// x26-x29 <- AH + AL, x7 <- mask
adds x26, x3, x7
adcs x27, x4, x8
adcs x28, x5, x9
adcs x29, x6, x10
adc x7, xzr, xzr
# x11-x14 <- BH + BL, x8 <- mask
// x11-x14 <- BH + BL, x8 <- mask
adds x11, x11, x15
adcs x12, x12, x16
adcs x13, x13, x17
adcs x14, x14, x18
adc x8, xzr, xzr
# x15-x18 <- masked (BH + BL)
// x15-x18 <- masked (BH + BL)
sub x9, xzr, x7
sub x10, xzr, x8
and x15, x11, x9
@@ -193,37 +193,37 @@ sub mul {
and x17, x13, x9
and x18, x14, x9
# x19-x22 <- masked (AH + AL), x7 <- combined carry
// x19-x22 <- masked (AH + AL), x7 <- combined carry
and x19, x26, x10
and x20, x27, x10
and x21, x28, x10
and x22, x29, x10
and x7, x7, x8
# x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1
// x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1
adds x15, x15, x19
adcs x16, x16, x20
adcs x17, x17, x21
adcs x18, x18, x22
adc x7, x7, xzr
# x8-x10,x19-x23 <- (AH+AL) x (BH+BL), low part
// x8-x10,x19-x23 <- (AH+AL) x (BH+BL), low part
stp x26, x27, [x2,#0]
$mul_kc_low
# x15-x18, x7 <- (AH+AL) x (BH+BL), final step
// x15-x18, x7 <- (AH+AL) x (BH+BL), final step
adds x15, x15, x20
adcs x16, x16, x21
adcs x17, x17, x22
adcs x18, x18, x23
adc x7, x7, xzr
# x20-x27 <- AL x BL
// x20-x27 <- AL x BL
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
$mul_albl
# x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
// x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL
subs x8, x8, x20
sbcs x9, x9, x21
sbcs x10, x10, x22
@@ -248,12 +248,12 @@ sub mul {
adcs x19, x19, x27
adc x1, xzr, xzr
# x20-x27 <- AH x BH
// x20-x27 <- AH x BH
add x0, x0, #32
$mul_ahbh
neg x1, x1
# x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
// x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs x8, x8, x20
sbcs x9, x9, x21
sbcs x10, x10, x22
@@ -360,23 +360,23 @@ sub rdc {
stp x27, x28, [sp, #64]
stp x29, x30, [sp, #80]
ldp x2, x3, [x0,#0] # a[0-1]
ldp x2, x3, [x0,#0] // a[0-1]
# Load the prime constant
// Load the prime constant
adrp x23, .p503p1_nz_s8
add x23, x23, :lo12:.p503p1_nz_s8
ldp x24, x25, [x23, #0]
ldp x26, x27, [x23, #16]
# a[0-1] x p503p1_nz_s8 --> result: x4:x9
mul x4, x2, x24 # a[0] x p503p1_nz_s8[0]
// a[0-1] x p503p1_nz_s8 --> result: x4:x9
mul x4, x2, x24 // a[0] x p503p1_nz_s8[0]
umulh x7, x2, x24
mul x5, x2, x25 # a[0] x p503p1_nz_s8[1]
mul x5, x2, x25 // a[0] x p503p1_nz_s8[1]
umulh x6, x2, x25
$mul01
ldp x3, x11, [x0,#16] # a[2]
ldp x3, x11, [x0,#16] // a[2]
ldp x12, x13, [x0,#32]
ldp x14, x15, [x0,#48]
@@ -393,14 +393,14 @@ sub rdc {
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x11, x4, x11 # a[3]
adcs x12, x5, x12 # a[4]
adds x11, x4, x11 // a[3]
adcs x12, x5, x12 // a[4]
adcs x13, x6, x13
adcs x14, x7, x14
adcs x15, x8, x15
ldp x16, x17, [x0,#64]
ldp x18, x19, [x0,#80]
mul x4, x3, x24 # a[2] x p503p1_nz_s8[0]
mul x4, x3, x24 // a[2] x p503p1_nz_s8[0]
umulh x7, x3, x24
adcs x16, x9, x16
adcs x17, x10, x17
@@ -408,14 +408,14 @@ sub rdc {
adcs x19, xzr, x19
ldp x20, x21, [x0,#96]
ldp x22, x23, [x0,#112]
mul x5, x3, x25 # a[2] x p503p1_nz_s8[1]
mul x5, x3, x25 // a[2] x p503p1_nz_s8[1]
umulh x6, x3, x25
adcs x20, xzr, x20
adcs x21, xzr, x21
adcs x22, xzr, x22
adc x23, xzr, x23
# a[2-3] x p503p1_nz_s8 --> result: x4:x9
// a[2-3] x p503p1_nz_s8 --> result: x4:x9
$mul23
orr x10, xzr, x9, lsr #8
@@ -431,23 +431,23 @@ sub rdc {
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x13, x4, x13 # a[5]
adcs x14, x5, x14 # a[6]
adds x13, x4, x13 // a[5]
adcs x14, x5, x14 // a[6]
adcs x15, x6, x15
adcs x16, x7, x16
mul x4, x12, x24 # a[4] x p503p1_nz_s8[0]
mul x4, x12, x24 // a[4] x p503p1_nz_s8[0]
umulh x7, x12, x24
adcs x17, x8, x17
adcs x18, x9, x18
adcs x19, x10, x19
adcs x20, xzr, x20
mul x5, x12, x25 # a[4] x p503p1_nz_s8[1]
mul x5, x12, x25 // a[4] x p503p1_nz_s8[1]
umulh x6, x12, x25
adcs x21, xzr, x21
adcs x22, xzr, x22
adc x23, xzr, x23
# a[4-5] x p503p1_nz_s8 --> result: x4:x9
// a[4-5] x p503p1_nz_s8 --> result: x4:x9
$mul45
orr x10, xzr, x9, lsr #8
@@ -463,21 +463,21 @@ sub rdc {
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x15, x4, x15 # a[7]
adcs x16, x5, x16 # a[8]
adds x15, x4, x15 // a[7]
adcs x16, x5, x16 // a[8]
adcs x17, x6, x17
adcs x18, x7, x18
mul x4, x14, x24 # a[6] x p503p1_nz_s8[0]
mul x4, x14, x24 // a[6] x p503p1_nz_s8[0]
umulh x7, x14, x24
adcs x19, x8, x19
adcs x20, x9, x20
adcs x21, x10, x21
mul x5, x14, x25 # a[6] x p503p1_nz_s8[1]
mul x5, x14, x25 // a[6] x p503p1_nz_s8[1]
umulh x6, x14, x25
adcs x22, xzr, x22
adc x23, xzr, x23
# a[6-7] x p503p1_nz_s8 --> result: x4:x9
// a[6-7] x p503p1_nz_s8 --> result: x4:x9
$mul67
orr x10, xzr, x9, lsr #8
@@ -497,7 +497,7 @@ sub rdc {
adcs x18, x5, x18
adcs x19, x6, x19
adcs x20, x7, x20
stp x16, x17, [x1,#0] # Final result
stp x16, x17, [x1,#0] // Final result
stp x18, x19, [x1,#16]
adcs x21, x8, x21
adcs x22, x9, x22
@@ -600,7 +600,7 @@ $code.=<<___;
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
# Subtract a - b
// Subtract a - b
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
@@ -615,7 +615,7 @@ $code.=<<___;
sbcs x10, x10, x18
sbc x18, xzr, xzr
# Add 2xp503 anded with the mask in x18
// Add 2xp503 anded with the mask in x18
adrp x19, .p503x2
add x19, x19, :lo12:.p503x2
ldp x11, x12, [x19, #0]