Change-Id: I714b2727fb54824d0a3025ec6f4ad4a5e5d64187
This commit is contained in:
Henry Case 2019-04-24 18:55:23 +01:00
parent 16012c0b1d
commit 4674a7ec65

View File

@ -296,45 +296,47 @@ ___
} }
$code.=&mul(); $code.=&mul();
# Computes C0-C4 = (A0-A1) * (B0-B3)
# Inputs remain intact
sub mul128x256_comba { sub mul128x256_comba {
my ($A0,$A1,$B0,$B1,$B2,$B3,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_; my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2,$T3)=@_;
my $body=<<___; my $body=<<___;
mul $T0, $A1, $B0 mul $T0, $A1, $B0
umulh $T1, $A1, $B0 umulh $T1, $A1, $B0
adds $C1, $C1, $C3 adds $C0, $C0, $C2
adc $C2, $C2, xzr adc $C1, $C1, xzr
mul $T2, $A0, $B2 mul $T2, $A0, $B2
umulh $T3, $A0, $B2 umulh $T3, $A0, $B2
adds $C1, $C1, $T0 adds $C0, $C0, $T0
adcs $C2, $C2, $T1 adcs $C1, $C1, $T1
adc $C3, xzr, xzr adc $C2, xzr, xzr
mul $T0, $A1, $B1 mul $T0, $A1, $B1
umulh $T1, $A1, $B1 umulh $T1, $A1, $B1
adds $C1, $C1, $T2
adcs $C2, $C2, $T3
adc $C3, xzr, xzr
mul $T2, $A0, $B3
umulh $T3, $A0, $B3
adds $C1, $C1, $T0
adcs $C2, $C2, $T1
adc $C3, $C3, xzr
mul $T0, $A1, $B2
umulh $T1, $A1, $B2
adds $C2, $C2, $T2 adds $C2, $C2, $T2
adcs $C3, $C3, $T3 adcs $C3, $C3, $T3
adc $C4, xzr, xzr adc $C4, xzr, xzr
mul $T2, $A0, $B3 mul $T2, $A1, $B3
umulh $T3, $A0, $B3 umulh $T3, $A1, $B3
adds $C2, $C2, $T0 adds $C2, $C2, $T0
adcs $C3, $C3, $T1 adcs $C3, $C3, $T1
adc $C4, $C4, xzr adc $C4, $C4, xzr
mul $T0, $A1, $B2
umulh $T1, $A1, $B2
adds $C3, $C3, $T2 adds $C3, $C3, $T2
adcs $C4, $C4, $T3 adc $C4, $C4, $T3
adc $C5, xzr, xzr
mul $T2, $A1, $B3
umulh $T3, $A1, $B3
adds $C3, $C3, $T0
adcs $C4, $C4, $T1
adc $C5, $C5, xzr
adds $C4, $C4, $T2
adc $C5, $C5, $T3
___ ___
return $body; return $body;
@ -345,10 +347,10 @@ ___
# Operation: mc [x1] = ma [x0] # Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed # NOTE: ma=mc is not allowed
sub rdc { sub rdc {
my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10"); my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10"); my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10"); my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10"); my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
my $body=<<___; my $body=<<___;
.global ${PREFIX}_fprdc .global ${PREFIX}_fprdc
.align 4 .align 4
@ -401,12 +403,12 @@ sub rdc {
adcs x14, x7, x14 adcs x14, x7, x14
adcs x15, x8, x15 adcs x15, x8, x15
ldp x16, x17, [x0,#64] ldp x16, x17, [x0,#64]
ldp x18, x19, [x0,#80] ldp x30, x19, [x0,#80]
mul x4, x3, x24 // a[2] x .Lp503p1_nz_s8[0] mul x4, x3, x24 // a[2] x .Lp503p1_nz_s8[0]
umulh x7, x3, x24 umulh x7, x3, x24
adcs x16, x9, x16 adcs x16, x9, x16
adcs x17, x10, x17 adcs x17, x10, x17
adcs x18, xzr, x18 adcs x30, xzr, x30
adcs x19, xzr, x19 adcs x19, xzr, x19
ldp x20, x21, [x0,#96] ldp x20, x21, [x0,#96]
ldp x22, x23, [x0,#112] ldp x22, x23, [x0,#112]
@ -440,7 +442,7 @@ sub rdc {
mul x4, x12, x24 // a[4] x .Lp503p1_nz_s8[0] mul x4, x12, x24 // a[4] x .Lp503p1_nz_s8[0]
umulh x7, x12, x24 umulh x7, x12, x24
adcs x17, x8, x17 adcs x17, x8, x17
adcs x18, x9, x18 adcs x30, x9, x30
adcs x19, x10, x19 adcs x19, x10, x19
adcs x20, xzr, x20 adcs x20, xzr, x20
mul x5, x12, x25 // a[4] x .Lp503p1_nz_s8[1] mul x5, x12, x25 // a[4] x .Lp503p1_nz_s8[1]
@ -468,7 +470,7 @@ sub rdc {
adds x15, x4, x15 // a[7] adds x15, x4, x15 // a[7]
adcs x16, x5, x16 // a[8] adcs x16, x5, x16 // a[8]
adcs x17, x6, x17 adcs x17, x6, x17
adcs x18, x7, x18 adcs x30, x7, x30
mul x4, x14, x24 // a[6] x .Lp503p1_nz_s8[0] mul x4, x14, x24 // a[6] x .Lp503p1_nz_s8[0]
umulh x7, x14, x24 umulh x7, x14, x24
adcs x19, x8, x19 adcs x19, x8, x19
@ -496,12 +498,12 @@ sub rdc {
lsl x4, x4, #56 lsl x4, x4, #56
adds x17, x4, x17 adds x17, x4, x17
adcs x18, x5, x18 adcs x30, x5, x30
ldr x1, [sp,#96] ldr x1, [sp,#96]
adcs x19, x6, x19 adcs x19, x6, x19
adcs x20, x7, x20 adcs x20, x7, x20
stp x16, x17, [x1,#0] // Final result stp x16, x17, [x1,#0] // Final result
stp x18, x19, [x1,#16] stp x30, x19, [x1,#16]
adcs x21, x8, x21 adcs x21, x8, x21
adcs x22, x9, x22 adcs x22, x9, x22
adc x23, x10, x23 adc x23, x10, x23