WIP6
Change-Id: I714b2727fb54824d0a3025ec6f4ad4a5e5d64187
This commit is contained in:
parent
16012c0b1d
commit
4674a7ec65
64
third_party/sike/asm/fp-armv8.pl
vendored
64
third_party/sike/asm/fp-armv8.pl
vendored
@ -296,45 +296,47 @@ ___
|
||||
}
|
||||
# Append whatever mul() returns onto the accumulated $code string.
# NOTE(review): mul() is defined outside this view; presumably it returns
# the generated assembly text for the multiplication routine -- confirm.
$code.=&mul();
|
||||
|
||||
# Computes C0..C4 = (A0,A1) * (B0..B3)   (ranges of registers, not subtraction)
|
||||
# Inputs remain intact
|
||||
sub mul128x256_comba {
|
||||
my ($A0,$A1,$B0,$B1,$B2,$B3,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
|
||||
my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$T0,$T1,$T2,$T3)=@_;
|
||||
my $body=<<___;
|
||||
mul $T0, $A1, $B0
|
||||
umulh $T1, $A1, $B0
|
||||
adds $C1, $C1, $C3
|
||||
adc $C2, $C2, xzr
|
||||
adds $C0, $C0, $C2
|
||||
adc $C1, $C1, xzr
|
||||
|
||||
mul $T2, $A0, $B2
|
||||
umulh $T3, $A0, $B2
|
||||
adds $C1, $C1, $T0
|
||||
adcs $C2, $C2, $T1
|
||||
adc $C3, xzr, xzr
|
||||
adds $C0, $C0, $T0
|
||||
adcs $C1, $C1, $T1
|
||||
adc $C2, xzr, xzr
|
||||
|
||||
mul $T0, $A1, $B1
|
||||
umulh $T1, $A1, $B1
|
||||
adds $C1, $C1, $T2
|
||||
adcs $C2, $C2, $T3
|
||||
adc $C3, xzr, xzr
|
||||
|
||||
mul $T2, $A0, $B3
|
||||
umulh $T3, $A0, $B3
|
||||
adds $C1, $C1, $T0
|
||||
adcs $C2, $C2, $T1
|
||||
adc $C3, $C3, xzr
|
||||
|
||||
mul $T0, $A1, $B2
|
||||
umulh $T1, $A1, $B2
|
||||
adds $C2, $C2, $T2
|
||||
adcs $C3, $C3, $T3
|
||||
adc $C4, xzr, xzr
|
||||
|
||||
mul $T2, $A0, $B3
|
||||
umulh $T3, $A0, $B3
|
||||
mul $T2, $A1, $B3
|
||||
umulh $T3, $A1, $B3
|
||||
adds $C2, $C2, $T0
|
||||
adcs $C3, $C3, $T1
|
||||
adc $C4, $C4, xzr
|
||||
|
||||
mul $T0, $A1, $B2
|
||||
umulh $T1, $A1, $B2
|
||||
adds $C3, $C3, $T2
|
||||
adcs $C4, $C4, $T3
|
||||
adc $C5, xzr, xzr
|
||||
|
||||
mul $T2, $A1, $B3
|
||||
umulh $T3, $A1, $B3
|
||||
adds $C3, $C3, $T0
|
||||
adcs $C4, $C4, $T1
|
||||
adc $C5, $C5, xzr
|
||||
adds $C4, $C4, $T2
|
||||
adc $C5, $C5, $T3
|
||||
adc $C4, $C4, $T3
|
||||
|
||||
___
|
||||
return $body;
|
||||
@ -345,10 +347,10 @@ ___
|
||||
# Operation: mc [x1] = ma [x0]
|
||||
# NOTE: ma=mc is not allowed
|
||||
sub rdc {
|
||||
my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10");
|
||||
my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10");
|
||||
my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10");
|
||||
my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x30","x10");
|
||||
my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
|
||||
my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
|
||||
my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
|
||||
my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x5","x6","x7","x8","x9","x28","x29","x1","x10");
|
||||
my $body=<<___;
|
||||
.global ${PREFIX}_fprdc
|
||||
.align 4
|
||||
@ -401,12 +403,12 @@ sub rdc {
|
||||
adcs x14, x7, x14
|
||||
adcs x15, x8, x15
|
||||
ldp x16, x17, [x0,#64]
|
||||
ldp x18, x19, [x0,#80]
|
||||
ldp x30, x19, [x0,#80]
|
||||
mul x4, x3, x24 // a[2] x .Lp503p1_nz_s8[0]
|
||||
umulh x7, x3, x24
|
||||
adcs x16, x9, x16
|
||||
adcs x17, x10, x17
|
||||
adcs x18, xzr, x18
|
||||
adcs x30, xzr, x30
|
||||
adcs x19, xzr, x19
|
||||
ldp x20, x21, [x0,#96]
|
||||
ldp x22, x23, [x0,#112]
|
||||
@ -440,7 +442,7 @@ sub rdc {
|
||||
mul x4, x12, x24 // a[4] x .Lp503p1_nz_s8[0]
|
||||
umulh x7, x12, x24
|
||||
adcs x17, x8, x17
|
||||
adcs x18, x9, x18
|
||||
adcs x30, x9, x30
|
||||
adcs x19, x10, x19
|
||||
adcs x20, xzr, x20
|
||||
mul x5, x12, x25 // a[4] x .Lp503p1_nz_s8[1]
|
||||
@ -468,7 +470,7 @@ sub rdc {
|
||||
adds x15, x4, x15 // a[7]
|
||||
adcs x16, x5, x16 // a[8]
|
||||
adcs x17, x6, x17
|
||||
adcs x18, x7, x18
|
||||
adcs x30, x7, x30
|
||||
mul x4, x14, x24 // a[6] x .Lp503p1_nz_s8[0]
|
||||
umulh x7, x14, x24
|
||||
adcs x19, x8, x19
|
||||
@ -496,12 +498,12 @@ sub rdc {
|
||||
lsl x4, x4, #56
|
||||
|
||||
adds x17, x4, x17
|
||||
adcs x18, x5, x18
|
||||
adcs x30, x5, x30
|
||||
ldr x1, [sp,#96]
|
||||
adcs x19, x6, x19
|
||||
adcs x20, x7, x20
|
||||
stp x16, x17, [x1,#0] // Final result
|
||||
stp x18, x19, [x1,#16]
|
||||
stp x30, x19, [x1,#16]
|
||||
adcs x21, x8, x21
|
||||
adcs x22, x9, x22
|
||||
adc x23, x10, x23
|
||||
|
Loading…
Reference in New Issue
Block a user