WIP
Change-Id: Ic362ed30d5338ae4fca34290b7d7c0b48f5a5fa3
This commit is contained in:
parent
d46eed92fd
commit
1bb081fd8e
112
third_party/sike/asm/fp-x86_64.pl
vendored
112
third_party/sike/asm/fp-x86_64.pl
vendored
@ -20,7 +20,7 @@ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
|
||||
*STDOUT=*OUT;
|
||||
|
||||
$PREFIX="sike";
|
||||
$addx = 0;
|
||||
$addx = 1;
|
||||
$bmi2_adx = 1;
|
||||
|
||||
# Swaps the 16 bytes pointed to by %rdi and %rsi
|
||||
@ -41,13 +41,13 @@ $code.=<<___;
|
||||
___
|
||||
}
|
||||
|
||||
sub MUL256_SCHOOL() {
|
||||
sub MUL256_SCHOOL {
|
||||
my $idxM0 = shift;
|
||||
my $M0 = shift;
|
||||
my $idxM1 = shift;
|
||||
my $M1 = shift;
|
||||
my $idxC = shift;
|
||||
my $C = shift;
|
||||
my $idxDST = shift;
|
||||
my $DST = shift;
|
||||
my $T0 = shift;
|
||||
my $T1 = shift;
|
||||
my $T2 = shift;
|
||||
@ -58,89 +58,82 @@ sub MUL256_SCHOOL() {
|
||||
my $T7 = shift;
|
||||
my $T8 = shift;
|
||||
my $T9 = shift;
|
||||
|
||||
# OZAPTF: Is this the best approach?
|
||||
my $idxC_0 = $idxC;
|
||||
my $idxC_8 = $idxC + 8;
|
||||
my $idxC_16 = $idxC + 16;
|
||||
my $idxC_24 = $idxC + 24;
|
||||
my $idxC_32 = $idxC + 32;
|
||||
my $idxC_40 = $idxC + 40;
|
||||
my $idxC_48 = $idxC + 48;
|
||||
my $idxC_56 = $idxC + 56;
|
||||
|
||||
my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_(%$M0)",(0,8,16,24));
|
||||
my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_(%$M1)",(0,8,16,24));
|
||||
my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_(%$DST)",(0,8,16,24,32,40,48,56));
|
||||
$code.=<<___;
|
||||
mov ($idxM0+0)(%$M0), %rdx
|
||||
mulx ($idxM1+0)(%$M1), %$T1, %$T0 # T0:T1 = A0*B0
|
||||
mov %$T1, $idxC_0(%$C) # C0_final
|
||||
mulx ($idxM1+8)(%$M1), %$T2, %$T1 # T1:T2 = A0*B1
|
||||
mov $ML0, %rdx
|
||||
mulx $MR0, %$T1, %$T0 # T0:T1 = A0*B0
|
||||
mov %$T1, $D0 # DST0_final
|
||||
mulx $MR8, %$T2, %$T1 # T1:T2 = A0*B1
|
||||
xor %rax, %rax
|
||||
adox %$T2, %$T0
|
||||
mulx ($idxM1+16)(%$M1), %$T3, %$T2 # T2:T3 = A0*B2
|
||||
mulx $MR16,%$T3, %$T2 # T2:T3 = A0*B2
|
||||
adox %$T3, %$T1
|
||||
mulx ($idxM1+24)(%$M1), %$T4, %$T3 # T3:T4 = A0*B3
|
||||
mulx $MR24,%$T4, %$T3 # T3:T4 = A0*B3
|
||||
adox %$T4, %$T2
|
||||
|
||||
mov ($idxM0+8)(%$M0), %rdx
|
||||
mulx ($idxM1+0)(%$M1), %$T4, %$T5 # T5:T4 = A1*B0
|
||||
mov $ML8, %rdx
|
||||
mulx $MR0, %$T4, %$T5 # T5:T4 = A1*B0
|
||||
adox %rax, %$T3
|
||||
xor %rax, %rax
|
||||
mulx ($idxM1+8)(%$M1), %$T7, %$T6 # T6:T7 = A1*B1
|
||||
mulx $MR8, %$T7, %$T6 # T6:T7 = A1*B1
|
||||
adox %$T0, %$T4
|
||||
mov %$T4, $idxC_8(%$C) # C1_final
|
||||
mov %$T4, $D1 # DST1_final
|
||||
adcx %$T7, %$T5
|
||||
mulx ($idxM1+16)(%$M1), %$T8, %$T7 # T7:T8 = A1*B2
|
||||
mulx $MR16,%$T8, %$T7 # T7:T8 = A1*B2
|
||||
adcx %$T8, %$T6
|
||||
adox %$T1, %$T5
|
||||
mulx ($idxM1+24)(%$M1), %$T9, %$T8 # T8:T9 = A1*B3
|
||||
mulx $MR24,%$T9, %$T8 # T8:T9 = A1*B3
|
||||
adcx %$T9, %$T7
|
||||
adcx %rax, %$T8
|
||||
adox %$T2, %$T6
|
||||
|
||||
mov ($idxM0+16)(%$M0), %rdx
|
||||
mulx ($idxM1+ 0)(%$M1), %$T0, %$T1 # T1:T0 = A2*B0
|
||||
mov $ML16,%rdx
|
||||
mulx $MR0, %$T0, %$T1 # T1:T0 = A2*B0
|
||||
adox %$T3, %$T7
|
||||
adox %rax, %$T8
|
||||
xor %rax, %rax
|
||||
mulx ($idxM1+ 8)(%$M1), %$T3, %$T2 # T2:T3 = A2*B1
|
||||
mulx $MR8, %$T3, %$T2 # T2:T3 = A2*B1
|
||||
adox %$T5, %$T0
|
||||
mov %$T0, $idxC_16(%$C) # C2_final
|
||||
mov %$T0, $D2 # DST2_final
|
||||
adcx %$T3, %$T1
|
||||
mulx ($idxM1+16)(%$M1), %$T4, %$T3 # T3:T4 = A2*B2
|
||||
mulx $MR16,%$T4, %$T3 # T3:T4 = A2*B2
|
||||
adcx %$T4, %$T2
|
||||
adox %$T6, %$T1
|
||||
mulx ($idxM1+24)(%$M1), %$T9, %$T4 # T3:T4 = A2*B3
|
||||
mulx $MR24,%$T9, %$T4 # T3:T4 = A2*B3
|
||||
adcx %$T9, %$T3
|
||||
mov ($idxM0+24)(%$M0), %rdx
|
||||
adcx %rax, %$T4
|
||||
|
||||
adcx %rax, %$T4
|
||||
adox %$T7, %$T2
|
||||
adox %$T8, %$T3
|
||||
adox %rax, %$T4
|
||||
|
||||
mulx ($idxM1+ 0)(%$M1), %$T0, %$T5 # T5:T0 = A3*B0
|
||||
xor %rax, %rax
|
||||
mulx ($idxM1+ 8)(%$M1), %$T7, %$T6 # T6:T7 = A3*B1
|
||||
adcx %$T7, %$T5
|
||||
adox %$T0, %$T1
|
||||
mulx ($idxM1+16)(%$M1), %$T8, %$T7 # T7:T8 = A3*B2
|
||||
adcx %$T8, %$T6
|
||||
adox %$T5, %$T2
|
||||
mulx ($idxM1+24)(%$M1), %$T9, %$T8 # T8:T9 = A3*B3
|
||||
adcx %$T9, %$T7
|
||||
adcx %rax, %$T8
|
||||
adox %$T6, %$T3
|
||||
adox %$T7, %$T4
|
||||
adox %rax, %$T8
|
||||
mov %$T1, $idxC_24(%$C) # C3_final
|
||||
mov %$T2, $idxC_32(%$C) # C4_final
|
||||
mov %$T3, $idxC_40(%$C) # C5_final
|
||||
mov %$T4, $idxC_48(%$C) # C6_final
|
||||
mov %$T8, $idxC_56(%$C) # C7_final
|
||||
mov $ML24, %rdx
|
||||
mulx $MR0, %$T0, %$T5 # T5:T0 = A3*B0
|
||||
xor %rax, %rax
|
||||
mulx $MR8, %$T7, %$T6 # T6:T7 = A3*B1
|
||||
adcx %$T7, %$T5
|
||||
adox %$T0, %$T1
|
||||
mulx $MR16, %$T8, %$T7 # T7:T8 = A3*B2
|
||||
adcx %$T8, %$T6
|
||||
adox %$T5, %$T2
|
||||
mulx $MR24, %$T9, %$T8 # T8:T9 = A3*B3
|
||||
adcx %$T9, %$T7
|
||||
adcx %rax, %$T8
|
||||
adox %$T6, %$T3
|
||||
adox %$T7, %$T4
|
||||
adox %rax, %$T8
|
||||
mov %$T1, $D3 # DST3_final
|
||||
mov %$T2, $D4 # DST4_final
|
||||
mov %$T3, $D5 # DST5_final
|
||||
mov %$T4, $D6 # DST6_final
|
||||
mov %$T8, $D7 # DST7_final
|
||||
|
||||
___
|
||||
}
|
||||
|
||||
sub MUL128x320_SCHOOL() {
|
||||
sub MUL128x320_SCHOOL {
|
||||
my $idxM0 = shift;
|
||||
my $M0 = shift;
|
||||
my $M1 = shift;
|
||||
@ -154,8 +147,9 @@ sub MUL128x320_SCHOOL() {
|
||||
my $T7 = shift;
|
||||
my $T8 = shift;
|
||||
my $T9 = shift;
|
||||
my ($MUL0,$MUL8)=map("$idxM0+$_(%$M0)", (0,8));
|
||||
$code.=<<___;
|
||||
mov $idxM0(%$M0), %rdx
|
||||
mov $MUL0, %rdx
|
||||
mulx 0+$M1, %$T0, %$T1 # T0 <- C0_final
|
||||
mulx 8+$M1, %$T4, %$T2
|
||||
|
||||
@ -169,7 +163,7 @@ $code.=<<___;
|
||||
adox %$T6, %$T4
|
||||
adox %rax, %$T5
|
||||
|
||||
mov ($idxM0+8)(%$M0), %rdx
|
||||
mov $MUL8, %rdx
|
||||
mulx 0+$M1, %$T6, %$T7
|
||||
adcx %$T6, %$T1 # T1 <- C1_final
|
||||
adcx %$T7, %$T2
|
||||
@ -740,6 +734,7 @@ ${PREFIX}_mpdblsubx2_asm:
|
||||
mov %r9, 0x48(%rsp)
|
||||
mov %r10, 0x50(%rsp)
|
||||
mov %r11, 0x58(%rsp)
|
||||
|
||||
___
|
||||
|
||||
# [rcx+64] <- (AH+AL) x (BH+BL), low part
|
||||
@ -1549,8 +1544,7 @@ $code.=<<___;
|
||||
adc %rdx, %r8
|
||||
|
||||
xor %r9, %r9
|
||||
#mov 0x8(%rbx), %rax
|
||||
mov (p503p1+8)(%rip), %rax
|
||||
mov 0x8(%rbx), %rax
|
||||
mul %r11
|
||||
xor %r10, %r10
|
||||
add %rax, %r8
|
||||
|
Loading…
Reference in New Issue
Block a user