Change-Id: Ic362ed30d5338ae4fca34290b7d7c0b48f5a5fa3
This commit is contained in:
Henry Case 2019-04-19 18:48:24 +01:00
parent d46eed92fd
commit 1bb081fd8e

View File

@ -20,7 +20,7 @@ open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
*STDOUT=*OUT;
$PREFIX="sike";
$addx = 0;
$addx = 1;
$bmi2_adx = 1;
# Swaps the 16 bytes pointed to by %rdi and %rsi
@ -41,13 +41,13 @@ $code.=<<___;
___
}
sub MUL256_SCHOOL() {
sub MUL256_SCHOOL {
my $idxM0 = shift;
my $M0 = shift;
my $idxM1 = shift;
my $M1 = shift;
my $idxC = shift;
my $C = shift;
my $idxDST = shift;
my $DST = shift;
my $T0 = shift;
my $T1 = shift;
my $T2 = shift;
@ -58,89 +58,82 @@ sub MUL256_SCHOOL() {
my $T7 = shift;
my $T8 = shift;
my $T9 = shift;
# OZAPTF: Is this the best approach?
my $idxC_0 = $idxC;
my $idxC_8 = $idxC + 8;
my $idxC_16 = $idxC + 16;
my $idxC_24 = $idxC + 24;
my $idxC_32 = $idxC + 32;
my $idxC_40 = $idxC + 40;
my $idxC_48 = $idxC + 48;
my $idxC_56 = $idxC + 56;
my ($ML0,$ML8,$ML16,$ML24)=map("$idxM0+$_(%$M0)",(0,8,16,24));
my ($MR0,$MR8,$MR16,$MR24)=map("$idxM1+$_(%$M1)",(0,8,16,24));
my ($D0,$D1,$D2,$D3,$D4,$D5,$D6,$D7)=map("$idxDST+$_(%$DST)",(0,8,16,24,32,40,48,56));
$code.=<<___;
mov ($idxM0+0)(%$M0), %rdx
mulx ($idxM1+0)(%$M1), %$T1, %$T0 # T0:T1 = A0*B0
mov %$T1, $idxC_0(%$C) # C0_final
mulx ($idxM1+8)(%$M1), %$T2, %$T1 # T1:T2 = A0*B1
mov $ML0, %rdx
mulx $MR0, %$T1, %$T0 # T0:T1 = A0*B0
mov %$T1, $D0 # DST0_final
mulx $MR8, %$T2, %$T1 # T1:T2 = A0*B1
xor %rax, %rax
adox %$T2, %$T0
mulx ($idxM1+16)(%$M1), %$T3, %$T2 # T2:T3 = A0*B2
mulx $MR16,%$T3, %$T2 # T2:T3 = A0*B2
adox %$T3, %$T1
mulx ($idxM1+24)(%$M1), %$T4, %$T3 # T3:T4 = A0*B3
mulx $MR24,%$T4, %$T3 # T3:T4 = A0*B3
adox %$T4, %$T2
mov ($idxM0+8)(%$M0), %rdx
mulx ($idxM1+0)(%$M1), %$T4, %$T5 # T5:T4 = A1*B0
mov $ML8, %rdx
mulx $MR0, %$T4, %$T5 # T5:T4 = A1*B0
adox %rax, %$T3
xor %rax, %rax
mulx ($idxM1+8)(%$M1), %$T7, %$T6 # T6:T7 = A1*B1
mulx $MR8, %$T7, %$T6 # T6:T7 = A1*B1
adox %$T0, %$T4
mov %$T4, $idxC_8(%$C) # C1_final
mov %$T4, $D1 # DST1_final
adcx %$T7, %$T5
mulx ($idxM1+16)(%$M1), %$T8, %$T7 # T7:T8 = A1*B2
mulx $MR16,%$T8, %$T7 # T7:T8 = A1*B2
adcx %$T8, %$T6
adox %$T1, %$T5
mulx ($idxM1+24)(%$M1), %$T9, %$T8 # T8:T9 = A1*B3
mulx $MR24,%$T9, %$T8 # T8:T9 = A1*B3
adcx %$T9, %$T7
adcx %rax, %$T8
adox %$T2, %$T6
mov ($idxM0+16)(%$M0), %rdx
mulx ($idxM1+ 0)(%$M1), %$T0, %$T1 # T1:T0 = A2*B0
mov $ML16,%rdx
mulx $MR0, %$T0, %$T1 # T1:T0 = A2*B0
adox %$T3, %$T7
adox %rax, %$T8
xor %rax, %rax
mulx ($idxM1+ 8)(%$M1), %$T3, %$T2 # T2:T3 = A2*B1
mulx $MR8, %$T3, %$T2 # T2:T3 = A2*B1
adox %$T5, %$T0
mov %$T0, $idxC_16(%$C) # C2_final
mov %$T0, $D2 # DST2_final
adcx %$T3, %$T1
mulx ($idxM1+16)(%$M1), %$T4, %$T3 # T3:T4 = A2*B2
mulx $MR16,%$T4, %$T3 # T3:T4 = A2*B2
adcx %$T4, %$T2
adox %$T6, %$T1
mulx ($idxM1+24)(%$M1), %$T9, %$T4 # T3:T4 = A2*B3
mulx $MR24,%$T9, %$T4 # T4:T9 = A2*B3
adcx %$T9, %$T3
mov ($idxM0+24)(%$M0), %rdx
adcx %rax, %$T4
adcx %rax, %$T4
adox %$T7, %$T2
adox %$T8, %$T3
adox %rax, %$T4
mulx ($idxM1+ 0)(%$M1), %$T0, %$T5 # T5:T0 = A3*B0
xor %rax, %rax
mulx ($idxM1+ 8)(%$M1), %$T7, %$T6 # T6:T7 = A3*B1
adcx %$T7, %$T5
adox %$T0, %$T1
mulx ($idxM1+16)(%$M1), %$T8, %$T7 # T7:T8 = A3*B2
adcx %$T8, %$T6
adox %$T5, %$T2
mulx ($idxM1+24)(%$M1), %$T9, %$T8 # T8:T9 = A3*B3
adcx %$T9, %$T7
adcx %rax, %$T8
adox %$T6, %$T3
adox %$T7, %$T4
adox %rax, %$T8
mov %$T1, $idxC_24(%$C) # C3_final
mov %$T2, $idxC_32(%$C) # C4_final
mov %$T3, $idxC_40(%$C) # C5_final
mov %$T4, $idxC_48(%$C) # C6_final
mov %$T8, $idxC_56(%$C) # C7_final
mov $ML24, %rdx
mulx $MR0, %$T0, %$T5 # T5:T0 = A3*B0
xor %rax, %rax
mulx $MR8, %$T7, %$T6 # T6:T7 = A3*B1
adcx %$T7, %$T5
adox %$T0, %$T1
mulx $MR16, %$T8, %$T7 # T7:T8 = A3*B2
adcx %$T8, %$T6
adox %$T5, %$T2
mulx $MR24, %$T9, %$T8 # T8:T9 = A3*B3
adcx %$T9, %$T7
adcx %rax, %$T8
adox %$T6, %$T3
adox %$T7, %$T4
adox %rax, %$T8
mov %$T1, $D3 # DST3_final
mov %$T2, $D4 # DST4_final
mov %$T3, $D5 # DST5_final
mov %$T4, $D6 # DST6_final
mov %$T8, $D7 # DST7_final
___
}
sub MUL128x320_SCHOOL() {
sub MUL128x320_SCHOOL {
my $idxM0 = shift;
my $M0 = shift;
my $M1 = shift;
@ -154,8 +147,9 @@ sub MUL128x320_SCHOOL() {
my $T7 = shift;
my $T8 = shift;
my $T9 = shift;
my ($MUL0,$MUL8)=map("$idxM0+$_(%$M0)", (0,8));
$code.=<<___;
mov $idxM0(%$M0), %rdx
mov $MUL0, %rdx
mulx 0+$M1, %$T0, %$T1 # T0 <- C0_final
mulx 8+$M1, %$T4, %$T2
@ -169,7 +163,7 @@ $code.=<<___;
adox %$T6, %$T4
adox %rax, %$T5
mov ($idxM0+8)(%$M0), %rdx
mov $MUL8, %rdx
mulx 0+$M1, %$T6, %$T7
adcx %$T6, %$T1 # T1 <- C1_final
adcx %$T7, %$T2
@ -740,6 +734,7 @@ ${PREFIX}_mpdblsubx2_asm:
mov %r9, 0x48(%rsp)
mov %r10, 0x50(%rsp)
mov %r11, 0x58(%rsp)
___
# [rcx+64] <- (AH+AL) x (BH+BL), low part
@ -1549,8 +1544,7 @@ $code.=<<___;
adc %rdx, %r8
xor %r9, %r9
#mov 0x8(%rbx), %rax
mov (p503p1+8)(%rip), %rax
mov 0x8(%rbx), %rax
mul %r11
xor %r10, %r10
add %rax, %r8