Change-Id: I86560300c4cfe8f93f77beeb34dc63aef2f310d5
Henry Case 2019-04-21 01:48:58 +01:00
parent b2c4f09734
commit 77c4a5e297

third_party/sike/asm/fp-armv8.pl (new vendored file, 877 lines)

@@ -0,0 +1,877 @@
#! /usr/bin/env perl
#
# April 2019
#
# Abstract: field arithmetic in ARMv8 (AArch64) assembly for SIDH/p503
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
$PREFIX="sike";
$code.=<<___;
.section .rodata
.p503p1_nz_s8:
.quad 0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13
.quad 0x45C6BDDA77A4D01B, 0x4066F541811E1E60
.p503x2:
.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
.quad 0x57FFFFFFFFFFFFFF, 0x2610B7B44423CF41
.quad 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0
.quad 0x0080CDEA83023C3C
.text
___
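# The .p503x2 table stores 2*p503 in seven quadwords: words 1 and 2 of the
# eight-word value are identical, so the duplicate is kept once and fpadd/
# fpsub below simply reuse the register that holds it. A minimal sketch
# (hypothetical helper, not used by the generator) of how the stored quads
# expand to the full eight 64-bit words:
sub p503x2_words {
    my @q = @_;                              # the seven .quad values, least significant first
    return ($q[0], $q[1], $q[1], @q[2..6]);  # word 2 duplicates word 1
}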
sub mul128_comba_cut {
my ($A0,$A1,$B0,$B1,$C0,$C1,$C2,$C3,$T0)=@_;
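# Tail of a 128x128-bit Comba multiplication: the caller has already put
# lo/hi(A0*B0) into C0/C3 and lo/hi(A0*B1) into C1/C2; this fragment folds
# in the remaining partial products, leaving the full 256-bit product in
# C0-C3 (A0, B0 and B1 are clobbered as scratch).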
my $body=<<___;
mul $A0, $A1, $B0
umulh $B0, $A1, $B0
adds $C1, $C1, $C3
adc $C2, $C2, xzr
mul $T0, $A1, $B1
umulh $B1, $A1, $B1
adds $C1, $C1, $A0
adcs $C2, $C2, $B0
adc $C3, xzr, xzr
adds $C2, $C2, $T0
adc $C3, $C3, $B1
___
return $body;
}
sub mul256_karatsuba_comba {
my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
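# 256x256 -> 512-bit multiplication: one level of Karatsuba on top of the
# 128-bit Comba routine above. B is held in B0-B3; the low half of A is
# reloaded from [$M,#0] after A0-A1 are overwritten by the AH+AL sum. The
# 512-bit product is left in C0-C7, with T0/T1 used as scratch.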
# (AH+AL) x (BH+BL), low part
my $mul_low=&mul128_comba_cut($A0, $A1, $C6, $T1, $C2, $C3, $C4, $C5, $C7);
# AL x BL
my $mul_albl=&mul128_comba_cut($A0, $A1, $B0, $B1, $C0, $C1, $T1, $C7, $C6);
# AH x BH
my $mul_ahbh=&mul128_comba_cut($A2, $A3, $B2, $B3, $A0, $A1, $C6, $B0, $B1);
my $body=<<___;
# A0-A1 <- AH + AL, T0 <- mask
adds $A0, $A0, $A2
adcs $A1, $A1, $A3
adc $T0, xzr, xzr
# C6, T1 <- BH + BL, C7 <- mask
adds $C6, $B0, $B2
adcs $T1, $B1, $B3
adc $C7, xzr, xzr
# C0-C1 <- masked (BH + BL)
sub $C2, xzr, $T0
sub $C3, xzr, $C7
and $C0, $C6, $C2
and $C1, $T1, $C2
# C4-C5 <- masked (AH + AL), T0 <- combined carry
and $C4, $A0, $C3
and $C5, $A1, $C3
mul $C2, $A0, $C6
mul $C3, $A0, $T1
and $T0, $T0, $C7
# C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
adds $C0, $C4, $C0
umulh $C4, $A0, $T1
adcs $C1, $C5, $C1
umulh $C5, $A0, $C6
adc $T0, $T0, xzr
# C2-C5 <- (AH+AL) x (BH+BL), low part
$mul_low
ldp $A0, $A1, [$M,#0]
# C2-C5, T0 <- (AH+AL) x (BH+BL), final part
adds $C4, $C0, $C4
umulh $C7, $A0, $B0
umulh $T1, $A0, $B1
adcs $C5, $C1, $C5
mul $C0, $A0, $B0
mul $C1, $A0, $B1
adc $T0, $T0, xzr
# C0-C1, T1, C7 <- AL x BL
$mul_albl
# C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
mul $A0, $A2, $B2
umulh $B0, $A2, $B2
subs $C2, $C2, $C0
sbcs $C3, $C3, $C1
sbcs $C4, $C4, $T1
mul $A1, $A2, $B3
umulh $C6, $A2, $B3
sbcs $C5, $C5, $C7
sbc $T0, $T0, xzr
# A0, A1, C6, B0 <- AH x BH
$mul_ahbh
# C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs $C2, $C2, $A0
sbcs $C3, $C3, $A1
sbcs $C4, $C4, $C6
sbcs $C5, $C5, $B0
sbc $T0, $T0, xzr
adds $C2, $C2, $T1
adcs $C3, $C3, $C7
adcs $C4, $C4, $A0
adcs $C5, $C5, $A1
adcs $C6, $T0, $C6
adc $C7, $B0, xzr
___
return $body;
}
# 512-bit integer multiplication using Karatsuba (two levels),
# Comba (lower level).
# Operation: c [x2] = a [x0] * b [x1]
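# A minimal one-level Karatsuba reference model, as a hedged sketch
# (hypothetical helper, not used by the generator): the routine below applies
# the same identity twice, with Comba multiplication at the 128-bit level and
# the subtractions replaced by the masked-addition trick visible in the
# assembly.
use Math::BigInt;
sub karatsuba_ref {
    my ($a, $b, $h) = @_;                    # $a, $b: Math::BigInt; $h: half-width in bits (256 here)
    my $mask = (Math::BigInt->bone << $h) - 1;
    my ($al, $ah) = ($a & $mask, $a >> $h);  # split a = ah*2^h + al
    my ($bl, $bh) = ($b & $mask, $b >> $h);
    my $low  = $al * $bl;                    # AL x BL
    my $high = $ah * $bh;                    # AH x BH
    my $mid  = ($al + $ah) * ($bl + $bh) - $low - $high;   # cross terms via one extra multiply
    return $low + ($mid << $h) + ($high << (2 * $h));
}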
sub mul {
# (AH+AL) x (BH+BL), low part
my $mul_kc_low=&mul256_karatsuba_comba("x2","x26","x27","x28","x29","x11","x12","x13","x14","x8","x9","x10","x19","x20","x21","x22","x23","x24","x25");
my $mul_albl=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x20","x21","x22","x23","x24","x25","x26","x27","x28","x29");
my $mul_ahbh=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x20","x21","x22","x23","x24","x25","x26","x27","x28","x29");
my $body=<<___;
.global ${PREFIX}_mpmul
.align 4
${PREFIX}_mpmul:
sub sp, sp, #96
stp x19, x20, [sp,#0]
stp x21, x22, [sp,#16]
stp x23, x24, [sp,#32]
stp x25, x26, [sp,#48]
stp x27, x28, [sp,#64]
str x29, [sp, #80]
ldp x3, x4, [x0]
ldp x5, x6, [x0,#16]
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
// x26-x29 <- AH + AL, x7 <- mask
adds x26, x3, x7
adcs x27, x4, x8
adcs x28, x5, x9
adcs x29, x6, x10
adc x7, xzr, xzr
// x11-x14 <- BH + BL, x8 <- mask
adds x11, x11, x15
adcs x12, x12, x16
adcs x13, x13, x17
adcs x14, x14, x18
adc x8, xzr, xzr
// x15-x18 <- masked (BH + BL)
sub x9, xzr, x7
sub x10, xzr, x8
and x15, x11, x9
and x16, x12, x9
and x17, x13, x9
and x18, x14, x9
// x19-x22 <- masked (AH + AL), x7 <- combined carry
and x19, x26, x10
and x20, x27, x10
and x21, x28, x10
and x22, x29, x10
and x7, x7, x8
// x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1
adds x15, x15, x19
adcs x16, x16, x20
adcs x17, x17, x21
adcs x18, x18, x22
adc x7, x7, xzr
// x8-x10,x19-x23 <- (AH+AL) x (BH+BL), low part
stp x26, x27, [x2,#0]
$mul_kc_low
// x15-x18, x7 <- (AH+AL) x (BH+BL), final step
adds x15, x15, x20
adcs x16, x16, x21
adcs x17, x17, x22
adcs x18, x18, x23
adc x7, x7, xzr
// x20-x27 <- AL x BL
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
$mul_albl
// x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL
subs x8, x8, x20
sbcs x9, x9, x21
sbcs x10, x10, x22
sbcs x19, x19, x23
sbcs x15, x15, x24
sbcs x16, x16, x25
sbcs x17, x17, x26
sbcs x18, x18, x27
sbc x7, x7, xzr
stp x20, x21, [x2]
stp x22, x23, [x2,#16]
ldp x3, x4, [x0,#32]
ldp x5, x6, [x0,#48]
ldp x11, x12, [x1,#32]
ldp x13, x14, [x1,#48]
adds x8, x8, x24
adcs x9, x9, x25
adcs x10, x10, x26
adcs x19, x19, x27
adc x1, xzr, xzr
// x20-x27 <- AH x BH
add x0, x0, #32
$mul_ahbh
neg x1, x1
// x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs x8, x8, x20
sbcs x9, x9, x21
sbcs x10, x10, x22
sbcs x19, x19, x23
sbcs x15, x15, x24
sbcs x16, x16, x25
sbcs x17, x17, x26
sbcs x18, x18, x27
sbc x7, x7, xzr
stp x8, x9, [x2,#32]
stp x10, x19, [x2,#48]
adds x1, x1, #1
adcs x15, x15, x20
adcs x16, x16, x21
adcs x17, x17, x22
adcs x18, x18, x23
adcs x24, x7, x24
adcs x25, x25, xzr
adcs x26, x26, xzr
adc x27, x27, xzr
stp x15, x16, [x2,#64]
stp x17, x18, [x2,#80]
stp x24, x25, [x2,#96]
stp x26, x27, [x2,#112]
ldp x19, x20, [sp,#0]
ldp x21, x22, [sp,#16]
ldp x23, x24, [sp,#32]
ldp x25, x26, [sp,#48]
ldp x27, x28, [sp,#64]
ldr x29, [sp,#80]
add sp, sp, #96
ret
___
return $body;
}
$code.=&mul();
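# Tail of a 128x256-bit Comba multiplication: the caller pre-computes
# lo/hi(A0*B0) in C0/C3 and lo/hi(A0*B1) in C1/C2; this fragment accumulates
# the remaining partial products, leaving the 384-bit product A0:A1 * B0:B3
# in C0-C5 (T0-T3 are scratch).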
sub mul128x256_comba {
my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
my $body=<<___;
mul $T0, $A1, $B0
umulh $T1, $A1, $B0
adds $C1, $C1, $C3
adc $C2, $C2, xzr
mul $T2, $A0, $B2
umulh $T3, $A0, $B2
adds $C1, $C1, $T0
adcs $C2, $C2, $T1
adc $C3, xzr, xzr
mul $T0, $A1, $B1
umulh $T1, $A1, $B1
adds $C2, $C2, $T2
adcs $C3, $C3, $T3
adc $C4, xzr, xzr
mul $T2, $A0, $B3
umulh $T3, $A0, $B3
adds $C2, $C2, $T0
adcs $C3, $C3, $T1
adc $C4, $C4, xzr
mul $T0, $A1, $B2
umulh $T1, $A1, $B2
adds $C3, $C3, $T2
adcs $C4, $C4, $T3
adc $C5, xzr, xzr
mul $T2, $A1, $B3
umulh $T3, $A1, $B3
adds $C3, $C3, $T0
adcs $C4, $C4, $T1
adc $C5, $C5, xzr
adds $C4, $C4, $T2
adc $C5, $C5, $T3
___
return $body;
}
# Montgomery reduction
# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
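# A generic Montgomery-reduction (REDC) reference model, as a hedged sketch
# (hypothetical helper, not used by the generator): mc = ma * R^-1 mod p for
# R = 2^512. The assembly below relies on the prime being Montgomery-friendly
# (p503 = -1 mod 2^248), so each quotient digit is just the running low word
# and only products with the nonzero top of p503+1 (p503p1_nz_s8) are needed.
use Math::BigInt;
sub redc_ref {
    my ($ma, $p, $rbits) = @_;               # $ma, $p: Math::BigInt; $rbits: 512 here
    my $r    = Math::BigInt->bone << $rbits;
    my $pinv = $p->copy->bmodinv($r);        # p^-1 mod R
    my $q    = (($ma % $r) * ($r - $pinv)) % $r;   # q = -ma * p^-1 mod R
    my $mc   = ($ma + $q * $p) / $r;         # exact division by R
    $mc -= $p if $mc >= $p;                  # conditional final subtraction
    return $mc;                              # mc = ma * R^-1 mod p
}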
sub rdc {
my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $body=<<___;
.global ${PREFIX}_fprdc
.align 4
${PREFIX}_fprdc:
sub sp, sp, #96
stp x19, x20, [sp]
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp x25, x26, [sp, #48]
stp x27, x28, [sp, #64]
stp x29, x30, [sp, #80]
ldp x2, x3, [x0,#0] // a[0-1]
// Load the prime constant
adrp x23, .p503p1_nz_s8
add x23, x23, :lo12:.p503p1_nz_s8
ldp x24, x25, [x23, #0]
ldp x26, x27, [x23, #16]
// a[0-1] x p503p1_nz_s8 --> result: x4:x9
mul x4, x2, x24 // a[0] x p503p1_nz_s8[0]
umulh x7, x2, x24
mul x5, x2, x25 // a[0] x p503p1_nz_s8[1]
umulh x6, x2, x25
$mul01
ldp x3, x11, [x0,#16] // a[2-3]
ldp x12, x13, [x0,#32]
ldp x14, x15, [x0,#48]
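// x4:x9 <<= 56: p503+1 equals p503p1_nz_s8 * 2^248, so the partial product
// must be realigned by 56 bits before being added in starting at a[3]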
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x11, x4, x11 // a[3]
adcs x12, x5, x12 // a[4]
adcs x13, x6, x13
adcs x14, x7, x14
adcs x15, x8, x15
ldp x16, x17, [x0,#64]
ldp x18, x19, [x0,#80]
mul x4, x3, x24 // a[2] x p503p1_nz_s8[0]
umulh x7, x3, x24
adcs x16, x9, x16
adcs x17, x10, x17
adcs x18, xzr, x18
adcs x19, xzr, x19
ldp x20, x21, [x0,#96]
ldp x22, x23, [x0,#112]
mul x5, x3, x25 // a[2] x p503p1_nz_s8[1]
umulh x6, x3, x25
adcs x20, xzr, x20
adcs x21, xzr, x21
adcs x22, xzr, x22
adc x23, xzr, x23
// a[2-3] x p503p1_nz_s8 --> result: x4:x9
$mul23
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x13, x4, x13 // a[5]
adcs x14, x5, x14 // a[6]
adcs x15, x6, x15
adcs x16, x7, x16
mul x4, x12, x24 // a[4] x p503p1_nz_s8[0]
umulh x7, x12, x24
adcs x17, x8, x17
adcs x18, x9, x18
adcs x19, x10, x19
adcs x20, xzr, x20
mul x5, x12, x25 // a[4] x p503p1_nz_s8[1]
umulh x6, x12, x25
adcs x21, xzr, x21
adcs x22, xzr, x22
adc x23, xzr, x23
// a[4-5] x p503p1_nz_s8 --> result: x4:x9
$mul45
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x15, x4, x15 // a[7]
adcs x16, x5, x16 // a[8]
adcs x17, x6, x17
adcs x18, x7, x18
mul x4, x14, x24 // a[6] x p503p1_nz_s8[0]
umulh x7, x14, x24
adcs x19, x8, x19
adcs x20, x9, x20
adcs x21, x10, x21
mul x5, x14, x25 // a[6] x p503p1_nz_s8[1]
umulh x6, x14, x25
adcs x22, xzr, x22
adc x23, xzr, x23
// a[6-7] x p503p1_nz_s8 --> result: x4:x9
$mul67
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x17, x4, x17
adcs x18, x5, x18
adcs x19, x6, x19
adcs x20, x7, x20
stp x16, x17, [x1,#0] // Final result
stp x18, x19, [x1,#16]
adcs x21, x8, x21
adcs x22, x9, x22
adc x23, x10, x23
stp x20, x21, [x1,#32]
stp x22, x23, [x1,#48]
ldp x19, x20, [sp]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp x25, x26, [sp, #48]
ldp x27, x28, [sp, #64]
ldp x29, x30, [sp, #80]
add sp, sp, #96
ret
___
return $body;
}
$code.=&rdc();
# Field addition
# Operation: c [x2] = a [x0] + b [x1]
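# A hedged reference model of the addition below (hypothetical helper, not
# used by the generator): operands are kept in [0, 2*p503), the usual
# convention for this reduction strategy, and the result is brought back into
# that range by subtracting 2*p503 and conditionally adding it back, which is
# what the masked correction in the assembly implements.
sub fpadd_ref {
    my ($a, $b, $p2) = @_;          # Math::BigInt values; $p2 = 2*p503
    my $c = $a + $b - $p2;          # subtract 2*p503 unconditionally
    $c += $p2 if $c < 0;            # add it back if the subtraction borrowed
    return $c;                      # result again in [0, 2*p503)
}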
$code.=<<___;
.global ${PREFIX}_fpadd
.align 4
${PREFIX}_fpadd:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
// Add a + b
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adc x10, x10, x18
// Subtract 2xp503
adrp x18, .p503x2
add x18, x18, :lo12:.p503x2
ldp x11, x12, [x18, #0]
ldp x13, x14, [x18, #16]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x12
sbcs x6, x6, x13
sbcs x7, x7, x14
ldp x15, x16, [x18, #32]
ldr x17, [x18, #48]
sbcs x8, x8, x15
sbcs x9, x9, x16
sbcs x10, x10, x17
sbc x18, xzr, xzr
// Add 2xp503 anded with the mask in x18
and x11, x11, x18
and x12, x12, x18
and x13, x13, x18
and x14, x14, x18
and x15, x15, x18
and x16, x16, x18
and x17, x17, x18
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x12
adcs x6, x6, x13
adcs x7, x7, x14
adcs x8, x8, x15
adcs x9, x9, x16
adc x10, x10, x17
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ret
___
# Field subtraction
# Operation: c [x2] = a [x0] - b [x1]
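# The same reference style for subtraction (hypothetical helper, not used by
# the generator): compute a - b and add 2*p503 back when the subtraction
# borrows, mirroring the masked addition below.
sub fpsub_ref {
    my ($a, $b, $p2) = @_;          # Math::BigInt values; $p2 = 2*p503
    my $c = $a - $b;
    $c += $p2 if $c < 0;            # correct a borrow with 2*p503
    return $c;
}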
$code.=<<___;
.global ${PREFIX}_fpsub
.align 4
${PREFIX}_fpsub:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
// Subtract a - b
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
sbcs x7, x7, x15
sbcs x8, x8, x16
sbcs x9, x9, x17
sbcs x10, x10, x18
sbc x18, xzr, xzr
// Add 2xp503 anded with the mask in x18
adrp x19, .p503x2
add x19, x19, :lo12:.p503x2
ldp x11, x12, [x19, #0]
ldp x13, x14, [x19, #16]
and x11, x11, x18
and x12, x12, x18
and x13, x13, x18
and x14, x14, x18
ldp x15, x16, [x19, #32]
ldr x17, [x19, #48]
and x15, x15, x18
and x16, x16, x18
and x17, x17, x18
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x12
adcs x6, x6, x13
adcs x7, x7, x14
adcs x8, x8, x15
adcs x9, x9, x16
adc x10, x10, x17
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ret
___
# 503-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
.global ${PREFIX}_mpadd_asm
.align 4
${PREFIX}_mpadd_asm:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adc x10, x10, x18
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ret
___
# 2x503-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
.global ${PREFIX}_mpadd503x2_asm
.align 4
${PREFIX}_mpadd503x2_asm:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adcs x10, x10, x18
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ldp x3, x4, [x0,#64]
ldp x5, x6, [x0,#80]
ldp x11, x12, [x1,#64]
ldp x13, x14, [x1,#80]
adcs x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#96]
ldp x9, x10, [x0,#112]
ldp x15, x16, [x1,#96]
ldp x17, x18, [x1,#112]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adc x10, x10, x18
stp x3, x4, [x2,#64]
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
stp x9, x10, [x2,#112]
ret
___
# 2x503-bit multiprecision subtraction
# Operation: c [x2] = a [x0] - b [x1]. Returns borrow mask
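# A hedged sketch of the calling convention (hypothetical helper, not used by
# the generator): the routine writes the difference modulo 2^1024 and returns
# in x0 an all-ones mask when a < b (the final sbc of x0 with itself), and
# zero otherwise.
use Math::BigInt;
sub mpsubx2_ref {
    my ($a, $b) = @_;                       # Math::BigInt values < 2^1024
    my $r    = Math::BigInt->bone << 1024;
    my $diff = ($a - $b) % $r;              # difference, wrapped modulo 2^1024
    my $mask = $a < $b ? ~0 : 0;            # borrow mask (all ones on a 64-bit perl)
    return ($diff, $mask);
}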
$code.=<<___;
.global ${PREFIX}_mpsubx2_asm
.align 4
${PREFIX}_mpsubx2_asm:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
sbcs x7, x7, x15
sbcs x8, x8, x16
sbcs x9, x9, x17
sbcs x10, x10, x18
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ldp x3, x4, [x0,#64]
ldp x5, x6, [x0,#80]
ldp x11, x12, [x1,#64]
ldp x13, x14, [x1,#80]
sbcs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#96]
ldp x9, x10, [x0,#112]
ldp x15, x16, [x1,#96]
ldp x17, x18, [x1,#112]
sbcs x7, x7, x15
sbcs x8, x8, x16
sbcs x9, x9, x17
sbcs x10, x10, x18
sbc x0, xzr, xzr
stp x3, x4, [x2,#64]
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
stp x9, x10, [x2,#112]
ret
___
# Double 2x503-bit multiprecision subtraction
# Operation: c [x2] = c [x2] - a [x0] - b [x1]
$code.=<<___;
.global ${PREFIX}_mpdblsubx2_asm
.align 4
${PREFIX}_mpdblsubx2_asm:
sub sp, sp, #32
stp x27, x28, [sp, #0]
stp x29, x30, [sp, #16]
ldp x3, x4, [x2,#0]
ldp x5, x6, [x2,#16]
ldp x7, x8, [x2,#32]
ldp x9, x10, [x2,#48]
ldp x11, x12, [x2,#64]
ldp x13, x14, [x2,#80]
ldp x15, x16, [x2,#96]
ldp x17, x18, [x2,#112]
ldp x27, x28, [x0,#0]
ldp x29, x30, [x0,#16]
subs x3, x3, x27
sbcs x4, x4, x28
sbcs x5, x5, x29
sbcs x6, x6, x30
ldp x27, x28, [x0,#32]
ldp x29, x30, [x0,#48]
sbcs x7, x7, x27
sbcs x8, x8, x28
sbcs x9, x9, x29
sbcs x10, x10, x30
ldp x27, x28, [x0,#64]
ldp x29, x30, [x0,#80]
sbcs x11, x11, x27
sbcs x12, x12, x28
sbcs x13, x13, x29
sbcs x14, x14, x30
ldp x27, x28, [x0,#96]
ldp x29, x30, [x0,#112]
sbcs x15, x15, x27
sbcs x16, x16, x28
sbcs x17, x17, x29
sbc x18, x18, x30
ldp x27, x28, [x1,#0]
ldp x29, x30, [x1,#16]
subs x3, x3, x27
sbcs x4, x4, x28
sbcs x5, x5, x29
sbcs x6, x6, x30
ldp x27, x28, [x1,#32]
ldp x29, x30, [x1,#48]
sbcs x7, x7, x27
sbcs x8, x8, x28
sbcs x9, x9, x29
sbcs x10, x10, x30
ldp x27, x28, [x1,#64]
ldp x29, x30, [x1,#80]
sbcs x11, x11, x27
sbcs x12, x12, x28
sbcs x13, x13, x29
sbcs x14, x14, x30
ldp x27, x28, [x1,#96]
ldp x29, x30, [x1,#112]
sbcs x15, x15, x27
sbcs x16, x16, x28
sbcs x17, x17, x29
sbc x18, x18, x30
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
stp x11, x12, [x2,#64]
stp x13, x14, [x2,#80]
stp x15, x16, [x2,#96]
stp x17, x18, [x2,#112]
ldp x27, x28, [sp, #0]
ldp x29, x30, [sp, #16]
add sp, sp, #32
ret
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
print $_,"\n";
}
close STDOUT;