From 77c4a5e297bce709a280e1db526473084e370a7c Mon Sep 17 00:00:00 2001
From: Kris Kwiatkowski
Date: Sun, 21 Apr 2019 01:48:58 +0100
Subject: [PATCH] WIP

Change-Id: I86560300c4cfe8f93f77beeb34dc63aef2f310d5
---
 third_party/sike/asm/fp-armv8.pl | 877 +++++++++++++++++++++++++++++++
 1 file changed, 877 insertions(+)
 create mode 100644 third_party/sike/asm/fp-armv8.pl

diff --git a/third_party/sike/asm/fp-armv8.pl b/third_party/sike/asm/fp-armv8.pl
new file mode 100644
index 00000000..2019c1af
--- /dev/null
+++ b/third_party/sike/asm/fp-armv8.pl
@@ -0,0 +1,877 @@
+#! /usr/bin/env perl
+#
+# April 2019
+#
+# Abstract: field arithmetic in ARMv8 assembly for SIDH/p503
+
+$flavour = shift;
+$output = shift;
+if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
+
+$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
+( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
+( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or
+die "can't locate arm-xlate.pl";
+
+open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"";
+*STDOUT=*OUT;
+
+$PREFIX="sike";
+
+$code.=<<___;
+    .section .rodata
+
+    .p503p1_nz_s8:
+    .quad 0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13
+    .quad 0x45C6BDDA77A4D01B, 0x4066F541811E1E60
+
+    .p503x2:
+    .quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
+    .quad 0x57FFFFFFFFFFFFFF, 0x2610B7B44423CF41
+    .quad 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0
+    .quad 0x0080CDEA83023C3C
+
+    .text
+
+___
+
+sub mul128_comba_cut {
+    my ($A0,$A1,$B0,$B1,$C0,$C1,$C2,$C3,$T0)=@_;
+    my $body=<<___;
+    mul $A0, $A1, $B0
+    umulh $B0, $A1, $B0
+    adds $C1, $C1, $C3
+    adc $C2, $C2, xzr
+
+    mul $T0, $A1, $B1
+    umulh $B1, $A1, $B1
+    adds $C1, $C1, $A0
+    adcs $C2, $C2, $B0
+    adc $C3, xzr, xzr
+
+    adds $C2, $C2, $T0
+    adc $C3, $C3, $B1
+___
+    return $body;
+}
+
+sub mul256_karatsuba_comba {
+    my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
+    # (AH+AL) x (BH+BL), low part
+    my $mul_low=&mul128_comba_cut($A0, $A1, $C6, $T1, $C2, $C3, $C4, $C5, $C7);
+    # AL x BL
+    my $mul_albl=&mul128_comba_cut($A0, $A1, $B0, $B1, $C0, $C1, $T1, $C7, $C6);
+    # AH x BH
+    my $mul_ahbh=&mul128_comba_cut($A2, $A3, $B2, $B3, $A0, $A1, $C6, $B0, $B1);
+    my $body=<<___;
+    // A0-A1 <- AH + AL, T0 <- mask
+    adds $A0, $A0, $A2
+    adcs $A1, $A1, $A3
+    adc $T0, xzr, xzr
+
+    // C6, T1 <- BH + BL, C7 <- mask
+    adds $C6, $B0, $B2
+    adcs $T1, $B1, $B3
+    adc $C7, xzr, xzr
+
+    // C0-C1 <- masked (BH + BL)
+    sub $C2, xzr, $T0
+    sub $C3, xzr, $C7
+    and $C0, $C6, $C2
+    and $C1, $T1, $C2
+
+    // C4-C5 <- masked (AH + AL), T0 <- combined carry
+    and $C4, $A0, $C3
+    and $C5, $A1, $C3
+    mul $C2, $A0, $C6
+    mul $C3, $A0, $T1
+    and $T0, $T0, $C7
+
+    // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
+    adds $C0, $C4, $C0
+    umulh $C4, $A0, $T1
+    adcs $C1, $C5, $C1
+    umulh $C5, $A0, $C6
+    adc $T0, $T0, xzr
+
+    // C2-C5 <- (AH+AL) x (BH+BL), low part
+    $mul_low
+    ldp $A0, $A1, [$M,#0]
+
+    // C2-C5, T0 <- (AH+AL) x (BH+BL), final part
+    adds $C4, $C0, $C4
+    umulh $C7, $A0, $B0
+    umulh $T1, $A0, $B1
+    adcs $C5, $C1, $C5
+    mul $C0, $A0, $B0
+    mul $C1, $A0, $B1
+    adc $T0, $T0, xzr
+
+    // C0-C1, T1, C7 <- AL x BL
+    $mul_albl
+
+    // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
+    mul $A0, $A2, $B2
+    umulh $B0, $A2, $B2
+    subs $C2, $C2, $C0
+    sbcs $C3, $C3, $C1
+    sbcs $C4, $C4, $T1
+    mul $A1, $A2, $B3
+    umulh $C6, $A2, $B3
+    sbcs $C5, $C5, $C7
+    sbc $T0, $T0, xzr
+
+    // A0, A1, C6, B0 <- AH x BH
+    $mul_ahbh
+
+    // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+    subs $C2, $C2, $A0
+    sbcs $C3, $C3, $A1
+    sbcs $C4, $C4, $C6
+    sbcs $C5, $C5, $B0
+    sbc $T0, $T0, xzr
+
+    adds $C2, $C2, $T1
+    adcs $C3, $C3, $C7
+    adcs $C4, $C4, $A0
+    adcs $C5, $C5, $A1
+    adcs $C6, $T0, $C6
+    adc $C7, $B0, xzr
+___
+    return $body;
+}
+
+# 512-bit integer multiplication using Karatsuba (two levels),
+# Comba (lower level).
+# Operation: c [x2] = a [x0] * b [x1]
+sub mul {
+    # (AH+AL) x (BH+BL), low part
+    my $mul_kc_low=&mul256_karatsuba_comba("x2","x26","x27","x28","x29","x11","x12","x13","x14","x8","x9","x10","x19","x20","x21","x22","x23","x24","x25");
+    my $mul_albl=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x20","x21","x22","x23","x24","x25","x26","x27","x28","x29");
+    my $mul_ahbh=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x20","x21","x22","x23","x24","x25","x26","x27","x28","x29");
+
+    my $body=<<___;
+    .global ${PREFIX}_mpmul
+    .align 4
+    ${PREFIX}_mpmul:
+    sub sp, sp, #96
+    stp x19, x20, [sp,#0]
+    stp x21, x22, [sp,#16]
+    stp x23, x24, [sp,#32]
+    stp x25, x26, [sp,#48]
+    stp x27, x28, [sp,#64]
+    str x29, [sp, #80]
+
+    ldp x3, x4, [x0]
+    ldp x5, x6, [x0,#16]
+    ldp x7, x8, [x0,#32]
+    ldp x9, x10, [x0,#48]
+    ldp x11, x12, [x1,#0]
+    ldp x13, x14, [x1,#16]
+    ldp x15, x16, [x1,#32]
+    ldp x17, x18, [x1,#48]
+
+    // x26-x29 <- AH + AL, x7 <- mask
+    adds x26, x3, x7
+    adcs x27, x4, x8
+    adcs x28, x5, x9
+    adcs x29, x6, x10
+    adc x7, xzr, xzr
+
+    // x11-x14 <- BH + BL, x8 <- mask
+    adds x11, x11, x15
+    adcs x12, x12, x16
+    adcs x13, x13, x17
+    adcs x14, x14, x18
+    adc x8, xzr, xzr
+
+    // x15-x18 <- masked (BH + BL)
+    sub x9, xzr, x7
+    sub x10, xzr, x8
+    and x15, x11, x9
+    and x16, x12, x9
+    and x17, x13, x9
+    and x18, x14, x9
+
+    // x19-x22 <- masked (AH + AL), x7 <- combined carry
+    and x19, x26, x10
+    and x20, x27, x10
+    and x21, x28, x10
+    and x22, x29, x10
+    and x7, x7, x8
+
+    // x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1
+    adds x15, x15, x19
+    adcs x16, x16, x20
+    adcs x17, x17, x21
+    adcs x18, x18, x22
+    adc x7, x7, xzr
+
+    // x8-x10,x19-x23 <- (AH+AL) x (BH+BL), low part
+    stp x26, x27, [x2,#0]
+    $mul_kc_low
+
+    // x15-x18, x7 <- (AH+AL) x (BH+BL), final step
+    adds x15, x15, x20
+    adcs x16, x16, x21
+    adcs x17, x17, x22
+    adcs x18, x18, x23
+    adc x7, x7, xzr
+
+    // x20-x27 <- AL x BL
+    ldp x11, x12, [x1,#0]
+    ldp x13, x14, [x1,#16]
+    $mul_albl
+
+    // x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL
+    subs x8, x8, x20
+    sbcs x9, x9, x21
+    sbcs x10, x10, x22
+    sbcs x19, x19, x23
+    sbcs x15, x15, x24
+    sbcs x16, x16, x25
+    sbcs x17, x17, x26
+    sbcs x18, x18, x27
+    sbc x7, x7, xzr
+
+    stp x20, x21, [x2]
+    stp x22, x23, [x2,#16]
+
+    ldp x3, x4, [x0,#32]
+    ldp x5, x6, [x0,#48]
+    ldp x11, x12, [x1,#32]
+    ldp x13, x14, [x1,#48]
+
+    adds x8, x8, x24
+    adcs x9, x9, x25
+    adcs x10, x10, x26
+    adcs x19, x19, x27
+    adc x1, xzr, xzr
+
+    // x20-x27 <- AH x BH
+    add x0, x0, #32
+    $mul_ahbh
+    neg x1, x1
+
+    // x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
+    subs x8, x8, x20
+    sbcs x9, x9, x21
+    sbcs x10, x10, x22
+    sbcs x19, x19, x23
+    sbcs x15, x15, x24
+    sbcs x16, x16, x25
+    sbcs x17, x17, x26
+    sbcs x18, x18, x27
+    sbc x7, x7, xzr
+
+    stp x8, x9, [x2,#32]
+    stp x10, x19, [x2,#48]
+
+    adds x1, x1, #1
+    adcs x15, x15, x20
+    adcs x16, x16, x21
+    adcs x17, x17, x22
+    adcs x18, x18, x23
+    adcs x24, x7, x24
+    adcs x25, x25, xzr
+    adcs x26, x26, xzr
+    adc x27, x27, xzr
+
+    stp x15, x16, [x2,#64]
+    stp x17, x18, [x2,#80]
+    stp x24, x25, [x2,#96]
+    stp x26, x27, [x2,#112]
+
+    ldp x19, x20, [sp,#0]
+    ldp x21, x22, [sp,#16]
+    ldp x23, x24, [sp,#32]
+    ldp x25, x26, [sp,#48]
+    ldp x27, x28, [sp,#64]
+    ldr x29, [sp,#80]
+    add sp, sp, 
#96 + ret +___ + return $body; +} +$code.=&mul(); + +sub mul128x256_comba { + my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_; + my $body=<<___; + mul $T0, $A1, $B0 + umulh $T1, $A1, $B0 + adds $C1, $C1, $C3 + adc $C2, $C2, xzr + + mul $T2, $A0, $B2 + umulh $T3, $A0, $B2 + adds $C1, $C1, $T0 + adcs $C2, $C2, $T1 + adc $C3, xzr, xzr + + mul $T0, $A1, $B1 + umulh $T1, $A1, $B1 + adds $C2, $C2, $T2 + adcs $C3, $C3, $T3 + adc $C4, xzr, xzr + + mul $T2, $A0, $B3 + umulh $T3, $A0, $B3 + adds $C2, $C2, $T0 + adcs $C3, $C3, $T1 + adc $C4, $C4, xzr + + mul $T0, $A1, $B2 + umulh $T1, $A1, $B2 + adds $C3, $C3, $T2 + adcs $C4, $C4, $T3 + adc $C5, xzr, xzr + + mul $T2, $A1, $B3 + umulh $T3, $A1, $B3 + adds $C3, $C3, $T0 + adcs $C4, $C4, $T1 + adc $C5, $C5, xzr + adds $C4, $C4, $T2 + adc $C5, $C5, $T3 + +___ + return $body; +} + +# Montgomery reduction +# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 +# Operation: mc [x1] = ma [x0] +# NOTE: ma=mc is not allowed +sub rdc { + my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10"); + my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10"); + my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10"); + my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10"); + my $body=<<___; + .global ${PREFIX}_fprdc + .align 4 + ${PREFIX}_fprdc: + sub sp, sp, #96 + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + ldp x2, x3, [x0,#0] // a[0-1] + + // Load the prime constant + adrp x23, .p503p1_nz_s8 + add x23, x23, :lo12:.p503p1_nz_s8 + ldp x24, x25, [x23, #0] + ldp x26, x27, [x23, #16] + + // a[0-1] x p503p1_nz_s8 --> result: x4:x9 + mul x4, x2, x24 // a[0] x p503p1_nz_s8[0] + umulh x7, x2, x24 + mul x5, x2, x25 // a[0] x p503p1_nz_s8[1] + umulh x6, x2, x25 + + $mul01 + + ldp x3, x11, [x0,#16] // a[2] + ldp x12, x13, [x0,#32] + ldp x14, x15, [x0,#48] + + orr x10, xzr, x9, lsr #8 + lsl x9, x9, #56 + orr x9, x9, x8, lsr #8 + lsl x8, x8, #56 + orr x8, x8, x7, lsr #8 + lsl x7, x7, #56 + orr x7, x7, x6, lsr #8 + lsl x6, x6, #56 + orr x6, x6, x5, lsr #8 + lsl x5, x5, #56 + orr x5, x5, x4, lsr #8 + lsl x4, x4, #56 + + adds x11, x4, x11 // a[3] + adcs x12, x5, x12 // a[4] + adcs x13, x6, x13 + adcs x14, x7, x14 + adcs x15, x8, x15 + ldp x16, x17, [x0,#64] + ldp x18, x19, [x0,#80] + mul x4, x3, x24 // a[2] x p503p1_nz_s8[0] + umulh x7, x3, x24 + adcs x16, x9, x16 + adcs x17, x10, x17 + adcs x18, xzr, x18 + adcs x19, xzr, x19 + ldp x20, x21, [x0,#96] + ldp x22, x23, [x0,#112] + mul x5, x3, x25 // a[2] x p503p1_nz_s8[1] + umulh x6, x3, x25 + adcs x20, xzr, x20 + adcs x21, xzr, x21 + adcs x22, xzr, x22 + adc x23, xzr, x23 + + // a[2-3] x p503p1_nz_s8 --> result: x4:x9 + $mul23 + + orr x10, xzr, x9, lsr #8 + lsl x9, x9, #56 + orr x9, x9, x8, lsr #8 + lsl x8, x8, #56 + orr x8, x8, x7, lsr #8 + lsl x7, x7, #56 + orr x7, x7, x6, lsr #8 + lsl x6, x6, #56 + orr x6, x6, x5, lsr #8 + lsl x5, x5, #56 + orr x5, x5, x4, lsr #8 + lsl x4, x4, #56 + + adds x13, x4, x13 // a[5] + adcs x14, x5, x14 // a[6] + adcs x15, x6, x15 + adcs x16, x7, x16 + mul x4, x12, x24 // a[4] x p503p1_nz_s8[0] + umulh x7, x12, x24 + adcs x17, x8, x17 + adcs x18, x9, x18 + adcs x19, x10, x19 + adcs x20, xzr, x20 + 
mul x5, x12, x25    // a[4] x p503p1_nz_s8[1]
+    umulh x6, x12, x25
+    adcs x21, xzr, x21
+    adcs x22, xzr, x22
+    adc x23, xzr, x23
+
+    // a[4-5] x p503p1_nz_s8 --> result: x4:x9
+    $mul45
+
+    orr x10, xzr, x9, lsr #8
+    lsl x9, x9, #56
+    orr x9, x9, x8, lsr #8
+    lsl x8, x8, #56
+    orr x8, x8, x7, lsr #8
+    lsl x7, x7, #56
+    orr x7, x7, x6, lsr #8
+    lsl x6, x6, #56
+    orr x6, x6, x5, lsr #8
+    lsl x5, x5, #56
+    orr x5, x5, x4, lsr #8
+    lsl x4, x4, #56
+
+    adds x15, x4, x15    // a[7]
+    adcs x16, x5, x16    // a[8]
+    adcs x17, x6, x17
+    adcs x18, x7, x18
+    mul x4, x14, x24    // a[6] x p503p1_nz_s8[0]
+    umulh x7, x14, x24
+    adcs x19, x8, x19
+    adcs x20, x9, x20
+    adcs x21, x10, x21
+    mul x5, x14, x25    // a[6] x p503p1_nz_s8[1]
+    umulh x6, x14, x25
+    adcs x22, xzr, x22
+    adc x23, xzr, x23
+
+    // a[6-7] x p503p1_nz_s8 --> result: x4:x9
+    $mul67
+
+    orr x10, xzr, x9, lsr #8
+    lsl x9, x9, #56
+    orr x9, x9, x8, lsr #8
+    lsl x8, x8, #56
+    orr x8, x8, x7, lsr #8
+    lsl x7, x7, #56
+    orr x7, x7, x6, lsr #8
+    lsl x6, x6, #56
+    orr x6, x6, x5, lsr #8
+    lsl x5, x5, #56
+    orr x5, x5, x4, lsr #8
+    lsl x4, x4, #56
+
+    adds x17, x4, x17
+    adcs x18, x5, x18
+    adcs x19, x6, x19
+    adcs x20, x7, x20
+    stp x16, x17, [x1,#0]    // Final result
+    stp x18, x19, [x1,#16]
+    adcs x21, x8, x21
+    adcs x22, x9, x22
+    adc x23, x10, x23
+    stp x20, x21, [x1,#32]
+    stp x22, x23, [x1,#48]
+
+    ldp x19, x20, [sp]
+    ldp x21, x22, [sp, #16]
+    ldp x23, x24, [sp, #32]
+    ldp x25, x26, [sp, #48]
+    ldp x27, x28, [sp, #64]
+    ldp x29, x30, [sp, #80]
+    add sp, sp, #96
+    ret
+
+___
+}
+
+$code.=&rdc();
+
+
+# Field addition
+# Operation: c [x2] = a [x0] + b [x1]
+$code.=<<___;
+    .global ${PREFIX}_fpadd
+    .align 4
+    ${PREFIX}_fpadd:
+    ldp x3, x4, [x0,#0]
+    ldp x5, x6, [x0,#16]
+    ldp x11, x12, [x1,#0]
+    ldp x13, x14, [x1,#16]
+
+    // Add a + b
+    adds x3, x3, x11
+    adcs x4, x4, x12
+    adcs x5, x5, x13
+    adcs x6, x6, x14
+    ldp x7, x8, [x0,#32]
+    ldp x9, x10, [x0,#48]
+    ldp x15, x16, [x1,#32]
+    ldp x17, x18, [x1,#48]
+    adcs x7, x7, x15
+    adcs x8, x8, x16
+    adcs x9, x9, x17
+    adc x10, x10, x18
+
+    // Subtract 2xp503
+    adrp x18, .p503x2
+    add x18, x18, :lo12:.p503x2
+    ldp x11, x12, [x18, #0]
+    ldp x13, x14, [x18, #16]
+    subs x3, x3, x11
+    sbcs x4, x4, x12
+    sbcs x5, x5, x12
+    sbcs x6, x6, x13
+    sbcs x7, x7, x14
+
+    ldp x15, x16, [x18, #32]
+    ldr x17, [x18, #48]
+    sbcs x8, x8, x15
+    sbcs x9, x9, x16
+    sbcs x10, x10, x17
+    sbc x18, xzr, xzr
+
+    // Add 2xp503 anded with the mask in x18
+    and x11, x11, x18
+    and x12, x12, x18
+    and x13, x13, x18
+    and x14, x14, x18
+    and x15, x15, x18
+    and x16, x16, x18
+    and x17, x17, x18
+
+    adds x3, x3, x11
+    adcs x4, x4, x12
+    adcs x5, x5, x12
+    adcs x6, x6, x13
+    adcs x7, x7, x14
+    adcs x8, x8, x15
+    adcs x9, x9, x16
+    adc x10, x10, x17
+
+    stp x3, x4, [x2,#0]
+    stp x5, x6, [x2,#16]
+    stp x7, x8, [x2,#32]
+    stp x9, x10, [x2,#48]
+    ret
+
+___
+
+# Field subtraction
+# Operation: c [x2] = a [x0] - b [x1]
+$code.=<<___;
+    .global ${PREFIX}_fpsub
+    .align 4
+    ${PREFIX}_fpsub:
+    ldp x3, x4, [x0,#0]
+    ldp x5, x6, [x0,#16]
+    ldp x11, x12, [x1,#0]
+    ldp x13, x14, [x1,#16]
+
+    // Subtract a - b
+    subs x3, x3, x11
+    sbcs x4, x4, x12
+    sbcs x5, x5, x13
+    sbcs x6, x6, x14
+    ldp x7, x8, [x0,#32]
+    ldp x9, x10, [x0,#48]
+    ldp x15, x16, [x1,#32]
+    ldp x17, x18, [x1,#48]
+    sbcs x7, x7, x15
+    sbcs x8, x8, x16
+    sbcs x9, x9, x17
+    sbcs x10, x10, x18
+    sbc x18, xzr, xzr
+
+    // Add 2xp503 anded with the mask in x18
+    adrp x19, .p503x2
+    add x19, x19, :lo12:.p503x2
+    ldp x11, x12, [x19, #0]
+    ldp x13, x14, [x19, #16]
+    and x11, x11, x18
+    and x12, x12, x18
and x13, x13, x18 + and x14, x14, x18 + ldp x15, x16, [x19, #32] + ldr x17, [x19, #48] + and x15, x15, x18 + and x16, x16, x18 + and x17, x17, x18 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + ret +___ + +# 503-bit multiprecision addition +# Operation: c [x2] = a [x0] + b [x1] +$code.=<<___; + .global ${PREFIX}_mpadd_asm + .align 4 + ${PREFIX}_mpadd_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adc x10, x10, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + ret +___ + + +# 2x503-bit multiprecision addition +# Operation: c [x2] = a [x0] + b [x1] +$code.=<<___; + .global ${PREFIX}_mpadd503x2_asm + .align 4 + ${PREFIX}_mpadd503x2_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adcs x10, x10, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + adcs x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x9, x10, [x0,#112] + ldp x15, x16, [x1,#96] + ldp x17, x18, [x1,#112] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adc x10, x10, x18 + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + stp x9, x10, [x2,#112] + ret +___ + + + +# 2x503-bit multiprecision subtraction +# Operation: c [x2] = a [x0] - b [x1]. 
Returns borrow mask +$code.=<<___; + .global ${PREFIX}_mpsubx2_asm + .align 4 + ${PREFIX}_mpsubx2_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbcs x10, x10, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x9, x10, [x0,#112] + ldp x15, x16, [x1,#96] + ldp x17, x18, [x1,#112] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbcs x10, x10, x18 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + stp x9, x10, [x2,#112] + ret +___ + + +# Double 2x503-bit multiprecision subtraction +# Operation: c [x2] = c [x2] - a [x0] - b [x1] +$code.=<<___; + .global ${PREFIX}_mpdblsubx2_asm + .align 4 + ${PREFIX}_mpdblsubx2_asm: + sub sp, sp, #32 + stp x27, x28, [sp, #0] + stp x29, x30, [sp, #16] + ldp x3, x4, [x2,#0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + ldp x9, x10, [x2,#48] + ldp x11, x12, [x2,#64] + ldp x13, x14, [x2,#80] + ldp x15, x16, [x2,#96] + ldp x17, x18, [x2,#112] + + ldp x27, x28, [x0,#0] + ldp x29, x30, [x0,#16] + subs x3, x3, x27 + sbcs x4, x4, x28 + sbcs x5, x5, x29 + sbcs x6, x6, x30 + ldp x27, x28, [x0,#32] + ldp x29, x30, [x0,#48] + sbcs x7, x7, x27 + sbcs x8, x8, x28 + sbcs x9, x9, x29 + sbcs x10, x10, x30 + ldp x27, x28, [x0,#64] + ldp x29, x30, [x0,#80] + sbcs x11, x11, x27 + sbcs x12, x12, x28 + sbcs x13, x13, x29 + sbcs x14, x14, x30 + ldp x27, x28, [x0,#96] + ldp x29, x30, [x0,#112] + sbcs x15, x15, x27 + sbcs x16, x16, x28 + sbcs x17, x17, x29 + sbc x18, x18, x30 + + ldp x27, x28, [x1,#0] + ldp x29, x30, [x1,#16] + subs x3, x3, x27 + sbcs x4, x4, x28 + sbcs x5, x5, x29 + sbcs x6, x6, x30 + ldp x27, x28, [x1,#32] + ldp x29, x30, [x1,#48] + sbcs x7, x7, x27 + sbcs x8, x8, x28 + sbcs x9, x9, x29 + sbcs x10, x10, x30 + ldp x27, x28, [x1,#64] + ldp x29, x30, [x1,#80] + sbcs x11, x11, x27 + sbcs x12, x12, x28 + sbcs x13, x13, x29 + sbcs x14, x14, x30 + ldp x27, x28, [x1,#96] + ldp x29, x30, [x1,#112] + sbcs x15, x15, x27 + sbcs x16, x16, x28 + sbcs x17, x17, x29 + sbc x18, x18, x30 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp x13, x14, [x2,#80] + stp x15, x16, [x2,#96] + stp x17, x18, [x2,#112] + + ldp x27, x28, [sp, #0] + ldp x29, x30, [sp, #16] + add sp, sp, #32 + ret +___ + +foreach (split("\n",$code)) { + s/\`([^\`]*)\`/eval($1)/ge; + print $_,"\n"; +} + +close STDOUT;
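
Reviewer notes (not part of the patch):

${PREFIX}_mpmul applies one level of additive Karatsuba to the 256-bit
halves AH, AL, BH, BL, i.e. a*b = AHxBH*2^512 + ((AH+AL)x(BH+BL) - AHxBH
- ALxBL)*2^256 + ALxBL, with the masked additions handling the carry bits
of AH+AL and BH+BL that do not fit in four words. A plain-Perl cross-check
of that identity (a hypothetical test harness, not part of the change;
karatsuba512 is a name invented here):

    #!/usr/bin/env perl
    use strict;
    use warnings;
    use Math::BigInt;

    my $two256 = Math::BigInt->new(2)->bpow(256);
    my $two512 = Math::BigInt->new(2)->bpow(512);

    sub karatsuba512 {
        my ($a, $b) = @_;
        my ($al, $ah) = ($a % $two256, $a / $two256);   # 256-bit halves
        my ($bl, $bh) = ($b % $two256, $b / $two256);
        my $albl = $al * $bl;
        my $ahbh = $ah * $bh;
        # middle term, as formed by mul256_karatsuba_comba
        my $mid = ($ah + $al) * ($bh + $bl) - $ahbh - $albl;
        return $ahbh * $two512 + $mid * $two256 + $albl;
    }

    my $a = Math::BigInt->from_hex("ab" x 64);   # arbitrary 512-bit inputs
    my $b = Math::BigInt->from_hex("cd" x 64);
    die "identity broken" unless karatsuba512($a, $b) == $a * $b;
    print "karatsuba identity verified\n";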
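${PREFIX}_fprdc is a Montgomery reduction specialized to p503 =
2^250*3^159 - 1 per the cited Faz-Hernandez et al. method: p503 + 1 =
2^250*3^159 has 250 trailing zero bits, which is why only the nonzero top
words of p503 + 1 are stored (.p503p1_nz_s8) and why the quotient-times-
modulus products are so short. A textbook REDC the routine can be checked
against, assuming radix R = 2^512 to match the eight-word operands
(hypothetical harness; bmodinv needs a reasonably recent Math::BigInt):

    #!/usr/bin/env perl
    use strict;
    use warnings;
    use Math::BigInt;

    my $p = Math::BigInt->new(2)->bpow(250) * Math::BigInt->new(3)->bpow(159) - 1;
    my $R = Math::BigInt->new(2)->bpow(512);        # Montgomery radix
    my $np = ($R - $p->copy->bmodinv($R)) % $R;     # -p^-1 mod R

    # redc(t) = t * R^-1 mod p, valid for 0 <= t < p*R
    sub redc {
        my ($t) = @_;
        my $m = ($t * $np) % $R;      # makes t + m*p divisible by R
        my $u = ($t + $m * $p) / $R;  # exact division
        return $u >= $p ? $u - $p : $u;
    }

    my $a = Math::BigInt->new("123456789") % $p;
    my $b = Math::BigInt->new("987654321") % $p;
    my $mont = redc((($a * $R) % $p) * (($b * $R) % $p));  # a*b*R mod p
    die "redc broken" unless redc($mont) == ($a * $b) % $p;
    print "redc matches modular multiplication\n";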
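${PREFIX}_fpadd keeps field elements in [0, 2*p503): it adds, subtracts
2*p503, and then adds 2*p503 back under the all-zero/all-ones mask that
"sbc x18, xzr, xzr" derives from the borrow. The same correction in Perl,
with a branch standing in for the constant-time mask (hypothetical
sketch; fpadd here is a model, not the exported routine):

    #!/usr/bin/env perl
    use strict;
    use warnings;
    use Math::BigInt;

    my $p    = Math::BigInt->new(2)->bpow(250) * Math::BigInt->new(3)->bpow(159) - 1;
    my $twop = $p * 2;

    # c = a + b kept in [0, 2p): subtract 2p, add it back only on borrow
    sub fpadd {
        my ($a, $b) = @_;
        my $t = $a + $b - $twop;
        my $borrow = $t < 0 ? 1 : 0;   # asm: mask = 0 - borrow; t += 2p & mask
        return $t + $twop * $borrow;
    }

    my $x = $twop - 1;                 # largest representable residue
    die "range error" unless fpadd($x, $x) == $twop - 2;
    die "range error" unless fpadd(Math::BigInt->new(1), Math::BigInt->new(1)) == 2;
    print "fpadd result stays in [0, 2p)\n";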