Change-Id: I86560300c4cfe8f93f77beeb34dc63aef2f310d5
Henry Case 2019-04-21 01:48:58 +01:00
parent b2c4f09734
commit 77c4a5e297

third_party/sike/asm/fp-armv8.pl (new vendored file, 877 lines)

@@ -0,0 +1,877 @@
#! /usr/bin/env perl
#
# April 2019
#
# Abstract: field arithmetic in ARMv8 (AArch64) assembly for SIDH/p503
$flavour = shift;
$output = shift;
if ($flavour =~ /\./) { $output = $flavour; undef $flavour; }
$0 =~ m/(.*[\/\\])[^\/\\]+$/; $dir=$1;
( $xlate="${dir}arm-xlate.pl" and -f $xlate ) or
( $xlate="${dir}../../../crypto/perlasm/arm-xlate.pl" and -f $xlate) or
die "can't locate arm-xlate.pl";
open OUT,"| \"$^X\" \"$xlate\" $flavour \"$output\"" or die "can't call $xlate: $!";
*STDOUT=*OUT;
$PREFIX="sike";
$code.=<<___;
.section .rodata
.p503p1_nz_s8:
.quad 0x085BDA2211E7A0AC, 0x9BF6C87B7E7DAF13
.quad 0x45C6BDDA77A4D01B, 0x4066F541811E1E60
.p503x2:
.quad 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF
.quad 0x57FFFFFFFFFFFFFF, 0x2610B7B44423CF41
.quad 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0
.quad 0x0080CDEA83023C3C
.text
___
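# The .p503x2 table stores 2*p503 in seven quadwords: words 1 and 2 of the
# eight-word value are identical, so the duplicate is kept once and fpadd/
# fpsub below simply reuse the register that holds it. A minimal sketch
# (hypothetical helper, not used by the generator) of how the stored quads
# expand to the full eight 64-bit words:
sub p503x2_words {
    my @q = @_;                              # the seven .quad values, least significant first
    return ($q[0], $q[1], $q[1], @q[2..6]);  # word 2 duplicates word 1
}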
sub mul128_comba_cut {
my ($A0,$A1,$B0,$B1,$C0,$C1,$C2,$C3,$T0)=@_;
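# Tail of a 128x128-bit Comba multiplication: the caller has already put
# lo/hi(A0*B0) into C0/C3 and lo/hi(A0*B1) into C1/C2; this fragment folds
# in the remaining partial products, leaving the full 256-bit product in
# C0-C3 (A0, B0 and B1 are clobbered as scratch).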
my $body=<<___;
mul $A0, $A1, $B0
umulh $B0, $A1, $B0
adds $C1, $C1, $C3
adc $C2, $C2, xzr
mul $T0, $A1, $B1
umulh $B1, $A1, $B1
adds $C1, $C1, $A0
adcs $C2, $C2, $B0
adc $C3, xzr, xzr
adds $C2, $C2, $T0
adc $C3, $C3, $B1
___
return $body;
}
sub mul256_karatsuba_comba {
my ($M,$A0,$A1,$A2,$A3,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$C6,$C7,$T0,$T1)=@_;
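# 256x256 -> 512-bit multiplication: one level of Karatsuba on top of the
# 128-bit Comba routine above. B is held in B0-B3; the low half of A is
# reloaded from [$M,#0] after A0-A1 are overwritten by the AH+AL sum. The
# 512-bit product is left in C0-C7, with T0/T1 used as scratch.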
# (AH+AL) x (BH+BL), low part
my $mul_low=&mul128_comba_cut($A0, $A1, $C6, $T1, $C2, $C3, $C4, $C5, $C7);
# AL x BL
my $mul_albl=&mul128_comba_cut($A0, $A1, $B0, $B1, $C0, $C1, $T1, $C7, $C6);
# AH x BH
my $mul_ahbh=&mul128_comba_cut($A2, $A3, $B2, $B3, $A0, $A1, $C6, $B0, $B1);
my $body=<<___;
# A0-A1 <- AH + AL, T0 <- mask
adds $A0, $A0, $A2
adcs $A1, $A1, $A3
adc $T0, xzr, xzr
# C6, T1 <- BH + BL, C7 <- mask
adds $C6, $B0, $B2
adcs $T1, $B1, $B3
adc $C7, xzr, xzr
# C0-C1 <- masked (BH + BL)
sub $C2, xzr, $T0
sub $C3, xzr, $C7
and $C0, $C6, $C2
and $C1, $T1, $C2
# C4-C5 <- masked (AH + AL), T0 <- combined carry
and $C4, $A0, $C3
and $C5, $A1, $C3
mul $C2, $A0, $C6
mul $C3, $A0, $T1
and $T0, $T0, $C7
# C0-C1, T0 <- (AH+AL) x (BH+BL), part 1
adds $C0, $C4, $C0
umulh $C4, $A0, $T1
adcs $C1, $C5, $C1
umulh $C5, $A0, $C6
adc $T0, $T0, xzr
# C2-C5 <- (AH+AL) x (BH+BL), low part
$mul_low
ldp $A0, $A1, [$M,#0]
# C2-C5, T0 <- (AH+AL) x (BH+BL), final part
adds $C4, $C0, $C4
umulh $C7, $A0, $B0
umulh $T1, $A0, $B1
adcs $C5, $C1, $C5
mul $C0, $A0, $B0
mul $C1, $A0, $B1
adc $T0, $T0, xzr
# C0-C1, T1, C7 <- AL x BL
$mul_albl
# C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL
mul $A0, $A2, $B2
umulh $B0, $A2, $B2
subs $C2, $C2, $C0
sbcs $C3, $C3, $C1
sbcs $C4, $C4, $T1
mul $A1, $A2, $B3
umulh $C6, $A2, $B3
sbcs $C5, $C5, $C7
sbc $T0, $T0, xzr
# A0, A1, C6, B0 <- AH x BH
$mul_ahbh
# C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs $C2, $C2, $A0
sbcs $C3, $C3, $A1
sbcs $C4, $C4, $C6
sbcs $C5, $C5, $B0
sbc $T0, $T0, xzr
adds $C2, $C2, $T1
adcs $C3, $C3, $C7
adcs $C4, $C4, $A0
adcs $C5, $C5, $A1
adcs $C6, $T0, $C6
adc $C7, $B0, xzr
___
return $body;
}
# 512-bit integer multiplication using Karatsuba (two levels),
# Comba (lower level).
# Operation: c [x2] = a [x0] * b [x1]
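# A minimal one-level Karatsuba reference model, as a hedged sketch
# (hypothetical helper, not used by the generator): the routine below applies
# the same identity twice, with Comba multiplication at the 128-bit level and
# the subtractions replaced by the masked-addition trick visible in the
# assembly.
use Math::BigInt;
sub karatsuba_ref {
    my ($a, $b, $h) = @_;                    # $a, $b: Math::BigInt; $h: half-width in bits (256 here)
    my $mask = (Math::BigInt->bone << $h) - 1;
    my ($al, $ah) = ($a & $mask, $a >> $h);  # split a = ah*2^h + al
    my ($bl, $bh) = ($b & $mask, $b >> $h);
    my $low  = $al * $bl;                    # AL x BL
    my $high = $ah * $bh;                    # AH x BH
    my $mid  = ($al + $ah) * ($bl + $bh) - $low - $high;   # cross terms via one extra multiply
    return $low + ($mid << $h) + ($high << (2 * $h));
}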
sub mul {
# (AH+AL) x (BH+BL), low part
my $mul_kc_low=&mul256_karatsuba_comba("x2","x26","x27","x28","x29","x11","x12","x13","x14","x8","x9","x10","x19","x20","x21","x22","x23","x24","x25");
my $mul_albl=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x20","x21","x22","x23","x24","x25","x26","x27","x28","x29");
my $mul_ahbh=&mul256_karatsuba_comba("x0","x3","x4","x5","x6","x11","x12","x13","x14","x20","x21","x22","x23","x24","x25","x26","x27","x28","x29");
my $body=<<___;
.global ${PREFIX}_mpmul
.align 4
${PREFIX}_mpmul:
sub sp, sp, #96
stp x19, x20, [sp,#0]
stp x21, x22, [sp,#16]
stp x23, x24, [sp,#32]
stp x25, x26, [sp,#48]
stp x27, x28, [sp,#64]
str x29, [sp, #80]
ldp x3, x4, [x0]
ldp x5, x6, [x0,#16]
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
// x26-x29 <- AH + AL, x7 <- mask
adds x26, x3, x7
adcs x27, x4, x8
adcs x28, x5, x9
adcs x29, x6, x10
adc x7, xzr, xzr
// x11-x14 <- BH + BL, x8 <- mask
adds x11, x11, x15
adcs x12, x12, x16
adcs x13, x13, x17
adcs x14, x14, x18
adc x8, xzr, xzr
// x15-x18 <- masked (BH + BL)
sub x9, xzr, x7
sub x10, xzr, x8
and x15, x11, x9
and x16, x12, x9
and x17, x13, x9
and x18, x14, x9
// x19-x22 <- masked (AH + AL), x7 <- combined carry
and x19, x26, x10
and x20, x27, x10
and x21, x28, x10
and x22, x29, x10
and x7, x7, x8
// x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1
adds x15, x15, x19
adcs x16, x16, x20
adcs x17, x17, x21
adcs x18, x18, x22
adc x7, x7, xzr
// x8-x10,x19-x23 <- (AH+AL) x (BH+BL), low part
stp x26, x27, [x2,#0]
$mul_kc_low
// x15-x18, x7 <- (AH+AL) x (BH+BL), final step
adds x15, x15, x20
adcs x16, x16, x21
adcs x17, x17, x22
adcs x18, x18, x23
adc x7, x7, xzr
// x20-x27 <- AL x BL
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
$mul_albl
// x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL
subs x8, x8, x20
sbcs x9, x9, x21
sbcs x10, x10, x22
sbcs x19, x19, x23
sbcs x15, x15, x24
sbcs x16, x16, x25
sbcs x17, x17, x26
sbcs x18, x18, x27
sbc x7, x7, xzr
stp x20, x21, [x2]
stp x22, x23, [x2,#16]
ldp x3, x4, [x0,#32]
ldp x5, x6, [x0,#48]
ldp x11, x12, [x1,#32]
ldp x13, x14, [x1,#48]
adds x8, x8, x24
adcs x9, x9, x25
adcs x10, x10, x26
adcs x19, x19, x27
adc x1, xzr, xzr
// x20-x27 <- AH x BH
add x0, x0, #32
$mul_ahbh
neg x1, x1
// x8-x10, x19, x15-x18, x7 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
subs x8, x8, x20
sbcs x9, x9, x21
sbcs x10, x10, x22
sbcs x19, x19, x23
sbcs x15, x15, x24
sbcs x16, x16, x25
sbcs x17, x17, x26
sbcs x18, x18, x27
sbc x7, x7, xzr
stp x8, x9, [x2,#32]
stp x10, x19, [x2,#48]
adds x1, x1, #1
adcs x15, x15, x20
adcs x16, x16, x21
adcs x17, x17, x22
adcs x18, x18, x23
adcs x24, x7, x24
adcs x25, x25, xzr
adcs x26, x26, xzr
adc x27, x27, xzr
stp x15, x16, [x2,#64]
stp x17, x18, [x2,#80]
stp x24, x25, [x2,#96]
stp x26, x27, [x2,#112]
ldp x19, x20, [sp,#0]
ldp x21, x22, [sp,#16]
ldp x23, x24, [sp,#32]
ldp x25, x26, [sp,#48]
ldp x27, x28, [sp,#64]
ldr x29, [sp,#80]
add sp, sp, #96
ret
___
return $body;
}
$code.=&mul();
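# Tail of a 128x256-bit Comba multiplication: the caller pre-computes
# lo/hi(A0*B0) in C0/C3 and lo/hi(A0*B1) in C1/C2; this fragment accumulates
# the remaining partial products, leaving the 384-bit product A0:A1 * B0:B3
# in C0-C5 (T0-T3 are scratch).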
sub mul128x256_comba {
my ($A0,$A1,$B0,$B1,$B2,$B3,$C0,$C1,$C2,$C3,$C4,$C5,$T0,$T1,$T2,$T3)=@_;
my $body=<<___;
mul $T0, $A1, $B0
umulh $T1, $A1, $B0
adds $C1, $C1, $C3
adc $C2, $C2, xzr
mul $T2, $A0, $B2
umulh $T3, $A0, $B2
adds $C1, $C1, $T0
adcs $C2, $C2, $T1
adc $C3, xzr, xzr
mul $T0, $A1, $B1
umulh $T1, $A1, $B1
adds $C2, $C2, $T2
adcs $C3, $C3, $T3
adc $C4, xzr, xzr
mul $T2, $A0, $B3
umulh $T3, $A0, $B3
adds $C2, $C2, $T0
adcs $C3, $C3, $T1
adc $C4, $C4, xzr
mul $T0, $A1, $B2
umulh $T1, $A1, $B2
adds $C3, $C3, $T2
adcs $C4, $C4, $T3
adc $C5, xzr, xzr
mul $T2, $A1, $B3
umulh $T3, $A1, $B3
adds $C3, $C3, $T0
adcs $C4, $C4, $T1
adc $C5, $C5, xzr
adds $C4, $C4, $T2
adc $C5, $C5, $T3
___
return $body;
}
# Montgomery reduction
# Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015
# Operation: mc [x1] = ma [x0]
# NOTE: ma=mc is not allowed
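# A generic Montgomery-reduction (REDC) reference model, as a hedged sketch
# (hypothetical helper, not used by the generator): mc = ma * R^-1 mod p for
# R = 2^512. The assembly below relies on the prime being Montgomery-friendly
# (p503 = -1 mod 2^248), so each quotient digit is just the running low word
# and only products with the nonzero top of p503+1 (p503p1_nz_s8) are needed.
use Math::BigInt;
sub redc_ref {
    my ($ma, $p, $rbits) = @_;               # $ma, $p: Math::BigInt; $rbits: 512 here
    my $r    = Math::BigInt->bone << $rbits;
    my $pinv = $p->copy->bmodinv($r);        # p^-1 mod R
    my $q    = (($ma % $r) * ($r - $pinv)) % $r;   # q = -ma * p^-1 mod R
    my $mc   = ($ma + $q * $p) / $r;         # exact division by R
    $mc -= $p if $mc >= $p;                  # conditional final subtraction
    return $mc;                              # mc = ma * R^-1 mod p
}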
sub rdc {
my $mul01=&mul128x256_comba("x2","x3","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $mul23=&mul128x256_comba("x3","x11","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $mul45=&mul128x256_comba("x12","x13","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $mul67=&mul128x256_comba("x14","x15","x24","x25","x26","x27","x4","x5","x6","x7","x8","x9","x28","x29","x30","x10");
my $body=<<___;
.global ${PREFIX}_fprdc
.align 4
${PREFIX}_fprdc:
sub sp, sp, #96
stp x19, x20, [sp]
stp x21, x22, [sp, #16]
stp x23, x24, [sp, #32]
stp x25, x26, [sp, #48]
stp x27, x28, [sp, #64]
stp x29, x30, [sp, #80]
ldp x2, x3, [x0,#0] // a[0-1]
// Load the prime constant
adrp x23, .p503p1_nz_s8
add x23, x23, :lo12:.p503p1_nz_s8
ldp x24, x25, [x23, #0]
ldp x26, x27, [x23, #16]
// a[0-1] x p503p1_nz_s8 --> result: x4:x9
mul x4, x2, x24 // a[0] x p503p1_nz_s8[0]
umulh x7, x2, x24
mul x5, x2, x25 // a[0] x p503p1_nz_s8[1]
umulh x6, x2, x25
$mul01
ldp x3, x11, [x0,#16] // a[2-3]
ldp x12, x13, [x0,#32]
ldp x14, x15, [x0,#48]
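// x4:x9 <<= 56: p503+1 equals p503p1_nz_s8 * 2^248, so the partial product
// must be realigned by 56 bits before being added in starting at a[3]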
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x11, x4, x11 // a[3]
adcs x12, x5, x12 // a[4]
adcs x13, x6, x13
adcs x14, x7, x14
adcs x15, x8, x15
ldp x16, x17, [x0,#64]
ldp x18, x19, [x0,#80]
mul x4, x3, x24 // a[2] x p503p1_nz_s8[0]
umulh x7, x3, x24
adcs x16, x9, x16
adcs x17, x10, x17
adcs x18, xzr, x18
adcs x19, xzr, x19
ldp x20, x21, [x0,#96]
ldp x22, x23, [x0,#112]
mul x5, x3, x25 // a[2] x p503p1_nz_s8[1]
umulh x6, x3, x25
adcs x20, xzr, x20
adcs x21, xzr, x21
adcs x22, xzr, x22
adc x23, xzr, x23
// a[2-3] x p503p1_nz_s8 --> result: x4:x9
$mul23
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x13, x4, x13 // a[5]
adcs x14, x5, x14 // a[6]
adcs x15, x6, x15
adcs x16, x7, x16
mul x4, x12, x24 // a[4] x p503p1_nz_s8[0]
umulh x7, x12, x24
adcs x17, x8, x17
adcs x18, x9, x18
adcs x19, x10, x19
adcs x20, xzr, x20
mul x5, x12, x25 // a[4] x p503p1_nz_s8[1]
umulh x6, x12, x25
adcs x21, xzr, x21
adcs x22, xzr, x22
adc x23, xzr, x23
// a[4-5] x p503p1_nz_s8 --> result: x4:x9
$mul45
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x15, x4, x15 // a[7]
adcs x16, x5, x16 // a[8]
adcs x17, x6, x17
adcs x18, x7, x18
mul x4, x14, x24 // a[6] x p503p1_nz_s8[0]
umulh x7, x14, x24
adcs x19, x8, x19
adcs x20, x9, x20
adcs x21, x10, x21
mul x5, x14, x25 // a[6] x p503p1_nz_s8[1]
umulh x6, x14, x25
adcs x22, xzr, x22
adc x23, xzr, x23
// a[6-7] x p503p1_nz_s8 --> result: x4:x9
$mul67
orr x10, xzr, x9, lsr #8
lsl x9, x9, #56
orr x9, x9, x8, lsr #8
lsl x8, x8, #56
orr x8, x8, x7, lsr #8
lsl x7, x7, #56
orr x7, x7, x6, lsr #8
lsl x6, x6, #56
orr x6, x6, x5, lsr #8
lsl x5, x5, #56
orr x5, x5, x4, lsr #8
lsl x4, x4, #56
adds x17, x4, x17
adcs x18, x5, x18
adcs x19, x6, x19
adcs x20, x7, x20
stp x16, x17, [x1,#0] // Final result
stp x18, x19, [x1,#16]
adcs x21, x8, x21
adcs x22, x9, x22
adc x23, x10, x23
stp x20, x21, [x1,#32]
stp x22, x23, [x1,#48]
ldp x19, x20, [sp]
ldp x21, x22, [sp, #16]
ldp x23, x24, [sp, #32]
ldp x25, x26, [sp, #48]
ldp x27, x28, [sp, #64]
ldp x29, x30, [sp, #80]
add sp, sp, #96
ret
___
return $body;
}
$code.=&rdc();
# Field addition
# Operation: c [x2] = a [x0] + b [x1]
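# A hedged reference model of the addition below (hypothetical helper, not
# used by the generator): operands are kept in [0, 2*p503), the usual
# convention for this reduction strategy, and the result is brought back into
# that range by subtracting 2*p503 and conditionally adding it back, which is
# what the masked correction in the assembly implements.
sub fpadd_ref {
    my ($a, $b, $p2) = @_;          # Math::BigInt values; $p2 = 2*p503
    my $c = $a + $b - $p2;          # subtract 2*p503 unconditionally
    $c += $p2 if $c < 0;            # add it back if the subtraction borrowed
    return $c;                      # result again in [0, 2*p503)
}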
$code.=<<___;
.global ${PREFIX}_fpadd
.align 4
${PREFIX}_fpadd:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
// Add a + b
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adc x10, x10, x18
// Subtract 2xp503
adrp x18, .p503x2
add x18, x18, :lo12:.p503x2
ldp x11, x12, [x18, #0]
ldp x13, x14, [x18, #16]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x12
sbcs x6, x6, x13
sbcs x7, x7, x14
ldp x15, x16, [x18, #32]
ldr x17, [x18, #48]
sbcs x8, x8, x15
sbcs x9, x9, x16
sbcs x10, x10, x17
sbc x18, xzr, xzr
// Add 2xp503 anded with the mask in x18
and x11, x11, x18
and x12, x12, x18
and x13, x13, x18
and x14, x14, x18
and x15, x15, x18
and x16, x16, x18
and x17, x17, x18
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x12
adcs x6, x6, x13
adcs x7, x7, x14
adcs x8, x8, x15
adcs x9, x9, x16
adc x10, x10, x17
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ret
___
# Field subtraction
# Operation: c [x2] = a [x0] - b [x1]
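# The same reference style for subtraction (hypothetical helper, not used by
# the generator): compute a - b and add 2*p503 back when the subtraction
# borrows, mirroring the masked addition below.
sub fpsub_ref {
    my ($a, $b, $p2) = @_;          # Math::BigInt values; $p2 = 2*p503
    my $c = $a - $b;
    $c += $p2 if $c < 0;            # correct a borrow with 2*p503
    return $c;
}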
$code.=<<___;
.global ${PREFIX}_fpsub
.align 4
${PREFIX}_fpsub:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
// Subtract a - b
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
sbcs x7, x7, x15
sbcs x8, x8, x16
sbcs x9, x9, x17
sbcs x10, x10, x18
sbc x18, xzr, xzr
// Add 2xp503 anded with the mask in x18
adrp x19, .p503x2
add x19, x19, :lo12:.p503x2
ldp x11, x12, [x19, #0]
ldp x13, x14, [x19, #16]
and x11, x11, x18
and x12, x12, x18
and x13, x13, x18
and x14, x14, x18
ldp x15, x16, [x19, #32]
ldr x17, [x19, #48]
and x15, x15, x18
and x16, x16, x18
and x17, x17, x18
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x12
adcs x6, x6, x13
adcs x7, x7, x14
adcs x8, x8, x15
adcs x9, x9, x16
adc x10, x10, x17
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ret
___
# 503-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
.global ${PREFIX}_mpadd_asm
.align 4
${PREFIX}_mpadd_asm:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adc x10, x10, x18
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ret
___
# 2x503-bit multiprecision addition
# Operation: c [x2] = a [x0] + b [x1]
$code.=<<___;
.global ${PREFIX}_mpadd503x2_asm
.align 4
${PREFIX}_mpadd503x2_asm:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
adds x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adcs x10, x10, x18
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ldp x3, x4, [x0,#64]
ldp x5, x6, [x0,#80]
ldp x11, x12, [x1,#64]
ldp x13, x14, [x1,#80]
adcs x3, x3, x11
adcs x4, x4, x12
adcs x5, x5, x13
adcs x6, x6, x14
ldp x7, x8, [x0,#96]
ldp x9, x10, [x0,#112]
ldp x15, x16, [x1,#96]
ldp x17, x18, [x1,#112]
adcs x7, x7, x15
adcs x8, x8, x16
adcs x9, x9, x17
adc x10, x10, x18
stp x3, x4, [x2,#64]
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
stp x9, x10, [x2,#112]
ret
___
# 2x503-bit multiprecision subtraction
# Operation: c [x2] = a [x0] - b [x1]. Returns borrow mask
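# A hedged sketch of the calling convention (hypothetical helper, not used by
# the generator): the routine writes the difference modulo 2^1024 and returns
# in x0 an all-ones mask when a < b (the final sbc of x0 with itself), and
# zero otherwise.
use Math::BigInt;
sub mpsubx2_ref {
    my ($a, $b) = @_;                       # Math::BigInt values < 2^1024
    my $r    = Math::BigInt->bone << 1024;
    my $diff = ($a - $b) % $r;              # difference, wrapped modulo 2^1024
    my $mask = $a < $b ? ~0 : 0;            # borrow mask (all ones on a 64-bit perl)
    return ($diff, $mask);
}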
$code.=<<___;
.global ${PREFIX}_mpsubx2_asm
.align 4
${PREFIX}_mpsubx2_asm:
ldp x3, x4, [x0,#0]
ldp x5, x6, [x0,#16]
ldp x11, x12, [x1,#0]
ldp x13, x14, [x1,#16]
subs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#32]
ldp x9, x10, [x0,#48]
ldp x15, x16, [x1,#32]
ldp x17, x18, [x1,#48]
sbcs x7, x7, x15
sbcs x8, x8, x16
sbcs x9, x9, x17
sbcs x10, x10, x18
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
ldp x3, x4, [x0,#64]
ldp x5, x6, [x0,#80]
ldp x11, x12, [x1,#64]
ldp x13, x14, [x1,#80]
sbcs x3, x3, x11
sbcs x4, x4, x12
sbcs x5, x5, x13
sbcs x6, x6, x14
ldp x7, x8, [x0,#96]
ldp x9, x10, [x0,#112]
ldp x15, x16, [x1,#96]
ldp x17, x18, [x1,#112]
sbcs x7, x7, x15
sbcs x8, x8, x16
sbcs x9, x9, x17
sbcs x10, x10, x18
sbc x0, xzr, xzr
stp x3, x4, [x2,#64]
stp x5, x6, [x2,#80]
stp x7, x8, [x2,#96]
stp x9, x10, [x2,#112]
ret
___
# Double 2x503-bit multiprecision subtraction
# Operation: c [x2] = c [x2] - a [x0] - b [x1]
$code.=<<___;
.global ${PREFIX}_mpdblsubx2_asm
.align 4
${PREFIX}_mpdblsubx2_asm:
sub sp, sp, #32
stp x27, x28, [sp, #0]
stp x29, x30, [sp, #16]
ldp x3, x4, [x2,#0]
ldp x5, x6, [x2,#16]
ldp x7, x8, [x2,#32]
ldp x9, x10, [x2,#48]
ldp x11, x12, [x2,#64]
ldp x13, x14, [x2,#80]
ldp x15, x16, [x2,#96]
ldp x17, x18, [x2,#112]
ldp x27, x28, [x0,#0]
ldp x29, x30, [x0,#16]
subs x3, x3, x27
sbcs x4, x4, x28
sbcs x5, x5, x29
sbcs x6, x6, x30
ldp x27, x28, [x0,#32]
ldp x29, x30, [x0,#48]
sbcs x7, x7, x27
sbcs x8, x8, x28
sbcs x9, x9, x29
sbcs x10, x10, x30
ldp x27, x28, [x0,#64]
ldp x29, x30, [x0,#80]
sbcs x11, x11, x27
sbcs x12, x12, x28
sbcs x13, x13, x29
sbcs x14, x14, x30
ldp x27, x28, [x0,#96]
ldp x29, x30, [x0,#112]
sbcs x15, x15, x27
sbcs x16, x16, x28
sbcs x17, x17, x29
sbc x18, x18, x30
ldp x27, x28, [x1,#0]
ldp x29, x30, [x1,#16]
subs x3, x3, x27
sbcs x4, x4, x28
sbcs x5, x5, x29
sbcs x6, x6, x30
ldp x27, x28, [x1,#32]
ldp x29, x30, [x1,#48]
sbcs x7, x7, x27
sbcs x8, x8, x28
sbcs x9, x9, x29
sbcs x10, x10, x30
ldp x27, x28, [x1,#64]
ldp x29, x30, [x1,#80]
sbcs x11, x11, x27
sbcs x12, x12, x28
sbcs x13, x13, x29
sbcs x14, x14, x30
ldp x27, x28, [x1,#96]
ldp x29, x30, [x1,#112]
sbcs x15, x15, x27
sbcs x16, x16, x28
sbcs x17, x17, x29
sbc x18, x18, x30
stp x3, x4, [x2,#0]
stp x5, x6, [x2,#16]
stp x7, x8, [x2,#32]
stp x9, x10, [x2,#48]
stp x11, x12, [x2,#64]
stp x13, x14, [x2,#80]
stp x15, x16, [x2,#96]
stp x17, x18, [x2,#112]
ldp x27, x28, [sp, #0]
ldp x29, x30, [sp, #16]
add sp, sp, #32
ret
___
foreach (split("\n",$code)) {
s/\`([^\`]*)\`/eval($1)/ge;
print $_,"\n";
}
close STDOUT;