// Mirror of https://github.com/henrydcase/nobs.git
// (synced 2024-11-26 17:11:22 +00:00; 1521 lines, 26 KiB, ArmAsm).
// +build arm64,!noasm
|
|
|
|
#include "textflag.h"
|
|
|
|
// fp751CswapPair conditionally exchanges the two 64-bit limbs found at byte
// offset off in the buffers addressed by R0 and R1.
//
// It consumes the flags left by the CMP in fp751ConditionalSwap: under EQ
// (choice == 0) each limb is written back to the buffer it came from, under
// NE the limbs trade places. Both buffers are loaded and stored either way
// and there is no branch, so the sequence of memory accesses is the same for
// both outcomes.
// Clobbers R3-R10. None of the instructions modifies the flags.
#define fp751CswapPair(off) \
	LDP	off(R0), (R3, R4)	\
	LDP	off(R1), (R5, R6)	\
	CSEL	EQ, R3, R5, R7	\
	CSEL	EQ, R4, R6, R8	\
	STP	(R7, R8), off(R0)	\
	CSEL	NE, R3, R5, R9	\
	CSEL	NE, R4, R6, R10	\
	STP	(R9, R10), off(R1)

// fp751ConditionalSwap(x, y, choice)
//
// Swaps the 96 bytes (twelve 64-bit limbs) at x and y when choice is non-zero
// and rewrites them unchanged when choice is zero.
//   R0 = x, R1 = y, R2 = choice (one byte, loaded with MOVB)
TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17
	MOVD	x+0(FP), R0
	MOVD	y+8(FP), R1
	MOVB	choice+16(FP), R2

	// choice is only compared against zero, so every non-zero value
	// (not just 1) requests a full swap.
	CMP	$0, R2

	// Six 16-byte pairs cover limbs 0-11.
	fp751CswapPair(0)
	fp751CswapPair(16)
	fp751CswapPair(32)
	fp751CswapPair(48)
	fp751CswapPair(64)
	fp751CswapPair(80)

	RET
|
|
|
|
// fp751AddReduced(z, x, y)
//
// Adds the twelve-limb values at x and y, subtracts the constant p751x2, and
// adds p751x2 back if that subtraction borrowed. The twelve result limbs are
// written to z.
//   R0 = x, R1 = y, R2 = z
//   R3-R14  running result, limb 0 in R3 ... limb 11 in R14
//   R15-R24 operands (R18 is never used)
//   R25     correction mask
// The carry out of the top limb of x + y is discarded (plain ADC).
TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// x + y, limbs 0-3.
	LDP	0(R0), (R3, R4)
	LDP	16(R0), (R5, R6)
	LDP	0(R1), (R15, R16)
	LDP	16(R1), (R17, R19)
	ADDS	R15, R3
	ADCS	R16, R4
	ADCS	R17, R5
	ADCS	R19, R6

	// x + y, limbs 4-7 (carry continues).
	LDP	32(R0), (R7, R8)
	LDP	48(R0), (R9, R10)
	LDP	32(R1), (R15, R16)
	LDP	48(R1), (R17, R19)
	ADCS	R15, R7
	ADCS	R16, R8
	ADCS	R17, R9
	ADCS	R19, R10

	// x + y, limbs 8-11.
	LDP	64(R0), (R11, R12)
	LDP	80(R0), (R13, R14)
	LDP	64(R1), (R15, R16)
	LDP	80(R1), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADC	R19, R14

	// Fetch p751x2. Only limbs 0, 1 and 5-11 are loaded: the code reuses
	// R16 (limb 1) for limbs 2-4, i.e. it relies on limbs 1-4 being equal.
	LDP	·p751x2+0(SB), (R15, R16)
	LDP	·p751x2+40(SB), (R17, R19)
	LDP	·p751x2+56(SB), (R20, R21)
	LDP	·p751x2+72(SB), (R22, R23)
	MOVD	·p751x2+88(SB), R24

	// (x + y) - p751x2, borrow propagated through all twelve limbs.
	SUBS	R15, R3
	SBCS	R16, R4
	SBCS	R16, R5
	SBCS	R16, R6
	SBCS	R16, R7
	SBCS	R17, R8
	SBCS	R19, R9
	SBCS	R20, R10
	SBCS	R21, R11
	SBCS	R22, R12
	SBCS	R23, R13
	SBCS	R24, R14
	// R25 = 0 when the subtraction did not borrow, all ones when it did.
	SBC	ZR, ZR, R25

	// Keep p751x2 only when a correction is needed; otherwise add zero.
	AND	R25, R15
	AND	R25, R16
	AND	R25, R17
	AND	R25, R19
	AND	R25, R20
	AND	R25, R21
	AND	R25, R22
	AND	R25, R23
	AND	R25, R24

	ADDS	R15, R3
	ADCS	R16, R4
	ADCS	R16, R5
	ADCS	R16, R6
	ADCS	R16, R7
	ADCS	R17, R8
	ADCS	R19, R9
	ADCS	R20, R10
	ADCS	R21, R11
	ADCS	R22, R12
	ADCS	R23, R13
	ADC	R24, R14

	// Write the result.
	STP	(R3, R4), 0(R2)
	STP	(R5, R6), 16(R2)
	STP	(R7, R8), 32(R2)
	STP	(R9, R10), 48(R2)
	STP	(R11, R12), 64(R2)
	STP	(R13, R14), 80(R2)

	RET
|
|
|
|
// fp751SubReduced(z, x, y)
//
// Computes x - y over twelve limbs and, if the subtraction borrowed, adds the
// constant p751x2 to the difference. The twelve result limbs go to z.
//   R0 = x, R1 = y, R2 = z
//   R3-R14 running result, limb 0 in R3 ... limb 11 in R14
//   R15    scratch during the subtraction, then the correction mask
//   R16, R17, R19, R20 scratch / masked limbs of p751x2
TEXT ·fp751SubReduced(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// x - y, limbs 0-3.
	LDP	0(R0), (R3, R4)
	LDP	16(R0), (R5, R6)
	LDP	0(R1), (R15, R16)
	LDP	16(R1), (R17, R19)
	SUBS	R15, R3
	SBCS	R16, R4
	SBCS	R17, R5
	SBCS	R19, R6

	// x - y, limbs 4-7.
	LDP	32(R0), (R7, R8)
	LDP	48(R0), (R9, R10)
	LDP	32(R1), (R15, R16)
	LDP	48(R1), (R17, R19)
	SBCS	R15, R7
	SBCS	R16, R8
	SBCS	R17, R9
	SBCS	R19, R10

	// x - y, limbs 8-11.
	LDP	64(R0), (R11, R12)
	LDP	80(R0), (R13, R14)
	LDP	64(R1), (R15, R16)
	LDP	80(R1), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	// R15 = 0 when x - y did not borrow, all ones when it did.
	SBC	ZR, ZR, R15

	// Masked p751x2, limbs 0-6. R17 (limb 1) is reused for limbs 2-4,
	// so the code relies on limbs 1-4 of p751x2 being equal.
	LDP	·p751x2+0(SB), (R16, R17)
	LDP	·p751x2+40(SB), (R19, R20)
	AND	R15, R16
	AND	R15, R17
	AND	R15, R19
	AND	R15, R20
	ADDS	R16, R3
	ADCS	R17, R4
	ADCS	R17, R5
	ADCS	R17, R6
	ADCS	R17, R7
	ADCS	R19, R8
	ADCS	R20, R9

	// Masked p751x2, limbs 7-10 (same four scratch registers, reloaded).
	LDP	·p751x2+56(SB), (R16, R17)
	LDP	·p751x2+72(SB), (R19, R20)
	AND	R15, R16
	AND	R15, R17
	AND	R15, R19
	AND	R15, R20
	ADCS	R16, R10
	ADCS	R17, R11
	ADCS	R19, R12
	ADCS	R20, R13

	// Masked p751x2, limb 11.
	MOVD	·p751x2+88(SB), R16
	AND	R15, R16
	ADC	R16, R14

	// Write the result.
	STP	(R3, R4), 0(R2)
	STP	(R5, R6), 16(R2)
	STP	(R7, R8), 32(R2)
	STP	(R9, R10), 48(R2)
	STP	(R11, R12), 64(R2)
	STP	(R13, R14), 80(R2)

	RET
|
|
|
|
// fp751AddLazy(z, x, y)
//
// Plain twelve-limb addition z = x + y with carry propagation and no
// reduction step. The carry out of the top limb is discarded (final ADC).
//   R0 = x, R1 = y, R2 = z
//   R3-R14 sum limbs, R15-R17 and R19 limbs of y
// The limbs are handled in three groups of four; each group is stored to z
// before the next group of x and y is loaded.
TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Limbs 0-3 (starts the carry chain).
	LDP	0(R0), (R3, R4)
	LDP	16(R0), (R5, R6)
	LDP	0(R1), (R15, R16)
	LDP	16(R1), (R17, R19)
	ADDS	R15, R3
	ADCS	R16, R4
	ADCS	R17, R5
	ADCS	R19, R6
	STP	(R3, R4), 0(R2)
	STP	(R5, R6), 16(R2)

	// Limbs 4-7.
	LDP	32(R0), (R7, R8)
	LDP	48(R0), (R9, R10)
	LDP	32(R1), (R15, R16)
	LDP	48(R1), (R17, R19)
	ADCS	R15, R7
	ADCS	R16, R8
	ADCS	R17, R9
	ADCS	R19, R10
	STP	(R7, R8), 32(R2)
	STP	(R9, R10), 48(R2)

	// Limbs 8-11 (ends the carry chain).
	LDP	64(R0), (R11, R12)
	LDP	80(R0), (R13, R14)
	LDP	64(R1), (R15, R16)
	LDP	80(R1), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADC	R19, R14
	STP	(R11, R12), 64(R2)
	STP	(R13, R14), 80(R2)

	RET
|
|
|
|
// fp751X2AddLazy(z, x, y)
//
// Plain 24-limb (192-byte) addition z = x + y with a single carry chain
// running through all limbs and no reduction. The carry out of limb 23 is
// discarded (final ADC).
//   R0 = x, R1 = y, R2 = z
//   R3-R14 sum limbs (reused for the second half), R15-R17 and R19 limbs of y
// The limbs are handled in six groups of four; each group is stored to z
// before the next group of x and y is loaded.
TEXT ·fp751X2AddLazy(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Limbs 0-3 (starts the carry chain).
	LDP	0(R0), (R3, R4)
	LDP	16(R0), (R5, R6)
	LDP	0(R1), (R15, R16)
	LDP	16(R1), (R17, R19)
	ADDS	R15, R3
	ADCS	R16, R4
	ADCS	R17, R5
	ADCS	R19, R6
	STP	(R3, R4), 0(R2)
	STP	(R5, R6), 16(R2)

	// Limbs 4-7.
	LDP	32(R0), (R7, R8)
	LDP	48(R0), (R9, R10)
	LDP	32(R1), (R15, R16)
	LDP	48(R1), (R17, R19)
	ADCS	R15, R7
	ADCS	R16, R8
	ADCS	R17, R9
	ADCS	R19, R10
	STP	(R7, R8), 32(R2)
	STP	(R9, R10), 48(R2)

	// Limbs 8-11.
	LDP	64(R0), (R11, R12)
	LDP	80(R0), (R13, R14)
	LDP	64(R1), (R15, R16)
	LDP	80(R1), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADCS	R19, R14
	STP	(R11, R12), 64(R2)
	STP	(R13, R14), 80(R2)

	// Limbs 12-15.
	LDP	96(R0), (R3, R4)
	LDP	112(R0), (R5, R6)
	LDP	96(R1), (R15, R16)
	LDP	112(R1), (R17, R19)
	ADCS	R15, R3
	ADCS	R16, R4
	ADCS	R17, R5
	ADCS	R19, R6
	STP	(R3, R4), 96(R2)
	STP	(R5, R6), 112(R2)

	// Limbs 16-19.
	LDP	128(R0), (R7, R8)
	LDP	144(R0), (R9, R10)
	LDP	128(R1), (R15, R16)
	LDP	144(R1), (R17, R19)
	ADCS	R15, R7
	ADCS	R16, R8
	ADCS	R17, R9
	ADCS	R19, R10
	STP	(R7, R8), 128(R2)
	STP	(R9, R10), 144(R2)

	// Limbs 20-23 (ends the carry chain).
	LDP	160(R0), (R11, R12)
	LDP	176(R0), (R13, R14)
	LDP	160(R1), (R15, R16)
	LDP	176(R1), (R17, R19)
	ADCS	R15, R11
	ADCS	R16, R12
	ADCS	R17, R13
	ADC	R19, R14
	STP	(R11, R12), 160(R2)
	STP	(R13, R14), 176(R2)

	RET
|
|
|
|
// fp751X2SubLazy(z, x, y)
//
// 24-limb subtraction z = x - y with one borrow chain across all limbs.
// If the full subtraction borrowed, the constant p751 is added to the upper
// twelve limbs (limbs 12-23) of the difference; the lower twelve limbs are
// stored as computed.
//   R0 = x, R1 = y, R2 = z
//   R3-R14 difference limbs (reused for the upper half)
//   R15    scratch during the subtraction, then the correction mask
//   R16, R17, R19, R20 scratch / masked limbs of p751
TEXT ·fp751X2SubLazy(SB), NOSPLIT, $0-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Lower half, limbs 0-3 (starts the borrow chain). Each group is
	// stored to z before the next group of x and y is loaded.
	LDP	0(R0), (R3, R4)
	LDP	16(R0), (R5, R6)
	LDP	0(R1), (R15, R16)
	LDP	16(R1), (R17, R19)
	SUBS	R15, R3
	SBCS	R16, R4
	SBCS	R17, R5
	SBCS	R19, R6
	STP	(R3, R4), 0(R2)
	STP	(R5, R6), 16(R2)

	// Lower half, limbs 4-7.
	LDP	32(R0), (R7, R8)
	LDP	48(R0), (R9, R10)
	LDP	32(R1), (R15, R16)
	LDP	48(R1), (R17, R19)
	SBCS	R15, R7
	SBCS	R16, R8
	SBCS	R17, R9
	SBCS	R19, R10
	STP	(R7, R8), 32(R2)
	STP	(R9, R10), 48(R2)

	// Lower half, limbs 8-11.
	LDP	64(R0), (R11, R12)
	LDP	80(R0), (R13, R14)
	LDP	64(R1), (R15, R16)
	LDP	80(R1), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	STP	(R11, R12), 64(R2)
	STP	(R13, R14), 80(R2)

	// Upper half, limbs 12-23: kept in R3-R14 until the correction is done.
	LDP	96(R0), (R3, R4)
	LDP	112(R0), (R5, R6)
	LDP	96(R1), (R15, R16)
	LDP	112(R1), (R17, R19)
	SBCS	R15, R3
	SBCS	R16, R4
	SBCS	R17, R5
	SBCS	R19, R6

	LDP	128(R0), (R7, R8)
	LDP	144(R0), (R9, R10)
	LDP	128(R1), (R15, R16)
	LDP	144(R1), (R17, R19)
	SBCS	R15, R7
	SBCS	R16, R8
	SBCS	R17, R9
	SBCS	R19, R10

	LDP	160(R0), (R11, R12)
	LDP	176(R0), (R13, R14)
	LDP	160(R1), (R15, R16)
	LDP	176(R1), (R17, R19)
	SBCS	R15, R11
	SBCS	R16, R12
	SBCS	R17, R13
	SBCS	R19, R14
	// R15 = 0 when x - y did not borrow, all ones when it did.
	SBC	ZR, ZR, R15

	// Masked p751, limbs 0-4. Only limb 0 is loaded and R20 is reused
	// for limbs 1-4, so the code relies on limbs 0-4 of p751 being equal.
	MOVD	·p751+0(SB), R20
	LDP	·p751+40(SB), (R16, R17)
	AND	R15, R20
	ADDS	R20, R3
	ADCS	R20, R4
	ADCS	R20, R5
	ADCS	R20, R6
	ADCS	R20, R7

	// Masked p751, limbs 5-6. R19/R20 are reloaded with limbs 7-8.
	LDP	·p751+56(SB), (R19, R20)
	AND	R15, R16
	AND	R15, R17
	ADCS	R16, R8
	ADCS	R17, R9

	// Masked p751, limbs 7-8. R16/R17 are reloaded with limbs 9-10.
	LDP	·p751+72(SB), (R16, R17)
	AND	R15, R19
	AND	R15, R20
	ADCS	R19, R10
	ADCS	R20, R11

	// Masked p751, limbs 9-10. R19 is reloaded with limb 11.
	MOVD	·p751+88(SB), R19
	AND	R15, R16
	AND	R15, R17
	ADCS	R16, R12
	ADCS	R17, R13

	// Masked p751, limb 11.
	AND	R15, R19
	ADC	R19, R14

	// Write the corrected upper half.
	STP	(R3, R4), 96(R2)
	STP	(R5, R6), 112(R2)
	STP	(R7, R8), 128(R2)
	STP	(R9, R10), 144(R2)
	STP	(R11, R12), 160(R2)
	STP	(R13, R14), 176(R2)

	RET
|
|
|
|
// mul192x192comba completes a 3-limb by 3-limb product
// (X0, X1, X2) * (Y0, Y1, Y2), least significant limb first.
//
// Two of the nine partial products must be supplied by the caller:
//   X0*Y0 with its low word in Z0 and its high word in Z3
//   X0*Y1 with its low word in Z1 and its high word in Z2
// The macro adds the remaining seven partial products and leaves the six
// result limbs in Z0-Z5. Z0 is neither read nor written here.
//
// X0-X2 and Y0-Y2 are only read. T0-T3 are scratch: products alternate
// between the pairs (T2, T3) and (T0, T1) so that the next product is issued
// while the previous one is being accumulated. The condition flags are
// clobbered.
#define mul192x192comba(X0, X1, X2, Y0, Y1, Y2, Z0, Z1, Z2, Z3, Z4, Z5, T0, T1, T2, T3) \
	MUL	X1, Y0, T2		\
	UMULH	X1, Y0, T3		\
	\ // fold high(X0*Y0) into limb 1; Z3 restarts as a carry word
	ADDS	Z3, Z1			\
	ADCS	ZR, Z2			\
	ADC	ZR, ZR, Z3		\
	\
	MUL	X0, Y2, T0		\
	UMULH	X0, Y2, T1		\
	\ // += X1*Y0 at limb 1
	ADDS	T2, Z1			\
	ADCS	T3, Z2			\
	ADC	ZR, Z3			\
	\
	MUL	X1, Y1, T2		\
	UMULH	X1, Y1, T3		\
	\ // += X0*Y2 at limb 2; Z4 starts as a carry word
	ADDS	T0, Z2			\
	ADCS	T1, Z3			\
	ADC	ZR, ZR, Z4		\
	\
	MUL	X2, Y0, T0		\
	UMULH	X2, Y0, T1		\
	\ // += X1*Y1 at limb 2
	ADDS	T2, Z2			\
	ADCS	T3, Z3			\
	ADC	ZR, Z4			\
	\
	MUL	X1, Y2, T2		\
	UMULH	X1, Y2, T3		\
	\ // += X2*Y0 at limb 2
	ADDS	T0, Z2			\
	ADCS	T1, Z3			\
	ADC	ZR, Z4			\
	\
	MUL	X2, Y1, T0		\
	UMULH	X2, Y1, T1		\
	\ // += X1*Y2 at limb 3; Z5 starts as a carry word
	ADDS	T2, Z3			\
	ADCS	T3, Z4			\
	ADC	ZR, ZR, Z5		\
	\
	MUL	X2, Y2, T2		\
	UMULH	X2, Y2, T3		\
	\ // += X2*Y1 at limb 3
	ADDS	T0, Z3			\
	ADCS	T1, Z4			\
	ADC	ZR, Z5			\
	\ // += X2*Y2 at limb 4
	ADDS	T2, Z4			\
	ADC	T3, Z5
|
|
|
|
// mul384x384karatsuba multiplies two 6-limb values held in X0-X5 and Y0-Y5
// (least significant limb first), splitting each into a low half (X0-X2,
// Y0-Y2) and a high half (X3-X5, Y3-Y5) and using three 3x3-limb products
// built with mul192x192comba.
//
// X, Y and Z are memory operands:
//   X, Y  three limbs each that are reloaded into X3-X5 / Y3-Y5 for the
//         high * high product, because the register copies of the high
//         halves are overwritten by the half sums at the very start
//   Z     three limbs of storage, used first as scratch for the masked half
//         sums and finally for the three lowest limbs of the result
//
// Result (12 limbs, least significant first): 0+Z, 8+Z, 16+Z, T0-T8.
// All of X0-X5 and Y0-Y5 are overwritten; T9 and T10 are scratch (T10
// carries the running top word until it is folded into T6). Flags are clobbered.
#define mul384x384karatsuba(X, Y, Z, X0, X1, X2, X3, X4, X5, Y0, Y1, Y2, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10)\
	ADDS	X0, X3			\ // xH + xL, destroys xH
	ADCS	X1, X4			\
	ADCS	X2, X5			\
	ADC	ZR, ZR, T10		\ // carry out of the x sum
	\
	ADDS	Y0, Y3			\ // yH + yL, destroys yH
	ADCS	Y1, Y4			\
	ADCS	Y2, Y5			\
	ADC	ZR, ZR, T6		\ // carry out of the y sum
	\
	SUB	T10, ZR, T7		\ // T7 = 0 or all ones, from the x carry
	SUB	T6, ZR, T8		\ // T8 = 0 or all ones, from the y carry
	AND	T6, T10			\ // combined carry
	\
	AND	T7, Y3, T0		\ // masked(yH + yL)
	AND	T7, Y4, T1		\
	AND	T7, Y5, T2		\
	\
	AND	T8, X3, T3		\ // masked(xH + xL)
	AND	T8, X4, T4		\
	AND	T8, X5, T5		\
	\
	ADDS	T3, T0			\
	ADCS	T4, T1			\
	STP	(T0, T1), 0+Z		\
	\
	MUL	X3, Y3, T0		\
	MUL	X3, Y4, T1		\
	\
	ADCS	T5, T2			\
	MOVD	T2, 16+Z		\
	\
	UMULH	X3, Y4, T2		\
	UMULH	X3, Y3, T3		\
	\
	ADC	ZR, T10			\
	\ // (xH + xL) * (yH + yL)
	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
	\
	MUL	X0, Y0, X3		\
	LDP	0+Z, (T6, T7)		\
	MOVD	16+Z, T8		\
	\
	UMULH	X0, Y0, Y3		\
	ADDS	T6, T3			\
	ADCS	T7, T4			\
	MUL	X0, Y1, X4		\
	ADCS	T8, T5			\
	ADC	ZR, T10			\
	UMULH	X0, Y1, X5		\
	\ // xL * yL
	mul192x192comba(X0, X1, X2, Y0, Y1, Y2, X3, X4, X5, Y3, Y4, Y5, T6, T7, T8, T9)\
	\
	STP	(X3, X4), 0+Z		\
	MOVD	X5, 16+Z		\
	\
	SUBS	X3, T0			\ // (xH + xL) * (yH + yL) - xL * yL
	SBCS	X4, T1			\
	LDP	0+X, (X3, X4)		\
	SBCS	X5, T2			\
	MOVD	16+X, X5		\
	SBCS	Y3, T3			\
	SBCS	Y4, T4			\
	SBCS	Y5, T5			\
	SBC	ZR, T10			\
	\
	ADDS	Y3, T0			\ // ((xH + xL) * (yH + yL) - xL * yL) * 2^192 + xL * yL
	ADCS	Y4, T1			\
	LDP	0+Y, (Y3, Y4)		\
	MUL	X3, Y3, X0		\
	ADCS	Y5, T2			\
	UMULH	X3, Y3, Y0		\
	MOVD	16+Y, Y5		\
	MUL	X3, Y4, X1		\
	ADCS	ZR, T3			\
	UMULH	X3, Y4, X2		\
	ADCS	ZR, T4			\
	ADCS	ZR, T5			\
	ADC	ZR, T10			\
	\ // xH * yH, overwrite xLow, yLow
	mul192x192comba(X3, X4, X5, Y3, Y4, Y5, X0, X1, X2, Y0, Y1, Y2, T6, T7, T8, T9)\
	\
	SUBS	X0, T0			\ // ((xH + xL) * (yH + yL) - xL * yL - xH * yH)
	SBCS	X1, T1			\
	SBCS	X2, T2			\
	SBCS	Y0, T3			\
	SBCS	Y1, T4			\
	SBCS	Y2, T5			\
	SBC	ZR, T10			\
	\
	ADDS	X0, T3			\
	ADCS	X1, T4			\
	ADCS	X2, T5			\
	ADCS	T10, Y0, T6		\
	ADCS	ZR, Y1, T7		\
	ADC	ZR, Y2, T8
|
|
|
|
|
|
// fp751Mul(z, x, y)
//
// Multiplies the twelve-limb values at x and y and writes the 24-limb
// (192-byte) product to z, using three mul384x384karatsuba invocations:
// (xH + xL) * (yH + yL), xL * yL and xH * yH.
//   R0 = x, R1 = y, R2 = z
//   R3-R14                operand limbs handed to the macro
//   R15-R17, R19-R26      macro temporaries / partial results
//   R29                   running carry word that has to survive the macros
//
// z doubles as scratch space and is written before x and y have been read
// for the last time, so z must not overlap x or y.
//
// The frame size is deliberately non-zero. R29 is the Go frame pointer on
// arm64 and this routine has no other register left for the carry word;
// with a frame, the assembler-generated prologue/epilogue saves R29 and
// restores it before returning. With a $0 frame the caller would get back a
// clobbered frame pointer (the pattern go vet's framepointer check reports).
TEXT ·fp751Mul(SB), NOSPLIT, $16-24
	MOVD	z+0(FP), R2
	MOVD	x+8(FP), R0
	MOVD	y+16(FP), R1

	// Load xL in R3-R8, xH in R9-R14
	// (xH + xL) in R3-R8, destroys xH; carry out in R22
	LDP	0(R0), (R3, R4)
	LDP	48(R0), (R9, R10)
	ADDS	R9, R3
	ADCS	R10, R4
	LDP	16(R0), (R5, R6)
	LDP	64(R0), (R11, R12)
	ADCS	R11, R5
	ADCS	R12, R6
	LDP	32(R0), (R7, R8)
	LDP	80(R0), (R13, R14)
	ADCS	R13, R7
	ADCS	R14, R8
	ADC	ZR, ZR, R22

	// Load yL in R9-R14, yH in R15-R17, R19-R21
	// (yH + yL) in R9-R14, destroys yH; carry out in R23
	LDP	0(R1), (R9, R10)
	LDP	48(R1), (R15, R16)
	ADDS	R15, R9
	ADCS	R16, R10
	LDP	16(R1), (R11, R12)
	LDP	64(R1), (R17, R19)
	ADCS	R17, R11
	ADCS	R19, R12
	LDP	32(R1), (R13, R14)
	LDP	80(R1), (R20, R21)
	ADCS	R20, R13
	ADCS	R21, R14
	ADC	ZR, ZR, R23

	// Compute masks and combined carry:
	// R24 = 0 or all ones from the x-sum carry, R25 likewise from the
	// y-sum carry, R22 = AND of the two carries.
	SUB	R22, ZR, R24
	SUB	R23, ZR, R25
	AND	R23, R22

	// Store the upper three limbs of both sums in z (x sum at 0(R2),
	// y sum at 24(R2)) so mul384x384karatsuba can reload them from memory;
	// it doesn't have enough registers to keep them.
	// Meanwhile compute the x sum masked by the y carry in R15-R17, R19-R21.
	STP	(R6, R7), 0(R2)
	AND	R25, R3, R15
	AND	R25, R4, R16
	STP	(R8, R12), 16(R2)
	AND	R25, R5, R17
	AND	R25, R6, R19
	STP	(R13, R14), 32(R2)
	AND	R25, R7, R20
	AND	R25, R8, R21

	// Add the y sum masked by the x carry (via R25/R26) to the above.
	// The six-limb total is parked at 96(R2)..136(R2).
	AND	R24, R9, R25
	AND	R24, R10, R26
	ADDS	R25, R15
	ADCS	R26, R16
	STP	(R15, R16), 96(R2)
	AND	R24, R11, R25
	AND	R24, R12, R26
	ADCS	R25, R17
	ADCS	R26, R19
	STP	(R17, R19), 112(R2)
	AND	R24, R13, R25
	AND	R24, R14, R26
	ADCS	R25, R20
	ADCS	R26, R21
	STP	(R20, R21), 128(R2)
	// Keep the carry word in R29: the macro uses every other free register.
	ADC	ZR, R22, R29

	// (xH + xL) * (yH + yL)
	mul384x384karatsuba(0(R2), 24(R2), 48(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	// Load the parked masked sums and add them to the top half of the
	// product; store the result back in z. Each LDP of a parked limb pair
	// is issued before the STP that overwrites it, so the order of the
	// loads and stores below must be kept.
	STP	(R15, R16), 72(R2)
	LDP	96(R2), (R3, R4)
	ADDS	R3, R19
	STP	(R17, R19), 88(R2)
	ADCS	R4, R20
	LDP	112(R2), (R5, R6)
	ADCS	R5, R21
	STP	(R20, R21), 104(R2)
	ADCS	R6, R22
	LDP	128(R2), (R7, R8)
	ADCS	R7, R23
	STP	(R22, R23), 120(R2)
	ADCS	R8, R24
	MOVD	R24, 136(R2)
	ADC	ZR, R29

	// Load xL, yL
	LDP	0(R0), (R3, R4)
	LDP	16(R0), (R5, R6)
	LDP	32(R0), (R7, R8)
	LDP	0(R1), (R9, R10)
	LDP	16(R1), (R11, R12)
	LDP	32(R1), (R13, R14)

	// xL * yL: limbs 0-2 land at 0(R2), limbs 3-11 in R15-R17, R19-R24
	mul384x384karatsuba(24(R0), 24(R1), 0(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	// (xH + xL) * (yH + yL) - xL * yL in R3-R14
	LDP	0(R2), (R12, R13)
	LDP	48(R2), (R3, R4)
	SUBS	R12, R3
	LDP	64(R2), (R5, R6)
	MOVD	16(R2), R14
	SBCS	R13, R4
	SBCS	R14, R5
	LDP	80(R2), (R7, R8)
	SBCS	R15, R6
	SBCS	R16, R7
	LDP	96(R2), (R9, R10)
	SBCS	R17, R8
	SBCS	R19, R9
	LDP	112(R2), (R11, R12)
	SBCS	R20, R10
	SBCS	R21, R11
	LDP	128(R2), (R13, R14)
	SBCS	R22, R12
	SBCS	R23, R13
	SBCS	R24, R14
	SBC	ZR, R29

	// Limbs 3-5 of xL * yL are final: store them.
	STP	(R15, R16), 24(R2)
	MOVD	R17, 40(R2)

	// ((xH + xL) * (yH + yL) - xL * yL) * 2^384 + xL * yL and store back in z
	ADDS	R19, R3
	ADCS	R20, R4
	STP	(R3, R4), 48(R2)
	ADCS	R21, R5
	ADCS	R22, R6
	STP	(R5, R6), 64(R2)
	ADCS	R23, R7
	ADCS	R24, R8
	STP	(R7, R8), 80(R2)
	ADCS	ZR, R9
	ADCS	ZR, R10
	STP	(R9, R10), 96(R2)
	ADCS	ZR, R11
	ADCS	ZR, R12
	STP	(R11, R12), 112(R2)
	ADCS	ZR, R13
	ADCS	ZR, R14
	STP	(R13, R14), 128(R2)
	ADC	ZR, R29

	// Load xH, yH
	LDP	48(R0), (R3, R4)
	LDP	64(R0), (R5, R6)
	LDP	80(R0), (R7, R8)
	LDP	48(R1), (R9, R10)
	LDP	64(R1), (R11, R12)
	LDP	80(R1), (R13, R14)

	// xH * yH: limbs 0-2 land at 144(R2), limbs 3-11 in R15-R17, R19-R24
	mul384x384karatsuba(72(R0), 72(R1), 144(R2), R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, R13, R14, R15, R16, R17, R19, R20, R21, R22, R23, R24, R25, R26)

	LDP	144(R2), (R12, R13)
	MOVD	160(R2), R14

	// (xH + xL) * (yH + yL) - xL * yL - xH * yH in R3-R14
	// Store lower half in z, that's done
	LDP	48(R2), (R3, R4)
	SUBS	R12, R3
	LDP	64(R2), (R5, R6)
	SBCS	R13, R4
	SBCS	R14, R5
	LDP	80(R2), (R7, R8)
	SBCS	R15, R6
	SBCS	R16, R7
	LDP	96(R2), (R9, R10)
	SBCS	R17, R8
	SBCS	R19, R9
	LDP	112(R2), (R11, R12)
	SBCS	R20, R10
	SBCS	R21, R11
	LDP	128(R2), (R13, R14)
	SBCS	R22, R12
	SBCS	R23, R13
	STP	(R3, R4), 48(R2)
	SBCS	R24, R14
	STP	(R5, R6), 64(R2)
	SBC	ZR, R29
	STP	(R7, R8), 80(R2)

	// (xH * yH) * 2^768 + ((xH + xL) * (yH + yL) - xL * yL - xH * yH) * 2^384 + xL * yL
	// Store remaining limbs in z
	LDP	144(R2), (R3, R4)
	MOVD	160(R2), R5

	ADDS	R3, R9
	ADCS	R4, R10
	STP	(R9, R10), 96(R2)
	ADCS	R5, R11
	ADCS	R15, R12
	STP	(R11, R12), 112(R2)
	ADCS	R16, R13
	ADCS	R17, R14
	STP	(R13, R14), 128(R2)

	// Fold the carry word kept in R29 into the top six limbs.
	ADCS	R29, R19
	ADCS	ZR, R20
	STP	(R19, R20), 144(R2)
	ADCS	ZR, R21
	ADCS	ZR, R22
	STP	(R21, R22), 160(R2)
	ADCS	ZR, R23
	ADC	ZR, R24
	STP	(R23, R24), 176(R2)

	RET
|
|
|
|
// fp751MontgomeryReduce(z, x)
//
// Reads the 24 limbs at x and writes 12 limbs to z. The routine proceeds one
// input limb ("xN iteration") at a time: each iteration accumulates the
// products z_i * p_j with i + j = N into a three-word accumulator, adds limb
// N of x, emits the low accumulator word and rotates the accumulator.
// NOTE(review): by its name this is the Montgomery reduction modulo p751 with
// p751p1 = p751 + 1; the constants themselves are defined outside this file
// section - confirm against their definition.
//
//   R0 = z, R1 = x
//   R2-R13   z0-z11. During iterations x5-x11 these receive the multiplier
//            limbs (z0-z4 are x0-x4 unchanged); from x12 on each register is
//            overwritten with a final result limb once its multiplier is no
//            longer needed.
//   R14-R17, R29, R19, R20
//            limbs of p751p1 at byte offsets 40, 48, 56, 64, 72, 80, 88.
//            Limbs 0-4 are never loaded, so the code treats them as zero.
//   R21      current limb of x
//   R22/R23  low/high word of the current product
//   R24-R26  rotating accumulator
//
// The frame size is deliberately non-zero. R29 is the Go frame pointer on
// arm64 and is used here as a plain data register because no other register
// is free; with a frame, the assembler-generated prologue/epilogue saves R29
// and restores it before returning. With a $0 frame the caller would get
// back a clobbered frame pointer (the pattern go vet's framepointer check
// reports).
TEXT ·fp751MontgomeryReduce(SB), NOSPLIT, $16-16
	MOVD	z+0(FP), R0
	MOVD	x+8(FP), R1

	// Load p751+1 in R14-R17, R29, R19-R20, spread over arithmetic
	LDP	·p751p1+40(SB), (R14, R15)
	// z0-z11 will be R2-R13
	// Load x0-x4 to z0-z4 and x5, spread over arithmetic
	LDP	0(R1), (R2, R3)

	// x5 iteration
	MUL	R2, R14, R22
	LDP	32(R1), (R6, R21)
	UMULH	R2, R14, R23
	ADDS	R21, R22, R7	// Set z5
	ADC	ZR, R23, R25

	// x6 iteration (accumulator R25, R26, R24)
	MUL	R2, R15, R22
	MOVD	48(R1), R21
	UMULH	R2, R15, R23
	ADDS	R22, R25
	ADC	R23, ZR, R26

	MUL	R3, R14, R22
	LDP	·p751p1+56(SB), (R16, R17)
	UMULH	R3, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	ADDS	R21, R25, R8	// Set z6
	ADCS	ZR, R26
	ADC	ZR, R24

	// x7 iteration (accumulator R26, R24, R25)
	MUL	R2, R16, R22
	MOVD	56(R1), R21
	UMULH	R2, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R3, R15, R22
	LDP	16(R1), (R4, R5)
	UMULH	R3, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R4, R14, R22
	LDP	·p751p1+72(SB), (R29, R19)
	UMULH	R4, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R9	// Set z7
	ADCS	ZR, R24
	ADC	ZR, R25

	// x8 iteration (accumulator R24, R25, R26)
	MUL	R2, R17, R22
	MOVD	64(R1), R21
	UMULH	R2, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R3, R16, R22
	MOVD	·p751p1+88(SB), R20
	UMULH	R3, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R4, R15, R22
	UMULH	R4, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R5, R14, R22
	UMULH	R5, R14, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R24, R21, R10	// Set z8
	ADCS	ZR, R25
	ADC	ZR, R26

	// x9 iteration (accumulator R25, R26, R24)
	MUL	R2, R29, R22
	MOVD	72(R1), R21
	UMULH	R2, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R3, R17, R22
	UMULH	R3, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R4, R16, R22
	UMULH	R4, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R5, R15, R22
	UMULH	R5, R15, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R6, R14, R22
	UMULH	R6, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R11	// Set z9
	ADCS	ZR, R26
	ADC	ZR, R24

	// x10 iteration (accumulator R26, R24, R25)
	MUL	R2, R19, R22
	MOVD	80(R1), R21
	UMULH	R2, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R3, R29, R22
	UMULH	R3, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R4, R17, R22
	UMULH	R4, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R5, R16, R22
	UMULH	R5, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R6, R15, R22
	UMULH	R6, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R7, R14, R22
	UMULH	R7, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R12	// Set z10
	ADCS	ZR, R24
	ADC	ZR, R25

	// x11 iteration (accumulator R24, R25, R26)
	MUL	R2, R20, R22
	MOVD	88(R1), R21
	UMULH	R2, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R3, R19, R22
	UMULH	R3, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R4, R29, R22
	UMULH	R4, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R5, R17, R22
	UMULH	R5, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R6, R16, R22
	UMULH	R6, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R7, R15, R22
	UMULH	R7, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R8, R14, R22
	UMULH	R8, R14, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R13	// Set z11
	ADCS	ZR, R25
	ADC	ZR, R26

	// x12 iteration (accumulator R25, R26, R24)
	// From here on the multiplier z0 is dead, so R2 takes result limb 0.
	MUL	R3, R20, R22
	MOVD	96(R1), R21
	UMULH	R3, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R4, R19, R22
	UMULH	R4, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R5, R29, R22
	UMULH	R5, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R6, R17, R22
	UMULH	R6, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R7, R16, R22
	UMULH	R7, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R8, R15, R22
	UMULH	R8, R15, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R9, R14, R22
	UMULH	R9, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R2	// Set z0
	ADCS	ZR, R26
	ADC	ZR, R24

	// x13 iteration (accumulator R26, R24, R25)
	MUL	R4, R20, R22
	MOVD	104(R1), R21
	UMULH	R4, R20, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R5, R19, R22
	UMULH	R5, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R6, R29, R22
	UMULH	R6, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R7, R17, R22
	UMULH	R7, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R8, R16, R22
	UMULH	R8, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R9, R15, R22
	UMULH	R9, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R10, R14, R22
	UMULH	R10, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R3	// Set z1
	STP	(R2, R3), 0(R0)	// STP leaves the flags alone; the carry chain continues
	ADCS	ZR, R24
	ADC	ZR, R25

	// x14 iteration (accumulator R24, R25, R26)
	MUL	R5, R20, R22
	MOVD	112(R1), R21
	UMULH	R5, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R6, R19, R22
	UMULH	R6, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R7, R29, R22
	UMULH	R7, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R8, R17, R22
	UMULH	R8, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R9, R16, R22
	UMULH	R9, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R10, R15, R22
	UMULH	R10, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R11, R14, R22
	UMULH	R11, R14, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R4	// Set z2
	ADCS	ZR, R25
	ADC	ZR, R26

	// x15 iteration (accumulator R25, R26, R24)
	MUL	R6, R20, R22
	MOVD	120(R1), R21
	UMULH	R6, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R7, R19, R22
	UMULH	R7, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R8, R29, R22
	UMULH	R8, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R9, R17, R22
	UMULH	R9, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R10, R16, R22
	UMULH	R10, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R11, R15, R22
	UMULH	R11, R15, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R12, R14, R22
	UMULH	R12, R14, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R5	// Set z3
	STP	(R4, R5), 16(R0)
	ADCS	ZR, R26
	ADC	ZR, R24

	// x16 iteration (accumulator R26, R24, R25)
	MUL	R7, R20, R22
	MOVD	128(R1), R21
	UMULH	R7, R20, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R8, R19, R22
	UMULH	R8, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R9, R29, R22
	UMULH	R9, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R10, R17, R22
	UMULH	R10, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R11, R16, R22
	UMULH	R11, R16, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R12, R15, R22
	UMULH	R12, R15, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R13, R14, R22
	UMULH	R13, R14, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R6	// Set z4
	ADCS	ZR, R24
	ADC	ZR, R25

	// x17 iteration (accumulator R24, R25, R26)
	MUL	R8, R20, R22
	MOVD	136(R1), R21
	UMULH	R8, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R9, R19, R22
	UMULH	R9, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R10, R29, R22
	UMULH	R10, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R11, R17, R22
	UMULH	R11, R17, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R12, R16, R22
	UMULH	R12, R16, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R13, R15, R22
	UMULH	R13, R15, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R7	// Set z5
	STP	(R6, R7), 32(R0)
	ADCS	ZR, R25
	ADC	ZR, R26

	// x18 iteration (accumulator R25, R26, R24)
	MUL	R9, R20, R22
	MOVD	144(R1), R21
	UMULH	R9, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R10, R19, R22
	UMULH	R10, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R11, R29, R22
	UMULH	R11, R29, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R12, R17, R22
	UMULH	R12, R17, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	MUL	R13, R16, R22
	UMULH	R13, R16, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R8	// Set z6
	ADCS	ZR, R26
	ADC	ZR, R24

	// x19 iteration (accumulator R26, R24, R25)
	MUL	R10, R20, R22
	MOVD	152(R1), R21
	UMULH	R10, R20, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, ZR, R25

	MUL	R11, R19, R22
	UMULH	R11, R19, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R12, R29, R22
	UMULH	R12, R29, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	MUL	R13, R17, R22
	UMULH	R13, R17, R23
	ADDS	R22, R26
	ADCS	R23, R24
	ADC	ZR, R25

	ADDS	R21, R26, R9	// Set z7
	STP	(R8, R9), 48(R0)
	ADCS	ZR, R24
	ADC	ZR, R25

	// x20 iteration (accumulator R24, R25, R26)
	MUL	R11, R20, R22
	MOVD	160(R1), R21
	UMULH	R11, R20, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, ZR, R26

	MUL	R12, R19, R22
	UMULH	R12, R19, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	MUL	R13, R29, R22
	UMULH	R13, R29, R23
	ADDS	R22, R24
	ADCS	R23, R25
	ADC	ZR, R26

	ADDS	R21, R24, R10	// Set z8
	ADCS	ZR, R25
	ADC	ZR, R26

	// x21 iteration (accumulator R25, R26, R24)
	MUL	R12, R20, R22
	MOVD	168(R1), R21
	UMULH	R12, R20, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, ZR, R24

	MUL	R13, R19, R22
	UMULH	R13, R19, R23
	ADDS	R22, R25
	ADCS	R23, R26
	ADC	ZR, R24

	ADDS	R21, R25, R11	// Set z9
	STP	(R10, R11), 64(R0)
	ADCS	ZR, R26
	ADC	ZR, R24

	// x22 iteration: last product, then x22 and x23 are folded in.
	// The carry out of the final ADC is discarded.
	MUL	R13, R20, R22
	MOVD	176(R1), R21
	UMULH	R13, R20, R23
	ADDS	R22, R26
	ADC	R23, R24
	ADDS	R21, R26, R12	// Set z10

	MOVD	184(R1), R21
	ADC	R21, R24, R13	// Set z11
	STP	(R12, R13), 80(R0)

	RET
|
|
|
|
// fp751StrongReduce(x)
//
// In place: subtracts the constant p751 from the twelve-limb value at x and,
// if that subtraction borrowed, adds p751 back. The result overwrites x.
//   R0      x
//   R1-R12  value limbs, limb 0 in R1 ... limb 11 in R12
//   R13-R17, R19-R21  limbs of p751 (R18 is never used)
//   R22     correction mask
TEXT ·fp751StrongReduce(SB), NOSPLIT, $0-8
	MOVD	x+0(FP), R0

	// Fetch the value.
	LDP	0(R0), (R1, R2)
	LDP	16(R0), (R3, R4)
	LDP	32(R0), (R5, R6)
	LDP	48(R0), (R7, R8)
	LDP	64(R0), (R9, R10)
	LDP	80(R0), (R11, R12)

	// Fetch p751. Only limbs 0 and 5-11 are loaded: R13 (limb 0) is
	// reused for limbs 1-4, so the code relies on limbs 0-4 being equal.
	MOVD	·p751+0(SB), R13
	LDP	·p751+40(SB), (R14, R15)
	LDP	·p751+56(SB), (R16, R17)
	LDP	·p751+72(SB), (R19, R20)
	MOVD	·p751+88(SB), R21

	// x - p751, borrow propagated through all twelve limbs.
	SUBS	R13, R1
	SBCS	R13, R2
	SBCS	R13, R3
	SBCS	R13, R4
	SBCS	R13, R5
	SBCS	R14, R6
	SBCS	R15, R7
	SBCS	R16, R8
	SBCS	R17, R9
	SBCS	R19, R10
	SBCS	R20, R11
	SBCS	R21, R12
	// R22 = 0 when the subtraction did not borrow, all ones when it did.
	SBC	ZR, ZR, R22

	// Keep p751 only when the subtraction has to be undone.
	AND	R22, R13
	AND	R22, R14
	AND	R22, R15
	AND	R22, R16
	AND	R22, R17
	AND	R22, R19
	AND	R22, R20
	AND	R22, R21

	ADDS	R13, R1
	ADCS	R13, R2
	ADCS	R13, R3
	ADCS	R13, R4
	ADCS	R13, R5
	ADCS	R14, R6
	ADCS	R15, R7
	ADCS	R16, R8
	ADCS	R17, R9
	ADCS	R19, R10
	ADCS	R20, R11
	ADC	R21, R12

	// Write the result back over x.
	STP	(R1, R2), 0(R0)
	STP	(R3, R4), 16(R0)
	STP	(R5, R6), 32(R0)
	STP	(R7, R8), 48(R0)
	STP	(R9, R10), 64(R0)
	STP	(R11, R12), 80(R0)

	RET
|