1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-27 09:31:23 +00:00
nobs/dh/sidh/p503/arith_amd64.s

1718 lines
39 KiB
Go assembly (amd64, Plan 9 syntax)
Raw Normal View History

// +build amd64,!noasm
#include "textflag.h"
// p503
// Limbs 0-2 are all-ones, so code throughout this file loads P503_0 once
// and reuses that register for limbs 1 and 2.
#define P503_0 $0xFFFFFFFFFFFFFFFF
#define P503_1 $0xFFFFFFFFFFFFFFFF
#define P503_2 $0xFFFFFFFFFFFFFFFF
#define P503_3 $0xABFFFFFFFFFFFFFF
#define P503_4 $0x13085BDA2211E7A0
#define P503_5 $0x1B9BF6C87B7E7DAF
#define P503_6 $0x6045C6BDDA77A4D0
#define P503_7 $0x004066F541811E1E
// p503+1
// Limbs 0-2 of p503+1 are zero, hence no P503P1_{0,1,2}; multiplications by
// p503+1 start at limb 3 (byte offset 24).
#define P503P1_3 $0xAC00000000000000
#define P503P1_4 $0x13085BDA2211E7A0
#define P503P1_5 $0x1B9BF6C87B7E7DAF
#define P503P1_6 $0x6045C6BDDA77A4D0
#define P503P1_7 $0x004066F541811E1E
// p503x2
// Limbs 1 and 2 are equal (all-ones); code reuses one register for both.
#define P503X2_0 $0xFFFFFFFFFFFFFFFE
#define P503X2_1 $0xFFFFFFFFFFFFFFFF
#define P503X2_2 $0xFFFFFFFFFFFFFFFF
#define P503X2_3 $0x57FFFFFFFFFFFFFF
#define P503X2_4 $0x2610B7B44423CF41
#define P503X2_5 $0x3737ED90F6FCFB5E
#define P503X2_6 $0xC08B8D7BB4EF49A0
#define P503X2_7 $0x0080CDEA83023C3C
// Conventional argument registers used by the functions below.
#define REG_P1 DI
#define REG_P2 SI
#define REG_P3 DX
// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses MULX instruction. Macro smashes value in DX.
// Input: I0 and I1.
// Output: O
// All the other arguments are registers, used for storing temporary values
// NOTE: instruction order is load-bearing — the ADD/ADC carry chains depend on
// flags produced by the immediately preceding instructions; do not reorder.
#define MULS256_MULX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
MOVQ I0, DX \
MULXQ I1, T1, T0 \ // T0:T1 = U0*V0
MOVQ T1, O \ // O[0]
MULXQ 8+I1, T2, T1 \ // T1:T2 = U0*V1
ADDQ T2, T0 \
MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
ADCQ T3, T1 \
MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
ADCQ T4, T2 \
\ // Column U1
MOVQ 8+I0, DX \
ADCQ $0, T3 \
MULXQ 0+I1, T4, T5 \ // T5:T4 = U1*V0
MULXQ 8+I1, T7, T6 \ // T6:T7 = U1*V1
ADDQ T7, T5 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
ADCQ T8, T6 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
ADCQ T9, T7 \
ADCQ $0, T8 \
ADDQ T0, T4 \
MOVQ T4, 8+O \ // O[1]
ADCQ T1, T5 \
ADCQ T2, T6 \
ADCQ T3, T7 \
\ // Column U2
MOVQ 16+I0, DX \
ADCQ $0, T8 \
MULXQ 0+I1, T0, T1 \ // T1:T0 = U2*V0
MULXQ 8+I1, T3, T2 \ // T2:T3 = U2*V1
ADDQ T3, T1 \
MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
ADCQ T4, T2 \
MULXQ 24+I1, T9, T4 \ // T4:T9 = U2*V3
ADCQ T9, T3 \
\ // Column U3
MOVQ 24+I0, DX \
ADCQ $0, T4 \
ADDQ T5, T0 \
MOVQ T0, 16+O \ // O[2]
ADCQ T6, T1 \
ADCQ T7, T2 \
ADCQ T8, T3 \
ADCQ $0, T4 \
MULXQ 0+I1, T0, T5 \ // T5:T0 = U3*V0
MULXQ 8+I1, T7, T6 \ // T6:T7 = U3*V1
ADDQ T7, T5 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
ADCQ T8, T6 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
ADCQ T9, T7 \
ADCQ $0, T8 \
\ // Add values in remaining columns
ADDQ T0, T1 \
MOVQ T1, 24+O \ // O[3]
ADCQ T5, T2 \
MOVQ T2, 32+O \ // O[4]
ADCQ T6, T3 \
MOVQ T3, 40+O \ // O[5]
ADCQ T7, T4 \
MOVQ T4, 48+O \ // O[6]
ADCQ $0, T8 \ // O[7]
MOVQ T8, 56+O
// Performs schoolbook multiplication of 2 256-bit numbers. This optimized version
// uses ADOX, ADCX and MULX instructions. Macro smashes values in AX and DX.
// Input: I0 and I1.
// Output: O
// All the other argument registers are used for storing temporary values.
// Two independent carry chains are interleaved: ADOXQ accumulates via OF,
// ADCXQ via CF. XORQ AX, AX clears both flags to restart the chains.
// Instruction order is load-bearing; do not reorder.
#define MULS256_MULX_ADCX_ADOX(O, I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
\ // U0[0]
MOVQ 0+I0, DX \ // MULX requires multiplier in DX
\ // T0:T1 = I1*DX
MULXQ I1, T1, T0 \ // T0:T1 = U0*V0 (low:high)
MOVQ T1, O \ // O[0]
MULXQ 8+I1, T2, T1 \ // T1:T2 = U0*V1
XORQ AX, AX \
ADOXQ T2, T0 \
MULXQ 16+I1, T3, T2 \ // T2:T3 = U0*V2
ADOXQ T3, T1 \
MULXQ 24+I1, T4, T3 \ // T3:T4 = U0*V3
ADOXQ T4, T2 \
\ // Column U1
MOVQ 8+I0, DX \
MULXQ I1, T4, T5 \ // T5:T4 = U1*V0
ADOXQ AX, T3 \
XORQ AX, AX \
MULXQ 8+I1, T7, T6 \ // T6:T7 = U1*V1
ADOXQ T0, T4 \
MOVQ T4, 8+O \ // O[1]
ADCXQ T7, T5 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U1*V2
ADCXQ T8, T6 \
ADOXQ T1, T5 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U1*V3
ADCXQ T9, T7 \
ADCXQ AX, T8 \
ADOXQ T2, T6 \
\ // Column U2
MOVQ 16+I0, DX \
MULXQ I1, T0, T1 \ // T1:T0 = U2*V0
ADOXQ T3, T7 \
ADOXQ AX, T8 \
XORQ AX, AX \
MULXQ 8+I1, T3, T2 \ // T2:T3 = U2*V1
ADOXQ T5, T0 \
MOVQ T0, 16+O \ // O[2]
ADCXQ T3, T1 \
MULXQ 16+I1, T4, T3 \ // T3:T4 = U2*V2
ADCXQ T4, T2 \
ADOXQ T6, T1 \
MULXQ 24+I1, T9, T4 \ // T4:T9 = U2*V3
ADCXQ T9, T3 \
MOVQ 24+I0, DX \
ADCXQ AX, T4 \
\
ADOXQ T7, T2 \
ADOXQ T8, T3 \
ADOXQ AX, T4 \
\ // Column U3
MULXQ I1, T0, T5 \ // T5:T0 = U3*V0
XORQ AX, AX \
MULXQ 8+I1, T7, T6 \ // T6:T7 = U3*V1
ADCXQ T7, T5 \
ADOXQ T0, T1 \
MULXQ 16+I1, T8, T7 \ // T7:T8 = U3*V2
ADCXQ T8, T6 \
ADOXQ T5, T2 \
MULXQ 24+I1, T9, T8 \ // T8:T9 = U3*V3
ADCXQ T9, T7 \
ADCXQ AX, T8 \
\
ADOXQ T6, T3 \
ADOXQ T7, T4 \
ADOXQ AX, T8 \
MOVQ T1, 24+O \ // O[3]
MOVQ T2, 32+O \ // O[4]
MOVQ T3, 40+O \ // O[5]
MOVQ T4, 48+O \ // O[6] and O[7] below
MOVQ T8, 56+O
// Template of a macro that performs schoolbook multiplication of 128-bit with 320-bit
// number. It uses the MULX instruction. This template must be customized with functions
// performing ADD (add1, add2) and ADD-with-carry (adc1, adc2). addX/adcX may or may
// not be instructions that use two independent carry chains.
// Input:
// * I0 128-bit number
// * I1 320-bit number (name of a static symbol; I1+24(SB)..I1+56(SB) address
//   its limbs 3..7 — limbs 0..2 of p503+1 are zero and are skipped)
// * add1, add2: instruction performing integer addition and starting carry chain
// * adc1, adc2: instruction performing integer addition with carry
// Output: T[0-6] registers. Smashes AX and DX.
#define MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, add1, add2, adc1, adc2) \
\ // Column 0
MOVQ I0, DX \
MULXQ I1+24(SB), T0, T1 \
MULXQ I1+32(SB), T4, T2 \
XORQ AX, AX \
MULXQ I1+40(SB), T5, T3 \
add1 T4, T1 \
adc1 T5, T2 \
MULXQ I1+48(SB), T7, T4 \
adc1 T7, T3 \
MULXQ I1+56(SB), T6, T5 \
adc1 T6, T4 \
adc1 AX, T5 \
\ // Column 1
MOVQ 8+I0, DX \
MULXQ I1+24(SB), T6, T7 \
add2 T6, T1 \
adc2 T7, T2 \
MULXQ I1+32(SB), T8, T6 \
adc2 T6, T3 \
MULXQ I1+40(SB), T7, T9 \
adc2 T9, T4 \
MULXQ I1+48(SB), T9, T6 \
adc2 T6, T5 \
MULXQ I1+56(SB), DX, T6 \
adc2 AX, T6 \
\ // Output
XORQ AX, AX \
add1 T8, T2 \
adc1 T7, T3 \
adc1 T9, T4 \
adc1 DX, T5 \
adc1 AX, T6
// Multiplies 128-bit with 320-bit integer. Optimized with MULX instruction.
// Single carry chain (plain ADDQ/ADCQ) for CPUs without ADX.
#define MULS_128x320_MULX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADDQ, ADDQ, ADCQ, ADCQ)
// Multiplies 128-bit with 320-bit integer. Optimized with MULX, ADOX and ADCX
// instructions (two independent carry chains: ADOXQ via OF, ADCXQ via CF).
#define MULS_128x320_MULX_ADCX_ADOX(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9) \
MULS_128x320(I0, I1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, ADOXQ, ADCXQ, ADOXQ, ADCXQ)
// Template of a macro performing multiplication of two 512-bit numbers. It uses one
// level of Karatsuba and one level of schoolbook multiplication. Template must be
// customized with macro performing schoolbook multiplication.
// Input:
// * I0, I1 - two 512-bit numbers
// * MULS - either MULS256_MULX or MULS256_MULX_ADCX_ADOX
// Output: OUT - 1024-bit long
// Uses stack slots SP[0..103] (96(SP) saves BP), so the enclosing TEXT must
// declare a frame of at least 104 bytes.
#define MUL(OUT, I0, I1, MULS) \
\ // R[8-11]: U1+U0
XORQ AX, AX \
MOVQ ( 0)(I0), R8 \
MOVQ ( 8)(I0), R9 \
MOVQ (16)(I0), R10 \
MOVQ (24)(I0), R11 \
ADDQ (32)(I0), R8 \
ADCQ (40)(I0), R9 \
ADCQ (48)(I0), R10 \
ADCQ (56)(I0), R11 \
SBBQ $0, AX \ // store mask
MOVQ R8, ( 0)(SP) \
MOVQ R9, ( 8)(SP) \
MOVQ R10, (16)(SP) \
MOVQ R11, (24)(SP) \
\
\ // R[12-15]: V1+V0
XORQ BX, BX \
MOVQ ( 0)(I1), R12 \
MOVQ ( 8)(I1), R13 \
MOVQ (16)(I1), R14 \
MOVQ (24)(I1), R15 \
ADDQ (32)(I1), R12 \
ADCQ (40)(I1), R13 \
ADCQ (48)(I1), R14 \
ADCQ (56)(I1), R15 \
SBBQ $0, BX \ // store mask
MOVQ R12, (32)(SP) \
MOVQ R13, (40)(SP) \
MOVQ R14, (48)(SP) \
MOVQ R15, (56)(SP) \
\ // Mask (V1+V0) mod 2^256 with AX: kept only if U1+U0 set the carry flag
ANDQ AX, R12 \
ANDQ AX, R13 \
ANDQ AX, R14 \
ANDQ AX, R15 \
\ // Mask (U1+U0) mod 2^256 with BX: kept only if V1+V0 set the carry flag
ANDQ BX, R8 \
ANDQ BX, R9 \
ANDQ BX, R10 \
ANDQ BX, R11 \
\ // res = masked(U0+U1) + masked(V0 + V1)
ADDQ R12, R8 \
ADCQ R13, R9 \
ADCQ R14, R10 \
ADCQ R15, R11 \
\ // SP[64-96] <- res
MOVQ R8, (64)(SP) \
MOVQ R9, (72)(SP) \
MOVQ R10, (80)(SP) \
MOVQ R11, (88)(SP) \
\ // BP will be used for schoolbook multiplication below
MOVQ BP, 96(SP) \
\ // (U1+U0)*(V1+V0)
MULS((64)(OUT), 0(SP), 32(SP), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
\ // U0 x V0
MULS(0(OUT), 0(I0), 0(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
\ // U1 x V1
MULS(0(SP), 32(I0), 32(I1), R8, R9, R10, R11, R12, R13, R14, R15, BX, BP) \
\ // Recover BP
MOVQ 96(SP), BP \
\ // Final part of schoolbook multiplication; R[8-11] = (U0+U1) x (V0+V1)
MOVQ (64)(SP), R8 \
MOVQ (72)(SP), R9 \
MOVQ (80)(SP), R10 \
MOVQ (88)(SP), R11 \
MOVQ (96)(OUT), AX \
ADDQ AX, R8 \
MOVQ (104)(OUT), AX \
ADCQ AX, R9 \
MOVQ (112)(OUT), AX \
ADCQ AX, R10 \
MOVQ (120)(OUT), AX \
ADCQ AX, R11 \
\ // R[12-15, 8-11] = (U0+U1) x (V0+V1) - U0xV0
MOVQ (64)(OUT), R12 \
MOVQ (72)(OUT), R13 \
MOVQ (80)(OUT), R14 \
MOVQ (88)(OUT), R15 \
SUBQ ( 0)(OUT), R12 \
SBBQ ( 8)(OUT), R13 \
SBBQ (16)(OUT), R14 \
SBBQ (24)(OUT), R15 \
SBBQ (32)(OUT), R8 \
SBBQ (40)(OUT), R9 \
SBBQ (48)(OUT), R10 \
SBBQ (56)(OUT), R11 \
\ // r8-r15 <- (U0+U1) x (V0+V1) - U0xV0 - U1xV1
SUBQ ( 0)(SP), R12 \
SBBQ ( 8)(SP), R13 \
SBBQ (16)(SP), R14 \
SBBQ (24)(SP), R15 \
SBBQ (32)(SP), R8 \
SBBQ (40)(SP), R9 \
SBBQ (48)(SP), R10 \
SBBQ (56)(SP), R11 \
\ // Add the middle term into OUT[32..88] and ripple the carry to OUT[96..120]
; ADDQ (32)(OUT), R12; MOVQ R12, ( 32)(OUT) \
; ADCQ (40)(OUT), R13; MOVQ R13, ( 40)(OUT) \
; ADCQ (48)(OUT), R14; MOVQ R14, ( 48)(OUT) \
; ADCQ (56)(OUT), R15; MOVQ R15, ( 56)(OUT) \
MOVQ ( 0)(SP), AX; ADCQ AX, R8; MOVQ R8, ( 64)(OUT) \
MOVQ ( 8)(SP), AX; ADCQ AX, R9; MOVQ R9, ( 72)(OUT) \
MOVQ (16)(SP), AX; ADCQ AX, R10; MOVQ R10, ( 80)(OUT) \
MOVQ (24)(SP), AX; ADCQ AX, R11; MOVQ R11, ( 88)(OUT) \
MOVQ (32)(SP), R12; ADCQ $0, R12; MOVQ R12, ( 96)(OUT) \
MOVQ (40)(SP), R13; ADCQ $0, R13; MOVQ R13, (104)(OUT) \
MOVQ (48)(SP), R14; ADCQ $0, R14; MOVQ R14, (112)(OUT) \
MOVQ (56)(SP), R15; ADCQ $0, R15; MOVQ R15, (120)(OUT)
// Template for calculating the Montgomery reduction algorithm described in
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
// customized with schoolbook multiplication for 128 x 320-bit number.
// This macro reuses memory of IN value and *changes* it. Smashes registers
// R[8-15], BX, CX (and AX, DX via the MULS macro).
// Input:
// * IN: 1024-bit number to be reduced
// * MULS: either MULS_128x320_MULX or MULS_128x320_MULX_ADCX_ADOX
// Output: OUT 512-bit
#define REDC(OUT, IN, MULS) \
MULS(0(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
XORQ R15, R15 \
ADDQ (24)(IN), R8 \
ADCQ (32)(IN), R9 \
ADCQ (40)(IN), R10 \
ADCQ (48)(IN), R11 \
ADCQ (56)(IN), R12 \
ADCQ (64)(IN), R13 \
ADCQ (72)(IN), R14 \
ADCQ (80)(IN), R15 \
MOVQ R8, (24)(IN) \
MOVQ R9, (32)(IN) \
MOVQ R10, (40)(IN) \
MOVQ R11, (48)(IN) \
MOVQ R12, (56)(IN) \
MOVQ R13, (64)(IN) \
MOVQ R14, (72)(IN) \
MOVQ R15, (80)(IN) \
MOVQ (88)(IN), R8 \
MOVQ (96)(IN), R9 \
MOVQ (104)(IN), R10 \
MOVQ (112)(IN), R11 \
MOVQ (120)(IN), R12 \
ADCQ $0, R8 \
ADCQ $0, R9 \
ADCQ $0, R10 \
ADCQ $0, R11 \
ADCQ $0, R12 \
MOVQ R8, (88)(IN) \
MOVQ R9, (96)(IN) \
MOVQ R10, (104)(IN) \
MOVQ R11, (112)(IN) \
MOVQ R12, (120)(IN) \
\
MULS(16(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
XORQ R15, R15 \
ADDQ (40)(IN), R8 \
ADCQ (48)(IN), R9 \
ADCQ (56)(IN), R10 \
ADCQ (64)(IN), R11 \
ADCQ (72)(IN), R12 \
ADCQ (80)(IN), R13 \
ADCQ (88)(IN), R14 \
ADCQ (96)(IN), R15 \
MOVQ R8, (40)(IN) \
MOVQ R9, (48)(IN) \
MOVQ R10, (56)(IN) \
MOVQ R11, (64)(IN) \
MOVQ R12, (72)(IN) \
MOVQ R13, (80)(IN) \
MOVQ R14, (88)(IN) \
MOVQ R15, (96)(IN) \
MOVQ (104)(IN), R8 \
MOVQ (112)(IN), R9 \
MOVQ (120)(IN), R10 \
ADCQ $0, R8 \
ADCQ $0, R9 \
ADCQ $0, R10 \
MOVQ R8, (104)(IN) \
MOVQ R9, (112)(IN) \
MOVQ R10, (120)(IN) \
\
MULS(32(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
XORQ R15, R15 \
XORQ BX, BX \
ADDQ ( 56)(IN), R8 \
ADCQ ( 64)(IN), R9 \
ADCQ ( 72)(IN), R10 \
ADCQ ( 80)(IN), R11 \
ADCQ ( 88)(IN), R12 \
ADCQ ( 96)(IN), R13 \
ADCQ (104)(IN), R14 \
ADCQ (112)(IN), R15 \
ADCQ (120)(IN), BX \
MOVQ R8, ( 56)(IN) \
MOVQ R10, ( 72)(IN) \ // R9 ((64)(IN)) is not written back: it is OUT[0], and (64)(IN) is never read again
MOVQ R11, ( 80)(IN) \
MOVQ R12, ( 88)(IN) \
MOVQ R13, ( 96)(IN) \
MOVQ R14, (104)(IN) \
MOVQ R15, (112)(IN) \
MOVQ BX, (120)(IN) \
MOVQ R9, ( 0)(OUT) \ // Result: OUT[0]
\
MULS(48(IN), ·p503p1, R8, R9, R10, R11, R12, R13, R14, BX, CX, R15) \
ADDQ ( 72)(IN), R8 \
ADCQ ( 80)(IN), R9 \
ADCQ ( 88)(IN), R10 \
ADCQ ( 96)(IN), R11 \
ADCQ (104)(IN), R12 \
ADCQ (112)(IN), R13 \
ADCQ (120)(IN), R14 \
MOVQ R8, ( 8)(OUT) \ // Result: OUT[1]
MOVQ R9, (16)(OUT) \ // Result: OUT[2]
MOVQ R10, (24)(OUT) \ // Result: OUT[3]
MOVQ R11, (32)(OUT) \ // Result: OUT[4]
MOVQ R12, (40)(OUT) \ // Result: OUT[5]
MOVQ R13, (48)(OUT) \ // Result: OUT[6] and OUT[7]
MOVQ R14, (56)(OUT)
// fp503StrongReduce(x *fp503Element)
// Reduces x modulo p503 in place: computes x-p503, then conditionally adds
// p503 back if the subtraction borrowed. Constant-time (mask, no branch).
// NOTE(review): produces a canonical value in [0, p503) only when the input
// is below 2*p503 — confirm callers maintain that invariant.
TEXT ·fp503StrongReduce(SB), NOSPLIT, $0-8
MOVQ x+0(FP), REG_P1
// Zero AX for later use:
XORQ AX, AX
// Load p into registers:
MOVQ P503_0, R8
// P503_{1,2} = P503_0, so reuse R8
MOVQ P503_3, R9
MOVQ P503_4, R10
MOVQ P503_5, R11
MOVQ P503_6, R12
MOVQ P503_7, R13
// Set x <- x - p
SUBQ R8, ( 0)(REG_P1)
SBBQ R8, ( 8)(REG_P1)
SBBQ R8, (16)(REG_P1)
SBBQ R9, (24)(REG_P1)
SBBQ R10, (32)(REG_P1)
SBBQ R11, (40)(REG_P1)
SBBQ R12, (48)(REG_P1)
SBBQ R13, (56)(REG_P1)
// Save carry flag indicating x-p < 0 as a mask (AX = 0 or all-ones)
SBBQ $0, AX
// Conditionally add p to x if x-p < 0
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ANDQ AX, R12
ANDQ AX, R13
ADDQ R8, ( 0)(REG_P1)
ADCQ R8, ( 8)(REG_P1)
ADCQ R8, (16)(REG_P1)
ADCQ R9, (24)(REG_P1)
ADCQ R10,(32)(REG_P1)
ADCQ R11,(40)(REG_P1)
ADCQ R12,(48)(REG_P1)
ADCQ R13,(56)(REG_P1)
RET
// fp503ConditionalSwap(x, y *fp503Element, choice uint8)
// Swaps the 512-bit values at x and y iff choice == 1; constant-time
// (XOR-mask swap via SSE2, no data-dependent branches).
TEXT ·fp503ConditionalSwap(SB),NOSPLIT,$0-17
MOVQ x+0(FP), REG_P1
MOVQ y+8(FP), REG_P2
MOVBLZX choice+16(FP), AX // AL = 0 or 1
// Make AX a mask: either all bits set or none
// AX = 0 or 1
NEGQ AX
// Fill xmm15. After this step first half of XMM15 is
// just zeros and second half is whatever in AX
MOVQ AX, X15
// Copy lower double word everywhere else. So that
// XMM15=AL|AL|AL|AL. As AX has either all bits set
// or none, the result is that XMM15 also has either
// all bits set or none of them.
PSHUFD $0, X15, X15
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
MOVOU (idx*16)(REG_P1), X0 \
MOVOU (idx*16)(REG_P2), X1 \
\ // X2 = mask & (X0 ^ X1)
MOVO X1, X2 \
PXOR X0, X2 \
PAND X15, X2 \
\
PXOR X2, X0 \
PXOR X2, X1 \
\
MOVOU X0, (idx*16)(REG_P1) \
MOVOU X1, (idx*16)(REG_P2)
#endif
CSWAP_BLOCK(0)
CSWAP_BLOCK(1)
CSWAP_BLOCK(2)
CSWAP_BLOCK(3)
#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
RET
// fp503AddReduced(z, x, y *fp503Element)
// z = x + y mod 2*p503 (operands kept in the lazy-reduced range [0, 2*p503)).
// Computes x+y-2*p503 and conditionally adds 2*p503 back on borrow;
// constant-time (mask, no branch).
TEXT ·fp503AddReduced(SB),NOSPLIT,$0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Used later to calculate a mask
XORQ CX, CX
// [R8-R15]: z = x + y
MOVQ ( 0)(REG_P1), R8
MOVQ ( 8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
ADDQ ( 0)(REG_P2), R8
ADCQ ( 8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
// Subtract p503x2; P503X2_1 == P503X2_2, so AX is reused for limbs 1 and 2
MOVQ P503X2_0, AX
SUBQ AX, R8
MOVQ P503X2_1, AX
SBBQ AX, R9
SBBQ AX, R10
MOVQ P503X2_3, AX
SBBQ AX, R11
MOVQ P503X2_4, AX
SBBQ AX, R12
MOVQ P503X2_5, AX
SBBQ AX, R13
MOVQ P503X2_6, AX
SBBQ AX, R14
MOVQ P503X2_7, AX
SBBQ AX, R15
// mask (CX = 0 or all-ones, from the borrow)
SBBQ $0, CX
// move z to REG_P3
MOVQ R8, ( 0)(REG_P3)
MOVQ R9, ( 8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
// if z<0 add p503x2 back
// R9 holds P503X2_1 == P503X2_2, used for both limbs 1 and 2 below
MOVQ P503X2_0, R8
MOVQ P503X2_1, R9
MOVQ P503X2_3, R10
MOVQ P503X2_4, R11
MOVQ P503X2_5, R12
MOVQ P503X2_6, R13
MOVQ P503X2_7, R14
ANDQ CX, R8
ANDQ CX, R9
ANDQ CX, R10
ANDQ CX, R11
ANDQ CX, R12
ANDQ CX, R13
ANDQ CX, R14
MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
RET
// fp503SubReduced(z, x, y *fp503Element)
// z = x - y mod 2*p503 (operands kept in the lazy-reduced range [0, 2*p503)).
// Computes x-y and conditionally adds 2*p503 on borrow; constant-time.
TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Used later to calculate a mask
XORQ CX, CX
MOVQ ( 0)(REG_P1), R8
MOVQ ( 8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
SUBQ ( 0)(REG_P2), R8
SBBQ ( 8)(REG_P2), R9
SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15
// mask (CX = 0 or all-ones, from the borrow)
SBBQ $0, CX
// store x-y in REG_P3
MOVQ R8, ( 0)(REG_P3)
MOVQ R9, ( 8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
// if z<0 add p503x2 back
// R9 holds P503X2_1 == P503X2_2, used for both limbs 1 and 2 below
MOVQ P503X2_0, R8
MOVQ P503X2_1, R9
MOVQ P503X2_3, R10
MOVQ P503X2_4, R11
MOVQ P503X2_5, R12
MOVQ P503X2_6, R13
MOVQ P503X2_7, R14
ANDQ CX, R8
ANDQ CX, R9
ANDQ CX, R10
ANDQ CX, R11
ANDQ CX, R12
ANDQ CX, R13
ANDQ CX, R14
MOVQ ( 0)(REG_P3), AX; ADDQ R8, AX; MOVQ AX, ( 0)(REG_P3)
MOVQ ( 8)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, ( 8)(REG_P3)
MOVQ (16)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (16)(REG_P3)
MOVQ (24)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (24)(REG_P3)
MOVQ (32)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (32)(REG_P3)
MOVQ (40)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (40)(REG_P3)
MOVQ (48)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (48)(REG_P3)
MOVQ (56)(REG_P3), AX; ADCQ R14, AX; MOVQ AX, (56)(REG_P3)
RET
// fp503Mul(z *fp503X2, x, y *fp503Element)
// z (1024-bit) = x * y (512-bit x 512-bit). Dispatches at runtime between a
// generic MUL/comba implementation and MULX / MULX+ADX Karatsuba variants.
// Frame is $104 because the MUL macro uses stack slots SP[0..103].
TEXT ·fp503Mul(SB), NOSPLIT, $104-24
MOVQ z+ 0(FP), CX
MOVQ x+ 8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Check whether to use optimized implementation
CMPB ·HasADXandBMI2(SB), $1
JE mul_with_mulx_adcx_adox
CMPB ·HasBMI2(SB), $1
JE mul_with_mulx
// Generic x86 implementation (below) uses variant of Karatsuba method.
//
// Here we store the destination in CX instead of in REG_P3 because the
// multiplication instructions use DX as an implicit destination
// operand: MULQ $REG sets DX:AX <-- AX * $REG.
// RAX and RDX will be used for a mask (0-borrow)
XORQ AX, AX
// RCX[0-3]: U1+U0
MOVQ (32)(REG_P1), R8
MOVQ (40)(REG_P1), R9
MOVQ (48)(REG_P1), R10
MOVQ (56)(REG_P1), R11
ADDQ ( 0)(REG_P1), R8
ADCQ ( 8)(REG_P1), R9
ADCQ (16)(REG_P1), R10
ADCQ (24)(REG_P1), R11
MOVQ R8, ( 0)(CX)
MOVQ R9, ( 8)(CX)
MOVQ R10, (16)(CX)
MOVQ R11, (24)(CX)
SBBQ $0, AX
// R12-R15: V1+V0
XORQ DX, DX
MOVQ (32)(REG_P2), R12
MOVQ (40)(REG_P2), R13
MOVQ (48)(REG_P2), R14
MOVQ (56)(REG_P2), R15
ADDQ ( 0)(REG_P2), R12
ADCQ ( 8)(REG_P2), R13
ADCQ (16)(REG_P2), R14
ADCQ (24)(REG_P2), R15
SBBQ $0, DX
// Store carries on stack
MOVQ AX, (64)(SP)
MOVQ DX, (72)(SP)
// (SP[0-3],R8,R9,R10,R11) <- (U0+U1)*(V0+V1).
// MUL using comba; In comments below U=U0+U1 V=V0+V1
// U0*V0
MOVQ (CX), AX
MULQ R12
MOVQ AX, (SP) // C0
MOVQ DX, R8
// U0*V1
XORQ R9, R9
MOVQ (CX), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
// U1*V0
XORQ R10, R10
MOVQ (8)(CX), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (8)(SP) // C1
ADCQ DX, R9
ADCQ $0, R10
// U0*V2
XORQ R8, R8
MOVQ (CX), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U2*V0
MOVQ (16)(CX), AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U1*V1
MOVQ (8)(CX), AX
MULQ R13
ADDQ AX, R9
MOVQ R9, (16)(SP) // C2
ADCQ DX, R10
ADCQ $0, R8
// U0*V3
XORQ R9, R9
MOVQ (CX), AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U3*V0
MOVQ (24)(CX), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U1*V2
MOVQ (8)(CX), AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U2*V1
MOVQ (16)(CX), AX
MULQ R13
ADDQ AX, R10
MOVQ R10, (24)(SP) // C3
ADCQ DX, R8
ADCQ $0, R9
// U1*V3
XORQ R10, R10
MOVQ (8)(CX), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U3*V1
MOVQ (24)(CX), AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U2*V2
MOVQ (16)(CX), AX
MULQ R14
ADDQ AX, R8
MOVQ R8, (32)(SP) // C4
ADCQ DX, R9
ADCQ $0, R10
// U2*V3
XORQ R11, R11
MOVQ (16)(CX), AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R11
// U3*V2
MOVQ (24)(CX), AX
MULQ R14
ADDQ AX, R9 // C5
ADCQ DX, R10
ADCQ $0, R11
// U3*V3
MOVQ (24)(CX), AX
MULQ R15
ADDQ AX, R10 // C6
ADCQ DX, R11 // C7
// Karatsuba: mask the 256-bit halves by the saved sum carries and fold in
MOVQ (64)(SP), AX
ANDQ AX, R12
ANDQ AX, R13
ANDQ AX, R14
ANDQ AX, R15
ADDQ R8, R12
ADCQ R9, R13
ADCQ R10, R14
ADCQ R11, R15
MOVQ (72)(SP), AX
MOVQ (CX), R8
MOVQ (8)(CX), R9
MOVQ (16)(CX), R10
MOVQ (24)(CX), R11
ANDQ AX, R8
ANDQ AX, R9
ANDQ AX, R10
ANDQ AX, R11
ADDQ R12, R8
ADCQ R13, R9
ADCQ R14, R10
ADCQ R15, R11
MOVQ R8, (32)(SP)
MOVQ R9, (40)(SP)
MOVQ R10, (48)(SP)
MOVQ R11, (56)(SP)
// CX[0-7] <- U0*V0 (low halves, comba)
// U0*V0
MOVQ (REG_P1), R11
MOVQ (REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (CX) // C0
MOVQ DX, R8
// U0*V1
MOVQ (16)(REG_P1), R14
MOVQ (8)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
// U1*V0
MOVQ (8)(REG_P1), R12
MOVQ (REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (8)(CX) // C1
ADCQ DX, R9
ADCQ $0, R10
// U0*V2
XORQ R8, R8
MOVQ (16)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U2*V0
MOVQ (REG_P2), R13
MOVQ R14, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U1*V1
MOVQ (8)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (16)(CX) // C2
ADCQ DX, R10
ADCQ $0, R8
// U0*V3
XORQ R9, R9
MOVQ (24)(REG_P2), AX
MULQ R11
MOVQ (24)(REG_P1), R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U3*V0
MOVQ R15, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U1*V2
MOVQ (16)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
// U2*V1
MOVQ (8)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (24)(CX) // C3
ADCQ DX, R8
ADCQ $0, R9
// U1*V3
XORQ R10, R10
MOVQ (24)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U3*V1
MOVQ (8)(REG_P2), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
// U2*V2
MOVQ (16)(REG_P2), AX
MULQ R14
ADDQ AX, R8
MOVQ R8, (32)(CX) // C4
ADCQ DX, R9
ADCQ $0, R10
// U2*V3
XORQ R8, R8
MOVQ (24)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
// U3*V2
MOVQ (16)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (40)(CX) // C5
ADCQ DX, R10
ADCQ $0, R8
// U3*V3
MOVQ (24)(REG_P2), AX
MULQ R15
ADDQ AX, R10
MOVQ R10, (48)(CX) // C6
ADCQ DX, R8
MOVQ R8, (56)(CX) // C7
// CX[64..120] (limbs 8-15) <- U1*V1 (upper halves x[4..7]*y[4..7], comba)
MOVQ (32)(REG_P1), R11
MOVQ (32)(REG_P2), AX
MULQ R11
XORQ R9, R9
MOVQ AX, (64)(CX) // C0
MOVQ DX, R8
MOVQ (48)(REG_P1), R14
MOVQ (40)(REG_P2), AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
MOVQ (40)(REG_P1), R12
MOVQ (32)(REG_P2), AX
MULQ R12
ADDQ AX, R8
MOVQ R8, (72)(CX) // C1
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (48)(REG_P2), AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (32)(REG_P2), R13
MOVQ R14, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P2), AX
MULQ R12
ADDQ AX, R9
MOVQ R9, (80)(CX) // C2
ADCQ DX, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ (56)(REG_P2), AX
MULQ R11
MOVQ (56)(REG_P1), R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ R15, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (48)(REG_P2), AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (40)(REG_P2), AX
MULQ R14
ADDQ AX, R10
MOVQ R10, (88)(CX) // C3
ADCQ DX, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ (56)(REG_P2), AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (40)(REG_P2), AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (48)(REG_P2), AX
MULQ R14
ADDQ AX, R8
MOVQ R8, (96)(CX) // C4
ADCQ DX, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ (56)(REG_P2), AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (48)(REG_P2), AX
MULQ R15
ADDQ AX, R9
MOVQ R9, (104)(CX) // C5
ADCQ DX, R10
ADCQ $0, R8
MOVQ (56)(REG_P2), AX
MULQ R15
ADDQ AX, R10
MOVQ R10, (112)(CX) // C6
ADCQ DX, R8
MOVQ R8, (120)(CX) // C7
// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0
MOVQ (SP), R8
SUBQ (CX), R8
MOVQ (8)(SP), R9
SBBQ (8)(CX), R9
MOVQ (16)(SP), R10
SBBQ (16)(CX), R10
MOVQ (24)(SP), R11
SBBQ (24)(CX), R11
MOVQ (32)(SP), R12
SBBQ (32)(CX), R12
MOVQ (40)(SP), R13
SBBQ (40)(CX), R13
MOVQ (48)(SP), R14
SBBQ (48)(CX), R14
MOVQ (56)(SP), R15
SBBQ (56)(CX), R15
// [R8-R15] <- (U0+U1)*(V0+V1) - U0*V0 - U1*V1
MOVQ ( 64)(CX), AX; SUBQ AX, R8
MOVQ ( 72)(CX), AX; SBBQ AX, R9
MOVQ ( 80)(CX), AX; SBBQ AX, R10
MOVQ ( 88)(CX), AX; SBBQ AX, R11
MOVQ ( 96)(CX), AX; SBBQ AX, R12
MOVQ (104)(CX), DX; SBBQ DX, R13
MOVQ (112)(CX), DI; SBBQ DI, R14
MOVQ (120)(CX), SI; SBBQ SI, R15
// Final result: fold middle term into CX[32..88]; AX/DX/DI/SI still hold
// U1*V1 limbs 4-7 and absorb the final carry (DI/SI clobber is safe — the
// input pointers are no longer needed)
ADDQ (32)(CX), R8; MOVQ R8, (32)(CX)
ADCQ (40)(CX), R9; MOVQ R9, (40)(CX)
ADCQ (48)(CX), R10; MOVQ R10, (48)(CX)
ADCQ (56)(CX), R11; MOVQ R11, (56)(CX)
ADCQ (64)(CX), R12; MOVQ R12, (64)(CX)
ADCQ (72)(CX), R13; MOVQ R13, (72)(CX)
ADCQ (80)(CX), R14; MOVQ R14, (80)(CX)
ADCQ (88)(CX), R15; MOVQ R15, (88)(CX)
ADCQ $0, AX; MOVQ AX, (96)(CX)
ADCQ $0, DX; MOVQ DX, (104)(CX)
ADCQ $0, DI; MOVQ DI, (112)(CX)
ADCQ $0, SI; MOVQ SI, (120)(CX)
RET
mul_with_mulx_adcx_adox:
// Mul implementation for CPUs supporting two independent carry chain
// (ADOX/ADCX) instructions and carry-less MULX multiplier
MUL(CX, REG_P1, REG_P2, MULS256_MULX_ADCX_ADOX)
RET
mul_with_mulx:
// Mul implementation for CPUs supporting carry-less MULX multiplier.
MUL(CX, REG_P1, REG_P2, MULS256_MULX)
RET
// fp503MontgomeryReduce(z *fp503Element, x *fp503X2)
// Montgomery reduction of the 1024-bit value x into the 512-bit z, per
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf (see REDC macro).
// Dispatches at runtime between a generic comba implementation and the
// MULX / MULX+ADX REDC variants.
// NOSPLIT added for consistency with every other TEXT in this file: the
// function is a leaf with a zero-byte frame, so no stack-split check is needed.
TEXT ·fp503MontgomeryReduce(SB), NOSPLIT, $0-16
MOVQ z+0(FP), REG_P2
MOVQ x+8(FP), REG_P1
// Check whether to use optimized implementation
CMPB ·HasADXandBMI2(SB), $1
JE redc_with_mulx_adcx_adox
CMPB ·HasBMI2(SB), $1
JE redc_with_mulx
// Generic implementation: interleaved comba multiplication by p503+1
// (limbs 0-2 of p503+1 are zero, so products start at P503P1_3) with
// column-wise accumulation of x. Intermediate Z limbs are written to z and
// read back as multipliers for later columns.
MOVQ (REG_P1), R11
MOVQ P503P1_3, AX
MULQ R11
XORQ R8, R8
ADDQ (24)(REG_P1), AX
MOVQ AX, (24)(REG_P2)
ADCQ DX, R8
XORQ R9, R9
MOVQ P503P1_4, AX
MULQ R11
XORQ R10, R10
ADDQ AX, R8
ADCQ DX, R9
MOVQ (8)(REG_P1), R12
MOVQ P503P1_3, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (32)(REG_P1), R8
MOVQ R8, (32)(REG_P2) // Z4
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P503P1_5, AX
MULQ R11
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P503P1_4, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (16)(REG_P1), R13
MOVQ P503P1_3, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (40)(REG_P1), R9
MOVQ R9, (40)(REG_P2) // Z5
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P503P1_6, AX
MULQ R11
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P503P1_5, AX
MULQ R12
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P503P1_4, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (24)(REG_P2), R14 // R14 <- Z3 (computed above)
MOVQ P503P1_3, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (48)(REG_P1), R10
MOVQ R10, (48)(REG_P2) // Z6
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P503P1_7, AX
MULQ R11
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P503P1_6, AX
MULQ R12
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P503P1_5, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P503P1_4, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (32)(REG_P2), R15 // R15 <- Z4 (computed above)
MOVQ P503P1_3, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (56)(REG_P1), R8
MOVQ R8, (56)(REG_P2) // Z7
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P503P1_7, AX
MULQ R12
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P503P1_6, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P503P1_5, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P503P1_4, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ (40)(REG_P2), CX // CX <- Z5 (computed above)
MOVQ P503P1_3, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (64)(REG_P1), R9
MOVQ R9, (REG_P2) // Z0
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P503P1_7, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P503P1_6, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P503P1_5, AX
MULQ R15
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P503P1_4, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ (48)(REG_P2), R13 // R13 <- Z6 (computed above)
MOVQ P503P1_3, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (72)(REG_P1), R10
MOVQ R10, (8)(REG_P2) // Z1
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P503P1_7, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P503P1_6, AX
MULQ R15
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P503P1_5, AX
MULQ CX
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P503P1_4, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ (56)(REG_P2), R14 // R14 <- Z7 (computed above)
MOVQ P503P1_3, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (80)(REG_P1), R8
MOVQ R8, (16)(REG_P2) // Z2
ADCQ $0, R9
ADCQ $0, R10
XORQ R8, R8
MOVQ P503P1_7, AX
MULQ R15
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P503P1_6, AX
MULQ CX
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P503P1_5, AX
MULQ R13
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
MOVQ P503P1_4, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADCQ $0, R8
ADDQ (88)(REG_P1), R9
MOVQ R9, (24)(REG_P2) // Z3
ADCQ $0, R10
ADCQ $0, R8
XORQ R9, R9
MOVQ P503P1_7, AX
MULQ CX
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P503P1_6, AX
MULQ R13
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
MOVQ P503P1_5, AX
MULQ R14
ADDQ AX, R10
ADCQ DX, R8
ADCQ $0, R9
ADDQ (96)(REG_P1), R10
MOVQ R10, (32)(REG_P2) // Z4
ADCQ $0, R8
ADCQ $0, R9
XORQ R10, R10
MOVQ P503P1_7, AX
MULQ R13
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
MOVQ P503P1_6, AX
MULQ R14
ADDQ AX, R8
ADCQ DX, R9
ADCQ $0, R10
ADDQ (104)(REG_P1), R8 // Z5
MOVQ R8, (40)(REG_P2) // Z5
ADCQ $0, R9
ADCQ $0, R10
MOVQ P503P1_7, AX
MULQ R14
ADDQ AX, R9
ADCQ DX, R10
ADDQ (112)(REG_P1), R9 // Z6
MOVQ R9, (48)(REG_P2) // Z6
ADCQ $0, R10
ADDQ (120)(REG_P1), R10 // Z7
MOVQ R10, (56)(REG_P2) // Z7
RET
redc_with_mulx_adcx_adox:
// Implementation of the Montgomery reduction for CPUs
// supporting two independent carry chain (ADOX/ADCX)
// instructions and carry-less MULX multiplier
REDC(REG_P2, REG_P1, MULS_128x320_MULX_ADCX_ADOX)
RET
redc_with_mulx:
// Implementation of the Montgomery reduction for CPUs
// supporting carry-less MULX multiplier.
REDC(REG_P2, REG_P1, MULS_128x320_MULX)
RET
// fp503AddLazy(z, x, y *fp503Element)
// z = x + y as a plain 512-bit addition, with no modular reduction
// (the final carry, if any, is dropped).
TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
RET
// fp503X2AddLazy(z, x, y *fp503X2)
// z = x + y as a plain 1024-bit (16-limb) addition, no reduction.
// The first 11 limbs are added with registers loaded up front; the carry
// chain survives the store/reload block in the middle because MOVQ does
// not affect flags.
TEXT ·fp503X2AddLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), AX
MOVQ (72)(REG_P1), BX
MOVQ (80)(REG_P1), CX
ADDQ (REG_P2), R8
ADCQ (8)(REG_P2), R9
ADCQ (16)(REG_P2), R10
ADCQ (24)(REG_P2), R11
ADCQ (32)(REG_P2), R12
ADCQ (40)(REG_P2), R13
ADCQ (48)(REG_P2), R14
ADCQ (56)(REG_P2), R15
ADCQ (64)(REG_P2), AX
ADCQ (72)(REG_P2), BX
ADCQ (80)(REG_P2), CX
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ AX, (64)(REG_P3)
MOVQ BX, (72)(REG_P3)
MOVQ CX, (80)(REG_P3)
// MOVQ preserves CF, so the ADCQ chain below continues the carry from above
MOVQ (88)(REG_P1), R8
MOVQ (96)(REG_P1), R9
MOVQ (104)(REG_P1), R10
MOVQ (112)(REG_P1), R11
MOVQ (120)(REG_P1), R12
ADCQ (88)(REG_P2), R8
ADCQ (96)(REG_P2), R9
ADCQ (104)(REG_P2), R10
ADCQ (112)(REG_P2), R11
ADCQ (120)(REG_P2), R12
MOVQ R8, (88)(REG_P3)
MOVQ R9, (96)(REG_P3)
MOVQ R10, (104)(REG_P3)
MOVQ R11, (112)(REG_P3)
MOVQ R12, (120)(REG_P3)
RET
// fp503X2SubLazy(z, x, y *fp503X2)
// z = x - y over 1024 bits; if the subtraction borrows, adds p503*2^512
// (p503 aligned to the upper half) to make the result non-negative.
// Constant-time (mask, no branch). The borrow chain survives the
// store/reload block in the middle because MOVQ does not affect flags.
TEXT ·fp503X2SubLazy(SB), NOSPLIT, $0-24
MOVQ z+0(FP), REG_P3
MOVQ x+8(FP), REG_P1
MOVQ y+16(FP), REG_P2
// Used later to store result of 0-borrow
XORQ CX, CX
// SUBC for first 11 limbs
MOVQ (REG_P1), R8
MOVQ (8)(REG_P1), R9
MOVQ (16)(REG_P1), R10
MOVQ (24)(REG_P1), R11
MOVQ (32)(REG_P1), R12
MOVQ (40)(REG_P1), R13
MOVQ (48)(REG_P1), R14
MOVQ (56)(REG_P1), R15
MOVQ (64)(REG_P1), AX
MOVQ (72)(REG_P1), BX
SUBQ (REG_P2), R8
SBBQ (8)(REG_P2), R9
SBBQ (16)(REG_P2), R10
SBBQ (24)(REG_P2), R11
SBBQ (32)(REG_P2), R12
SBBQ (40)(REG_P2), R13
SBBQ (48)(REG_P2), R14
SBBQ (56)(REG_P2), R15
SBBQ (64)(REG_P2), AX
SBBQ (72)(REG_P2), BX
MOVQ R8, (REG_P3)
MOVQ R9, (8)(REG_P3)
MOVQ R10, (16)(REG_P3)
MOVQ R11, (24)(REG_P3)
MOVQ R12, (32)(REG_P3)
MOVQ R13, (40)(REG_P3)
MOVQ R14, (48)(REG_P3)
MOVQ R15, (56)(REG_P3)
MOVQ AX, (64)(REG_P3)
MOVQ BX, (72)(REG_P3)
// SUBC for last 5 limbs (6 loads/stores; borrow continues via SBBQ)
MOVQ (80)(REG_P1), R8
MOVQ (88)(REG_P1), R9
MOVQ (96)(REG_P1), R10
MOVQ (104)(REG_P1), R11
MOVQ (112)(REG_P1), R12
MOVQ (120)(REG_P1), R13
SBBQ (80)(REG_P2), R8
SBBQ (88)(REG_P2), R9
SBBQ (96)(REG_P2), R10
SBBQ (104)(REG_P2), R11
SBBQ (112)(REG_P2), R12
SBBQ (120)(REG_P2), R13
MOVQ R8, (80)(REG_P3)
MOVQ R9, (88)(REG_P3)
MOVQ R10, (96)(REG_P3)
MOVQ R11, (104)(REG_P3)
MOVQ R12, (112)(REG_P3)
MOVQ R13, (120)(REG_P3)
// Now the carry flag is 1 if x-y < 0. If so, add p*2^512.
SBBQ $0, CX
// Load p into registers:
MOVQ P503_0, R8
// P503_{1,2} = P503_0, so reuse R8
MOVQ P503_3, R9
MOVQ P503_4, R10
MOVQ P503_5, R11
MOVQ P503_6, R12
MOVQ P503_7, R13
ANDQ CX, R8
ANDQ CX, R9
ANDQ CX, R10
ANDQ CX, R11
ANDQ CX, R12
ANDQ CX, R13
MOVQ (64 )(REG_P3), AX; ADDQ R8, AX; MOVQ AX, (64 )(REG_P3)
MOVQ (64+ 8)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+ 8)(REG_P3)
MOVQ (64+16)(REG_P3), AX; ADCQ R8, AX; MOVQ AX, (64+16)(REG_P3)
MOVQ (64+24)(REG_P3), AX; ADCQ R9, AX; MOVQ AX, (64+24)(REG_P3)
MOVQ (64+32)(REG_P3), AX; ADCQ R10, AX; MOVQ AX, (64+32)(REG_P3)
MOVQ (64+40)(REG_P3), AX; ADCQ R11, AX; MOVQ AX, (64+40)(REG_P3)
MOVQ (64+48)(REG_P3), AX; ADCQ R12, AX; MOVQ AX, (64+48)(REG_P3)
MOVQ (64+56)(REG_P3), AX; ADCQ R13, AX; MOVQ AX, (64+56)(REG_P3)
RET