// +build amd64,!noasm

#include "textflag.h"

// p434
#define P434_0 $0xFFFFFFFFFFFFFFFF
#define P434_3 $0xFDC1767AE2FFFFFF
#define P434_4 $0x7BC65C783158AEA3
#define P434_5 $0x6CFC5FD681C52056
#define P434_6 $0x0002341F27177344

// p434 x 2
#define P434X2_0 $0xFFFFFFFFFFFFFFFE
#define P434X2_1 $0xFFFFFFFFFFFFFFFF
#define P434X2_3 $0xFB82ECF5C5FFFFFF
#define P434X2_4 $0xF78CB8F062B15D47
#define P434X2_5 $0xD9F8BFAD038A40AC
#define P434X2_6 $0x0004683E4E2EE688

// P434p1Zeros redefined for the assembler: the number of zero 64-bit limbs
// in p434+1.
#define P434_P1_ZEROS 3

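// For reference, a rough Go sketch (not part of this package) of how the limb
// constants above relate to the prime; it should reconstruct
// p434 = 2^216 * 3^137 - 1, and the P434X2_* constants encode 2*p434.
//
//   // import "math/big"
//   func p434FromLimbs() *big.Int {
//       limbs := []uint64{
//           0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF,
//           0xFDC1767AE2FFFFFF, 0x7BC65C783158AEA3, 0x6CFC5FD681C52056,
//           0x0002341F27177344,
//       }
//       p := new(big.Int)
//       for i := len(limbs) - 1; i >= 0; i-- {
//           p.Lsh(p, 64)                              // make room for the next limb
//           p.Or(p, new(big.Int).SetUint64(limbs[i])) // limbs are little-endian
//       }
//       return p
//   }
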
// Performs schoolbook multiplication of a 128-bit and a 256-bit number.
// Uses MULX, ADOX and ADCX instructions.
#define MULX128x256(IDX, M0, M1, T0, T1, T2, T3, T4, T5, T6) \
    XORQ AX, AX \
    MOVQ (IDX)(M0), DX \
    MULXQ M1+ 0(SB), T0, T1 \ // T0 <- C0
    MULXQ M1+ 8(SB), T4, T2 \
    MULXQ M1+16(SB), T5, T3 \
    ADOXQ T4, T1 \ // T1: interm1
    ADOXQ T5, T2 \ // T2: interm2
    \
    MULXQ M1+24(SB), T5, T4 \
    ADOXQ T5, T3 \ // T3: interm3
    ADOXQ AX, T4 \ // T4: interm4
    \
    XORQ AX, AX \
    MOVQ (IDX+8)(M0), DX \
    MULXQ M1+ 0(SB), T5, T6 \
    ADCXQ T5, T1 \ // T1 <- C1
    ADCXQ T6, T2 \
    \
    MULXQ M1+ 8(SB), T6, T5 \
    ADCXQ T5, T3 \
    ADOXQ T6, T2 \ // T2 <- C2
    \
    MULXQ M1+16(SB), T6, T5 \
    ADCXQ T5, T4 \
    ADOXQ T6, T3 \ // T3 <- C3
    \
    MULXQ M1+24(SB), T6, T5 \
    ADCXQ AX, T5 \
    ADOXQ T6, T4 \ // T4 <- C4
    ADOXQ AX, T5 // T5 <- C5

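// All of the MUL*/MULX* macros in this file follow the same schoolbook
// pattern: every 64-bit limb of one operand is multiplied by every limb of
// the other and the partial products are accumulated with carries; the MULX
// variants additionally interleave two carry chains (ADOX on OF, ADCX on CF).
// A rough Go sketch of the generic pattern (illustrative only, not part of
// this package):
//
//   // import "math/bits"
//   func schoolbookMul(a, b []uint64) []uint64 {
//       z := make([]uint64, len(a)+len(b))
//       for i := range a {
//           var carry uint64
//           for j := range b {
//               hi, lo := bits.Mul64(a[i], b[j])
//               lo, c := bits.Add64(lo, carry, 0)
//               hi += c // cannot wrap: a[i]*b[j] + carry < 2^128
//               z[i+j], c = bits.Add64(z[i+j], lo, 0)
//               carry = hi + c // cannot wrap: the column total fits in 128 bits
//           }
//           z[i+len(b)] = carry
//       }
//       return z
//   }
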
// Performs schoolbook multiplication of a 64-bit and a 256-bit number.
// Uses MULX and ADOX instructions.
//
// Uses registers: DX,AX
#define MULX64x256(IDX, M0, M1, T0, T1, T2, T3, T4, T5) \
    XORQ AX, AX \
    MOVQ (IDX)(M0), DX \
    MULXQ M1+ 0(SB), T0, T1 \ // T0 <- C0
    MULXQ M1+ 8(SB), T4, T2 \
    MULXQ M1+16(SB), T5, T3 \
    \
    ADOXQ T4, T1 \ // T1 <- C1
    ADOXQ T5, T2 \ // T2 <- C2
    \
    MULXQ M1+24(SB), T5, T4 \
    ADOXQ T5, T3 \ // T3 <- C3
    ADOXQ AX, T4 // T4 <- C4

// Performs schoolbook multiplication of two 192-bit numbers. The 384-bit
// result is stored at (ID)(MDST). Uses MULX, ADOX and ADCX instructions.
//
// Uses registers: DX,AX
#define MULX192(IM0,M0,IM1,M1,ID,MDST,T0,T1,T2,T3,T4,T5,T6) \
    MOVQ (0+IM0)(M0), DX \
    MULXQ (0+IM1)(M1), T1, T0 \ // T0:T1 = A0*B0
    MOVQ T1,(ID+0)(MDST) \ // MDST0
    MULXQ (IM1+ 8)(M1), T2, T1 \ // T1:T2 = A0*B1
    XORQ AX, AX \
    ADOXQ T2, T0 \
    MULXQ (IM1+16)(M1),T3, T2 \ // T2:T3 = A0*B2
    ADOXQ T3, T1 \
    \
    MOVQ (IM0+8)(M0), DX \
    MULXQ (IM1+0)(M1), T4, T3 \ // T3:T4 = A1*B0
    ADOXQ AX, T2 \
    XORQ AX, AX \
    \
    MULXQ (IM1+8)(M1), T6, T5 \ // T5:T6 = A1*B1
    ADOXQ T0, T4 \
    MOVQ T4,(ID+8)(MDST) \ // MDST1
    ADCXQ T6, T3 \
    \
    MULXQ (IM1+16)(M1),T0, T6 \ // T6:T0 = A1*B2
    ADOXQ T1, T3 \
    ADCXQ T0, T5 \
    ADCXQ AX, T6 \
    ADOXQ T2, T5 \
    \
    MOVQ (IM0+16)(M0),DX \
    MULXQ (IM1+ 0)(M1), T0, T1 \ // T1:T0 = A2*B0
    ADOXQ AX, T6 \
    XORQ AX, AX \
    \
    MULXQ (IM1+ 8)(M1), T2, T4 \ // T4:T2 = A2*B1
    ADOXQ T3, T0 \
    MOVQ T0, (ID+16)(MDST) \ // MDST2
    ADCXQ T5, T1 \
    \
    MULXQ (IM1+16)(M1),T3, T0 \ // T0:T3 = A2*B2
    ADCXQ T6, T4 \
    ADCXQ AX, T0 \
    ADOXQ T2, T1 \
    ADOXQ T4, T3 \
    ADOXQ AX, T0 \
    MOVQ T1, (ID+24)(MDST) \ // MDST3
    MOVQ T3, (ID+32)(MDST) \ // MDST4
    MOVQ T0, (ID+40)(MDST) // MDST5

// Performs schoolbook multiplication of two 256-bit numbers. Uses MULX,
// ADOX and ADCX instructions. The 512-bit result is stored at (ID)(MDST).
//
// Uses registers: DX,AX
#define MULX256(IM0,M0,IM1,M1,ID,MDST,T0,T1,T2,T3,T4,T5,T6,T7,T8,T9) \
    MOVQ (IM0+0)(M0), DX \
    MULXQ (IM1+0)(M1), T1, T0 \ // A0*B[0-3]
    MOVQ T1, (ID+0)(MDST) \
    MULXQ (IM1+8)(M1), T2, T1 \
    XORQ AX, AX \
    ADOXQ T2, T0 \
    MULXQ (IM1+16)(M1),T3, T2 \
    ADOXQ T3, T1 \
    MULXQ (IM1+24)(M1),T4, T3 \
    ADOXQ T4, T2 \
    \
    MOVQ (IM0+8)(M0), DX \
    MULXQ (IM1+0)(M1), T4, T5 \ // A1*B[0-3]
    ADOXQ AX, T3 \
    XORQ AX, AX \
    MULXQ (IM1+8)(M1), T7, T6 \
    ADOXQ T0, T4 \
    MOVQ T4, (ID+8)(MDST) \
    ADCXQ T7, T5 \
    MULXQ (IM1+16)(M1),T8, T7 \
    ADCXQ T8, T6 \
    ADOXQ T1, T5 \
    MULXQ (IM1+24)(M1),T9, T8 \
    ADCXQ T9, T7 \
    ADCXQ AX, T8 \
    ADOXQ T2, T6 \
    \
    MOVQ (IM0+16)(M0),DX \ // A2*B[0-3]
    MULXQ (IM1+ 0)(M1), T0, T1 \
    ADOXQ T3, T7 \
    ADOXQ AX, T8 \
    XORQ AX, AX \
    MULXQ (IM1+8)(M1), T3, T2 \
    ADOXQ T5, T0 \
    MOVQ T0, (ID+16)(MDST) \
    ADCXQ T3, T1 \
    MULXQ (IM1+16)(M1),T4, T3 \
    ADCXQ T4, T2 \
    ADOXQ T6, T1 \
    MULXQ (IM1+24)(M1),T9, T4 \
    ADCXQ T9, T3 \
    ADCXQ AX, T4 \
    \
    ADOXQ T7, T2 \
    ADOXQ T8, T3 \
    ADOXQ AX, T4 \
    \
    MOVQ (IM0+24)(M0),DX \
    MULXQ (IM1+ 0)(M1), T0, T5 \ // A3*B[0-3]
    XORQ AX, AX \
    MULXQ (IM1+ 8)(M1), T7, T6 \
    ADCXQ T7, T5 \
    ADOXQ T0, T1 \
    MULXQ (IM1+16)(M1), T8, T7 \
    ADCXQ T8, T6 \
    ADOXQ T5, T2 \
    MULXQ (IM1+24)(M1), T9, T8 \
    ADCXQ T9, T7 \
    ADCXQ AX, T8 \
    ADOXQ T6, T3 \
    ADOXQ T7, T4 \
    ADOXQ AX, T8 \
    MOVQ T1, (ID+24)(MDST) \
    MOVQ T2, (ID+32)(MDST) \
    MOVQ T3, (ID+40)(MDST) \
    MOVQ T4, (ID+48)(MDST) \
    MOVQ T8, (ID+56)(MDST)

// Performs schoolbook multiplication of a 64-bit and a 256-bit number.
//
// Uses registers: DX, AX
#define MUL64x256(IDX,M0,M1,C0,C1,C2,C3,C4,T0) \
    MOVQ (IDX)(M0), T0 \
    \
    XORQ C2, C2 \
    MOVQ M1+0(SB), AX \
    MULQ T0 \
    MOVQ AX, C0 \
    MOVQ DX, C1 \
    \
    XORQ C3, C3 \
    MOVQ M1+8(SB), AX \
    MULQ T0 \
    ADDQ AX, C1 \
    ADCQ DX, C2 \
    \
    XORQ C4, C4 \
    MOVQ M1+16(SB), AX \
    MULQ T0 \
    ADDQ AX, C2 \
    ADCQ DX, C3 \
    \
    MOVQ M1+24(SB), AX \
    MULQ T0 \
    ADDQ AX, C3 \
    ADCQ DX, C4

// Performs schoolbook multiplication of a 128-bit and a 256-bit number.
// Destroys RAX and RDX.
//
// Uses registers: DX, AX
#define MUL128x256(IDX,M0,M1,C0,C1,C2,C3,C4,C5,T0,T1) \
    \ // A0 x B0
    MOVQ (IDX+0)(M0), T0 \
    MOVQ M1+0(SB), AX \
    MULQ T0 \
    XORQ C2, C2 \
    MOVQ AX, C0 \
    MOVQ DX, C1 \
    \ // A0 x B1
    MOVQ M1+8(SB), AX \
    MULQ T0 \
    XORQ C3, C3 \
    ADDQ AX, C1 \
    ADCQ DX, C2 \
    \ // A1 x B0
    MOVQ (IDX+8)(M0), T1 \
    MOVQ M1+0(SB), AX \
    MULQ T1 \
    ADDQ AX, C1 \
    ADCQ DX, C2 \
    ADCQ $0, C3 \
    \ // A0 x B2
    XORQ C4, C4 \
    MOVQ M1+16(SB), AX \
    MULQ T0 \
    ADDQ AX, C2 \
    ADCQ DX, C3 \
    ADCQ $0, C4 \
    \ // A1 x B1
    MOVQ M1+8(SB), AX \
    MULQ T1 \
    ADDQ AX, C2 \
    ADCQ DX, C3 \
    ADCQ $0, C4 \
    \ // A0 x B3
    MOVQ M1+24(SB), AX \
    MULQ T0 \
    XORQ C5, C5 \
    ADDQ AX, C3 \
    ADCQ DX, C4 \
    ADCQ $0, C5 \
    \ // A1 x B2
    MOVQ M1+16(SB), AX \
    MULQ T1 \
    ADDQ AX, C3 \
    ADCQ DX, C4 \
    ADCQ $0, C5 \
    \ // A1 x B3
    MOVQ M1+24(SB), AX \
    MULQ T1 \
    ADDQ AX, C4 \
    ADCQ DX, C5

// Common tail of the 434-bit Montgomery reduction (see rdcP434). MUL01, MUL23,
// MUL45 and MUL67 are macros that multiply consecutive limbs of the input at
// (DI) by p434+1, skipping its three zero limbs; the partial products are
// accumulated in place and the reduced result is written to (SI).
#define REDC_COMMON(MUL01, MUL23, MUL45, MUL67) \
    MUL01 \
    XORQ CX, CX \
    ADDQ 0x18(DI), R8 \
    ADCQ 0x20(DI), R9 \
    ADCQ 0x28(DI), R10 \
    ADCQ 0x30(DI), R11 \
    ADCQ 0x38(DI), R12 \
    ADCQ 0x40(DI), R13 \
    ADCQ 0x48(DI), CX \
    MOVQ R8, 0x18(DI) \
    MOVQ R9, 0x20(DI) \
    MOVQ R10, 0x28(DI) \
    MOVQ R11, 0x30(DI) \
    MOVQ R12, 0x38(DI) \
    MOVQ R13, 0x40(DI) \
    MOVQ CX, 0x48(DI) \
    MOVQ 0x50(DI), R8 \
    MOVQ 0x58(DI), R9 \
    MOVQ 0x60(DI), R10 \
    MOVQ 0x68(DI), R11 \
    ADCQ $0, R8 \
    ADCQ $0, R9 \
    ADCQ $0, R10 \
    ADCQ $0, R11 \
    MOVQ R8, 0x50(DI) \
    MOVQ R9, 0x58(DI) \
    MOVQ R10, 0x60(DI) \
    MOVQ R11, 0x68(DI) \
    \
    MUL23 \
    XORQ CX, CX \
    ADDQ 0x28(DI), R8 \
    ADCQ 0x30(DI), R9 \
    ADCQ 0x38(DI), R10 \
    ADCQ 0x40(DI), R11 \
    ADCQ 0x48(DI), R12 \
    ADCQ 0x50(DI), R13 \
    ADCQ 0x58(DI), CX \
    MOVQ R8, 0x28(DI) \
    MOVQ R9, 0x30(DI) \
    MOVQ R10, 0x38(DI) \
    MOVQ R11, 0x40(DI) \
    MOVQ R12, 0x48(DI) \
    MOVQ R13, 0x50(DI) \
    MOVQ CX, 0x58(DI) \
    MOVQ 0x60(DI), R8 \
    MOVQ 0x68(DI), R9 \
    ADCQ $0, R8 \
    ADCQ $0, R9 \
    MOVQ R8, 0x60(DI) \
    MOVQ R9, 0x68(DI) \
    \
    MUL45 \
    XORQ CX, CX \
    ADDQ 0x38(DI), R8 \
    ADCQ 0x40(DI), R9 \
    ADCQ 0x48(DI), R10 \
    ADCQ 0x50(DI), R11 \
    ADCQ 0x58(DI), R12 \
    ADCQ 0x60(DI), R13 \
    ADCQ 0x68(DI), CX \
    MOVQ R8, 0x0(SI) \ // OUT0
    MOVQ R9, 0x8(SI) \ // OUT1
    MOVQ R10, 0x48(DI) \
    MOVQ R11, 0x50(DI) \
    MOVQ R12, 0x58(DI) \
    MOVQ R13, 0x60(DI) \
    MOVQ CX, 0x68(DI) \
    \
    MUL67 \
    ADDQ 0x48(DI), R8 \
    ADCQ 0x50(DI), R9 \
    ADCQ 0x58(DI), R10 \
    ADCQ 0x60(DI), R11 \
    ADCQ 0x68(DI), R12 \
    MOVQ R8, 0x10(SI) \ // OUT2
    MOVQ R9, 0x18(SI) \ // OUT3
    MOVQ R10, 0x20(SI) \ // OUT4
    MOVQ R11, 0x28(SI) \ // OUT5
    MOVQ R12, 0x30(SI) // OUT6

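// Functionally, the reduction built on REDC_COMMON (see rdcP434 below) maps a
// double-width value back through the Montgomery factor R = 2^448 (7 limbs of
// 64 bits). A rough math/big sketch of that contract (illustrative assumption,
// not part of this package; the assembly may keep results in [0, 2*p434)
// rather than fully reduced):
//
//   // import "math/big"
//   func montgomeryReduce(x, p *big.Int) *big.Int {
//       r := new(big.Int).Lsh(big.NewInt(1), 448) // R = 2^448
//       rInv := new(big.Int).ModInverse(r, p)     // R^-1 mod p
//       z := new(big.Int).Mul(x, rInv)
//       return z.Mod(z, p)
//   }
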
TEXT ·cswapP434(SB),NOSPLIT,$0-17

    MOVQ x+0(FP), DI
    MOVQ y+8(FP), SI
    MOVB choice+16(FP), AL // AL = 0 or 1
    MOVBLZX AL, AX // AX = 0 or 1
    NEGQ AX // AX = 0x00..00 or 0xff..ff
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
    MOVQ (idx*8)(DI), BX \ // BX = x[idx]
    MOVQ (idx*8)(SI), CX \ // CX = y[idx]
    MOVQ CX, DX \ // DX = y[idx]
    XORQ BX, DX \ // DX = y[idx] ^ x[idx]
    ANDQ AX, DX \ // DX = (y[idx] ^ x[idx]) & mask
    XORQ DX, BX \ // BX = ((y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx]
    XORQ DX, CX \ // CX = ((y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx]
    MOVQ BX, (idx*8)(DI) \
    MOVQ CX, (idx*8)(SI)
#endif
    CSWAP_BLOCK(0)
    CSWAP_BLOCK(1)
    CSWAP_BLOCK(2)
    CSWAP_BLOCK(3)
    CSWAP_BLOCK(4)
    CSWAP_BLOCK(5)
    CSWAP_BLOCK(6)
#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
    RET

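// What cswapP434 above computes, as a rough Go sketch (illustrative only, not
// part of this package; the helper name is made up). The branch-free mask
// keeps the swap constant-time with respect to choice:
//
//   func cswap434(x, y *[7]uint64, choice uint8) {
//       mask := -uint64(choice & 1) // 0x00..00 if choice == 0, 0xff..ff if choice == 1
//       for i := 0; i < 7; i++ {
//           t := mask & (x[i] ^ y[i])
//           x[i] ^= t // x[i] keeps x[i], or takes y[i]
//           y[i] ^= t // y[i] keeps y[i], or takes x[i]
//       }
//   }
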
TEXT ·addP434(SB),NOSPLIT,$0-24
    MOVQ z+0(FP), DX
    MOVQ x+8(FP), DI
    MOVQ y+16(FP), SI

    // Used later to calculate a mask
    XORQ CX, CX

    // [R8-R14]: z = x + y
    MOVQ ( 0)(DI), R8;  ADDQ ( 0)(SI), R8
    MOVQ ( 8)(DI), R9;  ADCQ ( 8)(SI), R9
    MOVQ (16)(DI), R10; ADCQ (16)(SI), R10
    MOVQ (24)(DI), R11; ADCQ (24)(SI), R11
    MOVQ (32)(DI), R12; ADCQ (32)(SI), R12
    MOVQ (40)(DI), R13; ADCQ (40)(SI), R13
    MOVQ (48)(DI), R14; ADCQ (48)(SI), R14

    XORQ DI, DI

    MOVQ P434X2_0, AX; SUBQ AX, R8
    MOVQ P434X2_1, AX; SBBQ AX, R9
    SBBQ AX, R10
    MOVQ P434X2_3, AX; SBBQ AX, R11
    MOVQ P434X2_4, AX; SBBQ AX, R12
    MOVQ P434X2_5, AX; SBBQ AX, R13
    MOVQ P434X2_6, AX; SBBQ AX, R14

    // mask
    SBBQ $0, CX

    // if z<0 add P434x2 back
    MOVQ P434X2_0, R15; ANDQ CX, R15
    MOVQ P434X2_1, AX;  ANDQ CX, AX

    ADDQ R8, R15; MOVQ R15, ( 0)(DX)
    ADCQ AX, R9;  MOVQ R9,  ( 8)(DX)
    ADCQ AX, R10; MOVQ R10, (16)(DX)

    ADCQ $0, DI
    MOVQ P434X2_3, R15; ANDQ CX, R15
    MOVQ P434X2_4, R8;  ANDQ CX, R8
    MOVQ P434X2_5, R9;  ANDQ CX, R9
    MOVQ P434X2_6, R10; ANDQ CX, R10
    BTQ $0, DI

    ADCQ R11, R15; MOVQ R15, (24)(DX)
    ADCQ R12, R8;  MOVQ R8,  (32)(DX)
    ADCQ R13, R9;  MOVQ R9,  (40)(DX)
    ADCQ R14, R10; MOVQ R10, (48)(DX)

    RET

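// The pattern used by addP434 above (and mirrored by subP434 and modP434
// below): plain limb arithmetic, a trial subtraction of 2*p434, and a masked
// add-back when that subtraction borrows. A rough Go sketch (illustrative
// only, not part of this package; p434x2 stands for the P434X2_* limbs):
//
//   // import "math/bits"
//   func addP434Sketch(z, x, y, p434x2 *[7]uint64) {
//       var carry, borrow uint64
//       for i := 0; i < 7; i++ { // z = x + y (both inputs < 2p, so no top carry)
//           z[i], carry = bits.Add64(x[i], y[i], carry)
//       }
//       for i := 0; i < 7; i++ { // z -= 2p
//           z[i], borrow = bits.Sub64(z[i], p434x2[i], borrow)
//       }
//       mask := -borrow // all-ones iff the subtraction went negative
//       carry = 0
//       for i := 0; i < 7; i++ { // z += 2p & mask
//           z[i], carry = bits.Add64(z[i], p434x2[i]&mask, carry)
//       }
//   }
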
TEXT ·adlP434(SB),NOSPLIT,$0-24
    MOVQ z+0(FP), DX
    MOVQ x+8(FP), DI
    MOVQ y+16(FP), SI

    MOVQ ( 0)(DI), R8
    ADDQ ( 0)(SI), R8
    MOVQ ( 8)(DI), R9
    ADCQ ( 8)(SI), R9
    MOVQ (16)(DI), R10
    ADCQ (16)(SI), R10
    MOVQ (24)(DI), R11
    ADCQ (24)(SI), R11
    MOVQ (32)(DI), R12
    ADCQ (32)(SI), R12
    MOVQ (40)(DI), R13
    ADCQ (40)(SI), R13
    MOVQ (48)(DI), R14
    ADCQ (48)(SI), R14
    MOVQ (56)(DI), R15
    ADCQ (56)(SI), R15
    MOVQ (64)(DI), AX
    ADCQ (64)(SI), AX
    MOVQ (72)(DI), BX
    ADCQ (72)(SI), BX
    MOVQ (80)(DI), CX
    ADCQ (80)(SI), CX

    MOVQ R8, ( 0)(DX)
    MOVQ R9, ( 8)(DX)
    MOVQ R10,(16)(DX)
    MOVQ R11,(24)(DX)
    MOVQ R12,(32)(DX)
    MOVQ R13,(40)(DX)
    MOVQ R14,(48)(DX)
    MOVQ R15,(56)(DX)
    MOVQ AX, (64)(DX)
    MOVQ BX, (72)(DX)
    MOVQ CX, (80)(DX)

    MOVQ (88)(DI), R8
    ADCQ (88)(SI), R8
    MOVQ (96)(DI), R9
    ADCQ (96)(SI), R9
    MOVQ (104)(DI), R10
    ADCQ (104)(SI), R10

    MOVQ R8, (88)(DX)
    MOVQ R9, (96)(DX)
    MOVQ R10,(104)(DX)
    RET

TEXT ·subP434(SB),NOSPLIT,$0-24
    MOVQ z+0(FP), DX
    MOVQ x+8(FP), DI
    MOVQ y+16(FP), SI

    // Used later to calculate a mask
    XORQ CX, CX

    MOVQ ( 0)(DI), R8;  SUBQ ( 0)(SI), R8
    MOVQ ( 8)(DI), R9;  SBBQ ( 8)(SI), R9
    MOVQ (16)(DI), R10; SBBQ (16)(SI), R10
    MOVQ (24)(DI), R11; SBBQ (24)(SI), R11
    MOVQ (32)(DI), R12; SBBQ (32)(SI), R12
    MOVQ (40)(DI), R13; SBBQ (40)(SI), R13
    MOVQ (48)(DI), R14; SBBQ (48)(SI), R14

    // mask
    SBBQ $0, CX
    XORQ R15, R15

    // if z<0 add p434x2 back
    MOVQ P434X2_0, DI; ANDQ CX, DI
    MOVQ P434X2_1, SI; ANDQ CX, SI
    MOVQ P434X2_3, AX; ANDQ CX, AX

    ADDQ DI, R8;  MOVQ R8,  ( 0)(DX)
    ADCQ SI, R9;  MOVQ R9,  ( 8)(DX)
    ADCQ SI, R10; MOVQ R10, (16)(DX)
    ADCQ AX, R11; MOVQ R11, (24)(DX)
    ADCQ $0, R15

    MOVQ P434X2_4, R8;  ANDQ CX, R8
    MOVQ P434X2_5, R9;  ANDQ CX, R9
    MOVQ P434X2_6, R10; ANDQ CX, R10

    BTQ $0, R15

    ADCQ R8, R12;  MOVQ R12, (32)(DX)
    ADCQ R9, R13;  MOVQ R13, (40)(DX)
    ADCQ R10, R14; MOVQ R14, (48)(DX)
    RET

TEXT ·sulP434(SB),NOSPLIT,$0-24
    MOVQ z+0(FP), DX
    MOVQ x+8(FP), DI
    MOVQ y+16(FP), SI

    // Used later to compute the borrow mask
    XORQ CX, CX

    // SUBC for first 10 limbs
    MOVQ ( 0)(DI), R8;  SUBQ ( 0)(SI), R8
    MOVQ ( 8)(DI), R9;  SBBQ ( 8)(SI), R9
    MOVQ (16)(DI), R10; SBBQ (16)(SI), R10
    MOVQ (24)(DI), R11; SBBQ (24)(SI), R11
    MOVQ (32)(DI), R12; SBBQ (32)(SI), R12
    MOVQ (40)(DI), R13; SBBQ (40)(SI), R13
    MOVQ (48)(DI), R14; SBBQ (48)(SI), R14
    MOVQ (56)(DI), R15; SBBQ (56)(SI), R15
    MOVQ (64)(DI), AX;  SBBQ (64)(SI), AX
    MOVQ (72)(DI), BX;  SBBQ (72)(SI), BX

    MOVQ R8,  ( 0)(DX)
    MOVQ R9,  ( 8)(DX)
    MOVQ R10, (16)(DX)
    MOVQ R11, (24)(DX)
    MOVQ R12, (32)(DX)
    MOVQ R13, (40)(DX)
    MOVQ R14, (48)(DX)
    MOVQ R15, (56)(DX)
    MOVQ AX,  (64)(DX)
    MOVQ BX,  (72)(DX)

    // SUBC for last 4 limbs
    MOVQ ( 80)(DI), R8;  SBBQ ( 80)(SI), R8
    MOVQ ( 88)(DI), R9;  SBBQ ( 88)(SI), R9
    MOVQ ( 96)(DI), R10; SBBQ ( 96)(SI), R10
    MOVQ (104)(DI), R11; SBBQ (104)(SI), R11

    // Store the borrow
    SBBQ $0, CX

    MOVQ R8,  ( 80)(DX)
    MOVQ R9,  ( 88)(DX)
    MOVQ R10, ( 96)(DX)
    MOVQ R11, (104)(DX)

    // Load p, masked with the borrow, into registers:
    MOVQ P434_0, R8; ANDQ CX, R8
    // P434_{1,2} = P434_0, so reuse R8
    MOVQ P434_3, R9;  ANDQ CX, R9
    MOVQ P434_4, R10; ANDQ CX, R10
    MOVQ P434_5, R11; ANDQ CX, R11
    MOVQ P434_6, R12; ANDQ CX, R12

    MOVQ (56)(DX), AX;    ADDQ R8, AX;  MOVQ AX, (56)(DX)
    MOVQ (56+ 8)(DX), AX; ADCQ R8, AX;  MOVQ AX, (56+ 8)(DX)
    MOVQ (56+16)(DX), AX; ADCQ R8, AX;  MOVQ AX, (56+16)(DX)
    MOVQ (56+24)(DX), AX; ADCQ R9, AX;  MOVQ AX, (56+24)(DX)
    MOVQ (56+32)(DX), AX; ADCQ R10, AX; MOVQ AX, (56+32)(DX)
    MOVQ (56+40)(DX), AX; ADCQ R11, AX; MOVQ AX, (56+40)(DX)
    MOVQ (56+48)(DX), AX; ADCQ R12, AX; MOVQ AX, (56+48)(DX)

    RET

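// sulP434 above subtracts two double-width (14-limb) values and, when the
// subtraction borrows, adds p434*2^448 back into the upper half. A rough
// math/big sketch of that contract (illustrative only, not part of this
// package; the assembly uses a borrow mask instead of a branch):
//
//   // import "math/big"
//   func sulP434Sketch(x, y, p434 *big.Int) *big.Int {
//       z := new(big.Int).Sub(x, y)
//       if z.Sign() < 0 {
//           z.Add(z, new(big.Int).Lsh(p434, 448)) // add p434 * 2^448
//       }
//       return z
//   }
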
TEXT ·modP434(SB),NOSPLIT,$0-8
    MOVQ x+0(FP), DI

    // Zero AX for later use:
    XORQ AX, AX

    // Set x <- x - p
    MOVQ P434_0, R8
    SUBQ R8, ( 0)(DI)
    // P434_{1,2} = P434_0, so reuse R8
    MOVQ P434_3, R9
    SBBQ R8, ( 8)(DI)
    SBBQ R8, (16)(DI)
    MOVQ P434_4, R10
    SBBQ R9, (24)(DI)
    MOVQ P434_5, R11
    SBBQ R10, (32)(DI)
    MOVQ P434_6, R12
    SBBQ R11, (40)(DI)
    SBBQ R12, (48)(DI)

    // save borrow
    SBBQ $0, AX

    // Conditionally add p to x if x-p < 0
    ANDQ AX, R8
    ANDQ AX, R9
    ANDQ AX, R10
    ANDQ AX, R11
    ANDQ AX, R12

    ADDQ R8, ( 0)(DI)
    ADCQ R8, ( 8)(DI)
    ADCQ R8, (16)(DI)
    ADCQ R9, (24)(DI)
    ADCQ R10,(32)(DI)
    ADCQ R11,(40)(DI)
    ADCQ R12,(48)(DI)
    RET

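// The decomposition used by mulP434 below, as a rough math/big sketch
// (illustrative only, not part of this package). The 448-bit operands are
// split at 2^256 (four limbs low, three limbs high) and the middle term is
// derived from the two outer products:
//
//   // import "math/big"
//   func karatsubaSketch(a, b *big.Int) *big.Int {
//       mask := new(big.Int).Sub(new(big.Int).Lsh(big.NewInt(1), 256), big.NewInt(1))
//       al := new(big.Int).And(a, mask) // low 256 bits
//       ah := new(big.Int).Rsh(a, 256)  // high part
//       bl := new(big.Int).And(b, mask)
//       bh := new(big.Int).Rsh(b, 256)
//
//       ll := new(big.Int).Mul(al, bl) // AL*BL
//       hh := new(big.Int).Mul(ah, bh) // AH*BH
//       mm := new(big.Int).Mul(new(big.Int).Add(al, ah), new(big.Int).Add(bl, bh))
//       mm.Sub(mm, ll).Sub(mm, hh) // (AL+AH)*(BL+BH) - AL*BL - AH*BH
//
//       z := new(big.Int).Lsh(hh, 512)
//       z.Add(z, new(big.Int).Lsh(mm, 256))
//       return z.Add(z, ll) // the full 896-bit product
//   }
//
// The assembly keeps AL+AH and BL+BH in four limbs each and folds the carry
// bits back in through the stored masks instead of using a fifth limb.
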
// 434-bit multiplication using one level of Karatsuba with schoolbook
// multiplication underneath.
TEXT ·mulP434(SB),NOSPLIT,$112-24
    MOVQ z+0(FP), CX
    MOVQ x+8(FP), DI
    MOVQ y+16(FP), SI

    // Check whether to use the optimized implementation
    CMPB ·HasADXandBMI2(SB), $1
    JE mul_with_mulx_adcx_adox

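// ·HasADXandBMI2 is a Go-side CPU feature flag. One plausible way to set it
// (an assumption, not necessarily what this package does) is via
// golang.org/x/sys/cpu:
//
//   // var HasADXandBMI2 = cpu.X86.HasADX && cpu.X86.HasBMI2
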
    // rcx[0-3] <- AH+AL
    XORQ AX, AX
    MOVQ 0x20(DI), R8
    MOVQ 0x28(DI), R9
    MOVQ 0x30(DI), R10
    XORQ R11, R11
    ADDQ 0x0(DI), R8
    ADCQ 0x8(DI), R9
    ADCQ 0x10(DI), R10
    ADCQ 0x18(DI), R11
    // store AH+AL mask
    SBBQ $0, AX
    MOVQ AX, 0x40(SP)
    // store AH+AL in 0-0x18(rcx)
    MOVQ R8, 0x0(CX)
    MOVQ R9, 0x8(CX)
    MOVQ R10, 0x10(CX)
    MOVQ R11, 0x18(CX)

    // r12-r15 <- BH+BL
    XORQ DX, DX
    MOVQ 0x20(SI), R12
    MOVQ 0x28(SI), R13
    MOVQ 0x30(SI), R14
    XORQ R15, R15
    ADDQ 0x0(SI), R12
    ADCQ 0x8(SI), R13
    ADCQ 0x10(SI), R14
    ADCQ 0x18(SI), R15
    SBBQ $0, DX

    // store BH+BL mask
    MOVQ DX, 0x48(SP)

    // (rsp[0-0x38]) <- (AH+AL)*(BH+BL)
    MOVQ (CX), AX
    MULQ R12
    MOVQ AX, (SP)
    MOVQ DX, R8

    XORQ R9, R9
    MOVQ (CX), AX
    MULQ R13
    ADDQ AX, R8
    ADCQ DX, R9

    XORQ R10, R10
    MOVQ 0x8(CX), AX
    MULQ R12
    ADDQ AX, R8
    MOVQ R8, 0x8(SP)
    ADCQ DX, R9
    ADCQ $0, R10

    XORQ R8, R8
    MOVQ (CX), AX
    MULQ R14
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x10(CX), AX
    MULQ R12
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x8(CX), AX
    MULQ R13
    ADDQ AX, R9
    MOVQ R9, 0x10(SP)
    ADCQ DX, R10
    ADCQ $0, R8

    XORQ R9, R9
    MOVQ (CX), AX
    MULQ R15
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9

    MOVQ 0x18(CX), AX
    MULQ R12
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9

    MOVQ 0x8(CX), AX
    MULQ R14
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9

    MOVQ 0x10(CX), AX
    MULQ R13
    ADDQ AX, R10
    MOVQ R10, 0x18(SP)
    ADCQ DX, R8
    ADCQ $0, R9

    XORQ R10, R10
    MOVQ 0x8(CX), AX
    MULQ R15
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10

    MOVQ 0x18(CX), AX
    MULQ R13
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10

    MOVQ 0x10(CX), AX
    MULQ R14
    ADDQ AX, R8
    MOVQ R8, 0x20(SP)
    ADCQ DX, R9
    ADCQ $0, R10

    XORQ R11, R11
    MOVQ 0x10(CX), AX
    MULQ R15
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R11

    MOVQ 0x18(CX), AX
    MULQ R14
    ADDQ AX, R9
    MOVQ R9, 0x28(SP)
    ADCQ DX, R10
    ADCQ $0, R11

    MOVQ 0x18(CX), AX
    MULQ R15
    ADDQ AX, R10
    MOVQ R10, 0x30(SP)
    ADCQ DX, R11
    MOVQ R11, 0x38(SP)

    // r12-r15 <- masked (BH + BL)
    MOVQ 0x40(SP), AX
    ANDQ AX, R12
    ANDQ AX, R13
    ANDQ AX, R14
    ANDQ AX, R15

    // r8-r11 <- masked (AH + AL)
    MOVQ 0x48(SP), AX
    MOVQ 0x00(CX), R8
    ANDQ AX, R8
    MOVQ 0x08(CX), R9
    ANDQ AX, R9
    MOVQ 0x10(CX), R10
    ANDQ AX, R10
    MOVQ 0x18(CX), R11
    ANDQ AX, R11

    // r12-r15 <- masked (AH + AL) + masked (BH + BL)
    ADDQ R8, R12
    ADCQ R9, R13
    ADCQ R10, R14
    ADCQ R11, R15

    // rsp[0x20-0x38] <- (AH+AL) x (BH+BL) high
    MOVQ 0x20(SP), AX
    ADDQ AX, R12
    MOVQ 0x28(SP), AX
    ADCQ AX, R13
    MOVQ 0x30(SP), AX
    ADCQ AX, R14
    MOVQ 0x38(SP), AX
    ADCQ AX, R15
    MOVQ R12, 0x50(SP)
    MOVQ R13, 0x58(SP)
    MOVQ R14, 0x60(SP)
    MOVQ R15, 0x68(SP)

    // [rcx] <- CL = AL x BL
    MOVQ (DI), R11
    MOVQ (SI), AX
    MULQ R11
    XORQ R9, R9
    MOVQ AX, (CX)
    MOVQ DX, R8

    MOVQ 0x10(DI), R14
    MOVQ 0x8(SI), AX
    MULQ R11
    XORQ R10, R10
    ADDQ AX, R8
    ADCQ DX, R9

    MOVQ 0x8(DI), R12
    MOVQ (SI), AX
    MULQ R12
    ADDQ AX, R8
    MOVQ R8, 0x8(CX)
    ADCQ DX, R9
    ADCQ $0, R10

    XORQ R8, R8
    MOVQ 0x10(SI), AX
    MULQ R11
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ (SI), R13
    MOVQ R14, AX
    MULQ R13
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x8(SI), AX
    MULQ R12
    ADDQ AX, R9
    MOVQ R9, 0x10(CX)
    ADCQ DX, R10
    ADCQ $0, R8

    XORQ R9, R9
    MOVQ 0x18(SI), AX
    MULQ R11
    MOVQ 0x18(DI), R15
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9

    MOVQ R15, AX
    MULQ R13
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9

    MOVQ 0x10(SI), AX
    MULQ R12
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R9

    MOVQ 0x8(SI), AX
    MULQ R14
    ADDQ AX, R10
    MOVQ R10, 0x18(CX)
    ADCQ DX, R8
    ADCQ $0, R9

    XORQ R10, R10
    MOVQ 0x18(SI), AX
    MULQ R12
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10

    MOVQ 0x8(SI), AX
    MULQ R15
    ADDQ AX, R8
    ADCQ DX, R9
    ADCQ $0, R10

    MOVQ 0x10(SI), AX
    MULQ R14
    ADDQ AX, R8
    MOVQ R8, 0x20(CX)
    ADCQ DX, R9
    ADCQ $0, R10

    XORQ R8, R8
    MOVQ 0x18(SI), AX
    MULQ R14
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x10(SI), AX
    MULQ R15
    ADDQ AX, R9
    MOVQ R9, 0x28(CX)
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x18(SI), AX
    MULQ R15
    ADDQ AX, R10
    MOVQ R10, 0x30(CX)
    ADCQ DX, R8
    MOVQ R8, 0x38(CX)

    // rcx[0x40-0x68] <- AH*BH
    // multiplies two 192-bit numbers A and B
    MOVQ 0x20(DI), R11
    MOVQ 0x20(SI), AX
    MULQ R11
    XORQ R9, R9
    MOVQ AX, 0x40(CX)
    MOVQ DX, R8

    MOVQ 0x30(DI), R14
    MOVQ 0x28(SI), AX
    MULQ R11
    XORQ R10, R10
    ADDQ AX, R8
    ADCQ DX, R9

    MOVQ 0x28(DI), R12
    MOVQ 0x20(SI), AX
    MULQ R12
    ADDQ AX, R8
    MOVQ R8, 0x48(CX)
    ADCQ DX, R9
    ADCQ $0, R10

    XORQ R8, R8
    MOVQ 0x30(SI), AX
    MULQ R11
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x20(SI), R13
    MOVQ R14, AX
    MULQ R13
    ADDQ AX, R9
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x28(SI), AX
    MULQ R12
    ADDQ AX, R9
    MOVQ R9, 0x50(CX)
    ADCQ DX, R10
    ADCQ $0, R8

    MOVQ 0x30(SI), AX
    MULQ R12
    XORQ R12, R12
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R12

    MOVQ 0x28(SI), AX
    MULQ R14
    ADDQ AX, R10
    ADCQ DX, R8
    ADCQ $0, R12
    MOVQ R10, 0x58(CX)

    MOVQ 0x30(SI), AX
    MULQ R14
    ADDQ AX, R8
    ADCQ $0, R12
    MOVQ R8, 0x60(CX)

    ADDQ R12, DX

    // [r8-r15] <- (AH+AL)x(BH+BL) - ALxBL
    MOVQ 0x0(SP), R8
    SUBQ 0x0(CX), R8
    MOVQ 0x8(SP), R9
    SBBQ 0x8(CX), R9
    MOVQ 0x10(SP), R10
    SBBQ 0x10(CX), R10
    MOVQ 0x18(SP), R11
    SBBQ 0x18(CX), R11
    MOVQ 0x50(SP), R12
    SBBQ 0x20(CX), R12
    MOVQ 0x58(SP), R13
    SBBQ 0x28(CX), R13
    MOVQ 0x60(SP), R14
    SBBQ 0x30(CX), R14
    MOVQ 0x68(SP), R15
    SBBQ 0x38(CX), R15

    // [r8-r15] <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    MOVQ 0x40(CX), AX
    SUBQ AX, R8
    MOVQ 0x48(CX), AX
    SBBQ AX, R9
    MOVQ 0x50(CX), AX
    SBBQ AX, R10
    MOVQ 0x58(CX), AX
    SBBQ AX, R11
    MOVQ 0x60(CX), AX
    SBBQ AX, R12
    SBBQ DX, R13
    SBBQ $0, R14
    SBBQ $0, R15

    // Final result
    ADDQ 0x20(CX), R8
    MOVQ R8, 0x20(CX) // OUT4
    ADCQ 0x28(CX), R9
    MOVQ R9, 0x28(CX) // OUT5
    ADCQ 0x30(CX), R10
    MOVQ R10, 0x30(CX) // OUT6
    ADCQ 0x38(CX), R11
    MOVQ R11, 0x38(CX) // OUT7
    ADCQ 0x40(CX), R12
    MOVQ R12, 0x40(CX) // OUT8
    ADCQ 0x48(CX), R13
    MOVQ R13, 0x48(CX) // OUT9
    ADCQ 0x50(CX), R14
    MOVQ R14, 0x50(CX) // OUT10
    ADCQ 0x58(CX), R15
    MOVQ R15, 0x58(CX) // OUT11
    MOVQ 0x60(CX), R12
    ADCQ $0, R12
    MOVQ R12, 0x60(CX) // OUT12
    ADCQ $0, DX
    MOVQ DX, 0x68(CX) // OUT13
    RET

mul_with_mulx_adcx_adox:
    // Multiplication for CPUs supporting two independent carry chains
    // (ADOX/ADCX) and the MULX multiplier, which does not affect flags.
    XORQ AX, AX
    MOVQ 0x0(DI), R8
    MOVQ 0x8(DI), R9
    MOVQ 0x10(DI), R10
    MOVQ 0x18(DI), R11

    MOVQ BP, 0x70(SP)

    ADDQ 0x20(DI), R8
    ADCQ 0x28(DI), R9
    ADCQ 0x30(DI), R10
    ADCQ $0, R11
    SBBQ $0, AX
    MOVQ R8, 0x0(SP)
    MOVQ R9, 0x8(SP)
    MOVQ R10, 0x10(SP)
    MOVQ R11, 0x18(SP)

    // r12-r15 <- BH + BL, rbx <- mask
    XORQ BX, BX
    MOVQ 0x0(SI), R12
    MOVQ 0x8(SI), R13
    MOVQ 0x10(SI), R14
    MOVQ 0x18(SI), R15
    ADDQ 0x20(SI), R12
    ADCQ 0x28(SI), R13
    ADCQ 0x30(SI), R14
    ADCQ $0, R15
    SBBQ $0, BX
    MOVQ R12, 0x20(SP)
    MOVQ R13, 0x28(SP)
    MOVQ R14, 0x30(SP)
    MOVQ R15, 0x38(SP)

    // r12-r15 <- masked (BH + BL)
    ANDQ AX, R12
    ANDQ AX, R13
    ANDQ AX, R14
    ANDQ AX, R15

    // r8-r11 <- masked (AH + AL)
    ANDQ BX, R8
    ANDQ BX, R9
    ANDQ BX, R10
    ANDQ BX, R11

    // r8-r11 <- masked (AH + AL) + masked (BH + BL)
    ADDQ R12, R8
    ADCQ R13, R9
    ADCQ R14, R10
    ADCQ R15, R11
    MOVQ R8, 0x40(SP)
    MOVQ R9, 0x48(SP)
    MOVQ R10, 0x50(SP)
    MOVQ R11, 0x58(SP)

    // [rsp] <- CM = (AH+AL) x (BH+BL)
    MULX256(0,SP,32,SP,0,SP,R8,R9,R10,R11,R12,R13,R14,R15,BX,BP)
    // [rcx] <- CL = AL x BL (Result c0-c3)
    MULX256(0,DI,0,SI,0,CX,R8,R9,R10,R11,R12,R13,R14,R15,BX,BP)
    // [rcx+64] <- CH = AH x BH
    MULX192(32,DI,32,SI,64,CX,R8,R9,R10,R11,R12,R13,R14)

    // r8-r11 <- (AH+AL) x (BH+BL), final step
    MOVQ 0x40(SP), R8
    MOVQ 0x48(SP), R9
    MOVQ 0x50(SP), R10
    MOVQ 0x58(SP), R11

    MOVQ 0x20(SP), AX
    ADDQ AX, R8
    MOVQ 0x28(SP), AX
    ADCQ AX, R9
    MOVQ 0x30(SP), AX
    ADCQ AX, R10
    MOVQ 0x38(SP), AX
    ADCQ AX, R11

    // r12-r15, r8-r11 <- (AH+AL) x (BH+BL) - ALxBL
    MOVQ 0x0(SP), R12
    MOVQ 0x8(SP), R13
    MOVQ 0x10(SP), R14
    MOVQ 0x18(SP), R15
    SUBQ 0x0(CX), R12
    SBBQ 0x8(CX), R13
    SBBQ 0x10(CX), R14
    SBBQ 0x18(CX), R15
    SBBQ 0x20(CX), R8
    SBBQ 0x28(CX), R9
    SBBQ 0x30(CX), R10
    SBBQ 0x38(CX), R11

    // r12-r15, r8-r11 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH
    SUBQ 0x40(CX), R12
    SBBQ 0x48(CX), R13
    SBBQ 0x50(CX), R14
    SBBQ 0x58(CX), R15
    SBBQ 0x60(CX), R8
    SBBQ 0x68(CX), R9
    SBBQ $0, R10
    SBBQ $0, R11

    ADDQ 0x20(CX), R12
    MOVQ R12, 0x20(CX) // OUT4
    ADCQ 0x28(CX), R13
    MOVQ R13, 0x28(CX) // OUT5
    ADCQ 0x30(CX), R14
    MOVQ R14, 0x30(CX) // OUT6
    ADCQ 0x38(CX), R15
    MOVQ R15, 0x38(CX) // OUT7
    ADCQ 0x40(CX), R8
    MOVQ R8, 0x40(CX) // OUT8
    ADCQ 0x48(CX), R9
    MOVQ R9, 0x48(CX) // OUT9
    ADCQ 0x50(CX), R10
    MOVQ R10, 0x50(CX) // OUT10
    ADCQ 0x58(CX), R11
    MOVQ R11, 0x58(CX) // OUT11
    MOVQ 0x60(CX), R12
    ADCQ $0, R12
    MOVQ R12, 0x60(CX) // OUT12
    MOVQ 0x68(CX), R13
    ADCQ $0, R13
    MOVQ R13, 0x68(CX) // OUT13

    MOVQ 0x70(SP), BP
    RET

// 434-bit Montgomery reduction. Dispatches to the MULX/ADX implementation
// below when available, otherwise uses MUL-based partial products.
TEXT ·rdcP434(SB),$0-16
    MOVQ z+0(FP), SI
    MOVQ x+8(FP), DI
    CMPB ·HasADXandBMI2(SB), $1
    JE redc_bdw
#define MUL01 MUL128x256( 0,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
#define MUL23 MUL128x256(16,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
#define MUL45 MUL128x256(32,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,R14,CX)
#define MUL67 MUL64x256(48,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13)
    REDC_COMMON(MUL01, MUL23, MUL45, MUL67)
#undef MUL01
#undef MUL23
#undef MUL45
#undef MUL67
    RET

// 434-bit Montgomery reduction. Uses MULX/ADOX/ADCX instructions, available
// on the Broadwell micro-architecture and newer.
redc_bdw:
#define MULX01 MULX128x256( 0,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,CX)
#define MULX23 MULX128x256(16,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,CX)
#define MULX45 MULX128x256(32,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13,CX)
#define MULX67 MULX64x256(48,DI,·P434p1+(8*P434_P1_ZEROS),R8,R9,R10,R11,R12,R13)
    REDC_COMMON(MULX01, MULX23, MULX45, MULX67)
#undef MULX01
#undef MULX23
#undef MULX45
#undef MULX67
    RET