|
- // +build amd64,!noasm
-
- #include "textflag.h"
-
- // mul512 multiplies the 512-bit value at b by the 64-bit value c and
- // stores the low 512 bits of the product at a:
- //
- // a = (b * c) mod 2^512
- //
- // When ·hasBMI2 equals 1 the MULX based path is taken, otherwise the
- // MULQ based path runs. Both paths read word i of b before writing
- // word i of a.
- //
- // Registers used: AX, DX, SI, DI, R10, R11
- //
- // func mul512(a, b *Fp, c uint64)
- TEXT ·mul512(SB), NOSPLIT, $0-24
- MOVQ a+0(FP), DI // DI = destination
- MOVQ b+8(FP), SI // SI = 512-bit multiplicand
-
- // Dispatch on the BMI2 feature flag
- CMPB ·hasBMI2(SB), $1
- JE mul512_mulx
-
- // MULQ path: R10 keeps the 64-bit multiplier, R11 carries the high
- // half of the previous partial product into the next word.
- MOVQ c+16(FP), R10
-
- // a[0]
- MOVQ R10, AX
- MULQ 0(SI)
- MOVQ AX, 0(DI)
- MOVQ DX, R11
-
- // a[1]
- MOVQ R10, AX
- MULQ 8(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 8(DI)
- MOVQ DX, R11
-
- // a[2]
- MOVQ R10, AX
- MULQ 16(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 16(DI)
- MOVQ DX, R11
-
- // a[3]
- MOVQ R10, AX
- MULQ 24(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 24(DI)
- MOVQ DX, R11
-
- // a[4]
- MOVQ R10, AX
- MULQ 32(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 32(DI)
- MOVQ DX, R11
-
- // a[5]
- MOVQ R10, AX
- MULQ 40(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 40(DI)
- MOVQ DX, R11
-
- // a[6]
- MOVQ R10, AX
- MULQ 48(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 48(DI)
- MOVQ DX, R11
-
- // a[7]: the high half in DX is dropped, which is the "mod 2^512"
- MOVQ R10, AX
- MULQ 56(SI)
- ADDQ R11, AX
- MOVQ AX, 56(DI)
- RET
-
- // MULX path (BMI2). MULXQ and MOVQ leave the flags untouched, so the
- // carry produced by the ADDQ for a[1] flows through the ADCQ chain.
- // R10 and R11 alternate as the holder of the previous high half.
- mul512_mulx:
- MOVQ c+16(FP), DX // MULX takes its implicit factor from DX
-
- // a[0]
- MULXQ 0(SI), AX, R10
- MOVQ AX, 0(DI)
-
- // a[1]
- MULXQ 8(SI), AX, R11
- ADDQ R10, AX
- MOVQ AX, 8(DI)
-
- // a[2]
- MULXQ 16(SI), AX, R10
- ADCQ R11, AX
- MOVQ AX, 16(DI)
-
- // a[3]
- MULXQ 24(SI), AX, R11
- ADCQ R10, AX
- MOVQ AX, 24(DI)
-
- // a[4]
- MULXQ 32(SI), AX, R10
- ADCQ R11, AX
- MOVQ AX, 32(DI)
-
- // a[5]
- MULXQ 40(SI), AX, R11
- ADCQ R10, AX
- MOVQ AX, 40(DI)
-
- // a[6]
- MULXQ 48(SI), AX, R10
- ADCQ R11, AX
- MOVQ AX, 48(DI)
-
- // a[7]: the last high half (R11) and the final carry are dropped
- MULXQ 56(SI), AX, R11
- ADCQ R10, AX
- MOVQ AX, 56(DI)
- RET
-
- // mul576 multiplies the 512-bit value at b by the 64-bit value c and
- // stores the full 576-bit product (nine 64-bit words) at a:
- //
- // a = b * c
- //
- // Only the MULQ instruction is used; there is no BMI2 variant here.
- // NOTE(review): nine words (72 bytes) are written through a although
- // the prototype below names *Fp - confirm the destination is large enough.
- //
- // Registers used: AX, DX, SI, DI, R10, R11
- //
- // func mul576(a, b *Fp, c uint64)
- TEXT ·mul576(SB), NOSPLIT, $0-24
- MOVQ a+0(FP), DI // DI = destination
- MOVQ b+8(FP), SI // SI = 512-bit multiplicand
-
- // R10 keeps the 64-bit multiplier, R11 carries the high half of the
- // previous partial product into the next word.
- MOVQ c+16(FP), R10
-
- // a[0]
- MOVQ R10, AX
- MULQ 0(SI)
- MOVQ AX, 0(DI)
- MOVQ DX, R11
-
- // a[1]
- MOVQ R10, AX
- MULQ 8(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 8(DI)
- MOVQ DX, R11
-
- // a[2]
- MOVQ R10, AX
- MULQ 16(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 16(DI)
- MOVQ DX, R11
-
- // a[3]
- MOVQ R10, AX
- MULQ 24(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 24(DI)
- MOVQ DX, R11
-
- // a[4]
- MOVQ R10, AX
- MULQ 32(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 32(DI)
- MOVQ DX, R11
-
- // a[5]
- MOVQ R10, AX
- MULQ 40(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 40(DI)
- MOVQ DX, R11
-
- // a[6]
- MOVQ R10, AX
- MULQ 48(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 48(DI)
- MOVQ DX, R11
-
- // a[7]
- MOVQ R10, AX
- MULQ 56(SI)
- ADDQ R11, AX
- ADCQ $0, DX
- MOVQ AX, 56(DI)
-
- // a[8]: the remaining high half is the ninth word of the product
- MOVQ DX, 64(DI)
- RET
-
-
- // cswap512 swaps the two 64-byte (512-bit) buffers at x and y when
- // choice is 1 and leaves both unchanged when choice is 0. The same
- // instruction sequence runs in both cases; no branch depends on choice.
- // choice must be 0 or 1.
- //
- // Frame layout: x+0(FP) and y+8(FP) are pointers, choice+16(FP) is one byte.
- // Registers used: AX, SI, DI, X0, X1, X2, X15
- TEXT ·cswap512(SB),NOSPLIT,$0-17
- MOVQ x+0(FP), DI
- MOVQ y+8(FP), SI
- MOVBLZX choice+16(FP), AX // zero-extended byte: 0 or 1
-
- // Turn choice into a mask: 0 stays 0, 1 becomes all ones.
- NEGQ AX
-
- // MOVQ puts AX into the low quadword of X15 and zeroes the high one.
- // PSHUFD $0 then replicates the lowest doubleword into all four
- // lanes, so X15 ends up all zeros or all ones.
- MOVQ AX, X15
- PSHUFD $0, X15, X15
-
- // CSWAP_BLOCK(blk) processes the 16-byte block at byte offset blk*16:
- // X2 = X15 & (x ^ y); x ^= X2; y ^= X2
- #ifndef CSWAP_BLOCK
- #define CSWAP_BLOCK(blk) \
- MOVOU (blk*16)(SI), X1 \
- MOVOU (blk*16)(DI), X0 \
- MOVO X0, X2 \
- PXOR X1, X2 \
- PAND X15, X2 \
- PXOR X2, X1 \
- PXOR X2, X0 \
- MOVOU X0, (blk*16)(DI) \
- MOVOU X1, (blk*16)(SI)
- #endif
-
- // Four 16-byte blocks cover the whole 64 bytes.
- CSWAP_BLOCK(0)
- CSWAP_BLOCK(1)
- CSWAP_BLOCK(2)
- CSWAP_BLOCK(3)
-
- RET
-
- // mulBmiAsm implements Montgomery multiplication interleaved with
- // Montgomery reduction. It uses MULX and ADCX/ADOX instructions.
- // Implementation specific to 511-bit prime 'p'. Clobbers AX, BX, CX, DX, SI, DI and R8-R15; BP is saved in the 8-byte frame and restored before RET.
- // The 8 words written to res do not get a final reduction here (see the note before RET).
- // func mulBmiAsm(res, x, y *fp)
- TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24
-
- MOVQ x+8(FP), DI // multiplicand, consumed one 64-bit word per round
- MOVQ y+16(FP), SI // multiplier, all 8 words are read in every round
- // R8-R15 together with BP form the 9-word accumulator; start from zero.
- XORQ R8, R8
- XORQ R9, R9
- XORQ R10, R10
- XORQ R11, R11
- XORQ R12, R12
- XORQ R13, R13
- XORQ R14, R14
- XORQ R15, R15
-
- MOVQ BP, 0(SP) // save BP in the local frame slot
- XORQ BP, BP // BP becomes the 9th accumulator word
-
- // MULS_MULX_512(idx, r0..r8): one round. Computes DX = (r0 + x[idx]*y[0]) * pNegInv (low 64 bits), adds DX*p to r0..r8, then adds x[idx]*y to r0..r8. Assuming pNegInv (defined elsewhere) is -1/p mod 2^64, r0 ends up zero. Uses BMI2 (MULX) and ADCX/ADOX; clobbers AX, BX, CX, DX. XORQ AX, AX clears CF and OF before each pair of carry chains.
- #ifdef MULS_MULX_512
- #undef MULS_MULX_512
- #endif
- #define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
- \ // Reduction step
- MOVQ ( 0)(SI), DX \
- MULXQ ( 8*idx)(DI), DX, CX \
- ADDQ r0, DX \
- MULXQ ·pNegInv(SB), DX, CX \
- \
- XORQ AX, AX \
- MULXQ ·p+ 0(SB), AX, BX; ; ADOXQ AX, r0 \
- MULXQ ·p+ 8(SB), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \
- MULXQ ·p+16(SB), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \
- MULXQ ·p+24(SB), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \
- MULXQ ·p+32(SB), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \
- MULXQ ·p+40(SB), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \
- MULXQ ·p+48(SB), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \
- MULXQ ·p+56(SB), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \
- MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8 \
- \ // Multiplication step
- MOVQ (8*idx)(DI), DX \
- \
- XORQ AX, AX \
- MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0 \
- MULXQ ( 8)(SI), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \
- MULXQ (16)(SI), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \
- MULXQ (24)(SI), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \
- MULXQ (32)(SI), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \
- MULXQ (40)(SI), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \
- MULXQ (48)(SI), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \
- MULXQ (56)(SI), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \
- MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8
- // Eight rounds, one per word of x. The register list rotates by one each round, so the previous r0 is reused as the next round's top word r8.
- MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, R15, BP)
- MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, R15, BP, R8)
- MULS_MULX_512(2, R10, R11, R12, R13, R14, R15, BP, R8, R9)
- MULS_MULX_512(3, R11, R12, R13, R14, R15, BP, R8, R9, R10)
- MULS_MULX_512(4, R12, R13, R14, R15, BP, R8, R9, R10, R11)
- MULS_MULX_512(5, R13, R14, R15, BP, R8, R9, R10, R11, R12)
- MULS_MULX_512(6, R14, R15, BP, R8, R9, R10, R11, R12, R13)
- MULS_MULX_512(7, R15, BP, R8, R9, R10, R11, R12, R13, R14)
- #undef MULS_MULX_512
- // Store BP, R8..R14 (r1..r8 of the last round) as result words 0..7.
- MOVQ res+0(FP), DI // DI is reused for the result pointer; x is no longer needed
- MOVQ BP, ( 0)(DI)
- MOVQ R8, ( 8)(DI)
- MOVQ R9, (16)(DI)
- MOVQ R10, (24)(DI)
- MOVQ R11, (32)(DI)
- MOVQ R12, (40)(DI)
- MOVQ R13, (48)(DI)
- MOVQ R14, (56)(DI)
- MOVQ 0(SP), BP // restore the BP value saved on entry
-
- // NOTE: the value now stored at res (DI) still needs to be reduced if it is > p; this routine returns without doing that.
- RET
|