mirror of
https://github.com/henrydcase/nobs.git
synced 2024-11-25 00:21:29 +00:00
193 lines
6.9 KiB
ArmAsm
193 lines
6.9 KiB
ArmAsm
|
// +build amd64,!noasm
|
||
|
|
||
|
#include "textflag.h"
|
||
|
|
||
|
// Multipies 512-bit value by 64-bit value. Uses MULQ instruction to
|
||
|
// multiply 2 64-bit values.
|
||
|
//
|
||
|
// Result: x = (y * z) mod 2^512
|
||
|
//
|
||
|
// Registers used: AX, CX, DX, SI, DI, R8
|
||
|
//
|
||
|
// func mul512(a, b *Fp, c uint64)
|
||
|
TEXT ·mul512(SB), NOSPLIT, $0-24
|
||
|
MOVQ a+0(FP), DI // result
|
||
|
MOVQ b+8(FP), SI // multiplicand
|
||
|
|
||
|
// Check wether to use optimized implementation
|
||
|
CMPB ·hasBMI2(SB), $1
|
||
|
JE mul512_mulx
|
||
|
|
||
|
MOVQ c+16(FP), R10 // 64 bit multiplier, used by MULQ
|
||
|
MOVQ R10, AX; MULQ 0(SI); MOVQ DX, R11; MOVQ AX, 0(DI) //x[0]
|
||
|
MOVQ R10, AX; MULQ 8(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 8(DI) //x[1]
|
||
|
MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 16(DI) //x[2]
|
||
|
MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 24(DI) //x[3]
|
||
|
MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 32(DI) //x[4]
|
||
|
MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 40(DI) //x[5]
|
||
|
MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 48(DI) //x[6]
|
||
|
MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX; MOVQ AX, 56(DI) //x[7]
|
||
|
RET
|
||
|
|
||
|
// Optimized for CPUs with BMI2
|
||
|
mul512_mulx:
|
||
|
MOVQ c+16(FP), DX // 64 bit multiplier, used by MULX
|
||
|
MULXQ 0(SI), AX, R10; MOVQ AX, 0(DI) // x[0]
|
||
|
MULXQ 8(SI), AX, R11; ADDQ R10, AX; MOVQ AX, 8(DI) // x[1]
|
||
|
MULXQ 16(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 16(DI) // x[2]
|
||
|
MULXQ 24(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 24(DI) // x[3]
|
||
|
MULXQ 32(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 32(DI) // x[4]
|
||
|
MULXQ 40(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 40(DI) // x[5]
|
||
|
MULXQ 48(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 48(DI) // x[6]
|
||
|
MULXQ 56(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 56(DI) // x[7]
|
||
|
RET
|
||
|
|
||
|
// Multipies 512-bit value by 64-bit value and returns 576-bit result. Uses MULQ instruction to
|
||
|
// multiply 2 64-bit values. Returns 576-bit result.
|
||
|
//
|
||
|
// Result: x = (y * z)
|
||
|
//
|
||
|
// Registers used: AX, CX, DX, SI, DI, R8
|
||
|
//
|
||
|
// func mul576(a, b *Fp, c uint64)
|
||
|
TEXT ·mul576(SB), NOSPLIT, $0-24
|
||
|
MOVQ a+0(FP), DI // result
|
||
|
MOVQ b+8(FP), SI // multiplicand
|
||
|
|
||
|
MOVQ c+16(FP), R10 // 64 bit multiplier, used by MULQ
|
||
|
MOVQ R10, AX; MULQ 0(SI); MOVQ DX, R11; MOVQ AX, 0(DI) //x[0]
|
||
|
MOVQ R10, AX; MULQ 8(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 8(DI) //x[1]
|
||
|
MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 16(DI) //x[2]
|
||
|
MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 24(DI) //x[3]
|
||
|
MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 32(DI) //x[4]
|
||
|
MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 40(DI) //x[5]
|
||
|
MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ DX, R11; MOVQ AX, 48(DI) //x[6]
|
||
|
MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX; ADCQ $0, DX; MOVQ AX, 56(DI) //x[7]
|
||
|
MOVQ DX, 64(DI) //x[8]
|
||
|
|
||
|
RET
|
||
|
|
||
|
|
||
|
TEXT ·cswap512(SB),NOSPLIT,$0-17
|
||
|
MOVQ x+0(FP), DI
|
||
|
MOVQ y+8(FP), SI
|
||
|
MOVBLZX choice+16(FP), AX // AL = 0 or 1
|
||
|
|
||
|
// Make AX, so that either all bits are set or non
|
||
|
// AX = 0 or 1
|
||
|
NEGQ AX
|
||
|
|
||
|
// Fill xmm15. After this step first half of XMM15 is
|
||
|
// just zeros and second half is whatever in AX
|
||
|
MOVQ AX, X15
|
||
|
|
||
|
// Copy lower double word everywhere else. So that
|
||
|
// XMM15=AL|AL|AL|AL. As AX has either all bits set
|
||
|
// or non result will be that XMM15 has also either
|
||
|
// all bits set or non of them.
|
||
|
PSHUFD $0, X15, X15
|
||
|
|
||
|
#ifndef CSWAP_BLOCK
|
||
|
#define CSWAP_BLOCK(idx) \
|
||
|
MOVOU (idx*16)(DI), X0 \
|
||
|
MOVOU (idx*16)(SI), X1 \
|
||
|
\ // X2 = mask & (X0 ^ X1)
|
||
|
MOVO X1, X2 \
|
||
|
PXOR X0, X2 \
|
||
|
PAND X15, X2 \
|
||
|
\
|
||
|
PXOR X2, X0 \
|
||
|
PXOR X2, X1 \
|
||
|
\
|
||
|
MOVOU X0, (idx*16)(DI) \
|
||
|
MOVOU X1, (idx*16)(SI)
|
||
|
#endif
|
||
|
|
||
|
CSWAP_BLOCK(0)
|
||
|
CSWAP_BLOCK(1)
|
||
|
CSWAP_BLOCK(2)
|
||
|
CSWAP_BLOCK(3)
|
||
|
|
||
|
RET
|
||
|
|
||
|
// mulAsm implements montgomery multiplication interleaved with
|
||
|
// montgomery reduction. It uses MULX and ADCX/ADOX instructions.
|
||
|
// Implementation specific to 511-bit prime 'p'
|
||
|
//
|
||
|
// func mulBmiAsm(res, x, y *fp)
|
||
|
TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24
|
||
|
|
||
|
MOVQ x+8(FP), DI // multiplicand
|
||
|
MOVQ y+16(FP), SI // multiplier
|
||
|
|
||
|
XORQ R8, R8
|
||
|
XORQ R9, R9
|
||
|
XORQ R10, R10
|
||
|
XORQ R11, R11
|
||
|
XORQ R12, R12
|
||
|
XORQ R13, R13
|
||
|
XORQ R14, R14
|
||
|
XORQ R15, R15
|
||
|
|
||
|
MOVQ BP, 0(SP)
|
||
|
XORQ BP, BP
|
||
|
|
||
|
// Uses BMI2 (MULX)
|
||
|
#ifdef MULS_MULX_512
|
||
|
#undef MULS_MULX_512
|
||
|
#endif
|
||
|
#define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
|
||
|
\ // Reduction step
|
||
|
MOVQ ( 0)(SI), DX \
|
||
|
MULXQ ( 8*idx)(DI), DX, CX \
|
||
|
ADDQ r0, DX \
|
||
|
MULXQ ·pNegInv(SB), DX, CX \
|
||
|
\
|
||
|
XORQ AX, AX \
|
||
|
MULXQ ·p+ 0(SB), AX, BX; ; ADOXQ AX, r0 \
|
||
|
MULXQ ·p+ 8(SB), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \
|
||
|
MULXQ ·p+16(SB), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \
|
||
|
MULXQ ·p+24(SB), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \
|
||
|
MULXQ ·p+32(SB), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \
|
||
|
MULXQ ·p+40(SB), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \
|
||
|
MULXQ ·p+48(SB), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \
|
||
|
MULXQ ·p+56(SB), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \
|
||
|
MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8 \
|
||
|
\ // Multiplication step
|
||
|
MOVQ (8*idx)(DI), DX \
|
||
|
\
|
||
|
XORQ AX, AX \
|
||
|
MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0 \
|
||
|
MULXQ ( 8)(SI), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \
|
||
|
MULXQ (16)(SI), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \
|
||
|
MULXQ (24)(SI), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \
|
||
|
MULXQ (32)(SI), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \
|
||
|
MULXQ (40)(SI), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \
|
||
|
MULXQ (48)(SI), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \
|
||
|
MULXQ (56)(SI), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \
|
||
|
MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8
|
||
|
|
||
|
MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, R15, BP)
|
||
|
MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, R15, BP, R8)
|
||
|
MULS_MULX_512(2, R10, R11, R12, R13, R14, R15, BP, R8, R9)
|
||
|
MULS_MULX_512(3, R11, R12, R13, R14, R15, BP, R8, R9, R10)
|
||
|
MULS_MULX_512(4, R12, R13, R14, R15, BP, R8, R9, R10, R11)
|
||
|
MULS_MULX_512(5, R13, R14, R15, BP, R8, R9, R10, R11, R12)
|
||
|
MULS_MULX_512(6, R14, R15, BP, R8, R9, R10, R11, R12, R13)
|
||
|
MULS_MULX_512(7, R15, BP, R8, R9, R10, R11, R12, R13, R14)
|
||
|
#undef MULS_MULX_512
|
||
|
|
||
|
MOVQ res+0(FP), DI
|
||
|
MOVQ BP, ( 0)(DI)
|
||
|
MOVQ R8, ( 8)(DI)
|
||
|
MOVQ R9, (16)(DI)
|
||
|
MOVQ R10, (24)(DI)
|
||
|
MOVQ R11, (32)(DI)
|
||
|
MOVQ R12, (40)(DI)
|
||
|
MOVQ R13, (48)(DI)
|
||
|
MOVQ R14, (56)(DI)
|
||
|
MOVQ 0(SP), BP
|
||
|
|
||
|
// NOW DI needs to be reduced if > p
|
||
|
RET
|