1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-22 15:18:57 +00:00

sidh: use SIMD for performing CSWAP

Loads data into 128-bit XMM registers and performs a conditional swap.
This is probably less useful for SIDH, but will be useful for cSIDH.
This commit is contained in:
Henry Case 2018-10-25 17:09:58 +01:00
parent 9d0050cbee
commit b3470af4d5
2 changed files with 63 additions and 134 deletions

View File

@ -90,31 +90,42 @@ TEXT ·fp503ConditionalSwap(SB),NOSPLIT,$0-17
MOVQ x+0(FP), REG_P1 MOVQ x+0(FP), REG_P1
MOVQ y+8(FP), REG_P2 MOVQ y+8(FP), REG_P2
MOVB choice+16(FP), AL // AL = 0 or 1 MOVBLZX choice+16(FP), AX // AL = 0 or 1
MOVBLZX AL, AX // AX = 0 or 1
NEGQ AX // RAX = 0x00..00 or 0xff..ff // Make AX, so that either all bits are set or non
// AX = 0 or 1
NEGQ AX
// Fill XMM15. After this step the upper 64 bits of XMM15 are
// zero and the lower 64 bits hold the value of AX.
MOVQ AX, X15
// Copy the lowest doubleword (low 32 bits of AX) into all four
// doubleword lanes of XMM15. As AX has either all bits set
// or none, the result is that XMM15 also has either
// all bits set or none of them.
PSHUFD $0, X15, X15
#ifndef CSWAP_BLOCK #ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \ #define CSWAP_BLOCK(idx) \
MOVQ (idx*8)(REG_P1), BX \ // BX = x[idx] MOVOU (idx*16)(REG_P1), X0 \
MOVQ (idx*8)(REG_P2), CX \ // CX = y[idx] MOVOU (idx*16)(REG_P2), X1 \
MOVQ CX, DX \ // DX = y[idx] \ // X2 = mask & (X0 ^ X1)
XORQ BX, DX \ // DX = y[idx] ^ x[idx] MOVO X1, X2 \
ANDQ AX, DX \ // DX = (y[idx] ^ x[idx]) & mask PXOR X0, X2 \
XORQ DX, BX \ // BX = (y[idx] ^ x[idx]) & mask) ^ x[idx] = x[idx] or y[idx] PAND X15, X2 \
XORQ DX, CX \ // CX = (y[idx] ^ x[idx]) & mask) ^ y[idx] = y[idx] or x[idx] \
MOVQ BX, (idx*8)(REG_P1) \ PXOR X2, X0 \
MOVQ CX, (idx*8)(REG_P2) PXOR X2, X1 \
\
MOVOU X0, (idx*16)(REG_P1) \
MOVOU X1, (idx*16)(REG_P2)
#endif #endif
CSWAP_BLOCK(0) CSWAP_BLOCK(0)
CSWAP_BLOCK(1) CSWAP_BLOCK(1)
CSWAP_BLOCK(2) CSWAP_BLOCK(2)
CSWAP_BLOCK(3) CSWAP_BLOCK(3)
CSWAP_BLOCK(4)
CSWAP_BLOCK(5)
CSWAP_BLOCK(6)
CSWAP_BLOCK(7)
#ifdef CSWAP_BLOCK #ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK #undef CSWAP_BLOCK

View File

@ -126,130 +126,48 @@ TEXT ·fp751ConditionalSwap(SB), NOSPLIT, $0-17
MOVQ x+0(FP), REG_P1 MOVQ x+0(FP), REG_P1
MOVQ y+8(FP), REG_P2 MOVQ y+8(FP), REG_P2
MOVB choice+16(FP), AL // AL = 0 or 1 MOVBLZX choice+16(FP), AX // AL = 0 or 1
MOVBLZX AL, AX // AX = 0 or 1
NEGQ AX // RAX = 0x00..00 or 0xff..ff
MOVQ (0*8)(REG_P1), BX // BX = x[0] // Make AX, so that either all bits are set or non
MOVQ (0*8)(REG_P2), CX // CX = y[0] // AX = 0 or 1
MOVQ CX, DX // DX = y[0] NEGQ AX
XORQ BX, DX // DX = y[0] ^ x[0]
ANDQ AX, DX // DX = (y[0] ^ x[0]) & mask
XORQ DX, BX // BX = (y[0] ^ x[0]) & mask) ^ x[0] = x[0] or y[0]
XORQ DX, CX // CX = (y[0] ^ x[0]) & mask) ^ y[0] = y[0] or x[0]
MOVQ BX, (0*8)(REG_P1)
MOVQ CX, (0*8)(REG_P2)
MOVQ (1*8)(REG_P1), BX // Fill xmm15. After this step first half of XMM15 is
MOVQ (1*8)(REG_P2), CX // just zeros and second half is whatever in AX
MOVQ CX, DX MOVQ AX, X15
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (1*8)(REG_P1)
MOVQ CX, (1*8)(REG_P2)
MOVQ (2*8)(REG_P1), BX // Copy lower double word everywhere else. So that
MOVQ (2*8)(REG_P2), CX // XMM15=AL|AL|AL|AL. As AX has either all bits set
MOVQ CX, DX // or non result will be that XMM15 has also either
XORQ BX, DX // all bits set or non of them.
ANDQ AX, DX PSHUFD $0, X15, X15
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (2*8)(REG_P1)
MOVQ CX, (2*8)(REG_P2)
MOVQ (3*8)(REG_P1), BX #ifndef CSWAP_BLOCK
MOVQ (3*8)(REG_P2), CX #define CSWAP_BLOCK(idx) \
MOVQ CX, DX MOVOU (idx*16)(REG_P1), X0 \
XORQ BX, DX MOVOU (idx*16)(REG_P2), X1 \
ANDQ AX, DX \ // X2 = mask & (X0 ^ X1)
XORQ DX, BX MOVO X1, X2 \
XORQ DX, CX PXOR X0, X2 \
MOVQ BX, (3*8)(REG_P1) PAND X15, X2 \
MOVQ CX, (3*8)(REG_P2) \
PXOR X2, X0 \
PXOR X2, X1 \
\
MOVOU X0, (idx*16)(REG_P1) \
MOVOU X1, (idx*16)(REG_P2)
#endif
MOVQ (4*8)(REG_P1), BX CSWAP_BLOCK(0)
MOVQ (4*8)(REG_P2), CX CSWAP_BLOCK(1)
MOVQ CX, DX CSWAP_BLOCK(2)
XORQ BX, DX CSWAP_BLOCK(3)
ANDQ AX, DX CSWAP_BLOCK(4)
XORQ DX, BX CSWAP_BLOCK(5)
XORQ DX, CX
MOVQ BX, (4*8)(REG_P1)
MOVQ CX, (4*8)(REG_P2)
MOVQ (5*8)(REG_P1), BX
MOVQ (5*8)(REG_P2), CX
MOVQ CX, DX
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (5*8)(REG_P1)
MOVQ CX, (5*8)(REG_P2)
MOVQ (6*8)(REG_P1), BX
MOVQ (6*8)(REG_P2), CX
MOVQ CX, DX
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (6*8)(REG_P1)
MOVQ CX, (6*8)(REG_P2)
MOVQ (7*8)(REG_P1), BX
MOVQ (7*8)(REG_P2), CX
MOVQ CX, DX
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (7*8)(REG_P1)
MOVQ CX, (7*8)(REG_P2)
MOVQ (8*8)(REG_P1), BX
MOVQ (8*8)(REG_P2), CX
MOVQ CX, DX
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (8*8)(REG_P1)
MOVQ CX, (8*8)(REG_P2)
MOVQ (9*8)(REG_P1), BX
MOVQ (9*8)(REG_P2), CX
MOVQ CX, DX
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (9*8)(REG_P1)
MOVQ CX, (9*8)(REG_P2)
MOVQ (10*8)(REG_P1), BX
MOVQ (10*8)(REG_P2), CX
MOVQ CX, DX
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (10*8)(REG_P1)
MOVQ CX, (10*8)(REG_P2)
MOVQ (11*8)(REG_P1), BX
MOVQ (11*8)(REG_P2), CX
MOVQ CX, DX
XORQ BX, DX
ANDQ AX, DX
XORQ DX, BX
XORQ DX, CX
MOVQ BX, (11*8)(REG_P1)
MOVQ CX, (11*8)(REG_P2)
#ifdef CSWAP_BLOCK
#undef CSWAP_BLOCK
#endif
RET RET
TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24 TEXT ·fp751AddReduced(SB), NOSPLIT, $0-24