1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-26 17:11:22 +00:00
nobs/dh/csidh/arith_amd64.s
2019-02-12 21:28:37 +00:00

272 lines
7.7 KiB
ArmAsm

// +build amd64,!noasm
#include "textflag.h"
// mul512 multiplies a 512-bit value by a 64-bit value and stores the low
// 512 bits of the product. The high half of the top partial product and
// the final carry are discarded. Uses the MULX instruction.
//   x = y * z
//
// func mul512(a, b *u512, c uint64)
TEXT ·mul512(SB), NOSPLIT, $0-24
	MOVQ	x+0(FP), DI		// DI = result pointer
	MOVQ	y+8(FP), SI		// SI = pointer to 512-bit multiplicand
	MOVQ	z+16(FP), DX		// DX = 64-bit multiplier (implicit MULX source)

	// x[0] = lo(y[0]*z)
	MULXQ	0(SI), AX, R8
	MOVQ	AX, 0(DI)

	// x[1] = lo(y[1]*z) + hi(y[0]*z); ADDQ starts the carry chain.
	// MULX and MOVQ leave the flags alone, so CF survives between limbs.
	MULXQ	8(SI), AX, R9
	ADDQ	R8, AX
	MOVQ	AX, 8(DI)

	// x[i] = lo(y[i]*z) + hi(y[i-1]*z) + carry, for i = 2..7
	MULXQ	16(SI), AX, R8
	ADCQ	R9, AX
	MOVQ	AX, 16(DI)

	MULXQ	24(SI), AX, R9
	ADCQ	R8, AX
	MOVQ	AX, 24(DI)

	MULXQ	32(SI), AX, R8
	ADCQ	R9, AX
	MOVQ	AX, 32(DI)

	MULXQ	40(SI), AX, R9
	ADCQ	R8, AX
	MOVQ	AX, 40(DI)

	MULXQ	48(SI), AX, R8
	ADCQ	R9, AX
	MOVQ	AX, 48(DI)

	MULXQ	56(SI), AX, R9
	ADCQ	R8, AX
	MOVQ	AX, 56(DI)
	RET
// add512 computes x = y + z on 512-bit values held as eight 64-bit limbs
// (limb at offset 0 is added first) and returns the carry out of the last
// limb as 0 or 1.
//
// func add512(x, y, z *u512) uint64
TEXT ·add512(SB), NOSPLIT, $0-32
	MOVQ	x+0(FP), DI		// DI = result pointer
	MOVQ	y+8(FP), SI		// SI = first summand pointer
	MOVQ	z+16(FP), DX		// DX = second summand pointer

	// Zero the return register now: XORQ rewrites the flags, so it must
	// not be placed inside or after the carry chain below.
	XORQ	AX, AX

	// limb 0 opens the carry chain
	MOVQ	0(SI), CX
	ADDQ	0(DX), CX
	MOVQ	CX, 0(DI)

	// limbs 1..7 propagate the carry
	MOVQ	8(SI), CX
	ADCQ	8(DX), CX
	MOVQ	CX, 8(DI)

	MOVQ	16(SI), CX
	ADCQ	16(DX), CX
	MOVQ	CX, 16(DI)

	MOVQ	24(SI), CX
	ADCQ	24(DX), CX
	MOVQ	CX, 24(DI)

	MOVQ	32(SI), CX
	ADCQ	32(DX), CX
	MOVQ	CX, 32(DI)

	MOVQ	40(SI), CX
	ADCQ	40(DX), CX
	MOVQ	CX, 40(DI)

	MOVQ	48(SI), CX
	ADCQ	48(DX), CX
	MOVQ	CX, 48(DI)

	MOVQ	56(SI), CX
	ADCQ	56(DX), CX
	MOVQ	CX, 56(DI)

	// AX = 0 + 0 + CF, i.e. the final carry
	ADCQ	AX, AX
	MOVQ	AX, ret+24(FP)
	RET
// sub512 computes x = y - z on 512-bit values held as eight 64-bit limbs
// (limb at offset 0 is subtracted first) and returns the borrow out of
// the last limb as 0 or 1.
//
// func sub512(x, y, z *u512) uint64
TEXT ·sub512(SB), NOSPLIT, $0-32
	MOVQ	x+0(FP), DI		// DI = result pointer
	MOVQ	y+8(FP), SI		// SI = minuend pointer
	MOVQ	z+16(FP), DX		// DX = subtrahend pointer

	// Zero the return register now: XORQ rewrites the flags, so it must
	// not be placed inside or after the borrow chain below.
	XORQ	AX, AX

	// limb 0 opens the borrow chain
	MOVQ	0(SI), CX
	SUBQ	0(DX), CX
	MOVQ	CX, 0(DI)

	// limbs 1..7 propagate the borrow
	MOVQ	8(SI), CX
	SBBQ	8(DX), CX
	MOVQ	CX, 8(DI)

	MOVQ	16(SI), CX
	SBBQ	16(DX), CX
	MOVQ	CX, 16(DI)

	MOVQ	24(SI), CX
	SBBQ	24(DX), CX
	MOVQ	CX, 24(DI)

	MOVQ	32(SI), CX
	SBBQ	32(DX), CX
	MOVQ	CX, 32(DI)

	MOVQ	40(SI), CX
	SBBQ	40(DX), CX
	MOVQ	CX, 40(DI)

	MOVQ	48(SI), CX
	SBBQ	48(DX), CX
	MOVQ	CX, 48(DI)

	MOVQ	56(SI), CX
	SBBQ	56(DX), CX
	MOVQ	CX, 56(DI)

	// AX = 0 + 0 + CF, i.e. the final borrow
	ADCQ	AX, AX
	MOVQ	AX, ret+24(FP)
	RET
// cswap512 exchanges the 64 bytes at x with the 64 bytes at y when choice
// is 1 and leaves both unchanged when choice is 0. No branch is taken on
// choice: the swap is done by masking an XOR difference.
TEXT ·cswap512(SB),NOSPLIT,$0-17
	MOVQ	x+0(FP), DI
	MOVQ	y+8(FP), SI
	MOVBLZX	choice+16(FP), AX	// AX = choice byte, zero-extended

	// Stretch the flag into a full-width mask:
	// 0 stays 0, 1 becomes 0xFFFFFFFFFFFFFFFF.
	NEGQ	AX

	// Low 64 bits of X15 receive the mask, the upper 64 bits are cleared.
	MOVQ	AX, X15

	// Replicate the lowest 32-bit lane into all four lanes. Because the
	// mask is all-zeros or all-ones, X15 ends up all-zeros or all-ones.
	PSHUFD	$0, X15, X15

// CSWAP_BLOCK(idx) processes the idx-th 16-byte slice of both operands:
//   X2 = (x ^ y) & mask;  x ^= X2;  y ^= X2
// Expects DI = x, SI = y, X15 = mask. Clobbers X0, X1, X2.
#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
	MOVOU	(idx*16)(DI), X0 \
	MOVOU	(idx*16)(SI), X1 \
	\
	MOVO	X0, X2 \
	PXOR	X1, X2 \
	PAND	X15, X2 \
	\
	PXOR	X2, X0 \
	PXOR	X2, X1 \
	\
	MOVOU	X0, (idx*16)(DI) \
	MOVOU	X1, (idx*16)(SI)
#endif

	// 4 slices x 16 bytes = 512 bits
	CSWAP_BLOCK(0)
	CSWAP_BLOCK(1)
	CSWAP_BLOCK(2)
	CSWAP_BLOCK(3)
	RET
// crdc512 performs a conditional reduction in place:
//   val = (val < p) ? val : val - p
// The choice between val and val-p is made with CMOV rather than a branch.
TEXT ·crdc512(SB),NOSPLIT,$0-8
	MOVQ	val+0(FP), DI

	// Compute t = val - p limb by limb into SI, DX, CX, R8..R12.
	// After the last SBBQ, CF is set exactly when val < p.
	MOVQ	0(DI), SI
	SUBQ	·p+0(SB), SI
	MOVQ	8(DI), DX
	SBBQ	·p+8(SB), DX
	MOVQ	16(DI), CX
	SBBQ	·p+16(SB), CX
	MOVQ	24(DI), R8
	SBBQ	·p+24(SB), R8
	MOVQ	32(DI), R9
	SBBQ	·p+32(SB), R9
	MOVQ	40(DI), R10
	SBBQ	·p+40(SB), R10
	MOVQ	48(DI), R11
	SBBQ	·p+48(SB), R11
	MOVQ	56(DI), R12
	SBBQ	·p+56(SB), R12

	// For every limb: reload val[i], replace it with t[i] when there was
	// no borrow (CF clear), and write it back. MOVQ and CMOV do not modify
	// the flags, so CF from the subtraction stays valid throughout.
	MOVQ	0(DI), AX
	CMOVQCC	SI, AX
	MOVQ	AX, 0(DI)

	MOVQ	8(DI), AX
	CMOVQCC	DX, AX
	MOVQ	AX, 8(DI)

	MOVQ	16(DI), AX
	CMOVQCC	CX, AX
	MOVQ	AX, 16(DI)

	MOVQ	24(DI), AX
	CMOVQCC	R8, AX
	MOVQ	AX, 24(DI)

	MOVQ	32(DI), AX
	CMOVQCC	R9, AX
	MOVQ	AX, 32(DI)

	MOVQ	40(DI), AX
	CMOVQCC	R10, AX
	MOVQ	AX, 40(DI)

	MOVQ	48(DI), AX
	CMOVQCC	R11, AX
	MOVQ	AX, 48(DI)

	MOVQ	56(DI), AX
	CMOVQCC	R12, AX
	MOVQ	AX, 56(DI)
	RET
// csubrdc512 conditionally adds p to val in place:
//   val = (choice != 0) ? val + p : val
// The addend is selected with CMOV, so no branch is taken on choice.
// The carry out of the last limb is discarded.
TEXT ·csubrdc512(SB),NOSPLIT,$0-16
	MOVQ	val+0(FP), DI
	MOVQ	choice+8(FP), SI

	// addend (R8..R15) = 0
	XORQ	R8, R8
	XORQ	R9, R9
	XORQ	R10, R10
	XORQ	R11, R11
	XORQ	R12, R12
	XORQ	R13, R13
	XORQ	R14, R14
	XORQ	R15, R15

	// ZF = (choice == 0); when choice is non-zero, addend = p.
	// TESTQ must follow the XORQs because they overwrite the flags.
	TESTQ	SI, SI
	CMOVQNE	·p+0(SB), R8
	CMOVQNE	·p+8(SB), R9
	CMOVQNE	·p+16(SB), R10
	CMOVQNE	·p+24(SB), R11
	CMOVQNE	·p+32(SB), R12
	CMOVQNE	·p+40(SB), R13
	CMOVQNE	·p+48(SB), R14
	CMOVQNE	·p+56(SB), R15

	// val += addend, limb by limb with carry propagation
	MOVQ	0(DI), AX
	ADDQ	R8, AX
	MOVQ	AX, 0(DI)

	MOVQ	8(DI), AX
	ADCQ	R9, AX
	MOVQ	AX, 8(DI)

	MOVQ	16(DI), AX
	ADCQ	R10, AX
	MOVQ	AX, 16(DI)

	MOVQ	24(DI), AX
	ADCQ	R11, AX
	MOVQ	AX, 24(DI)

	MOVQ	32(DI), AX
	ADCQ	R12, AX
	MOVQ	AX, 32(DI)

	MOVQ	40(DI), AX
	ADCQ	R13, AX
	MOVQ	AX, 40(DI)

	MOVQ	48(DI), AX
	ADCQ	R14, AX
	MOVQ	AX, 48(DI)

	MOVQ	56(DI), AX
	ADCQ	R15, AX
	MOVQ	AX, 56(DI)
	RET
// mul implements Montgomery multiplication with the reduction interleaved
// into the multiplication, one 64-bit limb of y per round (8 rounds).
// It takes advantage of the fact that the inverse constant used for the
// reduction (·pNegInv) is a single 64-bit word.
//
// z = x*y mod p   (in the argument names used below: x = y*z, Montgomery form)
//
// Arguments (read via FP below):
//   x+0(FP)  - result pointer, written at the end
//   y+8(FP)  - first operand, one limb consumed per round
//   z+16(FP) - second operand, all eight limbs used in every round
//
// Registers: R8-R15 and BP form a nine-limb accumulator. BX, CX, AX and DX
// are scratch. The instructions MULX, ADCX and ADOX are used, so the CPU
// must support the BMI2 and ADX extensions.
//
// NOTE: the value stored to x is not conditionally reduced here (see the
// comment before RET).
TEXT ·mul(SB),NOSPLIT,$32-24
MOVQ y+ 8(FP), DI // multiplicand
MOVQ z+16(FP), SI // multiplier
// Clear the accumulator limbs.
XORQ R8, R8
XORQ R9, R9
XORQ R10, R10
XORQ R11, R11
XORQ R12, R12
XORQ R13, R13
XORQ R14, R14
XORQ R15, R15
// BP is used as the ninth accumulator limb, so its value is parked in the
// local frame (24(SP) lies inside the 32-byte frame declared above) and
// restored before RET.
// NOTE(review): the original author marked this save as possibly wrong
// ("OZAPTF"). Per the Go assembler guide the assembler already saves and
// restores BP for functions with a non-zero frame, so this explicit save
// may be redundant - confirm.
MOVQ BP, 24(SP) // OZAPTF: thats maybe wrong
XORQ BP, BP
// Uses BMI2 and MULX
//
// MULS_MULX_512(idx, r0..r8) performs one round for limb y[idx], where
// r0 is the lowest accumulator limb and r8 the highest:
//
//   Reduction step:
//     DX = low64(z[0] * y[idx]) + r0     (the low limb the accumulator will
//                                         have after the multiplication step)
//     DX = low64(DX * pNegInv)           (call this m)
//     (r0..r8) += m * p
//   Multiplication step:
//     (r0..r8) += y[idx] * z
//
// Each "+=" runs two independent carry chains: ADOXQ adds the low halves
// of the partial products and carries through OF, ADCXQ adds the high
// halves (one limb higher) and carries through CF. XORQ AX, AX clears both
// flags before each chain; MOVQ $0, AX at the end supplies a zero operand
// without touching the flags.
//
// Presumably ·pNegInv is -p^(-1) mod 2^64 (defined elsewhere - confirm);
// under that assumption r0 is zero after both steps, which is why the
// invocations below rotate the register list by one position each round.
#ifdef MULS_MULX_512
#undef MULS_MULX_512
#endif
#define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
\ // Reduction step
MOVQ ( 0)(SI), DX \
MULXQ ( 8*idx)(DI), DX, CX \
ADDQ r0, DX \
MULXQ ·pNegInv(SB), DX, CX \
\ // Clean flags
XORQ AX, AX \
MULXQ ·p+ 0(SB), AX, BX; ; ADOXQ AX, r0 \
MULXQ ·p+ 8(SB), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \
MULXQ ·p+16(SB), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \
MULXQ ·p+24(SB), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \
MULXQ ·p+32(SB), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \
MULXQ ·p+40(SB), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \
MULXQ ·p+48(SB), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \
MULXQ ·p+56(SB), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \
MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8 \
\ // Multiplication step
MOVQ (8*idx)(DI), DX \
\ // Clean flags
XORQ AX, AX \
MULXQ ( 0)(SI), AX, BX; ADOXQ AX, r0 \
MULXQ ( 8)(SI), AX, CX; ADCXQ BX, r1; ADOXQ AX, r1 \
MULXQ (16)(SI), AX, BX; ADCXQ CX, r2; ADOXQ AX, r2 \
MULXQ (24)(SI), AX, CX; ADCXQ BX, r3; ADOXQ AX, r3 \
MULXQ (32)(SI), AX, BX; ADCXQ CX, r4; ADOXQ AX, r4 \
MULXQ (40)(SI), AX, CX; ADCXQ BX, r5; ADOXQ AX, r5 \
MULXQ (48)(SI), AX, BX; ADCXQ CX, r6; ADOXQ AX, r6 \
MULXQ (56)(SI), AX, CX; ADCXQ BX, r7; ADOXQ AX, r7 \
MOVQ $0, AX ; ADCXQ CX, r8; ADOXQ AX, r8
// Eight rounds, one per limb of y. The register list is rotated left by
// one on every round, so the previous round's r1 becomes this round's r0
// and the previous r0 is reused as the new top limb r8.
MULS_MULX_512(0, R8, R9, R10, R11, R12, R13, R14, R15, BP)
MULS_MULX_512(1, R9, R10, R11, R12, R13, R14, R15, BP, R8)
MULS_MULX_512(2, R10, R11, R12, R13, R14, R15, BP, R8, R9)
MULS_MULX_512(3, R11, R12, R13, R14, R15, BP, R8, R9, R10)
MULS_MULX_512(4, R12, R13, R14, R15, BP, R8, R9, R10, R11)
MULS_MULX_512(5, R13, R14, R15, BP, R8, R9, R10, R11, R12)
MULS_MULX_512(6, R14, R15, BP, R8, R9, R10, R11, R12, R13)
MULS_MULX_512(7, R15, BP, R8, R9, R10, R11, R12, R13, R14)
// After round 7 the result limbs are r1..r8 of that round, i.e.
// BP, R8, ..., R14 from least to most significant; R15 (that round's r0)
// is not stored.
MOVQ x+0(FP), DI
MOVQ BP, ( 0)(DI)
MOVQ R8, ( 8)(DI)
MOVQ R9, (16)(DI)
MOVQ R10, (24)(DI)
MOVQ R11, (32)(DI)
MOVQ R12, (40)(DI)
MOVQ R13, (48)(DI)
MOVQ R14, (56)(DI)
// Restore the BP value parked at function entry.
MOVQ 24(SP), BP
// NOTE: no conditional subtraction of p is performed here. The value now
// stored at DI still needs to be reduced if it is > p (original note).
RET
// checkBigger reports whether x > y for two 512-bit values held as eight
// 64-bit limbs. Returns 1 if x > y, otherwise 0.
//
// It computes y - x limb by limb and throws the difference away; only the
// borrow matters. The borrow out of the last limb is set exactly when
// y < x, i.e. when x > y. No branch is taken on the data.
//
// Arguments (frame $0-24):
//   x+0(FP)    - pointer to x
//   y+8(FP)    - pointer to y
//   ret+16(FP) - result, 0 or 1
TEXT ·checkBigger(SB),NOSPLIT,$0-24
	MOVQ	x+0(FP), DI		// DI = x (subtrahend)
	MOVQ	y+8(FP), SI		// SI = y (minuend)

	// Zero the return register before the borrow chain: XORQ rewrites the
	// flags and therefore cannot be placed after the SUBQ/SBBQ sequence.
	XORQ	AX, AX

	// limb 0 opens the borrow chain
	MOVQ	0(SI), R8
	SUBQ	0(DI), R8

	// limbs 1..7 propagate the borrow (MOVQ leaves the flags untouched)
	MOVQ	8(SI), R8
	SBBQ	8(DI), R8
	MOVQ	16(SI), R8
	SBBQ	16(DI), R8
	MOVQ	24(SI), R8
	SBBQ	24(DI), R8
	MOVQ	32(SI), R8
	SBBQ	32(DI), R8
	MOVQ	40(SI), R8
	SBBQ	40(DI), R8
	MOVQ	48(SI), R8
	SBBQ	48(DI), R8
	MOVQ	56(SI), R8
	SBBQ	56(DI), R8

	// AX = 0 + 0 + CF, i.e. the final borrow
	ADCQ	AX, AX

	// Store through the FP pseudo-register instead of the raw 24(SP):
	// the hardware-SP offset only coincides with the result slot while
	// the frame size is 0, whereas ret+16(FP) stays correct regardless.
	MOVQ	AX, ret+16(FP)
	RET