mirror of
https://github.com/henrydcase/nobs.git
synced 2024-11-22 15:18:57 +00:00
Kris Kwiatkowski
7efbbf4745
Implementation of Commutative Supersingular Isogeny Diffie-Hellman, based on the "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, the implementation converts a curve from Montgomery to Edwards form. All calculations are done on the Edwards curve and the result is then converted back to Montgomery form. * As multiplication in the field Fp511 is the most expensive operation, the implementation contains multiple multiplication routines. The most performant one is an assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains a slower implementation which will run on older CPUs. * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * The Go code has very similar performance when compared to the C implementation.
Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
193 lines
6.9 KiB
ArmAsm
193 lines
6.9 KiB
ArmAsm
// +build amd64,!noasm
|
|
|
|
#include "textflag.h"
|
|
|
|
// mul512 multiplies a 512-bit value by a 64-bit value and keeps only the
// low 512 bits of the product.
//
// Result: a = (b * c) mod 2^512
//
// Two code paths exist, selected at run time by the ·hasBMI2 flag:
//   - generic: MULQ, with the high word of every partial product carried
//     into the next limb through R11;
//   - BMI2:    MULX, which leaves the flags untouched, so a single
//     ADDQ/ADCQ chain propagates the carry across all limbs.
//
// Registers used: AX, DX, SI, DI, R10, R11 (and flags)
//
// func mul512(a, b *Fp, c uint64)
TEXT ·mul512(SB), NOSPLIT, $0-24
	MOVQ	a+0(FP), DI	// result
	MOVQ	b+8(FP), SI	// multiplicand

	// Check whether to use the optimized implementation
	CMPB	·hasBMI2(SB), $1
	JE	mul512_mulx

	MOVQ	c+16(FP), R10	// 64 bit multiplier, used by MULQ

	// Every row computes DX:AX = c * b[i], adds the high word left over
	// from the previous row (R11) to AX, folds the resulting carry into DX
	// and stores AX as a[i]. DX becomes the carry-in of the next row.
	MOVQ R10, AX; MULQ  0(SI);                              MOVQ DX, R11; MOVQ AX,  0(DI) // x[0]
	MOVQ R10, AX; MULQ  8(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX,  8(DI) // x[1]
	MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 16(DI) // x[2]
	MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 24(DI) // x[3]
	MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 32(DI) // x[4]
	MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 40(DI) // x[5]
	MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 48(DI) // x[6]
	// Last limb: the high word in DX is dropped (result is mod 2^512).
	MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX;                              MOVQ AX, 56(DI) // x[7]
	RET

// Optimized for CPUs with BMI2
mul512_mulx:
	MOVQ	c+16(FP), DX	// 64 bit multiplier, used by MULX

	// MULXQ src, lo, hi computes hi:lo = DX * src. The high words alternate
	// between R10 and R11. ADDQ on x[1] starts the carry chain; neither
	// MULXQ nor MOVQ modifies the flags, so CF survives until the next
	// ADCQ. The high word of the last product and the final carry are
	// dropped (result is mod 2^512).
	MULXQ  0(SI), AX, R10;               MOVQ AX,  0(DI) // x[0]
	MULXQ  8(SI), AX, R11; ADDQ R10, AX; MOVQ AX,  8(DI) // x[1]
	MULXQ 16(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 16(DI) // x[2]
	MULXQ 24(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 24(DI) // x[3]
	MULXQ 32(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 32(DI) // x[4]
	MULXQ 40(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 40(DI) // x[5]
	MULXQ 48(SI), AX, R10; ADCQ R11, AX; MOVQ AX, 48(DI) // x[6]
	MULXQ 56(SI), AX, R11; ADCQ R10, AX; MOVQ AX, 56(DI) // x[7]
	RET
|
|
|
|
// mul576 multiplies a 512-bit value by a 64-bit value and returns the full
// 576-bit product. Uses the MULQ instruction to multiply 2 64-bit values.
//
// Result: a = (b * c)
//
// The product is stored as nine 64-bit words (offsets 0..64 from a), so 'a'
// must point to at least 72 bytes of writable memory.
// NOTE(review): the Go declaration is not visible in this file; confirm
// that the first argument really is a 9-word buffer and not an 8-word Fp.
//
// Registers used: AX, DX, SI, DI, R10, R11 (and flags)
//
// func mul576(a, b *Fp, c uint64)
TEXT ·mul576(SB), NOSPLIT, $0-24
	MOVQ	a+0(FP), DI	// result
	MOVQ	b+8(FP), SI	// multiplicand

	MOVQ	c+16(FP), R10	// 64 bit multiplier, used by MULQ

	// Every row computes DX:AX = c * b[i], adds the high word left over
	// from the previous row (R11) to AX, folds the resulting carry into DX
	// and stores AX as a[i]. DX becomes the carry-in of the next row.
	MOVQ R10, AX; MULQ  0(SI);                              MOVQ DX, R11; MOVQ AX,  0(DI) // x[0]
	MOVQ R10, AX; MULQ  8(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX,  8(DI) // x[1]
	MOVQ R10, AX; MULQ 16(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 16(DI) // x[2]
	MOVQ R10, AX; MULQ 24(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 24(DI) // x[3]
	MOVQ R10, AX; MULQ 32(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 32(DI) // x[4]
	MOVQ R10, AX; MULQ 40(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 40(DI) // x[5]
	MOVQ R10, AX; MULQ 48(SI); ADDQ R11, AX; ADCQ $0, DX;   MOVQ DX, R11; MOVQ AX, 48(DI) // x[6]
	MOVQ R10, AX; MULQ 56(SI); ADDQ R11, AX; ADCQ $0, DX;                 MOVQ AX, 56(DI) // x[7]
	// The high word of the last partial product is the ninth result word.
	MOVQ	DX, 64(DI)	// x[8]

	RET
|
|
|
|
|
|
// cswap512 conditionally exchanges the 64 bytes at x with the 64 bytes at y.
// The decision is applied through a bit mask; the code neither branches on
// 'choice' nor uses it to form an address.
//
//   choice == 0 : x and y are left unchanged
//   choice != 0 : x and y are swapped
//
// Registers used: AX, SI, DI, X0, X1, X2, X15 (and flags)
//
// NOTE(review): the Go declaration is not visible in this file; the 17-byte
// argument frame implies two pointers followed by a one-byte 'choice'.
TEXT ·cswap512(SB),NOSPLIT,$0-17
	MOVQ	x+0(FP), DI
	MOVQ	y+8(FP), SI
	MOVBLZX	choice+16(FP), AX	// AX = choice, zero-extended

	// Turn AX into a full-width mask: 0 stays 0, any other value becomes
	// all ones. NEGQ sets CF exactly when its operand is non-zero, and
	// SBBQ AX, AX then yields AX = -CF. Without the SBBQ a choice other
	// than 0 or 1 would leave some mask bits clear and mix the operands.
	NEGQ	AX
	SBBQ	AX, AX

	// Move the mask into the low 64 bits of X15 (upper 64 bits are zeroed).
	MOVQ	AX, X15

	// Broadcast the lowest 32-bit lane of X15 to all four lanes. As AX has
	// either all bits set or none, X15 ends up all ones or all zeros.
	PSHUFD	$0, X15, X15

#ifndef CSWAP_BLOCK
#define CSWAP_BLOCK(idx) \
	MOVOU	(idx*16)(DI), X0 \
	MOVOU	(idx*16)(SI), X1 \
	\ // X2 = mask & (X0 ^ X1)
	MOVO	X1, X2 \
	PXOR	X0, X2 \
	PAND	X15, X2 \
	\ // XOR with X2 swaps X0 and X1 when the mask is set and is a no-op otherwise
	PXOR	X2, X0 \
	PXOR	X2, X1 \
	\
	MOVOU	X0, (idx*16)(DI) \
	MOVOU	X1, (idx*16)(SI)
#endif

	// 4 blocks of 16 bytes cover the whole 512-bit operands.
	CSWAP_BLOCK(0)
	CSWAP_BLOCK(1)
	CSWAP_BLOCK(2)
	CSWAP_BLOCK(3)

	RET
|
|
|
|
// mulBmiAsm implements Montgomery multiplication interleaved with
// Montgomery reduction. It uses MULX and ADCX/ADOX instructions without
// checking for them, so the caller must make sure the CPU supports them.
// Implementation specific to 511-bit prime 'p'.
//
// The accumulator is nine 64-bit words held in R8-R15 and BP. Each of the
// eight rounds processes one word x[idx]; the register list passed to
// MULS_MULX_512 is rotated by one position per round, so after the last
// round the result words are in BP, R8, ..., R14.
//
// The value written to res is not brought into the range [0, p) here; the
// caller still has to perform the final reduction.
//
// Registers used: AX, BX, CX, DX, SI, DI, R8-R15, BP (and flags)
//
// func mulBmiAsm(res, x, y *fp)
TEXT ·mulBmiAsm(SB),NOSPLIT,$8-24

	MOVQ	x+8(FP), DI	// multiplicand
	MOVQ	y+16(FP), SI	// multiplier

	// Clear the accumulator.
	XORQ	R8,  R8
	XORQ	R9,  R9
	XORQ	R10, R10
	XORQ	R11, R11
	XORQ	R12, R12
	XORQ	R13, R13
	XORQ	R14, R14
	XORQ	R15, R15

	// BP is used as the ninth accumulator word. Its current value is kept
	// in the local stack slot and restored before returning.
	MOVQ	BP, 0(SP)
	XORQ	BP, BP

// Uses BMI2 (MULX)
//
// One round of the interleaved algorithm for word x[idx]:
//   r0..r8 - accumulator words, least significant first
// Both steps run two independent carry chains: ADOX (OF) adds the low words
// of the partial products, ADCX (CF) adds the high words. XORQ AX, AX clears
// both flags before each chain. The final "MOVQ $0, AX" must stay a MOVQ:
// the ADCXQ/ADOXQ that follow still consume CF and OF, which an XORQ would
// clear.
// NOTE(review): presumably ·pNegInv is -p^(-1) mod 2^64, which would make r0
// zero at the end of the round; confirm against its definition.
#ifdef MULS_MULX_512
#undef MULS_MULX_512
#endif
#define MULS_MULX_512(idx, r0, r1, r2, r3, r4, r5, r6, r7, r8) \
	\ // Reduction step
	\ // DX = (r0 + low64(y[0] * x[idx])) * pNegInv mod 2^64
	MOVQ	( 0)(SI), DX \
	MULXQ	( 8*idx)(DI), DX, CX \
	ADDQ	r0, DX \
	MULXQ	·pNegInv(SB), DX, CX \
	\
	\ // accumulator += DX * p
	XORQ	AX, AX \
	MULXQ ·p+ 0(SB), AX, BX;                ADOXQ AX, r0 \
	MULXQ ·p+ 8(SB), AX, CX; ADCXQ BX, r1;  ADOXQ AX, r1 \
	MULXQ ·p+16(SB), AX, BX; ADCXQ CX, r2;  ADOXQ AX, r2 \
	MULXQ ·p+24(SB), AX, CX; ADCXQ BX, r3;  ADOXQ AX, r3 \
	MULXQ ·p+32(SB), AX, BX; ADCXQ CX, r4;  ADOXQ AX, r4 \
	MULXQ ·p+40(SB), AX, CX; ADCXQ BX, r5;  ADOXQ AX, r5 \
	MULXQ ·p+48(SB), AX, BX; ADCXQ CX, r6;  ADOXQ AX, r6 \
	MULXQ ·p+56(SB), AX, CX; ADCXQ BX, r7;  ADOXQ AX, r7 \
	MOVQ  $0, AX           ; ADCXQ CX, r8;  ADOXQ AX, r8 \
	\ // Multiplication step
	\ // accumulator += x[idx] * y
	MOVQ	(8*idx)(DI), DX \
	\
	XORQ	AX, AX \
	MULXQ ( 0)(SI), AX, BX;                 ADOXQ AX, r0 \
	MULXQ ( 8)(SI), AX, CX; ADCXQ BX, r1;   ADOXQ AX, r1 \
	MULXQ (16)(SI), AX, BX; ADCXQ CX, r2;   ADOXQ AX, r2 \
	MULXQ (24)(SI), AX, CX; ADCXQ BX, r3;   ADOXQ AX, r3 \
	MULXQ (32)(SI), AX, BX; ADCXQ CX, r4;   ADOXQ AX, r4 \
	MULXQ (40)(SI), AX, CX; ADCXQ BX, r5;   ADOXQ AX, r5 \
	MULXQ (48)(SI), AX, BX; ADCXQ CX, r6;   ADOXQ AX, r6 \
	MULXQ (56)(SI), AX, CX; ADCXQ BX, r7;   ADOXQ AX, r7 \
	MOVQ  $0, AX          ; ADCXQ CX, r8;   ADOXQ AX, r8

	// The register that served as r0 in one round is reused as r8 (the top
	// word) in the next one.
	MULS_MULX_512(0,  R8,  R9, R10, R11, R12, R13, R14, R15,  BP)
	MULS_MULX_512(1,  R9, R10, R11, R12, R13, R14, R15,  BP,  R8)
	MULS_MULX_512(2, R10, R11, R12, R13, R14, R15,  BP,  R8,  R9)
	MULS_MULX_512(3, R11, R12, R13, R14, R15,  BP,  R8,  R9, R10)
	MULS_MULX_512(4, R12, R13, R14, R15,  BP,  R8,  R9, R10, R11)
	MULS_MULX_512(5, R13, R14, R15,  BP,  R8,  R9, R10, R11, R12)
	MULS_MULX_512(6, R14, R15,  BP,  R8,  R9, R10, R11, R12, R13)
	MULS_MULX_512(7, R15,  BP,  R8,  R9, R10, R11, R12, R13, R14)
#undef MULS_MULX_512

	// Store the eight result words, least significant first.
	MOVQ	res+0(FP), DI
	MOVQ	BP,  ( 0)(DI)
	MOVQ	R8,  ( 8)(DI)
	MOVQ	R9,  (16)(DI)
	MOVQ	R10, (24)(DI)
	MOVQ	R11, (32)(DI)
	MOVQ	R12, (40)(DI)
	MOVQ	R13, (48)(DI)
	MOVQ	R14, (56)(DI)
	MOVQ	0(SP), BP	// restore BP saved on entry

	// NOTE: the value stored at DI (res) still needs to be reduced by the
	// caller if it is not smaller than p.
	RET
|