diff --git a/p751/arith_amd64.go b/p751/arith_amd64.go new file mode 100644 index 0000000..9bc7a6e --- /dev/null +++ b/p751/arith_amd64.go @@ -0,0 +1,24 @@ +// +build amd64,!noasm + +package p751 + +import cpu "github.com/cloudflare/p751sidh/internal/utils" + +// There couple of reasons for having those variables here: +// * to have an access to them from assembly +// TODO(kk): Is there a way to access variable from different package? +// If it is then probably this file could be moved to internal +// and we don't need to have many copies of that +// * make it easy to vendor the library +// * make it possible to test all functionalities +var useMULX bool +var useADXMULX bool + +func recognizecpu() { + useMULX = cpu.HasBMI2 + useADXMULX = cpu.HasADX && cpu.HasBMI2 +} + +func init() { + recognizecpu() +} diff --git a/p751/arith_amd64.s b/p751/arith_amd64.s index 1fae216..65e7500 100644 --- a/p751/arith_amd64.s +++ b/p751/arith_amd64.s @@ -1607,177 +1607,143 @@ TEXT ·fp751Mul(SB), $96-24 ADCQ DX, T2 \ ADCQ AX, T4 -#define fp751MontgomeryReduceCommonPart1(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\ - XORQ T7, T7 \ - MOVQ 48+C, AX \ - MOVQ 56+C, DX \ - MOVQ 64+C, T9 \ - ADDQ 40+M0, AX \ - ADCQ 48+M0, DX \ - ADCQ 56+M0, T9 \ - MOVQ AX, 40+M0 \ - MOVQ DX, 48+M0 \ - MOVQ T9, 56+M0 \ - ADCQ 64+M0, T8 \ - ADCQ 72+M0, T0 \ - ADCQ 80+M0, T1 \ - ADCQ 88+M0, T2 \ - ADCQ 96+M0, T3 \ - ADCQ 104+M0, T4 \ - ADCQ 112+M0, T5 \ - ADCQ 120+M0, T6 \ - ADCQ 128+M0, T7 \ - MOVQ T8, 64+M0 \ - MOVQ T0, 72+M0 \ - MOVQ T1, 80+M0 \ - MOVQ T2, 88+M0 \ - MOVQ T3, 96+M0 \ - MOVQ T4, 104+M0 \ - MOVQ T5, 112+M0 \ - MOVQ T6, 120+M0 \ - MOVQ T7, 128+M0 \ - MOVQ 136+M0, T0 \ - MOVQ 144+M0, T1 \ - MOVQ 152+M0, T2 \ - MOVQ 160+M0, T3 \ - MOVQ 168+M0, T4 \ - MOVQ 176+M0, T5 \ - MOVQ 184+M0, T6 \ - ADCQ $0, T0 \ - ADCQ $0, T1 \ - ADCQ $0, T2 \ - ADCQ $0, T3 \ - ADCQ $0, T4 \ - ADCQ $0, T5 \ - ADCQ $0, T6 \ - MOVQ T0, 136+M0 \ - MOVQ T1, 144+M0 \ - MOVQ T2, 152+M0 \ - MOVQ T3, 160+M0 \ - MOVQ T4, 168+M0 \ - MOVQ T5, 176+M0 \ - MOVQ T6, 184+M0 - -#define fp751MontgomeryReduceCommonPart2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\ - XORQ T7, T7 \ - MOVQ 48+C, AX \ - MOVQ 56+C, DX \ - MOVQ 64+C, T9 \ - ADDQ 72+M0, AX \ - ADCQ 80+M0, DX \ - ADCQ 88+M0, T9 \ - MOVQ AX, 72+M0 \ - MOVQ DX, 80+M0 \ - MOVQ T9, 88+M0 \ - ADCQ 96+M0, T8 \ - ADCQ 104+M0, T0 \ - ADCQ 112+M0, T1 \ - ADCQ 120+M0, T2 \ - ADCQ 128+M0, T3 \ - ADCQ 136+M0, T4 \ - ADCQ 144+M0, T5 \ - ADCQ 152+M0, T6 \ - ADCQ 160+M0, T7 \ - MOVQ T8, 0+C \ // Final result c0 - MOVQ T0, 104+M0 \ - MOVQ T1, 112+M0 \ - MOVQ T2, 120+M0 \ - MOVQ T3, 128+M0 \ - MOVQ T4, 136+M0 \ - MOVQ T5, 144+M0 \ - MOVQ T6, 152+M0 \ - MOVQ T7, 160+M0 \ - MOVQ 168+M0, T4 \ - MOVQ 176+M0, T5 \ - MOVQ 184+M0, T6 \ - ADCQ $0, T4 \ - ADCQ $0, T5 \ - ADCQ $0, T6 \ - MOVQ T4, 168+M0 \ - MOVQ T5, 176+M0 \ - MOVQ T6, 184+M0 - -#define fp751MontgomeryReduceCommonPart3(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\ - MOVQ 48+C, AX \ // Final result c1:c11 - MOVQ 56+C, DX \ - MOVQ 64+C, T9 \ - ADDQ 104+M0, AX \ - ADCQ 112+M0, DX \ - ADCQ 120+M0, T9 \ - MOVQ AX, 8+C \ - MOVQ DX, 16+C \ - MOVQ T9, 24+C \ - ADCQ 128+M0, T8 \ - ADCQ 136+M0, T0 \ - ADCQ 144+M0, T1 \ - ADCQ 152+M0, T2 \ - ADCQ 160+M0, T3 \ - ADCQ 168+M0, T4 \ - ADCQ 176+M0, T5 \ - ADCQ 184+M0, T6 \ - MOVQ T8, 32+C \ - MOVQ T0, 40+C \ - MOVQ T1, 48+C \ - MOVQ T2, 56+C \ - MOVQ T3, 64+C \ - MOVQ T4, 72+C \ - MOVQ T5, 80+C \ - MOVQ T6, 88+C - -// This implements the Montgomery reduction algorithm described in -// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. -// This assumes that the BMI2 and ADX instruction set extensions are available. -TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16 +// Template for calculating the Montgomery reduction algorithm described in +// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be +// customized with schoolbook multiplicaton for 256 x 448-bit number. +// This macro reuses memory of IN value and *changes* it. Smashes registers +// R[8-15], AX, BX, CX, DX, BP. +// Input: +// * M0: 1536-bit number to be reduced +// * C : either mul256x448bmi2 or mul256x448bmi2adx +// Output: OUT 768-bit +#define REDC(C, M0, MULS) \ + \ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + MULS(M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + XORQ R15, R15 \ + MOVQ 48+C, AX \ + MOVQ 56+C, DX \ + MOVQ 64+C, BX \ + ADDQ 40+M0, AX \ + ADCQ 48+M0, DX \ + ADCQ 56+M0, BX \ + MOVQ AX, 40+M0 \ + MOVQ DX, 48+M0 \ + MOVQ BX, 56+M0 \ + ADCQ 64+M0, BP \ + ADCQ 72+M0, R8 \ + ADCQ 80+M0, R9 \ + ADCQ 88+M0, R10 \ + ADCQ 96+M0, R11 \ + ADCQ 104+M0, R12 \ + ADCQ 112+M0, R13 \ + ADCQ 120+M0, R14 \ + ADCQ 128+M0, R15 \ + MOVQ BP, 64+M0 \ + MOVQ R8, 72+M0 \ + MOVQ R9, 80+M0 \ + MOVQ R10, 88+M0 \ + MOVQ R11, 96+M0 \ + MOVQ R12, 104+M0 \ + MOVQ R13, 112+M0 \ + MOVQ R14, 120+M0 \ + MOVQ R15, 128+M0 \ + MOVQ 136+M0, R8 \ + MOVQ 144+M0, R9 \ + MOVQ 152+M0, R10 \ + MOVQ 160+M0, R11 \ + MOVQ 168+M0, R12 \ + MOVQ 176+M0, R13 \ + MOVQ 184+M0, R14 \ + ADCQ $0, R8 \ + ADCQ $0, R9 \ + ADCQ $0, R10 \ + ADCQ $0, R11 \ + ADCQ $0, R12 \ + ADCQ $0, R13 \ + ADCQ $0, R14 \ + MOVQ R8, 136+M0 \ + MOVQ R9, 144+M0 \ + MOVQ R10, 152+M0 \ + MOVQ R11, 160+M0 \ + MOVQ R12, 168+M0 \ + MOVQ R13, 176+M0 \ + MOVQ R14, 184+M0 \ + \ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + MULS(32+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + XORQ R15, R15 \ + MOVQ 48+C, AX \ + MOVQ 56+C, DX \ + MOVQ 64+C, BX \ + ADDQ 72+M0, AX \ + ADCQ 80+M0, DX \ + ADCQ 88+M0, BX \ + MOVQ AX, 72+M0 \ + MOVQ DX, 80+M0 \ + MOVQ BX, 88+M0 \ + ADCQ 96+M0, BP \ + ADCQ 104+M0, R8 \ + ADCQ 112+M0, R9 \ + ADCQ 120+M0, R10 \ + ADCQ 128+M0, R11 \ + ADCQ 136+M0, R12 \ + ADCQ 144+M0, R13 \ + ADCQ 152+M0, R14 \ + ADCQ 160+M0, R15 \ + MOVQ BP, 0+C \ // Final result c0 + MOVQ R8, 104+M0 \ + MOVQ R9, 112+M0 \ + MOVQ R10, 120+M0 \ + MOVQ R11, 128+M0 \ + MOVQ R12, 136+M0 \ + MOVQ R13, 144+M0 \ + MOVQ R14, 152+M0 \ + MOVQ R15, 160+M0 \ + MOVQ 168+M0, R12 \ + MOVQ 176+M0, R13 \ + MOVQ 184+M0, R14 \ + ADCQ $0, R12 \ + ADCQ $0, R13 \ + ADCQ $0, R14 \ + MOVQ R12, 168+M0 \ + MOVQ R13, 176+M0 \ + MOVQ R14, 184+M0 \ + \ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + MULS(64+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \ + MOVQ 48+C, AX \ // Final result c1:c11 + MOVQ 56+C, DX \ + MOVQ 64+C, BX \ + ADDQ 104+M0, AX \ + ADCQ 112+M0, DX \ + ADCQ 120+M0, BX \ + MOVQ AX, 8+C \ + MOVQ DX, 16+C \ + MOVQ BX, 24+C \ + ADCQ 128+M0, BP \ + ADCQ 136+M0, R8 \ + ADCQ 144+M0, R9 \ + ADCQ 152+M0, R10 \ + ADCQ 160+M0, R11 \ + ADCQ 168+M0, R12 \ + ADCQ 176+M0, R13 \ + ADCQ 184+M0, R14 \ + MOVQ BP, 32+C \ + MOVQ R8, 40+C \ + MOVQ R9, 48+C \ + MOVQ R10, 56+C \ + MOVQ R11, 64+C \ + MOVQ R12, 72+C \ + MOVQ R13, 80+C \ + MOVQ R14, 88+C + +TEXT ·fp751MontgomeryReduce(SB), $0-16 MOVQ z+0(FP), REG_P2 MOVQ x+8(FP), REG_P1 - // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2adx(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - - fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) - - // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2adx(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - - fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) - - // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2adx(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - - fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) - - RET - -// This implements the Montgomery reduction algorithm described in -// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. -// This assumes that the BMI2 instruction set extension is available. -TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16 - MOVQ z+0(FP), REG_P2 - MOVQ x+8(FP), REG_P1 - - // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - - fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) - - // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - - fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) - - // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 - mul256x448bmi2(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) - - fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX) - - RET - -// This implements the straightforward Montgomery reduction algorithm without -// using specific instruction set extensions. -TEXT ·fp751MontgomeryReduceFallback(SB), $0-16 - - MOVQ z+0(FP), REG_P2 - MOVQ x+8(FP), REG_P1 + // Check wether to use optimized implementation + CMPB ·useADXMULX(SB), $1 + JE redc_with_mulx_adx + CMPB ·useMULX(SB), $1 + JE redc_with_mulx MOVQ (REG_P1), R11 MOVQ P751P1_5, AX @@ -2379,7 +2345,20 @@ TEXT ·fp751MontgomeryReduceFallback(SB), $0-16 ADCQ $0, R10 ADDQ (184)(REG_P1), R10 // Z11 MOVQ R10, (88)(REG_P2) // Z11 + RET + +redc_with_mulx_adx: + // This implements the Montgomery reduction algorithm described in + // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. + // This assumes that the BMI2 and ADX instruction set extensions are available. + REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2adx) + RET +redc_with_mulx: + // This implements the Montgomery reduction algorithm described in + // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. + // This assumes that the BMI2 instruction set extension is available. + REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2) RET TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24 diff --git a/p751/arith_amd64_test.go b/p751/arith_amd64_test.go index c7814c9..3c278f5 100644 --- a/p751/arith_amd64_test.go +++ b/p751/arith_amd64_test.go @@ -10,66 +10,66 @@ import ( "testing/quick" ) -func TestFp751MontgomeryReduce(t *testing.T) { - // First make sure that at least one value with a known result reduces - // correctly as defined in TestPrimeFieldElementToBigInt. - fp751MontgomeryReduce = fp751MontgomeryReduceFallback - t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt) +type OptimFlag uint - if !cpu.HasBMI2 { - return - } - - fp751MontgomeryReduce = fp751MontgomeryReduceBMI2 - t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt) - - // Also check that the BMI2 implementation produces the same results - // as the fallback implementation. - compareMontgomeryReduce := func(x, y primeFieldElement) bool { - var z, zbackup FpElementX2 - var zred1, zred2 FpElement - - fp751Mul(&z, &x.A, &y.A) - zbackup = z - - fp751MontgomeryReduceFallback(&zred1, &z) - // z may be destroyed. - z = zbackup - fp751MontgomeryReduceBMI2(&zred2, &z) +const ( + kUse_MUL OptimFlag = 1 << 0 + kUse_MULX = 1 << 1 + kUse_MULXADX = 1 << 2 +) - return zred1 == zred2 +// Utility function used for testing REDC implementations. Tests caller provided +// redcFunc against redc() +func testRedc(t *testing.T, f1, f2 OptimFlag) { + doRedcTest := func(aRR FpElementX2) bool { + defer recognizecpu() + var resRedcF1, resRedcF2 FpElement + var aRRcpy = aRR + + // Compute redc with first implementation + useMULX = (kUse_MULX & f1) == kUse_MULX + useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX + fp751MontgomeryReduce(&resRedcF1, &aRR) + + // Compute redc with second implementation + useMULX = (kUse_MULX & f2) == kUse_MULX + useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX + fp751MontgomeryReduce(&resRedcF2, &aRRcpy) + + // Compare results + return reflect.DeepEqual(resRedcF2, resRedcF1) } - if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil { + if err := quick.Check(doRedcTest, quickCheckConfig); err != nil { t.Error(err) } +} - if !cpu.HasADX { - return +// Ensures corretness of Montgomery reduction implementation which uses MULX +func TestRedcWithMULX(t *testing.T) { + defer recognizecpu() + if !cpu.HasBMI2 { + t.Skip("MULX not supported by the platform") } + testRedc(t, kUse_MULX, kUse_MUL) +} - fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX - t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt) - - // Check that the BMI2ADX implementation produces the same results as - // the BMI2 implementation. By transitivity, it should also produce the - // same results as the fallback implementation. - compareMontgomeryReduce = func(x, y primeFieldElement) bool { - var z, zbackup FpElementX2 - var zred1, zred2 FpElement - - fp751Mul(&z, &x.A, &y.A) - zbackup = z - - fp751MontgomeryReduceBMI2(&zred1, &z) - // z may be destroyed. - z = zbackup - fp751MontgomeryReduceBMI2ADX(&zred2, &z) - - return zred1 == zred2 +// Ensures corretness of Montgomery reduction implementation which uses MULX +// and ADX +func TestRedcWithMULXADX(t *testing.T) { + defer recognizecpu() + if !(cpu.HasADX && cpu.HasBMI2) { + t.Skip("MULX, ADCX and ADOX not supported by the platform") } + testRedc(t, kUse_MULXADX, kUse_MUL) +} - if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil { - t.Error(err) +// Ensures corretness of Montgomery reduction implementation which uses MULX +// and ADX. +func TestRedcWithMULXADXAgainstMULX(t *testing.T) { + defer recognizecpu() + if !(cpu.HasADX && cpu.HasBMI2) { + t.Skip("MULX, ADCX and ADOX not supported by the platform") } + testRedc(t, kUse_MULXADX, kUse_MULX) } diff --git a/p751/arith_decl.go b/p751/arith_decl.go index 008ee41..61a0659 100644 --- a/p751/arith_decl.go +++ b/p751/arith_decl.go @@ -4,7 +4,6 @@ package p751 import ( . "github.com/cloudflare/sidh/internal/isogeny" - cpu "github.com/cloudflare/sidh/internal/utils" ) // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. @@ -41,31 +40,9 @@ func fp751Mul(z *FpElementX2, x, y *FpElement) // fp751MontgomeryReduce implementations below. // When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p). // It may destroy the input value. -var fp751MontgomeryReduce func(z *FpElement, x *FpElementX2) - -//go:noescape -func fp751MontgomeryReduceBMI2ADX(z *FpElement, x *FpElementX2) - //go:noescape -func fp751MontgomeryReduceBMI2(z *FpElement, x *FpElementX2) - -//go:noescape -func fp751MontgomeryReduceFallback(z *FpElement, x *FpElementX2) +func fp751MontgomeryReduce(z *FpElement, x *FpElementX2) // Reduce a field element in [0, 2*p) to one in [0,p). //go:noescape func fp751StrongReduce(x *FpElement) - -// On initialization, set the fp751MontgomeryReduce function pointer to the -// fastest implementation depending on CPU capabilities. -func init() { - if cpu.HasBMI2 { - if cpu.HasADX { - fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX - } else { - fp751MontgomeryReduce = fp751MontgomeryReduceBMI2 - } - } else { - fp751MontgomeryReduce = fp751MontgomeryReduceFallback - } -}