diff --git a/p503/arith_amd64.go b/p503/arith_amd64.go new file mode 100644 index 0000000..dd7d3e1 --- /dev/null +++ b/p503/arith_amd64.go @@ -0,0 +1,26 @@ +// +build amd64,!noasm + +package p503 + +import cpu "github.com/cloudflare/sidh/internal/utils" + +// There are a couple of reasons for having these variables here: +// * to have access to them from assembly +// TODO(kk): Is there a way to access a variable from a different package? +// If there is, then this file could probably be moved to internal +// and we wouldn't need to keep many copies of it +// * make it easy to vendor the library +// * make it possible to test all functionalities +var useMULX bool +var useADXMULX bool + +// recognizecpu sets useMULX and useADXMULX from the CPU features reported by +// the cpu package. Tests call it to restore the flags after overriding them. +func recognizecpu() { + useMULX = cpu.HasBMI2 + useADXMULX = cpu.HasADX && cpu.HasBMI2 +} + +func init() { + recognizecpu() +} diff --git a/p503/arith_amd64.s b/p503/arith_amd64.s index e52f518..d0b4698 100644 --- a/p503/arith_amd64.s +++ b/p503/arith_amd64.s @@ -354,7 +354,8 @@ // Template for calculating the Montgomery reduction algorithm described in // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be // customized with schoolbook multiplicaton for 128 x 320-bit number. -// This macro reuses memory of IN value and *changes* it. +// This macro reuses memory of IN value and *changes* it. 
Smashes registers +// R[8-15], BX, CX // Input: // * IN: 1024-bit number to be reduced // * MULS: either MULS_128x320_MULX or MULS_128x320_MULXADX @@ -690,34 +691,23 @@ TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24 RET -TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24 - // Actual implementation +TEXT ·fp503Mul(SB), NOSPLIT, $104-24 MOVQ z+ 0(FP), CX - MOVQ x+ 8(FP), REG_P2 - MOVQ y+16(FP), REG_P1 - MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) - RET + MOVQ x+ 8(FP), REG_P1 + MOVQ y+16(FP), REG_P2 -TEXT ·mulWithMULX(SB), NOSPLIT, $104-24 - // Actual implementation - MOVQ z+ 0(FP), CX - MOVQ x+ 8(FP), REG_P2 - MOVQ y+16(FP), REG_P1 - MUL(CX, REG_P2, REG_P1, MULS256_MULX) - RET + // Check whether to use optimized implementation + CMPB ·useADXMULX(SB), $1 + JE mul_with_mulx_adx + CMPB ·useMULX(SB), $1 + JE mul_with_mulx -TEXT ·mul(SB), $96-24 - // Uses variant of Karatsuba method. + // Generic x86 implementation (below) uses a variant of Karatsuba method. // // Here we store the destination in CX instead of in REG_P3 because the // multiplication instructions use DX as an implicit destination // operand: MULQ $REG sets DX:AX <-- AX * $REG. - // Actual implementation - MOVQ z+0(FP), CX - MOVQ x+8(FP), REG_P1 - MOVQ y+16(FP), REG_P2 - // RAX and RDX will be used for a mask (0-borrow) XORQ AX, AX @@ -1186,12 +1176,28 @@ TEXT ·mul(SB), $96-24 ADCQ $0, DX; MOVQ DX, (104)(CX) ADCQ $0, DI; MOVQ DI, (112)(CX) ADCQ $0, SI; MOVQ SI, (120)(CX) + RET +mul_with_mulx_adx: + // Mul implementation for CPUs supporting two independent carry chain + // (ADOX/ADCX) instructions and carry-less MULX multiplier + MUL(CX, REG_P1, REG_P2, MULS256_MULXADX) RET -TEXT ·redc(SB), $0-16 - MOVQ z+0(FP), REG_P2 - MOVQ x+8(FP), REG_P1 +mul_with_mulx: + // Mul implementation for CPUs supporting carry-less MULX multiplier. 
+ MUL(CX, REG_P1, REG_P2, MULS256_MULX) + RET + +TEXT ·fp503MontgomeryReduce(SB), $0-16 + MOVQ z+0(FP), REG_P2 + MOVQ x+8(FP), REG_P1 + + // Check whether to use optimized implementation + CMPB ·useADXMULX(SB), $1 + JE redc_with_mulx_adx + CMPB ·useMULX(SB), $1 + JE redc_with_mulx MOVQ (REG_P1), R11 MOVQ P503P1_3, AX @@ -1495,19 +1501,19 @@ TEXT ·redc(SB), $0-16 ADCQ $0, R10 ADDQ (120)(REG_P1), R10 // Z7 MOVQ R10, (56)(REG_P2) // Z7 - RET -TEXT ·redcWithMULX(SB), $0-16 - MOVQ z+0(FP), DI - MOVQ x+8(FP), SI - REDC(DI, SI, MULS_128x320_MULX) +redc_with_mulx_adx: + // Implementation of the Montgomery reduction for CPUs + // supporting two independent carry chain (ADOX/ADCX) + // instructions and carry-less MULX multiplier + REDC(REG_P2, REG_P1, MULS_128x320_MULXADX) RET -TEXT ·redcWithMULXADX(SB), $0-16 - MOVQ z+0(FP), DI - MOVQ x+8(FP), SI - REDC(DI, SI, MULS_128x320_MULXADX) +redc_with_mulx: + // Implementation of the Montgomery reduction for CPUs + // supporting carry-less MULX multiplier. + REDC(REG_P2, REG_P1, MULS_128x320_MULX) RET TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24 diff --git a/p503/arith_amd64_test.go b/p503/arith_amd64_test.go index 5b2d9c7..6adb6a9 100644 --- a/p503/arith_amd64_test.go +++ b/p503/arith_amd64_test.go @@ -10,13 +10,33 @@ import ( "testing/quick" ) +// OptimFlag selects which fp503 assembly code path a test forces through useMULX/useADXMULX. +type OptimFlag uint + +const ( + kUse_MUL OptimFlag = 1 << 0 + kUse_MULX OptimFlag = 1 << 1 + kUse_MULXADX OptimFlag = 1 << 2 +) + // Utility function used for testing Mul implementations. 
Tests caller provided // mulFunc against mul() -func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) { +func testMul(t *testing.T, f1, f2 OptimFlag) { doMulTest := func(multiplier, multiplicant FpElement) bool { + defer recognizecpu() var resMulRef, resMulOptim FpElementX2 - mul(&resMulRef, &multiplier, &multiplicant) - mulFunc(&resMulOptim, &multiplier, &multiplicant) + + // Compute multiplier*multiplicant with the implementation selected by f1 + useMULX = (kUse_MULX & f1) == kUse_MULX + useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX + fp503Mul(&resMulOptim, &multiplier, &multiplicant) + + // Compute multiplier*multiplicant with the implementation selected by f2 + useMULX = (kUse_MULX & f2) == kUse_MULX + useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX + fp503Mul(&resMulRef, &multiplier, &multiplicant) + + // Compare results of both implementations return reflect.DeepEqual(resMulRef, resMulOptim) } @@ -27,13 +46,24 @@ func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) { // Utility function used for testing REDC implementations. 
Tests caller provided // redcFunc against redc() -func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) { +func testRedc(t *testing.T, f1, f2 OptimFlag) { doRedcTest := func(aRR FpElementX2) bool { - var resRedcRef, resRedcOptim FpElement + defer recognizecpu() + var resRedcF1, resRedcF2 FpElement var aRRcpy = aRR - redc(&resRedcRef, &aRR) - redcFunc(&resRedcOptim, &aRRcpy) - return reflect.DeepEqual(resRedcRef, resRedcOptim) + + // Compute redc with the implementation selected by f1 + useMULX = (kUse_MULX & f1) == kUse_MULX + useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX + fp503MontgomeryReduce(&resRedcF1, &aRR) + + // Compute redc with the implementation selected by f2 + useMULX = (kUse_MULX & f2) == kUse_MULX + useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX + fp503MontgomeryReduce(&resRedcF2, &aRRcpy) + + // Compare results of both implementations + return reflect.DeepEqual(resRedcF2, resRedcF1) } if err := quick.Check(doRedcTest, quickCheckConfig); err != nil { @@ -43,32 +73,56 @@ func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) { // Ensures corretness of implementation of mul operation which uses MULX func TestMulWithMULX(t *testing.T) { + defer recognizecpu() if !cpu.HasBMI2 { t.Skip("MULX not supported by the platform") } - testMul(t, mulWithMULX) + testMul(t, kUse_MULX, kUse_MUL) } // Ensures corretness of implementation of mul operation which uses MULX and ADOX/ADCX func TestMulWithMULXADX(t *testing.T) { + defer recognizecpu() if !(cpu.HasADX && cpu.HasBMI2) { t.Skip("MULX, ADCX and ADOX not supported by the platform") } - testMul(t, mulWithMULXADX) + testMul(t, kUse_MULXADX, kUse_MUL) +} + +// Ensures the MULX-only and MULX with ADOX/ADCX implementations of mul agree with each other +func TestMulWithMULXADXAgainstMULX(t *testing.T) { + defer recognizecpu() + if !(cpu.HasADX && cpu.HasBMI2) { + t.Skip("MULX, ADCX and ADOX not supported by the platform") + } + testMul(t, kUse_MULX, kUse_MULXADX) } // Ensures corretness of Montgomery reduction implementation which 
uses MULX func TestRedcWithMULX(t *testing.T) { + defer recognizecpu() if !cpu.HasBMI2 { t.Skip("MULX not supported by the platform") } - testRedc(t, redcWithMULX) + testRedc(t, kUse_MULX, kUse_MUL) } // Ensures corretness of Montgomery reduction implementation which uses MULX +// and ADX func TestRedcWithMULXADX(t *testing.T) { + defer recognizecpu() + if !(cpu.HasADX && cpu.HasBMI2) { + t.Skip("MULX, ADCX and ADOX not supported by the platform") + } + testRedc(t, kUse_MULXADX, kUse_MUL) +} + +// Ensures the Montgomery reduction implementation which uses MULX and ADX +// agrees with the one which uses MULX only. +func TestRedcWithMULXADXAgainstMULX(t *testing.T) { + defer recognizecpu() if !(cpu.HasADX && cpu.HasBMI2) { t.Skip("MULX, ADCX and ADOX not supported by the platform") } - testRedc(t, redcWithMULXADX) + testRedc(t, kUse_MULXADX, kUse_MULX) } diff --git a/p503/arith_decl.go b/p503/arith_decl.go index 6bd15ea..5bc6884 100644 --- a/p503/arith_decl.go +++ b/p503/arith_decl.go @@ -4,7 +4,6 @@ package p503 import ( . "github.com/cloudflare/sidh/internal/isogeny" - cpu "github.com/cloudflare/sidh/internal/utils" ) // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. @@ -37,53 +36,11 @@ func fp503X2SubLazy(z, x, y *FpElementX2) //go:noescape func fp503StrongReduce(x *FpElement) -// Function pointer to function computing z = x * y. -// Concrete implementation depends on capabilities of the CPU which -// are resolved at runtime. CPUs with ADCX, ADOX and MULX support -// run most optimized implementation -var fp503Mul func(z *FpElementX2, x, y *FpElement) - -// Mul implementattion for legacy CPUs -//go:noescape -func mul(z *FpElementX2, x, y *FpElement) - -// Mul implementation for CPUs supporting carry-less MULX multiplier. -//go:noescape -func mulWithMULX(z *FpElementX2, x, y *FpElement) - -// Mul implementation for CPUs supporting two independent carry chain -// (ADOX/ADCX) instructions and carry-less MULX multiplier +// Computes z = x * y. The assembly dispatches at run time to the generic, MULX or MULX+ADX code path. 
//go:noescape -func mulWithMULXADX(z *FpElementX2, x, y *FpElement) +func fp503Mul(z *FpElementX2, x, y *FpElement) // Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value // of x may be changed. z=x not allowed. -var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2) - -func redc(z *FpElement, x *FpElementX2) - -// Mul implementation for CPUs supporting carry-less MULX multiplier. //go:noescape -func redcWithMULX(z *FpElement, x *FpElementX2) - -// Mul implementation for CPUs supporting two independent carry chain -// (ADOX/ADCX) instructions and carry-less MULX multiplier -//go:noescape -func redcWithMULXADX(z *FpElement, x *FpElementX2) - -// On initialization, set the fp503Mul function pointer to the -// fastest implementation depending on CPU capabilities. -func init() { - if cpu.HasBMI2 { - if cpu.HasADX { - fp503Mul = mulWithMULXADX - fp503MontgomeryReduce = redcWithMULXADX - } else { - fp503Mul = mulWithMULX - fp503MontgomeryReduce = redcWithMULX - } - } else { - fp503Mul = mul - fp503MontgomeryReduce = redc - } -} +func fp503MontgomeryReduce(z *FpElement, x *FpElementX2)