diff --git a/Makefile b/Makefile index 78f8543..2aaa7e8 100644 --- a/Makefile +++ b/Makefile @@ -12,7 +12,7 @@ OPTS ?= OPTS_TAGS ?= -tags=noasm NOASM ?= # -run="NonExistent" is set to make sure tests are not run before benchmarking -BENCH_OPTS ?= -bench=. -run="NonExistent" +BENCH_OPTS ?= -bench=. -run="NonExistent" -benchmem # whether to be verbose V ?= 1 diff --git a/p503/arith_amd64.s b/p503/arith_amd64.s index e52f518..74c04a3 100644 --- a/p503/arith_amd64.s +++ b/p503/arith_amd64.s @@ -698,6 +698,22 @@ TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24 MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) RET +TEXT ·fp503Mul(SB), NOSPLIT, $104-24 + // Actual implementation + MOVQ z+ 0(FP), CX + MOVQ x+ 8(FP), REG_P2 + MOVQ y+16(FP), REG_P1 + MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) + RET + +TEXT ·fp503MulXXX(SB), NOSPLIT, $104-72 + // Actual implementation + MOVQ z+ 0(FP), CX + MOVQ x+24(FP), REG_P2 + MOVQ y+48(FP), REG_P1 + MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) + RET + TEXT ·mulWithMULX(SB), NOSPLIT, $104-24 // Actual implementation MOVQ z+ 0(FP), CX @@ -1509,6 +1525,11 @@ TEXT ·redcWithMULXADX(SB), $0-16 MOVQ x+8(FP), SI REDC(DI, SI, MULS_128x320_MULXADX) RET +TEXT ·fp503MontgomeryReduce(SB), $0-16 + MOVQ z+0(FP), DI + MOVQ x+8(FP), SI + REDC(DI, SI, MULS_128x320_MULXADX) + RET TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24 diff --git a/p503/arith_decl.go b/p503/arith_decl.go index a65a2dc..284393a 100644 --- a/p503/arith_decl.go +++ b/p503/arith_decl.go @@ -4,7 +4,7 @@ package p503 import ( . "github.com/cloudflare/p751sidh/internal/isogeny" - cpu "github.com/cloudflare/p751sidh/internal/utils" + // cpu "github.com/cloudflare/p751sidh/internal/utils" ) // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. @@ -41,7 +41,8 @@ func fp503StrongReduce(x *FpElement) // Concrete implementation depends on capabilities of the CPU which // are resolved at runtime. CPUs with ADCX, ADOX and MULX support // run most optimized implementation -var fp503Mul func(z *FpElementX2, x, y *FpElement) +//go:noescape +// var fp503Mul func(z *FpElementX2, x, y *FpElement) // Mul implementattion for legacy CPUs //go:noescape @@ -53,12 +54,21 @@ func mulWithMULX(z *FpElementX2, x, y *FpElement) // Mul implementation for CPUs supporting two independent carry chain // (ADOX/ADCX) instructions and carry-less MULX multiplier +//go:noescape +func fp503Mul(z *FpElementX2, x, y *FpElement) + +//go:noescape +func fp503MulXXX(z, x, y []uint64) + +var fp503Mul1 func(z, x, y []uint64) + //go:noescape func mulWithMULXADX(z *FpElementX2, x, y *FpElement) // Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value // of x may be changed. z=x not allowed. -var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2) +//go:noescape +func fp503MontgomeryReduce(z *FpElement, x *FpElementX2) func redc(z *FpElement, x *FpElementX2) @@ -73,17 +83,23 @@ func redcWithMULXADX(z *FpElement, x *FpElementX2) // On initialization, set the fp503Mul function pointer to the // fastest implementation depending on CPU capabilities. +func init() { + fp503Mul1 = fp503MulXXX +} + +/* func init() { if cpu.HasBMI2 { if cpu.HasADX { - fp503Mul = mulWithMULXADX + //fp503Mul = mulWithMULXADX fp503MontgomeryReduce = redcWithMULXADX } else { - fp503Mul = mulWithMULX + //fp503Mul = mulWithMULX fp503MontgomeryReduce = redcWithMULX } } else { - fp503Mul = mul + //fp503Mul = mul fp503MontgomeryReduce = redc } } +*/ diff --git a/p503/field_ops.go b/p503/field_ops.go index ea75e3a..4299f10 100644 --- a/p503/field_ops.go +++ b/p503/field_ops.go @@ -21,13 +21,12 @@ func (fp503Ops) Sub(dest, lhs, rhs *Fp2Element) { } func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) { - // Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b). - a := &lhs.A - b := &lhs.B - c := &rhs.A - d := &rhs.B + var b_minus_a, c_minus_d FpElement + var ad_plus_bc FpElementX2 + var ac, bd FpElementX2 + var ac_minus_bd FpElementX2 - // We want to compute + // Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b). We want to compute // // (a + bi)*(c + di) = (a*c - b*d) + (a*d + b*c)i // @@ -37,22 +36,18 @@ func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) { // // so (a*d + b*c) = (b-a)*(c-d) + a*c + b*d. - var ac, bd FpElementX2 - fp503Mul(&ac, a, c) // = a*c*R*R - fp503Mul(&bd, b, d) // = b*d*R*R + fp503Mul1(ac[:], lhs.A[:], rhs.A[:]) // = a*c*R*R + fp503Mul(&bd, &lhs.B, &rhs.B) // = b*d*R*R - var b_minus_a, c_minus_d FpElement - fp503SubReduced(&b_minus_a, b, a) // = (b-a)*R - fp503SubReduced(&c_minus_d, c, d) // = (c-d)*R + fp503SubReduced(&b_minus_a, &lhs.B, &lhs.A) // = (b-a)*R + fp503SubReduced(&c_minus_d, &rhs.A, &rhs.B) // = (c-d)*R - var ad_plus_bc FpElementX2 fp503Mul(&ad_plus_bc, &b_minus_a, &c_minus_d) // = (b-a)*(c-d)*R*R fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &ac) // = ((b-a)*(c-d) + a*c)*R*R fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &bd) // = ((b-a)*(c-d) + a*c + b*d)*R*R fp503MontgomeryReduce(&dest.B, &ad_plus_bc) // = (a*d + b*c)*R mod p - var ac_minus_bd FpElementX2 fp503X2SubLazy(&ac_minus_bd, &ac, &bd) // = (a*c - b*d)*R*R fp503MontgomeryReduce(&dest.A, &ac_minus_bd) // = (a*c - b*d)*R mod p }