From 9fc872439c811ab0a05f2172d33be6287bcaf7c1 Mon Sep 17 00:00:00 2001
From: Kris Kwiatkowski
Date: Wed, 3 Oct 2018 11:54:17 +0100
Subject: [PATCH] PERF

Replace the fp503Mul, fp503MontgomeryReduce and fp751MontgomeryReduce
function variables, which init() assigned according to cpu.X86.HasBMI2
and cpu.X86.HasADX, with regular functions that check those flags on
every call and invoke the matching implementation directly. The init()
functions in p503/arith_decl.go and p751/arith_decl.go are removed.

---
 p503/arith_decl.go | 37 ++++++++++++++++++-------------------
 p751/arith_decl.go | 24 +++++++++---------------
 2 files changed, 27 insertions(+), 34 deletions(-)

diff --git a/p503/arith_decl.go b/p503/arith_decl.go
index 0129559..a3e3f59 100644
--- a/p503/arith_decl.go
+++ b/p503/arith_decl.go
@@ -41,7 +41,15 @@ func fp503StrongReduce(x *FpElement)
 // Concrete implementation depends on capabilities of the CPU which
 // are resolved at runtime. CPUs with ADCX, ADOX and MULX support
 // run most optimized implementation
-var fp503Mul func(z *FpElementX2, x, y *FpElement)
+func fp503Mul(z *FpElementX2, x, y *FpElement) {
+	if cpu.X86.HasBMI2 && cpu.X86.HasADX {
+		mulWithMULXADX(z, x, y)
+	} else if cpu.X86.HasBMI2 {
+		mulWithMULX(z, x, y)
+	} else {
+		mul(z, x, y)
+	}
+}
 
 // Mul implementattion for legacy CPUs
 //go:noescape
@@ -58,7 +66,15 @@ func mulWithMULXADX(z *FpElementX2, x, y *FpElement)
 
 // Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value
 // of x may be changed. z=x not allowed.
-var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2)
+func fp503MontgomeryReduce(z *FpElement, x *FpElementX2) {
+	if cpu.X86.HasBMI2 && cpu.X86.HasADX {
+		redcWithMULXADX(z, x)
+	} else if cpu.X86.HasBMI2 {
+		redcWithMULX(z, x)
+	} else {
+		redc(z, x)
+	}
+}
 
 func redc(z *FpElement, x *FpElementX2)
 
@@ -70,20 +86,3 @@ func redcWithMULX(z *FpElement, x *FpElementX2)
 // (ADOX/ADCX) instructions and carry-less MULX multiplier
 //go:noescape
 func redcWithMULXADX(z *FpElement, x *FpElementX2)
-
-// On initialization, set the fp503Mul function pointer to the
-// fastest implementation depending on CPU capabilities.
-func init() {
-	if cpu.X86.HasBMI2 {
-		if cpu.X86.HasADX {
-			fp503Mul = mulWithMULXADX
-			fp503MontgomeryReduce = redcWithMULXADX
-		} else {
-			fp503Mul = mulWithMULX
-			fp503MontgomeryReduce = redcWithMULX
-		}
-	} else {
-		fp503Mul = mul
-		fp503MontgomeryReduce = redc
-	}
-}
diff --git a/p751/arith_decl.go b/p751/arith_decl.go
index d56f73b..35e46e2 100644
--- a/p751/arith_decl.go
+++ b/p751/arith_decl.go
@@ -41,7 +41,15 @@ func fp751Mul(z *FpElementX2, x, y *FpElement)
 // fp751MontgomeryReduce implementations below.
 // When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p).
 // It may destroy the input value.
-var fp751MontgomeryReduce func(z *FpElement, x *FpElementX2)
+func fp751MontgomeryReduce(z *FpElement, x *FpElementX2) {
+	if cpu.X86.HasBMI2 && cpu.X86.HasADX {
+		fp751MontgomeryReduceBMI2ADX(z, x)
+	} else if cpu.X86.HasBMI2 {
+		fp751MontgomeryReduceBMI2(z, x)
+	} else {
+		fp751MontgomeryReduceFallback(z, x)
+	}
+}
 
 //go:noescape
 func fp751MontgomeryReduceBMI2ADX(z *FpElement, x *FpElementX2)
@@ -55,17 +63,3 @@ func fp751MontgomeryReduceFallback(z *FpElement, x *FpElementX2)
 // Reduce a field element in [0, 2*p) to one in [0,p).
 //go:noescape
 func fp751StrongReduce(x *FpElement)
-
-// On initialization, set the fp751MontgomeryReduce function pointer to the
-// fastest implementation depending on CPU capabilities.
-func init() {
-	if cpu.X86.HasBMI2 {
-		if cpu.X86.HasADX {
-			fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX
-		} else {
-			fp751MontgomeryReduce = fp751MontgomeryReduceBMI2
-		}
-	} else {
-		fp751MontgomeryReduce = fp751MontgomeryReduceFallback
-	}
-}