作成者 | SHA1 | メッセージ | 日付 |
---|---|---|---|
Kris Kwiatkowski | 543bbcf84c | WIP | 6年前 |
Kris Kwiatkowski | 5bce13a8cc | WIP3 | 6年前 |
Kris Kwiatkowski | 79fd1f4668 | WIP | 6年前 |
@@ -12,7 +12,7 @@ OPTS ?= | |||
OPTS_TAGS ?= -tags=noasm | |||
NOASM ?= | |||
# -run="NonExistent" is set to make sure tests are not run before benchmarking | |||
BENCH_OPTS ?= -bench=. -run="NonExistent" | |||
BENCH_OPTS ?= -bench=. -run="NonExistent" -benchmem | |||
# whether to be verbose | |||
V ?= 1 | |||
@@ -22,7 +22,7 @@ endif | |||
ifeq ($(V),1) | |||
OPTS += -v # Be verbose | |||
BENCH_OPTS += -gcflags=-m # Show results from inlining | |||
BENCH_OPTS += -gcflags="-m -m" # Show results from inlining | |||
endif | |||
all: test | |||
@@ -698,6 +698,29 @@ TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24 | |||
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) | |||
RET | |||
// fp503Mul loads the z, x and y arguments from the frame into CX, REG_P2 and
// REG_P1, then expands the MUL macro in one of two variants, chosen by the
// byte stored at ·HasBMI2(SB).
// NOTE(review): only HasBMI2 is tested. When it equals 1 the MULS256_MULXADX
// variant runs; in every other case the MULS256_MULX variant runs. The
// Go-side init that this replaces picked the MULXADX routine only when both
// BMI2 and ADX were reported, the MULX routine when only BMI2 was reported,
// and the generic mul otherwise -- confirm this routine cannot be reached on
// CPUs lacking ADX or BMI2, or extend the check.
TEXT ·fp503Mul(SB), NOSPLIT, $104-24 | |||
// Actual implementation | |||
MOVQ z+ 0(FP), CX | |||
MOVQ x+ 8(FP), REG_P2 | |||
MOVQ y+16(FP), REG_P1 | |||
// Branch on the CPU-feature flag set from Go during package init.
CMPB ·HasBMI2(SB), $1 | |||
JE mulWithMULXADX | |||
JMP mulWithMULX | |||
mulWithMULXADX: | |||
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) | |||
RET | |||
mulWithMULX: | |||
MUL(CX, REG_P2, REG_P1, MULS256_MULX) | |||
RET | |||
// fp503MulXXX reads its three arguments at frame offsets 0, 24 and 48 (the Go
// declaration takes three []uint64 values) and always expands MUL with the
// MULS256_MULXADX variant; no CPU-feature check is made here.
// NOTE(review): presumably only the first word of each slice argument (the
// data pointer) is used and lengths are never checked -- confirm callers
// pass slices that are long enough.
TEXT ·fp503MulXXX(SB), NOSPLIT, $104-72 | |||
// Actual implementation | |||
MOVQ z+ 0(FP), CX | |||
MOVQ x+24(FP), REG_P2 | |||
MOVQ y+48(FP), REG_P1 | |||
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) | |||
RET | |||
TEXT ·mulWithMULX(SB), NOSPLIT, $104-24 | |||
// Actual implementation | |||
MOVQ z+ 0(FP), CX | |||
@@ -1509,6 +1532,11 @@ TEXT ·redcWithMULXADX(SB), $0-16 | |||
MOVQ x+8(FP), SI | |||
REDC(DI, SI, MULS_128x320_MULXADX) | |||
RET | |||
// fp503MontgomeryReduce loads z into DI and x into SI, then expands REDC
// with the MULS_128x320_MULXADX variant unconditionally.
// NOTE(review): the Go-side dispatch this replaces selected redcWithMULXADX,
// redcWithMULX or redc depending on the reported CPU features; this routine
// performs no such check -- confirm it is only used where the MULXADX
// variant is supported.
TEXT ·fp503MontgomeryReduce(SB), $0-16 | |||
MOVQ z+0(FP), DI | |||
MOVQ x+8(FP), SI | |||
REDC(DI, SI, MULS_128x320_MULXADX) | |||
RET | |||
TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24 | |||
@@ -41,7 +41,8 @@ func fp503StrongReduce(x *FpElement) | |||
// Concrete implementation depends on capabilities of the CPU which | |||
// are resolved at runtime. CPUs with ADCX, ADOX and MULX support | |||
// run most optimized implementation | |||
var fp503Mul func(z *FpElementX2, x, y *FpElement) | |||
//go:noescape | |||
// var fp503Mul func(z *FpElementX2, x, y *FpElement) | |||
// Mul implementation for legacy CPUs | |||
//go:noescape | |||
@@ -53,12 +54,21 @@ func mulWithMULX(z *FpElementX2, x, y *FpElement) | |||
// Mul implementation for CPUs supporting two independent carry chain | |||
// (ADOX/ADCX) instructions and carry-less MULX multiplier | |||
//go:noescape | |||
func fp503Mul(z *FpElementX2, x, y *FpElement) | |||
//go:noescape | |||
func fp503MulXXX(z, x, y []uint64) | |||
var fp503Mul1 func(z, x, y []uint64) | |||
//go:noescape | |||
func mulWithMULXADX(z *FpElementX2, x, y *FpElement) | |||
// Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return, the | |||
// value of x may be changed. z=x is not allowed. | |||
var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2) | |||
//go:noescape | |||
func fp503MontgomeryReduce(z *FpElement, x *FpElementX2) | |||
func redc(z *FpElement, x *FpElementX2) | |||
@@ -73,17 +83,29 @@ func redcWithMULXADX(z *FpElement, x *FpElementX2) | |||
// On initialization, set the fp503Mul function pointer to the | |||
// fastest implementation depending on CPU capabilities. | |||
func init() { | |||
fp503Mul1 = fp503MulXXX | |||
} | |||
// HasBMI2 is a copy of cpu.HasBMI2, assigned during package initialization.
// The fp503Mul assembly routine reads it as ·HasBMI2(SB) to choose its code
// path.
var HasBMI2 bool | |||
func init() { | |||
HasBMI2 = cpu.HasBMI2 | |||
} | |||
/* | |||
func init() { | |||
if cpu.HasBMI2 { | |||
if cpu.HasADX { | |||
fp503Mul = mulWithMULXADX | |||
//fp503Mul = mulWithMULXADX | |||
fp503MontgomeryReduce = redcWithMULXADX | |||
} else { | |||
fp503Mul = mulWithMULX | |||
//fp503Mul = mulWithMULX | |||
fp503MontgomeryReduce = redcWithMULX | |||
} | |||
} else { | |||
fp503Mul = mul | |||
//fp503Mul = mul | |||
fp503MontgomeryReduce = redc | |||
} | |||
} | |||
*/ |
@@ -21,13 +21,12 @@ func (fp503Ops) Sub(dest, lhs, rhs *Fp2Element) { | |||
} | |||
func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) { | |||
// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b). | |||
a := &lhs.A | |||
b := &lhs.B | |||
c := &rhs.A | |||
d := &rhs.B | |||
var b_minus_a, c_minus_d FpElement | |||
var ad_plus_bc FpElementX2 | |||
var ac, bd FpElementX2 | |||
var ac_minus_bd FpElementX2 | |||
// We want to compute | |||
// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b). We want to compute | |||
// | |||
// (a + bi)*(c + di) = (a*c - b*d) + (a*d + b*c)i | |||
// | |||
@@ -37,22 +36,18 @@ func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) { | |||
// | |||
// so (a*d + b*c) = (b-a)*(c-d) + a*c + b*d. | |||
var ac, bd FpElementX2 | |||
fp503Mul(&ac, a, c) // = a*c*R*R | |||
fp503Mul(&bd, b, d) // = b*d*R*R | |||
fp503Mul(&ac, &lhs.A, &rhs.A) // = a*c*R*R | |||
fp503Mul(&bd, &lhs.B, &rhs.B) // = b*d*R*R | |||
var b_minus_a, c_minus_d FpElement | |||
fp503SubReduced(&b_minus_a, b, a) // = (b-a)*R | |||
fp503SubReduced(&c_minus_d, c, d) // = (c-d)*R | |||
fp503SubReduced(&b_minus_a, &lhs.B, &lhs.A) // = (b-a)*R | |||
fp503SubReduced(&c_minus_d, &rhs.A, &rhs.B) // = (c-d)*R | |||
var ad_plus_bc FpElementX2 | |||
fp503Mul(&ad_plus_bc, &b_minus_a, &c_minus_d) // = (b-a)*(c-d)*R*R | |||
fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &ac) // = ((b-a)*(c-d) + a*c)*R*R | |||
fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &bd) // = ((b-a)*(c-d) + a*c + b*d)*R*R | |||
fp503MontgomeryReduce(&dest.B, &ad_plus_bc) // = (a*d + b*c)*R mod p | |||
var ac_minus_bd FpElementX2 | |||
fp503X2SubLazy(&ac_minus_bd, &ac, &bd) // = (a*c - b*d)*R*R | |||
fp503MontgomeryReduce(&dest.A, &ac_minus_bd) // = (a*c - b*d)*R mod p | |||
} | |||