Author | SHA1 | Message | Date |
---|---|---|---|
Kris Kwiatkowski | 543bbcf84c | WIP | 6 years ago |
Kris Kwiatkowski | 5bce13a8cc | WIP3 | 6 years ago |
Kris Kwiatkowski | 79fd1f4668 | WIP | 6 years ago |
@@ -12,7 +12,7 @@ OPTS ?= | |||||
OPTS_TAGS ?= -tags=noasm | OPTS_TAGS ?= -tags=noasm | ||||
NOASM ?= | NOASM ?= | ||||
# -run="NonExistent" is set to make sure tests are not run before benchmarking | # -run="NonExistent" is set to make sure tests are not run before benchmarking | ||||
BENCH_OPTS ?= -bench=. -run="NonExistent" | |||||
BENCH_OPTS ?= -bench=. -run="NonExistent" -benchmem | |||||
# whether to be verbose | # whether to be verbose | ||||
V ?= 1 | V ?= 1 | ||||
@@ -22,7 +22,7 @@ endif | |||||
ifeq ($(V),1) | ifeq ($(V),1) | ||||
OPTS += -v # Be verbose | OPTS += -v # Be verbose | ||||
BENCH_OPTS += -gcflags=-m # Show results from inlining | |||||
BENCH_OPTS += -gcflags="-m -m" # Show results from inlining | |||||
endif | endif | ||||
all: test | all: test | ||||
@@ -698,6 +698,29 @@ TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24 | |||||
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) | MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) | ||||
RET | RET | ||||
TEXT ·fp503Mul(SB), NOSPLIT, $104-24 | |||||
// Actual implementation | |||||
MOVQ z+ 0(FP), CX | |||||
MOVQ x+ 8(FP), REG_P2 | |||||
MOVQ y+16(FP), REG_P1 | |||||
CMPB ·HasBMI2(SB), $1 | |||||
JE mulWithMULXADX | |||||
JMP mulWithMULX | |||||
mulWithMULXADX: | |||||
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) | |||||
RET | |||||
mulWithMULX: | |||||
MUL(CX, REG_P2, REG_P1, MULS256_MULX) | |||||
RET | |||||
TEXT ·fp503MulXXX(SB), NOSPLIT, $104-72 | |||||
// Actual implementation | |||||
MOVQ z+ 0(FP), CX | |||||
MOVQ x+24(FP), REG_P2 | |||||
MOVQ y+48(FP), REG_P1 | |||||
MUL(CX, REG_P2, REG_P1, MULS256_MULXADX) | |||||
RET | |||||
TEXT ·mulWithMULX(SB), NOSPLIT, $104-24 | TEXT ·mulWithMULX(SB), NOSPLIT, $104-24 | ||||
// Actual implementation | // Actual implementation | ||||
MOVQ z+ 0(FP), CX | MOVQ z+ 0(FP), CX | ||||
@@ -1509,6 +1532,11 @@ TEXT ·redcWithMULXADX(SB), $0-16 | |||||
MOVQ x+8(FP), SI | MOVQ x+8(FP), SI | ||||
REDC(DI, SI, MULS_128x320_MULXADX) | REDC(DI, SI, MULS_128x320_MULXADX) | ||||
RET | RET | ||||
TEXT ·fp503MontgomeryReduce(SB), $0-16 | |||||
MOVQ z+0(FP), DI | |||||
MOVQ x+8(FP), SI | |||||
REDC(DI, SI, MULS_128x320_MULXADX) | |||||
RET | |||||
TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24 | TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24 | ||||
@@ -41,7 +41,8 @@ func fp503StrongReduce(x *FpElement) | |||||
// Concrete implementation depends on capabilities of the CPU which | // Concrete implementation depends on capabilities of the CPU which | ||||
// are resolved at runtime. CPUs with ADCX, ADOX and MULX support | // are resolved at runtime. CPUs with ADCX, ADOX and MULX support | ||||
// run the most optimized implementation | // run the most optimized implementation | ||||
var fp503Mul func(z *FpElementX2, x, y *FpElement) | |||||
//go:noescape | |||||
// var fp503Mul func(z *FpElementX2, x, y *FpElement) | |||||
// Mul implementation for legacy CPUs | // Mul implementation for legacy CPUs | ||||
//go:noescape | //go:noescape | ||||
@@ -53,12 +54,21 @@ func mulWithMULX(z *FpElementX2, x, y *FpElement) | |||||
// Mul implementation for CPUs supporting two independent carry chain | // Mul implementation for CPUs supporting two independent carry chain | ||||
// (ADOX/ADCX) instructions and carry-less MULX multiplier | // (ADOX/ADCX) instructions and carry-less MULX multiplier | ||||
//go:noescape | |||||
func fp503Mul(z *FpElementX2, x, y *FpElement) | |||||
//go:noescape | |||||
func fp503MulXXX(z, x, y []uint64) | |||||
var fp503Mul1 func(z, x, y []uint64) | |||||
//go:noescape | //go:noescape | ||||
func mulWithMULXADX(z *FpElementX2, x, y *FpElement) | func mulWithMULXADX(z *FpElementX2, x, y *FpElement) | ||||
// Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return, the value | // Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return, the value | ||||
// of x may be changed. z=x is not allowed. | // of x may be changed. z=x is not allowed. | ||||
var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2) | |||||
//go:noescape | |||||
func fp503MontgomeryReduce(z *FpElement, x *FpElementX2) | |||||
func redc(z *FpElement, x *FpElementX2) | func redc(z *FpElement, x *FpElementX2) | ||||
@@ -73,17 +83,29 @@ func redcWithMULXADX(z *FpElement, x *FpElementX2) | |||||
// On initialization, set the fp503Mul function pointer to the | // On initialization, set the fp503Mul function pointer to the | ||||
// fastest implementation depending on CPU capabilities. | // fastest implementation depending on CPU capabilities. | ||||
func init() { | |||||
fp503Mul1 = fp503MulXXX | |||||
} | |||||
var HasBMI2 bool | |||||
func init() { | |||||
HasBMI2 = cpu.HasBMI2 | |||||
} | |||||
/* | |||||
func init() { | func init() { | ||||
if cpu.HasBMI2 { | if cpu.HasBMI2 { | ||||
if cpu.HasADX { | if cpu.HasADX { | ||||
fp503Mul = mulWithMULXADX | |||||
//fp503Mul = mulWithMULXADX | |||||
fp503MontgomeryReduce = redcWithMULXADX | fp503MontgomeryReduce = redcWithMULXADX | ||||
} else { | } else { | ||||
fp503Mul = mulWithMULX | |||||
//fp503Mul = mulWithMULX | |||||
fp503MontgomeryReduce = redcWithMULX | fp503MontgomeryReduce = redcWithMULX | ||||
} | } | ||||
} else { | } else { | ||||
fp503Mul = mul | |||||
//fp503Mul = mul | |||||
fp503MontgomeryReduce = redc | fp503MontgomeryReduce = redc | ||||
} | } | ||||
} | } | ||||
*/ |
@@ -21,13 +21,12 @@ func (fp503Ops) Sub(dest, lhs, rhs *Fp2Element) { | |||||
} | } | ||||
func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) { | func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) { | ||||
// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b). | |||||
a := &lhs.A | |||||
b := &lhs.B | |||||
c := &rhs.A | |||||
d := &rhs.B | |||||
var b_minus_a, c_minus_d FpElement | |||||
var ad_plus_bc FpElementX2 | |||||
var ac, bd FpElementX2 | |||||
var ac_minus_bd FpElementX2 | |||||
// We want to compute | |||||
// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b). We want to compute | |||||
// | // | ||||
// (a + bi)*(c + di) = (a*c - b*d) + (a*d + b*c)i | // (a + bi)*(c + di) = (a*c - b*d) + (a*d + b*c)i | ||||
// | // | ||||
@@ -37,22 +36,18 @@ func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) { | |||||
// | // | ||||
// so (a*d + b*c) = (b-a)*(c-d) + a*c + b*d. | // so (a*d + b*c) = (b-a)*(c-d) + a*c + b*d. | ||||
var ac, bd FpElementX2 | |||||
fp503Mul(&ac, a, c) // = a*c*R*R | |||||
fp503Mul(&bd, b, d) // = b*d*R*R | |||||
fp503Mul(&ac, &lhs.A, &rhs.A) // = a*c*R*R | |||||
fp503Mul(&bd, &lhs.B, &rhs.B) // = b*d*R*R | |||||
var b_minus_a, c_minus_d FpElement | |||||
fp503SubReduced(&b_minus_a, b, a) // = (b-a)*R | |||||
fp503SubReduced(&c_minus_d, c, d) // = (c-d)*R | |||||
fp503SubReduced(&b_minus_a, &lhs.B, &lhs.A) // = (b-a)*R | |||||
fp503SubReduced(&c_minus_d, &rhs.A, &rhs.B) // = (c-d)*R | |||||
var ad_plus_bc FpElementX2 | |||||
fp503Mul(&ad_plus_bc, &b_minus_a, &c_minus_d) // = (b-a)*(c-d)*R*R | fp503Mul(&ad_plus_bc, &b_minus_a, &c_minus_d) // = (b-a)*(c-d)*R*R | ||||
fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &ac) // = ((b-a)*(c-d) + a*c)*R*R | fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &ac) // = ((b-a)*(c-d) + a*c)*R*R | ||||
fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &bd) // = ((b-a)*(c-d) + a*c + b*d)*R*R | fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &bd) // = ((b-a)*(c-d) + a*c + b*d)*R*R | ||||
fp503MontgomeryReduce(&dest.B, &ad_plus_bc) // = (a*d + b*c)*R mod p | fp503MontgomeryReduce(&dest.B, &ad_plus_bc) // = (a*d + b*c)*R mod p | ||||
var ac_minus_bd FpElementX2 | |||||
fp503X2SubLazy(&ac_minus_bd, &ac, &bd) // = (a*c - b*d)*R*R | fp503X2SubLazy(&ac_minus_bd, &ac, &bd) // = (a*c - b*d)*R*R | ||||
fp503MontgomeryReduce(&dest.A, &ac_minus_bd) // = (a*c - b*d)*R mod p | fp503MontgomeryReduce(&dest.A, &ac_minus_bd) // = (a*c - b*d)*R mod p | ||||
} | } | ||||