WIP

WIP2 WIP
6 년 전 · a3ccc0e275
--- a/p503/arith_amd64.go
+++ b/p503/arith_amd64.go
@@ -0,0 +1,17 @@
 // +build amd64,!noasm

 package p503

 import cpu "github.com/cloudflare/p751sidh/internal/utils"

 // The CPU feature flags are kept as package-level variables for several
 // reasons:
 // 1) so that they can be accessed from assembly
 // 2) so that the library is easy to vendor
 // 3) so that all functionality can be tested
 var (
 	useMULX    bool
 	useADXMULX bool
 )

 func init() {
 	useMULX = cpu.HasBMI2
 	useADXMULX = cpu.HasADX && cpu.HasBMI2
 }
--- a/p503/arith_amd64.s
+++ b/p503/arith_amd64.s
@@ -690,23 +690,26 @@ TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24

 	RET

 TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
 TEXT ·fp503Mul(SB), NOSPLIT, $104-24
 	// Actual implementation
 	MOVQ    z+ 0(FP), CX
 	MOVQ    x+ 8(FP), REG_P2
 	MOVQ    y+16(FP), REG_P1
 	MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
 	RET
 	MOVQ    x+ 8(FP), REG_P1
 	MOVQ    y+16(FP), REG_P2

 TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
 	// Actual implementation
 	MOVQ    z+ 0(FP), CX
 	MOVQ    x+ 8(FP), REG_P2
 	MOVQ    y+16(FP), REG_P1
 	MUL(CX, REG_P2, REG_P1, MULS256_MULX)
 //	CMPB	·useMULX(SB), $0
 //	JE		mul
 //	CMPB	·useMULX(SB), $1
 //	JE		mulx

 //	MOVB	·useADXMULX(SB), AX
 //	TESTB	AX, AX
 //	JZ		mulx_and_adx

 	MUL(CX, REG_P1, REG_P2, MULS256_MULXADX)
 	RET

 TEXT ·mul(SB), $96-24
 mul:
 	RET
 	// Uses variant of Karatsuba method.
 	//
 	// Here we store the destination in CX instead of in REG_P3 because the
@@ -1188,11 +1191,28 @@ TEXT ·mul(SB), $96-24
 	ADCQ    $0, SI;     	MOVQ    SI, (120)(CX)

 	RET
 // Uses implementation optimized for CPU supporting carry-less
 // multiplier (MULX) and two independent carry-chains (ADOX/ADCX)
 mulx_and_adx:
 	MUL(CX, REG_P1, REG_P2, MULS256_MULXADX)
 	RET
 // Uses implementation optimized for CPU supporting carry-less
 // multiplier (MULX)
 mulx:
 	MUL(CX, REG_P1, REG_P2, MULS256_MULX)
 	RET

 TEXT ·redc(SB), $0-16
 TEXT ·fp503MontgomeryReduce(SB), $0-16
 	MOVQ	z+0(FP), REG_P2
 	MOVQ	x+8(FP), REG_P1

 	REDC(REG_P2, REG_P1, MULS_128x320_MULX)
 	RET

 	CMPB	·useADXMULX(SB), $1
 	JE		redc_with_mulx_adx
 	CMPB	·useMULX(SB), $1
 	JE		redc_with_mulx

 	MOVQ    (REG_P1), R11
 	MOVQ    P503P1_3, AX
 	MULQ    R11
@@ -1497,17 +1517,12 @@ TEXT ·redc(SB), $0-16
 	MOVQ    R10, (56)(REG_P2)      // Z7

 	RET

 TEXT ·redcWithMULX(SB), $0-16
 	MOVQ    z+0(FP), DI
 	MOVQ    x+8(FP), SI
 	REDC(DI, SI, MULS_128x320_MULX)
 redc_with_mulx:
 	REDC(REG_P2, REG_P1, MULS_128x320_MULX)
 	RET

 TEXT ·redcWithMULXADX(SB), $0-16
 	MOVQ    z+0(FP), DI
 	MOVQ    x+8(FP), SI
 	REDC(DI, SI, MULS_128x320_MULXADX)
 redc_with_mulx_adx:
 	REDC(REG_P2, REG_P1, MULS_128x320_MULXADX)
 	RET

 TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24
--- a/p503/arith_amd64_test.go
+++ b/p503/arith_amd64_test.go
@@ -2,14 +2,16 @@

 package p503

 /*
 import (
 	. "github.com/cloudflare/p751sidh/internal/isogeny"
 	cpu "github.com/cloudflare/p751sidh/internal/utils"
 //	cpu "github.com/cloudflare/p751sidh/internal/utils"
 	"reflect"
 	"testing"
 	"testing/quick"
 )

 */
 /*
 // Utility function used for testing Mul implementations. Tests caller provided
 // mulFunc against mul()
 func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) {
@@ -40,7 +42,8 @@ func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) {
 		t.Error(err)
 	}
 }

 */
 /*
 // Ensures correctness of implementation of mul operation which uses MULX
 func TestMulWithMULX(t *testing.T) {
 	if !cpu.HasBMI2 {
@@ -72,3 +75,4 @@ func TestRedcWithMULXADX(t *testing.T) {
 	}
 	testRedc(t, redcWithMULXADX)
 }
 */
--- a/p503/arith_decl.go
+++ b/p503/arith_decl.go
@@ -4,7 +4,6 @@ package p503

 import (
 	. "github.com/cloudflare/p751sidh/internal/isogeny"
 	cpu "github.com/cloudflare/p751sidh/internal/utils"
 )

 // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
@@ -41,24 +40,24 @@ func fp503StrongReduce(x *FpElement)
 // The concrete implementation depends on the capabilities of the CPU, which
 // are resolved at runtime. CPUs with ADCX, ADOX and MULX support
 // run the most optimized implementation.
 var fp503Mul func(z *FpElementX2, x, y *FpElement)

 // Mul implementation for legacy CPUs
 //go:noescape
 func mul(z *FpElementX2, x, y *FpElement)

 // Mul implementation for CPUs supporting carry-less MULX multiplier.
 //go:noescape
 func mulWithMULX(z *FpElementX2, x, y *FpElement)

 // Mul implementation for CPUs supporting the two independent carry-chain
 // instructions (ADOX/ADCX) and the carry-less MULX multiplier.
 //go:noescape
 func mulWithMULXADX(z *FpElementX2, x, y *FpElement)
 func fp503Mul(z *FpElementX2, x, y *FpElement)

 // // Mul implementation for legacy CPUs
 // //go:noescape
 // func mul(z *FpElementX2, x, y *FpElement)
 //
 // // Mul implementation for CPUs supporting carry-less MULX multiplier.
 // //go:noescape
 // func mulWithMULX(z *FpElementX2, x, y *FpElement)
 //
 // // Mul implementation for CPUs supporting two independent carry chain
 // // (ADOX/ADCX) instructions and carry-less MULX multiplier
 // //go:noescape
 // func mulWithMULXADX(z *FpElementX2, x, y *FpElement)

 // Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value
 // of x may be changed. z=x not allowed.
 var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2)
 func fp503MontgomeryReduce(z *FpElement, x *FpElementX2)

 func redc(z *FpElement, x *FpElementX2)

@@ -71,19 +70,21 @@ func redcWithMULX(z *FpElement, x *FpElementX2)
 //go:noescape
 func redcWithMULXADX(z *FpElement, x *FpElementX2)

 /*
 // On initialization, set the fp503Mul function pointer to the
 // fastest implementation depending on CPU capabilities.
 func init() {
 	if cpu.HasBMI2 {
 		if cpu.HasADX {
 			fp503Mul = mulWithMULXADX
 //			fp503Mul = mulWithMULXADX
 			fp503MontgomeryReduce = redcWithMULXADX
 		} else {
 			fp503Mul = mulWithMULX
 //			fp503Mul = mulWithMULX
 			fp503MontgomeryReduce = redcWithMULX
 		}
 	} else {
 		fp503Mul = mul
 //		fp503Mul = mul
 		fp503MontgomeryReduce = redc
 	}
 }
 */