diff --git a/Makefile b/Makefile
index 78f8543..2aaa7e8 100644
--- a/Makefile
+++ b/Makefile
@@ -12,7 +12,7 @@ OPTS         ?=
 OPTS_TAGS    ?= -tags=noasm
 NOASM        ?=
 # -run="NonExistent" is set to make sure tests are not run before benchmarking
-BENCH_OPTS   ?= -bench=. -run="NonExistent"
+BENCH_OPTS   ?= -bench=. -run="NonExistent" -benchmem
 # whether to be verbose
 V            ?= 1
 
diff --git a/p503/arith_amd64.s b/p503/arith_amd64.s
index e52f518..74c04a3 100644
--- a/p503/arith_amd64.s
+++ b/p503/arith_amd64.s
@@ -698,6 +698,22 @@ TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
 	MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
 	RET
 
+TEXT ·fp503Mul(SB), NOSPLIT, $104-24
+	// Loads z, x, y and expands MUL with MULS256_MULXADX (same macro call as mulWithMULXADX); no CPU feature check is done here
+	MOVQ    z+ 0(FP), CX
+	MOVQ    x+ 8(FP), REG_P2
+	MOVQ    y+16(FP), REG_P1
+	MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
+	RET
+
+TEXT ·fp503MulXXX(SB), NOSPLIT, $104-72
+	// Slice-argument variant (z, x, y []uint64): data pointers are read from 0, 24 and 48(FP); slice lengths are never read
+	MOVQ    z+ 0(FP), CX
+	MOVQ    x+24(FP), REG_P2
+	MOVQ    y+48(FP), REG_P1
+	MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
+	RET
+
 TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
 	// Actual implementation
 	MOVQ    z+ 0(FP), CX
@@ -1509,6 +1525,11 @@ TEXT ·redcWithMULXADX(SB), $0-16
 	MOVQ    x+8(FP), SI
 	REDC(DI, SI, MULS_128x320_MULXADX)
 	RET
+TEXT ·fp503MontgomeryReduce(SB), $0-16 // mirrors the redcWithMULXADX body directly above: REDC with MULS_128x320_MULXADX
+	MOVQ    z+0(FP), DI
+	MOVQ    x+8(FP), SI
+	REDC(DI, SI, MULS_128x320_MULXADX)
+	RET
 
 TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24
 
diff --git a/p503/arith_decl.go b/p503/arith_decl.go
index a65a2dc..284393a 100644
--- a/p503/arith_decl.go
+++ b/p503/arith_decl.go
@@ -4,7 +4,7 @@ package p503
 
 import (
 	. "github.com/cloudflare/p751sidh/internal/isogeny"
-	cpu "github.com/cloudflare/p751sidh/internal/utils"
+	//	cpu "github.com/cloudflare/p751sidh/internal/utils"
 )
 
 // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
@@ -41,7 +41,8 @@ func fp503StrongReduce(x *FpElement)
 // Concrete implementation depends on capabilities of the CPU which
 // are resolved at runtime. CPUs with ADCX, ADOX and MULX support
 // run most optimized implementation
-var fp503Mul func(z *FpElementX2, x, y *FpElement)
+// NOTE: the runtime-selected function variable below is disabled in this change.
+// var fp503Mul func(z *FpElementX2, x, y *FpElement)
 
 // Mul implementattion for legacy CPUs
 //go:noescape
@@ -53,12 +54,21 @@ func mulWithMULX(z *FpElementX2, x, y *FpElement)
 
 // Mul implementation for CPUs supporting two independent carry chain
 // (ADOX/ADCX) instructions and carry-less MULX multiplier
+//go:noescape
+func fp503Mul(z *FpElementX2, x, y *FpElement) // implemented in arith_amd64.s
+
+//go:noescape
+func fp503MulXXX(z, x, y []uint64) // slice-argument variant, implemented in arith_amd64.s
+
+var fp503Mul1 func(z, x, y []uint64) // assigned fp503MulXXX in init()
+
 //go:noescape
 func mulWithMULXADX(z *FpElementX2, x, y *FpElement)
 
 // Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value
 // of x may be changed. z=x not allowed.
-var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2)
+//go:noescape
+func fp503MontgomeryReduce(z *FpElement, x *FpElementX2) // implemented in arith_amd64.s via REDC with MULS_128x320_MULXADX
 
 func redc(z *FpElement, x *FpElementX2)
 
@@ -73,17 +83,23 @@ func redcWithMULXADX(z *FpElement, x *FpElementX2)
 
 // On initialization, set the fp503Mul function pointer to the
 // fastest implementation depending on CPU capabilities.
+func init() {
+	fp503Mul1 = fp503MulXXX // unconditional: no cpu.HasBMI2/cpu.HasADX check, unlike the commented-out init below
+}
+
+/*
 func init() {
 	if cpu.HasBMI2 {
 		if cpu.HasADX {
-			fp503Mul = mulWithMULXADX
+			//fp503Mul = mulWithMULXADX
 			fp503MontgomeryReduce = redcWithMULXADX
 		} else {
-			fp503Mul = mulWithMULX
+			//fp503Mul = mulWithMULX
 			fp503MontgomeryReduce = redcWithMULX
 		}
 	} else {
-		fp503Mul = mul
+		//fp503Mul = mul
 		fp503MontgomeryReduce = redc
 	}
 }
+*/
diff --git a/p503/field_ops.go b/p503/field_ops.go
index ea75e3a..4299f10 100644
--- a/p503/field_ops.go
+++ b/p503/field_ops.go
@@ -21,13 +21,12 @@ func (fp503Ops) Sub(dest, lhs, rhs *Fp2Element) {
 }
 
 func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) {
-	// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b).
-	a := &lhs.A
-	b := &lhs.B
-	c := &rhs.A
-	d := &rhs.B
+	var b_minus_a, c_minus_d FpElement
+	var ad_plus_bc FpElementX2
+	var ac, bd FpElementX2
+	var ac_minus_bd FpElementX2
 
-	// We want to compute
+	// Let (a,b,c,d) = (lhs.a,lhs.b,rhs.a,rhs.b).  We want to compute
 	//
 	// (a + bi)*(c + di) = (a*c - b*d) + (a*d + b*c)i
 	//
@@ -37,22 +36,18 @@ func (fp503Ops) Mul(dest, lhs, rhs *Fp2Element) {
 	//
 	// so (a*d + b*c) = (b-a)*(c-d) + a*c + b*d.
 
-	var ac, bd FpElementX2
-	fp503Mul(&ac, a, c) // = a*c*R*R
-	fp503Mul(&bd, b, d) // = b*d*R*R
+	fp503Mul1(ac[:], lhs.A[:], rhs.A[:]) // = a*c*R*R
+	fp503Mul(&bd, &lhs.B, &rhs.B)        // = b*d*R*R
 
-	var b_minus_a, c_minus_d FpElement
-	fp503SubReduced(&b_minus_a, b, a) // = (b-a)*R
-	fp503SubReduced(&c_minus_d, c, d) // = (c-d)*R
+	fp503SubReduced(&b_minus_a, &lhs.B, &lhs.A) // = (b-a)*R
+	fp503SubReduced(&c_minus_d, &rhs.A, &rhs.B) // = (c-d)*R
 
-	var ad_plus_bc FpElementX2
 	fp503Mul(&ad_plus_bc, &b_minus_a, &c_minus_d) // = (b-a)*(c-d)*R*R
 	fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &ac) // = ((b-a)*(c-d) + a*c)*R*R
 	fp503X2AddLazy(&ad_plus_bc, &ad_plus_bc, &bd) // = ((b-a)*(c-d) + a*c + b*d)*R*R
 
 	fp503MontgomeryReduce(&dest.B, &ad_plus_bc) // = (a*d + b*c)*R mod p
 
-	var ac_minus_bd FpElementX2
 	fp503X2SubLazy(&ac_minus_bd, &ac, &bd)       // = (a*c - b*d)*R*R
 	fp503MontgomeryReduce(&dest.A, &ac_minus_bd) // = (a*c - b*d)*R mod p
 }