diff --git a/p503/arith_amd64.go b/p503/arith_amd64.go
new file mode 100644
index 0000000..dd7d3e1
--- /dev/null
+++ b/p503/arith_amd64.go
@@ -0,0 +1,24 @@
+// +build amd64,!noasm
+
+package p503
+
+import cpu "github.com/cloudflare/sidh/internal/utils" // CPU feature flags (HasBMI2, HasADX)
+
+// There are a couple of reasons for having these variables here:
+// * to have access to them from assembly
+//   TODO(kk): Is there a way to access a variable from a different package?
+//             If there is, this file could probably be moved to internal
+//             and we wouldn't need to keep many copies of it
+// * to make it easy to vendor the library
+// * to make it possible to test all functionalities
+var useMULX bool    // set by recognizecpu from cpu.HasBMI2; compared against 1 by the dispatch code in arith_amd64.s
+var useADXMULX bool // set by recognizecpu from cpu.HasADX && cpu.HasBMI2; the assembly checks it before useMULX
+
+func recognizecpu() {
+	bmi2, adx := cpu.HasBMI2, cpu.HasADX    // CPU features reported by the utils package
+	useMULX, useADXMULX = bmi2, adx && bmi2 // the ADX variant also requires MULX (BMI2)
+}
+
+func init() {
+	recognizecpu() // populate useMULX/useADXMULX once, when the package is initialized
+}
diff --git a/p503/arith_amd64.s b/p503/arith_amd64.s
index e52f518..d0b4698 100644
--- a/p503/arith_amd64.s
+++ b/p503/arith_amd64.s
@@ -354,7 +354,8 @@
 // Template for calculating the Montgomery reduction algorithm described in
 // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
 // customized with schoolbook multiplicaton for 128 x 320-bit number.
-// This macro reuses memory of IN value and *changes* it.
+// This macro reuses memory of IN value and *changes* it. Smashes registers
+// R[8-15], BX, CX
 // Input:
 //    * IN: 1024-bit number to be reduced
 //    * MULS: either MULS_128x320_MULX or MULS_128x320_MULXADX
@@ -690,34 +691,23 @@ TEXT ·fp503SubReduced(SB), NOSPLIT, $0-24
 
 	RET
 
-TEXT ·mulWithMULXADX(SB), NOSPLIT, $104-24
-	// Actual implementation
+TEXT ·fp503Mul(SB), NOSPLIT, $104-24
 	MOVQ    z+ 0(FP), CX
-	MOVQ    x+ 8(FP), REG_P2
-	MOVQ    y+16(FP), REG_P1
-	MUL(CX, REG_P2, REG_P1, MULS256_MULXADX)
-	RET
+	MOVQ    x+ 8(FP), REG_P1
+	MOVQ    y+16(FP), REG_P2
 
-TEXT ·mulWithMULX(SB), NOSPLIT, $104-24
-	// Actual implementation
-	MOVQ    z+ 0(FP), CX
-	MOVQ    x+ 8(FP), REG_P2
-	MOVQ    y+16(FP), REG_P1
-	MUL(CX, REG_P2, REG_P1, MULS256_MULX)
-	RET
+	// Check whether to use optimized implementation
+	CMPB    ·useADXMULX(SB), $1
+	JE      mul_with_mulx_adx
+	CMPB    ·useMULX(SB), $1
+	JE      mul_with_mulx
 
-TEXT ·mul(SB), $96-24
-	// Uses variant of Karatsuba method.
+	// Generic x86 implementation (below) uses variant of Karatsuba method.
 	//
 	// Here we store the destination in CX instead of in REG_P3 because the
 	// multiplication instructions use DX as an implicit destination
 	// operand: MULQ $REG sets DX:AX <-- AX * $REG.
 
-	// Actual implementation
-	MOVQ	z+0(FP), CX
-	MOVQ	x+8(FP), REG_P1
-	MOVQ	y+16(FP), REG_P2
-
 	// RAX and RDX will be used for a mask (0-borrow)
 	XORQ	AX, AX
 
@@ -1186,12 +1176,28 @@ TEXT ·mul(SB), $96-24
 	ADCQ    $0, DX;        	MOVQ    DX, (104)(CX)
 	ADCQ    $0, DI;         MOVQ    DI, (112)(CX)
 	ADCQ    $0, SI;     	MOVQ    SI, (120)(CX)
+	RET
 
+mul_with_mulx_adx:
+	// Mul implementation for CPUs supporting two independent carry chain
+	// (ADOX/ADCX) instructions and carry-less MULX multiplier
+	MUL(CX, REG_P1, REG_P2, MULS256_MULXADX)
 	RET
 
-TEXT ·redc(SB), $0-16
-	MOVQ	z+0(FP), REG_P2
-	MOVQ	x+8(FP), REG_P1
+mul_with_mulx:
+	// Mul implementation for CPUs supporting carry-less MULX multiplier.
+	MUL(CX, REG_P1, REG_P2, MULS256_MULX)
+	RET
+
+TEXT ·fp503MontgomeryReduce(SB), $0-16
+	MOVQ    z+0(FP), REG_P2
+	MOVQ    x+8(FP), REG_P1
+
+	// Check whether to use optimized implementation
+	CMPB    ·useADXMULX(SB), $1
+	JE      redc_with_mulx_adx
+	CMPB    ·useMULX(SB), $1
+	JE      redc_with_mulx
 
 	MOVQ    (REG_P1), R11
 	MOVQ    P503P1_3, AX
@@ -1495,19 +1501,19 @@ TEXT ·redc(SB), $0-16
 	ADCQ    $0, R10
 	ADDQ    (120)(REG_P1), R10     // Z7
 	MOVQ    R10, (56)(REG_P2)      // Z7
-
 	RET
 
-TEXT ·redcWithMULX(SB), $0-16
-	MOVQ    z+0(FP), DI
-	MOVQ    x+8(FP), SI
-	REDC(DI, SI, MULS_128x320_MULX)
+redc_with_mulx_adx:
+	// Implementation of the Montgomery reduction for CPUs
+	// supporting two independent carry chain (ADOX/ADCX)
+	// instructions and carry-less MULX multiplier
+	REDC(REG_P2, REG_P1, MULS_128x320_MULXADX)
 	RET
 
-TEXT ·redcWithMULXADX(SB), $0-16
-	MOVQ    z+0(FP), DI
-	MOVQ    x+8(FP), SI
-	REDC(DI, SI, MULS_128x320_MULXADX)
+redc_with_mulx:
+	// Implementation of the Montgomery reduction for CPUs
+	// supporting carry-less MULX multiplier.
+	REDC(REG_P2, REG_P1, MULS_128x320_MULX)
 	RET
 
 TEXT ·fp503AddLazy(SB), NOSPLIT, $0-24
diff --git a/p503/arith_amd64_test.go b/p503/arith_amd64_test.go
index 5b2d9c7..6adb6a9 100644
--- a/p503/arith_amd64_test.go
+++ b/p503/arith_amd64_test.go
@@ -10,13 +10,32 @@ import (
 	"testing/quick"
 )
 
+type OptimFlag uint // bit flags naming the implementation a test forces via useMULX/useADXMULX
+
+const (
+	kUse_MUL     OptimFlag = 1 << 0 // generic code path: leaves both useMULX and useADXMULX false
+	kUse_MULX    OptimFlag = 1 << 1 // makes the helpers set useMULX
+	kUse_MULXADX OptimFlag = 1 << 2 // makes the helpers set useADXMULX
+)
+
 // Utility function used for testing Mul implementations. Tests caller provided
 // mulFunc against mul()
-func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) {
+func testMul(t *testing.T, f1, f2 OptimFlag) {
 	doMulTest := func(multiplier, multiplicant FpElement) bool {
+		defer recognizecpu()
 		var resMulRef, resMulOptim FpElementX2
-		mul(&resMulRef, &multiplier, &multiplicant)
-		mulFunc(&resMulOptim, &multiplier, &multiplicant)
+
+		// Compute multiplier*multiplicant with first implementation
+		useMULX = (kUse_MULX & f1) == kUse_MULX
+		useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX
+		fp503Mul(&resMulOptim, &multiplier, &multiplicant)
+
+		// Compute multiplier*multiplicant with second implementation
+		useMULX = (kUse_MULX & f2) == kUse_MULX
+		useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX
+		fp503Mul(&resMulRef, &multiplier, &multiplicant)
+
+		// Compare results
 		return reflect.DeepEqual(resMulRef, resMulOptim)
 	}
 
@@ -27,13 +46,24 @@ func testMul(t *testing.T, mulFunc func(z *FpElementX2, x, y *FpElement)) {
 
 // Utility function used for testing REDC implementations. Tests caller provided
 // redcFunc against redc()
-func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) {
+func testRedc(t *testing.T, f1, f2 OptimFlag) {
 	doRedcTest := func(aRR FpElementX2) bool {
-		var resRedcRef, resRedcOptim FpElement
+		defer recognizecpu()
+		var resRedcF1, resRedcF2 FpElement
 		var aRRcpy = aRR
-		redc(&resRedcRef, &aRR)
-		redcFunc(&resRedcOptim, &aRRcpy)
-		return reflect.DeepEqual(resRedcRef, resRedcOptim)
+
+		// Compute redc with first implementation
+		useMULX = (kUse_MULX & f1) == kUse_MULX
+		useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX
+		fp503MontgomeryReduce(&resRedcF1, &aRR)
+
+		// Compute redc with second implementation
+		useMULX = (kUse_MULX & f2) == kUse_MULX
+		useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX
+		fp503MontgomeryReduce(&resRedcF2, &aRRcpy)
+
+		// Compare results
+		return reflect.DeepEqual(resRedcF2, resRedcF1)
 	}
 
 	if err := quick.Check(doRedcTest, quickCheckConfig); err != nil {
@@ -43,32 +73,56 @@ func testRedc(t *testing.T, redcFunc func(z *FpElement, x *FpElementX2)) {
 
 // Ensures corretness of implementation of mul operation which uses MULX
 func TestMulWithMULX(t *testing.T) {
+	defer recognizecpu() // testMul overwrites useMULX/useADXMULX; restore the detected values on exit
 	if !cpu.HasBMI2 {
 		t.Skip("MULX not supported by the platform")
 	}
-	testMul(t, mulWithMULX)
+	testMul(t, kUse_MULX, kUse_MUL) // MULX code path compared with the generic one
 }
 
 // Ensures corretness of implementation of mul operation which uses MULX and ADOX/ADCX
 func TestMulWithMULXADX(t *testing.T) {
+	defer recognizecpu() // testMul overwrites useMULX/useADXMULX; restore the detected values on exit
 	if !(cpu.HasADX && cpu.HasBMI2) {
 		t.Skip("MULX, ADCX and ADOX not supported by the platform")
 	}
-	testMul(t, mulWithMULXADX)
+	testMul(t, kUse_MULXADX, kUse_MUL) // MULX+ADX code path compared with the generic one
+}
+
+// Ensures that the mul implementations using MULX only and MULX with ADOX/ADCX produce equal results
+func TestMulWithMULXADXAgainstMULX(t *testing.T) {
+	defer recognizecpu() // testMul overwrites useMULX/useADXMULX; restore the detected values on exit
+	if !(cpu.HasADX && cpu.HasBMI2) {
+		t.Skip("MULX, ADCX and ADOX not supported by the platform")
+	}
+	testMul(t, kUse_MULX, kUse_MULXADX) // MULX code path compared with the MULX+ADX one
 }
 
 // Ensures corretness of Montgomery reduction implementation which uses MULX
 func TestRedcWithMULX(t *testing.T) {
+	defer recognizecpu() // testRedc overwrites useMULX/useADXMULX; restore the detected values on exit
 	if !cpu.HasBMI2 {
 		t.Skip("MULX not supported by the platform")
 	}
-	testRedc(t, redcWithMULX)
+	testRedc(t, kUse_MULX, kUse_MUL) // MULX reduction compared with the generic one
 }
 
 // Ensures corretness of Montgomery reduction implementation which uses MULX
+// and ADX
 func TestRedcWithMULXADX(t *testing.T) {
+	defer recognizecpu() // testRedc overwrites useMULX/useADXMULX; restore the detected values on exit
+	if !(cpu.HasADX && cpu.HasBMI2) {
+		t.Skip("MULX, ADCX and ADOX not supported by the platform")
+	}
+	testRedc(t, kUse_MULXADX, kUse_MUL) // MULX+ADX reduction compared with the generic one
+}
+
+// Ensures that the Montgomery reduction implementations using MULX with ADX
+// and MULX only produce equal results.
+func TestRedcWithMULXADXAgainstMULX(t *testing.T) {
+	defer recognizecpu() // testRedc overwrites useMULX/useADXMULX; restore the detected values on exit
 	if !(cpu.HasADX && cpu.HasBMI2) {
 		t.Skip("MULX, ADCX and ADOX not supported by the platform")
 	}
-	testRedc(t, redcWithMULXADX)
+	testRedc(t, kUse_MULXADX, kUse_MULX) // MULX+ADX reduction compared with the MULX-only one
 }
diff --git a/p503/arith_decl.go b/p503/arith_decl.go
index 6bd15ea..5bc6884 100644
--- a/p503/arith_decl.go
+++ b/p503/arith_decl.go
@@ -4,7 +4,6 @@ package p503
 
 import (
 	. "github.com/cloudflare/sidh/internal/isogeny"
-	cpu "github.com/cloudflare/sidh/internal/utils"
 )
 
 // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
@@ -37,53 +36,11 @@ func fp503X2SubLazy(z, x, y *FpElementX2)
 //go:noescape
 func fp503StrongReduce(x *FpElement)
 
-// Function pointer to function computing z = x * y.
-// Concrete implementation depends on capabilities of the CPU which
-// are resolved at runtime. CPUs with ADCX, ADOX and MULX support
-// run most optimized implementation
-var fp503Mul func(z *FpElementX2, x, y *FpElement)
-
-// Mul implementattion for legacy CPUs
-//go:noescape
-func mul(z *FpElementX2, x, y *FpElement)
-
-// Mul implementation for CPUs supporting carry-less MULX multiplier.
-//go:noescape
-func mulWithMULX(z *FpElementX2, x, y *FpElement)
-
-// Mul implementation for CPUs supporting two independent carry chain
-// (ADOX/ADCX) instructions and carry-less MULX multiplier
+// fp503Mul computes z = x * y. In arith_amd64.s the code path (generic, MULX or MULX+ADX) is picked per call from useADXMULX/useMULX.
 //go:noescape
-func mulWithMULXADX(z *FpElementX2, x, y *FpElement)
+func fp503Mul(z *FpElementX2, x, y *FpElement)
 
 // Computes the Montgomery reduction z = x R^{-1} (mod 2*p). On return value
 // of x may be changed. z=x not allowed.
-var fp503MontgomeryReduce func(z *FpElement, x *FpElementX2)
-
-func redc(z *FpElement, x *FpElementX2)
-
-// Mul implementation for CPUs supporting carry-less MULX multiplier.
 //go:noescape
-func redcWithMULX(z *FpElement, x *FpElementX2)
-
-// Mul implementation for CPUs supporting two independent carry chain
-// (ADOX/ADCX) instructions and carry-less MULX multiplier
-//go:noescape
-func redcWithMULXADX(z *FpElement, x *FpElementX2)
-
-// On initialization, set the fp503Mul function pointer to the
-// fastest implementation depending on CPU capabilities.
-func init() {
-	if cpu.HasBMI2 {
-		if cpu.HasADX {
-			fp503Mul = mulWithMULXADX
-			fp503MontgomeryReduce = redcWithMULXADX
-		} else {
-			fp503Mul = mulWithMULX
-			fp503MontgomeryReduce = redcWithMULX
-		}
-	} else {
-		fp503Mul = mul
-		fp503MontgomeryReduce = redc
-	}
-}
+func fp503MontgomeryReduce(z *FpElement, x *FpElementX2)