diff --git a/p751/arith_amd64.go b/p751/arith_amd64.go
new file mode 100644
index 0000000..9bc7a6e
--- /dev/null
+++ b/p751/arith_amd64.go
@@ -0,0 +1,24 @@
+// +build amd64,!noasm
+
+package p751
+
+import cpu "github.com/cloudflare/p751sidh/internal/utils"
+
+// There couple of reasons for having those variables here:
+// * to have an access to them from assembly
+//   TODO(kk): Is there a way to access variable from different package?
+//             If it is then probably this file could be moved to internal
+//             and we don't need to have many copies of that
+// * make it easy to vendor the library
+// * make it possible to test all functionalities
+var useMULX bool
+var useADXMULX bool
+
+func recognizecpu() {
+	useMULX = cpu.HasBMI2
+	useADXMULX = cpu.HasADX && cpu.HasBMI2
+}
+
+func init() {
+	recognizecpu()
+}
diff --git a/p751/arith_amd64.s b/p751/arith_amd64.s
index 1fae216..65e7500 100644
--- a/p751/arith_amd64.s
+++ b/p751/arith_amd64.s
@@ -1607,177 +1607,143 @@ TEXT ·fp751Mul(SB), $96-24
 	ADCQ	DX, T2			\
 	ADCQ	AX, T4
 
-#define fp751MontgomeryReduceCommonPart1(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
-	XORQ	T7, T7			\
-	MOVQ	48+C, AX		\
-	MOVQ	56+C, DX		\
-	MOVQ	64+C, T9		\
-	ADDQ	40+M0, AX		\
-	ADCQ	48+M0, DX		\
-	ADCQ	56+M0, T9		\
-	MOVQ	AX, 40+M0		\
-	MOVQ	DX, 48+M0		\
-	MOVQ	T9, 56+M0		\
-	ADCQ	64+M0, T8		\
-	ADCQ	72+M0, T0		\
-	ADCQ	80+M0, T1		\
-	ADCQ	88+M0, T2		\
-	ADCQ	96+M0, T3		\
-	ADCQ	104+M0, T4		\
-	ADCQ	112+M0, T5		\
-	ADCQ	120+M0, T6		\
-	ADCQ	128+M0, T7		\
-	MOVQ	T8, 64+M0		\
-	MOVQ	T0, 72+M0		\
-	MOVQ	T1, 80+M0		\
-	MOVQ	T2, 88+M0		\
-	MOVQ	T3, 96+M0		\
-	MOVQ	T4, 104+M0		\
-	MOVQ	T5, 112+M0		\
-	MOVQ	T6, 120+M0		\
-	MOVQ	T7, 128+M0		\
-	MOVQ	136+M0, T0		\
-	MOVQ	144+M0, T1		\
-	MOVQ	152+M0, T2		\
-	MOVQ	160+M0, T3		\
-	MOVQ	168+M0, T4		\
-	MOVQ	176+M0, T5		\
-	MOVQ	184+M0, T6		\
-	ADCQ	$0, T0			\
-	ADCQ	$0, T1			\
-	ADCQ	$0, T2			\
-	ADCQ	$0, T3			\
-	ADCQ	$0, T4			\
-	ADCQ	$0, T5			\
-	ADCQ	$0, T6			\
-	MOVQ	T0, 136+M0		\
-	MOVQ	T1, 144+M0		\
-	MOVQ	T2, 152+M0		\
-	MOVQ	T3, 160+M0		\
-	MOVQ	T4, 168+M0		\
-	MOVQ	T5, 176+M0		\
-	MOVQ	T6, 184+M0
-
-#define fp751MontgomeryReduceCommonPart2(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
-	XORQ	T7, T7			\
-	MOVQ	48+C, AX		\
-	MOVQ	56+C, DX		\
-	MOVQ	64+C, T9		\
-	ADDQ	72+M0, AX		\
-	ADCQ	80+M0, DX		\
-	ADCQ	88+M0, T9		\
-	MOVQ	AX, 72+M0		\
-	MOVQ	DX, 80+M0		\
-	MOVQ	T9, 88+M0		\
-	ADCQ	96+M0, T8		\
-	ADCQ	104+M0, T0		\
-	ADCQ	112+M0, T1		\
-	ADCQ	120+M0, T2		\
-	ADCQ	128+M0, T3		\
-	ADCQ	136+M0, T4		\
-	ADCQ	144+M0, T5		\
-	ADCQ	152+M0, T6		\
-	ADCQ	160+M0, T7		\
-	MOVQ	T8, 0+C			\	// Final result c0
-	MOVQ	T0, 104+M0		\
-	MOVQ	T1, 112+M0		\
-	MOVQ	T2, 120+M0		\
-	MOVQ	T3, 128+M0		\
-	MOVQ	T4, 136+M0		\
-	MOVQ	T5, 144+M0		\
-	MOVQ	T6, 152+M0		\
-	MOVQ	T7, 160+M0		\
-	MOVQ	168+M0, T4		\
-	MOVQ	176+M0, T5		\
-	MOVQ	184+M0, T6		\
-	ADCQ	$0, T4			\
-	ADCQ	$0, T5			\
-	ADCQ	$0, T6			\
-	MOVQ	T4, 168+M0		\
-	MOVQ	T5, 176+M0		\
-	MOVQ	T6, 184+M0
-
-#define fp751MontgomeryReduceCommonPart3(M0, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9)\
-	MOVQ	48+C, AX		\	// Final result c1:c11
-	MOVQ	56+C, DX		\
-	MOVQ	64+C, T9		\
-	ADDQ	104+M0, AX		\
-	ADCQ	112+M0, DX		\
-	ADCQ	120+M0, T9		\
-	MOVQ	AX, 8+C			\
-	MOVQ	DX, 16+C		\
-	MOVQ	T9, 24+C		\
-	ADCQ	128+M0, T8		\
-	ADCQ	136+M0, T0		\
-	ADCQ	144+M0, T1		\
-	ADCQ	152+M0, T2		\
-	ADCQ	160+M0, T3		\
-	ADCQ	168+M0, T4		\
-	ADCQ	176+M0, T5		\
-	ADCQ	184+M0, T6		\
-	MOVQ	T8, 32+C		\
-	MOVQ	T0, 40+C		\
-	MOVQ	T1, 48+C		\
-	MOVQ	T2, 56+C		\
-	MOVQ	T3, 64+C		\
-	MOVQ	T4, 72+C		\
-	MOVQ	T5, 80+C		\
-	MOVQ	T6, 88+C
-
-// This implements the Montgomery reduction algorithm described in
-// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
-// This assumes that the BMI2 and ADX instruction set extensions are available.
-TEXT ·fp751MontgomeryReduceBMI2ADX(SB), $0-16
+// Template for calculating the Montgomery reduction algorithm described in
+// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. Template must be
+// customized with schoolbook multiplicaton for 256 x 448-bit number.
+// This macro reuses memory of IN value and *changes* it. Smashes registers
+// R[8-15], AX, BX, CX, DX, BP.
+// Input:
+//    * M0: 1536-bit number to be reduced
+//    * C : either mul256x448bmi2 or mul256x448bmi2adx
+// Output: OUT 768-bit
+#define REDC(C, M0, MULS) 	\
+    \ // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
+    MULS(M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
+    XORQ    R15, R15        \
+    MOVQ    48+C, AX        \
+    MOVQ    56+C, DX        \
+    MOVQ    64+C, BX        \
+    ADDQ    40+M0, AX       \
+    ADCQ    48+M0, DX       \
+    ADCQ    56+M0, BX       \
+    MOVQ    AX, 40+M0       \
+    MOVQ    DX, 48+M0       \
+    MOVQ    BX, 56+M0       \
+    ADCQ    64+M0, BP       \
+    ADCQ    72+M0, R8       \
+    ADCQ    80+M0, R9       \
+    ADCQ    88+M0, R10      \
+    ADCQ    96+M0, R11      \
+    ADCQ    104+M0, R12     \
+    ADCQ    112+M0, R13     \
+    ADCQ    120+M0, R14     \
+    ADCQ    128+M0, R15     \
+    MOVQ    BP, 64+M0       \
+    MOVQ    R8, 72+M0       \
+    MOVQ    R9, 80+M0       \
+    MOVQ    R10, 88+M0      \
+    MOVQ    R11, 96+M0      \
+    MOVQ    R12, 104+M0     \
+    MOVQ    R13, 112+M0     \
+    MOVQ    R14, 120+M0     \
+    MOVQ    R15, 128+M0     \
+    MOVQ    136+M0, R8      \
+    MOVQ    144+M0, R9      \
+    MOVQ    152+M0, R10     \
+    MOVQ    160+M0, R11     \
+    MOVQ    168+M0, R12     \
+    MOVQ    176+M0, R13     \
+    MOVQ    184+M0, R14     \
+    ADCQ    $0, R8          \
+    ADCQ    $0, R9          \
+    ADCQ    $0, R10         \
+    ADCQ    $0, R11         \
+    ADCQ    $0, R12         \
+    ADCQ    $0, R13         \
+    ADCQ    $0, R14         \
+    MOVQ    R8, 136+M0      \
+    MOVQ    R9, 144+M0      \
+    MOVQ    R10, 152+M0     \
+    MOVQ    R11, 160+M0     \
+    MOVQ    R12, 168+M0     \
+    MOVQ    R13, 176+M0     \
+    MOVQ    R14, 184+M0     \
+    \ // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
+    MULS(32+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
+    XORQ    R15, R15          \
+    MOVQ    48+C, AX        \
+    MOVQ    56+C, DX        \
+    MOVQ    64+C, BX        \
+    ADDQ    72+M0, AX       \
+    ADCQ    80+M0, DX       \
+    ADCQ    88+M0, BX       \
+    MOVQ    AX, 72+M0       \
+    MOVQ    DX, 80+M0       \
+    MOVQ    BX, 88+M0       \
+    ADCQ    96+M0, BP       \
+    ADCQ    104+M0, R8      \
+    ADCQ    112+M0, R9      \
+    ADCQ    120+M0, R10     \
+    ADCQ    128+M0, R11     \
+    ADCQ    136+M0, R12     \
+    ADCQ    144+M0, R13     \
+    ADCQ    152+M0, R14     \
+    ADCQ    160+M0, R15     \
+    MOVQ    BP, 0+C         \   // Final result c0
+    MOVQ    R8, 104+M0      \
+    MOVQ    R9, 112+M0      \
+    MOVQ    R10, 120+M0     \
+    MOVQ    R11, 128+M0     \
+    MOVQ    R12, 136+M0     \
+    MOVQ    R13, 144+M0     \
+    MOVQ    R14, 152+M0     \
+    MOVQ    R15, 160+M0     \
+    MOVQ    168+M0, R12     \
+    MOVQ    176+M0, R13     \
+    MOVQ    184+M0, R14     \
+    ADCQ    $0, R12         \
+    ADCQ    $0, R13         \
+    ADCQ    $0, R14         \
+    MOVQ    R12, 168+M0     \
+    MOVQ    R13, 176+M0     \
+    MOVQ    R14, 184+M0     \
+    \ // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
+    MULS(64+M0, ·p751p1, 48+C, R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15) \
+    MOVQ    48+C, AX        \   // Final result c1:c11
+    MOVQ    56+C, DX        \
+    MOVQ    64+C, BX        \
+    ADDQ    104+M0, AX      \
+    ADCQ    112+M0, DX      \
+    ADCQ    120+M0, BX      \
+    MOVQ    AX, 8+C         \
+    MOVQ    DX, 16+C        \
+    MOVQ    BX, 24+C        \
+    ADCQ    128+M0, BP      \
+    ADCQ    136+M0, R8      \
+    ADCQ    144+M0, R9      \
+    ADCQ    152+M0, R10     \
+    ADCQ    160+M0, R11     \
+    ADCQ    168+M0, R12     \
+    ADCQ    176+M0, R13     \
+    ADCQ    184+M0, R14     \
+    MOVQ    BP, 32+C        \
+    MOVQ    R8, 40+C        \
+    MOVQ    R9, 48+C        \
+    MOVQ    R10, 56+C       \
+    MOVQ    R11, 64+C       \
+    MOVQ    R12, 72+C       \
+    MOVQ    R13, 80+C       \
+    MOVQ    R14, 88+C
+
+TEXT ·fp751MontgomeryReduce(SB), $0-16
 	MOVQ z+0(FP), REG_P2
 	MOVQ x+8(FP), REG_P1
 
-	// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
-	mul256x448bmi2adx(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
-
-	fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)
-
-	// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
-	mul256x448bmi2adx(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
-
-	fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)
-
-	// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
-	mul256x448bmi2adx(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
-
-	fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)
-
-	RET
-
-// This implements the Montgomery reduction algorithm described in
-// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
-// This assumes that the BMI2 instruction set extension is available.
-TEXT ·fp751MontgomeryReduceBMI2(SB), $0-16
-	MOVQ z+0(FP), REG_P2
-	MOVQ x+8(FP), REG_P1
-
-	// a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
-	mul256x448bmi2(0(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
-
-	fp751MontgomeryReduceCommonPart1(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)
-
-	// a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
-	mul256x448bmi2(32(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
-
-	fp751MontgomeryReduceCommonPart2(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)
-
-	// a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14
-	mul256x448bmi2(64(REG_P1), ·p751p1, 48(REG_P2), R8, R9, R13, R10, R14, R12, R11, BP, BX, CX, R15)
-
-	fp751MontgomeryReduceCommonPart3(0(REG_P1), 0(REG_P2), R8, R9, R10, R11, R12, R13, R14, R15, BP, BX)
-
-	RET
-
-// This implements the straightforward Montgomery reduction algorithm without
-// using specific instruction set extensions.
-TEXT ·fp751MontgomeryReduceFallback(SB), $0-16
-
-	MOVQ z+0(FP), REG_P2
-	MOVQ x+8(FP), REG_P1
+	// Check wether to use optimized implementation
+	CMPB    ·useADXMULX(SB), $1
+	JE      redc_with_mulx_adx
+	CMPB    ·useMULX(SB), $1
+	JE      redc_with_mulx
 
 	MOVQ	(REG_P1), R11
 	MOVQ	P751P1_5, AX
@@ -2379,7 +2345,20 @@ TEXT ·fp751MontgomeryReduceFallback(SB), $0-16
 	ADCQ	$0, R10
 	ADDQ	(184)(REG_P1), R10		// Z11
 	MOVQ	R10, (88)(REG_P2)		// Z11
+	RET
+
+redc_with_mulx_adx:
+	// This implements the Montgomery reduction algorithm described in
+	// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
+	// This assumes that the BMI2 and ADX instruction set extensions are available.
+	REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2adx)
+	RET
 
+redc_with_mulx:
+	// This implements the Montgomery reduction algorithm described in
+	// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf.
+	// This assumes that the BMI2 instruction set extension is available.
+	REDC(0(REG_P2), 0(REG_P1), mul256x448bmi2)
 	RET
 
 TEXT ·fp751AddLazy(SB), NOSPLIT, $0-24
diff --git a/p751/arith_amd64_test.go b/p751/arith_amd64_test.go
index c7814c9..3c278f5 100644
--- a/p751/arith_amd64_test.go
+++ b/p751/arith_amd64_test.go
@@ -10,66 +10,66 @@ import (
 	"testing/quick"
 )
 
-func TestFp751MontgomeryReduce(t *testing.T) {
-	// First make sure that at least one value with a known result reduces
-	// correctly as defined in TestPrimeFieldElementToBigInt.
-	fp751MontgomeryReduce = fp751MontgomeryReduceFallback
-	t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)
+type OptimFlag uint
 
-	if !cpu.HasBMI2 {
-		return
-	}
-
-	fp751MontgomeryReduce = fp751MontgomeryReduceBMI2
-	t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)
-
-	// Also check that the BMI2 implementation produces the same results
-	// as the fallback implementation.
-	compareMontgomeryReduce := func(x, y primeFieldElement) bool {
-		var z, zbackup FpElementX2
-		var zred1, zred2 FpElement
-
-		fp751Mul(&z, &x.A, &y.A)
-		zbackup = z
-
-		fp751MontgomeryReduceFallback(&zred1, &z)
-		// z may be destroyed.
-		z = zbackup
-		fp751MontgomeryReduceBMI2(&zred2, &z)
+const (
+	kUse_MUL     OptimFlag = 1 << 0
+	kUse_MULX              = 1 << 1
+	kUse_MULXADX           = 1 << 2
+)
 
-		return zred1 == zred2
+// Utility function used for testing REDC implementations. Tests caller provided
+// redcFunc against redc()
+func testRedc(t *testing.T, f1, f2 OptimFlag) {
+	doRedcTest := func(aRR FpElementX2) bool {
+		defer recognizecpu()
+		var resRedcF1, resRedcF2 FpElement
+		var aRRcpy = aRR
+
+		// Compute redc with first implementation
+		useMULX = (kUse_MULX & f1) == kUse_MULX
+		useADXMULX = (kUse_MULXADX & f1) == kUse_MULXADX
+		fp751MontgomeryReduce(&resRedcF1, &aRR)
+
+		// Compute redc with second implementation
+		useMULX = (kUse_MULX & f2) == kUse_MULX
+		useADXMULX = (kUse_MULXADX & f2) == kUse_MULXADX
+		fp751MontgomeryReduce(&resRedcF2, &aRRcpy)
+
+		// Compare results
+		return reflect.DeepEqual(resRedcF2, resRedcF1)
 	}
 
-	if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil {
+	if err := quick.Check(doRedcTest, quickCheckConfig); err != nil {
 		t.Error(err)
 	}
+}
 
-	if !cpu.HasADX {
-		return
+// Ensures corretness of Montgomery reduction implementation which uses MULX
+func TestRedcWithMULX(t *testing.T) {
+	defer recognizecpu()
+	if !cpu.HasBMI2 {
+		t.Skip("MULX not supported by the platform")
 	}
+	testRedc(t, kUse_MULX, kUse_MUL)
+}
 
-	fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX
-	t.Run("PrimeFieldElementToBigInt", TestPrimeFieldElementToBigInt)
-
-	// Check that the BMI2ADX implementation produces the same results as
-	// the BMI2 implementation. By transitivity, it should also produce the
-	// same results as the fallback implementation.
-	compareMontgomeryReduce = func(x, y primeFieldElement) bool {
-		var z, zbackup FpElementX2
-		var zred1, zred2 FpElement
-
-		fp751Mul(&z, &x.A, &y.A)
-		zbackup = z
-
-		fp751MontgomeryReduceBMI2(&zred1, &z)
-		// z may be destroyed.
-		z = zbackup
-		fp751MontgomeryReduceBMI2ADX(&zred2, &z)
-
-		return zred1 == zred2
+// Ensures corretness of Montgomery reduction implementation which uses MULX
+// and ADX
+func TestRedcWithMULXADX(t *testing.T) {
+	defer recognizecpu()
+	if !(cpu.HasADX && cpu.HasBMI2) {
+		t.Skip("MULX, ADCX and ADOX not supported by the platform")
 	}
+	testRedc(t, kUse_MULXADX, kUse_MUL)
+}
 
-	if err := quick.Check(compareMontgomeryReduce, quickCheckConfig); err != nil {
-		t.Error(err)
+// Ensures corretness of Montgomery reduction implementation which uses MULX
+// and ADX.
+func TestRedcWithMULXADXAgainstMULX(t *testing.T) {
+	defer recognizecpu()
+	if !(cpu.HasADX && cpu.HasBMI2) {
+		t.Skip("MULX, ADCX and ADOX not supported by the platform")
 	}
+	testRedc(t, kUse_MULXADX, kUse_MULX)
 }
diff --git a/p751/arith_decl.go b/p751/arith_decl.go
index 008ee41..61a0659 100644
--- a/p751/arith_decl.go
+++ b/p751/arith_decl.go
@@ -4,7 +4,6 @@ package p751
 
 import (
 	. "github.com/cloudflare/sidh/internal/isogeny"
-	cpu "github.com/cloudflare/sidh/internal/utils"
 )
 
 // If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x.
@@ -41,31 +40,9 @@ func fp751Mul(z *FpElementX2, x, y *FpElement)
 // fp751MontgomeryReduce implementations below.
 // When set, it performs Montgomery reduction: set z = x R^{-1} (mod 2*p).
 // It may destroy the input value.
-var fp751MontgomeryReduce func(z *FpElement, x *FpElementX2)
-
-//go:noescape
-func fp751MontgomeryReduceBMI2ADX(z *FpElement, x *FpElementX2)
-
 //go:noescape
-func fp751MontgomeryReduceBMI2(z *FpElement, x *FpElementX2)
-
-//go:noescape
-func fp751MontgomeryReduceFallback(z *FpElement, x *FpElementX2)
+func fp751MontgomeryReduce(z *FpElement, x *FpElementX2)
 
 // Reduce a field element in [0, 2*p) to one in [0,p).
 //go:noescape
 func fp751StrongReduce(x *FpElement)
-
-// On initialization, set the fp751MontgomeryReduce function pointer to the
-// fastest implementation depending on CPU capabilities.
-func init() {
-	if cpu.HasBMI2 {
-		if cpu.HasADX {
-			fp751MontgomeryReduce = fp751MontgomeryReduceBMI2ADX
-		} else {
-			fp751MontgomeryReduce = fp751MontgomeryReduceBMI2
-		}
-	} else {
-		fp751MontgomeryReduce = fp751MontgomeryReduceFallback
-	}
-}