@@ -698,7 +698,7 @@ TEXT ·fp503Mul(SB), NOSPLIT, $104-24 | |||||
// Check whether to use optimized implementation | // Check whether to use optimized implementation | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | ||||
JE mul_with_mulx_adx | |||||
JE mul_with_mulx_adcx_adox | |||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | ||||
JE mul_with_mulx | JE mul_with_mulx | ||||
@@ -1178,7 +1178,7 @@ TEXT ·fp503Mul(SB), NOSPLIT, $104-24 | |||||
ADCQ $0, SI; MOVQ SI, (120)(CX) | ADCQ $0, SI; MOVQ SI, (120)(CX) | ||||
RET | RET | ||||
mul_with_mulx_adx: | |||||
mul_with_mulx_adcx_adox: | |||||
// Mul implementation for CPUs supporting two independent carry chains | // Mul implementation for CPUs supporting two independent carry chains | ||||
// (ADOX/ADCX) instructions and carry-less MULX multiplier | // (ADOX/ADCX) instructions and carry-less MULX multiplier | ||||
MUL(CX, REG_P1, REG_P2, MULS256_MULXADX) | MUL(CX, REG_P1, REG_P2, MULS256_MULXADX) | ||||
@@ -1195,7 +1195,7 @@ TEXT ·fp503MontgomeryReduce(SB), $0-16 | |||||
// Check whether to use optimized implementation | // Check whether to use optimized implementation | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | ||||
JE redc_with_mulx_adx | |||||
JE redc_with_mulx_adcx_adox | |||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | ||||
JE redc_with_mulx | JE redc_with_mulx | ||||
@@ -1503,7 +1503,7 @@ TEXT ·fp503MontgomeryReduce(SB), $0-16 | |||||
MOVQ R10, (56)(REG_P2) // Z7 | MOVQ R10, (56)(REG_P2) // Z7 | ||||
RET | RET | ||||
redc_with_mulx_adx: | |||||
redc_with_mulx_adcx_adox: | |||||
// Implementation of the Montgomery reduction for CPUs | // Implementation of the Montgomery reduction for CPUs | ||||
// supporting two independent carry chains (ADOX/ADCX) | // supporting two independent carry chains (ADOX/ADCX) | ||||
// instructions and carry-less MULX multiplier | // instructions and carry-less MULX multiplier | ||||
@@ -13,9 +13,12 @@ import ( | |||||
type OptimFlag uint | type OptimFlag uint | ||||
const ( | const ( | ||||
kUse_MUL OptimFlag = 1 << 0 | |||||
kUse_MULX = 1 << 1 | |||||
kUse_BMI2andADX = 1 << 2 | |||||
// Indicates that optimisation which uses MUL instruction should be used | |||||
kUse_MUL OptimFlag = 1 << 0 | |||||
// Indicates that optimisation which uses MULX instruction should be used | |||||
kUse_MULX = 1 << 1 | |||||
// Indicates that optimisation which uses MULX, ADOX and ADCX instructions should be used | |||||
kUse_MULXandADxX = 1 << 2 | |||||
) | ) | ||||
// Utility function used for testing Mul implementations. Tests caller provided | // Utility function used for testing Mul implementations. Tests caller provided | ||||
@@ -27,12 +30,12 @@ func testMul(t *testing.T, f1, f2 OptimFlag) { | |||||
// Compute multiplier*multiplicant with first implementation | // Compute multiplier*multiplicant with first implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | ||||
cpu.HasADXandBMI2 = (kUse_BMI2andADX & f1) == kUse_BMI2andADX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
fp503Mul(&resMulOptim, &multiplier, &multiplicant) | fp503Mul(&resMulOptim, &multiplier, &multiplicant) | ||||
// Compute multiplier*multiplicant with second implementation | // Compute multiplier*multiplicant with second implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | ||||
cpu.HasADXandBMI2 = (kUse_BMI2andADX & f2) == kUse_BMI2andADX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
fp503Mul(&resMulRef, &multiplier, &multiplicant) | fp503Mul(&resMulRef, &multiplier, &multiplicant) | ||||
// Compare results | // Compare results | ||||
@@ -54,12 +57,12 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||||
// Compute redc with first implementation | // Compute redc with first implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | ||||
cpu.HasADXandBMI2 = (kUse_BMI2andADX & f1) == kUse_BMI2andADX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
fp503MontgomeryReduce(&resRedcF1, &aRR) | fp503MontgomeryReduce(&resRedcF1, &aRR) | ||||
// Compute redc with second implementation | // Compute redc with second implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | ||||
cpu.HasADXandBMI2 = (kUse_BMI2andADX & f2) == kUse_BMI2andADX | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
fp503MontgomeryReduce(&resRedcF2, &aRRcpy) | fp503MontgomeryReduce(&resRedcF2, &aRRcpy) | ||||
// Compare results | // Compare results | ||||
@@ -71,7 +74,7 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||||
} | } | ||||
} | } | ||||
// Ensures corretness of implementation of mul operation which uses MULX | |||||
// Ensures correctness of implementation of mul operation which uses MULX | |||||
func TestMulWithMULX(t *testing.T) { | func TestMulWithMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasBMI2 { | if !cpu.HasBMI2 { | ||||
@@ -80,25 +83,25 @@ func TestMulWithMULX(t *testing.T) { | |||||
testMul(t, kUse_MULX, kUse_MUL) | testMul(t, kUse_MULX, kUse_MUL) | ||||
} | } | ||||
// Ensures corretness of implementation of mul operation which uses MULX and ADOX/ADCX | |||||
func TestMulWithMULXADX(t *testing.T) { | |||||
// Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | |||||
func TestMulWithMULXADxX(t *testing.T) { | |||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasADXandBMI2 { | if !cpu.HasADXandBMI2 { | ||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testMul(t, kUse_BMI2andADX, kUse_MUL) | |||||
testMul(t, kUse_MULXandADxX, kUse_MUL) | |||||
} | } | ||||
// Ensures corretness of implementation of mul operation which uses MULX and ADOX/ADCX | |||||
func TestMulWithMULXADXAgainstMULX(t *testing.T) { | |||||
// Ensures correctness of implementation of mul operation which uses MULX and ADOX/ADCX | |||||
func TestMulWithMULXADxXAgainstMULX(t *testing.T) { | |||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasADXandBMI2 { | if !cpu.HasADXandBMI2 { | ||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testMul(t, kUse_MULX, kUse_BMI2andADX) | |||||
testMul(t, kUse_MULX, kUse_MULXandADxX) | |||||
} | } | ||||
// Ensures corretness of Montgomery reduction implementation which uses MULX | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||||
func TestRedcWithMULX(t *testing.T) { | func TestRedcWithMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasBMI2 { | if !cpu.HasBMI2 { | ||||
@@ -107,22 +110,22 @@ func TestRedcWithMULX(t *testing.T) { | |||||
testRedc(t, kUse_MULX, kUse_MUL) | testRedc(t, kUse_MULX, kUse_MUL) | ||||
} | } | ||||
// Ensures corretness of Montgomery reduction implementation which uses MULX | |||||
// and ADX | |||||
func TestRedcWithMULXADX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||||
// and ADCX/ADOX. | |||||
func TestRedcWithMULXADxX(t *testing.T) { | |||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasADXandBMI2 { | if !cpu.HasADXandBMI2 { | ||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_BMI2andADX, kUse_MUL) | |||||
testRedc(t, kUse_MULXandADxX, kUse_MUL) | |||||
} | } | ||||
// Ensures corretness of Montgomery reduction implementation which uses MULX | |||||
// and ADX. | |||||
func TestRedcWithMULXADXAgainstMULX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||||
// and ADCX/ADOX. | |||||
func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | |||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasADXandBMI2 { | if !cpu.HasADXandBMI2 { | ||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_BMI2andADX, kUse_MULX) | |||||
testRedc(t, kUse_MULXandADxX, kUse_MULX) | |||||
} | } |
@@ -1741,7 +1741,7 @@ TEXT ·fp751MontgomeryReduce(SB), $0-16 | |||||
// Check whether to use optimized implementation | // Check whether to use optimized implementation | ||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasADXandBMI2(SB), $1 | ||||
JE redc_with_mulx_adx | |||||
JE redc_with_mulx_adcx_adox | |||||
CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | CMPB github·com∕cloudflare∕sidh∕internal∕utils·HasBMI2(SB), $1 | ||||
JE redc_with_mulx | JE redc_with_mulx | ||||
@@ -2347,7 +2347,7 @@ TEXT ·fp751MontgomeryReduce(SB), $0-16 | |||||
MOVQ R10, (88)(REG_P2) // Z11 | MOVQ R10, (88)(REG_P2) // Z11 | ||||
RET | RET | ||||
redc_with_mulx_adx: | |||||
redc_with_mulx_adcx_adox: | |||||
// This implements the Montgomery reduction algorithm described in | // This implements the Montgomery reduction algorithm described in | ||||
// section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. | // section 5.2.3 of https://eprint.iacr.org/2017/1015.pdf. | ||||
// This assumes that the BMI2 and ADX instruction set extensions are available. | // This assumes that the BMI2 and ADX instruction set extensions are available. | ||||
@@ -14,9 +14,12 @@ import ( | |||||
type OptimFlag uint | type OptimFlag uint | ||||
const ( | const ( | ||||
kUse_MUL OptimFlag = 1 << 0 | |||||
kUse_MULX = 1 << 1 | |||||
kUse_ADXandBMI2 = 1 << 2 | |||||
// Indicates that optimisation which uses MUL instruction should be used | |||||
kUse_MUL OptimFlag = 1 << 0 | |||||
// Indicates that optimisation which uses MULX instruction should be used | |||||
kUse_MULX = 1 << 1 | |||||
// Indicates that optimisation which uses MULX, ADOX and ADCX instructions should be used | |||||
kUse_MULXandADxX = 1 << 2 | |||||
) | ) | ||||
// Utility function used for testing REDC implementations. Tests caller provided | // Utility function used for testing REDC implementations. Tests caller provided | ||||
@@ -29,12 +32,12 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||||
// Compute redc with first implementation | // Compute redc with first implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | cpu.HasBMI2 = (kUse_MULX & f1) == kUse_MULX | ||||
cpu.HasADXandBMI2 = (kUse_ADXandBMI2 & f1) == kUse_ADXandBMI2 | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f1) == kUse_MULXandADxX | |||||
fp751MontgomeryReduce(&resRedcF1, &aRR) | fp751MontgomeryReduce(&resRedcF1, &aRR) | ||||
// Compute redc with second implementation | // Compute redc with second implementation | ||||
cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | cpu.HasBMI2 = (kUse_MULX & f2) == kUse_MULX | ||||
cpu.HasADXandBMI2 = (kUse_ADXandBMI2 & f2) == kUse_ADXandBMI2 | |||||
cpu.HasADXandBMI2 = (kUse_MULXandADxX & f2) == kUse_MULXandADxX | |||||
fp751MontgomeryReduce(&resRedcF2, &aRRcpy) | fp751MontgomeryReduce(&resRedcF2, &aRRcpy) | ||||
// Compare results | // Compare results | ||||
@@ -46,7 +49,7 @@ func testRedc(t *testing.T, f1, f2 OptimFlag) { | |||||
} | } | ||||
} | } | ||||
// Ensures corretness of Montgomery reduction implementation which uses MULX | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||||
func TestRedcWithMULX(t *testing.T) { | func TestRedcWithMULX(t *testing.T) { | ||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasBMI2 { | if !cpu.HasBMI2 { | ||||
@@ -55,22 +58,22 @@ func TestRedcWithMULX(t *testing.T) { | |||||
testRedc(t, kUse_MULX, kUse_MUL) | testRedc(t, kUse_MULX, kUse_MUL) | ||||
} | } | ||||
// Ensures corretness of Montgomery reduction implementation which uses MULX | |||||
// and ADX | |||||
func TestRedcWithMULXADX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||||
// and ADCX/ADOX. | |||||
func TestRedcWithMULXADxX(t *testing.T) { | |||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasADXandBMI2 { | if !cpu.HasADXandBMI2 { | ||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_ADXandBMI2, kUse_MUL) | |||||
testRedc(t, kUse_MULXandADxX, kUse_MUL) | |||||
} | } | ||||
// Ensures corretness of Montgomery reduction implementation which uses MULX | |||||
// and ADX. | |||||
func TestRedcWithMULXADXAgainstMULX(t *testing.T) { | |||||
// Ensures correctness of Montgomery reduction implementation which uses MULX | |||||
// and ADCX/ADOX. | |||||
func TestRedcWithMULXADxXAgainstMULX(t *testing.T) { | |||||
defer cpu.RecognizeCpu() | defer cpu.RecognizeCpu() | ||||
if !cpu.HasADXandBMI2 { | if !cpu.HasADXandBMI2 { | ||||
t.Skip("MULX, ADCX and ADOX not supported by the platform") | t.Skip("MULX, ADCX and ADOX not supported by the platform") | ||||
} | } | ||||
testRedc(t, kUse_ADXandBMI2, kUse_MULX) | |||||
testRedc(t, kUse_MULXandADxX, kUse_MULX) | |||||
} | } |