1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-22 23:28:57 +00:00
nobs/dh/csidh/fp511_test.go

443 lines
8.4 KiB
Go
Raw Normal View History

cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
package csidh
import (
"math/big"
"math/rand"
"testing"
"golang.org/x/sys/cpu"
)
func resetCPUFeatures() {
hasBMI2 = cpu.X86.HasBMI2
hasADXandBMI2 = cpu.X86.HasBMI2 && cpu.X86.HasADX
}
func testFp512Mul3Nominal(t *testing.T) {
var multiplier64 uint64
var mod big.Int
// modulus: 2^512
mod.SetUint64(1).Lsh(&mod, 512)
for i := 0; i < numIter; i++ {
multiplier64 = rand.Uint64()
fV := randomFp()
exp, _ := new(big.Int).SetString(fp2S(fV), 16)
exp.Mul(exp, new(big.Int).SetUint64(multiplier64))
// Truncate to 512 bits
exp.Mod(exp, &mod)
mul512(&fV, &fV, multiplier64)
res, _ := new(big.Int).SetString(fp2S(fV), 16)
if exp.Cmp(res) != 0 {
t.Errorf("%X != %X", exp, res)
}
}
}
// Check if mul512 produces result
2020-05-14 00:48:43 +01:00
// z = x*y mod 2^512.
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
func TestFp512Mul3_Nominal(t *testing.T) {
hasBMI2 = false
testFp512Mul3Nominal(t)
resetCPUFeatures()
testFp512Mul3Nominal(t)
}
func TestAddRdcRandom(t *testing.T) {
for i := 0; i < numIter; i++ {
a := randomFp()
bigA, _ := new(big.Int).SetString(fp2S(a), 16)
bigA.Mod(bigA, modulus)
copy(a[:], intGetU64(bigA))
b := randomFp()
bigB, _ := new(big.Int).SetString(fp2S(b), 16)
bigB.Mod(bigB, modulus)
copy(b[:], intGetU64(bigB))
addRdc(&a, &a, &b)
bigRet, _ := new(big.Int).SetString(fp2S(a), 16)
bigA.Add(bigA, bigB)
bigA.Mod(bigA, modulus)
if bigRet.Cmp(bigA) != 0 {
t.Errorf("%X != %X", bigRet, bigA)
}
}
}
func TestAddRdcNominal(t *testing.T) {
var res fp
tmp := oneFp512
addRdc(&res, &tmp, &p)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &tmp) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
tmp = zeroFp512
addRdc(&res, &p, &p)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &p) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
tmp = fp{1, 1, 1, 1, 1, 1, 1, 1}
addRdc(&res, &p, &tmp)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &tmp) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
tmp = fp{1, 1, 1, 1, 1, 1, 1, 1}
exp := fp{2, 2, 2, 2, 2, 2, 2, 2}
addRdc(&res, &tmp, &tmp)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &exp) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
}
func TestFp512Sub3_Nominal(t *testing.T) {
var ret fp
var mod big.Int
// modulus: 2^512
mod.SetUint64(1).Lsh(&mod, 512)
for i := 0; i < numIter; i++ {
a := randomFp()
bigA, _ := new(big.Int).SetString(fp2S(a), 16)
b := randomFp()
bigB, _ := new(big.Int).SetString(fp2S(b), 16)
sub512(&ret, &a, &b)
bigRet, _ := new(big.Int).SetString(fp2S(ret), 16)
bigA.Sub(bigA, bigB)
// Truncate to 512 bits
bigA.Mod(bigA, &mod)
if bigRet.Cmp(bigA) != 0 {
t.Errorf("%X != %X", bigRet, bigA)
}
}
}
func TestFp512Sub3_DoesntReturnCarry(t *testing.T) {
a := fp{}
b := fp{
0xFFFFFFFFFFFFFFFF, 1,
0, 0,
0, 0,
0, 0}
c := fp{
0xFFFFFFFFFFFFFFFF, 2,
0, 0,
0, 0,
0, 0}
if sub512(&a, &b, &c) != 1 {
t.Error("Carry not returned")
}
}
func TestFp512Sub3_ReturnsCarry(t *testing.T) {
a := fp{}
b := fp{
0xFFFFFFFFFFFFFFFF, 2,
0, 0,
0, 0,
0, 0}
c := fp{
0xFFFFFFFFFFFFFFFF, 1,
0, 0,
0, 0,
0, 0}
if sub512(&a, &b, &c) != 0 {
t.Error("Carry not returned")
}
}
func TestCswap(t *testing.T) {
arg1 := randomFp()
arg2 := randomFp()
arg1cpy := arg1
cswap512(&arg1, &arg2, 0)
2020-05-14 00:48:43 +01:00
if !eqFp(&arg1, &arg1cpy) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Error("cswap swapped")
}
arg1cpy = arg1
cswap512(&arg1, &arg2, 1)
2020-05-14 00:48:43 +01:00
if eqFp(&arg1, &arg1cpy) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Error("cswap didn't swapped")
}
arg1cpy = arg1
cswap512(&arg1, &arg2, 0xF2)
2020-05-14 00:48:43 +01:00
if eqFp(&arg1, &arg1cpy) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Error("cswap didn't swapped")
}
}
func TestSubRdc(t *testing.T) {
var res fp
// 1 - 1 mod P
tmp := oneFp512
subRdc(&res, &tmp, &tmp)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &zeroFp512) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
zero(&res)
// 0 - 1 mod P
exp := p
exp[0]--
subRdc(&res, &zeroFp512, &oneFp512)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &exp) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X\n%X", res, exp)
}
zero(&res)
// P - (P-1)
pMinusOne := p
pMinusOne[0]--
subRdc(&res, &p, &pMinusOne)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &oneFp512) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n[%X != %X]", res, oneFp512)
}
zero(&res)
subRdc(&res, &p, &oneFp512)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &pMinusOne) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n[%X != %X]", res, pMinusOne)
}
}
func testMulRdc(t *testing.T) {
var res fp
var m1 = fp{
0x85E2579C786882D0, 0x4E3433657E18DA95,
0x850AE5507965A0B3, 0xA15BC4E676475964}
var m2 = fp{
0x85E2579C786882CF, 0x4E3433657E18DA95,
0x850AE5507965A0B3, 0xA15BC4E676475964}
// Expected
var m1m1 = fp{
0xAEBF46E92C88A4B4, 0xCFE857977B946347,
0xD3B264FF08493901, 0x6EEB3D23746B6C7C,
0xC0CA874A349D64B4, 0x7AD4A38B406F8504,
0x38B6B6CEB82472FB, 0x1587015FD7DDFC7D}
var m1m2 = fp{
0x51534771258C4624, 0x2BFEDE86504E2160,
0xE8127D5E9329670B, 0x0C84DBD584491D75,
0x656C73C68B16E38C, 0x01C0DA470B30B8DE,
0x2532E3903EAA950B, 0x3F2C28EA97FE6FEC}
// 0*0
tmp := zeroFp512
mulRdc(&res, &tmp, &tmp)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &tmp) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
// 1*m1 == m1
zero(&res)
mulRdc(&res, &m1, &one)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &m1) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
// m1*m2 < p
zero(&res)
mulRdc(&res, &m1, &m2)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &m1m2) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
// m1*m1 > p
zero(&res)
mulRdc(&res, &m1, &m1)
2020-05-14 00:48:43 +01:00
if !eqFp(&res, &m1m1) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X", res)
}
}
func TestMulRdc(t *testing.T) {
hasADXandBMI2 = false
testMulRdc(t)
resetCPUFeatures()
testMulRdc(t)
}
func TestModExp(t *testing.T) {
var resExp, base, exp big.Int
var baseFp, expFp, resFp, resFpExp fp
for i := 0; i < numIter; i++ {
// Perform modexp with reference implementation
// in Montgomery domain
base.SetString(fp2S(randomFp()), 16)
exp.SetString(fp2S(randomFp()), 16)
resExp.Exp(&base, &exp, modulus)
toMont(&base, true)
toMont(&resExp, true)
// Convert to fp
copy(baseFp[:], intGetU64(&base))
copy(expFp[:], intGetU64(&exp))
copy(resFpExp[:], intGetU64(&resExp))
// Perform modexp with our implementation
modExpRdc512(&resFp, &baseFp, &expFp)
2020-05-14 00:48:43 +01:00
if !eqFp(&resFp, &resFpExp) {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
t.Errorf("Wrong value\n%X!=%X", resFp, intGetU64(&resExp))
}
}
}
2020-05-14 00:48:43 +01:00
// Test uses Euler's Criterion.
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
func TestIsNonQuadRes(t *testing.T) {
var n, nMont big.Int
var pm1o2, rawP big.Int
var nMontFp fp
// (p-1)/2
pm1o2.SetString("0x32da4747ba07c4dffe455868af1f26255a16841d76e446212d7dfe63499164e6d3d56362b3f9aa83a8b398660f85a792e1390dfa2bd6541a8dc0dc8299e3643d", 0)
// modulus value (not in montgomery)
rawP.SetString("0x65b48e8f740f89bffc8ab0d15e3e4c4ab42d083aedc88c425afbfcc69322c9cda7aac6c567f35507516730cc1f0b4f25c2721bf457aca8351b81b90533c6c87b", 0)
// There is 641 quadratic residues in this range
2020-05-14 00:48:43 +01:00
for i := uint64(1); i < uint64(numIter); i++ {
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie Hellman, based on "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, implementation converts a curve from Montgomery to Edwards. All calculations are done on Edwards curve and then converted back to Montgomery. * As multiplication in a field Fp511 is most expensive operation the implementation contains multiple multiplications. It has most performant, assembly implementation which uses BMI2 and ADOX/ADCX instructions for modern CPUs. It also contains slower implementation which will run on older CPUs * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * Go code has very similar performance when compared to C implementation. Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh | TestName |Go | C | |------------------|----------|----------| |TestSharedSecret | 57.95774 | 57.91092 | |TestKeyGeneration | 62.23614 | 58.12980 | |TestSharedSecret | 55.28988 | 57.23132 | |TestKeyGeneration | 61.68745 | 58.66396 | |TestSharedSecret | 63.19408 | 58.64774 | |TestKeyGeneration | 62.34022 | 61.62539 | |TestSharedSecret | 62.85453 | 68.74503 | |TestKeyGeneration | 52.58518 | 58.40115 | |TestSharedSecret | 50.77081 | 61.91699 | |TestKeyGeneration | 59.91843 | 61.09266 | |TestSharedSecret | 59.97962 | 62.98151 | |TestKeyGeneration | 64.57525 | 56.22863 | |TestSharedSecret | 56.40521 | 55.77447 | |TestKeyGeneration | 67.85850 | 58.52604 | |TestSharedSecret | 60.54290 | 65.14052 | |TestKeyGeneration | 65.45766 | 58.42823 | On average Go implementation is 2% faster.
2019-11-24 03:39:35 +00:00
n.SetUint64(i)
n.Exp(&n, &pm1o2, &rawP)
// exp == 1 iff n is quadratic non-residue
exp := n.Cmp(big.NewInt(1))
if exp < 0 {
panic("Should never happen")
}
nMont.SetUint64(i)
toMont(&nMont, true)
copy(nMontFp[:], intGetU64(&nMont))
ret := nMontFp.isNonQuadRes()
if ret != exp {
toMont(&nMont, false)
t.Errorf("Test failed for value %s", nMont.Text(10))
}
}
}
func TestCheckSmaller(t *testing.T) {
// p-1
var pMin1 = p
pMin1[0]--
// p-1 < p => 1
if !isLess(&pMin1, &p) {
t.Error("pMin1>p")
}
// p < p-1 => 0
if isLess(&p, &pMin1) {
t.Error("p>pMin1")
}
// p == p => 0
if isLess(&p, &p) {
t.Error("p==p")
}
}
func BenchmarkFp512Sub(b *testing.B) {
var arg1 fp
arg2, arg3 := randomFp(), randomFp()
for n := 0; n < b.N; n++ {
sub512(&arg1, &arg2, &arg3)
}
}
func BenchmarkFp512Mul(b *testing.B) {
var arg1 = rand.Uint64()
arg2, arg3 := randomFp(), randomFp()
for n := 0; n < b.N; n++ {
mul512(&arg2, &arg3, arg1)
}
}
func BenchmarkCSwap(b *testing.B) {
arg1 := randomFp()
arg2 := randomFp()
for n := 0; n < b.N; n++ {
cswap512(&arg1, &arg2, uint8(n%2))
}
}
func BenchmarkAddRdc(b *testing.B) {
var res fp
arg1 := randomFp()
arg2 := randomFp()
for n := 0; n < b.N; n++ {
addRdc(&res, &arg1, &arg2)
}
}
func BenchmarkSubRdc(b *testing.B) {
arg1 := randomFp()
arg2 := randomFp()
var res fp
for n := 0; n < b.N; n++ {
subRdc(&res, &arg1, &arg2)
}
}
func BenchmarkModExpRdc(b *testing.B) {
arg1 := randomFp()
arg2 := randomFp()
var res fp
for n := 0; n < b.N; n++ {
modExpRdc512(&res, &arg1, &arg2)
}
}
func BenchmarkMulGeneric(b *testing.B) {
arg1 := randomFp()
arg2 := randomFp()
var res fp
for n := 0; n < b.N; n++ {
mulGeneric(&res, &arg1, &arg2)
}
}
func BenchmarkMulBmiAsm(b *testing.B) {
arg1 := randomFp()
arg2 := randomFp()
var res fp
for n := 0; n < b.N; n++ {
mulRdc(&res, &arg1, &arg2)
}
}
func BenchmarkMulGenAsm(b *testing.B) {
arg1 := randomFp()
arg2 := randomFp()
var res fp
hasADXandBMI2 = false
for n := 0; n < b.N; n++ {
mulRdc(&res, &arg1, &arg2)
}
resetCPUFeatures()
}