nobs/dh/csidh/fp511_generic.go

// +build noasm arm64

package csidh

import "math/bits"

// mul512 multiplies the eight 64-bit limbs of m1 (least-significant limb
// first) by the single word m2 and stores the low 512 bits of the product in
// r. Whatever does not fit into eight limbs is discarded.
//
// Every input limb is read before the output limb with the same index is
// written.
func mul512(r, m1 *fp, m2 uint64) {
	// Limb 0 has no incoming carry: keep the low word, carry the high word.
	carry, low := bits.Mul64(m2, m1[0])
	r[0] = low

	// Limbs 1..6: add the running carry to the low word of the partial
	// product. The high word plus the carry-out of that addition becomes the
	// next carry; it cannot wrap because the high word of a 64x64-bit product
	// is at most 2^64-2.
	for i := 1; i < 7; i++ {
		hi, lo := bits.Mul64(m2, m1[i])
		sum, cy := bits.Add64(lo, carry, 0)
		r[i] = sum
		carry = hi + cy
	}

	// Top limb: only the low word is kept, everything above bit 511 is
	// dropped (the addition wraps on purpose).
	_, low = bits.Mul64(m2, m1[7])
	r[7] = low + carry
}

// mul576 multiplies the eight 64-bit limbs of m1 (least-significant limb
// first) by the single word m2 and stores the complete nine-limb product in
// r. Nothing is truncated: all of r[0..8] is overwritten.
func mul576(r *[9]uint64, m1 *fp, m2 uint64) {
	// Limb 0 has no incoming carry: keep the low word, carry the high word.
	carry, low := bits.Mul64(m2, m1[0])
	r[0] = low

	// Limbs 1..7: add the running carry to the low word of the partial
	// product. The high word plus the carry-out of that addition becomes the
	// next carry; it cannot wrap because the high word of a 64x64-bit product
	// is at most 2^64-2.
	for i := 1; i < 8; i++ {
		hi, lo := bits.Mul64(m2, m1[i])
		sum, cy := bits.Add64(lo, carry, 0)
		r[i] = sum
		carry = hi + cy
	}

	// The carry left after the last limb is the ninth word of the product.
	r[8] = carry
}

// cswap512 exchanges the first numWords limbs of x and y when choice is 1 and
// leaves both untouched when choice is 0. The decision is applied through a
// bit mask rather than a branch on choice.
//
// choice must be 0 or 1: any other value produces a mask that is neither all
// zeros nor all ones, which would mix bits of x and y.
func cswap512(x, y *fp, choice uint8) {
	// 0 -> 0x00..00, 1 -> 0xFF..FF.
	mask := -uint64(choice)

	for i := 0; i < numWords; i++ {
		// delta holds the differing bits of the two limbs, or zero when no
		// swap is requested; XOR-ing it into both limbs swaps them.
		delta := (x[i] ^ y[i]) & mask
		x[i] ^= delta
		y[i] ^= delta
	}
}

// mul forwards res, x and y unchanged to mulGeneric; it adds no logic of its
// own. This file is compiled under the noasm/arm64 build constraint.
// NOTE(review): presumably other builds provide an assembly-backed mul with
// the same signature, and mulGeneric writes the product of x and y into res —
// confirm against their definitions.
func mul(res, x, y *fp) {
	mulGeneric(res, x, y)
}

// mulRdc performs Montgomery multiplication r = x * y mod P.
// The returned result r is fully reduced and stays in the Montgomery domain.
//
// The product itself is produced by mulGeneric; this function only applies
// the final conditional subtraction of p, selecting the result through a mask
// instead of a branch.
func mulRdc(r, x, y *fp) {
	// Indices 0..7 of r and p take part in the correction step.
	const limbs = 8

	var (
		diff   fp
		borrow uint64
	)

	mulGeneric(r, x, y)

	// diff = r - p over eight limbs; borrow ends up as 1 exactly when r < p.
	// NOTE(review): assumes mulGeneric leaves r below 2p, so one subtraction
	// is enough — confirm against mulGeneric.
	for i := 0; i < limbs; i++ {
		diff[i], borrow = bits.Sub64(r[i], p[i], borrow)
	}

	// keep is all ones when the subtraction borrowed and zero otherwise.
	// NOTE(review): ctPick64 is presumed to return its second argument for an
	// all-ones selector and its third for a zero selector, i.e. r is kept when
	// r < p and replaced by r-p otherwise — confirm against ctPick64.
	keep := 0 - borrow
	for i := 0; i < limbs; i++ {
		r[i] = ctPick64(keep, r[i], diff[i])
	}
}
cSIDH-511: (#26) Implementation of Commutative Supersingular Isogeny Diffie-Hellman, based on the "A faster way to CSIDH" paper (2018/782). * For fast isogeny calculation, the implementation converts a curve from Montgomery to Edwards form. All calculations are done on the Edwards curve and the result is then converted back to Montgomery form. * As multiplication in the field Fp511 is the most expensive operation, the implementation contains multiple multiplication routines. The most performant one is an assembly implementation which uses BMI2 and ADOX/ADCX instructions on modern CPUs. It also contains a slower implementation which will run on older CPUs. * Benchmarks (Intel SkyLake): BenchmarkGeneratePrivate 6459 172213 ns/op 0 B/op 0 allocs/op BenchmarkGenerateKeyPair 25 45800356 ns/op 0 B/op 0 allocs/op BenchmarkValidate 297 3915983 ns/op 0 B/op 0 allocs/op BenchmarkValidateRandom 184683 6231 ns/op 0 B/op 0 allocs/op BenchmarkValidateGenerated 25 48481306 ns/op 0 B/op 0 allocs/op BenchmarkDerive 19 60928763 ns/op 0 B/op 0 allocs/op BenchmarkDeriveGenerated 8 137342421 ns/op 0 B/op 0 allocs/op BenchmarkXMul 2311 494267 ns/op 1 B/op 0 allocs/op BenchmarkXAdd 2396754 501 ns/op 0 B/op 0 allocs/op BenchmarkXDbl 2072690 571 ns/op 0 B/op 0 allocs/op BenchmarkIsom 78004 15171 ns/op 0 B/op 0 allocs/op BenchmarkFp512Sub 224635152 5.33 ns/op 0 B/op 0 allocs/op BenchmarkFp512Mul 246633255 4.90 ns/op 0 B/op 0 allocs/op BenchmarkCSwap 233228547 5.10 ns/op 0 B/op 0 allocs/op BenchmarkAddRdc 87348240 12.6 ns/op 0 B/op 0 allocs/op BenchmarkSubRdc 95112787 11.7 ns/op 0 B/op 0 allocs/op BenchmarkModExpRdc 25436 46878 ns/op 0 B/op 0 allocs/op BenchmarkMulBmiAsm 19527573 60.1 ns/op 0 B/op 0 allocs/op BenchmarkMulGeneric 7117650 164 ns/op 0 B/op 0 allocs/op * The Go code has very similar performance when compared to the C implementation. 
Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23) github.com:henrydcase/sidh-torture/csidh \| TestName \|Go \| C \| \|------------------\|----------\|----------\| \|TestSharedSecret \| 57.95774 \| 57.91092 \| \|TestKeyGeneration \| 62.23614 \| 58.12980 \| \|TestSharedSecret \| 55.28988 \| 57.23132 \| \|TestKeyGeneration \| 61.68745 \| 58.66396 \| \|TestSharedSecret \| 63.19408 \| 58.64774 \| \|TestKeyGeneration \| 62.34022 \| 61.62539 \| \|TestSharedSecret \| 62.85453 \| 68.74503 \| \|TestKeyGeneration \| 52.58518 \| 58.40115 \| \|TestSharedSecret \| 50.77081 \| 61.91699 \| \|TestKeyGeneration \| 59.91843 \| 61.09266 \| \|TestSharedSecret \| 59.97962 \| 62.98151 \| \|TestKeyGeneration \| 64.57525 \| 56.22863 \| \|TestSharedSecret \| 56.40521 \| 55.77447 \| \|TestKeyGeneration \| 67.85850 \| 58.52604 \| \|TestSharedSecret \| 60.54290 \| 65.14052 \| \|TestKeyGeneration \| 65.45766 \| 58.42823 \| On average Go implementation is 2% faster. 2019-11-24 03:39:35 +00:00			`// +build noasm arm64`

			`package csidh`

			`import "math/bits"`

			`func mul512(r, m1 *fp, m2 uint64) {`
			`var c, h, l uint64`

			`c, r[0] = bits.Mul64(m2, m1[0])`

			`h, l = bits.Mul64(m2, m1[1])`
			`r[1], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[2])`
			`r[2], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[3])`
			`r[3], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[4])`
			`r[4], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[5])`
			`r[5], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[6])`
			`r[6], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[7])`
			`r[7], _ = bits.Add64(l, c, 0)`
			`}`

			`func mul576(r *[9]uint64, m1 *fp, m2 uint64) {`
			`var c, h, l uint64`

			`c, r[0] = bits.Mul64(m2, m1[0])`

			`h, l = bits.Mul64(m2, m1[1])`
			`r[1], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[2])`
			`r[2], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[3])`
			`r[3], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[4])`
			`r[4], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[5])`
			`r[5], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[6])`
			`r[6], c = bits.Add64(l, c, 0)`
			`c = h + c`

			`h, l = bits.Mul64(m2, m1[7])`
			`r[7], c = bits.Add64(l, c, 0)`
			`r[8], c = bits.Add64(h, c, 0)`
			`r[8] += c`
			`}`

			`func cswap512(x, y *fp, choice uint8) {`
			`var tmp uint64`
			`mask64 := 0 - uint64(choice)`

			`for i := 0; i < numWords; i++ {`
			`tmp = mask64 & (x[i] ^ y[i])`
			`x[i] = tmp ^ x[i]`
			`y[i] = tmp ^ y[i]`
			`}`
			`}`

			`func mul(res, x, y *fp) {`
			`mulGeneric(res, x, y)`
			`}`

			`// mulRdc performs montgomery multiplication r = x * y mod P.`
			`// Returned result r is already reduced and in Montgomery domain.`
			`func mulRdc(r, x, y *fp) {`
			`var t fp`
			`var c uint64`

			`mulGeneric(r, x, y)`

			`// if p <= r < 2p then r = r-p`
			`t[0], c = bits.Sub64(r[0], p[0], 0)`
			`t[1], c = bits.Sub64(r[1], p[1], c)`
			`t[2], c = bits.Sub64(r[2], p[2], c)`
			`t[3], c = bits.Sub64(r[3], p[3], c)`
			`t[4], c = bits.Sub64(r[4], p[4], c)`
			`t[5], c = bits.Sub64(r[5], p[5], c)`
			`t[6], c = bits.Sub64(r[6], p[6], c)`
			`t[7], c = bits.Sub64(r[7], p[7], c)`

			`var w = uint64(0 - uint64(c))`
			`r[0] = ctPick64(w, r[0], t[0])`
			`r[1] = ctPick64(w, r[1], t[1])`
			`r[2] = ctPick64(w, r[2], t[2])`
			`r[3] = ctPick64(w, r[3], t[3])`
			`r[4] = ctPick64(w, r[4], t[4])`
			`r[5] = ctPick64(w, r[5], t[5])`
			`r[6] = ctPick64(w, r[6], t[6])`
			`r[7] = ctPick64(w, r[7], t[7])`
			`}`