1
0
mirror of https://github.com/henrydcase/nobs.git synced 2024-11-26 17:11:22 +00:00
nobs/dh/csidh/curve.go
Kris Kwiatkowski 7efbbf4745 cSIDH-511: (#26)
Implementation of Commutative Supersingular Isogeny Diffie Hellman,
based on "A faster way to CSIDH" paper (2018/782).

* For fast isogeny calculation, implementation converts a curve from
  Montgomery to Edwards. All calculations are done on Edwards curve
  and then converted back to Montgomery.
* As multiplication in a field Fp511 is most expensive operation
  the implementation contains multiple multiplications. It has
  most performant, assembly implementation which uses BMI2 and
  ADOX/ADCX instructions for modern CPUs. It also contains
  slower implementation which will run on older CPUs

* Benchmarks (Intel SkyLake):

  BenchmarkGeneratePrivate   	    6459	    172213 ns/op	       0 B/op	       0 allocs/op
  BenchmarkGenerateKeyPair   	      25	  45800356 ns/op	       0 B/op	       0 allocs/op
  BenchmarkValidate          	     297	   3915983 ns/op	       0 B/op	       0 allocs/op
  BenchmarkValidateRandom    	  184683	      6231 ns/op	       0 B/op	       0 allocs/op
  BenchmarkValidateGenerated 	      25	  48481306 ns/op	       0 B/op	       0 allocs/op
  BenchmarkDerive            	      19	  60928763 ns/op	       0 B/op	       0 allocs/op
  BenchmarkDeriveGenerated   	       8	 137342421 ns/op	       0 B/op	       0 allocs/op
  BenchmarkXMul              	    2311	    494267 ns/op	       1 B/op	       0 allocs/op
  BenchmarkXAdd              	 2396754	       501 ns/op	       0 B/op	       0 allocs/op
  BenchmarkXDbl              	 2072690	       571 ns/op	       0 B/op	       0 allocs/op
  BenchmarkIsom              	   78004	     15171 ns/op	       0 B/op	       0 allocs/op
  BenchmarkFp512Sub          	224635152	         5.33 ns/op	       0 B/op	       0 allocs/op
  BenchmarkFp512Mul          	246633255	         4.90 ns/op	       0 B/op	       0 allocs/op
  BenchmarkCSwap             	233228547	         5.10 ns/op	       0 B/op	       0 allocs/op
  BenchmarkAddRdc            	87348240	        12.6 ns/op	       0 B/op	       0 allocs/op
  BenchmarkSubRdc            	95112787	        11.7 ns/op	       0 B/op	       0 allocs/op
  BenchmarkModExpRdc         	   25436	     46878 ns/op	       0 B/op	       0 allocs/op
  BenchmarkMulBmiAsm         	19527573	        60.1 ns/op	       0 B/op	       0 allocs/op
  BenchmarkMulGeneric        	 7117650	       164 ns/op	       0 B/op	       0 allocs/op

* Go code has very similar performance when compared to C
  implementation.
  Results from sidh_torturer (4e2996e12d68364761064341cbe1d1b47efafe23)
  github.com:henrydcase/sidh-torture/csidh

  | TestName         |Go        | C        |
  |------------------|----------|----------|
  |TestSharedSecret  | 57.95774 | 57.91092 |
  |TestKeyGeneration | 62.23614 | 58.12980 |
  |TestSharedSecret  | 55.28988 | 57.23132 |
  |TestKeyGeneration | 61.68745 | 58.66396 |
  |TestSharedSecret  | 63.19408 | 58.64774 |
  |TestKeyGeneration | 62.34022 | 61.62539 |
  |TestSharedSecret  | 62.85453 | 68.74503 |
  |TestKeyGeneration | 52.58518 | 58.40115 |
  |TestSharedSecret  | 50.77081 | 61.91699 |
  |TestKeyGeneration | 59.91843 | 61.09266 |
  |TestSharedSecret  | 59.97962 | 62.98151 |
  |TestKeyGeneration | 64.57525 | 56.22863 |
  |TestSharedSecret  | 56.40521 | 55.77447 |
  |TestKeyGeneration | 67.85850 | 58.52604 |
  |TestSharedSecret  | 60.54290 | 65.14052 |
  |TestKeyGeneration | 65.45766 | 58.42823 |

  On average Go implementation is 2% faster.
2019-11-25 15:03:29 +00:00

204 lines
4.9 KiB
Go

package csidh
// Implements differential arithmetic in P^1 for montgomery
// curves a mapping: x(P),x(Q),x(P-Q) -> x(P+Q)
// PaQ = P + Q
// This algorithms is correctly defined only for cases when
// P!=inf, Q!=inf, P!=Q and P!=-Q
func xAdd(PaQ, P, Q, PdQ *point) {
var t0, t1, t2, t3 fp
addRdc(&t0, &P.x, &P.z)
subRdc(&t1, &P.x, &P.z)
addRdc(&t2, &Q.x, &Q.z)
subRdc(&t3, &Q.x, &Q.z)
mulRdc(&t0, &t0, &t3)
mulRdc(&t1, &t1, &t2)
addRdc(&t2, &t0, &t1)
subRdc(&t3, &t0, &t1)
mulRdc(&t2, &t2, &t2) // sqr
mulRdc(&t3, &t3, &t3) // sqr
mulRdc(&PaQ.x, &PdQ.z, &t2)
mulRdc(&PaQ.z, &PdQ.x, &t3)
}
// Q = 2*P on a montgomery curve E(x): x^3 + A*x^2 + x
// It is correctly defined for all P != inf
func xDbl(Q, P, A *point) {
var t0, t1, t2 fp
addRdc(&t0, &P.x, &P.z)
mulRdc(&t0, &t0, &t0) // sqr
subRdc(&t1, &P.x, &P.z)
mulRdc(&t1, &t1, &t1) // sqr
subRdc(&t2, &t0, &t1)
mulRdc(&t1, &four, &t1)
mulRdc(&t1, &t1, &A.z)
mulRdc(&Q.x, &t0, &t1)
addRdc(&t0, &A.z, &A.z)
addRdc(&t0, &t0, &A.x)
mulRdc(&t0, &t0, &t2)
addRdc(&t0, &t0, &t1)
mulRdc(&Q.z, &t0, &t2)
}
// PaP = 2*P; PaQ = P+Q
// PaP can override P and PaQ can override Q
func xDblAdd(PaP, PaQ, P, Q, PdQ *point, A24 *coeff) {
var t0, t1, t2 fp
addRdc(&t0, &P.x, &P.z)
subRdc(&t1, &P.x, &P.z)
mulRdc(&PaP.x, &t0, &t0)
subRdc(&t2, &Q.x, &Q.z)
addRdc(&PaQ.x, &Q.x, &Q.z)
mulRdc(&t0, &t0, &t2)
mulRdc(&PaP.z, &t1, &t1)
mulRdc(&t1, &t1, &PaQ.x)
subRdc(&t2, &PaP.x, &PaP.z)
mulRdc(&PaP.z, &PaP.z, &A24.c)
mulRdc(&PaP.x, &PaP.x, &PaP.z)
mulRdc(&PaQ.x, &A24.a, &t2)
subRdc(&PaQ.z, &t0, &t1)
addRdc(&PaP.z, &PaP.z, &PaQ.x)
addRdc(&PaQ.x, &t0, &t1)
mulRdc(&PaP.z, &PaP.z, &t2)
mulRdc(&PaQ.z, &PaQ.z, &PaQ.z)
mulRdc(&PaQ.x, &PaQ.x, &PaQ.x)
mulRdc(&PaQ.z, &PaQ.z, &PdQ.x)
mulRdc(&PaQ.x, &PaQ.x, &PdQ.z)
}
// Swap P1 with P2 in constant time. The 'choice'
// parameter must have a value of either 1 (results
// in swap) or 0 (results in no-swap).
func cswappoint(P1, P2 *point, choice uint8) {
cswap512(&P1.x, &P2.x, choice)
cswap512(&P1.z, &P2.z, choice)
}
// A uniform Montgomery ladder. co is A coefficient of
// x^3 + A*x^2 + x curve. k MUST be > 0
//
// kP = [k]P. xM=x(0 + k*P)
//
// non-constant time.
func xMul512(kP, P *point, co *coeff, k *fp) {
var A24 coeff
var Q point
var j uint
var A = point{x: co.a, z: co.c}
var R = *P
// Precompyte A24 = (A+2C:4C) => (A24.x = A.x+2A.z; A24.z = 4*A.z)
addRdc(&A24.a, &co.c, &co.c)
addRdc(&A24.a, &A24.a, &co.a)
mulRdc(&A24.c, &co.c, &four)
// Skip initial 0 bits.
for j = 511; j > 0; j-- {
// performance hit from making it constant-time is actually
// quite big, so... unsafe branch for now
if uint8(k[j>>6]>>(j&63)&1) != 0 {
break
}
}
xDbl(&Q, P, &A)
prevBit := uint8(1)
for i := j; i > 0; {
i--
bit := uint8(k[i>>6] >> (i & 63) & 1)
swap := prevBit ^ bit
prevBit = bit
cswappoint(&Q, &R, swap)
xDblAdd(&Q, &R, &Q, &R, P, &A24)
}
cswappoint(&Q, &R, uint8(k[0]&1))
*kP = Q
}
func isom(img *point, co *coeff, kern *point, order uint64) {
var t0, t1, t2, S, D fp
var Q, prod point
var coEd coeff
var M = [3]point{*kern}
// Compute twisted Edwards coefficients
// coEd.a = co.a + 2*co.c
// coEd.c = co.a - 2*co.c
// coEd.a*X^2 + Y^2 = 1 + coEd.c*X^2*Y^2
addRdc(&coEd.c, &co.c, &co.c)
addRdc(&coEd.a, &co.a, &coEd.c)
subRdc(&coEd.c, &co.a, &coEd.c)
// Transfer point to twisted Edwards YZ-coordinates
// (X:Z)->(Y:Z) = (X-Z : X+Z)
addRdc(&S, &img.x, &img.z)
subRdc(&D, &img.x, &img.z)
subRdc(&prod.x, &kern.x, &kern.z)
addRdc(&prod.z, &kern.x, &kern.z)
mulRdc(&t1, &prod.x, &S)
mulRdc(&t0, &prod.z, &D)
addRdc(&Q.x, &t0, &t1)
subRdc(&Q.z, &t0, &t1)
xDbl(&M[1], kern, &point{x: co.a, z: co.c})
// TODO: Not constant time.
for i := uint64(1); i < order>>1; i++ {
if i >= 2 {
xAdd(&M[i%3], &M[(i-1)%3], kern, &M[(i-2)%3])
}
subRdc(&t1, &M[i%3].x, &M[i%3].z)
addRdc(&t0, &M[i%3].x, &M[i%3].z)
mulRdc(&prod.x, &prod.x, &t1)
mulRdc(&prod.z, &prod.z, &t0)
mulRdc(&t1, &t1, &S)
mulRdc(&t0, &t0, &D)
addRdc(&t2, &t0, &t1)
mulRdc(&Q.x, &Q.x, &t2)
subRdc(&t2, &t0, &t1)
mulRdc(&Q.z, &Q.z, &t2)
}
mulRdc(&Q.x, &Q.x, &Q.x)
mulRdc(&Q.z, &Q.z, &Q.z)
mulRdc(&img.x, &img.x, &Q.x)
mulRdc(&img.z, &img.z, &Q.z)
// coEd.a^order and coEd.c^order
modExpRdc64(&coEd.a, &coEd.a, order)
modExpRdc64(&coEd.c, &coEd.c, order)
// prod^8
mulRdc(&prod.x, &prod.x, &prod.x)
mulRdc(&prod.x, &prod.x, &prod.x)
mulRdc(&prod.x, &prod.x, &prod.x)
mulRdc(&prod.z, &prod.z, &prod.z)
mulRdc(&prod.z, &prod.z, &prod.z)
mulRdc(&prod.z, &prod.z, &prod.z)
// Compute image curve params
mulRdc(&coEd.c, &coEd.c, &prod.x)
mulRdc(&coEd.a, &coEd.a, &prod.z)
// Convert curve coefficients back to Montgomery
addRdc(&co.a, &coEd.a, &coEd.c)
subRdc(&co.c, &coEd.a, &coEd.c)
addRdc(&co.a, &co.a, &co.a)
}
// evaluates x^3 + Ax^2 + x
func montEval(res, A, x *fp) {
var t fp
*res = *x
mulRdc(res, res, res)
mulRdc(&t, A, x)
addRdc(res, res, &t)
addRdc(res, res, &one)
mulRdc(res, res, x)
}