From c3a637c2ecba720bcad1cc633d79eeeec3ce9500 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Mon, 23 Jul 2018 15:55:16 +0100 Subject: [PATCH] p751: implements platform independent field arithmetic * Code is much slower than x86 specialized implementation * Cross checked on ARMv8 (32-bit) --- p751toolbox/field.go | 64 +++------ p751toolbox/field_amd64.s | 2 + p751toolbox/field_decl.go | 42 ++++++ p751toolbox/field_generic.go | 254 +++++++++++++++++++++++++++++++++++ p751toolbox/field_test.go | 39 ++++++ sidh/sidh.go | 12 +- sidh/sidh_amd64.s | 12 +- sidh/sidh_decl.go | 13 ++ sidh/sidh_generic.go | 52 +++++++ sidh/sidh_test.go | 15 ++- 10 files changed, 442 insertions(+), 63 deletions(-) create mode 100644 p751toolbox/field_decl.go create mode 100644 p751toolbox/field_generic.go create mode 100644 sidh/sidh_decl.go create mode 100644 sidh/sidh_generic.go diff --git a/p751toolbox/field.go b/p751toolbox/field.go index b5dfc7b..cfd6cee 100644 --- a/p751toolbox/field.go +++ b/p751toolbox/field.go @@ -24,6 +24,27 @@ var oneExtensionField = ExtensionFieldElement{ B: Fp751Element{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, } +// 2*p751 +var p751x2 = Fp751Element{ + 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, + 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xDD5FFFFFFFFFFFFF, + 0xC7D92D0A93F0F151, 0xB52B363427EF98ED, 0x109D30CFADD7D0ED, + 0x0AC56A08B964AE90, 0x1C25213F2F75B8CD, 0x0000DFCBAA83EE38} + +// p751 +var p751 = Fp751Element{ + 0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, + 0xffffffffffffffff, 0xffffffffffffffff, 0xeeafffffffffffff, + 0xe3ec968549f878a8, 0xda959b1a13f7cc76, 0x084e9867d6ebe876, + 0x8562b5045cb25748, 0x0e12909f97badc66, 0x00006fe5d541f71c} + +// p751 + 1 +var p751p1 = Fp751Element{ + 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, + 0x0000000000000000, 0x0000000000000000, 0xeeb0000000000000, + 0xe3ec968549f878a8, 0xda959b1a13f7cc76, 0x084e9867d6ebe876, + 0x8562b5045cb25748, 0x0e12909f97badc66, 0x00006fe5d541f71c} + // Set dest = 0. // // Returns dest to allow chaining operations. @@ -451,11 +472,6 @@ func (dest *PrimeFieldElement) P34(x *PrimeFieldElement) *PrimeFieldElement { const fp751NumWords = 12 -// (2^768) mod p. -// This can't be a constant because Go doesn't allow array constants, so try -// not to modify it. -var montgomeryR = Fp751Element{149933, 0, 0, 0, 0, 9444048418595930112, 6136068611055053926, 7599709743867700432, 14455912356952952366, 5522737203492907350, 1222606818372667369, 49869481633250} - // (2^768)^2 mod p // This can't be a constant because Go doesn't allow array constants, so try // not to modify it. @@ -472,44 +488,6 @@ type Fp751Element [fp751NumWords]uint64 // Represents an intermediate product of two elements of the base field F_p. type fp751X2 [2 * fp751NumWords]uint64 -// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. -// This function executes in constant time. -//go:noescape -func fp751ConditionalSwap(x, y *Fp751Element, choice uint8) - -// Compute z = x + y (mod p). -//go:noescape -func fp751AddReduced(z, x, y *Fp751Element) - -// Compute z = x - y (mod p). -//go:noescape -func fp751SubReduced(z, x, y *Fp751Element) - -// Compute z = x + y, without reducing mod p. -//go:noescape -func fp751AddLazy(z, x, y *Fp751Element) - -// Compute z = x + y, without reducing mod p. -//go:noescape -func fp751X2AddLazy(z, x, y *fp751X2) - -// Compute z = x - y, without reducing mod p. -//go:noescape -func fp751X2SubLazy(z, x, y *fp751X2) - -// Compute z = x * y. -//go:noescape -func fp751Mul(z *fp751X2, x, y *Fp751Element) - -// Perform Montgomery reduction: set z = x R^{-1} (mod p). -// Destroys the input value. -//go:noescape -func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2) - -// Reduce a field element in [0, 2*p) to one in [0,p). -//go:noescape -func fp751StrongReduce(x *Fp751Element) - func (x Fp751Element) vartimeEq(y Fp751Element) bool { fp751StrongReduce(&x) fp751StrongReduce(&y) diff --git a/p751toolbox/field_amd64.s b/p751toolbox/field_amd64.s index 6618987..4596d8f 100644 --- a/p751toolbox/field_amd64.s +++ b/p751toolbox/field_amd64.s @@ -1,3 +1,5 @@ +// +build amd64,!noasm + #include "textflag.h" // p751 + 1 diff --git a/p751toolbox/field_decl.go b/p751toolbox/field_decl.go new file mode 100644 index 0000000..37d462a --- /dev/null +++ b/p751toolbox/field_decl.go @@ -0,0 +1,42 @@ +// +build amd64,!noasm + +package p751toolbox + +// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. +// If choice is neither 0 nor 1 then behaviour is undefined. +// This function executes in constant time. +//go:noescape +func fp751ConditionalSwap(x, y *Fp751Element, choice uint8) + +// Compute z = x + y (mod p). +//go:noescape +func fp751AddReduced(z, x, y *Fp751Element) + +// Compute z = x - y (mod p). +//go:noescape +func fp751SubReduced(z, x, y *Fp751Element) + +// Compute z = x + y, without reducing mod p. +//go:noescape +func fp751AddLazy(z, x, y *Fp751Element) + +// Compute z = x + y, without reducing mod p. +//go:noescape +func fp751X2AddLazy(z, x, y *fp751X2) + +// Compute z = x - y, without reducing mod p. +//go:noescape +func fp751X2SubLazy(z, x, y *fp751X2) + +// Compute z = x * y. +//go:noescape +func fp751Mul(z *fp751X2, x, y *Fp751Element) + +// Perform Montgomery reduction: set z = x R^{-1} (mod 2*p). +// Destroys the input value. +//go:noescape +func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2) + +// Reduce a field element in [0, 2*p) to one in [0,p). +//go:noescape +func fp751StrongReduce(x *Fp751Element) diff --git a/p751toolbox/field_generic.go b/p751toolbox/field_generic.go new file mode 100644 index 0000000..757ed0a --- /dev/null +++ b/p751toolbox/field_generic.go @@ -0,0 +1,254 @@ +// +build noasm arm64 arm + +package p751toolbox + +// helper used for uint128 representation +type uint128 struct { + H, L uint64 +} + +// Adds 2 64bit digits in constant time. +// Returns result and carry (1 or 0) +func addc64(cin, a, b uint64) (ret, cout uint64) { + t := a + cin + ret = b + t + cout = ((a & b) | ((a | b) & (^ret))) >> 63 + return +} + +// Substracts 2 64bit digits in constant time. +// Returns result and borrow (1 or 0) +func subc64(bIn, a, b uint64) (ret, bOut uint64) { + var tmp1 = a - b + // Set bOut if bIn!=0 and tmp1==0 in constant time + bOut = bIn & (1 ^ ((tmp1 | uint64(0-tmp1)) >> 63)) + // Constant time check if x> 63 + ret = tmp1 - bIn + return +} + +// Multiplies 2 64bit digits in constant time +func mul64(a, b uint64) (res uint128) { + var al, bl, ah, bh, albl, albh, ahbl, ahbh uint64 + var res1, res2, res3 uint64 + var carry, maskL, maskH, temp uint64 + + maskL = (^maskL) >> 32 + maskH = ^maskL + + al = a & maskL + ah = a >> 32 + bl = b & maskL + bh = b >> 32 + + albl = al * bl + albh = al * bh + ahbl = ah * bl + ahbh = ah * bh + res.L = albl & maskL + + res1 = albl >> 32 + res2 = ahbl & maskL + res3 = albh & maskL + temp = res1 + res2 + res3 + carry = temp >> 32 + res.L ^= temp << 32 + + res1 = ahbl >> 32 + res2 = albh >> 32 + res3 = ahbh & maskL + temp = res1 + res2 + res3 + carry + res.H = temp & maskL + carry = temp & maskH + res.H ^= (ahbh & maskH) + carry + return +} + +// Compute z = x + y (mod p). +func fp751AddReduced(z, x, y *Fp751Element) { + var carry uint64 + + // z=x+y % p751 + for i := 0; i < fp751NumWords; i++ { + z[i], carry = addc64(carry, x[i], y[i]) + } + + // z = z - p751x2 + carry = 0 + for i := 0; i < fp751NumWords; i++ { + z[i], carry = subc64(carry, z[i], p751x2[i]) + } + + // z = z + p751x2 + mask := uint64(0 - carry) + carry = 0 + for i := 0; i < fp751NumWords; i++ { + z[i], carry = addc64(carry, z[i], p751x2[i]&mask) + } +} + +// Compute z = x - y (mod p). +func fp751SubReduced(z, x, y *Fp751Element) { + var borrow uint64 + + for i := 0; i < fp751NumWords; i++ { + z[i], borrow = subc64(borrow, x[i], y[i]) + } + + mask := uint64(0 - borrow) + borrow = 0 + + for i := 0; i < fp751NumWords; i++ { + z[i], borrow = addc64(borrow, z[i], p751x2[i]&mask) + } +} + +// Conditionally swaps bits in x and y in constant time. +// mask indicates bits to be swaped (set bits are swapped) +// For details see "Hackers Delight, 2.20" +// +// Implementation doesn't actually depend on a prime field. +func fp751ConditionalSwap(x, y *Fp751Element, mask uint8) { + var tmp, mask64 uint64 + + mask64 = 0 - uint64(mask) + for i := 0; i < len(x); i++ { + tmp = mask64 & (x[i] ^ y[i]) + x[i] = tmp ^ x[i] + y[i] = tmp ^ y[i] + } +} + +// Perform Montgomery reduction: set z = x R^{-1} (mod 2*p) +// with R=2^768. Destroys the input value. +func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2) { + var carry, t, u, v uint64 + var uv uint128 + var count int + + count = 5 // number of 0 digits in the least significat part of p751 + 1 + + for i := 0; i < fp751NumWords; i++ { + for j := 0; j < i; j++ { + if j < (i - count + 1) { + uv = mul64(z[j], p751p1[i-j]) + v, carry = addc64(0, uv.L, v) + u, carry = addc64(carry, uv.H, u) + t += carry + } + } + v, carry = addc64(0, v, x[i]) + u, carry = addc64(carry, u, 0) + t += carry + + z[i] = v + v = u + u = t + t = 0 + } + + for i := fp751NumWords; i < 2*fp751NumWords-1; i++ { + if count > 0 { + count-- + } + for j := i - fp751NumWords + 1; j < fp751NumWords; j++ { + if j < (fp751NumWords - count) { + uv = mul64(z[j], p751p1[i-j]) + v, carry = addc64(0, uv.L, v) + u, carry = addc64(carry, uv.H, u) + t += carry + } + } + v, carry = addc64(0, v, x[i]) + u, carry = addc64(carry, u, 0) + + t += carry + z[i-fp751NumWords] = v + v = u + u = t + t = 0 + } + v, carry = addc64(0, v, x[2*fp751NumWords-1]) + z[fp751NumWords-1] = v +} + +// Compute z = x * y. +func fp751Mul(z *fp751X2, x, y *Fp751Element) { + var u, v, t uint64 + var carry uint64 + var uv uint128 + + for i := uint64(0); i < fp751NumWords; i++ { + for j := uint64(0); j <= i; j++ { + uv = mul64(x[j], y[i-j]) + v, carry = addc64(0, uv.L, v) + u, carry = addc64(carry, uv.H, u) + t += carry + } + z[i] = v + v = u + u = t + t = 0 + } + + for i := fp751NumWords; i < (2*fp751NumWords)-1; i++ { + for j := i - fp751NumWords + 1; j < fp751NumWords; j++ { + uv = mul64(x[j], y[i-j]) + v, carry = addc64(0, uv.L, v) + u, carry = addc64(carry, uv.H, u) + t += carry + } + z[i] = v + v = u + u = t + t = 0 + } + z[2*fp751NumWords-1] = v +} + +// Compute z = x + y, without reducing mod p. +func fp751AddLazy(z, x, y *Fp751Element) { + var carry uint64 + for i := 0; i < fp751NumWords; i++ { + z[i], carry = addc64(carry, x[i], y[i]) + } +} + +// Compute z = x + y, without reducing mod p. +func fp751X2AddLazy(z, x, y *fp751X2) { + var carry uint64 + for i := 0; i < 2*fp751NumWords; i++ { + z[i], carry = addc64(carry, x[i], y[i]) + } +} + +// Reduce a field element in [0, 2*p) to one in [0,p). +func fp751StrongReduce(x *Fp751Element) { + var borrow, mask uint64 + for i := 0; i < fp751NumWords; i++ { + x[i], borrow = subc64(borrow, x[i], p751[i]) + } + + // Sets all bits if borrow = 1 + mask = 0 - borrow + borrow = 0 + for i := 0; i < fp751NumWords; i++ { + x[i], borrow = addc64(borrow, x[i], p751[i]&mask) + } +} + +// Compute z = x - y, without reducing mod p. +func fp751X2SubLazy(z, x, y *fp751X2) { + var borrow, mask uint64 + for i := 0; i < len(z); i++ { + z[i], borrow = subc64(borrow, x[i], y[i]) + } + + // Sets all bits if borrow = 1 + mask = 0 - borrow + borrow = 0 + for i := fp751NumWords; i < len(z); i++ { + z[i], borrow = addc64(borrow, z[i], p751[i-fp751NumWords]&mask) + } +} diff --git a/p751toolbox/field_test.go b/p751toolbox/field_test.go index ce3b058..f56300b 100644 --- a/p751toolbox/field_test.go +++ b/p751toolbox/field_test.go @@ -497,6 +497,8 @@ func BenchmarkPrimeFieldElementSub(b *testing.B) { } } +// --- field operation functions + func BenchmarkFp751Multiply(b *testing.B) { for n := 0; n < b.N; n++ { fp751Mul(&benchmarkFp751X2, &bench_x, &bench_y) @@ -525,3 +527,40 @@ func BenchmarkFp751SubReduced(b *testing.B) { fp751SubReduced(&benchmarkFp751Element, &bench_x, &bench_y) } } + +func BenchmarkFp751ConditionalSwap(b *testing.B) { + x, y := bench_x, bench_y + for n := 0; n < b.N; n++ { + fp751ConditionalSwap(&x, &y, 1) + fp751ConditionalSwap(&x, &y, 0) + } +} + +func BenchmarkFp751StrongReduce(b *testing.B) { + x := bench_x + for n := 0; n < b.N; n++ { + fp751StrongReduce(&x) + } +} + +func BenchmarkFp751AddLazy(b *testing.B) { + var z Fp751Element + x, y := bench_x, bench_y + for n := 0; n < b.N; n++ { + fp751AddLazy(&z, &x, &y) + } +} + +func BenchmarkFp751X2AddLazy(b *testing.B) { + x, y, z := bench_z, bench_z, bench_z + for n := 0; n < b.N; n++ { + fp751X2AddLazy(&x, &y, &z) + } +} + +func BenchmarkFp751X2SubLazy(b *testing.B) { + x, y, z := bench_z, bench_z, bench_z + for n := 0; n < b.N; n++ { + fp751X2SubLazy(&x, &y, &z) + } +} diff --git a/sidh/sidh.go b/sidh/sidh.go index 0bb5212..12109dd 100644 --- a/sidh/sidh.go +++ b/sidh/sidh.go @@ -9,16 +9,6 @@ import ( . "github.com/cloudflare/p751sidh/p751toolbox" ) -// Set result to zero if the input scalar is <= 3^238. scalar must be 48-byte array -// of bytes. This function is specific to P751. -//go:noescape -func checkLessThanThree238(scalar []byte) uint64 - -// Multiply 48-byte scalar by 3 to get a scalar in 3*[0,3^238). This -// function is specific to P751. -//go:noescape -func multiplyByThree(scalar []byte) - // ----------------------------------------------------------------------------- // Functions for traversing isogeny trees acoording to strategy. Key type 'A' is // @@ -196,7 +186,7 @@ func (prv *PrivateKey) generatePrivateKeyA(rand io.Reader) error { // shared secret computation. func (prv *PrivateKey) generatePrivateKeyB(rand io.Reader) error { // Perform rejection sampling to obtain a random value in [0,3^238]: - var ok uint64 + var ok uint8 for i := uint(0); i < prv.params.SampleRate; i++ { _, err := io.ReadFull(rand, prv.Scalar) if err != nil { diff --git a/sidh/sidh_amd64.s b/sidh/sidh_amd64.s index 4b1e4da..9d24181 100644 --- a/sidh/sidh_amd64.s +++ b/sidh/sidh_amd64.s @@ -10,9 +10,9 @@ #define THREE238M1_4 $0xb858a87e8f4222c7 #define THREE238M1_5 $0x254c9c6b525eaf5 -// Set result to zero if the input scalar is <= 3^238. scalar must be 48-byte array -// of bytes. -// func checkLessThanThree238(s_base uintptr, s_len uint, s_cap uint) uint64 +// Set result to zero if the input scalar is <= 3^238, otherwise result is 1. +// Scalar must be array of 48 bytes +// func checkLessThanThree238(s_base uintptr, s_len uint, s_cap uint) uint8 TEXT ·checkLessThanThree238(SB), NOSPLIT, $0-16 MOVQ scalar+0(FP), SI @@ -34,9 +34,9 @@ TEXT ·checkLessThanThree238(SB), NOSPLIT, $0-16 SBBQ 32(SI), R14 SBBQ 40(SI), R15 - // Save borrow flag indicating 3^238 - scalar < 0 as a mask in AX (eax) - SBBL $0, AX - MOVL AX, ret+24(FP) + // Save borrow flag indicating 3^238 - scalar < 0 as a mask in AX (rax) + ADCB $0, AX + MOVB AX, ret+24(FP) RET diff --git a/sidh/sidh_decl.go b/sidh/sidh_decl.go new file mode 100644 index 0000000..d3ec68e --- /dev/null +++ b/sidh/sidh_decl.go @@ -0,0 +1,13 @@ +// +build amd64,!noasm + +package sidh + +// Set result to zero if the input scalar is <= 3^238, otherwise result is 1. +// Scalar must be array of 48 bytes. This function is specific to P751. +//go:noescape +func checkLessThanThree238(scalar []byte) uint8 + +// Multiply 48-byte scalar by 3 to get a scalar in 3*[0,3^238). This +// function is specific to P751. +//go:noescape +func multiplyByThree(scalar []byte) diff --git a/sidh/sidh_generic.go b/sidh/sidh_generic.go new file mode 100644 index 0000000..ef45b93 --- /dev/null +++ b/sidh/sidh_generic.go @@ -0,0 +1,52 @@ +// +build !amd64 noasm + +package sidh + +var three238m1 = []uint8{ + 0xf8, 0x84, 0x83, 0x82, 0x8a, 0x71, 0xcd, 0xed, + 0x14, 0x7a, 0x42, 0xd4, 0xbf, 0x35, 0x3b, 0x73, + 0x38, 0xcf, 0xd7, 0x94, 0xcf, 0x29, 0x82, 0xf8, + 0xd6, 0x2a, 0x7c, 0x0c, 0x99, 0x6c, 0xc5, 0x63, + 0xc7, 0x22, 0x42, 0x8f, 0x7e, 0xa8, 0x58, 0xb8, + 0xf5, 0xea, 0x25, 0xb5, 0xc6, 0xc9, 0x54, 0x02} + +func addc8(cin, a, b uint8) (ret, cout uint8) { + t := a + cin + ret = b + t + cout = ((a & b) | ((a | b) & (^ret))) >> 7 + return +} + +func subc8(bIn, a, b uint8) (ret, bOut uint8) { + var tmp1 = a - b + ret = tmp1 - bIn + // Set bOut if bIn!=0 and tmp1==0 in constant time + bOut = bIn & (1 ^ ((tmp1 | uint8(0-tmp1)) >> 7)) + // Constant time check if a> 7 + return +} + +// Set result to zero if the input scalar is <= 3^238, otherwise result is 1. +// Scalar must be array of 48 bytes. This function is specific to P751. +func checkLessThanThree238(scalar []byte) uint8 { + var borrow uint8 + for i := 0; i < len(three238m1); i++ { + _, borrow = subc8(borrow, three238m1[i], scalar[i]) + } + return borrow +} + +// Multiply 48-byte scalar by 3 to get a scalar in 3*[0,3^238). This +// function is specific to P751. +func multiplyByThree(scalar []byte) { + var carry uint8 + var dbl [48]uint8 + + for i := 0; i < len(scalar); i++ { + dbl[i], carry = addc8(carry, scalar[i], scalar[i]) + } + for i := 0; i < len(scalar); i++ { + scalar[i], carry = addc8(carry, dbl[i], scalar[i]) + } +} diff --git a/sidh/sidh_test.go b/sidh/sidh_test.go index e0848df..d6f10d3 100644 --- a/sidh/sidh_test.go +++ b/sidh/sidh_test.go @@ -259,19 +259,28 @@ func TestCheckLessThanThree238(t *testing.T) { 212, 191, 53, 59, 115, 56, 207, 215, 148, 207, 41, 130, 248, 214, 42, 124, 12, 153, 108, 197, 99, 199, 34, 66, 143, 126, 168, 88, 184, 245, 234, 37, 181, 198, 201, 84, 2} + // makes second 64-bit digits bigger than in three238. checks if carries are correctly propagated + var three238plus2power65 = [48]byte{249, 132, 131, 130, 138, 113, 205, 237, 22, 122, + 66, 212, 191, 53, 59, 115, 56, 207, 215, 148, 207, 41, 130, 248, 214, 42, 124, 12, + 153, 108, 197, 99, 199, 34, 66, 143, 126, 168, 88, 184, 245, 234, 37, 181, 198, + 201, 84, 2} - var result uint64 + var result uint8 result = checkLessThanThree238(three238minus1[:]) if result != 0 { t.Error("expected 0, got", result) } result = checkLessThanThree238(three238[:]) - if result == 0 { + if result != 1 { t.Error("expected nonzero, got", result) } result = checkLessThanThree238(three238plus1[:]) - if result == 0 { + if result != 1 { + t.Error("expected nonzero, got", result) + } + result = checkLessThanThree238(three238plus2power65[:]) + if result != 1 { t.Error("expected nonzero, got", result) } }