* Code is much slower than x86 specialized implementation * Cross checked on ARMv8 (32-bit)trials/prep_p503_trial3
@@ -24,6 +24,27 @@ var oneExtensionField = ExtensionFieldElement{ | |||
B: Fp751Element{0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}, | |||
} | |||
// 2*p751 | |||
var p751x2 = Fp751Element{ | |||
0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, | |||
0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xDD5FFFFFFFFFFFFF, | |||
0xC7D92D0A93F0F151, 0xB52B363427EF98ED, 0x109D30CFADD7D0ED, | |||
0x0AC56A08B964AE90, 0x1C25213F2F75B8CD, 0x0000DFCBAA83EE38} | |||
// p751 | |||
var p751 = Fp751Element{ | |||
0xffffffffffffffff, 0xffffffffffffffff, 0xffffffffffffffff, | |||
0xffffffffffffffff, 0xffffffffffffffff, 0xeeafffffffffffff, | |||
0xe3ec968549f878a8, 0xda959b1a13f7cc76, 0x084e9867d6ebe876, | |||
0x8562b5045cb25748, 0x0e12909f97badc66, 0x00006fe5d541f71c} | |||
// p751 + 1 | |||
var p751p1 = Fp751Element{ | |||
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, | |||
0x0000000000000000, 0x0000000000000000, 0xeeb0000000000000, | |||
0xe3ec968549f878a8, 0xda959b1a13f7cc76, 0x084e9867d6ebe876, | |||
0x8562b5045cb25748, 0x0e12909f97badc66, 0x00006fe5d541f71c} | |||
// Set dest = 0. | |||
// | |||
// Returns dest to allow chaining operations. | |||
@@ -451,11 +472,6 @@ func (dest *PrimeFieldElement) P34(x *PrimeFieldElement) *PrimeFieldElement { | |||
const fp751NumWords = 12 | |||
// (2^768) mod p. | |||
// This can't be a constant because Go doesn't allow array constants, so try | |||
// not to modify it. | |||
var montgomeryR = Fp751Element{149933, 0, 0, 0, 0, 9444048418595930112, 6136068611055053926, 7599709743867700432, 14455912356952952366, 5522737203492907350, 1222606818372667369, 49869481633250} | |||
// (2^768)^2 mod p | |||
// This can't be a constant because Go doesn't allow array constants, so try | |||
// not to modify it. | |||
@@ -472,44 +488,6 @@ type Fp751Element [fp751NumWords]uint64 | |||
// Represents an intermediate product of two elements of the base field F_p. | |||
type fp751X2 [2 * fp751NumWords]uint64 | |||
// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. | |||
// This function executes in constant time. | |||
//go:noescape | |||
func fp751ConditionalSwap(x, y *Fp751Element, choice uint8) | |||
// Compute z = x + y (mod p). | |||
//go:noescape | |||
func fp751AddReduced(z, x, y *Fp751Element) | |||
// Compute z = x - y (mod p). | |||
//go:noescape | |||
func fp751SubReduced(z, x, y *Fp751Element) | |||
// Compute z = x + y, without reducing mod p. | |||
//go:noescape | |||
func fp751AddLazy(z, x, y *Fp751Element) | |||
// Compute z = x + y, without reducing mod p. | |||
//go:noescape | |||
func fp751X2AddLazy(z, x, y *fp751X2) | |||
// Compute z = x - y, without reducing mod p. | |||
//go:noescape | |||
func fp751X2SubLazy(z, x, y *fp751X2) | |||
// Compute z = x * y. | |||
//go:noescape | |||
func fp751Mul(z *fp751X2, x, y *Fp751Element) | |||
// Perform Montgomery reduction: set z = x R^{-1} (mod p). | |||
// Destroys the input value. | |||
//go:noescape | |||
func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2) | |||
// Reduce a field element in [0, 2*p) to one in [0,p). | |||
//go:noescape | |||
func fp751StrongReduce(x *Fp751Element) | |||
func (x Fp751Element) vartimeEq(y Fp751Element) bool { | |||
fp751StrongReduce(&x) | |||
fp751StrongReduce(&y) | |||
@@ -1,3 +1,5 @@ | |||
// +build amd64,!noasm | |||
#include "textflag.h" | |||
// p751 + 1 | |||
@@ -0,0 +1,42 @@ | |||
// +build amd64,!noasm | |||
package p751toolbox | |||
// If choice = 0, leave x,y unchanged. If choice = 1, set x,y = y,x. | |||
// If choice is neither 0 nor 1 then behaviour is undefined. | |||
// This function executes in constant time. | |||
//go:noescape | |||
func fp751ConditionalSwap(x, y *Fp751Element, choice uint8) | |||
// Compute z = x + y (mod p). | |||
//go:noescape | |||
func fp751AddReduced(z, x, y *Fp751Element) | |||
// Compute z = x - y (mod p). | |||
//go:noescape | |||
func fp751SubReduced(z, x, y *Fp751Element) | |||
// Compute z = x + y, without reducing mod p. | |||
//go:noescape | |||
func fp751AddLazy(z, x, y *Fp751Element) | |||
// Compute z = x + y, without reducing mod p. | |||
//go:noescape | |||
func fp751X2AddLazy(z, x, y *fp751X2) | |||
// Compute z = x - y, without reducing mod p. | |||
//go:noescape | |||
func fp751X2SubLazy(z, x, y *fp751X2) | |||
// Compute z = x * y. | |||
//go:noescape | |||
func fp751Mul(z *fp751X2, x, y *Fp751Element) | |||
// Perform Montgomery reduction: set z = x R^{-1} (mod 2*p). | |||
// Destroys the input value. | |||
//go:noescape | |||
func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2) | |||
// Reduce a field element in [0, 2*p) to one in [0,p). | |||
//go:noescape | |||
func fp751StrongReduce(x *Fp751Element) |
@@ -0,0 +1,254 @@ | |||
// +build noasm arm64 arm | |||
package p751toolbox | |||
// helper used for uint128 representation | |||
type uint128 struct { | |||
H, L uint64 | |||
} | |||
// Adds 2 64bit digits in constant time. | |||
// Returns result and carry (1 or 0) | |||
func addc64(cin, a, b uint64) (ret, cout uint64) { | |||
t := a + cin | |||
ret = b + t | |||
cout = ((a & b) | ((a | b) & (^ret))) >> 63 | |||
return | |||
} | |||
// Substracts 2 64bit digits in constant time. | |||
// Returns result and borrow (1 or 0) | |||
func subc64(bIn, a, b uint64) (ret, bOut uint64) { | |||
var tmp1 = a - b | |||
// Set bOut if bIn!=0 and tmp1==0 in constant time | |||
bOut = bIn & (1 ^ ((tmp1 | uint64(0-tmp1)) >> 63)) | |||
// Constant time check if x<y | |||
bOut |= (a ^ ((a ^ b) | (uint64(a-b) ^ b))) >> 63 | |||
ret = tmp1 - bIn | |||
return | |||
} | |||
// Multiplies 2 64bit digits in constant time | |||
func mul64(a, b uint64) (res uint128) { | |||
var al, bl, ah, bh, albl, albh, ahbl, ahbh uint64 | |||
var res1, res2, res3 uint64 | |||
var carry, maskL, maskH, temp uint64 | |||
maskL = (^maskL) >> 32 | |||
maskH = ^maskL | |||
al = a & maskL | |||
ah = a >> 32 | |||
bl = b & maskL | |||
bh = b >> 32 | |||
albl = al * bl | |||
albh = al * bh | |||
ahbl = ah * bl | |||
ahbh = ah * bh | |||
res.L = albl & maskL | |||
res1 = albl >> 32 | |||
res2 = ahbl & maskL | |||
res3 = albh & maskL | |||
temp = res1 + res2 + res3 | |||
carry = temp >> 32 | |||
res.L ^= temp << 32 | |||
res1 = ahbl >> 32 | |||
res2 = albh >> 32 | |||
res3 = ahbh & maskL | |||
temp = res1 + res2 + res3 + carry | |||
res.H = temp & maskL | |||
carry = temp & maskH | |||
res.H ^= (ahbh & maskH) + carry | |||
return | |||
} | |||
// Compute z = x + y (mod p). | |||
func fp751AddReduced(z, x, y *Fp751Element) { | |||
var carry uint64 | |||
// z=x+y % p751 | |||
for i := 0; i < fp751NumWords; i++ { | |||
z[i], carry = addc64(carry, x[i], y[i]) | |||
} | |||
// z = z - p751x2 | |||
carry = 0 | |||
for i := 0; i < fp751NumWords; i++ { | |||
z[i], carry = subc64(carry, z[i], p751x2[i]) | |||
} | |||
// z = z + p751x2 | |||
mask := uint64(0 - carry) | |||
carry = 0 | |||
for i := 0; i < fp751NumWords; i++ { | |||
z[i], carry = addc64(carry, z[i], p751x2[i]&mask) | |||
} | |||
} | |||
// Compute z = x - y (mod p). | |||
func fp751SubReduced(z, x, y *Fp751Element) { | |||
var borrow uint64 | |||
for i := 0; i < fp751NumWords; i++ { | |||
z[i], borrow = subc64(borrow, x[i], y[i]) | |||
} | |||
mask := uint64(0 - borrow) | |||
borrow = 0 | |||
for i := 0; i < fp751NumWords; i++ { | |||
z[i], borrow = addc64(borrow, z[i], p751x2[i]&mask) | |||
} | |||
} | |||
// Conditionally swaps bits in x and y in constant time. | |||
// mask indicates bits to be swaped (set bits are swapped) | |||
// For details see "Hackers Delight, 2.20" | |||
// | |||
// Implementation doesn't actually depend on a prime field. | |||
func fp751ConditionalSwap(x, y *Fp751Element, mask uint8) { | |||
var tmp, mask64 uint64 | |||
mask64 = 0 - uint64(mask) | |||
for i := 0; i < len(x); i++ { | |||
tmp = mask64 & (x[i] ^ y[i]) | |||
x[i] = tmp ^ x[i] | |||
y[i] = tmp ^ y[i] | |||
} | |||
} | |||
// Perform Montgomery reduction: set z = x R^{-1} (mod 2*p) | |||
// with R=2^768. Destroys the input value. | |||
func fp751MontgomeryReduce(z *Fp751Element, x *fp751X2) { | |||
var carry, t, u, v uint64 | |||
var uv uint128 | |||
var count int | |||
count = 5 // number of 0 digits in the least significat part of p751 + 1 | |||
for i := 0; i < fp751NumWords; i++ { | |||
for j := 0; j < i; j++ { | |||
if j < (i - count + 1) { | |||
uv = mul64(z[j], p751p1[i-j]) | |||
v, carry = addc64(0, uv.L, v) | |||
u, carry = addc64(carry, uv.H, u) | |||
t += carry | |||
} | |||
} | |||
v, carry = addc64(0, v, x[i]) | |||
u, carry = addc64(carry, u, 0) | |||
t += carry | |||
z[i] = v | |||
v = u | |||
u = t | |||
t = 0 | |||
} | |||
for i := fp751NumWords; i < 2*fp751NumWords-1; i++ { | |||
if count > 0 { | |||
count-- | |||
} | |||
for j := i - fp751NumWords + 1; j < fp751NumWords; j++ { | |||
if j < (fp751NumWords - count) { | |||
uv = mul64(z[j], p751p1[i-j]) | |||
v, carry = addc64(0, uv.L, v) | |||
u, carry = addc64(carry, uv.H, u) | |||
t += carry | |||
} | |||
} | |||
v, carry = addc64(0, v, x[i]) | |||
u, carry = addc64(carry, u, 0) | |||
t += carry | |||
z[i-fp751NumWords] = v | |||
v = u | |||
u = t | |||
t = 0 | |||
} | |||
v, carry = addc64(0, v, x[2*fp751NumWords-1]) | |||
z[fp751NumWords-1] = v | |||
} | |||
// Compute z = x * y. | |||
func fp751Mul(z *fp751X2, x, y *Fp751Element) { | |||
var u, v, t uint64 | |||
var carry uint64 | |||
var uv uint128 | |||
for i := uint64(0); i < fp751NumWords; i++ { | |||
for j := uint64(0); j <= i; j++ { | |||
uv = mul64(x[j], y[i-j]) | |||
v, carry = addc64(0, uv.L, v) | |||
u, carry = addc64(carry, uv.H, u) | |||
t += carry | |||
} | |||
z[i] = v | |||
v = u | |||
u = t | |||
t = 0 | |||
} | |||
for i := fp751NumWords; i < (2*fp751NumWords)-1; i++ { | |||
for j := i - fp751NumWords + 1; j < fp751NumWords; j++ { | |||
uv = mul64(x[j], y[i-j]) | |||
v, carry = addc64(0, uv.L, v) | |||
u, carry = addc64(carry, uv.H, u) | |||
t += carry | |||
} | |||
z[i] = v | |||
v = u | |||
u = t | |||
t = 0 | |||
} | |||
z[2*fp751NumWords-1] = v | |||
} | |||
// Compute z = x + y, without reducing mod p. | |||
func fp751AddLazy(z, x, y *Fp751Element) { | |||
var carry uint64 | |||
for i := 0; i < fp751NumWords; i++ { | |||
z[i], carry = addc64(carry, x[i], y[i]) | |||
} | |||
} | |||
// Compute z = x + y, without reducing mod p. | |||
func fp751X2AddLazy(z, x, y *fp751X2) { | |||
var carry uint64 | |||
for i := 0; i < 2*fp751NumWords; i++ { | |||
z[i], carry = addc64(carry, x[i], y[i]) | |||
} | |||
} | |||
// Reduce a field element in [0, 2*p) to one in [0,p). | |||
func fp751StrongReduce(x *Fp751Element) { | |||
var borrow, mask uint64 | |||
for i := 0; i < fp751NumWords; i++ { | |||
x[i], borrow = subc64(borrow, x[i], p751[i]) | |||
} | |||
// Sets all bits if borrow = 1 | |||
mask = 0 - borrow | |||
borrow = 0 | |||
for i := 0; i < fp751NumWords; i++ { | |||
x[i], borrow = addc64(borrow, x[i], p751[i]&mask) | |||
} | |||
} | |||
// Compute z = x - y, without reducing mod p. | |||
func fp751X2SubLazy(z, x, y *fp751X2) { | |||
var borrow, mask uint64 | |||
for i := 0; i < len(z); i++ { | |||
z[i], borrow = subc64(borrow, x[i], y[i]) | |||
} | |||
// Sets all bits if borrow = 1 | |||
mask = 0 - borrow | |||
borrow = 0 | |||
for i := fp751NumWords; i < len(z); i++ { | |||
z[i], borrow = addc64(borrow, z[i], p751[i-fp751NumWords]&mask) | |||
} | |||
} |
@@ -497,6 +497,8 @@ func BenchmarkPrimeFieldElementSub(b *testing.B) { | |||
} | |||
} | |||
// --- field operation functions | |||
func BenchmarkFp751Multiply(b *testing.B) { | |||
for n := 0; n < b.N; n++ { | |||
fp751Mul(&benchmarkFp751X2, &bench_x, &bench_y) | |||
@@ -525,3 +527,40 @@ func BenchmarkFp751SubReduced(b *testing.B) { | |||
fp751SubReduced(&benchmarkFp751Element, &bench_x, &bench_y) | |||
} | |||
} | |||
func BenchmarkFp751ConditionalSwap(b *testing.B) { | |||
x, y := bench_x, bench_y | |||
for n := 0; n < b.N; n++ { | |||
fp751ConditionalSwap(&x, &y, 1) | |||
fp751ConditionalSwap(&x, &y, 0) | |||
} | |||
} | |||
func BenchmarkFp751StrongReduce(b *testing.B) { | |||
x := bench_x | |||
for n := 0; n < b.N; n++ { | |||
fp751StrongReduce(&x) | |||
} | |||
} | |||
func BenchmarkFp751AddLazy(b *testing.B) { | |||
var z Fp751Element | |||
x, y := bench_x, bench_y | |||
for n := 0; n < b.N; n++ { | |||
fp751AddLazy(&z, &x, &y) | |||
} | |||
} | |||
func BenchmarkFp751X2AddLazy(b *testing.B) { | |||
x, y, z := bench_z, bench_z, bench_z | |||
for n := 0; n < b.N; n++ { | |||
fp751X2AddLazy(&x, &y, &z) | |||
} | |||
} | |||
func BenchmarkFp751X2SubLazy(b *testing.B) { | |||
x, y, z := bench_z, bench_z, bench_z | |||
for n := 0; n < b.N; n++ { | |||
fp751X2SubLazy(&x, &y, &z) | |||
} | |||
} |
@@ -9,16 +9,6 @@ import ( | |||
. "github.com/cloudflare/p751sidh/p751toolbox" | |||
) | |||
// Set result to zero if the input scalar is <= 3^238. scalar must be 48-byte array | |||
// of bytes. This function is specific to P751. | |||
//go:noescape | |||
func checkLessThanThree238(scalar []byte) uint64 | |||
// Multiply 48-byte scalar by 3 to get a scalar in 3*[0,3^238). This | |||
// function is specific to P751. | |||
//go:noescape | |||
func multiplyByThree(scalar []byte) | |||
// ----------------------------------------------------------------------------- | |||
// Functions for traversing isogeny trees acoording to strategy. Key type 'A' is | |||
// | |||
@@ -196,7 +186,7 @@ func (prv *PrivateKey) generatePrivateKeyA(rand io.Reader) error { | |||
// shared secret computation. | |||
func (prv *PrivateKey) generatePrivateKeyB(rand io.Reader) error { | |||
// Perform rejection sampling to obtain a random value in [0,3^238]: | |||
var ok uint64 | |||
var ok uint8 | |||
for i := uint(0); i < prv.params.SampleRate; i++ { | |||
_, err := io.ReadFull(rand, prv.Scalar) | |||
if err != nil { | |||
@@ -10,9 +10,9 @@ | |||
#define THREE238M1_4 $0xb858a87e8f4222c7 | |||
#define THREE238M1_5 $0x254c9c6b525eaf5 | |||
// Set result to zero if the input scalar is <= 3^238. scalar must be 48-byte array | |||
// of bytes. | |||
// func checkLessThanThree238(s_base uintptr, s_len uint, s_cap uint) uint64 | |||
// Set result to zero if the input scalar is <= 3^238, otherwise result is 1. | |||
// Scalar must be array of 48 bytes | |||
// func checkLessThanThree238(s_base uintptr, s_len uint, s_cap uint) uint8 | |||
TEXT ·checkLessThanThree238(SB), NOSPLIT, $0-16 | |||
MOVQ scalar+0(FP), SI | |||
@@ -34,9 +34,9 @@ TEXT ·checkLessThanThree238(SB), NOSPLIT, $0-16 | |||
SBBQ 32(SI), R14 | |||
SBBQ 40(SI), R15 | |||
// Save borrow flag indicating 3^238 - scalar < 0 as a mask in AX (eax) | |||
SBBL $0, AX | |||
MOVL AX, ret+24(FP) | |||
// Save borrow flag indicating 3^238 - scalar < 0 as a mask in AX (rax) | |||
ADCB $0, AX | |||
MOVB AX, ret+24(FP) | |||
RET | |||
@@ -0,0 +1,13 @@ | |||
// +build amd64,!noasm | |||
package sidh | |||
// Set result to zero if the input scalar is <= 3^238, otherwise result is 1. | |||
// Scalar must be array of 48 bytes. This function is specific to P751. | |||
//go:noescape | |||
func checkLessThanThree238(scalar []byte) uint8 | |||
// Multiply 48-byte scalar by 3 to get a scalar in 3*[0,3^238). This | |||
// function is specific to P751. | |||
//go:noescape | |||
func multiplyByThree(scalar []byte) |
@@ -0,0 +1,52 @@ | |||
// +build !amd64 noasm | |||
package sidh | |||
var three238m1 = []uint8{ | |||
0xf8, 0x84, 0x83, 0x82, 0x8a, 0x71, 0xcd, 0xed, | |||
0x14, 0x7a, 0x42, 0xd4, 0xbf, 0x35, 0x3b, 0x73, | |||
0x38, 0xcf, 0xd7, 0x94, 0xcf, 0x29, 0x82, 0xf8, | |||
0xd6, 0x2a, 0x7c, 0x0c, 0x99, 0x6c, 0xc5, 0x63, | |||
0xc7, 0x22, 0x42, 0x8f, 0x7e, 0xa8, 0x58, 0xb8, | |||
0xf5, 0xea, 0x25, 0xb5, 0xc6, 0xc9, 0x54, 0x02} | |||
func addc8(cin, a, b uint8) (ret, cout uint8) { | |||
t := a + cin | |||
ret = b + t | |||
cout = ((a & b) | ((a | b) & (^ret))) >> 7 | |||
return | |||
} | |||
func subc8(bIn, a, b uint8) (ret, bOut uint8) { | |||
var tmp1 = a - b | |||
ret = tmp1 - bIn | |||
// Set bOut if bIn!=0 and tmp1==0 in constant time | |||
bOut = bIn & (1 ^ ((tmp1 | uint8(0-tmp1)) >> 7)) | |||
// Constant time check if a<b | |||
bOut |= (a ^ ((a ^ b) | (uint8(a-b) ^ b))) >> 7 | |||
return | |||
} | |||
// Set result to zero if the input scalar is <= 3^238, otherwise result is 1. | |||
// Scalar must be array of 48 bytes. This function is specific to P751. | |||
func checkLessThanThree238(scalar []byte) uint8 { | |||
var borrow uint8 | |||
for i := 0; i < len(three238m1); i++ { | |||
_, borrow = subc8(borrow, three238m1[i], scalar[i]) | |||
} | |||
return borrow | |||
} | |||
// Multiply 48-byte scalar by 3 to get a scalar in 3*[0,3^238). This | |||
// function is specific to P751. | |||
func multiplyByThree(scalar []byte) { | |||
var carry uint8 | |||
var dbl [48]uint8 | |||
for i := 0; i < len(scalar); i++ { | |||
dbl[i], carry = addc8(carry, scalar[i], scalar[i]) | |||
} | |||
for i := 0; i < len(scalar); i++ { | |||
scalar[i], carry = addc8(carry, dbl[i], scalar[i]) | |||
} | |||
} |
@@ -259,19 +259,28 @@ func TestCheckLessThanThree238(t *testing.T) { | |||
212, 191, 53, 59, 115, 56, 207, 215, 148, 207, 41, 130, 248, 214, 42, 124, 12, | |||
153, 108, 197, 99, 199, 34, 66, 143, 126, 168, 88, 184, 245, 234, 37, 181, 198, | |||
201, 84, 2} | |||
// makes second 64-bit digits bigger than in three238. checks if carries are correctly propagated | |||
var three238plus2power65 = [48]byte{249, 132, 131, 130, 138, 113, 205, 237, 22, 122, | |||
66, 212, 191, 53, 59, 115, 56, 207, 215, 148, 207, 41, 130, 248, 214, 42, 124, 12, | |||
153, 108, 197, 99, 199, 34, 66, 143, 126, 168, 88, 184, 245, 234, 37, 181, 198, | |||
201, 84, 2} | |||
var result uint64 | |||
var result uint8 | |||
result = checkLessThanThree238(three238minus1[:]) | |||
if result != 0 { | |||
t.Error("expected 0, got", result) | |||
} | |||
result = checkLessThanThree238(three238[:]) | |||
if result == 0 { | |||
if result != 1 { | |||
t.Error("expected nonzero, got", result) | |||
} | |||
result = checkLessThanThree238(three238plus1[:]) | |||
if result == 0 { | |||
if result != 1 { | |||
t.Error("expected nonzero, got", result) | |||
} | |||
result = checkLessThanThree238(three238plus2power65[:]) | |||
if result != 1 { | |||
t.Error("expected nonzero, got", result) | |||
} | |||
} | |||