@@ -0,0 +1,27 @@ | |||
all: | |||
@cc \ | |||
-std=c99 -pedantic \ | |||
-Wall -Wextra \ | |||
-O2 -funroll-loops \ | |||
rng.c \ | |||
u512.s fp.s \ | |||
mont.c \ | |||
csidh.c \ | |||
main.c \ | |||
-o main | |||
debug: | |||
cc \ | |||
-std=c99 -pedantic \ | |||
-Wall -Wextra \ | |||
-g \ | |||
rng.c \ | |||
u512.s fp.s \ | |||
mont.c \ | |||
csidh.c \ | |||
main.c \ | |||
-o main | |||
clean: | |||
rm -f main | |||
@@ -0,0 +1,54 @@ | |||
#include <stdlib.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
#include <time.h> | |||
#include <assert.h> | |||
#include "u512.h" | |||
#include "fp.h" | |||
#include "mont.h" | |||
#include "csidh.h" | |||
#include <inttypes.h> | |||
static __inline__ uint64_t rdtsc(void) | |||
{ | |||
uint32_t hi, lo; | |||
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); | |||
return lo | (uint64_t) hi << 32; | |||
} | |||
unsigned long its = 10000; | |||
int main() | |||
{ | |||
clock_t t0, t1, time = 0; | |||
uint64_t c0, c1, cycles = 0; | |||
private_key priv; | |||
public_key pub = base; | |||
for (unsigned long i = 0; i < its; ++i) { | |||
csidh_private(&priv); | |||
t0 = clock(); | |||
c0 = rdtsc(); | |||
/**************************************/ | |||
assert(validate(&pub)); | |||
action(&pub, &pub, &priv); | |||
/**************************************/ | |||
c1 = rdtsc(); | |||
t1 = clock(); | |||
cycles += c1 - c0; | |||
time += t1 - t0; | |||
} | |||
printf("iterations: %lu\n", its); | |||
printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its); | |||
printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its); | |||
} | |||
@@ -0,0 +1,220 @@ | |||
#include <string.h> | |||
#include <assert.h> | |||
#include "csidh.h" | |||
#include "rng.h" | |||
/* specific to p, should perhaps be somewhere else */ | |||
const unsigned primes[num_primes] = { | |||
3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, | |||
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, | |||
139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, | |||
229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, | |||
317, 331, 337, 347, 349, 353, 359, 367, 373, 587, | |||
}; | |||
const u512 four_sqrt_p = {{ | |||
0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964, | |||
}}; | |||
const public_key base = {0}; /* A = 0 */ | |||
void csidh_private(private_key *priv) | |||
{ | |||
memset(&priv->e, 0, sizeof(priv->e)); | |||
for (size_t i = 0; i < num_primes; ) { | |||
int8_t buf[64]; | |||
randombytes(buf, sizeof(buf)); | |||
for (size_t j = 0; j < sizeof(buf); ++j) { | |||
if (buf[j] <= max_exponent && buf[j] >= -max_exponent) { | |||
priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4; | |||
if (++i >= num_primes) | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
/* compute [(p+1)/l] P for all l in our list of primes. */ | |||
/* divide and conquer is much faster than doing it naively, | |||
* but uses more memory. */ | |||
static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper) | |||
{ | |||
assert(lower < upper); | |||
if (upper - lower == 1) | |||
return; | |||
size_t mid = lower + (upper - lower + 1) / 2; | |||
u512 cl = u512_1, cu = u512_1; | |||
for (size_t i = lower; i < mid; ++i) | |||
u512_mul3_64(&cu, &cu, primes[i]); | |||
for (size_t i = mid; i < upper; ++i) | |||
u512_mul3_64(&cl, &cl, primes[i]); | |||
xMUL(&P[mid], A, &P[lower], &cu); | |||
xMUL(&P[lower], A, &P[lower], &cl); | |||
cofactor_multiples(P, A, lower, mid); | |||
cofactor_multiples(P, A, mid, upper); | |||
} | |||
/* never accepts invalid keys. */ | |||
bool validate(public_key const *in) | |||
{ | |||
const proj A = {in->A, fp_1}; | |||
do { | |||
proj P[num_primes]; | |||
fp_random(&P->x); | |||
P->z = fp_1; | |||
/* maximal 2-power in p+1 */ | |||
xDBL(P, &A, P); | |||
xDBL(P, &A, P); | |||
cofactor_multiples(P, &A, 0, num_primes); | |||
u512 order = u512_1; | |||
for (size_t i = num_primes - 1; i < num_primes; --i) { | |||
/* we only gain information if [(p+1)/l] P is non-zero */ | |||
if (memcmp(&P[i].z, &fp_0, sizeof(fp))) { | |||
u512 tmp; | |||
u512_set(&tmp, primes[i]); | |||
xMUL(&P[i], &A, &P[i], &tmp); | |||
if (memcmp(&P[i].z, &fp_0, sizeof(fp))) | |||
/* P does not have order dividing p+1. */ | |||
return false; | |||
u512_mul3_64(&order, &order, primes[i]); | |||
if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */ | |||
/* order > 4 sqrt(p), hence definitely supersingular */ | |||
return true; | |||
} | |||
} | |||
/* P didn't have big enough order to prove supersingularity. */ | |||
} while (1); | |||
} | |||
/* compute x^3 + Ax^2 + x */ | |||
static void montgomery_rhs(fp *rhs, fp const *A, fp const *x) | |||
{ | |||
fp tmp; | |||
*rhs = *x; | |||
fp_sq1(rhs); | |||
fp_mul3(&tmp, A, x); | |||
fp_add2(rhs, &tmp); | |||
fp_add2(rhs, &fp_1); | |||
fp_mul2(rhs, x); | |||
} | |||
/* totally not constant-time. */ | |||
void action(public_key *out, public_key const *in, private_key const *priv) | |||
{ | |||
u512 k[2]; | |||
u512_set(&k[0], 4); /* maximal 2-power in p+1 */ | |||
u512_set(&k[1], 4); /* maximal 2-power in p+1 */ | |||
uint8_t e[2][num_primes]; | |||
for (size_t i = 0; i < num_primes; ++i) { | |||
int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4; | |||
if (t > 0) { | |||
e[0][i] = t; | |||
e[1][i] = 0; | |||
u512_mul3_64(&k[1], &k[1], primes[i]); | |||
} | |||
else if (t < 0) { | |||
e[1][i] = -t; | |||
e[0][i] = 0; | |||
u512_mul3_64(&k[0], &k[0], primes[i]); | |||
} | |||
else { | |||
e[0][i] = 0; | |||
e[1][i] = 0; | |||
u512_mul3_64(&k[0], &k[0], primes[i]); | |||
u512_mul3_64(&k[1], &k[1], primes[i]); | |||
} | |||
} | |||
proj A = {in->A, fp_1}; | |||
bool done[2] = {false, false}; | |||
do { | |||
assert(!memcmp(&A.z, &fp_1, sizeof(fp))); | |||
proj P; | |||
fp_random(&P.x); | |||
P.z = fp_1; | |||
fp rhs; | |||
montgomery_rhs(&rhs, &A.x, &P.x); | |||
bool sign = !fp_issquare(&rhs); | |||
if (done[sign]) | |||
continue; | |||
xMUL(&P, &A, &P, &k[sign]); | |||
done[sign] = true; | |||
for (size_t i = 0; i < num_primes; ++i) { | |||
if (e[sign][i]) { | |||
u512 cof = u512_1; | |||
for (size_t j = i + 1; j < num_primes; ++j) | |||
if (e[sign][j]) | |||
u512_mul3_64(&cof, &cof, primes[j]); | |||
proj K; | |||
xMUL(&K, &A, &P, &cof); | |||
if (memcmp(&K.z, &fp_0, sizeof(fp))) { | |||
xISOG(&A, &P, &K, primes[i]); | |||
if (!--e[sign][i]) | |||
u512_mul3_64(&k[sign], &k[sign], primes[i]); | |||
} | |||
} | |||
done[sign] &= !e[sign][i]; | |||
} | |||
fp_inv(&A.z); | |||
fp_mul2(&A.x, &A.z); | |||
A.z = fp_1; | |||
} while (!(done[0] && done[1])); | |||
out->A = A.x; | |||
} | |||
/* includes public-key validation. */ | |||
bool csidh(public_key *out, public_key const *in, private_key const *priv) | |||
{ | |||
if (!validate(in)) { | |||
fp_random(&out->A); | |||
return false; | |||
} | |||
action(out, in, priv); | |||
return true; | |||
} | |||
@@ -0,0 +1,26 @@ | |||
#ifndef CSIDH_H | |||
#define CSIDH_H | |||
#include "u512.h" | |||
#include "fp.h" | |||
#include "mont.h" | |||
/* specific to p, should perhaps be somewhere else */ | |||
#define num_primes 74 | |||
#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */ | |||
typedef struct private_key { | |||
int8_t e[(num_primes + 1) / 2]; /* packed int4_t */ | |||
} private_key; | |||
typedef struct public_key { | |||
fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */ | |||
} public_key; | |||
extern const public_key base; | |||
void csidh_private(private_key *priv); | |||
bool csidh(public_key *out, public_key const *in, private_key const *priv); | |||
#endif |
@@ -0,0 +1,37 @@ | |||
#ifndef FP_H | |||
#define FP_H | |||
#include "u512.h" | |||
/* fp is in the Montgomery domain, so interpreting that | |||
as an integer should never make sense. | |||
enable compiler warnings when mixing up u512 and fp. */ | |||
typedef struct fp { | |||
u512 x; | |||
} fp; | |||
extern const fp fp_0; | |||
extern const fp fp_1; | |||
void fp_set(fp *x, uint64_t y); | |||
void fp_cswap(fp *x, fp *y, bool c); | |||
void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */ | |||
void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */ | |||
void fp_add2(fp *x, fp const *y); | |||
void fp_sub2(fp *x, fp const *y); | |||
void fp_mul2(fp *x, fp const *y); | |||
void fp_add3(fp *x, fp const *y, fp const *z); | |||
void fp_sub3(fp *x, fp const *y, fp const *z); | |||
void fp_mul3(fp *x, fp const *y, fp const *z); | |||
void fp_sq1(fp *x); | |||
void fp_sq2(fp *x, fp const *y); | |||
void fp_inv(fp *x); | |||
bool fp_issquare(fp const *x); | |||
void fp_random(fp *x); | |||
#endif |
@@ -0,0 +1,452 @@ | |||
.intel_syntax noprefix | |||
.section .rodata | |||
.set pbits, 511 | |||
p: | |||
.quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 | |||
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf | |||
.global fp_0 | |||
fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0 | |||
.global fp_1 | |||
fp_1: /* 2^512 mod p */ | |||
.quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1 | |||
.quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80 | |||
/* (2^512)^2 mod p */ | |||
.r_squared_mod_p: | |||
.quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1 | |||
.quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e | |||
/* -p^-1 mod 2^64 */ | |||
.inv_min_p_mod_r: | |||
.quad 0x66c1301f632e294d | |||
.section .text | |||
.global fp_copy | |||
fp_copy: | |||
cld | |||
mov rcx, 8 | |||
rep movsq | |||
ret | |||
.global fp_set | |||
fp_set: | |||
push rdi | |||
call u512_set | |||
pop rdi | |||
mov rsi, rdi | |||
jmp fp_enc | |||
.global fp_cswap | |||
fp_cswap: | |||
movzx rax, dl | |||
neg rax | |||
.set k, 0 | |||
.rept 8 | |||
mov rcx, [rdi + 8*k] | |||
mov rdx, [rsi + 8*k] | |||
mov r8, rcx | |||
xor r8, rdx | |||
and r8, rax | |||
xor rcx, r8 | |||
xor rdx, r8 | |||
mov [rdi + 8*k], rcx | |||
mov [rsi + 8*k], rdx | |||
.set k, k+1 | |||
.endr | |||
ret | |||
.reduce_once: | |||
push rbp | |||
mov rbp, rdi | |||
mov rdi, [rbp + 0] | |||
sub rdi, [rip + p + 0] | |||
mov rsi, [rbp + 8] | |||
sbb rsi, [rip + p + 8] | |||
mov rdx, [rbp + 16] | |||
sbb rdx, [rip + p + 16] | |||
mov rcx, [rbp + 24] | |||
sbb rcx, [rip + p + 24] | |||
mov r8, [rbp + 32] | |||
sbb r8, [rip + p + 32] | |||
mov r9, [rbp + 40] | |||
sbb r9, [rip + p + 40] | |||
mov r10, [rbp + 48] | |||
sbb r10, [rip + p + 48] | |||
mov r11, [rbp + 56] | |||
sbb r11, [rip + p + 56] | |||
setnc al | |||
movzx rax, al | |||
neg rax | |||
.macro cswap2, r, m | |||
xor \r, \m | |||
and \r, rax | |||
xor \m, \r | |||
.endm | |||
cswap2 rdi, [rbp + 0] | |||
cswap2 rsi, [rbp + 8] | |||
cswap2 rdx, [rbp + 16] | |||
cswap2 rcx, [rbp + 24] | |||
cswap2 r8, [rbp + 32] | |||
cswap2 r9, [rbp + 40] | |||
cswap2 r10, [rbp + 48] | |||
cswap2 r11, [rbp + 56] | |||
pop rbp | |||
ret | |||
.global fp_add3 | |||
fp_add3: | |||
push rdi | |||
call u512_add3 | |||
pop rdi | |||
jmp .reduce_once | |||
.global fp_add2 | |||
fp_add2: | |||
mov rdx, rdi | |||
jmp fp_add3 | |||
.global fp_sub3 | |||
fp_sub3: | |||
push rdi | |||
call u512_sub3 | |||
pop rdi | |||
xor rsi, rsi | |||
xor rdx, rdx | |||
xor rcx, rcx | |||
xor r8, r8 | |||
xor r9, r9 | |||
xor r10, r10 | |||
xor r11, r11 | |||
test rax, rax | |||
cmovnz rax, [rip + p + 0] | |||
cmovnz rsi, [rip + p + 8] | |||
cmovnz rdx, [rip + p + 16] | |||
cmovnz rcx, [rip + p + 24] | |||
cmovnz r8, [rip + p + 32] | |||
cmovnz r9, [rip + p + 40] | |||
cmovnz r10, [rip + p + 48] | |||
cmovnz r11, [rip + p + 56] | |||
add [rdi + 0], rax | |||
adc [rdi + 8], rsi | |||
adc [rdi + 16], rdx | |||
adc [rdi + 24], rcx | |||
adc [rdi + 32], r8 | |||
adc [rdi + 40], r9 | |||
adc [rdi + 48], r10 | |||
adc [rdi + 56], r11 | |||
ret | |||
.global fp_sub2 | |||
fp_sub2: | |||
mov rdx, rdi | |||
xchg rsi, rdx | |||
jmp fp_sub3 | |||
/* Montgomery arithmetic */ | |||
.global fp_enc | |||
fp_enc: | |||
lea rdx, [rip + .r_squared_mod_p] | |||
jmp fp_mul3 | |||
.global fp_dec | |||
fp_dec: | |||
lea rdx, [rip + u512_1] | |||
jmp fp_mul3 | |||
.global fp_mul3 | |||
fp_mul3: | |||
push rbp | |||
push rbx | |||
push r12 | |||
push r13 | |||
push r14 | |||
push r15 | |||
push rdi | |||
mov rdi, rsi | |||
mov rsi, rdx | |||
xor r8, r8 | |||
xor r9, r9 | |||
xor r10, r10 | |||
xor r11, r11 | |||
xor r12, r12 | |||
xor r13, r13 | |||
xor r14, r14 | |||
xor r15, r15 | |||
xor rbp, rbp | |||
/* flags are already cleared */ | |||
.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8 | |||
mov rdx, [rsi + 0] | |||
mulx rcx, rdx, [rdi + 8*\k] | |||
add rdx, \r0 | |||
mulx rcx, rdx, [rip + .inv_min_p_mod_r] | |||
xor rax, rax /* clear flags */ | |||
mulx rbx, rax, [rip + p + 0] | |||
adox \r0, rax | |||
mulx rcx, rax, [rip + p + 8] | |||
adcx \r1, rbx | |||
adox \r1, rax | |||
mulx rbx, rax, [rip + p + 16] | |||
adcx \r2, rcx | |||
adox \r2, rax | |||
mulx rcx, rax, [rip + p + 24] | |||
adcx \r3, rbx | |||
adox \r3, rax | |||
mulx rbx, rax, [rip + p + 32] | |||
adcx \r4, rcx | |||
adox \r4, rax | |||
mulx rcx, rax, [rip + p + 40] | |||
adcx \r5, rbx | |||
adox \r5, rax | |||
mulx rbx, rax, [rip + p + 48] | |||
adcx \r6, rcx | |||
adox \r6, rax | |||
mulx rcx, rax, [rip + p + 56] | |||
adcx \r7, rbx | |||
adox \r7, rax | |||
mov rax, 0 | |||
adcx \r8, rcx | |||
adox \r8, rax | |||
mov rdx, [rdi + 8*\k] | |||
xor rax, rax /* clear flags */ | |||
mulx rbx, rax, [rsi + 0] | |||
adox \r0, rax | |||
mulx rcx, rax, [rsi + 8] | |||
adcx \r1, rbx | |||
adox \r1, rax | |||
mulx rbx, rax, [rsi + 16] | |||
adcx \r2, rcx | |||
adox \r2, rax | |||
mulx rcx, rax, [rsi + 24] | |||
adcx \r3, rbx | |||
adox \r3, rax | |||
mulx rbx, rax, [rsi + 32] | |||
adcx \r4, rcx | |||
adox \r4, rax | |||
mulx rcx, rax, [rsi + 40] | |||
adcx \r5, rbx | |||
adox \r5, rax | |||
mulx rbx, rax, [rsi + 48] | |||
adcx \r6, rcx | |||
adox \r6, rax | |||
mulx rcx, rax, [rsi + 56] | |||
adcx \r7, rbx | |||
adox \r7, rax | |||
mov rax, 0 | |||
adcx \r8, rcx | |||
adox \r8, rax | |||
.endm | |||
MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp | |||
MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8 | |||
MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9 | |||
MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10 | |||
MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11 | |||
MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12 | |||
MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13 | |||
MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14 | |||
pop rdi | |||
mov [rdi + 0], rbp | |||
mov [rdi + 8], r8 | |||
mov [rdi + 16], r9 | |||
mov [rdi + 24], r10 | |||
mov [rdi + 32], r11 | |||
mov [rdi + 40], r12 | |||
mov [rdi + 48], r13 | |||
mov [rdi + 56], r14 | |||
pop r15 | |||
pop r14 | |||
pop r13 | |||
pop r12 | |||
pop rbx | |||
pop rbp | |||
jmp .reduce_once | |||
.global fp_mul2 | |||
fp_mul2: | |||
mov rdx, rdi | |||
jmp fp_mul3 | |||
.global fp_sq2 | |||
fp_sq2: | |||
/* TODO implement optimized Montgomery squaring */ | |||
mov rdx, rsi | |||
jmp fp_mul3 | |||
.global fp_sq1 | |||
fp_sq1: | |||
mov rsi, rdi | |||
jmp fp_sq2 | |||
/* (obviously) not constant time in the exponent! */ | |||
.fp_pow: | |||
push rbx | |||
mov rbx, rsi | |||
push r12 | |||
push r13 | |||
push rdi | |||
sub rsp, 64 | |||
mov rsi, rdi | |||
mov rdi, rsp | |||
call fp_copy | |||
mov rdi, [rsp + 64] | |||
lea rsi, [rip + fp_1] | |||
call fp_copy | |||
.macro POWSTEP, k | |||
mov r13, [rbx + 8*\k] | |||
xor r12, r12 | |||
0: | |||
test r13, 1 | |||
jz 1f | |||
mov rdi, [rsp + 64] | |||
mov rsi, rsp | |||
call fp_mul2 | |||
1: | |||
mov rdi, rsp | |||
call fp_sq1 | |||
shr r13 | |||
inc r12 | |||
test r12, 64 | |||
jz 0b | |||
.endm | |||
POWSTEP 0 | |||
POWSTEP 1 | |||
POWSTEP 2 | |||
POWSTEP 3 | |||
POWSTEP 4 | |||
POWSTEP 5 | |||
POWSTEP 6 | |||
POWSTEP 7 | |||
add rsp, 64+8 | |||
pop r13 | |||
pop r12 | |||
pop rbx | |||
ret | |||
.section .rodata | |||
.p_minus_2: | |||
.quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 | |||
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf | |||
.section .text | |||
/* TODO use a better addition chain? */ | |||
.global fp_inv | |||
fp_inv: | |||
lea rsi, [rip + .p_minus_2] | |||
jmp .fp_pow | |||
.section .rodata | |||
.p_minus_1_halves: | |||
.quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83 | |||
.quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df | |||
.section .text | |||
/* TODO use a better addition chain? */ | |||
.global fp_issquare | |||
fp_issquare: | |||
push rdi | |||
lea rsi, [rip + .p_minus_1_halves] | |||
call .fp_pow | |||
pop rdi | |||
xor rax, rax | |||
.set k, 0 | |||
.rept 8 | |||
mov rsi, [rdi + 8*k] | |||
xor rsi, [rip + fp_1 + 8*k] | |||
or rax, rsi | |||
.set k, k+1 | |||
.endr | |||
test rax, rax | |||
setz al | |||
movzx rax, al | |||
ret | |||
/* not constant time (but this shouldn't leak anything of importance) */ | |||
.global fp_random | |||
fp_random: | |||
push rdi | |||
mov rsi, 64 | |||
call randombytes | |||
pop rdi | |||
mov rax, 1 | |||
shl rax, (pbits % 64) | |||
dec rax | |||
and [rdi + 56], rax | |||
.set k, 7 | |||
.rept 8 | |||
mov rax, [rip + p + 8*k] | |||
cmp [rdi + 8*k], rax | |||
jge fp_random | |||
jl 0f | |||
.set k, k-1 | |||
.endr | |||
0: | |||
ret | |||
@@ -0,0 +1,99 @@ | |||
#include <stdlib.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
#include <unistd.h> | |||
#include <time.h> | |||
#include <assert.h> | |||
#include "u512.h" | |||
#include "fp.h" | |||
#include "mont.h" | |||
#include "csidh.h" | |||
void u512_print(u512 const *x) | |||
{ | |||
for (size_t i = 63; i < 64; --i) | |||
printf("%02hhx", i[(unsigned char *) x->c]); | |||
} | |||
void fp_print(fp const *x) | |||
{ | |||
u512 y; | |||
fp_dec(&y, x); | |||
u512_print(&y); | |||
} | |||
int main() | |||
{ | |||
clock_t t0, t1; | |||
private_key priv_alice, priv_bob; | |||
public_key pub_alice, pub_bob; | |||
public_key shared_alice, shared_bob; | |||
printf("\n"); | |||
t0 = clock(); | |||
csidh_private(&priv_alice); | |||
t1 = clock(); | |||
printf("Alice's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
for (size_t i = 0; i < sizeof(priv_alice); ++i) | |||
printf("%02hhx", i[(uint8_t *) &priv_alice]); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
csidh_private(&priv_bob); | |||
t1 = clock(); | |||
printf("Bob's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
for (size_t i = 0; i < sizeof(priv_bob); ++i) | |||
printf("%02hhx", i[(uint8_t *) &priv_bob]); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&pub_alice, &base, &priv_alice)); | |||
t1 = clock(); | |||
printf("Alice's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&pub_alice.A); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&pub_bob, &base, &priv_bob)); | |||
t1 = clock(); | |||
printf("Bob's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&pub_bob.A); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&shared_alice, &pub_bob, &priv_alice)); | |||
t1 = clock(); | |||
printf("Alice's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&shared_alice.A); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&shared_bob, &pub_alice, &priv_bob)); | |||
t1 = clock(); | |||
printf("Bob's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&shared_bob.A); | |||
printf("\n\n"); | |||
printf(" "); | |||
if (memcmp(&shared_alice, &shared_bob, sizeof(public_key))) | |||
printf("\x1b[31mNOT EQUAL!\x1b[0m\n"); | |||
else | |||
printf("\x1b[32mequal.\x1b[0m\n"); | |||
printf("\n"); | |||
printf("\n"); | |||
} | |||
@@ -0,0 +1,188 @@ | |||
#include <assert.h> | |||
#include "mont.h" | |||
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24) | |||
{ | |||
fp tmp0, tmp1, tmp2; | |||
fp_add3(&tmp0, &P->x, &P->z); | |||
fp_sub3(&tmp1, &P->x, &P->z); | |||
fp_sq2(&R->x, &tmp0); | |||
fp_sub3(&tmp2, &Q->x, &Q->z); | |||
fp_add3(&S->x, &Q->x, &Q->z); | |||
fp_mul2(&tmp0, &tmp2); | |||
fp_sq2(&R->z, &tmp1); | |||
fp_mul2(&tmp1, &S->x); | |||
fp_sub3(&tmp2, &R->x, &R->z); | |||
fp_mul2(&R->z, &A24->z); | |||
fp_mul2(&R->x, &R->z); | |||
fp_mul3(&S->x, &A24->x, &tmp2); | |||
fp_sub3(&S->z, &tmp0, &tmp1); | |||
fp_add2(&R->z, &S->x); | |||
fp_add3(&S->x, &tmp0, &tmp1); | |||
fp_mul2(&R->z, &tmp2); | |||
fp_sq1(&S->z); | |||
fp_sq1(&S->x); | |||
fp_mul2(&S->z, &PQ->x); | |||
fp_mul2(&S->x, &PQ->z); | |||
} | |||
void xDBL(proj *Q, proj const *A, proj const *P) | |||
{ | |||
fp a, b, c; | |||
fp_add3(&a, &P->x, &P->z); | |||
fp_sq1(&a); | |||
fp_sub3(&b, &P->x, &P->z); | |||
fp_sq1(&b); | |||
fp_sub3(&c, &a, &b); | |||
fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */ | |||
fp_mul2(&b, &A->z); | |||
fp_mul3(&Q->x, &a, &b); | |||
fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */ | |||
fp_add2(&a, &A->x); | |||
fp_mul2(&a, &c); | |||
fp_add2(&a, &b); | |||
fp_mul3(&Q->z, &a, &c); | |||
} | |||
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ) | |||
{ | |||
fp a, b, c, d; | |||
fp_add3(&a, &P->x, &P->z); | |||
fp_sub3(&b, &P->x, &P->z); | |||
fp_add3(&c, &Q->x, &Q->z); | |||
fp_sub3(&d, &Q->x, &Q->z); | |||
fp_mul2(&a, &d); | |||
fp_mul2(&b, &c); | |||
fp_add3(&c, &a, &b); | |||
fp_sub3(&d, &a, &b); | |||
fp_sq1(&c); | |||
fp_sq1(&d); | |||
fp_mul3(&S->x, &PQ->z, &c); | |||
fp_mul3(&S->z, &PQ->x, &d); | |||
} | |||
/* Montgomery ladder. */ | |||
/* P must not be the unique point of order 2. */ | |||
/* not constant-time! */ | |||
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k) | |||
{ | |||
proj R = *P; | |||
proj A24; | |||
const proj Pcopy = *P; /* in case Q = P */ | |||
Q->x = fp_1; | |||
Q->z = fp_0; | |||
fp_add3(&A24.x, &A->z, &A->z); | |||
fp_add3(&A24.z, &A24.x, &A24.x); | |||
fp_add2(&A24.x, &A->x); | |||
unsigned long i = 512; | |||
while (--i && !u512_bit(k, i)); | |||
do { | |||
bool bit = u512_bit(k, i); | |||
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ | |||
//fp_cswap(&Q->x, &R.x, bit); | |||
//fp_cswap(&Q->z, &R.z, bit); | |||
xDBLADD(Q, &R, Q, &R, &Pcopy, &A24); | |||
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ | |||
//fp_cswap(&Q->x, &R.x, bit); | |||
//fp_cswap(&Q->z, &R.z, bit); | |||
} while (i--); | |||
} | |||
/* computes the isogeny with kernel point K of order k */ | |||
/* returns the new curve coefficient A and the image of P */ | |||
/* (obviously) not constant time in k */ | |||
void xISOG(proj *A, proj *P, proj const *K, uint64_t k) | |||
{ | |||
assert (k >= 3); | |||
assert (k % 2 == 1); | |||
fp tmp0, tmp1; | |||
fp T[4] = {K->z, K->x, K->x, K->z}; | |||
proj Q; | |||
fp_mul3(&Q.x, &P->x, &K->x); | |||
fp_mul3(&tmp0, &P->z, &K->z); | |||
fp_sub2(&Q.x, &tmp0); | |||
fp_mul3(&Q.z, &P->x, &K->z); | |||
fp_mul3(&tmp0, &P->z, &K->x); | |||
fp_sub2(&Q.z, &tmp0); | |||
proj M[3] = {*K}; | |||
xDBL(&M[1], A, K); | |||
for (uint64_t i = 1; i < k / 2; ++i) { | |||
if (i >= 2) | |||
xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]); | |||
fp_mul3(&tmp0, &M[i % 3].x, &T[0]); | |||
fp_mul3(&tmp1, &M[i % 3].z, &T[1]); | |||
fp_add3(&T[0], &tmp0, &tmp1); | |||
fp_mul2(&T[1], &M[i % 3].x); | |||
fp_mul3(&tmp0, &M[i % 3].z, &T[2]); | |||
fp_mul3(&tmp1, &M[i % 3].x, &T[3]); | |||
fp_add3(&T[2], &tmp0, &tmp1); | |||
fp_mul2(&T[3], &M[i % 3].z); | |||
fp_mul3(&tmp0, &P->x, &M[i % 3].x); | |||
fp_mul3(&tmp1, &P->z, &M[i % 3].z); | |||
fp_sub2(&tmp0, &tmp1); | |||
fp_mul2(&Q.x, &tmp0); | |||
fp_mul3(&tmp0, &P->x, &M[i % 3].z); | |||
fp_mul3(&tmp1, &P->z, &M[i % 3].x); | |||
fp_sub2(&tmp0, &tmp1); | |||
fp_mul2(&Q.z, &tmp0); | |||
} | |||
fp_mul2(&T[0], &T[1]); | |||
fp_add2(&T[0], &T[0]); /* multiplication by 2 */ | |||
fp_sq1(&T[1]); | |||
fp_mul2(&T[2], &T[3]); | |||
fp_add2(&T[2], &T[2]); /* multiplication by 2 */ | |||
fp_sq1(&T[3]); | |||
/* Ax := T[1] * T[3] * Ax - 3 * Az * (T[1] * T[2] - T[0] * T[3]) */ | |||
fp_mul3(&tmp0, &T[1], &T[2]); | |||
fp_mul3(&tmp1, &T[0], &T[3]); | |||
fp_sub2(&tmp0, &tmp1); | |||
fp_mul2(&tmp0, &A->z); | |||
fp_add3(&tmp1, &tmp0, &tmp0); fp_add2(&tmp0, &tmp1); /* multiplication by 3 */ | |||
fp_mul3(&tmp1, &T[1], &T[3]); | |||
fp_mul2(&tmp1, &A->x); | |||
fp_sub3(&A->x, &tmp1, &tmp0); | |||
/* Az := Az * T[3]^2 */ | |||
fp_sq1(&T[3]); | |||
fp_mul2(&A->z, &T[3]); | |||
/* X := X * Xim^2, Z := Z * Zim^2 */ | |||
fp_sq1(&Q.x); | |||
fp_sq1(&Q.z); | |||
fp_mul2(&P->x, &Q.x); | |||
fp_mul2(&P->z, &Q.z); | |||
} | |||
@@ -0,0 +1,19 @@ | |||
#ifndef MONT_H | |||
#define MONT_H | |||
#include "u512.h" | |||
#include "fp.h" | |||
/* P^1 over fp. */ | |||
typedef struct proj { | |||
fp x; | |||
fp z; | |||
} proj; | |||
void xDBL(proj *Q, proj const *A, proj const *P); | |||
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ); | |||
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A); | |||
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k); | |||
void xISOG(proj *A, proj *P, proj const *K, uint64_t k); | |||
#endif |
@@ -0,0 +1,18 @@ | |||
#include "rng.h" | |||
#include <stdlib.h> | |||
#include <unistd.h> | |||
#include <fcntl.h> | |||
void randombytes(void *x, size_t l) | |||
{ | |||
static int fd = -1; | |||
ssize_t n; | |||
if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY))) | |||
exit(1); | |||
for (size_t i = 0; i < l; i += n) | |||
if (0 >= (n = read(fd, (char *) x + i, l - i))) | |||
exit(2); | |||
} | |||
@@ -0,0 +1,8 @@ | |||
#ifndef RNG_H | |||
#define RNG_H | |||
#include <stdlib.h> | |||
void randombytes(void *x, size_t l); | |||
#endif |
@@ -0,0 +1,128 @@ | |||
#!/usr/bin/env sage | |||
#coding: utf8 | |||
proof.all(False) | |||
# parameters. | |||
ls = list(primes(3, 374)) + [587] # Elkies primes | |||
#ls = list(primes(3, 47)) + [97] # (a smaller example) | |||
p = 4 * prod(ls) - 1 | |||
assert is_prime(p) | |||
print "\nElkies primes:", " ".join(map(str, ls)) | |||
max_exp = ceil((sqrt(p) ** (1/len(ls)) - 1) / 2) | |||
assert (2 * max_exp + 1) ** len(ls) >= sqrt(p) | |||
print "exponents are chosen in the range {}..{}.".format(-max_exp, max_exp) | |||
base = GF(p)(0) # Montgomery coefficient of starting curve | |||
# helper functions. | |||
# NB: all the operations can be computed entirely over the prime field, | |||
# but for simplicity of this implementation we will make use of curves | |||
# defined over GF(p^2). note this slows everything down quite a bit. | |||
Fp2.<i> = GF(p**2, modulus = x**2 + 1) | |||
def montgomery_curve(A): | |||
return EllipticCurve(Fp2, [0, A, 0, 1, 0]) | |||
# sage's isogeny formulas return Weierstraß curves, hence we need this... | |||
def montgomery_coefficient(E): | |||
Ew = E.change_ring(GF(p)).short_weierstrass_model() | |||
_, _, _, a, b = Ew.a_invariants() | |||
R.<z> = GF(p)[] | |||
r = (z**3 + a*z + b).roots(multiplicities=False)[0] | |||
s = sqrt(3 * r**2 + a) | |||
if not is_square(s): s = -s | |||
A = 3 * r / s | |||
assert montgomery_curve(A).change_ring(GF(p)).is_isomorphic(Ew) | |||
return GF(p)(A) | |||
# actual implementation. | |||
def private(): | |||
return [randrange(-max_exp, max_exp + 1) for _ in range(len(ls))] | |||
def validate(A): | |||
while True: | |||
k = 1 | |||
P = montgomery_curve(A).lift_x(GF(p).random_element()) | |||
for l in ls: | |||
Q = (p + 1) // l * P | |||
if not Q: continue | |||
if l * Q: return False | |||
k *= l | |||
if k > 4 * sqrt(p): return True | |||
def action(pub, priv): | |||
E = montgomery_curve(pub) | |||
es = priv[:] | |||
while any(es): | |||
E._order = (p + 1)**2 # else sage computes this | |||
P = E.lift_x(GF(p).random_element()) | |||
s = +1 if P.xy()[1] in GF(p) else -1 | |||
k = prod(l for l, e in zip(ls, es) if sign(e) == s) | |||
P *= (p + 1) // k | |||
for i, (l, e) in enumerate(zip(ls, es)): | |||
if sign(e) != s: continue | |||
Q = k // l * P | |||
if not Q: continue | |||
Q._order = l # else sage computes this | |||
phi = E.isogeny(Q) | |||
E, P = phi.codomain(), phi(P) | |||
es[i] -= s | |||
k //= l | |||
return montgomery_coefficient(E) | |||
# example. | |||
print "testing public-key validation on random ordinary curves (should be all 0s):\n ", | |||
for _ in range(16): | |||
while True: | |||
A = GF(p).random_element() | |||
if montgomery_curve(A).is_ordinary(): break | |||
print int(validate(A)), | |||
privA = private() | |||
print "\nAlice's private key:\n ", " ".join(map('{:2d}'.format, privA)) | |||
pubA = action(base, privA) | |||
print "\nAlice's public key:\n ", pubA, | |||
print " (valid: {})".format(int(validate(pubA))) | |||
privB = private() | |||
print "\nBob's private key:\n ", " ".join(map('{:2d}'.format, privB)) | |||
pubB = action(base, privB) | |||
print "\nBob's public key:\n ", pubB, | |||
print " (valid: {})".format(int(validate(pubB))) | |||
sharedA = action(pubB, privA) | |||
print "\nAlice's shared secret:\n ", sharedA | |||
sharedB = action(pubA, privB) | |||
print "\nBob's shared secret:\n ", sharedB | |||
if sharedA == sharedB: | |||
print "\n--> equal!\n" | |||
else: | |||
print "\n--> NOT EQUAL?!\n" | |||
@@ -0,0 +1,22 @@ | |||
#ifndef UINT_H | |||
#define UINT_H | |||
#include <stdbool.h> | |||
#include <stdint.h> | |||
typedef struct u512 { | |||
uint64_t c[8]; | |||
} u512; | |||
extern const u512 u512_1; | |||
void u512_set(u512 *x, uint64_t y); | |||
bool u512_bit(u512 const *x, uint64_t k); | |||
bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */ | |||
bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */ | |||
void u512_mul3_64(u512 *x, u512 const *y, uint64_t z); | |||
#endif |
@@ -0,0 +1,102 @@ | |||
.intel_syntax noprefix | |||
.section .rodata | |||
.global u512_1 | |||
u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0 | |||
.section .text | |||
.global u512_set | |||
u512_set: | |||
cld | |||
mov rax, rsi | |||
stosq | |||
xor rax, rax | |||
mov rcx, 7 | |||
rep stosq | |||
ret | |||
.global u512_bit | |||
u512_bit: | |||
mov rcx, rsi | |||
and rcx, 0x3f | |||
shr rsi, 6 | |||
mov rax, [rdi + 8*rsi] | |||
shr rax, cl | |||
and rax, 1 | |||
ret | |||
.global u512_add3 | |||
u512_add3: | |||
mov rax, [rsi + 0] | |||
add rax, [rdx + 0] | |||
mov [rdi + 0], rax | |||
.set k, 1 | |||
.rept 7 | |||
mov rax, [rsi + 8*k] | |||
adc rax, [rdx + 8*k] | |||
mov [rdi + 8*k], rax | |||
.set k, k+1 | |||
.endr | |||
setc al | |||
movzx rax, al | |||
ret | |||
.global u512_sub3 | |||
u512_sub3: | |||
mov rax, [rsi + 0] | |||
sub rax, [rdx + 0] | |||
mov [rdi + 0], rax | |||
.set k, 1 | |||
.rept 7 | |||
mov rax, [rsi + 8*k] | |||
sbb rax, [rdx + 8*k] | |||
mov [rdi + 8*k], rax | |||
.set k, k+1 | |||
.endr | |||
setc al | |||
movzx rax, al | |||
ret | |||
.global u512_mul3_64 | |||
u512_mul3_64: | |||
mulx r10, rax, [rsi + 0] | |||
mov [rdi + 0], rax | |||
mulx r11, rax, [rsi + 8] | |||
add rax, r10 | |||
mov [rdi + 8], rax | |||
mulx r10, rax, [rsi + 16] | |||
adcx rax, r11 | |||
mov [rdi + 16], rax | |||
mulx r11, rax, [rsi + 24] | |||
adcx rax, r10 | |||
mov [rdi + 24], rax | |||
mulx r10, rax, [rsi + 32] | |||
adcx rax, r11 | |||
mov [rdi + 32],rax | |||
mulx r11, rax, [rsi + 40] | |||
adcx rax, r10 | |||
mov [rdi + 40],rax | |||
mulx r10, rax, [rsi + 48] | |||
adcx rax, r11 | |||
mov [rdi + 48],rax | |||
mulx r11, rax, [rsi + 56] | |||
adcx rax, r10 | |||
mov [rdi + 56],rax | |||
ret | |||