diff --git a/reference/csidh-20180427-by-castryck-et-al/Makefile b/reference/csidh-20180427-by-castryck-et-al/Makefile new file mode 100644 index 0000000..926d7df --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/Makefile @@ -0,0 +1,27 @@ +all: + @cc \ + -std=c99 -pedantic \ + -Wall -Wextra \ + -O2 -funroll-loops \ + rng.c \ + u512.s fp.s \ + mont.c \ + csidh.c \ + main.c \ + -o main + +debug: + cc \ + -std=c99 -pedantic \ + -Wall -Wextra \ + -g \ + rng.c \ + u512.s fp.s \ + mont.c \ + csidh.c \ + main.c \ + -o main + +clean: + rm -f main + diff --git a/reference/csidh-20180427-by-castryck-et-al/bench.c b/reference/csidh-20180427-by-castryck-et-al/bench.c new file mode 100644 index 0000000..fd0f64a --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/bench.c @@ -0,0 +1,54 @@ + +#include +#include +#include +#include +#include + +#include "u512.h" +#include "fp.h" +#include "mont.h" +#include "csidh.h" + +#include + +static __inline__ uint64_t rdtsc(void) +{ + uint32_t hi, lo; + __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return lo | (uint64_t) hi << 32; +} + +unsigned long its = 10000; + +int main() +{ + clock_t t0, t1, time = 0; + uint64_t c0, c1, cycles = 0; + + private_key priv; + public_key pub = base; + + for (unsigned long i = 0; i < its; ++i) { + + csidh_private(&priv); + + t0 = clock(); + c0 = rdtsc(); + + /**************************************/ + assert(validate(&pub)); + action(&pub, &pub, &priv); + /**************************************/ + + c1 = rdtsc(); + t1 = clock(); + cycles += c1 - c0; + time += t1 - t0; + } + + printf("iterations: %lu\n", its); + printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its); + printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its); +} + diff --git a/reference/csidh-20180427-by-castryck-et-al/csidh.c b/reference/csidh-20180427-by-castryck-et-al/csidh.c new file mode 100644 index 0000000..6539f8e --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/csidh.c @@ -0,0 +1,220 @@ + +#include +#include + +#include "csidh.h" +#include "rng.h" + +/* specific to p, should perhaps be somewhere else */ +const unsigned primes[num_primes] = { + 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, + 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, + 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, + 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, + 317, 331, 337, 347, 349, 353, 359, 367, 373, 587, +}; + +const u512 four_sqrt_p = {{ + 0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964, +}}; + + +const public_key base = {0}; /* A = 0 */ + +void csidh_private(private_key *priv) +{ + memset(&priv->e, 0, sizeof(priv->e)); + for (size_t i = 0; i < num_primes; ) { + int8_t buf[64]; + randombytes(buf, sizeof(buf)); + for (size_t j = 0; j < sizeof(buf); ++j) { + if (buf[j] <= max_exponent && buf[j] >= -max_exponent) { + priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4; + if (++i >= num_primes) + break; + } + } + } +} + +/* compute [(p+1)/l] P for all l in our list of primes. */ +/* divide and conquer is much faster than doing it naively, + * but uses more memory. */ +static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper) +{ + assert(lower < upper); + + if (upper - lower == 1) + return; + + size_t mid = lower + (upper - lower + 1) / 2; + + u512 cl = u512_1, cu = u512_1; + for (size_t i = lower; i < mid; ++i) + u512_mul3_64(&cu, &cu, primes[i]); + for (size_t i = mid; i < upper; ++i) + u512_mul3_64(&cl, &cl, primes[i]); + + xMUL(&P[mid], A, &P[lower], &cu); + xMUL(&P[lower], A, &P[lower], &cl); + + cofactor_multiples(P, A, lower, mid); + cofactor_multiples(P, A, mid, upper); +} + +/* never accepts invalid keys. */ +bool validate(public_key const *in) +{ + const proj A = {in->A, fp_1}; + + do { + + proj P[num_primes]; + fp_random(&P->x); + P->z = fp_1; + + /* maximal 2-power in p+1 */ + xDBL(P, &A, P); + xDBL(P, &A, P); + + cofactor_multiples(P, &A, 0, num_primes); + + u512 order = u512_1; + + for (size_t i = num_primes - 1; i < num_primes; --i) { + + /* we only gain information if [(p+1)/l] P is non-zero */ + if (memcmp(&P[i].z, &fp_0, sizeof(fp))) { + + u512 tmp; + u512_set(&tmp, primes[i]); + xMUL(&P[i], &A, &P[i], &tmp); + + if (memcmp(&P[i].z, &fp_0, sizeof(fp))) + /* P does not have order dividing p+1. */ + return false; + + u512_mul3_64(&order, &order, primes[i]); + + if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */ + /* order > 4 sqrt(p), hence definitely supersingular */ + return true; + } + } + + /* P didn't have big enough order to prove supersingularity. */ + } while (1); +} + +/* compute x^3 + Ax^2 + x */ +static void montgomery_rhs(fp *rhs, fp const *A, fp const *x) +{ + fp tmp; + *rhs = *x; + fp_sq1(rhs); + fp_mul3(&tmp, A, x); + fp_add2(rhs, &tmp); + fp_add2(rhs, &fp_1); + fp_mul2(rhs, x); +} + +/* totally not constant-time. */ +void action(public_key *out, public_key const *in, private_key const *priv) +{ + u512 k[2]; + u512_set(&k[0], 4); /* maximal 2-power in p+1 */ + u512_set(&k[1], 4); /* maximal 2-power in p+1 */ + + uint8_t e[2][num_primes]; + + for (size_t i = 0; i < num_primes; ++i) { + + int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4; + + if (t > 0) { + e[0][i] = t; + e[1][i] = 0; + u512_mul3_64(&k[1], &k[1], primes[i]); + } + else if (t < 0) { + e[1][i] = -t; + e[0][i] = 0; + u512_mul3_64(&k[0], &k[0], primes[i]); + } + else { + e[0][i] = 0; + e[1][i] = 0; + u512_mul3_64(&k[0], &k[0], primes[i]); + u512_mul3_64(&k[1], &k[1], primes[i]); + } + } + + proj A = {in->A, fp_1}; + + bool done[2] = {false, false}; + + do { + + assert(!memcmp(&A.z, &fp_1, sizeof(fp))); + + proj P; + fp_random(&P.x); + P.z = fp_1; + + fp rhs; + montgomery_rhs(&rhs, &A.x, &P.x); + bool sign = !fp_issquare(&rhs); + + if (done[sign]) + continue; + + xMUL(&P, &A, &P, &k[sign]); + + done[sign] = true; + + for (size_t i = 0; i < num_primes; ++i) { + + if (e[sign][i]) { + + u512 cof = u512_1; + for (size_t j = i + 1; j < num_primes; ++j) + if (e[sign][j]) + u512_mul3_64(&cof, &cof, primes[j]); + + proj K; + xMUL(&K, &A, &P, &cof); + + if (memcmp(&K.z, &fp_0, sizeof(fp))) { + + xISOG(&A, &P, &K, primes[i]); + + if (!--e[sign][i]) + u512_mul3_64(&k[sign], &k[sign], primes[i]); + + } + + } + + done[sign] &= !e[sign][i]; + } + + fp_inv(&A.z); + fp_mul2(&A.x, &A.z); + A.z = fp_1; + + } while (!(done[0] && done[1])); + + out->A = A.x; +} + +/* includes public-key validation. */ +bool csidh(public_key *out, public_key const *in, private_key const *priv) +{ + if (!validate(in)) { + fp_random(&out->A); + return false; + } + action(out, in, priv); + return true; +} + diff --git a/reference/csidh-20180427-by-castryck-et-al/csidh.h b/reference/csidh-20180427-by-castryck-et-al/csidh.h new file mode 100644 index 0000000..52e9d54 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/csidh.h @@ -0,0 +1,26 @@ +#ifndef CSIDH_H +#define CSIDH_H + +#include "u512.h" +#include "fp.h" +#include "mont.h" + +/* specific to p, should perhaps be somewhere else */ +#define num_primes 74 +#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */ + + +typedef struct private_key { + int8_t e[(num_primes + 1) / 2]; /* packed int4_t */ +} private_key; + +typedef struct public_key { + fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */ +} public_key; + +extern const public_key base; + +void csidh_private(private_key *priv); +bool csidh(public_key *out, public_key const *in, private_key const *priv); + +#endif diff --git a/reference/csidh-20180427-by-castryck-et-al/fp.h b/reference/csidh-20180427-by-castryck-et-al/fp.h new file mode 100644 index 0000000..6cfa064 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/fp.h @@ -0,0 +1,37 @@ +#ifndef FP_H +#define FP_H + +#include "u512.h" + +/* fp is in the Montgomery domain, so interpreting that + as an integer should never make sense. + enable compiler warnings when mixing up u512 and fp. */ +typedef struct fp { + u512 x; +} fp; + +extern const fp fp_0; +extern const fp fp_1; + +void fp_set(fp *x, uint64_t y); +void fp_cswap(fp *x, fp *y, bool c); + +void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */ +void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */ + +void fp_add2(fp *x, fp const *y); +void fp_sub2(fp *x, fp const *y); +void fp_mul2(fp *x, fp const *y); + +void fp_add3(fp *x, fp const *y, fp const *z); +void fp_sub3(fp *x, fp const *y, fp const *z); +void fp_mul3(fp *x, fp const *y, fp const *z); + +void fp_sq1(fp *x); +void fp_sq2(fp *x, fp const *y); +void fp_inv(fp *x); +bool fp_issquare(fp const *x); + +void fp_random(fp *x); + +#endif diff --git a/reference/csidh-20180427-by-castryck-et-al/fp.s b/reference/csidh-20180427-by-castryck-et-al/fp.s new file mode 100644 index 0000000..ac65d94 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/fp.s @@ -0,0 +1,452 @@ + +.intel_syntax noprefix + +.section .rodata + +.set pbits, 511 +p: + .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 + .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf + + +.global fp_0 +fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0 + +.global fp_1 +fp_1: /* 2^512 mod p */ + .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1 + .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80 + + +/* (2^512)^2 mod p */ +.r_squared_mod_p: + .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1 + .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e + +/* -p^-1 mod 2^64 */ +.inv_min_p_mod_r: + .quad 0x66c1301f632e294d + + +.section .text + +.global fp_copy +fp_copy: + cld + mov rcx, 8 + rep movsq + ret + +.global fp_set +fp_set: + push rdi + call u512_set + pop rdi + mov rsi, rdi + jmp fp_enc + +.global fp_cswap +fp_cswap: + movzx rax, dl + neg rax + .set k, 0 + .rept 8 + mov rcx, [rdi + 8*k] + mov rdx, [rsi + 8*k] + + mov r8, rcx + xor r8, rdx + and r8, rax + + xor rcx, r8 + xor rdx, r8 + + mov [rdi + 8*k], rcx + mov [rsi + 8*k], rdx + + .set k, k+1 + .endr + ret + +.reduce_once: + push rbp + mov rbp, rdi + + mov rdi, [rbp + 0] + sub rdi, [rip + p + 0] + mov rsi, [rbp + 8] + sbb rsi, [rip + p + 8] + mov rdx, [rbp + 16] + sbb rdx, [rip + p + 16] + mov rcx, [rbp + 24] + sbb rcx, [rip + p + 24] + mov r8, [rbp + 32] + sbb r8, [rip + p + 32] + mov r9, [rbp + 40] + sbb r9, [rip + p + 40] + mov r10, [rbp + 48] + sbb r10, [rip + p + 48] + mov r11, [rbp + 56] + sbb r11, [rip + p + 56] + + setnc al + movzx rax, al + neg rax + +.macro cswap2, r, m + xor \r, \m + and \r, rax + xor \m, \r +.endm + + cswap2 rdi, [rbp + 0] + cswap2 rsi, [rbp + 8] + cswap2 rdx, [rbp + 16] + cswap2 rcx, [rbp + 24] + cswap2 r8, [rbp + 32] + cswap2 r9, [rbp + 40] + cswap2 r10, [rbp + 48] + cswap2 r11, [rbp + 56] + + pop rbp + ret + +.global fp_add3 +fp_add3: + push rdi + call u512_add3 + pop rdi + jmp .reduce_once + +.global fp_add2 +fp_add2: + mov rdx, rdi + jmp fp_add3 + +.global fp_sub3 +fp_sub3: + push rdi + call u512_sub3 + pop rdi + xor rsi, rsi + xor rdx, rdx + xor rcx, rcx + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + test rax, rax + cmovnz rax, [rip + p + 0] + cmovnz rsi, [rip + p + 8] + cmovnz rdx, [rip + p + 16] + cmovnz rcx, [rip + p + 24] + cmovnz r8, [rip + p + 32] + cmovnz r9, [rip + p + 40] + cmovnz r10, [rip + p + 48] + cmovnz r11, [rip + p + 56] + add [rdi + 0], rax + adc [rdi + 8], rsi + adc [rdi + 16], rdx + adc [rdi + 24], rcx + adc [rdi + 32], r8 + adc [rdi + 40], r9 + adc [rdi + 48], r10 + adc [rdi + 56], r11 + ret + +.global fp_sub2 +fp_sub2: + mov rdx, rdi + xchg rsi, rdx + jmp fp_sub3 + + +/* Montgomery arithmetic */ + +.global fp_enc +fp_enc: + lea rdx, [rip + .r_squared_mod_p] + jmp fp_mul3 + +.global fp_dec +fp_dec: + lea rdx, [rip + u512_1] + jmp fp_mul3 + +.global fp_mul3 +fp_mul3: + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + + push rdi + + mov rdi, rsi + mov rsi, rdx + + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + xor rbp, rbp + + /* flags are already cleared */ + +.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8 + + mov rdx, [rsi + 0] + mulx rcx, rdx, [rdi + 8*\k] + add rdx, \r0 + mulx rcx, rdx, [rip + .inv_min_p_mod_r] + + xor rax, rax /* clear flags */ + + mulx rbx, rax, [rip + p + 0] + adox \r0, rax + + mulx rcx, rax, [rip + p + 8] + adcx \r1, rbx + adox \r1, rax + + mulx rbx, rax, [rip + p + 16] + adcx \r2, rcx + adox \r2, rax + + mulx rcx, rax, [rip + p + 24] + adcx \r3, rbx + adox \r3, rax + + mulx rbx, rax, [rip + p + 32] + adcx \r4, rcx + adox \r4, rax + + mulx rcx, rax, [rip + p + 40] + adcx \r5, rbx + adox \r5, rax + + mulx rbx, rax, [rip + p + 48] + adcx \r6, rcx + adox \r6, rax + + mulx rcx, rax, [rip + p + 56] + adcx \r7, rbx + adox \r7, rax + + mov rax, 0 + adcx \r8, rcx + adox \r8, rax + + + mov rdx, [rdi + 8*\k] + + xor rax, rax /* clear flags */ + + mulx rbx, rax, [rsi + 0] + adox \r0, rax + + mulx rcx, rax, [rsi + 8] + adcx \r1, rbx + adox \r1, rax + + mulx rbx, rax, [rsi + 16] + adcx \r2, rcx + adox \r2, rax + + mulx rcx, rax, [rsi + 24] + adcx \r3, rbx + adox \r3, rax + + mulx rbx, rax, [rsi + 32] + adcx \r4, rcx + adox \r4, rax + + mulx rcx, rax, [rsi + 40] + adcx \r5, rbx + adox \r5, rax + + mulx rbx, rax, [rsi + 48] + adcx \r6, rcx + adox \r6, rax + + mulx rcx, rax, [rsi + 56] + adcx \r7, rbx + adox \r7, rax + + mov rax, 0 + adcx \r8, rcx + adox \r8, rax + +.endm + + MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp + MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8 + MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9 + MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10 + MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11 + MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12 + MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13 + MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14 + + pop rdi + + mov [rdi + 0], rbp + mov [rdi + 8], r8 + mov [rdi + 16], r9 + mov [rdi + 24], r10 + mov [rdi + 32], r11 + mov [rdi + 40], r12 + mov [rdi + 48], r13 + mov [rdi + 56], r14 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + jmp .reduce_once + +.global fp_mul2 +fp_mul2: + mov rdx, rdi + jmp fp_mul3 + +.global fp_sq2 +fp_sq2: + /* TODO implement optimized Montgomery squaring */ + mov rdx, rsi + jmp fp_mul3 + +.global fp_sq1 +fp_sq1: + mov rsi, rdi + jmp fp_sq2 + +/* (obviously) not constant time in the exponent! */ +.fp_pow: + push rbx + mov rbx, rsi + push r12 + push r13 + push rdi + sub rsp, 64 + + mov rsi, rdi + mov rdi, rsp + call fp_copy + + mov rdi, [rsp + 64] + lea rsi, [rip + fp_1] + call fp_copy + +.macro POWSTEP, k + mov r13, [rbx + 8*\k] + xor r12, r12 + + 0: + test r13, 1 + jz 1f + + mov rdi, [rsp + 64] + mov rsi, rsp + call fp_mul2 + + 1: + mov rdi, rsp + call fp_sq1 + + shr r13 + + inc r12 + test r12, 64 + jz 0b +.endm + + POWSTEP 0 + POWSTEP 1 + POWSTEP 2 + POWSTEP 3 + POWSTEP 4 + POWSTEP 5 + POWSTEP 6 + POWSTEP 7 + + add rsp, 64+8 + pop r13 + pop r12 + pop rbx + ret + +.section .rodata +.p_minus_2: + .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 + .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf + +.section .text + +/* TODO use a better addition chain? */ +.global fp_inv +fp_inv: + lea rsi, [rip + .p_minus_2] + jmp .fp_pow + +.section .rodata +.p_minus_1_halves: + .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83 + .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df + +.section .text + +/* TODO use a better addition chain? */ +.global fp_issquare +fp_issquare: + push rdi + lea rsi, [rip + .p_minus_1_halves] + call .fp_pow + pop rdi + + xor rax, rax + .set k, 0 + .rept 8 + mov rsi, [rdi + 8*k] + xor rsi, [rip + fp_1 + 8*k] + or rax, rsi + .set k, k+1 + .endr + test rax, rax + setz al + movzx rax, al + ret + + +/* not constant time (but this shouldn't leak anything of importance) */ +.global fp_random +fp_random: + + push rdi + mov rsi, 64 + call randombytes + pop rdi + mov rax, 1 + shl rax, (pbits % 64) + dec rax + and [rdi + 56], rax + + .set k, 7 + .rept 8 + mov rax, [rip + p + 8*k] + cmp [rdi + 8*k], rax + jge fp_random + jl 0f + .set k, k-1 + .endr + 0: + ret + diff --git a/reference/csidh-20180427-by-castryck-et-al/main b/reference/csidh-20180427-by-castryck-et-al/main new file mode 100755 index 0000000..d8e66de Binary files /dev/null and b/reference/csidh-20180427-by-castryck-et-al/main differ diff --git a/reference/csidh-20180427-by-castryck-et-al/main.c b/reference/csidh-20180427-by-castryck-et-al/main.c new file mode 100644 index 0000000..6649bfc --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/main.c @@ -0,0 +1,99 @@ + +#include +#include +#include +#include +#include +#include + +#include "u512.h" +#include "fp.h" +#include "mont.h" +#include "csidh.h" + +void u512_print(u512 const *x) +{ + for (size_t i = 63; i < 64; --i) + printf("%02hhx", i[(unsigned char *) x->c]); +} + +void fp_print(fp const *x) +{ + u512 y; + fp_dec(&y, x); + u512_print(&y); +} + +int main() +{ + clock_t t0, t1; + + private_key priv_alice, priv_bob; + public_key pub_alice, pub_bob; + public_key shared_alice, shared_bob; + + printf("\n"); + + + t0 = clock(); + csidh_private(&priv_alice); + t1 = clock(); + + printf("Alice's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + for (size_t i = 0; i < sizeof(priv_alice); ++i) + printf("%02hhx", i[(uint8_t *) &priv_alice]); + printf("\n\n"); + + t0 = clock(); + csidh_private(&priv_bob); + t1 = clock(); + + printf("Bob's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + for (size_t i = 0; i < sizeof(priv_bob); ++i) + printf("%02hhx", i[(uint8_t *) &priv_bob]); + printf("\n\n"); + + + t0 = clock(); + assert(csidh(&pub_alice, &base, &priv_alice)); + t1 = clock(); + + printf("Alice's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&pub_alice.A); + printf("\n\n"); + + t0 = clock(); + assert(csidh(&pub_bob, &base, &priv_bob)); + t1 = clock(); + + printf("Bob's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&pub_bob.A); + printf("\n\n"); + + + t0 = clock(); + assert(csidh(&shared_alice, &pub_bob, &priv_alice)); + t1 = clock(); + + printf("Alice's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&shared_alice.A); + printf("\n\n"); + + t0 = clock(); + assert(csidh(&shared_bob, &pub_alice, &priv_bob)); + t1 = clock(); + + printf("Bob's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&shared_bob.A); + printf("\n\n"); + + printf(" "); + if (memcmp(&shared_alice, &shared_bob, sizeof(public_key))) + printf("\x1b[31mNOT EQUAL!\x1b[0m\n"); + else + printf("\x1b[32mequal.\x1b[0m\n"); + printf("\n"); + + printf("\n"); +} + diff --git a/reference/csidh-20180427-by-castryck-et-al/mont.c b/reference/csidh-20180427-by-castryck-et-al/mont.c new file mode 100644 index 0000000..d2a6533 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/mont.c @@ -0,0 +1,188 @@ + +#include + +#include "mont.h" + +void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24) +{ + fp tmp0, tmp1, tmp2; + + fp_add3(&tmp0, &P->x, &P->z); + fp_sub3(&tmp1, &P->x, &P->z); + fp_sq2(&R->x, &tmp0); + fp_sub3(&tmp2, &Q->x, &Q->z); + fp_add3(&S->x, &Q->x, &Q->z); + fp_mul2(&tmp0, &tmp2); + fp_sq2(&R->z, &tmp1); + fp_mul2(&tmp1, &S->x); + fp_sub3(&tmp2, &R->x, &R->z); + fp_mul2(&R->z, &A24->z); + fp_mul2(&R->x, &R->z); + fp_mul3(&S->x, &A24->x, &tmp2); + fp_sub3(&S->z, &tmp0, &tmp1); + fp_add2(&R->z, &S->x); + fp_add3(&S->x, &tmp0, &tmp1); + fp_mul2(&R->z, &tmp2); + fp_sq1(&S->z); + fp_sq1(&S->x); + fp_mul2(&S->z, &PQ->x); + fp_mul2(&S->x, &PQ->z); +} + +void xDBL(proj *Q, proj const *A, proj const *P) +{ + fp a, b, c; + fp_add3(&a, &P->x, &P->z); + fp_sq1(&a); + fp_sub3(&b, &P->x, &P->z); + fp_sq1(&b); + fp_sub3(&c, &a, &b); + fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */ + fp_mul2(&b, &A->z); + fp_mul3(&Q->x, &a, &b); + fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */ + fp_add2(&a, &A->x); + fp_mul2(&a, &c); + fp_add2(&a, &b); + fp_mul3(&Q->z, &a, &c); +} + +void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ) +{ + fp a, b, c, d; + fp_add3(&a, &P->x, &P->z); + fp_sub3(&b, &P->x, &P->z); + fp_add3(&c, &Q->x, &Q->z); + fp_sub3(&d, &Q->x, &Q->z); + fp_mul2(&a, &d); + fp_mul2(&b, &c); + fp_add3(&c, &a, &b); + fp_sub3(&d, &a, &b); + fp_sq1(&c); + fp_sq1(&d); + fp_mul3(&S->x, &PQ->z, &c); + fp_mul3(&S->z, &PQ->x, &d); +} + +/* Montgomery ladder. */ +/* P must not be the unique point of order 2. */ +/* not constant-time! */ +void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k) +{ + proj R = *P; + proj A24; + const proj Pcopy = *P; /* in case Q = P */ + + Q->x = fp_1; + Q->z = fp_0; + + fp_add3(&A24.x, &A->z, &A->z); + fp_add3(&A24.z, &A24.x, &A24.x); + fp_add2(&A24.x, &A->x); + + unsigned long i = 512; + while (--i && !u512_bit(k, i)); + + do { + + bool bit = u512_bit(k, i); + + if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ + //fp_cswap(&Q->x, &R.x, bit); + //fp_cswap(&Q->z, &R.z, bit); + + xDBLADD(Q, &R, Q, &R, &Pcopy, &A24); + + if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ + //fp_cswap(&Q->x, &R.x, bit); + //fp_cswap(&Q->z, &R.z, bit); + + } while (i--); +} + + +/* computes the isogeny with kernel point K of order k */ +/* returns the new curve coefficient A and the image of P */ +/* (obviously) not constant time in k */ +void xISOG(proj *A, proj *P, proj const *K, uint64_t k) +{ + assert (k >= 3); + assert (k % 2 == 1); + + fp tmp0, tmp1; + fp T[4] = {K->z, K->x, K->x, K->z}; + proj Q; + + fp_mul3(&Q.x, &P->x, &K->x); + fp_mul3(&tmp0, &P->z, &K->z); + fp_sub2(&Q.x, &tmp0); + + fp_mul3(&Q.z, &P->x, &K->z); + fp_mul3(&tmp0, &P->z, &K->x); + fp_sub2(&Q.z, &tmp0); + + proj M[3] = {*K}; + xDBL(&M[1], A, K); + + for (uint64_t i = 1; i < k / 2; ++i) { + + if (i >= 2) + xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]); + + fp_mul3(&tmp0, &M[i % 3].x, &T[0]); + fp_mul3(&tmp1, &M[i % 3].z, &T[1]); + fp_add3(&T[0], &tmp0, &tmp1); + + fp_mul2(&T[1], &M[i % 3].x); + + fp_mul3(&tmp0, &M[i % 3].z, &T[2]); + fp_mul3(&tmp1, &M[i % 3].x, &T[3]); + fp_add3(&T[2], &tmp0, &tmp1); + + fp_mul2(&T[3], &M[i % 3].z); + + + fp_mul3(&tmp0, &P->x, &M[i % 3].x); + fp_mul3(&tmp1, &P->z, &M[i % 3].z); + fp_sub2(&tmp0, &tmp1); + fp_mul2(&Q.x, &tmp0); + + fp_mul3(&tmp0, &P->x, &M[i % 3].z); + fp_mul3(&tmp1, &P->z, &M[i % 3].x); + fp_sub2(&tmp0, &tmp1); + fp_mul2(&Q.z, &tmp0); + } + + fp_mul2(&T[0], &T[1]); + fp_add2(&T[0], &T[0]); /* multiplication by 2 */ + + fp_sq1(&T[1]); + + fp_mul2(&T[2], &T[3]); + fp_add2(&T[2], &T[2]); /* multiplication by 2 */ + + fp_sq1(&T[3]); + + /* Ax := T[1] * T[3] * Ax - 3 * Az * (T[1] * T[2] - T[0] * T[3]) */ + fp_mul3(&tmp0, &T[1], &T[2]); + fp_mul3(&tmp1, &T[0], &T[3]); + fp_sub2(&tmp0, &tmp1); + fp_mul2(&tmp0, &A->z); + fp_add3(&tmp1, &tmp0, &tmp0); fp_add2(&tmp0, &tmp1); /* multiplication by 3 */ + + fp_mul3(&tmp1, &T[1], &T[3]); + fp_mul2(&tmp1, &A->x); + + fp_sub3(&A->x, &tmp1, &tmp0); + + /* Az := Az * T[3]^2 */ + fp_sq1(&T[3]); + fp_mul2(&A->z, &T[3]); + + /* X := X * Xim^2, Z := Z * Zim^2 */ + fp_sq1(&Q.x); + fp_sq1(&Q.z); + fp_mul2(&P->x, &Q.x); + fp_mul2(&P->z, &Q.z); +} + diff --git a/reference/csidh-20180427-by-castryck-et-al/mont.h b/reference/csidh-20180427-by-castryck-et-al/mont.h new file mode 100644 index 0000000..36b3ad7 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/mont.h @@ -0,0 +1,19 @@ +#ifndef MONT_H +#define MONT_H + +#include "u512.h" +#include "fp.h" + +/* P^1 over fp. */ +typedef struct proj { + fp x; + fp z; +} proj; + +void xDBL(proj *Q, proj const *A, proj const *P); +void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ); +void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A); +void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k); +void xISOG(proj *A, proj *P, proj const *K, uint64_t k); + +#endif diff --git a/reference/csidh-20180427-by-castryck-et-al/rng.c b/reference/csidh-20180427-by-castryck-et-al/rng.c new file mode 100644 index 0000000..fc28c87 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/rng.c @@ -0,0 +1,18 @@ + +#include "rng.h" + +#include +#include +#include + +void randombytes(void *x, size_t l) +{ + static int fd = -1; + ssize_t n; + if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY))) + exit(1); + for (size_t i = 0; i < l; i += n) + if (0 >= (n = read(fd, (char *) x + i, l - i))) + exit(2); +} + diff --git a/reference/csidh-20180427-by-castryck-et-al/rng.h b/reference/csidh-20180427-by-castryck-et-al/rng.h new file mode 100644 index 0000000..30e01d0 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/rng.h @@ -0,0 +1,8 @@ +#ifndef RNG_H +#define RNG_H + +#include + +void randombytes(void *x, size_t l); + +#endif diff --git a/reference/csidh-20180427-by-castryck-et-al/supersingular.sage b/reference/csidh-20180427-by-castryck-et-al/supersingular.sage new file mode 100755 index 0000000..928f695 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/supersingular.sage @@ -0,0 +1,128 @@ +#!/usr/bin/env sage +#coding: utf8 + +proof.all(False) + + +# parameters. + +ls = list(primes(3, 374)) + [587] # Elkies primes +#ls = list(primes(3, 47)) + [97] # (a smaller example) +p = 4 * prod(ls) - 1 +assert is_prime(p) +print "\nElkies primes:", " ".join(map(str, ls)) + +max_exp = ceil((sqrt(p) ** (1/len(ls)) - 1) / 2) +assert (2 * max_exp + 1) ** len(ls) >= sqrt(p) +print "exponents are chosen in the range {}..{}.".format(-max_exp, max_exp) + +base = GF(p)(0) # Montgomery coefficient of starting curve + + +# helper functions. + +# NB: all the operations can be computed entirely over the prime field, +# but for simplicity of this implementation we will make use of curves +# defined over GF(p^2). note this slows everything down quite a bit. + +Fp2. = GF(p**2, modulus = x**2 + 1) + +def montgomery_curve(A): + return EllipticCurve(Fp2, [0, A, 0, 1, 0]) + +# sage's isogeny formulas return Weierstraß curves, hence we need this... +def montgomery_coefficient(E): + Ew = E.change_ring(GF(p)).short_weierstrass_model() + _, _, _, a, b = Ew.a_invariants() + R. = GF(p)[] + r = (z**3 + a*z + b).roots(multiplicities=False)[0] + s = sqrt(3 * r**2 + a) + if not is_square(s): s = -s + A = 3 * r / s + assert montgomery_curve(A).change_ring(GF(p)).is_isomorphic(Ew) + return GF(p)(A) + + +# actual implementation. + +def private(): + return [randrange(-max_exp, max_exp + 1) for _ in range(len(ls))] + +def validate(A): + while True: + k = 1 + P = montgomery_curve(A).lift_x(GF(p).random_element()) + for l in ls: + Q = (p + 1) // l * P + if not Q: continue + if l * Q: return False + k *= l + if k > 4 * sqrt(p): return True + +def action(pub, priv): + + E = montgomery_curve(pub) + es = priv[:] + + while any(es): + + E._order = (p + 1)**2 # else sage computes this + + P = E.lift_x(GF(p).random_element()) + s = +1 if P.xy()[1] in GF(p) else -1 + k = prod(l for l, e in zip(ls, es) if sign(e) == s) + P *= (p + 1) // k + + for i, (l, e) in enumerate(zip(ls, es)): + + if sign(e) != s: continue + + Q = k // l * P + if not Q: continue + Q._order = l # else sage computes this + phi = E.isogeny(Q) + + E, P = phi.codomain(), phi(P) + es[i] -= s + k //= l + + return montgomery_coefficient(E) + + +# example. + +print + +print "testing public-key validation on random ordinary curves (should be all 0s):\n ", +for _ in range(16): + while True: + A = GF(p).random_element() + if montgomery_curve(A).is_ordinary(): break + print int(validate(A)), +print + +privA = private() +print "\nAlice's private key:\n ", " ".join(map('{:2d}'.format, privA)) + +pubA = action(base, privA) +print "\nAlice's public key:\n ", pubA, +print " (valid: {})".format(int(validate(pubA))) + +privB = private() +print "\nBob's private key:\n ", " ".join(map('{:2d}'.format, privB)) + +pubB = action(base, privB) +print "\nBob's public key:\n ", pubB, +print " (valid: {})".format(int(validate(pubB))) + +sharedA = action(pubB, privA) +print "\nAlice's shared secret:\n ", sharedA + +sharedB = action(pubA, privB) +print "\nBob's shared secret:\n ", sharedB + +if sharedA == sharedB: + print "\n--> equal!\n" +else: + print "\n--> NOT EQUAL?!\n" + diff --git a/reference/csidh-20180427-by-castryck-et-al/u512.h b/reference/csidh-20180427-by-castryck-et-al/u512.h new file mode 100644 index 0000000..6eaae93 --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/u512.h @@ -0,0 +1,22 @@ +#ifndef UINT_H +#define UINT_H + +#include +#include + +typedef struct u512 { + uint64_t c[8]; +} u512; + +extern const u512 u512_1; + +void u512_set(u512 *x, uint64_t y); + +bool u512_bit(u512 const *x, uint64_t k); + +bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */ +bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */ + +void u512_mul3_64(u512 *x, u512 const *y, uint64_t z); + +#endif diff --git a/reference/csidh-20180427-by-castryck-et-al/u512.s b/reference/csidh-20180427-by-castryck-et-al/u512.s new file mode 100644 index 0000000..779be3e --- /dev/null +++ b/reference/csidh-20180427-by-castryck-et-al/u512.s @@ -0,0 +1,102 @@ + +.intel_syntax noprefix + +.section .rodata + +.global u512_1 +u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0 + + +.section .text + +.global u512_set +u512_set: + cld + mov rax, rsi + stosq + xor rax, rax + mov rcx, 7 + rep stosq + ret + + +.global u512_bit +u512_bit: + mov rcx, rsi + and rcx, 0x3f + shr rsi, 6 + mov rax, [rdi + 8*rsi] + shr rax, cl + and rax, 1 + ret + + +.global u512_add3 +u512_add3: + mov rax, [rsi + 0] + add rax, [rdx + 0] + mov [rdi + 0], rax + .set k, 1 + .rept 7 + mov rax, [rsi + 8*k] + adc rax, [rdx + 8*k] + mov [rdi + 8*k], rax + .set k, k+1 + .endr + setc al + movzx rax, al + ret + +.global u512_sub3 +u512_sub3: + mov rax, [rsi + 0] + sub rax, [rdx + 0] + mov [rdi + 0], rax + .set k, 1 + .rept 7 + mov rax, [rsi + 8*k] + sbb rax, [rdx + 8*k] + mov [rdi + 8*k], rax + .set k, k+1 + .endr + setc al + movzx rax, al + ret + + +.global u512_mul3_64 +u512_mul3_64: + + mulx r10, rax, [rsi + 0] + mov [rdi + 0], rax + + mulx r11, rax, [rsi + 8] + add rax, r10 + mov [rdi + 8], rax + + mulx r10, rax, [rsi + 16] + adcx rax, r11 + mov [rdi + 16], rax + + mulx r11, rax, [rsi + 24] + adcx rax, r10 + mov [rdi + 24], rax + + mulx r10, rax, [rsi + 32] + adcx rax, r11 + mov [rdi + 32],rax + + mulx r11, rax, [rsi + 40] + adcx rax, r10 + mov [rdi + 40],rax + + mulx r10, rax, [rsi + 48] + adcx rax, r11 + mov [rdi + 48],rax + + mulx r11, rax, [rsi + 56] + adcx rax, r10 + mov [rdi + 56],rax + + ret +