reference implementation by Castryck et al

2018-09-18 17:43:39 +02:00 · 2018-09-18 17:43:39 +02:00 · 08d5c77d3a
--- a/reference/csidh-20180427-by-castryck-et-al/Makefile
+++ b/reference/csidh-20180427-by-castryck-et-al/Makefile
@ -0,0 +1,27 @@
+all:
+	@cc \
+		-std=c99 -pedantic \
+		-Wall -Wextra \
+		-O2 -funroll-loops \
+		rng.c \
+		u512.s fp.s \
+		mont.c \
+		csidh.c \
+		main.c \
+		-o main
+
+debug:
+	cc \
+		-std=c99 -pedantic \
+		-Wall -Wextra \
+		-g \
+		rng.c \
+		u512.s fp.s \
+		mont.c \
+		csidh.c \
+		main.c \
+		-o main
+
+clean:
+	rm -f main
+
--- a/reference/csidh-20180427-by-castryck-et-al/bench.c
+++ b/reference/csidh-20180427-by-castryck-et-al/bench.c
@ -0,0 +1,54 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+
+#include "u512.h"
+#include "fp.h"
+#include "mont.h"
+#include "csidh.h"
+
+#include <inttypes.h>
+
+static __inline__ uint64_t rdtsc(void)
+{
+    uint32_t hi, lo;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return lo | (uint64_t) hi << 32;
+}
+
+unsigned long its = 10000;
+
+int main()
+{
+    clock_t t0, t1, time = 0;
+    uint64_t c0, c1, cycles = 0;
+
+    private_key priv;
+    public_key pub = base;
+
+    for (unsigned long i = 0; i < its; ++i) {
+
+        csidh_private(&priv);
+
+        t0 = clock();
+        c0 = rdtsc();
+
+        /**************************************/
+        assert(validate(&pub));
+        action(&pub, &pub, &priv);
+        /**************************************/
+
+        c1 = rdtsc();
+        t1 = clock();
+        cycles += c1 - c0;
+        time += t1 - t0;
+    }
+
+    printf("iterations: %lu\n", its);
+    printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its);
+    printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its);
+}
+
--- a/reference/csidh-20180427-by-castryck-et-al/csidh.c
+++ b/reference/csidh-20180427-by-castryck-et-al/csidh.c
@ -0,0 +1,220 @@
+
+#include <string.h>
+#include <assert.h>
+
+#include "csidh.h"
+#include "rng.h"
+
+/* specific to p, should perhaps be somewhere else */
+const unsigned primes[num_primes] = {
+      3,   5,   7,  11,  13,  17,  19,  23,  29,  31,  37,  41,  43,  47,  53,  59,
+     61,  67,  71,  73,  79,  83,  89,  97, 101, 103, 107, 109, 113, 127, 131, 137,
+    139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227,
+    229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313,
+    317, 331, 337, 347, 349, 353, 359, 367, 373, 587,
+};
+
+const u512 four_sqrt_p = {{
+    0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964,
+}};
+
+
+const public_key base = {0}; /* A = 0 */
+
+void csidh_private(private_key *priv)
+{
+    memset(&priv->e, 0, sizeof(priv->e));
+    for (size_t i = 0; i < num_primes; ) {
+        int8_t buf[64];
+        randombytes(buf, sizeof(buf));
+        for (size_t j = 0; j < sizeof(buf); ++j) {
+            if (buf[j] <= max_exponent && buf[j] >= -max_exponent) {
+                priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4;
+                if (++i >= num_primes)
+                    break;
+            }
+        }
+    }
+}
+
+/* compute [(p+1)/l] P for all l in our list of primes. */
+/* divide and conquer is much faster than doing it naively,
+ * but uses more memory. */
+static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper)
+{
+    assert(lower < upper);
+
+    if (upper - lower == 1)
+        return;
+
+    size_t mid = lower + (upper - lower + 1) / 2;
+
+    u512 cl = u512_1, cu = u512_1;
+    for (size_t i = lower; i < mid; ++i)
+        u512_mul3_64(&cu, &cu, primes[i]);
+    for (size_t i = mid; i < upper; ++i)
+        u512_mul3_64(&cl, &cl, primes[i]);
+
+    xMUL(&P[mid], A, &P[lower], &cu);
+    xMUL(&P[lower], A, &P[lower], &cl);
+
+    cofactor_multiples(P, A, lower, mid);
+    cofactor_multiples(P, A, mid, upper);
+}
+
+/* never accepts invalid keys. */
+bool validate(public_key const *in)
+{
+    const proj A = {in->A, fp_1};
+
+    do {
+
+        proj P[num_primes];
+        fp_random(&P->x);
+        P->z = fp_1;
+
+        /* maximal 2-power in p+1 */
+        xDBL(P, &A, P);
+        xDBL(P, &A, P);
+
+        cofactor_multiples(P, &A, 0, num_primes);
+
+        u512 order = u512_1;
+
+        for (size_t i = num_primes - 1; i < num_primes; --i) {
+
+            /* we only gain information if [(p+1)/l] P is non-zero */
+            if (memcmp(&P[i].z, &fp_0, sizeof(fp))) {
+
+                u512 tmp;
+                u512_set(&tmp, primes[i]);
+                xMUL(&P[i], &A, &P[i], &tmp);
+
+                if (memcmp(&P[i].z, &fp_0, sizeof(fp)))
+                    /* P does not have order dividing p+1. */
+                    return false;
+
+                u512_mul3_64(&order, &order, primes[i]);
+
+                if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */
+                    /* order > 4 sqrt(p), hence definitely supersingular */
+                    return true;
+            }
+        }
+
+    /* P didn't have big enough order to prove supersingularity. */
+    } while (1);
+}
+
+/* compute x^3 + Ax^2 + x */
+static void montgomery_rhs(fp *rhs, fp const *A, fp const *x)
+{
+    fp tmp;
+    *rhs = *x;
+    fp_sq1(rhs);
+    fp_mul3(&tmp, A, x);
+    fp_add2(rhs, &tmp);
+    fp_add2(rhs, &fp_1);
+    fp_mul2(rhs, x);
+}
+
+/* totally not constant-time. */
+void action(public_key *out, public_key const *in, private_key const *priv)
+{
+    u512 k[2];
+    u512_set(&k[0], 4); /* maximal 2-power in p+1 */
+    u512_set(&k[1], 4); /* maximal 2-power in p+1 */
+
+    uint8_t e[2][num_primes];
+
+    for (size_t i = 0; i < num_primes; ++i) {
+
+        int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4;
+
+        if (t > 0) {
+            e[0][i] = t;
+            e[1][i] = 0;
+            u512_mul3_64(&k[1], &k[1], primes[i]);
+        }
+        else if (t < 0) {
+            e[1][i] = -t;
+            e[0][i] = 0;
+            u512_mul3_64(&k[0], &k[0], primes[i]);
+        }
+        else {
+            e[0][i] = 0;
+            e[1][i] = 0;
+            u512_mul3_64(&k[0], &k[0], primes[i]);
+            u512_mul3_64(&k[1], &k[1], primes[i]);
+        }
+    }
+
+    proj A = {in->A, fp_1};
+
+    bool done[2] = {false, false};
+
+    do {
+
+        assert(!memcmp(&A.z, &fp_1, sizeof(fp)));
+
+        proj P;
+        fp_random(&P.x);
+        P.z = fp_1;
+
+        fp rhs;
+        montgomery_rhs(&rhs, &A.x, &P.x);
+        bool sign = !fp_issquare(&rhs);
+
+        if (done[sign])
+            continue;
+
+        xMUL(&P, &A, &P, &k[sign]);
+
+        done[sign] = true;
+
+        for (size_t i = 0; i < num_primes; ++i) {
+
+            if (e[sign][i]) {
+
+                u512 cof = u512_1;
+                for (size_t j = i + 1; j < num_primes; ++j)
+                    if (e[sign][j])
+                        u512_mul3_64(&cof, &cof, primes[j]);
+
+                proj K;
+                xMUL(&K, &A, &P, &cof);
+
+                if (memcmp(&K.z, &fp_0, sizeof(fp))) {
+
+                    xISOG(&A, &P, &K, primes[i]);
+
+                    if (!--e[sign][i])
+                        u512_mul3_64(&k[sign], &k[sign], primes[i]);
+
+                }
+
+            }
+
+            done[sign] &= !e[sign][i];
+        }
+
+        fp_inv(&A.z);
+        fp_mul2(&A.x, &A.z);
+        A.z = fp_1;
+
+    } while (!(done[0] && done[1]));
+
+    out->A = A.x;
+}
+
+/* includes public-key validation. */
+bool csidh(public_key *out, public_key const *in, private_key const *priv)
+{
+    if (!validate(in)) {
+        fp_random(&out->A);
+        return false;
+    }
+    action(out, in, priv);
+    return true;
+}
+
--- a/reference/csidh-20180427-by-castryck-et-al/csidh.h
+++ b/reference/csidh-20180427-by-castryck-et-al/csidh.h
@ -0,0 +1,26 @@
+#ifndef CSIDH_H
+#define CSIDH_H
+
+#include "u512.h"
+#include "fp.h"
+#include "mont.h"
+
+/* specific to p, should perhaps be somewhere else */
+#define num_primes 74
+#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */
+
+
+typedef struct private_key {
+    int8_t e[(num_primes + 1) / 2]; /* packed int4_t */
+} private_key;
+
+typedef struct public_key {
+    fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */
+} public_key;
+
+extern const public_key base;
+
+void csidh_private(private_key *priv);
+bool csidh(public_key *out, public_key const *in, private_key const *priv);
+
+#endif
--- a/reference/csidh-20180427-by-castryck-et-al/fp.h
+++ b/reference/csidh-20180427-by-castryck-et-al/fp.h
@ -0,0 +1,37 @@
+#ifndef FP_H
+#define FP_H
+
+#include "u512.h"
+
+/* fp is in the Montgomery domain, so interpreting that
+   as an integer should never make sense.
+   enable compiler warnings when mixing up u512 and fp. */
+typedef struct fp {
+    u512 x;
+} fp;
+
+extern const fp fp_0;
+extern const fp fp_1;
+
+void fp_set(fp *x, uint64_t y);
+void fp_cswap(fp *x, fp *y, bool c);
+
+void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */
+void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */
+
+void fp_add2(fp *x, fp const *y);
+void fp_sub2(fp *x, fp const *y);
+void fp_mul2(fp *x, fp const *y);
+
+void fp_add3(fp *x, fp const *y, fp const *z);
+void fp_sub3(fp *x, fp const *y, fp const *z);
+void fp_mul3(fp *x, fp const *y, fp const *z);
+
+void fp_sq1(fp *x);
+void fp_sq2(fp *x, fp const *y);
+void fp_inv(fp *x);
+bool fp_issquare(fp const *x);
+
+void fp_random(fp *x);
+
+#endif
--- a/reference/csidh-20180427-by-castryck-et-al/fp.s
+++ b/reference/csidh-20180427-by-castryck-et-al/fp.s
@ -0,0 +1,452 @@
+
+.intel_syntax noprefix
+
+.section .rodata
+
+.set pbits, 511
+p:
+    .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
+    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
+
+
+.global fp_0
+fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0
+
+.global fp_1
+fp_1: /* 2^512 mod p */
+    .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
+    .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80
+
+
+/* (2^512)^2 mod p */
+.r_squared_mod_p:
+    .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
+    .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e
+
+/* -p^-1 mod 2^64 */
+.inv_min_p_mod_r:
+    .quad 0x66c1301f632e294d
+
+
+.section .text
+
+.global fp_copy
+fp_copy:
+    cld
+    mov rcx, 8
+    rep movsq
+    ret
+
+.global fp_set
+fp_set:
+    push rdi
+    call u512_set
+    pop rdi
+    mov rsi, rdi
+    jmp fp_enc
+
+.global fp_cswap
+fp_cswap:
+    movzx rax, dl
+    neg rax
+    .set k, 0
+    .rept 8
+        mov rcx, [rdi + 8*k]
+        mov rdx, [rsi + 8*k]
+
+        mov r8, rcx
+        xor r8, rdx
+        and r8, rax
+
+        xor rcx, r8
+        xor rdx, r8
+
+        mov [rdi + 8*k], rcx
+        mov [rsi + 8*k], rdx
+
+        .set k, k+1
+    .endr
+    ret
+
+.reduce_once:
+    push rbp
+    mov rbp, rdi
+
+    mov rdi, [rbp +  0]
+    sub rdi, [rip + p +  0]
+    mov rsi, [rbp +  8]
+    sbb rsi, [rip + p +  8]
+    mov rdx, [rbp + 16]
+    sbb rdx, [rip + p + 16]
+    mov rcx, [rbp + 24]
+    sbb rcx, [rip + p + 24]
+    mov r8,  [rbp + 32]
+    sbb r8,  [rip + p + 32]
+    mov r9,  [rbp + 40]
+    sbb r9,  [rip + p + 40]
+    mov r10, [rbp + 48]
+    sbb r10, [rip + p + 48]
+    mov r11, [rbp + 56]
+    sbb r11, [rip + p + 56]
+
+    setnc al
+    movzx rax, al
+    neg rax
+
+.macro cswap2, r, m
+    xor \r, \m
+    and \r, rax
+    xor \m, \r
+.endm
+
+    cswap2 rdi, [rbp +  0]
+    cswap2 rsi, [rbp +  8]
+    cswap2 rdx, [rbp + 16]
+    cswap2 rcx, [rbp + 24]
+    cswap2 r8,  [rbp + 32]
+    cswap2 r9,  [rbp + 40]
+    cswap2 r10, [rbp + 48]
+    cswap2 r11, [rbp + 56]
+
+    pop rbp
+    ret
+
+.global fp_add3
+fp_add3:
+    push rdi
+    call u512_add3
+    pop rdi
+    jmp .reduce_once
+
+.global fp_add2
+fp_add2:
+    mov rdx, rdi
+    jmp fp_add3
+
+.global fp_sub3
+fp_sub3:
+    push rdi
+    call u512_sub3
+    pop rdi
+    xor rsi, rsi
+    xor rdx, rdx
+    xor rcx, rcx
+    xor r8, r8
+    xor r9, r9
+    xor r10, r10
+    xor r11, r11
+    test rax, rax
+    cmovnz rax, [rip + p +  0]
+    cmovnz rsi, [rip + p +  8]
+    cmovnz rdx, [rip + p + 16]
+    cmovnz rcx, [rip + p + 24]
+    cmovnz r8,  [rip + p + 32]
+    cmovnz r9,  [rip + p + 40]
+    cmovnz r10, [rip + p + 48]
+    cmovnz r11, [rip + p + 56]
+    add [rdi +  0], rax
+    adc [rdi +  8], rsi
+    adc [rdi + 16], rdx
+    adc [rdi + 24], rcx
+    adc [rdi + 32],  r8
+    adc [rdi + 40],  r9
+    adc [rdi + 48], r10
+    adc [rdi + 56], r11
+    ret
+
+.global fp_sub2
+fp_sub2:
+    mov rdx, rdi
+    xchg rsi, rdx
+    jmp fp_sub3
+
+
+/* Montgomery arithmetic */
+
+.global fp_enc
+fp_enc:
+    lea rdx, [rip + .r_squared_mod_p]
+    jmp fp_mul3
+
+.global fp_dec
+fp_dec:
+    lea rdx, [rip + u512_1]
+    jmp fp_mul3
+
+.global fp_mul3
+fp_mul3:
+    push rbp
+    push rbx
+    push r12
+    push r13
+    push r14
+    push r15
+
+    push rdi
+
+    mov rdi, rsi
+    mov rsi, rdx
+
+    xor r8,  r8
+    xor r9,  r9
+    xor r10, r10
+    xor r11, r11
+    xor r12, r12
+    xor r13, r13
+    xor r14, r14
+    xor r15, r15
+    xor rbp, rbp
+
+    /* flags are already cleared */
+
+.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8
+
+    mov rdx, [rsi +  0]
+    mulx rcx, rdx, [rdi + 8*\k]
+    add rdx, \r0
+    mulx rcx, rdx, [rip + .inv_min_p_mod_r]
+
+    xor rax, rax /* clear flags */
+
+    mulx rbx, rax, [rip + p +  0]
+    adox \r0, rax
+
+    mulx rcx, rax, [rip + p +  8]
+    adcx \r1, rbx
+    adox \r1, rax
+
+    mulx rbx, rax, [rip + p + 16]
+    adcx \r2, rcx
+    adox \r2, rax
+
+    mulx rcx, rax, [rip + p + 24]
+    adcx \r3, rbx
+    adox \r3, rax
+
+    mulx rbx, rax, [rip + p + 32]
+    adcx \r4, rcx
+    adox \r4, rax
+
+    mulx rcx, rax, [rip + p + 40]
+    adcx \r5, rbx
+    adox \r5, rax
+
+    mulx rbx, rax, [rip + p + 48]
+    adcx \r6, rcx
+    adox \r6, rax
+
+    mulx rcx, rax, [rip + p + 56]
+    adcx \r7, rbx
+    adox \r7, rax
+
+    mov rax, 0
+    adcx \r8, rcx
+    adox \r8, rax
+
+
+    mov rdx, [rdi + 8*\k]
+
+    xor rax, rax /* clear flags */
+
+    mulx rbx, rax, [rsi +  0]
+    adox \r0, rax
+
+    mulx rcx, rax, [rsi +  8]
+    adcx \r1, rbx
+    adox \r1, rax
+
+    mulx rbx, rax, [rsi + 16]
+    adcx \r2, rcx
+    adox \r2, rax
+
+    mulx rcx, rax, [rsi + 24]
+    adcx \r3, rbx
+    adox \r3, rax
+
+    mulx rbx, rax, [rsi + 32]
+    adcx \r4, rcx
+    adox \r4, rax
+
+    mulx rcx, rax, [rsi + 40]
+    adcx \r5, rbx
+    adox \r5, rax
+
+    mulx rbx, rax, [rsi + 48]
+    adcx \r6, rcx
+    adox \r6, rax
+
+    mulx rcx, rax, [rsi + 56]
+    adcx \r7, rbx
+    adox \r7, rax
+
+    mov rax, 0
+    adcx \r8, rcx
+    adox \r8, rax
+
+.endm
+
+    MULSTEP 0, r8,  r9,  r10, r11, r12, r13, r14, r15, rbp
+    MULSTEP 1, r9,  r10, r11, r12, r13, r14, r15, rbp, r8
+    MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8,  r9
+    MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8,  r9,  r10
+    MULSTEP 4, r12, r13, r14, r15, rbp, r8,  r9,  r10, r11
+    MULSTEP 5, r13, r14, r15, rbp, r8,  r9,  r10, r11, r12
+    MULSTEP 6, r14, r15, rbp, r8,  r9,  r10, r11, r12, r13
+    MULSTEP 7, r15, rbp, r8,  r9,  r10, r11, r12, r13, r14
+
+    pop rdi
+
+    mov [rdi +  0], rbp
+    mov [rdi +  8], r8
+    mov [rdi + 16], r9
+    mov [rdi + 24], r10
+    mov [rdi + 32], r11
+    mov [rdi + 40], r12
+    mov [rdi + 48], r13
+    mov [rdi + 56], r14
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+    pop rbp
+    jmp .reduce_once
+
+.global fp_mul2
+fp_mul2:
+    mov rdx, rdi
+    jmp fp_mul3
+
+.global fp_sq2
+fp_sq2:
+    /* TODO implement optimized Montgomery squaring */
+    mov rdx, rsi
+    jmp fp_mul3
+
+.global fp_sq1
+fp_sq1:
+    mov rsi, rdi
+    jmp fp_sq2
+
+/* (obviously) not constant time in the exponent! */
+.fp_pow:
+    push rbx
+    mov rbx, rsi
+    push r12
+    push r13
+    push rdi
+    sub rsp, 64
+
+    mov rsi, rdi
+    mov rdi, rsp
+    call fp_copy
+
+    mov rdi, [rsp + 64]
+    lea rsi, [rip + fp_1]
+    call fp_copy
+
+.macro POWSTEP, k
+        mov r13, [rbx + 8*\k]
+        xor r12, r12
+
+        0:
+        test r13, 1
+        jz 1f
+
+        mov rdi, [rsp + 64]
+        mov rsi, rsp
+        call fp_mul2
+
+        1:
+        mov rdi, rsp
+        call fp_sq1
+
+        shr r13
+
+        inc r12
+        test r12, 64
+        jz 0b
+.endm
+
+    POWSTEP 0
+    POWSTEP 1
+    POWSTEP 2
+    POWSTEP 3
+    POWSTEP 4
+    POWSTEP 5
+    POWSTEP 6
+    POWSTEP 7
+
+    add rsp, 64+8
+    pop r13
+    pop r12
+    pop rbx
+    ret
+
+.section .rodata
+.p_minus_2:
+    .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
+    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
+
+.section .text
+
+/* TODO use a better addition chain? */
+.global fp_inv
+fp_inv:
+    lea rsi, [rip + .p_minus_2]
+    jmp .fp_pow
+
+.section .rodata
+.p_minus_1_halves:
+    .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83
+    .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df
+
+.section .text
+
+/* TODO use a better addition chain? */
+.global fp_issquare
+fp_issquare:
+    push rdi
+    lea rsi, [rip + .p_minus_1_halves]
+    call .fp_pow
+    pop rdi
+
+    xor rax, rax
+    .set k, 0
+    .rept 8
+        mov rsi, [rdi + 8*k]
+        xor rsi, [rip + fp_1 + 8*k]
+        or rax, rsi
+        .set k, k+1
+    .endr
+    test rax, rax
+    setz al
+    movzx rax, al
+    ret
+
+
+/* not constant time (but this shouldn't leak anything of importance) */
+.global fp_random
+fp_random:
+
+    push rdi
+    mov rsi, 64
+    call randombytes
+    pop rdi
+    mov rax, 1
+    shl rax, (pbits % 64)
+    dec rax
+    and [rdi + 56], rax
+
+    .set k, 7
+    .rept 8
+        mov rax, [rip + p + 8*k]
+        cmp [rdi + 8*k], rax
+        jge fp_random
+        jl 0f
+        .set k, k-1
+    .endr
+    0:
+    ret
+
--- a/reference/csidh-20180427-by-castryck-et-al/main
+++ b/reference/csidh-20180427-by-castryck-et-al/main
--- a/reference/csidh-20180427-by-castryck-et-al/main.c
+++ b/reference/csidh-20180427-by-castryck-et-al/main.c
@ -0,0 +1,99 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+
+#include "u512.h"
+#include "fp.h"
+#include "mont.h"
+#include "csidh.h"
+
+void u512_print(u512 const *x)
+{
+    for (size_t i = 63; i < 64; --i)
+        printf("%02hhx", i[(unsigned char *) x->c]);
+}
+
+void fp_print(fp const *x)
+{
+    u512 y;
+    fp_dec(&y, x);
+    u512_print(&y);
+}
+
+int main()
+{
+    clock_t t0, t1;
+
+    private_key priv_alice, priv_bob;
+    public_key pub_alice, pub_bob;
+    public_key shared_alice, shared_bob;
+
+    printf("\n");
+
+
+    t0 = clock();
+    csidh_private(&priv_alice);
+    t1 = clock();
+
+    printf("Alice's private key   (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    for (size_t i = 0; i < sizeof(priv_alice); ++i)
+        printf("%02hhx", i[(uint8_t *) &priv_alice]);
+    printf("\n\n");
+
+    t0 = clock();
+    csidh_private(&priv_bob);
+    t1 = clock();
+
+    printf("Bob's private key     (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    for (size_t i = 0; i < sizeof(priv_bob); ++i)
+        printf("%02hhx", i[(uint8_t *) &priv_bob]);
+    printf("\n\n");
+
+
+    t0 = clock();
+    assert(csidh(&pub_alice, &base, &priv_alice));
+    t1 = clock();
+
+    printf("Alice's public key    (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&pub_alice.A);
+    printf("\n\n");
+
+    t0 = clock();
+    assert(csidh(&pub_bob, &base, &priv_bob));
+    t1 = clock();
+
+    printf("Bob's public key      (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&pub_bob.A);
+    printf("\n\n");
+
+
+    t0 = clock();
+    assert(csidh(&shared_alice, &pub_bob, &priv_alice));
+    t1 = clock();
+
+    printf("Alice's shared secret (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&shared_alice.A);
+    printf("\n\n");
+
+    t0 = clock();
+    assert(csidh(&shared_bob, &pub_alice, &priv_bob));
+    t1 = clock();
+
+    printf("Bob's shared secret   (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&shared_bob.A);
+    printf("\n\n");
+
+    printf("    ");
+    if (memcmp(&shared_alice, &shared_bob, sizeof(public_key)))
+        printf("\x1b[31mNOT EQUAL!\x1b[0m\n");
+    else
+        printf("\x1b[32mequal.\x1b[0m\n");
+    printf("\n");
+
+    printf("\n");
+}
+
--- a/reference/csidh-20180427-by-castryck-et-al/mont.c
+++ b/reference/csidh-20180427-by-castryck-et-al/mont.c
@ -0,0 +1,188 @@
+
+#include <assert.h>
+
+#include "mont.h"
+
+void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24)
+{
+    fp tmp0, tmp1, tmp2;
+
+    fp_add3(&tmp0, &P->x, &P->z);
+    fp_sub3(&tmp1, &P->x, &P->z);
+    fp_sq2(&R->x, &tmp0);
+    fp_sub3(&tmp2, &Q->x, &Q->z);
+    fp_add3(&S->x, &Q->x, &Q->z);
+    fp_mul2(&tmp0, &tmp2);
+    fp_sq2(&R->z, &tmp1);
+    fp_mul2(&tmp1, &S->x);
+    fp_sub3(&tmp2, &R->x, &R->z);
+    fp_mul2(&R->z, &A24->z);
+    fp_mul2(&R->x, &R->z);
+    fp_mul3(&S->x, &A24->x, &tmp2);
+    fp_sub3(&S->z, &tmp0, &tmp1);
+    fp_add2(&R->z, &S->x);
+    fp_add3(&S->x, &tmp0, &tmp1);
+    fp_mul2(&R->z, &tmp2);
+    fp_sq1(&S->z);
+    fp_sq1(&S->x);
+    fp_mul2(&S->z, &PQ->x);
+    fp_mul2(&S->x, &PQ->z);
+}
+
+void xDBL(proj *Q, proj const *A, proj const *P)
+{
+    fp a, b, c;
+    fp_add3(&a, &P->x, &P->z);
+    fp_sq1(&a);
+    fp_sub3(&b, &P->x, &P->z);
+    fp_sq1(&b);
+    fp_sub3(&c, &a, &b);
+    fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */
+    fp_mul2(&b, &A->z);
+    fp_mul3(&Q->x, &a, &b);
+    fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */
+    fp_add2(&a, &A->x);
+    fp_mul2(&a, &c);
+    fp_add2(&a, &b);
+    fp_mul3(&Q->z, &a, &c);
+}
+
+void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ)
+{
+    fp a, b, c, d;
+    fp_add3(&a, &P->x, &P->z);
+    fp_sub3(&b, &P->x, &P->z);
+    fp_add3(&c, &Q->x, &Q->z);
+    fp_sub3(&d, &Q->x, &Q->z);
+    fp_mul2(&a, &d);
+    fp_mul2(&b, &c);
+    fp_add3(&c, &a, &b);
+    fp_sub3(&d, &a, &b);
+    fp_sq1(&c);
+    fp_sq1(&d);
+    fp_mul3(&S->x, &PQ->z, &c);
+    fp_mul3(&S->z, &PQ->x, &d);
+}
+
+/* Montgomery ladder. */
+/* P must not be the unique point of order 2. */
+/* not constant-time! */
+void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k)
+{
+    proj R = *P;
+    proj A24;
+    const proj Pcopy = *P; /* in case Q = P */
+
+    Q->x = fp_1;
+    Q->z = fp_0;
+
+    fp_add3(&A24.x, &A->z, &A->z);
+    fp_add3(&A24.z, &A24.x, &A24.x);
+    fp_add2(&A24.x, &A->x);
+
+    unsigned long i = 512;
+    while (--i && !u512_bit(k, i));
+
+    do {
+
+        bool bit = u512_bit(k, i);
+
+        if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
+        //fp_cswap(&Q->x, &R.x, bit);
+        //fp_cswap(&Q->z, &R.z, bit);
+
+        xDBLADD(Q, &R, Q, &R, &Pcopy, &A24);
+
+        if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
+        //fp_cswap(&Q->x, &R.x, bit);
+        //fp_cswap(&Q->z, &R.z, bit);
+
+    } while (i--);
+}
+
+
+/* computes the isogeny with kernel point K of order k */
+/* returns the new curve coefficient A and the image of P */
+/* (obviously) not constant time in k */
+void xISOG(proj *A, proj *P, proj const *K, uint64_t k)
+{
+    assert (k >= 3);
+    assert (k % 2 == 1);
+
+    fp tmp0, tmp1;
+    fp T[4] = {K->z, K->x, K->x, K->z};
+    proj Q;
+
+    fp_mul3(&Q.x,  &P->x, &K->x);
+    fp_mul3(&tmp0, &P->z, &K->z);
+    fp_sub2(&Q.x,  &tmp0);
+
+    fp_mul3(&Q.z,  &P->x, &K->z);
+    fp_mul3(&tmp0, &P->z, &K->x);
+    fp_sub2(&Q.z,  &tmp0);
+
+    proj M[3] = {*K};
+    xDBL(&M[1], A, K);
+
+    for (uint64_t i = 1; i < k / 2; ++i) {
+
+        if (i >= 2)
+            xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]);
+
+        fp_mul3(&tmp0, &M[i % 3].x, &T[0]);
+        fp_mul3(&tmp1, &M[i % 3].z, &T[1]);
+        fp_add3(&T[0], &tmp0, &tmp1);
+
+        fp_mul2(&T[1], &M[i % 3].x);
+
+        fp_mul3(&tmp0, &M[i % 3].z, &T[2]);
+        fp_mul3(&tmp1, &M[i % 3].x, &T[3]);
+        fp_add3(&T[2], &tmp0, &tmp1);
+
+        fp_mul2(&T[3], &M[i % 3].z);
+
+
+        fp_mul3(&tmp0, &P->x, &M[i % 3].x);
+        fp_mul3(&tmp1, &P->z, &M[i % 3].z);
+        fp_sub2(&tmp0, &tmp1);
+        fp_mul2(&Q.x,  &tmp0);
+
+        fp_mul3(&tmp0, &P->x, &M[i % 3].z);
+        fp_mul3(&tmp1, &P->z, &M[i % 3].x);
+        fp_sub2(&tmp0, &tmp1);
+        fp_mul2(&Q.z,  &tmp0);
+    }
+
+    fp_mul2(&T[0], &T[1]);
+    fp_add2(&T[0], &T[0]); /* multiplication by 2 */
+
+    fp_sq1(&T[1]);
+
+    fp_mul2(&T[2], &T[3]);
+    fp_add2(&T[2], &T[2]); /* multiplication by 2 */
+
+    fp_sq1(&T[3]);
+
+    /* Ax := T[1] * T[3] * Ax - 3 * Az * (T[1] * T[2] - T[0] * T[3]) */
+    fp_mul3(&tmp0, &T[1], &T[2]);
+    fp_mul3(&tmp1, &T[0], &T[3]);
+    fp_sub2(&tmp0, &tmp1);
+    fp_mul2(&tmp0, &A->z);
+    fp_add3(&tmp1, &tmp0, &tmp0); fp_add2(&tmp0, &tmp1); /* multiplication by 3 */
+
+    fp_mul3(&tmp1, &T[1], &T[3]);
+    fp_mul2(&tmp1, &A->x);
+
+    fp_sub3(&A->x, &tmp1, &tmp0);
+
+    /* Az := Az * T[3]^2 */
+    fp_sq1(&T[3]);
+    fp_mul2(&A->z, &T[3]);
+
+    /* X := X * Xim^2, Z := Z * Zim^2 */
+    fp_sq1(&Q.x);
+    fp_sq1(&Q.z);
+    fp_mul2(&P->x, &Q.x);
+    fp_mul2(&P->z, &Q.z);
+}
+
--- a/reference/csidh-20180427-by-castryck-et-al/mont.h
+++ b/reference/csidh-20180427-by-castryck-et-al/mont.h
@ -0,0 +1,19 @@
+#ifndef MONT_H
+#define MONT_H
+
+#include "u512.h"
+#include "fp.h"
+
+/* P^1 over fp. */
+typedef struct proj {
+    fp x;
+    fp z;
+} proj;
+
+void xDBL(proj *Q, proj const *A, proj const *P);
+void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ);
+void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A);
+void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k);
+void xISOG(proj *A, proj *P, proj const *K, uint64_t k);
+
+#endif
--- a/reference/csidh-20180427-by-castryck-et-al/rng.c
+++ b/reference/csidh-20180427-by-castryck-et-al/rng.c
@ -0,0 +1,18 @@
+
+#include "rng.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+void randombytes(void *x, size_t l)
+{
+    static int fd = -1;
+    ssize_t n;
+    if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY)))
+        exit(1);
+    for (size_t i = 0; i < l; i += n)
+        if (0 >= (n = read(fd, (char *) x + i, l - i)))
+            exit(2);
+}
+
--- a/reference/csidh-20180427-by-castryck-et-al/rng.h
+++ b/reference/csidh-20180427-by-castryck-et-al/rng.h
@ -0,0 +1,8 @@
+#ifndef RNG_H
+#define RNG_H
+
+#include <stdlib.h>
+
+void randombytes(void *x, size_t l);
+
+#endif
--- a/reference/csidh-20180427-by-castryck-et-al/supersingular.sage
+++ b/reference/csidh-20180427-by-castryck-et-al/supersingular.sage
@ -0,0 +1,128 @@
+#!/usr/bin/env sage
+#coding: utf8
+
+proof.all(False)
+
+
+# parameters.
+
+ls = list(primes(3, 374)) + [587] # Elkies primes
+#ls = list(primes(3, 47)) + [97] # (a smaller example)
+p = 4 * prod(ls) - 1
+assert is_prime(p)
+print "\nElkies primes:", " ".join(map(str, ls))
+
+max_exp = ceil((sqrt(p) ** (1/len(ls)) - 1) / 2)
+assert (2 * max_exp + 1) ** len(ls) >= sqrt(p)
+print "exponents are chosen in the range {}..{}.".format(-max_exp, max_exp)
+
+base = GF(p)(0) # Montgomery coefficient of starting curve
+
+
+# helper functions.
+
+# NB: all the operations can be computed entirely over the prime field,
+# but for simplicity of this implementation we will make use of curves
+# defined over GF(p^2).  note this slows everything down quite a bit.
+
+Fp2.<i> = GF(p**2, modulus = x**2 + 1)
+
+def montgomery_curve(A):
+    return EllipticCurve(Fp2, [0, A, 0, 1, 0])
+
+# sage's isogeny formulas return Weierstraß curves, hence we need this...
+def montgomery_coefficient(E):
+    Ew = E.change_ring(GF(p)).short_weierstrass_model()
+    _, _, _, a, b = Ew.a_invariants()
+    R.<z> = GF(p)[]
+    r = (z**3 + a*z + b).roots(multiplicities=False)[0]
+    s = sqrt(3 * r**2 + a)
+    if not is_square(s): s = -s
+    A = 3 * r / s
+    assert montgomery_curve(A).change_ring(GF(p)).is_isomorphic(Ew)
+    return GF(p)(A)
+
+
+# actual implementation.
+
+def private():
+    return [randrange(-max_exp, max_exp + 1) for _ in range(len(ls))]
+
+def validate(A):
+    while True:
+        k = 1
+        P = montgomery_curve(A).lift_x(GF(p).random_element())
+        for l in ls:
+            Q = (p + 1) // l * P
+            if not Q: continue
+            if l * Q: return False
+            k *= l
+            if k > 4 * sqrt(p): return True
+
+def action(pub, priv):
+
+    E = montgomery_curve(pub)
+    es = priv[:]
+
+    while any(es):
+
+        E._order = (p + 1)**2 # else sage computes this
+
+        P = E.lift_x(GF(p).random_element())
+        s = +1 if P.xy()[1] in GF(p) else -1
+        k = prod(l for l, e in zip(ls, es) if sign(e) == s)
+        P *= (p + 1) // k
+
+        for i, (l, e) in enumerate(zip(ls, es)):
+
+            if sign(e) != s: continue
+
+            Q = k // l * P
+            if not Q: continue
+            Q._order = l # else sage computes this
+            phi = E.isogeny(Q)
+
+            E, P = phi.codomain(), phi(P)
+            es[i] -= s
+            k //= l
+
+    return montgomery_coefficient(E)
+
+
+# example.
+
+print
+
+print "testing public-key validation on random ordinary curves (should be all 0s):\n   ",
+for _ in range(16):
+    while True:
+        A = GF(p).random_element()
+        if montgomery_curve(A).is_ordinary(): break
+    print int(validate(A)),
+print
+
+privA = private()
+print "\nAlice's private key:\n   ", " ".join(map('{:2d}'.format, privA))
+
+pubA = action(base, privA)
+print "\nAlice's public key:\n   ", pubA,
+print " (valid: {})".format(int(validate(pubA)))
+
+privB = private()
+print "\nBob's private key:\n   ", " ".join(map('{:2d}'.format, privB))
+
+pubB = action(base, privB)
+print "\nBob's public key:\n   ", pubB,
+print " (valid: {})".format(int(validate(pubB)))
+
+sharedA = action(pubB, privA)
+print "\nAlice's shared secret:\n   ", sharedA
+
+sharedB = action(pubA, privB)
+print "\nBob's shared secret:\n   ", sharedB
+
+if sharedA == sharedB:
+    print "\n--> equal!\n"
+else:
+    print "\n--> NOT EQUAL?!\n"
+
--- a/reference/csidh-20180427-by-castryck-et-al/u512.h
+++ b/reference/csidh-20180427-by-castryck-et-al/u512.h
@ -0,0 +1,22 @@
+#ifndef UINT_H
+#define UINT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef struct u512 {
+    uint64_t c[8];
+} u512;
+
+extern const u512 u512_1;
+
+void u512_set(u512 *x, uint64_t y);
+
+bool u512_bit(u512 const *x, uint64_t k);
+
+bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */
+bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */
+
+void u512_mul3_64(u512 *x, u512 const *y, uint64_t z);
+
+#endif
--- a/reference/csidh-20180427-by-castryck-et-al/u512.s
+++ b/reference/csidh-20180427-by-castryck-et-al/u512.s
@ -0,0 +1,102 @@
+
+.intel_syntax noprefix
+
+.section .rodata
+
+.global u512_1
+u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0
+
+
+.section .text
+
+.global u512_set
+u512_set:
+    cld
+    mov rax, rsi
+    stosq
+    xor rax, rax
+    mov rcx, 7
+    rep stosq
+    ret
+
+
+.global u512_bit
+u512_bit:
+    mov rcx, rsi
+    and rcx, 0x3f
+    shr rsi, 6
+    mov rax, [rdi + 8*rsi]
+    shr rax, cl
+    and rax, 1
+    ret
+
+
+.global u512_add3
+u512_add3:
+    mov rax, [rsi +  0]
+    add rax, [rdx +  0]
+    mov [rdi +  0], rax
+    .set k, 1
+    .rept 7
+        mov rax, [rsi + 8*k]
+        adc rax, [rdx + 8*k]
+        mov [rdi + 8*k], rax
+        .set k, k+1
+    .endr
+    setc al
+    movzx rax, al
+    ret
+
+.global u512_sub3
+u512_sub3:
+    mov rax, [rsi +  0]
+    sub rax, [rdx +  0]
+    mov [rdi +  0], rax
+    .set k, 1
+    .rept 7
+        mov rax, [rsi + 8*k]
+        sbb rax, [rdx + 8*k]
+        mov [rdi + 8*k], rax
+        .set k, k+1
+    .endr
+    setc al
+    movzx rax, al
+    ret
+
+
+.global u512_mul3_64
+u512_mul3_64:
+
+    mulx r10, rax, [rsi +  0]
+    mov [rdi +  0], rax
+
+    mulx r11, rax, [rsi +  8]
+    add  rax, r10
+    mov [rdi +  8], rax
+
+    mulx r10, rax, [rsi + 16]
+    adcx rax, r11
+    mov [rdi + 16], rax
+
+    mulx r11, rax, [rsi + 24]
+    adcx rax, r10
+    mov [rdi + 24], rax
+
+    mulx r10, rax, [rsi + 32]
+    adcx rax, r11
+    mov [rdi + 32],rax
+
+    mulx r11, rax, [rsi + 40]
+    adcx rax, r10
+    mov [rdi + 40],rax
+
+    mulx r10, rax, [rsi + 48]
+    adcx rax, r11
+    mov [rdi + 48],rax
+
+    mulx r11, rax, [rsi + 56]
+    adcx rax, r10
+    mov [rdi + 56],rax
+
+    ret
+