remove reference impl

2018-10-24 11:24:32 +01:00 · 2018-10-24 11:24:32 +01:00 · f2e85e88a3
commit f2e85e88a3
--- a/reference/csidh-20180427-by-castryck-et-al/Makefile
+++ b/reference/csidh-20180427-by-castryck-et-al/Makefile
@ -1,27 +0,0 @@
-all:
-	@cc \
-		-std=c99 -pedantic \
-		-Wall -Wextra \
-		-O2 -funroll-loops \
-		rng.c \
-		u512.s fp.s \
-		mont.c \
-		csidh.c \
-		main.c \
-		-o main
-
-debug:
-	cc \
-		-std=c99 -pedantic \
-		-Wall -Wextra \
-		-g \
-		rng.c \
-		u512.s fp.s \
-		mont.c \
-		csidh.c \
-		main.c \
-		-o main
-
-clean:
-	rm -f main
-
--- a/reference/csidh-20180427-by-castryck-et-al/bench.c
+++ b/reference/csidh-20180427-by-castryck-et-al/bench.c
@ -1,54 +0,0 @@
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <time.h>
-#include <assert.h>
-
-#include "u512.h"
-#include "fp.h"
-#include "mont.h"
-#include "csidh.h"
-
-#include <inttypes.h>
-
-static __inline__ uint64_t rdtsc(void)
-{
-    uint32_t hi, lo;
-    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
-    return lo | (uint64_t) hi << 32;
-}
-
-unsigned long its = 10000;
-
-int main()
-{
-    clock_t t0, t1, time = 0;
-    uint64_t c0, c1, cycles = 0;
-
-    private_key priv;
-    public_key pub = base;
-
-    for (unsigned long i = 0; i < its; ++i) {
-
-        csidh_private(&priv);
-
-        t0 = clock();
-        c0 = rdtsc();
-
-        /**************************************/
-        assert(validate(&pub));
-        action(&pub, &pub, &priv);
-        /**************************************/
-
-        c1 = rdtsc();
-        t1 = clock();
-        cycles += c1 - c0;
-        time += t1 - t0;
-    }
-
-    printf("iterations: %lu\n", its);
-    printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its);
-    printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its);
-}
-
--- a/reference/csidh-20180427-by-castryck-et-al/csidh.c
+++ b/reference/csidh-20180427-by-castryck-et-al/csidh.c
@ -1,220 +0,0 @@
-
-#include <string.h>
-#include <assert.h>
-
-#include "csidh.h"
-#include "rng.h"
-
-/* specific to p, should perhaps be somewhere else */
-const unsigned primes[num_primes] = {
-      3,   5,   7,  11,  13,  17,  19,  23,  29,  31,  37,  41,  43,  47,  53,  59,
-     61,  67,  71,  73,  79,  83,  89,  97, 101, 103, 107, 109, 113, 127, 131, 137,
-    139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227,
-    229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313,
-    317, 331, 337, 347, 349, 353, 359, 367, 373, 587,
-};
-
-const u512 four_sqrt_p = {{
-    0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964,
-}};
-
-
-const public_key base = {0}; /* A = 0 */
-
-void csidh_private(private_key *priv)
-{
-    memset(&priv->e, 0, sizeof(priv->e));
-    for (size_t i = 0; i < num_primes; ) {
-        int8_t buf[64];
-        randombytes(buf, sizeof(buf));
-        for (size_t j = 0; j < sizeof(buf); ++j) {
-            if (buf[j] <= max_exponent && buf[j] >= -max_exponent) {
-                priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4;
-                if (++i >= num_primes)
-                    break;
-            }
-        }
-    }
-}
-
-/* compute [(p+1)/l] P for all l in our list of primes. */
-/* divide and conquer is much faster than doing it naively,
- * but uses more memory. */
-static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper)
-{
-    assert(lower < upper);
-
-    if (upper - lower == 1)
-        return;
-
-    size_t mid = lower + (upper - lower + 1) / 2;
-
-    u512 cl = u512_1, cu = u512_1;
-    for (size_t i = lower; i < mid; ++i)
-        u512_mul3_64(&cu, &cu, primes[i]);
-    for (size_t i = mid; i < upper; ++i)
-        u512_mul3_64(&cl, &cl, primes[i]);
-
-    xMUL(&P[mid], A, &P[lower], &cu);
-    xMUL(&P[lower], A, &P[lower], &cl);
-
-    cofactor_multiples(P, A, lower, mid);
-    cofactor_multiples(P, A, mid, upper);
-}
-
-/* never accepts invalid keys. */
-bool validate(public_key const *in)
-{
-    const proj A = {in->A, fp_1};
-
-    do {
-
-        proj P[num_primes];
-        fp_random(&P->x);
-        P->z = fp_1;
-
-        /* maximal 2-power in p+1 */
-        xDBL(P, &A, P);
-        xDBL(P, &A, P);
-
-        cofactor_multiples(P, &A, 0, num_primes);
-
-        u512 order = u512_1;
-
-        for (size_t i = num_primes - 1; i < num_primes; --i) {
-
-            /* we only gain information if [(p+1)/l] P is non-zero */
-            if (memcmp(&P[i].z, &fp_0, sizeof(fp))) {
-
-                u512 tmp;
-                u512_set(&tmp, primes[i]);
-                xMUL(&P[i], &A, &P[i], &tmp);
-
-                if (memcmp(&P[i].z, &fp_0, sizeof(fp)))
-                    /* P does not have order dividing p+1. */
-                    return false;
-
-                u512_mul3_64(&order, &order, primes[i]);
-
-                if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */
-                    /* order > 4 sqrt(p), hence definitely supersingular */
-                    return true;
-            }
-        }
-
-    /* P didn't have big enough order to prove supersingularity. */
-    } while (1);
-}
-
-/* compute x^3 + Ax^2 + x */
-static void montgomery_rhs(fp *rhs, fp const *A, fp const *x)
-{
-    fp tmp;
-    *rhs = *x;
-    fp_sq1(rhs);
-    fp_mul3(&tmp, A, x);
-    fp_add2(rhs, &tmp);
-    fp_add2(rhs, &fp_1);
-    fp_mul2(rhs, x);
-}
-
-/* totally not constant-time. */
-void action(public_key *out, public_key const *in, private_key const *priv)
-{
-    u512 k[2];
-    u512_set(&k[0], 4); /* maximal 2-power in p+1 */
-    u512_set(&k[1], 4); /* maximal 2-power in p+1 */
-
-    uint8_t e[2][num_primes];
-
-    for (size_t i = 0; i < num_primes; ++i) {
-
-        int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4;
-
-        if (t > 0) {
-            e[0][i] = t;
-            e[1][i] = 0;
-            u512_mul3_64(&k[1], &k[1], primes[i]);
-        }
-        else if (t < 0) {
-            e[1][i] = -t;
-            e[0][i] = 0;
-            u512_mul3_64(&k[0], &k[0], primes[i]);
-        }
-        else {
-            e[0][i] = 0;
-            e[1][i] = 0;
-            u512_mul3_64(&k[0], &k[0], primes[i]);
-            u512_mul3_64(&k[1], &k[1], primes[i]);
-        }
-    }
-
-    proj A = {in->A, fp_1};
-
-    bool done[2] = {false, false};
-
-    do {
-
-        assert(!memcmp(&A.z, &fp_1, sizeof(fp)));
-
-        proj P;
-        fp_random(&P.x);
-        P.z = fp_1;
-
-        fp rhs;
-        montgomery_rhs(&rhs, &A.x, &P.x);
-        bool sign = !fp_issquare(&rhs);
-
-        if (done[sign])
-            continue;
-
-        xMUL(&P, &A, &P, &k[sign]);
-
-        done[sign] = true;
-
-        for (size_t i = 0; i < num_primes; ++i) {
-
-            if (e[sign][i]) {
-
-                u512 cof = u512_1;
-                for (size_t j = i + 1; j < num_primes; ++j)
-                    if (e[sign][j])
-                        u512_mul3_64(&cof, &cof, primes[j]);
-
-                proj K;
-                xMUL(&K, &A, &P, &cof);
-
-                if (memcmp(&K.z, &fp_0, sizeof(fp))) {
-
-                    xISOG(&A, &P, &K, primes[i]);
-
-                    if (!--e[sign][i])
-                        u512_mul3_64(&k[sign], &k[sign], primes[i]);
-
-                }
-
-            }
-
-            done[sign] &= !e[sign][i];
-        }
-
-        fp_inv(&A.z);
-        fp_mul2(&A.x, &A.z);
-        A.z = fp_1;
-
-    } while (!(done[0] && done[1]));
-
-    out->A = A.x;
-}
-
-/* includes public-key validation. */
-bool csidh(public_key *out, public_key const *in, private_key const *priv)
-{
-    if (!validate(in)) {
-        fp_random(&out->A);
-        return false;
-    }
-    action(out, in, priv);
-    return true;
-}
-
--- a/reference/csidh-20180427-by-castryck-et-al/csidh.h
+++ b/reference/csidh-20180427-by-castryck-et-al/csidh.h
@ -1,26 +0,0 @@
-#ifndef CSIDH_H
-#define CSIDH_H
-
-#include "u512.h"
-#include "fp.h"
-#include "mont.h"
-
-/* specific to p, should perhaps be somewhere else */
-#define num_primes 74
-#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */
-
-
-typedef struct private_key {
-    int8_t e[(num_primes + 1) / 2]; /* packed int4_t */
-} private_key;
-
-typedef struct public_key {
-    fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */
-} public_key;
-
-extern const public_key base;
-
-void csidh_private(private_key *priv);
-bool csidh(public_key *out, public_key const *in, private_key const *priv);
-
-#endif
--- a/reference/csidh-20180427-by-castryck-et-al/fp.h
+++ b/reference/csidh-20180427-by-castryck-et-al/fp.h
@ -1,37 +0,0 @@
-#ifndef FP_H
-#define FP_H
-
-#include "u512.h"
-
-/* fp is in the Montgomery domain, so interpreting that
-   as an integer should never make sense.
-   enable compiler warnings when mixing up u512 and fp. */
-typedef struct fp {
-    u512 x;
-} fp;
-
-extern const fp fp_0;
-extern const fp fp_1;
-
-void fp_set(fp *x, uint64_t y);
-void fp_cswap(fp *x, fp *y, bool c);
-
-void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */
-void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */
-
-void fp_add2(fp *x, fp const *y);
-void fp_sub2(fp *x, fp const *y);
-void fp_mul2(fp *x, fp const *y);
-
-void fp_add3(fp *x, fp const *y, fp const *z);
-void fp_sub3(fp *x, fp const *y, fp const *z);
-void fp_mul3(fp *x, fp const *y, fp const *z);
-
-void fp_sq1(fp *x);
-void fp_sq2(fp *x, fp const *y);
-void fp_inv(fp *x);
-bool fp_issquare(fp const *x);
-
-void fp_random(fp *x);
-
-#endif
--- a/reference/csidh-20180427-by-castryck-et-al/fp.s
+++ b/reference/csidh-20180427-by-castryck-et-al/fp.s
@ -1,452 +0,0 @@
-
-.intel_syntax noprefix
-
-.section .rodata
-
-.set pbits, 511
-p:
-    .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
-    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
-
-
-.global fp_0
-fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0
-
-.global fp_1
-fp_1: /* 2^512 mod p */
-    .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
-    .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80
-
-
-/* (2^512)^2 mod p */
-.r_squared_mod_p:
-    .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
-    .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e
-
-/* -p^-1 mod 2^64 */
-.inv_min_p_mod_r:
-    .quad 0x66c1301f632e294d
-
-
-.section .text
-
-.global fp_copy
-fp_copy:
-    cld
-    mov rcx, 8
-    rep movsq
-    ret
-
-.global fp_set
-fp_set:
-    push rdi
-    call u512_set
-    pop rdi
-    mov rsi, rdi
-    jmp fp_enc
-
-.global fp_cswap
-fp_cswap:
-    movzx rax, dl
-    neg rax
-    .set k, 0
-    .rept 8
-        mov rcx, [rdi + 8*k]
-        mov rdx, [rsi + 8*k]
-
-        mov r8, rcx
-        xor r8, rdx
-        and r8, rax
-
-        xor rcx, r8
-        xor rdx, r8
-
-        mov [rdi + 8*k], rcx
-        mov [rsi + 8*k], rdx
-
-        .set k, k+1
-    .endr
-    ret
-
-.reduce_once:
-    push rbp
-    mov rbp, rdi
-
-    mov rdi, [rbp +  0]
-    sub rdi, [rip + p +  0]
-    mov rsi, [rbp +  8]
-    sbb rsi, [rip + p +  8]
-    mov rdx, [rbp + 16]
-    sbb rdx, [rip + p + 16]
-    mov rcx, [rbp + 24]
-    sbb rcx, [rip + p + 24]
-    mov r8,  [rbp + 32]
-    sbb r8,  [rip + p + 32]
-    mov r9,  [rbp + 40]
-    sbb r9,  [rip + p + 40]
-    mov r10, [rbp + 48]
-    sbb r10, [rip + p + 48]
-    mov r11, [rbp + 56]
-    sbb r11, [rip + p + 56]
-
-    setnc al
-    movzx rax, al
-    neg rax
-
-.macro cswap2, r, m
-    xor \r, \m
-    and \r, rax
-    xor \m, \r
-.endm
-
-    cswap2 rdi, [rbp +  0]
-    cswap2 rsi, [rbp +  8]
-    cswap2 rdx, [rbp + 16]
-    cswap2 rcx, [rbp + 24]
-    cswap2 r8,  [rbp + 32]
-    cswap2 r9,  [rbp + 40]
-    cswap2 r10, [rbp + 48]
-    cswap2 r11, [rbp + 56]
-
-    pop rbp
-    ret
-
-.global fp_add3
-fp_add3:
-    push rdi
-    call u512_add3
-    pop rdi
-    jmp .reduce_once
-
-.global fp_add2
-fp_add2:
-    mov rdx, rdi
-    jmp fp_add3
-
-.global fp_sub3
-fp_sub3:
-    push rdi
-    call u512_sub3
-    pop rdi
-    xor rsi, rsi
-    xor rdx, rdx
-    xor rcx, rcx
-    xor r8, r8
-    xor r9, r9
-    xor r10, r10
-    xor r11, r11
-    test rax, rax
-    cmovnz rax, [rip + p +  0]
-    cmovnz rsi, [rip + p +  8]
-    cmovnz rdx, [rip + p + 16]
-    cmovnz rcx, [rip + p + 24]
-    cmovnz r8,  [rip + p + 32]
-    cmovnz r9,  [rip + p + 40]
-    cmovnz r10, [rip + p + 48]
-    cmovnz r11, [rip + p + 56]
-    add [rdi +  0], rax
-    adc [rdi +  8], rsi
-    adc [rdi + 16], rdx
-    adc [rdi + 24], rcx
-    adc [rdi + 32],  r8
-    adc [rdi + 40],  r9
-    adc [rdi + 48], r10
-    adc [rdi + 56], r11
-    ret
-
-.global fp_sub2
-fp_sub2:
-    mov rdx, rdi
-    xchg rsi, rdx
-    jmp fp_sub3
-
-
-/* Montgomery arithmetic */
-
-.global fp_enc
-fp_enc:
-    lea rdx, [rip + .r_squared_mod_p]
-    jmp fp_mul3
-
-.global fp_dec
-fp_dec:
-    lea rdx, [rip + u512_1]
-    jmp fp_mul3
-
-.global fp_mul3
-fp_mul3:
-    push rbp
-    push rbx
-    push r12
-    push r13
-    push r14
-    push r15
-
-    push rdi
-
-    mov rdi, rsi
-    mov rsi, rdx
-
-    xor r8,  r8
-    xor r9,  r9
-    xor r10, r10
-    xor r11, r11
-    xor r12, r12
-    xor r13, r13
-    xor r14, r14
-    xor r15, r15
-    xor rbp, rbp
-
-    /* flags are already cleared */
-
-.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8
-
-    mov rdx, [rsi +  0]
-    mulx rcx, rdx, [rdi + 8*\k]
-    add rdx, \r0
-    mulx rcx, rdx, [rip + .inv_min_p_mod_r]
-
-    xor rax, rax /* clear flags */
-
-    mulx rbx, rax, [rip + p +  0]
-    adox \r0, rax
-
-    mulx rcx, rax, [rip + p +  8]
-    adcx \r1, rbx
-    adox \r1, rax
-
-    mulx rbx, rax, [rip + p + 16]
-    adcx \r2, rcx
-    adox \r2, rax
-
-    mulx rcx, rax, [rip + p + 24]
-    adcx \r3, rbx
-    adox \r3, rax
-
-    mulx rbx, rax, [rip + p + 32]
-    adcx \r4, rcx
-    adox \r4, rax
-
-    mulx rcx, rax, [rip + p + 40]
-    adcx \r5, rbx
-    adox \r5, rax
-
-    mulx rbx, rax, [rip + p + 48]
-    adcx \r6, rcx
-    adox \r6, rax
-
-    mulx rcx, rax, [rip + p + 56]
-    adcx \r7, rbx
-    adox \r7, rax
-
-    mov rax, 0
-    adcx \r8, rcx
-    adox \r8, rax
-
-
-    mov rdx, [rdi + 8*\k]
-
-    xor rax, rax /* clear flags */
-
-    mulx rbx, rax, [rsi +  0]
-    adox \r0, rax
-
-    mulx rcx, rax, [rsi +  8]
-    adcx \r1, rbx
-    adox \r1, rax
-
-    mulx rbx, rax, [rsi + 16]
-    adcx \r2, rcx
-    adox \r2, rax
-
-    mulx rcx, rax, [rsi + 24]
-    adcx \r3, rbx
-    adox \r3, rax
-
-    mulx rbx, rax, [rsi + 32]
-    adcx \r4, rcx
-    adox \r4, rax
-
-    mulx rcx, rax, [rsi + 40]
-    adcx \r5, rbx
-    adox \r5, rax
-
-    mulx rbx, rax, [rsi + 48]
-    adcx \r6, rcx
-    adox \r6, rax
-
-    mulx rcx, rax, [rsi + 56]
-    adcx \r7, rbx
-    adox \r7, rax
-
-    mov rax, 0
-    adcx \r8, rcx
-    adox \r8, rax
-
-.endm
-
-    MULSTEP 0, r8,  r9,  r10, r11, r12, r13, r14, r15, rbp
-    MULSTEP 1, r9,  r10, r11, r12, r13, r14, r15, rbp, r8
-    MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8,  r9
-    MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8,  r9,  r10
-    MULSTEP 4, r12, r13, r14, r15, rbp, r8,  r9,  r10, r11
-    MULSTEP 5, r13, r14, r15, rbp, r8,  r9,  r10, r11, r12
-    MULSTEP 6, r14, r15, rbp, r8,  r9,  r10, r11, r12, r13
-    MULSTEP 7, r15, rbp, r8,  r9,  r10, r11, r12, r13, r14
-
-    pop rdi
-
-    mov [rdi +  0], rbp
-    mov [rdi +  8], r8
-    mov [rdi + 16], r9
-    mov [rdi + 24], r10
-    mov [rdi + 32], r11
-    mov [rdi + 40], r12
-    mov [rdi + 48], r13
-    mov [rdi + 56], r14
-
-    pop r15
-    pop r14
-    pop r13
-    pop r12
-    pop rbx
-    pop rbp
-    jmp .reduce_once
-
-.global fp_mul2
-fp_mul2:
-    mov rdx, rdi
-    jmp fp_mul3
-
-.global fp_sq2
-fp_sq2:
-    /* TODO implement optimized Montgomery squaring */
-    mov rdx, rsi
-    jmp fp_mul3
-
-.global fp_sq1
-fp_sq1:
-    mov rsi, rdi
-    jmp fp_sq2
-
-/* (obviously) not constant time in the exponent! */
-.fp_pow:
-    push rbx
-    mov rbx, rsi
-    push r12
-    push r13
-    push rdi
-    sub rsp, 64
-
-    mov rsi, rdi
-    mov rdi, rsp
-    call fp_copy
-
-    mov rdi, [rsp + 64]
-    lea rsi, [rip + fp_1]
-    call fp_copy
-
-.macro POWSTEP, k
-        mov r13, [rbx + 8*\k]
-        xor r12, r12
-
-        0:
-        test r13, 1
-        jz 1f
-
-        mov rdi, [rsp + 64]
-        mov rsi, rsp
-        call fp_mul2
-
-        1:
-        mov rdi, rsp
-        call fp_sq1
-
-        shr r13
-
-        inc r12
-        test r12, 64
-        jz 0b
-.endm
-
-    POWSTEP 0
-    POWSTEP 1
-    POWSTEP 2
-    POWSTEP 3
-    POWSTEP 4
-    POWSTEP 5
-    POWSTEP 6
-    POWSTEP 7
-
-    add rsp, 64+8
-    pop r13
-    pop r12
-    pop rbx
-    ret
-
-.section .rodata
-.p_minus_2:
-    .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
-    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
-
-.section .text
-
-/* TODO use a better addition chain? */
-.global fp_inv
-fp_inv:
-    lea rsi, [rip + .p_minus_2]
-    jmp .fp_pow
-
-.section .rodata
-.p_minus_1_halves:
-    .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83
-    .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df
-
-.section .text
-
-/* TODO use a better addition chain? */
-.global fp_issquare
-fp_issquare:
-    push rdi
-    lea rsi, [rip + .p_minus_1_halves]
-    call .fp_pow
-    pop rdi
-
-    xor rax, rax
-    .set k, 0
-    .rept 8
-        mov rsi, [rdi + 8*k]
-        xor rsi, [rip + fp_1 + 8*k]
-        or rax, rsi
-        .set k, k+1
-    .endr
-    test rax, rax
-    setz al
-    movzx rax, al
-    ret
-
-
-/* not constant time (but this shouldn't leak anything of importance) */
-.global fp_random
-fp_random:
-
-    push rdi
-    mov rsi, 64
-    call randombytes
-    pop rdi
-    mov rax, 1
-    shl rax, (pbits % 64)
-    dec rax
-    and [rdi + 56], rax
-
-    .set k, 7
-    .rept 8
-        mov rax, [rip + p + 8*k]
-        cmp [rdi + 8*k], rax
-        jge fp_random
-        jl 0f
-        .set k, k-1
-    .endr
-    0:
-    ret
-
--- a/reference/csidh-20180427-by-castryck-et-al/main
+++ b/reference/csidh-20180427-by-castryck-et-al/main
--- a/reference/csidh-20180427-by-castryck-et-al/main.c
+++ b/reference/csidh-20180427-by-castryck-et-al/main.c
@ -1,99 +0,0 @@
-
-#include <stdlib.h>
-#include <stdio.h>
-#include <string.h>
-#include <unistd.h>
-#include <time.h>
-#include <assert.h>
-
-#include "u512.h"
-#include "fp.h"
-#include "mont.h"
-#include "csidh.h"
-
-void u512_print(u512 const *x)
-{
-    for (size_t i = 63; i < 64; --i)
-        printf("%02hhx", i[(unsigned char *) x->c]);
-}
-
-void fp_print(fp const *x)
-{
-    u512 y;
-    fp_dec(&y, x);
-    u512_print(&y);
-}
-
-int main()
-{
-    clock_t t0, t1;
-
-    private_key priv_alice, priv_bob;
-    public_key pub_alice, pub_bob;
-    public_key shared_alice, shared_bob;
-
-    printf("\n");
-
-
-    t0 = clock();
-    csidh_private(&priv_alice);
-    t1 = clock();
-
-    printf("Alice's private key   (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
-    for (size_t i = 0; i < sizeof(priv_alice); ++i)
-        printf("%02hhx", i[(uint8_t *) &priv_alice]);
-    printf("\n\n");
-
-    t0 = clock();
-    csidh_private(&priv_bob);
-    t1 = clock();
-
-    printf("Bob's private key     (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
-    for (size_t i = 0; i < sizeof(priv_bob); ++i)
-        printf("%02hhx", i[(uint8_t *) &priv_bob]);
-    printf("\n\n");
-
-
-    t0 = clock();
-    assert(csidh(&pub_alice, &base, &priv_alice));
-    t1 = clock();
-
-    printf("Alice's public key    (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
-    fp_print(&pub_alice.A);
-    printf("\n\n");
-
-    t0 = clock();
-    assert(csidh(&pub_bob, &base, &priv_bob));
-    t1 = clock();
-
-    printf("Bob's public key      (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
-    fp_print(&pub_bob.A);
-    printf("\n\n");
-
-
-    t0 = clock();
-    assert(csidh(&shared_alice, &pub_bob, &priv_alice));
-    t1 = clock();
-
-    printf("Alice's shared secret (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
-    fp_print(&shared_alice.A);
-    printf("\n\n");
-
-    t0 = clock();
-    assert(csidh(&shared_bob, &pub_alice, &priv_bob));
-    t1 = clock();
-
-    printf("Bob's shared secret   (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
-    fp_print(&shared_bob.A);
-    printf("\n\n");
-
-    printf("    ");
-    if (memcmp(&shared_alice, &shared_bob, sizeof(public_key)))
-        printf("\x1b[31mNOT EQUAL!\x1b[0m\n");
-    else
-        printf("\x1b[32mequal.\x1b[0m\n");
-    printf("\n");
-
-    printf("\n");
-}
-
--- a/reference/csidh-20180427-by-castryck-et-al/mont.c
+++ b/reference/csidh-20180427-by-castryck-et-al/mont.c
@ -1,188 +0,0 @@
-
-#include <assert.h>
-
-#include "mont.h"
-
-void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24)
-{
-    fp tmp0, tmp1, tmp2;
-
-    fp_add3(&tmp0, &P->x, &P->z);
-    fp_sub3(&tmp1, &P->x, &P->z);
-    fp_sq2(&R->x, &tmp0);
-    fp_sub3(&tmp2, &Q->x, &Q->z);
-    fp_add3(&S->x, &Q->x, &Q->z);
-    fp_mul2(&tmp0, &tmp2);
-    fp_sq2(&R->z, &tmp1);
-    fp_mul2(&tmp1, &S->x);
-    fp_sub3(&tmp2, &R->x, &R->z);
-    fp_mul2(&R->z, &A24->z);
-    fp_mul2(&R->x, &R->z);
-    fp_mul3(&S->x, &A24->x, &tmp2);
-    fp_sub3(&S->z, &tmp0, &tmp1);
-    fp_add2(&R->z, &S->x);
-    fp_add3(&S->x, &tmp0, &tmp1);
-    fp_mul2(&R->z, &tmp2);
-    fp_sq1(&S->z);
-    fp_sq1(&S->x);
-    fp_mul2(&S->z, &PQ->x);
-    fp_mul2(&S->x, &PQ->z);
-}
-
-void xDBL(proj *Q, proj const *A, proj const *P)
-{
-    fp a, b, c;
-    fp_add3(&a, &P->x, &P->z);
-    fp_sq1(&a);
-    fp_sub3(&b, &P->x, &P->z);
-    fp_sq1(&b);
-    fp_sub3(&c, &a, &b);
-    fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */
-    fp_mul2(&b, &A->z);
-    fp_mul3(&Q->x, &a, &b);
-    fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */
-    fp_add2(&a, &A->x);
-    fp_mul2(&a, &c);
-    fp_add2(&a, &b);
-    fp_mul3(&Q->z, &a, &c);
-}
-
-void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ)
-{
-    fp a, b, c, d;
-    fp_add3(&a, &P->x, &P->z);
-    fp_sub3(&b, &P->x, &P->z);
-    fp_add3(&c, &Q->x, &Q->z);
-    fp_sub3(&d, &Q->x, &Q->z);
-    fp_mul2(&a, &d);
-    fp_mul2(&b, &c);
-    fp_add3(&c, &a, &b);
-    fp_sub3(&d, &a, &b);
-    fp_sq1(&c);
-    fp_sq1(&d);
-    fp_mul3(&S->x, &PQ->z, &c);
-    fp_mul3(&S->z, &PQ->x, &d);
-}
-
-/* Montgomery ladder. */
-/* P must not be the unique point of order 2. */
-/* not constant-time! */
-void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k)
-{
-    proj R = *P;
-    proj A24;
-    const proj Pcopy = *P; /* in case Q = P */
-
-    Q->x = fp_1;
-    Q->z = fp_0;
-
-    fp_add3(&A24.x, &A->z, &A->z);
-    fp_add3(&A24.z, &A24.x, &A24.x);
-    fp_add2(&A24.x, &A->x);
-
-    unsigned long i = 512;
-    while (--i && !u512_bit(k, i));
-
-    do {
-
-        bool bit = u512_bit(k, i);
-
-        if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
-        //fp_cswap(&Q->x, &R.x, bit);
-        //fp_cswap(&Q->z, &R.z, bit);
-
-        xDBLADD(Q, &R, Q, &R, &Pcopy, &A24);
-
-        if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
-        //fp_cswap(&Q->x, &R.x, bit);
-        //fp_cswap(&Q->z, &R.z, bit);
-
-    } while (i--);
-}
-
-
-/* computes the isogeny with kernel point K of order k */
-/* returns the new curve coefficient A and the image of P */
-/* (obviously) not constant time in k */
-void xISOG(proj *A, proj *P, proj const *K, uint64_t k)
-{
-    assert (k >= 3);
-    assert (k % 2 == 1);
-
-    fp tmp0, tmp1;
-    fp T[4] = {K->z, K->x, K->x, K->z};
-    proj Q;
-
-    fp_mul3(&Q.x,  &P->x, &K->x);
-    fp_mul3(&tmp0, &P->z, &K->z);
-    fp_sub2(&Q.x,  &tmp0);
-
-    fp_mul3(&Q.z,  &P->x, &K->z);
-    fp_mul3(&tmp0, &P->z, &K->x);
-    fp_sub2(&Q.z,  &tmp0);
-
-    proj M[3] = {*K};
-    xDBL(&M[1], A, K);
-
-    for (uint64_t i = 1; i < k / 2; ++i) {
-
-        if (i >= 2)
-            xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]);
-
-        fp_mul3(&tmp0, &M[i % 3].x, &T[0]);
-        fp_mul3(&tmp1, &M[i % 3].z, &T[1]);
-        fp_add3(&T[0], &tmp0, &tmp1);
-
-        fp_mul2(&T[1], &M[i % 3].x);
-
-        fp_mul3(&tmp0, &M[i % 3].z, &T[2]);
-        fp_mul3(&tmp1, &M[i % 3].x, &T[3]);
-        fp_add3(&T[2], &tmp0, &tmp1);
-
-        fp_mul2(&T[3], &M[i % 3].z);
-
-
-        fp_mul3(&tmp0, &P->x, &M[i % 3].x);
-        fp_mul3(&tmp1, &P->z, &M[i % 3].z);
-        fp_sub2(&tmp0, &tmp1);
-        fp_mul2(&Q.x,  &tmp0);
-
-        fp_mul3(&tmp0, &P->x, &M[i % 3].z);
-        fp_mul3(&tmp1, &P->z, &M[i % 3].x);
-        fp_sub2(&tmp0, &tmp1);
-        fp_mul2(&Q.z,  &tmp0);
-    }
-
-    fp_mul2(&T[0], &T[1]);
-    fp_add2(&T[0], &T[0]); /* multiplication by 2 */
-
-    fp_sq1(&T[1]);
-
-    fp_mul2(&T[2], &T[3]);
-    fp_add2(&T[2], &T[2]); /* multiplication by 2 */
-
-    fp_sq1(&T[3]);
-
-    /* Ax := T[1] * T[3] * Ax - 3 * Az * (T[1] * T[2] - T[0] * T[3]) */
-    fp_mul3(&tmp0, &T[1], &T[2]);
-    fp_mul3(&tmp1, &T[0], &T[3]);
-    fp_sub2(&tmp0, &tmp1);
-    fp_mul2(&tmp0, &A->z);
-    fp_add3(&tmp1, &tmp0, &tmp0); fp_add2(&tmp0, &tmp1); /* multiplication by 3 */
-
-    fp_mul3(&tmp1, &T[1], &T[3]);
-    fp_mul2(&tmp1, &A->x);
-
-    fp_sub3(&A->x, &tmp1, &tmp0);
-
-    /* Az := Az * T[3]^2 */
-    fp_sq1(&T[3]);
-    fp_mul2(&A->z, &T[3]);
-
-    /* X := X * Xim^2, Z := Z * Zim^2 */
-    fp_sq1(&Q.x);
-    fp_sq1(&Q.z);
-    fp_mul2(&P->x, &Q.x);
-    fp_mul2(&P->z, &Q.z);
-}
-
--- a/reference/csidh-20180427-by-castryck-et-al/mont.h
+++ b/reference/csidh-20180427-by-castryck-et-al/mont.h
@ -1,19 +0,0 @@
-#ifndef MONT_H
-#define MONT_H
-
-#include "u512.h"
-#include "fp.h"
-
-/* P^1 over fp. */
-typedef struct proj {
-    fp x;
-    fp z;
-} proj;
-
-void xDBL(proj *Q, proj const *A, proj const *P);
-void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ);
-void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A);
-void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k);
-void xISOG(proj *A, proj *P, proj const *K, uint64_t k);
-
-#endif
--- a/reference/csidh-20180427-by-castryck-et-al/rng.c
+++ b/reference/csidh-20180427-by-castryck-et-al/rng.c
@ -1,18 +0,0 @@
-
-#include "rng.h"
-
-#include <stdlib.h>
-#include <unistd.h>
-#include <fcntl.h>
-
-void randombytes(void *x, size_t l)
-{
-    static int fd = -1;
-    ssize_t n;
-    if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY)))
-        exit(1);
-    for (size_t i = 0; i < l; i += n)
-        if (0 >= (n = read(fd, (char *) x + i, l - i)))
-            exit(2);
-}
-
--- a/reference/csidh-20180427-by-castryck-et-al/rng.h
+++ b/reference/csidh-20180427-by-castryck-et-al/rng.h
@ -1,8 +0,0 @@
-#ifndef RNG_H
-#define RNG_H
-
-#include <stdlib.h>
-
-void randombytes(void *x, size_t l);
-
-#endif
--- a/reference/csidh-20180427-by-castryck-et-al/supersingular.sage
+++ b/reference/csidh-20180427-by-castryck-et-al/supersingular.sage
@ -1,128 +0,0 @@
-#!/usr/bin/env sage
-#coding: utf8
-
-proof.all(False)
-
-
-# parameters.
-
-ls = list(primes(3, 374)) + [587] # Elkies primes
-#ls = list(primes(3, 47)) + [97] # (a smaller example)
-p = 4 * prod(ls) - 1
-assert is_prime(p)
-print "\nElkies primes:", " ".join(map(str, ls))
-
-max_exp = ceil((sqrt(p) ** (1/len(ls)) - 1) / 2)
-assert (2 * max_exp + 1) ** len(ls) >= sqrt(p)
-print "exponents are chosen in the range {}..{}.".format(-max_exp, max_exp)
-
-base = GF(p)(0) # Montgomery coefficient of starting curve
-
-
-# helper functions.
-
-# NB: all the operations can be computed entirely over the prime field,
-# but for simplicity of this implementation we will make use of curves
-# defined over GF(p^2).  note this slows everything down quite a bit.
-
-Fp2.<i> = GF(p**2, modulus = x**2 + 1)
-
-def montgomery_curve(A):
-    return EllipticCurve(Fp2, [0, A, 0, 1, 0])
-
-# sage's isogeny formulas return Weierstraß curves, hence we need this...
-def montgomery_coefficient(E):
-    Ew = E.change_ring(GF(p)).short_weierstrass_model()
-    _, _, _, a, b = Ew.a_invariants()
-    R.<z> = GF(p)[]
-    r = (z**3 + a*z + b).roots(multiplicities=False)[0]
-    s = sqrt(3 * r**2 + a)
-    if not is_square(s): s = -s
-    A = 3 * r / s
-    assert montgomery_curve(A).change_ring(GF(p)).is_isomorphic(Ew)
-    return GF(p)(A)
-
-
-# actual implementation.
-
-def private():
-    return [randrange(-max_exp, max_exp + 1) for _ in range(len(ls))]
-
-def validate(A):
-    while True:
-        k = 1
-        P = montgomery_curve(A).lift_x(GF(p).random_element())
-        for l in ls:
-            Q = (p + 1) // l * P
-            if not Q: continue
-            if l * Q: return False
-            k *= l
-            if k > 4 * sqrt(p): return True
-
-def action(pub, priv):
-
-    E = montgomery_curve(pub)
-    es = priv[:]
-
-    while any(es):
-
-        E._order = (p + 1)**2 # else sage computes this
-
-        P = E.lift_x(GF(p).random_element())
-        s = +1 if P.xy()[1] in GF(p) else -1
-        k = prod(l for l, e in zip(ls, es) if sign(e) == s)
-        P *= (p + 1) // k
-
-        for i, (l, e) in enumerate(zip(ls, es)):
-
-            if sign(e) != s: continue
-
-            Q = k // l * P
-            if not Q: continue
-            Q._order = l # else sage computes this
-            phi = E.isogeny(Q)
-
-            E, P = phi.codomain(), phi(P)
-            es[i] -= s
-            k //= l
-
-    return montgomery_coefficient(E)
-
-
-# example.
-
-print
-
-print "testing public-key validation on random ordinary curves (should be all 0s):\n   ",
-for _ in range(16):
-    while True:
-        A = GF(p).random_element()
-        if montgomery_curve(A).is_ordinary(): break
-    print int(validate(A)),
-print
-
-privA = private()
-print "\nAlice's private key:\n   ", " ".join(map('{:2d}'.format, privA))
-
-pubA = action(base, privA)
-print "\nAlice's public key:\n   ", pubA,
-print " (valid: {})".format(int(validate(pubA)))
-
-privB = private()
-print "\nBob's private key:\n   ", " ".join(map('{:2d}'.format, privB))
-
-pubB = action(base, privB)
-print "\nBob's public key:\n   ", pubB,
-print " (valid: {})".format(int(validate(pubB)))
-
-sharedA = action(pubB, privA)
-print "\nAlice's shared secret:\n   ", sharedA
-
-sharedB = action(pubA, privB)
-print "\nBob's shared secret:\n   ", sharedB
-
-if sharedA == sharedB:
-    print "\n--> equal!\n"
-else:
-    print "\n--> NOT EQUAL?!\n"
-
--- a/reference/csidh-20180427-by-castryck-et-al/u512.h
+++ b/reference/csidh-20180427-by-castryck-et-al/u512.h
@ -1,22 +0,0 @@
-#ifndef UINT_H
-#define UINT_H
-
-#include <stdbool.h>
-#include <stdint.h>
-
-typedef struct u512 {
-    uint64_t c[8];
-} u512;
-
-extern const u512 u512_1;
-
-void u512_set(u512 *x, uint64_t y);
-
-bool u512_bit(u512 const *x, uint64_t k);
-
-bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */
-bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */
-
-void u512_mul3_64(u512 *x, u512 const *y, uint64_t z);
-
-#endif
--- a/reference/csidh-20180427-by-castryck-et-al/u512.s
+++ b/reference/csidh-20180427-by-castryck-et-al/u512.s
@ -1,102 +0,0 @@
-
-.intel_syntax noprefix
-
-.section .rodata
-
-.global u512_1
-u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0
-
-
-.section .text
-
-.global u512_set
-u512_set:
-    cld
-    mov rax, rsi
-    stosq
-    xor rax, rax
-    mov rcx, 7
-    rep stosq
-    ret
-
-
-.global u512_bit
-u512_bit:
-    mov rcx, rsi
-    and rcx, 0x3f
-    shr rsi, 6
-    mov rax, [rdi + 8*rsi]
-    shr rax, cl
-    and rax, 1
-    ret
-
-
-.global u512_add3
-u512_add3:
-    mov rax, [rsi +  0]
-    add rax, [rdx +  0]
-    mov [rdi +  0], rax
-    .set k, 1
-    .rept 7
-        mov rax, [rsi + 8*k]
-        adc rax, [rdx + 8*k]
-        mov [rdi + 8*k], rax
-        .set k, k+1
-    .endr
-    setc al
-    movzx rax, al
-    ret
-
-.global u512_sub3
-u512_sub3:
-    mov rax, [rsi +  0]
-    sub rax, [rdx +  0]
-    mov [rdi +  0], rax
-    .set k, 1
-    .rept 7
-        mov rax, [rsi + 8*k]
-        sbb rax, [rdx + 8*k]
-        mov [rdi + 8*k], rax
-        .set k, k+1
-    .endr
-    setc al
-    movzx rax, al
-    ret
-
-
-.global u512_mul3_64
-u512_mul3_64:
-
-    mulx r10, rax, [rsi +  0]
-    mov [rdi +  0], rax
-
-    mulx r11, rax, [rsi +  8]
-    add  rax, r10
-    mov [rdi +  8], rax
-
-    mulx r10, rax, [rsi + 16]
-    adcx rax, r11
-    mov [rdi + 16], rax
-
-    mulx r11, rax, [rsi + 24]
-    adcx rax, r10
-    mov [rdi + 24], rax
-
-    mulx r10, rax, [rsi + 32]
-    adcx rax, r11
-    mov [rdi + 32],rax
-
-    mulx r11, rax, [rsi + 40]
-    adcx rax, r10
-    mov [rdi + 40],rax
-
-    mulx r10, rax, [rsi + 48]
-    adcx rax, r11
-    mov [rdi + 48],rax
-
-    mulx r11, rax, [rsi + 56]
-    adcx rax, r10
-    mov [rdi + 56],rax
-
-    ret
-