commit c00c37e0118993aa4e1dcf544d15c7ebb055b0f4
Author: Michael Meyer <michael.meyer@hs-rm.de>
Date:   Thu Aug 23 13:49:45 2018 +0200

    c-implementation

diff --git a/Makefile b/Makefile
new file mode 100644
index 0000000..1077519
--- /dev/null
+++ b/Makefile
@@ -0,0 +1,25 @@
+all:
+	@gcc \
+		-Wall -Wextra \
+		-O0 -funroll-loops \
+		rng.c \
+		u512.s fp.s \
+		mont.c \
+		csidh.c \
+		main.c \
+		-o main
+
+debug:
+	gcc \
+		-Wall -Wextra \
+		-g \
+		rng.c \
+		u512.s fp.s \
+		mont.c \
+		csidh.c \
+		main.c \
+		-o main
+
+clean:
+	rm -f main
+
diff --git a/bench.c b/bench.c
new file mode 100644
index 0000000..fd0f64a
--- /dev/null
+++ b/bench.c
@@ -0,0 +1,54 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <time.h>
+#include <assert.h>
+
+#include "u512.h"
+#include "fp.h"
+#include "mont.h"
+#include "csidh.h"
+
+#include <inttypes.h>
+
+static __inline__ uint64_t rdtsc(void)
+{
+    uint32_t hi, lo;
+    __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
+    return lo | (uint64_t) hi << 32;
+}
+
+unsigned long its = 10000;
+
+int main()
+{
+    clock_t t0, t1, time = 0;
+    uint64_t c0, c1, cycles = 0;
+
+    private_key priv;
+    public_key pub = base;
+
+    for (unsigned long i = 0; i < its; ++i) {
+
+        csidh_private(&priv);
+
+        t0 = clock();
+        c0 = rdtsc();
+
+        /**************************************/
+        assert(validate(&pub));
+        action(&pub, &pub, &priv);
+        /**************************************/
+
+        c1 = rdtsc();
+        t1 = clock();
+        cycles += c1 - c0;
+        time += t1 - t0;
+    }
+
+    printf("iterations: %lu\n", its);
+    printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its);
+    printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its);
+}
+
diff --git a/csidh.c b/csidh.c
new file mode 100644
index 0000000..25d897f
--- /dev/null
+++ b/csidh.c
@@ -0,0 +1,220 @@
+
+#include <string.h>
+#include <assert.h>
+
+#include "csidh.h"
+#include "rng.h"
+
+/* specific to p, should perhaps be somewhere else */
+const unsigned primes[num_primes] = {
+      3,   5,   7,  11,  13,  17,  19,  23,  29,  31,  37,  41,  43,  47,  53,  59,
+     61,  67,  71,  73,  79,  83,  89,  97, 101, 103, 107, 109, 113, 127, 131, 137,
+    139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227,
+    229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313,
+    317, 331, 337, 347, 349, 353, 359, 367, 373, 587,
+};
+
+const u512 four_sqrt_p = {{
+    0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964,
+}};
+
+
+const public_key base = {0}; /* A = 0 */
+
+void csidh_private(private_key *priv)
+{
+    memset(&priv->e, 0, sizeof(priv->e));
+    for (size_t i = 0; i < num_primes; ) {
+        int8_t buf[64];
+        randombytes(buf, sizeof(buf));
+        for (size_t j = 0; j < sizeof(buf); ++j) {
+            if (buf[j] <= max_exponent && buf[j] >= -max_exponent) {
+                priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4;
+                if (++i >= num_primes)
+                    break;
+            }
+        }
+    }
+}
+
+/* compute [(p+1)/l] P for all l in our list of primes. */
+/* divide and conquer is much faster than doing it naively,
+ * but uses more memory. */
+static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper)
+{
+    assert(lower < upper);
+
+    if (upper - lower == 1)
+        return;
+
+    size_t mid = lower + (upper - lower + 1) / 2;
+
+    u512 cl = u512_1, cu = u512_1;
+    for (size_t i = lower; i < mid; ++i)
+        u512_mul3_64(&cu, &cu, primes[i]);
+    for (size_t i = mid; i < upper; ++i)
+        u512_mul3_64(&cl, &cl, primes[i]);
+
+    xMUL(&P[mid], A, &P[lower], &cu);
+    xMUL(&P[lower], A, &P[lower], &cl);
+
+    cofactor_multiples(P, A, lower, mid);
+    cofactor_multiples(P, A, mid, upper);
+}
+
+/* never accepts invalid keys. */
+bool validate(public_key const *in)
+{
+    const proj A = {in->A, fp_1};
+
+    do {
+
+        proj P[num_primes];
+        fp_random(&P->x);
+        P->z = fp_1;
+
+        /* maximal 2-power in p+1 */
+        xDBL(P, &A, P);
+        xDBL(P, &A, P);
+
+        cofactor_multiples(P, &A, 0, num_primes);
+
+        u512 order = u512_1;
+
+        for (size_t i = num_primes - 1; i < num_primes; --i) {
+
+            /* we only gain information if [(p+1)/l] P is non-zero */
+            if (memcmp(&P[i].z, &fp_0, sizeof(fp))) {
+
+                u512 tmp;
+                u512_set(&tmp, primes[i]);
+                xMUL(&P[i], &A, &P[i], &tmp);
+
+                if (memcmp(&P[i].z, &fp_0, sizeof(fp)))
+                    /* P does not have order dividing p+1. */
+                    return false;
+
+                u512_mul3_64(&order, &order, primes[i]);
+
+                if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */
+                    /* order > 4 sqrt(p), hence definitely supersingular */
+                    return true;
+            }
+        }
+
+    /* P didn't have big enough order to prove supersingularity. */
+    } while (1);
+}
+
+/* compute x^3 + Ax^2 + x */
+static void montgomery_rhs(fp *rhs, fp const *A, fp const *x)
+{
+    fp tmp;
+    *rhs = *x;
+    fp_sq1(rhs);
+    fp_mul3(&tmp, A, x);
+    fp_add2(rhs, &tmp);
+    fp_add2(rhs, &fp_1);
+    fp_mul2(rhs, x);
+}
+
+/* totally not constant-time. */
+void action(public_key *out, public_key const *in, private_key const *priv)
+{
+    u512 k[2];
+    u512_set(&k[0], 4); /* maximal 2-power in p+1 */
+    u512_set(&k[1], 4); /* maximal 2-power in p+1 */
+
+    uint8_t e[2][num_primes];
+
+    for (size_t i = 0; i < num_primes; ++i) {
+
+        int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4;
+
+        if (t > 0) {
+            e[0][i] = t;
+            e[1][i] = 0;
+            u512_mul3_64(&k[1], &k[1], primes[i]);
+        }
+        else if (t < 0) {
+            e[1][i] = -t;
+            e[0][i] = 0;
+            u512_mul3_64(&k[0], &k[0], primes[i]);
+        }
+        else {
+            e[0][i] = 0;
+            e[1][i] = 0;
+            u512_mul3_64(&k[0], &k[0], primes[i]);
+            u512_mul3_64(&k[1], &k[1], primes[i]);
+        }
+    }
+
+    proj A = {in->A, fp_1};
+
+    bool done[2] = {false, false};
+
+    do {
+
+        assert(!memcmp(&A.z, &fp_1, sizeof(fp)));
+
+        proj P;
+        fp_random(&P.x);
+        P.z = fp_1;
+
+        fp rhs;
+        montgomery_rhs(&rhs, &A.x, &P.x);
+        bool sign = !fp_issquare(&rhs);
+
+        if (done[sign])
+            continue;
+
+        xMUL(&P, &A, &P, &k[sign]);
+
+        done[sign] = true;
+
+        for (size_t i = num_primes-1; i < num_primes; --i) {  //changed loop direction
+
+            if (e[sign][i]) {
+
+                u512 cof = u512_1;
+                for (size_t j = i - 1; j < num_primes; --j)   //changed loop direction
+                    if (e[sign][j])
+                        u512_mul3_64(&cof, &cof, primes[j]);
+
+                proj K;
+                xMUL(&K, &A, &P, &cof);
+
+                if (memcmp(&K.z, &fp_0, sizeof(fp))) {
+
+                    xISOG(&A, &P, &K, primes[i]);
+
+                    if (!--e[sign][i])
+                        u512_mul3_64(&k[sign], &k[sign], primes[i]);
+
+                }
+
+            }
+
+            done[sign] &= !e[sign][i];
+        }
+
+        fp_inv(&A.z);
+        fp_mul2(&A.x, &A.z);
+        A.z = fp_1;
+
+    } while (!(done[0] && done[1]));
+
+    out->A = A.x;
+}
+
+/* includes public-key validation. */
+bool csidh(public_key *out, public_key const *in, private_key const *priv)
+{
+    if (!validate(in)) {
+        fp_random(&out->A);
+        return false;
+    }
+    action(out, in, priv);
+    return true;
+}
+
diff --git a/csidh.h b/csidh.h
new file mode 100644
index 0000000..52e9d54
--- /dev/null
+++ b/csidh.h
@@ -0,0 +1,26 @@
+#ifndef CSIDH_H
+#define CSIDH_H
+
+#include "u512.h"
+#include "fp.h"
+#include "mont.h"
+
+/* specific to p, should perhaps be somewhere else */
+#define num_primes 74
+#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */
+
+
+typedef struct private_key {
+    int8_t e[(num_primes + 1) / 2]; /* packed int4_t */
+} private_key;
+
+typedef struct public_key {
+    fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */
+} public_key;
+
+extern const public_key base;
+
+void csidh_private(private_key *priv);
+bool csidh(public_key *out, public_key const *in, private_key const *priv);
+
+#endif
diff --git a/cycle.h b/cycle.h
new file mode 100644
index 0000000..2652a04
--- /dev/null
+++ b/cycle.h
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2003, 2007-8 Matteo Frigo
+ * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining
+ * a copy of this software and associated documentation files (the
+ * "Software"), to deal in the Software without restriction, including
+ * without limitation the rights to use, copy, modify, merge, publish,
+ * distribute, sublicense, and/or sell copies of the Software, and to
+ * permit persons to whom the Software is furnished to do so, subject to
+ * the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be
+ * included in all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+ * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+ * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+ * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
+ *
+ */
+
+
+/* machine-dependent cycle counters code. Needs to be inlined. */
+
+/***************************************************************************/
+/* To use the cycle counters in your code, simply #include "cycle.h" (this
+   file), and then use the functions/macros:
+
+                 ticks getticks(void);
+
+   ticks is an opaque typedef defined below, representing the current time.
+   You extract the elapsed time between two calls to gettick() via:
+
+                 double elapsed(ticks t1, ticks t0);
+
+   which returns a double-precision variable in arbitrary units.  You
+   are not expected to convert this into human units like seconds; it
+   is intended only for *comparisons* of time intervals.
+
+   (In order to use some of the OS-dependent timer routines like
+   Solaris' gethrtime, you need to paste the autoconf snippet below
+   into your configure.ac file and #include "config.h" before cycle.h,
+   or define the relevant macros manually if you are not using autoconf.)
+*/
+
+/***************************************************************************/
+/* This file uses macros like HAVE_GETHRTIME that are assumed to be
+   defined according to whether the corresponding function/type/header
+   is available on your system.  The necessary macros are most
+   conveniently defined if you are using GNU autoconf, via the tests:
+   
+   dnl ---------------------------------------------------------------------
+
+   AC_C_INLINE
+   AC_HEADER_TIME
+   AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
+
+   AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
+#include <sys/time.h>
+#endif])
+
+   AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
+
+   dnl Cray UNICOS _rtc() (real-time clock) intrinsic
+   AC_MSG_CHECKING([for _rtc intrinsic])
+   rtc_ok=yes
+   AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
+#include <intrinsics.h>
+#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
+   AC_MSG_RESULT($rtc_ok)
+
+   dnl ---------------------------------------------------------------------
+*/
+
+/***************************************************************************/
+
+#if TIME_WITH_SYS_TIME
+# include <sys/time.h>
+# include <time.h>
+#else
+# if HAVE_SYS_TIME_H
+#  include <sys/time.h>
+# else
+#  include <time.h>
+# endif
+#endif
+
+#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
+{									  \
+     return (double)t1 - (double)t0;					  \
+}
+
+/*----------------------------------------------------------------*/
+/* Solaris */
+#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
+typedef hrtime_t ticks;
+
+#define getticks gethrtime
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* AIX v. 4+ routines to read the real-time clock or time-base register */
+#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
+typedef timebasestruct_t ticks;
+
+static __inline ticks getticks(void)
+{
+     ticks t;
+     read_real_time(&t, TIMEBASE_SZ);
+     return t;
+}
+
+static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
+{
+     time_base_to_time(&t1, TIMEBASE_SZ);
+     time_base_to_time(&t0, TIMEBASE_SZ);
+     return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + 
+	     ((double)t1.tb_low - (double)t0.tb_low));
+}
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PowerPC ``cycle'' counter using the time base register.
+ */
+#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__))))  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned int tbl, tbu0, tbu1;
+
+     do {
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
+	  __asm__ __volatile__ ("mftb %0" : "=r"(tbl));
+	  __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
+     } while (tbu0 != tbu1);
+
+     return (((unsigned long long)tbu0) << 32) | tbl;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
+   from Carbon, requires no additional libraries to be linked). */
+#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
+#include <mach/mach_time.h>
+typedef uint64_t ticks;
+#define getticks mach_absolute_time
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * Pentium cycle counter 
+ */
+#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("rdtsc": "=A" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/* Visual C++ -- thanks to Morten Nissov for his help with this */
+#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
+#include <windows.h>
+typedef LARGE_INTEGER ticks;
+#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
+
+static __inline ticks getticks(void)
+{
+     ticks retval;
+
+     __asm {
+	  RDTSC
+	  mov retval.HighPart, edx
+	  mov retval.LowPart, eax
+     }
+     return retval;
+}
+
+static __inline double elapsed(ticks t1, ticks t0)
+{  
+     return (double)t1.QuadPart - (double)t0.QuadPart;
+}  
+
+#define HAVE_TICK_COUNTER
+#define TIME_MIN 5000.0   /* unreliable pentium IV cycle counter */
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * X86-64 cycle counter
+ */
+#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__)  && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned a, d; 
+     asm volatile("rdtsc" : "=a" (a), "=d" (d)); 
+     return ((ticks)a) | (((ticks)d) << 32); 
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
+   NOTE: this code will fail to link unless you use the -Masmkeyword compiler
+   option (grrr). */
+#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) 
+typedef unsigned long long ticks;
+static ticks getticks(void)
+{
+    asm(" rdtsc; shl    $0x20,%rdx; mov    %eax,%eax; or     %rdx,%rax;    ");
+}
+INLINE_ELAPSED(__inline__)
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Visual C++, courtesy of Dirk Michaelis */
+#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
+
+#include <intrin.h>
+#pragma intrinsic(__rdtsc)
+typedef unsigned __int64 ticks;
+#define getticks __rdtsc
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * IA64 cycle counter
+ */
+
+/* intel's icc/ecc compiler */
+#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+#include <ia64intrin.h>
+
+static __inline__ ticks getticks(void)
+{
+     return __getReg(_IA64_REG_AR_ITC);
+}
+ 
+INLINE_ELAPSED(__inline__)
+ 
+#define HAVE_TICK_COUNTER
+#endif
+
+/* gcc */
+#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
+#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
+#include <machine/sys/inline.h>
+typedef unsigned long ticks;
+
+static inline ticks getticks(void)
+{
+     ticks ret;
+
+     ret = _Asm_mov_from_ar (_AREG_ITC);
+     return ret;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/* Microsoft Visual C++ */
+#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned __int64 ticks;
+
+#  ifdef __cplusplus
+extern "C"
+#  endif
+ticks __getReg(int whichReg);
+#pragma intrinsic(__getReg)
+
+static __inline ticks getticks(void)
+{
+     volatile ticks temp;
+     temp = __getReg(3116);
+     return temp;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/*
+ * PA-RISC cycle counter 
+ */
+#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+#  ifdef __GNUC__
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+
+     __asm__ __volatile__("mfctl 16, %0": "=r" (ret));
+     /* no input, nothing else clobbered */
+     return ret;
+}
+#  else
+#  include <machine/inline.h>
+static inline unsigned long getticks(void)
+{
+     register ticks ret;
+     _MFCTL(16, ret);
+     return ret;
+}
+#  endif
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* S390, courtesy of James Treacy */
+#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks cycles;
+     __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
+     return cycles;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
+/*
+ * The 32-bit cycle counter on alpha overflows pretty quickly, 
+ * unfortunately.  A 1GHz machine overflows in 4 seconds.
+ */
+typedef unsigned int ticks;
+
+static __inline__ ticks getticks(void)
+{
+     unsigned long cc;
+     __asm__ __volatile__ ("rpcc %0" : "=r"(cc));
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
+typedef unsigned long ticks;
+
+static __inline__ ticks getticks(void)
+{
+     ticks ret;
+     __asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
+     return ret;
+}
+
+INLINE_ELAPSED(__inline__)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
+#  include <c_asm.h>
+typedef unsigned int ticks;
+
+static __inline ticks getticks(void)
+{
+     unsigned long cc;
+     cc = asm("rpcc %v0");
+     return (cc & 0xFFFFFFFF);
+}
+
+INLINE_ELAPSED(__inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+/*----------------------------------------------------------------*/
+/* SGI/Irix */
+#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
+typedef struct timespec ticks;
+
+static inline ticks getticks(void)
+{
+     struct timespec t;
+     clock_gettime(CLOCK_SGI_CYCLE, &t);
+     return t;
+}
+
+static inline double elapsed(ticks t1, ticks t0)
+{
+     return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
+	  ((double)t1.tv_nsec - (double)t0.tv_nsec);
+}
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* Cray UNICOS _rtc() intrinsic function */
+#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
+#ifdef HAVE_INTRINSICS_H
+#  include <intrinsics.h>
+#endif
+
+typedef long long ticks;
+
+#define getticks _rtc
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+
+/*----------------------------------------------------------------*/
+/* MIPS ZBus */
+#if HAVE_MIPS_ZBUS_TIMER
+#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
+#include <sys/mman.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+typedef uint64_t ticks;
+
+static inline ticks getticks(void)
+{
+  static uint64_t* addr = 0;
+
+  if (addr == 0)
+  {
+    uint32_t rq_addr = 0x10030000;
+    int fd;
+    int pgsize;
+
+    pgsize = getpagesize();
+    fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
+    if (fd < 0) {
+      perror("open");
+      return NULL;
+    }
+    addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
+    close(fd);
+    if (addr == (uint64_t *)-1) {
+      perror("mmap");
+      return NULL;
+    }
+  }
+
+  return *addr;
+}
+
+INLINE_ELAPSED(inline)
+
+#define HAVE_TICK_COUNTER
+#endif
+#endif /* HAVE_MIPS_ZBUS_TIMER */
+
diff --git a/fp.h b/fp.h
new file mode 100644
index 0000000..6cfa064
--- /dev/null
+++ b/fp.h
@@ -0,0 +1,37 @@
+#ifndef FP_H
+#define FP_H
+
+#include "u512.h"
+
+/* fp is in the Montgomery domain, so interpreting that
+   as an integer should never make sense.
+   enable compiler warnings when mixing up u512 and fp. */
+typedef struct fp {
+    u512 x;
+} fp;
+
+extern const fp fp_0;
+extern const fp fp_1;
+
+void fp_set(fp *x, uint64_t y);
+void fp_cswap(fp *x, fp *y, bool c);
+
+void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */
+void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */
+
+void fp_add2(fp *x, fp const *y);
+void fp_sub2(fp *x, fp const *y);
+void fp_mul2(fp *x, fp const *y);
+
+void fp_add3(fp *x, fp const *y, fp const *z);
+void fp_sub3(fp *x, fp const *y, fp const *z);
+void fp_mul3(fp *x, fp const *y, fp const *z);
+
+void fp_sq1(fp *x);
+void fp_sq2(fp *x, fp const *y);
+void fp_inv(fp *x);
+bool fp_issquare(fp const *x);
+
+void fp_random(fp *x);
+
+#endif
diff --git a/fp.s b/fp.s
new file mode 100644
index 0000000..ac65d94
--- /dev/null
+++ b/fp.s
@@ -0,0 +1,452 @@
+
+.intel_syntax noprefix
+
+.section .rodata
+
+.set pbits, 511
+p:
+    .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
+    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
+
+
+.global fp_0
+fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0
+
+.global fp_1
+fp_1: /* 2^512 mod p */
+    .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
+    .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80
+
+
+/* (2^512)^2 mod p */
+.r_squared_mod_p:
+    .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
+    .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e
+
+/* -p^-1 mod 2^64 */
+.inv_min_p_mod_r:
+    .quad 0x66c1301f632e294d
+
+
+.section .text
+
+.global fp_copy
+fp_copy:
+    cld
+    mov rcx, 8
+    rep movsq
+    ret
+
+.global fp_set
+fp_set:
+    push rdi
+    call u512_set
+    pop rdi
+    mov rsi, rdi
+    jmp fp_enc
+
+.global fp_cswap
+fp_cswap:
+    movzx rax, dl
+    neg rax
+    .set k, 0
+    .rept 8
+        mov rcx, [rdi + 8*k]
+        mov rdx, [rsi + 8*k]
+
+        mov r8, rcx
+        xor r8, rdx
+        and r8, rax
+
+        xor rcx, r8
+        xor rdx, r8
+
+        mov [rdi + 8*k], rcx
+        mov [rsi + 8*k], rdx
+
+        .set k, k+1
+    .endr
+    ret
+
+.reduce_once:
+    push rbp
+    mov rbp, rdi
+
+    mov rdi, [rbp +  0]
+    sub rdi, [rip + p +  0]
+    mov rsi, [rbp +  8]
+    sbb rsi, [rip + p +  8]
+    mov rdx, [rbp + 16]
+    sbb rdx, [rip + p + 16]
+    mov rcx, [rbp + 24]
+    sbb rcx, [rip + p + 24]
+    mov r8,  [rbp + 32]
+    sbb r8,  [rip + p + 32]
+    mov r9,  [rbp + 40]
+    sbb r9,  [rip + p + 40]
+    mov r10, [rbp + 48]
+    sbb r10, [rip + p + 48]
+    mov r11, [rbp + 56]
+    sbb r11, [rip + p + 56]
+
+    setnc al
+    movzx rax, al
+    neg rax
+
+.macro cswap2, r, m
+    xor \r, \m
+    and \r, rax
+    xor \m, \r
+.endm
+
+    cswap2 rdi, [rbp +  0]
+    cswap2 rsi, [rbp +  8]
+    cswap2 rdx, [rbp + 16]
+    cswap2 rcx, [rbp + 24]
+    cswap2 r8,  [rbp + 32]
+    cswap2 r9,  [rbp + 40]
+    cswap2 r10, [rbp + 48]
+    cswap2 r11, [rbp + 56]
+
+    pop rbp
+    ret
+
+.global fp_add3
+fp_add3:
+    push rdi
+    call u512_add3
+    pop rdi
+    jmp .reduce_once
+
+.global fp_add2
+fp_add2:
+    mov rdx, rdi
+    jmp fp_add3
+
+.global fp_sub3
+fp_sub3:
+    push rdi
+    call u512_sub3
+    pop rdi
+    xor rsi, rsi
+    xor rdx, rdx
+    xor rcx, rcx
+    xor r8, r8
+    xor r9, r9
+    xor r10, r10
+    xor r11, r11
+    test rax, rax
+    cmovnz rax, [rip + p +  0]
+    cmovnz rsi, [rip + p +  8]
+    cmovnz rdx, [rip + p + 16]
+    cmovnz rcx, [rip + p + 24]
+    cmovnz r8,  [rip + p + 32]
+    cmovnz r9,  [rip + p + 40]
+    cmovnz r10, [rip + p + 48]
+    cmovnz r11, [rip + p + 56]
+    add [rdi +  0], rax
+    adc [rdi +  8], rsi
+    adc [rdi + 16], rdx
+    adc [rdi + 24], rcx
+    adc [rdi + 32],  r8
+    adc [rdi + 40],  r9
+    adc [rdi + 48], r10
+    adc [rdi + 56], r11
+    ret
+
+.global fp_sub2
+fp_sub2:
+    mov rdx, rdi
+    xchg rsi, rdx
+    jmp fp_sub3
+
+
+/* Montgomery arithmetic */
+
+.global fp_enc
+fp_enc:
+    lea rdx, [rip + .r_squared_mod_p]
+    jmp fp_mul3
+
+.global fp_dec
+fp_dec:
+    lea rdx, [rip + u512_1]
+    jmp fp_mul3
+
+.global fp_mul3
+fp_mul3:
+    push rbp
+    push rbx
+    push r12
+    push r13
+    push r14
+    push r15
+
+    push rdi
+
+    mov rdi, rsi
+    mov rsi, rdx
+
+    xor r8,  r8
+    xor r9,  r9
+    xor r10, r10
+    xor r11, r11
+    xor r12, r12
+    xor r13, r13
+    xor r14, r14
+    xor r15, r15
+    xor rbp, rbp
+
+    /* flags are already cleared */
+
+.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8
+
+    mov rdx, [rsi +  0]
+    mulx rcx, rdx, [rdi + 8*\k]
+    add rdx, \r0
+    mulx rcx, rdx, [rip + .inv_min_p_mod_r]
+
+    xor rax, rax /* clear flags */
+
+    mulx rbx, rax, [rip + p +  0]
+    adox \r0, rax
+
+    mulx rcx, rax, [rip + p +  8]
+    adcx \r1, rbx
+    adox \r1, rax
+
+    mulx rbx, rax, [rip + p + 16]
+    adcx \r2, rcx
+    adox \r2, rax
+
+    mulx rcx, rax, [rip + p + 24]
+    adcx \r3, rbx
+    adox \r3, rax
+
+    mulx rbx, rax, [rip + p + 32]
+    adcx \r4, rcx
+    adox \r4, rax
+
+    mulx rcx, rax, [rip + p + 40]
+    adcx \r5, rbx
+    adox \r5, rax
+
+    mulx rbx, rax, [rip + p + 48]
+    adcx \r6, rcx
+    adox \r6, rax
+
+    mulx rcx, rax, [rip + p + 56]
+    adcx \r7, rbx
+    adox \r7, rax
+
+    mov rax, 0
+    adcx \r8, rcx
+    adox \r8, rax
+
+
+    mov rdx, [rdi + 8*\k]
+
+    xor rax, rax /* clear flags */
+
+    mulx rbx, rax, [rsi +  0]
+    adox \r0, rax
+
+    mulx rcx, rax, [rsi +  8]
+    adcx \r1, rbx
+    adox \r1, rax
+
+    mulx rbx, rax, [rsi + 16]
+    adcx \r2, rcx
+    adox \r2, rax
+
+    mulx rcx, rax, [rsi + 24]
+    adcx \r3, rbx
+    adox \r3, rax
+
+    mulx rbx, rax, [rsi + 32]
+    adcx \r4, rcx
+    adox \r4, rax
+
+    mulx rcx, rax, [rsi + 40]
+    adcx \r5, rbx
+    adox \r5, rax
+
+    mulx rbx, rax, [rsi + 48]
+    adcx \r6, rcx
+    adox \r6, rax
+
+    mulx rcx, rax, [rsi + 56]
+    adcx \r7, rbx
+    adox \r7, rax
+
+    mov rax, 0
+    adcx \r8, rcx
+    adox \r8, rax
+
+.endm
+
+    MULSTEP 0, r8,  r9,  r10, r11, r12, r13, r14, r15, rbp
+    MULSTEP 1, r9,  r10, r11, r12, r13, r14, r15, rbp, r8
+    MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8,  r9
+    MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8,  r9,  r10
+    MULSTEP 4, r12, r13, r14, r15, rbp, r8,  r9,  r10, r11
+    MULSTEP 5, r13, r14, r15, rbp, r8,  r9,  r10, r11, r12
+    MULSTEP 6, r14, r15, rbp, r8,  r9,  r10, r11, r12, r13
+    MULSTEP 7, r15, rbp, r8,  r9,  r10, r11, r12, r13, r14
+
+    pop rdi
+
+    mov [rdi +  0], rbp
+    mov [rdi +  8], r8
+    mov [rdi + 16], r9
+    mov [rdi + 24], r10
+    mov [rdi + 32], r11
+    mov [rdi + 40], r12
+    mov [rdi + 48], r13
+    mov [rdi + 56], r14
+
+    pop r15
+    pop r14
+    pop r13
+    pop r12
+    pop rbx
+    pop rbp
+    jmp .reduce_once
+
+.global fp_mul2
+fp_mul2:
+    mov rdx, rdi
+    jmp fp_mul3
+
+.global fp_sq2
+fp_sq2:
+    /* TODO implement optimized Montgomery squaring */
+    mov rdx, rsi
+    jmp fp_mul3
+
+.global fp_sq1
+fp_sq1:
+    mov rsi, rdi
+    jmp fp_sq2
+
+/* (obviously) not constant time in the exponent! */
+.fp_pow:
+    push rbx
+    mov rbx, rsi
+    push r12
+    push r13
+    push rdi
+    sub rsp, 64
+
+    mov rsi, rdi
+    mov rdi, rsp
+    call fp_copy
+
+    mov rdi, [rsp + 64]
+    lea rsi, [rip + fp_1]
+    call fp_copy
+
+.macro POWSTEP, k
+        mov r13, [rbx + 8*\k]
+        xor r12, r12
+
+        0:
+        test r13, 1
+        jz 1f
+
+        mov rdi, [rsp + 64]
+        mov rsi, rsp
+        call fp_mul2
+
+        1:
+        mov rdi, rsp
+        call fp_sq1
+
+        shr r13
+
+        inc r12
+        test r12, 64
+        jz 0b
+.endm
+
+    POWSTEP 0
+    POWSTEP 1
+    POWSTEP 2
+    POWSTEP 3
+    POWSTEP 4
+    POWSTEP 5
+    POWSTEP 6
+    POWSTEP 7
+
+    add rsp, 64+8
+    pop r13
+    pop r12
+    pop rbx
+    ret
+
+.section .rodata
+.p_minus_2:
+    .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
+    .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
+
+.section .text
+
+/* TODO use a better addition chain? */
+.global fp_inv
+fp_inv:
+    lea rsi, [rip + .p_minus_2]
+    jmp .fp_pow
+
+.section .rodata
+.p_minus_1_halves:
+    .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83
+    .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df
+
+.section .text
+
+/* TODO use a better addition chain? */
+.global fp_issquare
+fp_issquare:
+    push rdi
+    lea rsi, [rip + .p_minus_1_halves]
+    call .fp_pow
+    pop rdi
+
+    xor rax, rax
+    .set k, 0
+    .rept 8
+        mov rsi, [rdi + 8*k]
+        xor rsi, [rip + fp_1 + 8*k]
+        or rax, rsi
+        .set k, k+1
+    .endr
+    test rax, rax
+    setz al
+    movzx rax, al
+    ret
+
+
+/* not constant time (but this shouldn't leak anything of importance) */
+.global fp_random
+fp_random:
+
+    push rdi
+    mov rsi, 64
+    call randombytes
+    pop rdi
+    mov rax, 1
+    shl rax, (pbits % 64)
+    dec rax
+    and [rdi + 56], rax
+
+    .set k, 7
+    .rept 8
+        mov rax, [rip + p + 8*k]
+        cmp [rdi + 8*k], rax
+        jge fp_random
+        jl 0f
+        .set k, k-1
+    .endr
+    0:
+    ret
+
diff --git a/main.c b/main.c
new file mode 100644
index 0000000..6649bfc
--- /dev/null
+++ b/main.c
@@ -0,0 +1,99 @@
+
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <time.h>
+#include <assert.h>
+
+#include "u512.h"
+#include "fp.h"
+#include "mont.h"
+#include "csidh.h"
+
+void u512_print(u512 const *x)
+{
+    for (size_t i = 63; i < 64; --i)
+        printf("%02hhx", i[(unsigned char *) x->c]);
+}
+
+void fp_print(fp const *x)
+{
+    u512 y;
+    fp_dec(&y, x);
+    u512_print(&y);
+}
+
+int main()
+{
+    clock_t t0, t1;
+
+    private_key priv_alice, priv_bob;
+    public_key pub_alice, pub_bob;
+    public_key shared_alice, shared_bob;
+
+    printf("\n");
+
+
+    t0 = clock();
+    csidh_private(&priv_alice);
+    t1 = clock();
+
+    printf("Alice's private key   (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    for (size_t i = 0; i < sizeof(priv_alice); ++i)
+        printf("%02hhx", i[(uint8_t *) &priv_alice]);
+    printf("\n\n");
+
+    t0 = clock();
+    csidh_private(&priv_bob);
+    t1 = clock();
+
+    printf("Bob's private key     (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    for (size_t i = 0; i < sizeof(priv_bob); ++i)
+        printf("%02hhx", i[(uint8_t *) &priv_bob]);
+    printf("\n\n");
+
+
+    t0 = clock();
+    assert(csidh(&pub_alice, &base, &priv_alice));
+    t1 = clock();
+
+    printf("Alice's public key    (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&pub_alice.A);
+    printf("\n\n");
+
+    t0 = clock();
+    assert(csidh(&pub_bob, &base, &priv_bob));
+    t1 = clock();
+
+    printf("Bob's public key      (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&pub_bob.A);
+    printf("\n\n");
+
+
+    t0 = clock();
+    assert(csidh(&shared_alice, &pub_bob, &priv_alice));
+    t1 = clock();
+
+    printf("Alice's shared secret (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&shared_alice.A);
+    printf("\n\n");
+
+    t0 = clock();
+    assert(csidh(&shared_bob, &pub_alice, &priv_bob));
+    t1 = clock();
+
+    printf("Bob's shared secret   (%7.3lf ms):\n  ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
+    fp_print(&shared_bob.A);
+    printf("\n\n");
+
+    printf("    ");
+    if (memcmp(&shared_alice, &shared_bob, sizeof(public_key)))
+        printf("\x1b[31mNOT EQUAL!\x1b[0m\n");
+    else
+        printf("\x1b[32mequal.\x1b[0m\n");
+    printf("\n");
+
+    printf("\n");
+}
+
diff --git a/mont.c b/mont.c
new file mode 100644
index 0000000..6950969
--- /dev/null
+++ b/mont.c
@@ -0,0 +1,203 @@
+
+#include <assert.h>
+
+#include "mont.h"
+#include "u512.h"
+
+void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24)
+{
+    fp tmp0, tmp1, tmp2;        //requires precomputation of A24=(A+2C:4C)
+
+    fp_add3(&tmp0, &P->x, &P->z);
+    fp_sub3(&tmp1, &P->x, &P->z);
+    fp_sq2(&R->x, &tmp0);
+    fp_sub3(&tmp2, &Q->x, &Q->z);
+    fp_add3(&S->x, &Q->x, &Q->z);
+    fp_mul2(&tmp0, &tmp2);
+    fp_sq2(&R->z, &tmp1);
+    fp_mul2(&tmp1, &S->x);
+    fp_sub3(&tmp2, &R->x, &R->z);
+    fp_mul2(&R->z, &A24->z);
+    fp_mul2(&R->x, &R->z);
+    fp_mul3(&S->x, &A24->x, &tmp2);
+    fp_sub3(&S->z, &tmp0, &tmp1);
+    fp_add2(&R->z, &S->x);
+    fp_add3(&S->x, &tmp0, &tmp1);
+    fp_mul2(&R->z, &tmp2);
+    fp_sq1(&S->z);
+    fp_sq1(&S->x);
+    fp_mul2(&S->z, &PQ->x);
+    fp_mul2(&S->x, &PQ->z);
+}
+
+void xDBL(proj *Q, proj const *A, proj const *P)
+{
+    fp a, b, c;
+    fp_add3(&a, &P->x, &P->z);
+    fp_sq1(&a);
+    fp_sub3(&b, &P->x, &P->z);
+    fp_sq1(&b);
+    fp_sub3(&c, &a, &b);
+    fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */
+    fp_mul2(&b, &A->z);
+    fp_mul3(&Q->x, &a, &b);
+    fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */
+    fp_add2(&a, &A->x);
+    fp_mul2(&a, &c);
+    fp_add2(&a, &b);
+    fp_mul3(&Q->z, &a, &c);
+}
+
+void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ)
+{
+    fp a, b, c, d;
+    fp_add3(&a, &P->x, &P->z);
+    fp_sub3(&b, &P->x, &P->z);
+    fp_add3(&c, &Q->x, &Q->z);
+    fp_sub3(&d, &Q->x, &Q->z);
+    fp_mul2(&a, &d);
+    fp_mul2(&b, &c);
+    fp_add3(&c, &a, &b);
+    fp_sub3(&d, &a, &b);
+    fp_sq1(&c);
+    fp_sq1(&d);
+    fp_mul3(&S->x, &PQ->z, &c);
+    fp_mul3(&S->z, &PQ->x, &d);
+}
+
+/* Montgomery ladder. */
+/* P must not be the unique point of order 2. */
+/* not constant-time! */
+void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k)
+{
+    proj R = *P;
+    proj A24;
+    const proj Pcopy = *P; /* in case Q = P */
+
+    Q->x = fp_1;
+    Q->z = fp_0;
+
+    fp_add3(&A24.x, &A->z, &A->z);    //precomputation of A24=(A+2C:4C)
+    fp_add3(&A24.z, &A24.x, &A24.x);
+    fp_add2(&A24.x, &A->x);
+
+    unsigned long i = 512;
+    while (--i && !u512_bit(k, i));
+
+    do {
+
+        bool bit = u512_bit(k, i);
+
+        if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
+        //fp_cswap(&Q->x, &R.x, bit);
+        //fp_cswap(&Q->z, &R.z, bit);
+
+        xDBLADD(Q, &R, Q, &R, &Pcopy, &A24);
+
+        if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
+        //fp_cswap(&Q->x, &R.x, bit);
+        //fp_cswap(&Q->z, &R.z, bit);
+
+    } while (i--);
+}
+
+//simultaneous square-and-multiply, computes x^exp and y^exp 
+void exp_by_squaring_(fp* x, fp* y, uint64_t exp)
+{
+	fp result1, result2;
+	fp_set(&result1, 1);
+	fp_set(&result2, 1);
+
+    while (exp)
+    {
+        if (exp & 1){
+          fp_mul2(&result1, x);
+          fp_mul2(&result2, y);
+	}
+	
+        fp_sq1(x);
+	fp_sq1(y);
+        exp >>= 1;
+    }
+
+    fp_cswap(&result1, x, 1);
+    fp_cswap(&result2, y, 1);
+
+}
+
+
+/* computes the isogeny with kernel point K of order k */
+/* returns the new curve coefficient A and the image of P */
+/* (obviously) not constant time in k */
+void xISOG(proj *A, proj *P, proj const *K, uint64_t k)
+{
+    assert (k >= 3);
+    assert (k % 2 == 1);
+
+    fp tmp0, tmp1, tmp2, Psum, Pdif;
+    proj Q, Aed, prod;
+
+    fp_add3(&Aed.z, &A->z, &A->z);  //compute twisted Edwards curve coefficients
+    fp_add3(&Aed.x, &A->x, &Aed.z);
+    fp_sub3(&Aed.z, &A->x, &Aed.z);
+   
+    fp_add3(&Psum, &P->x, &P->z);   //precomputations
+    fp_sub3(&Pdif, &P->x, &P->z);
+
+    fp_sub3(&prod.x, &K->x, &K->z);
+    fp_add3(&prod.z, &K->x, &K->z);
+    
+    fp_mul3(&tmp1, &prod.x, &Psum);
+    fp_mul3(&tmp0, &prod.z, &Pdif);
+    fp_add3(&Q.x, &tmp0, &tmp1);
+    fp_sub3(&Q.z, &tmp0, &tmp1);
+
+    proj M[3] = {*K};
+    xDBL(&M[1], A, K);
+
+    for (uint64_t i = 1; i < k / 2; ++i) {
+
+        if (i >= 2)
+            xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]);
+
+	fp_sub3(&tmp1, &M[i % 3].x, &M[i % 3].z);
+    	fp_add3(&tmp0, &M[i % 3].x, &M[i % 3].z);
+	fp_mul2(&prod.x, &tmp1);
+        fp_mul2(&prod.z, &tmp0);
+    	fp_mul2(&tmp1, &Psum);
+    	fp_mul2(&tmp0, &Pdif);
+    	fp_add3(&tmp2, &tmp0, &tmp1);
+	fp_mul2(&Q.x, &tmp2);
+    	fp_sub3(&tmp2, &tmp0, &tmp1);
+	fp_mul2(&Q.z, &tmp2);
+
+    }
+
+
+    // point evaluation
+    fp_sq1(&Q.x);
+    fp_sq1(&Q.z);
+    fp_mul2(&P->x, &Q.x);
+    fp_mul2(&P->z, &Q.z);
+
+    //compute Aed.x^k, Aed.z^k
+    exp_by_squaring_(&Aed.x, &Aed.z, k);
+
+    //compute prod.x^8, prod.z^8
+    fp_sq1(&prod.x);
+    fp_sq1(&prod.x);
+    fp_sq1(&prod.x);
+    fp_sq1(&prod.z);
+    fp_sq1(&prod.z);
+    fp_sq1(&prod.z);
+
+    //compute image curve parameters
+    fp_mul2(&Aed.z, &prod.x);
+    fp_mul2(&Aed.x, &prod.z);
+
+    //compute Montgomery params
+    fp_add3(&A->x, &Aed.x, &Aed.z);
+    fp_sub3(&A->z, &Aed.x, &Aed.z);
+    fp_add2(&A->x, &A->x);
+}
+
diff --git a/mont.h b/mont.h
new file mode 100644
index 0000000..36b3ad7
--- /dev/null
+++ b/mont.h
@@ -0,0 +1,19 @@
+#ifndef MONT_H
+#define MONT_H
+
+#include "u512.h"
+#include "fp.h"
+
+/* P^1 over fp. */
+typedef struct proj {
+    fp x;
+    fp z;
+} proj;
+
+void xDBL(proj *Q, proj const *A, proj const *P);
+void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ);
+void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A);
+void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k);
+void xISOG(proj *A, proj *P, proj const *K, uint64_t k);
+
+#endif
diff --git a/rng.c b/rng.c
new file mode 100644
index 0000000..fc28c87
--- /dev/null
+++ b/rng.c
@@ -0,0 +1,18 @@
+
+#include "rng.h"
+
+#include <stdlib.h>
+#include <unistd.h>
+#include <fcntl.h>
+
+void randombytes(void *x, size_t l)
+{
+    static int fd = -1;
+    ssize_t n;
+    if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY)))
+        exit(1);
+    for (size_t i = 0; i < l; i += n)
+        if (0 >= (n = read(fd, (char *) x + i, l - i)))
+            exit(2);
+}
+
diff --git a/rng.h b/rng.h
new file mode 100644
index 0000000..30e01d0
--- /dev/null
+++ b/rng.h
@@ -0,0 +1,8 @@
+#ifndef RNG_H
+#define RNG_H
+
+#include <stdlib.h>
+
+void randombytes(void *x, size_t l);
+
+#endif
diff --git a/u512.h b/u512.h
new file mode 100644
index 0000000..6eaae93
--- /dev/null
+++ b/u512.h
@@ -0,0 +1,22 @@
+#ifndef UINT_H
+#define UINT_H
+
+#include <stdbool.h>
+#include <stdint.h>
+
+typedef struct u512 {
+    uint64_t c[8];
+} u512;
+
+extern const u512 u512_1;
+
+void u512_set(u512 *x, uint64_t y);
+
+bool u512_bit(u512 const *x, uint64_t k);
+
+bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */
+bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */
+
+void u512_mul3_64(u512 *x, u512 const *y, uint64_t z);
+
+#endif
diff --git a/u512.s b/u512.s
new file mode 100644
index 0000000..779be3e
--- /dev/null
+++ b/u512.s
@@ -0,0 +1,102 @@
+
+.intel_syntax noprefix
+
+.section .rodata
+
+.global u512_1
+u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0
+
+
+.section .text
+
+.global u512_set
+u512_set:
+    cld
+    mov rax, rsi
+    stosq
+    xor rax, rax
+    mov rcx, 7
+    rep stosq
+    ret
+
+
+.global u512_bit
+u512_bit:
+    mov rcx, rsi
+    and rcx, 0x3f
+    shr rsi, 6
+    mov rax, [rdi + 8*rsi]
+    shr rax, cl
+    and rax, 1
+    ret
+
+
+.global u512_add3
+u512_add3:
+    mov rax, [rsi +  0]
+    add rax, [rdx +  0]
+    mov [rdi +  0], rax
+    .set k, 1
+    .rept 7
+        mov rax, [rsi + 8*k]
+        adc rax, [rdx + 8*k]
+        mov [rdi + 8*k], rax
+        .set k, k+1
+    .endr
+    setc al
+    movzx rax, al
+    ret
+
+.global u512_sub3
+u512_sub3:
+    mov rax, [rsi +  0]
+    sub rax, [rdx +  0]
+    mov [rdi +  0], rax
+    .set k, 1
+    .rept 7
+        mov rax, [rsi + 8*k]
+        sbb rax, [rdx + 8*k]
+        mov [rdi + 8*k], rax
+        .set k, k+1
+    .endr
+    setc al
+    movzx rax, al
+    ret
+
+
+.global u512_mul3_64
+u512_mul3_64:
+
+    mulx r10, rax, [rsi +  0]
+    mov [rdi +  0], rax
+
+    mulx r11, rax, [rsi +  8]
+    add  rax, r10
+    mov [rdi +  8], rax
+
+    mulx r10, rax, [rsi + 16]
+    adcx rax, r11
+    mov [rdi + 16], rax
+
+    mulx r11, rax, [rsi + 24]
+    adcx rax, r10
+    mov [rdi + 24], rax
+
+    mulx r10, rax, [rsi + 32]
+    adcx rax, r11
+    mov [rdi + 32],rax
+
+    mulx r11, rax, [rsi + 40]
+    adcx rax, r10
+    mov [rdi + 40],rax
+
+    mulx r10, rax, [rsi + 48]
+    adcx rax, r11
+    mov [rdi + 48],rax
+
+    mulx r11, rax, [rsi + 56]
+    adcx rax, r10
+    mov [rdi + 56],rax
+
+    ret
+