From c00c37e0118993aa4e1dcf544d15c7ebb055b0f4 Mon Sep 17 00:00:00 2001 From: Michael Meyer Date: Thu, 23 Aug 2018 13:49:45 +0200 Subject: [PATCH] c-implementation --- Makefile | 25 +++ bench.c | 54 ++++++ csidh.c | 220 ++++++++++++++++++++++++ csidh.h | 26 +++ cycle.h | 514 +++++++++++++++++++++++++++++++++++++++++++++++++++++++ fp.h | 37 ++++ fp.s | 452 ++++++++++++++++++++++++++++++++++++++++++++++++ main.c | 99 +++++++++++ mont.c | 203 ++++++++++++++++++++++ mont.h | 19 ++ rng.c | 18 ++ rng.h | 8 + u512.h | 22 +++ u512.s | 102 +++++++++++ 14 files changed, 1799 insertions(+) create mode 100644 Makefile create mode 100644 bench.c create mode 100644 csidh.c create mode 100644 csidh.h create mode 100644 cycle.h create mode 100644 fp.h create mode 100644 fp.s create mode 100644 main.c create mode 100644 mont.c create mode 100644 mont.h create mode 100644 rng.c create mode 100644 rng.h create mode 100644 u512.h create mode 100644 u512.s diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1077519 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +all: + @gcc \ + -Wall -Wextra \ + -O0 -funroll-loops \ + rng.c \ + u512.s fp.s \ + mont.c \ + csidh.c \ + main.c \ + -o main + +debug: + gcc \ + -Wall -Wextra \ + -g \ + rng.c \ + u512.s fp.s \ + mont.c \ + csidh.c \ + main.c \ + -o main + +clean: + rm -f main + diff --git a/bench.c b/bench.c new file mode 100644 index 0000000..fd0f64a --- /dev/null +++ b/bench.c @@ -0,0 +1,54 @@ + +#include +#include +#include +#include +#include + +#include "u512.h" +#include "fp.h" +#include "mont.h" +#include "csidh.h" + +#include + +static __inline__ uint64_t rdtsc(void) +{ + uint32_t hi, lo; + __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return lo | (uint64_t) hi << 32; +} + +unsigned long its = 10000; + +int main() +{ + clock_t t0, t1, time = 0; + uint64_t c0, c1, cycles = 0; + + private_key priv; + public_key pub = base; + + for (unsigned long i = 0; i < its; ++i) { + + csidh_private(&priv); + + t0 = clock(); + c0 = rdtsc(); + + /**************************************/ + assert(validate(&pub)); + action(&pub, &pub, &priv); + /**************************************/ + + c1 = rdtsc(); + t1 = clock(); + cycles += c1 - c0; + time += t1 - t0; + } + + printf("iterations: %lu\n", its); + printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its); + printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its); +} + diff --git a/csidh.c b/csidh.c new file mode 100644 index 0000000..25d897f --- /dev/null +++ b/csidh.c @@ -0,0 +1,220 @@ + +#include +#include + +#include "csidh.h" +#include "rng.h" + +/* specific to p, should perhaps be somewhere else */ +const unsigned primes[num_primes] = { + 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, + 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, + 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, + 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, + 317, 331, 337, 347, 349, 353, 359, 367, 373, 587, +}; + +const u512 four_sqrt_p = {{ + 0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964, +}}; + + +const public_key base = {0}; /* A = 0 */ + +void csidh_private(private_key *priv) +{ + memset(&priv->e, 0, sizeof(priv->e)); + for (size_t i = 0; i < num_primes; ) { + int8_t buf[64]; + randombytes(buf, sizeof(buf)); + for (size_t j = 0; j < sizeof(buf); ++j) { + if (buf[j] <= max_exponent && buf[j] >= -max_exponent) { + priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4; + if (++i >= num_primes) + break; + } + } + } +} + +/* compute [(p+1)/l] P for all l in our list of primes. */ +/* divide and conquer is much faster than doing it naively, + * but uses more memory. */ +static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper) +{ + assert(lower < upper); + + if (upper - lower == 1) + return; + + size_t mid = lower + (upper - lower + 1) / 2; + + u512 cl = u512_1, cu = u512_1; + for (size_t i = lower; i < mid; ++i) + u512_mul3_64(&cu, &cu, primes[i]); + for (size_t i = mid; i < upper; ++i) + u512_mul3_64(&cl, &cl, primes[i]); + + xMUL(&P[mid], A, &P[lower], &cu); + xMUL(&P[lower], A, &P[lower], &cl); + + cofactor_multiples(P, A, lower, mid); + cofactor_multiples(P, A, mid, upper); +} + +/* never accepts invalid keys. */ +bool validate(public_key const *in) +{ + const proj A = {in->A, fp_1}; + + do { + + proj P[num_primes]; + fp_random(&P->x); + P->z = fp_1; + + /* maximal 2-power in p+1 */ + xDBL(P, &A, P); + xDBL(P, &A, P); + + cofactor_multiples(P, &A, 0, num_primes); + + u512 order = u512_1; + + for (size_t i = num_primes - 1; i < num_primes; --i) { + + /* we only gain information if [(p+1)/l] P is non-zero */ + if (memcmp(&P[i].z, &fp_0, sizeof(fp))) { + + u512 tmp; + u512_set(&tmp, primes[i]); + xMUL(&P[i], &A, &P[i], &tmp); + + if (memcmp(&P[i].z, &fp_0, sizeof(fp))) + /* P does not have order dividing p+1. */ + return false; + + u512_mul3_64(&order, &order, primes[i]); + + if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */ + /* order > 4 sqrt(p), hence definitely supersingular */ + return true; + } + } + + /* P didn't have big enough order to prove supersingularity. */ + } while (1); +} + +/* compute x^3 + Ax^2 + x */ +static void montgomery_rhs(fp *rhs, fp const *A, fp const *x) +{ + fp tmp; + *rhs = *x; + fp_sq1(rhs); + fp_mul3(&tmp, A, x); + fp_add2(rhs, &tmp); + fp_add2(rhs, &fp_1); + fp_mul2(rhs, x); +} + +/* totally not constant-time. */ +void action(public_key *out, public_key const *in, private_key const *priv) +{ + u512 k[2]; + u512_set(&k[0], 4); /* maximal 2-power in p+1 */ + u512_set(&k[1], 4); /* maximal 2-power in p+1 */ + + uint8_t e[2][num_primes]; + + for (size_t i = 0; i < num_primes; ++i) { + + int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4; + + if (t > 0) { + e[0][i] = t; + e[1][i] = 0; + u512_mul3_64(&k[1], &k[1], primes[i]); + } + else if (t < 0) { + e[1][i] = -t; + e[0][i] = 0; + u512_mul3_64(&k[0], &k[0], primes[i]); + } + else { + e[0][i] = 0; + e[1][i] = 0; + u512_mul3_64(&k[0], &k[0], primes[i]); + u512_mul3_64(&k[1], &k[1], primes[i]); + } + } + + proj A = {in->A, fp_1}; + + bool done[2] = {false, false}; + + do { + + assert(!memcmp(&A.z, &fp_1, sizeof(fp))); + + proj P; + fp_random(&P.x); + P.z = fp_1; + + fp rhs; + montgomery_rhs(&rhs, &A.x, &P.x); + bool sign = !fp_issquare(&rhs); + + if (done[sign]) + continue; + + xMUL(&P, &A, &P, &k[sign]); + + done[sign] = true; + + for (size_t i = num_primes-1; i < num_primes; --i) { //changed loop direction + + if (e[sign][i]) { + + u512 cof = u512_1; + for (size_t j = i - 1; j < num_primes; --j) //changed loop direction + if (e[sign][j]) + u512_mul3_64(&cof, &cof, primes[j]); + + proj K; + xMUL(&K, &A, &P, &cof); + + if (memcmp(&K.z, &fp_0, sizeof(fp))) { + + xISOG(&A, &P, &K, primes[i]); + + if (!--e[sign][i]) + u512_mul3_64(&k[sign], &k[sign], primes[i]); + + } + + } + + done[sign] &= !e[sign][i]; + } + + fp_inv(&A.z); + fp_mul2(&A.x, &A.z); + A.z = fp_1; + + } while (!(done[0] && done[1])); + + out->A = A.x; +} + +/* includes public-key validation. */ +bool csidh(public_key *out, public_key const *in, private_key const *priv) +{ + if (!validate(in)) { + fp_random(&out->A); + return false; + } + action(out, in, priv); + return true; +} + diff --git a/csidh.h b/csidh.h new file mode 100644 index 0000000..52e9d54 --- /dev/null +++ b/csidh.h @@ -0,0 +1,26 @@ +#ifndef CSIDH_H +#define CSIDH_H + +#include "u512.h" +#include "fp.h" +#include "mont.h" + +/* specific to p, should perhaps be somewhere else */ +#define num_primes 74 +#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */ + + +typedef struct private_key { + int8_t e[(num_primes + 1) / 2]; /* packed int4_t */ +} private_key; + +typedef struct public_key { + fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */ +} public_key; + +extern const public_key base; + +void csidh_private(private_key *priv); +bool csidh(public_key *out, public_key const *in, private_key const *priv); + +#endif diff --git a/cycle.h b/cycle.h new file mode 100644 index 0000000..2652a04 --- /dev/null +++ b/cycle.h @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2003, 2007-8 Matteo Frigo + * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + + +/* machine-dependent cycle counters code. Needs to be inlined. */ + +/***************************************************************************/ +/* To use the cycle counters in your code, simply #include "cycle.h" (this + file), and then use the functions/macros: + + ticks getticks(void); + + ticks is an opaque typedef defined below, representing the current time. + You extract the elapsed time between two calls to gettick() via: + + double elapsed(ticks t1, ticks t0); + + which returns a double-precision variable in arbitrary units. You + are not expected to convert this into human units like seconds; it + is intended only for *comparisons* of time intervals. + + (In order to use some of the OS-dependent timer routines like + Solaris' gethrtime, you need to paste the autoconf snippet below + into your configure.ac file and #include "config.h" before cycle.h, + or define the relevant macros manually if you are not using autoconf.) +*/ + +/***************************************************************************/ +/* This file uses macros like HAVE_GETHRTIME that are assumed to be + defined according to whether the corresponding function/type/header + is available on your system. The necessary macros are most + conveniently defined if you are using GNU autoconf, via the tests: + + dnl --------------------------------------------------------------------- + + AC_C_INLINE + AC_HEADER_TIME + AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) + + AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in ])],,[#if HAVE_SYS_TIME_H +#include +#endif]) + + AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time]) + + dnl Cray UNICOS _rtc() (real-time clock) intrinsic + AC_MSG_CHECKING([for _rtc intrinsic]) + rtc_ok=yes + AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H +#include +#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no]) + AC_MSG_RESULT($rtc_ok) + + dnl --------------------------------------------------------------------- +*/ + +/***************************************************************************/ + +#if TIME_WITH_SYS_TIME +# include +# include +#else +# if HAVE_SYS_TIME_H +# include +# else +# include +# endif +#endif + +#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \ +{ \ + return (double)t1 - (double)t0; \ +} + +/*----------------------------------------------------------------*/ +/* Solaris */ +#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) +typedef hrtime_t ticks; + +#define getticks gethrtime + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* AIX v. 4+ routines to read the real-time clock or time-base register */ +#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER) +typedef timebasestruct_t ticks; + +static __inline ticks getticks(void) +{ + ticks t; + read_real_time(&t, TIMEBASE_SZ); + return t; +} + +static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ +{ + time_base_to_time(&t1, TIMEBASE_SZ); + time_base_to_time(&t0, TIMEBASE_SZ); + return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + + ((double)t1.tb_low - (double)t0.tb_low)); +} + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PowerPC ``cycle'' counter using the time base register. + */ +#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned int tbl, tbu0, tbu1; + + do { + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); + } while (tbu0 != tbu1); + + return (((unsigned long long)tbu0) << 32) | tbl; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime, + from Carbon, requires no additional libraries to be linked). */ +#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER) +#include +typedef uint64_t ticks; +#define getticks mach_absolute_time +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * Pentium cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("rdtsc": "=A" (ret)); + /* no input, nothing else clobbered */ + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/* Visual C++ -- thanks to Morten Nissov for his help with this */ +#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER) +#include +typedef LARGE_INTEGER ticks; +#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ + +static __inline ticks getticks(void) +{ + ticks retval; + + __asm { + RDTSC + mov retval.HighPart, edx + mov retval.LowPart, eax + } + return retval; +} + +static __inline double elapsed(ticks t1, ticks t0) +{ + return (double)t1.QuadPart - (double)t0.QuadPart; +} + +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/*----------------------------------------------------------------*/ +/* + * X86-64 cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned a, d; + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((ticks)a) | (((ticks)d) << 32); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori. + NOTE: this code will fail to link unless you use the -Masmkeyword compiler + option (grrr). */ +#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; +static ticks getticks(void) +{ + asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); +} +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/* Visual C++, courtesy of Dirk Michaelis */ +#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER) + +#include +#pragma intrinsic(__rdtsc) +typedef unsigned __int64 ticks; +#define getticks __rdtsc +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * IA64 cycle counter + */ + +/* intel's icc/ecc compiler */ +#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; +#include + +static __inline__ ticks getticks(void) +{ + return __getReg(_IA64_REG_AR_ITC); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* gcc */ +#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */ +#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER) +#include +typedef unsigned long ticks; + +static inline ticks getticks(void) +{ + ticks ret; + + ret = _Asm_mov_from_ar (_AREG_ITC); + return ret; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/* Microsoft Visual C++ */ +#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER) +typedef unsigned __int64 ticks; + +# ifdef __cplusplus +extern "C" +# endif +ticks __getReg(int whichReg); +#pragma intrinsic(__getReg) + +static __inline ticks getticks(void) +{ + volatile ticks temp; + temp = __getReg(3116); + return temp; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PA-RISC cycle counter + */ +#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +# ifdef __GNUC__ +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("mfctl 16, %0": "=r" (ret)); + /* no input, nothing else clobbered */ + return ret; +} +# else +# include +static inline unsigned long getticks(void) +{ + register ticks ret; + _MFCTL(16, ret); + return ret; +} +# endif + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* S390, courtesy of James Treacy */ +#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks cycles; + __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc"); + return cycles; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) +/* + * The 32-bit cycle counter on alpha overflows pretty quickly, + * unfortunately. A 1GHz machine overflows in 4 seconds. + */ +typedef unsigned int ticks; + +static __inline__ ticks getticks(void) +{ + unsigned long cc; + __asm__ __volatile__ ("rpcc %0" : "=r"(cc)); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + __asm__ __volatile__("rd %%tick, %0" : "=r" (ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) +# include +typedef unsigned int ticks; + +static __inline ticks getticks(void) +{ + unsigned long cc; + cc = asm("rpcc %v0"); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +/* SGI/Irix */ +#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) +typedef struct timespec ticks; + +static inline ticks getticks(void) +{ + struct timespec t; + clock_gettime(CLOCK_SGI_CYCLE, &t); + return t; +} + +static inline double elapsed(ticks t1, ticks t0) +{ + return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 + + ((double)t1.tv_nsec - (double)t0.tv_nsec); +} +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* Cray UNICOS _rtc() intrinsic function */ +#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) +#ifdef HAVE_INTRINSICS_H +# include +#endif + +typedef long long ticks; + +#define getticks _rtc + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* MIPS ZBus */ +#if HAVE_MIPS_ZBUS_TIMER +#if defined(__mips__) && !defined(HAVE_TICK_COUNTER) +#include +#include +#include + +typedef uint64_t ticks; + +static inline ticks getticks(void) +{ + static uint64_t* addr = 0; + + if (addr == 0) + { + uint32_t rq_addr = 0x10030000; + int fd; + int pgsize; + + pgsize = getpagesize(); + fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0); + if (fd < 0) { + perror("open"); + return NULL; + } + addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr); + close(fd); + if (addr == (uint64_t *)-1) { + perror("mmap"); + return NULL; + } + } + + return *addr; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif +#endif /* HAVE_MIPS_ZBUS_TIMER */ + diff --git a/fp.h b/fp.h new file mode 100644 index 0000000..6cfa064 --- /dev/null +++ b/fp.h @@ -0,0 +1,37 @@ +#ifndef FP_H +#define FP_H + +#include "u512.h" + +/* fp is in the Montgomery domain, so interpreting that + as an integer should never make sense. + enable compiler warnings when mixing up u512 and fp. */ +typedef struct fp { + u512 x; +} fp; + +extern const fp fp_0; +extern const fp fp_1; + +void fp_set(fp *x, uint64_t y); +void fp_cswap(fp *x, fp *y, bool c); + +void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */ +void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */ + +void fp_add2(fp *x, fp const *y); +void fp_sub2(fp *x, fp const *y); +void fp_mul2(fp *x, fp const *y); + +void fp_add3(fp *x, fp const *y, fp const *z); +void fp_sub3(fp *x, fp const *y, fp const *z); +void fp_mul3(fp *x, fp const *y, fp const *z); + +void fp_sq1(fp *x); +void fp_sq2(fp *x, fp const *y); +void fp_inv(fp *x); +bool fp_issquare(fp const *x); + +void fp_random(fp *x); + +#endif diff --git a/fp.s b/fp.s new file mode 100644 index 0000000..ac65d94 --- /dev/null +++ b/fp.s @@ -0,0 +1,452 @@ + +.intel_syntax noprefix + +.section .rodata + +.set pbits, 511 +p: + .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 + .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf + + +.global fp_0 +fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0 + +.global fp_1 +fp_1: /* 2^512 mod p */ + .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1 + .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80 + + +/* (2^512)^2 mod p */ +.r_squared_mod_p: + .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1 + .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e + +/* -p^-1 mod 2^64 */ +.inv_min_p_mod_r: + .quad 0x66c1301f632e294d + + +.section .text + +.global fp_copy +fp_copy: + cld + mov rcx, 8 + rep movsq + ret + +.global fp_set +fp_set: + push rdi + call u512_set + pop rdi + mov rsi, rdi + jmp fp_enc + +.global fp_cswap +fp_cswap: + movzx rax, dl + neg rax + .set k, 0 + .rept 8 + mov rcx, [rdi + 8*k] + mov rdx, [rsi + 8*k] + + mov r8, rcx + xor r8, rdx + and r8, rax + + xor rcx, r8 + xor rdx, r8 + + mov [rdi + 8*k], rcx + mov [rsi + 8*k], rdx + + .set k, k+1 + .endr + ret + +.reduce_once: + push rbp + mov rbp, rdi + + mov rdi, [rbp + 0] + sub rdi, [rip + p + 0] + mov rsi, [rbp + 8] + sbb rsi, [rip + p + 8] + mov rdx, [rbp + 16] + sbb rdx, [rip + p + 16] + mov rcx, [rbp + 24] + sbb rcx, [rip + p + 24] + mov r8, [rbp + 32] + sbb r8, [rip + p + 32] + mov r9, [rbp + 40] + sbb r9, [rip + p + 40] + mov r10, [rbp + 48] + sbb r10, [rip + p + 48] + mov r11, [rbp + 56] + sbb r11, [rip + p + 56] + + setnc al + movzx rax, al + neg rax + +.macro cswap2, r, m + xor \r, \m + and \r, rax + xor \m, \r +.endm + + cswap2 rdi, [rbp + 0] + cswap2 rsi, [rbp + 8] + cswap2 rdx, [rbp + 16] + cswap2 rcx, [rbp + 24] + cswap2 r8, [rbp + 32] + cswap2 r9, [rbp + 40] + cswap2 r10, [rbp + 48] + cswap2 r11, [rbp + 56] + + pop rbp + ret + +.global fp_add3 +fp_add3: + push rdi + call u512_add3 + pop rdi + jmp .reduce_once + +.global fp_add2 +fp_add2: + mov rdx, rdi + jmp fp_add3 + +.global fp_sub3 +fp_sub3: + push rdi + call u512_sub3 + pop rdi + xor rsi, rsi + xor rdx, rdx + xor rcx, rcx + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + test rax, rax + cmovnz rax, [rip + p + 0] + cmovnz rsi, [rip + p + 8] + cmovnz rdx, [rip + p + 16] + cmovnz rcx, [rip + p + 24] + cmovnz r8, [rip + p + 32] + cmovnz r9, [rip + p + 40] + cmovnz r10, [rip + p + 48] + cmovnz r11, [rip + p + 56] + add [rdi + 0], rax + adc [rdi + 8], rsi + adc [rdi + 16], rdx + adc [rdi + 24], rcx + adc [rdi + 32], r8 + adc [rdi + 40], r9 + adc [rdi + 48], r10 + adc [rdi + 56], r11 + ret + +.global fp_sub2 +fp_sub2: + mov rdx, rdi + xchg rsi, rdx + jmp fp_sub3 + + +/* Montgomery arithmetic */ + +.global fp_enc +fp_enc: + lea rdx, [rip + .r_squared_mod_p] + jmp fp_mul3 + +.global fp_dec +fp_dec: + lea rdx, [rip + u512_1] + jmp fp_mul3 + +.global fp_mul3 +fp_mul3: + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + + push rdi + + mov rdi, rsi + mov rsi, rdx + + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + xor rbp, rbp + + /* flags are already cleared */ + +.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8 + + mov rdx, [rsi + 0] + mulx rcx, rdx, [rdi + 8*\k] + add rdx, \r0 + mulx rcx, rdx, [rip + .inv_min_p_mod_r] + + xor rax, rax /* clear flags */ + + mulx rbx, rax, [rip + p + 0] + adox \r0, rax + + mulx rcx, rax, [rip + p + 8] + adcx \r1, rbx + adox \r1, rax + + mulx rbx, rax, [rip + p + 16] + adcx \r2, rcx + adox \r2, rax + + mulx rcx, rax, [rip + p + 24] + adcx \r3, rbx + adox \r3, rax + + mulx rbx, rax, [rip + p + 32] + adcx \r4, rcx + adox \r4, rax + + mulx rcx, rax, [rip + p + 40] + adcx \r5, rbx + adox \r5, rax + + mulx rbx, rax, [rip + p + 48] + adcx \r6, rcx + adox \r6, rax + + mulx rcx, rax, [rip + p + 56] + adcx \r7, rbx + adox \r7, rax + + mov rax, 0 + adcx \r8, rcx + adox \r8, rax + + + mov rdx, [rdi + 8*\k] + + xor rax, rax /* clear flags */ + + mulx rbx, rax, [rsi + 0] + adox \r0, rax + + mulx rcx, rax, [rsi + 8] + adcx \r1, rbx + adox \r1, rax + + mulx rbx, rax, [rsi + 16] + adcx \r2, rcx + adox \r2, rax + + mulx rcx, rax, [rsi + 24] + adcx \r3, rbx + adox \r3, rax + + mulx rbx, rax, [rsi + 32] + adcx \r4, rcx + adox \r4, rax + + mulx rcx, rax, [rsi + 40] + adcx \r5, rbx + adox \r5, rax + + mulx rbx, rax, [rsi + 48] + adcx \r6, rcx + adox \r6, rax + + mulx rcx, rax, [rsi + 56] + adcx \r7, rbx + adox \r7, rax + + mov rax, 0 + adcx \r8, rcx + adox \r8, rax + +.endm + + MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp + MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8 + MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9 + MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10 + MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11 + MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12 + MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13 + MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14 + + pop rdi + + mov [rdi + 0], rbp + mov [rdi + 8], r8 + mov [rdi + 16], r9 + mov [rdi + 24], r10 + mov [rdi + 32], r11 + mov [rdi + 40], r12 + mov [rdi + 48], r13 + mov [rdi + 56], r14 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + jmp .reduce_once + +.global fp_mul2 +fp_mul2: + mov rdx, rdi + jmp fp_mul3 + +.global fp_sq2 +fp_sq2: + /* TODO implement optimized Montgomery squaring */ + mov rdx, rsi + jmp fp_mul3 + +.global fp_sq1 +fp_sq1: + mov rsi, rdi + jmp fp_sq2 + +/* (obviously) not constant time in the exponent! */ +.fp_pow: + push rbx + mov rbx, rsi + push r12 + push r13 + push rdi + sub rsp, 64 + + mov rsi, rdi + mov rdi, rsp + call fp_copy + + mov rdi, [rsp + 64] + lea rsi, [rip + fp_1] + call fp_copy + +.macro POWSTEP, k + mov r13, [rbx + 8*\k] + xor r12, r12 + + 0: + test r13, 1 + jz 1f + + mov rdi, [rsp + 64] + mov rsi, rsp + call fp_mul2 + + 1: + mov rdi, rsp + call fp_sq1 + + shr r13 + + inc r12 + test r12, 64 + jz 0b +.endm + + POWSTEP 0 + POWSTEP 1 + POWSTEP 2 + POWSTEP 3 + POWSTEP 4 + POWSTEP 5 + POWSTEP 6 + POWSTEP 7 + + add rsp, 64+8 + pop r13 + pop r12 + pop rbx + ret + +.section .rodata +.p_minus_2: + .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 + .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf + +.section .text + +/* TODO use a better addition chain? */ +.global fp_inv +fp_inv: + lea rsi, [rip + .p_minus_2] + jmp .fp_pow + +.section .rodata +.p_minus_1_halves: + .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83 + .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df + +.section .text + +/* TODO use a better addition chain? */ +.global fp_issquare +fp_issquare: + push rdi + lea rsi, [rip + .p_minus_1_halves] + call .fp_pow + pop rdi + + xor rax, rax + .set k, 0 + .rept 8 + mov rsi, [rdi + 8*k] + xor rsi, [rip + fp_1 + 8*k] + or rax, rsi + .set k, k+1 + .endr + test rax, rax + setz al + movzx rax, al + ret + + +/* not constant time (but this shouldn't leak anything of importance) */ +.global fp_random +fp_random: + + push rdi + mov rsi, 64 + call randombytes + pop rdi + mov rax, 1 + shl rax, (pbits % 64) + dec rax + and [rdi + 56], rax + + .set k, 7 + .rept 8 + mov rax, [rip + p + 8*k] + cmp [rdi + 8*k], rax + jge fp_random + jl 0f + .set k, k-1 + .endr + 0: + ret + diff --git a/main.c b/main.c new file mode 100644 index 0000000..6649bfc --- /dev/null +++ b/main.c @@ -0,0 +1,99 @@ + +#include +#include +#include +#include +#include +#include + +#include "u512.h" +#include "fp.h" +#include "mont.h" +#include "csidh.h" + +void u512_print(u512 const *x) +{ + for (size_t i = 63; i < 64; --i) + printf("%02hhx", i[(unsigned char *) x->c]); +} + +void fp_print(fp const *x) +{ + u512 y; + fp_dec(&y, x); + u512_print(&y); +} + +int main() +{ + clock_t t0, t1; + + private_key priv_alice, priv_bob; + public_key pub_alice, pub_bob; + public_key shared_alice, shared_bob; + + printf("\n"); + + + t0 = clock(); + csidh_private(&priv_alice); + t1 = clock(); + + printf("Alice's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + for (size_t i = 0; i < sizeof(priv_alice); ++i) + printf("%02hhx", i[(uint8_t *) &priv_alice]); + printf("\n\n"); + + t0 = clock(); + csidh_private(&priv_bob); + t1 = clock(); + + printf("Bob's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + for (size_t i = 0; i < sizeof(priv_bob); ++i) + printf("%02hhx", i[(uint8_t *) &priv_bob]); + printf("\n\n"); + + + t0 = clock(); + assert(csidh(&pub_alice, &base, &priv_alice)); + t1 = clock(); + + printf("Alice's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&pub_alice.A); + printf("\n\n"); + + t0 = clock(); + assert(csidh(&pub_bob, &base, &priv_bob)); + t1 = clock(); + + printf("Bob's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&pub_bob.A); + printf("\n\n"); + + + t0 = clock(); + assert(csidh(&shared_alice, &pub_bob, &priv_alice)); + t1 = clock(); + + printf("Alice's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&shared_alice.A); + printf("\n\n"); + + t0 = clock(); + assert(csidh(&shared_bob, &pub_alice, &priv_bob)); + t1 = clock(); + + printf("Bob's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&shared_bob.A); + printf("\n\n"); + + printf(" "); + if (memcmp(&shared_alice, &shared_bob, sizeof(public_key))) + printf("\x1b[31mNOT EQUAL!\x1b[0m\n"); + else + printf("\x1b[32mequal.\x1b[0m\n"); + printf("\n"); + + printf("\n"); +} + diff --git a/mont.c b/mont.c new file mode 100644 index 0000000..6950969 --- /dev/null +++ b/mont.c @@ -0,0 +1,203 @@ + +#include + +#include "mont.h" +#include "u512.h" + +void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24) +{ + fp tmp0, tmp1, tmp2; //requires precomputation of A24=(A+2C:4C) + + fp_add3(&tmp0, &P->x, &P->z); + fp_sub3(&tmp1, &P->x, &P->z); + fp_sq2(&R->x, &tmp0); + fp_sub3(&tmp2, &Q->x, &Q->z); + fp_add3(&S->x, &Q->x, &Q->z); + fp_mul2(&tmp0, &tmp2); + fp_sq2(&R->z, &tmp1); + fp_mul2(&tmp1, &S->x); + fp_sub3(&tmp2, &R->x, &R->z); + fp_mul2(&R->z, &A24->z); + fp_mul2(&R->x, &R->z); + fp_mul3(&S->x, &A24->x, &tmp2); + fp_sub3(&S->z, &tmp0, &tmp1); + fp_add2(&R->z, &S->x); + fp_add3(&S->x, &tmp0, &tmp1); + fp_mul2(&R->z, &tmp2); + fp_sq1(&S->z); + fp_sq1(&S->x); + fp_mul2(&S->z, &PQ->x); + fp_mul2(&S->x, &PQ->z); +} + +void xDBL(proj *Q, proj const *A, proj const *P) +{ + fp a, b, c; + fp_add3(&a, &P->x, &P->z); + fp_sq1(&a); + fp_sub3(&b, &P->x, &P->z); + fp_sq1(&b); + fp_sub3(&c, &a, &b); + fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */ + fp_mul2(&b, &A->z); + fp_mul3(&Q->x, &a, &b); + fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */ + fp_add2(&a, &A->x); + fp_mul2(&a, &c); + fp_add2(&a, &b); + fp_mul3(&Q->z, &a, &c); +} + +void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ) +{ + fp a, b, c, d; + fp_add3(&a, &P->x, &P->z); + fp_sub3(&b, &P->x, &P->z); + fp_add3(&c, &Q->x, &Q->z); + fp_sub3(&d, &Q->x, &Q->z); + fp_mul2(&a, &d); + fp_mul2(&b, &c); + fp_add3(&c, &a, &b); + fp_sub3(&d, &a, &b); + fp_sq1(&c); + fp_sq1(&d); + fp_mul3(&S->x, &PQ->z, &c); + fp_mul3(&S->z, &PQ->x, &d); +} + +/* Montgomery ladder. */ +/* P must not be the unique point of order 2. */ +/* not constant-time! */ +void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k) +{ + proj R = *P; + proj A24; + const proj Pcopy = *P; /* in case Q = P */ + + Q->x = fp_1; + Q->z = fp_0; + + fp_add3(&A24.x, &A->z, &A->z); //precomputation of A24=(A+2C:4C) + fp_add3(&A24.z, &A24.x, &A24.x); + fp_add2(&A24.x, &A->x); + + unsigned long i = 512; + while (--i && !u512_bit(k, i)); + + do { + + bool bit = u512_bit(k, i); + + if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ + //fp_cswap(&Q->x, &R.x, bit); + //fp_cswap(&Q->z, &R.z, bit); + + xDBLADD(Q, &R, Q, &R, &Pcopy, &A24); + + if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ + //fp_cswap(&Q->x, &R.x, bit); + //fp_cswap(&Q->z, &R.z, bit); + + } while (i--); +} + +//simultaneous square-and-multiply, computes x^exp and y^exp +void exp_by_squaring_(fp* x, fp* y, uint64_t exp) +{ + fp result1, result2; + fp_set(&result1, 1); + fp_set(&result2, 1); + + while (exp) + { + if (exp & 1){ + fp_mul2(&result1, x); + fp_mul2(&result2, y); + } + + fp_sq1(x); + fp_sq1(y); + exp >>= 1; + } + + fp_cswap(&result1, x, 1); + fp_cswap(&result2, y, 1); + +} + + +/* computes the isogeny with kernel point K of order k */ +/* returns the new curve coefficient A and the image of P */ +/* (obviously) not constant time in k */ +void xISOG(proj *A, proj *P, proj const *K, uint64_t k) +{ + assert (k >= 3); + assert (k % 2 == 1); + + fp tmp0, tmp1, tmp2, Psum, Pdif; + proj Q, Aed, prod; + + fp_add3(&Aed.z, &A->z, &A->z); //compute twisted Edwards curve coefficients + fp_add3(&Aed.x, &A->x, &Aed.z); + fp_sub3(&Aed.z, &A->x, &Aed.z); + + fp_add3(&Psum, &P->x, &P->z); //precomputations + fp_sub3(&Pdif, &P->x, &P->z); + + fp_sub3(&prod.x, &K->x, &K->z); + fp_add3(&prod.z, &K->x, &K->z); + + fp_mul3(&tmp1, &prod.x, &Psum); + fp_mul3(&tmp0, &prod.z, &Pdif); + fp_add3(&Q.x, &tmp0, &tmp1); + fp_sub3(&Q.z, &tmp0, &tmp1); + + proj M[3] = {*K}; + xDBL(&M[1], A, K); + + for (uint64_t i = 1; i < k / 2; ++i) { + + if (i >= 2) + xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]); + + fp_sub3(&tmp1, &M[i % 3].x, &M[i % 3].z); + fp_add3(&tmp0, &M[i % 3].x, &M[i % 3].z); + fp_mul2(&prod.x, &tmp1); + fp_mul2(&prod.z, &tmp0); + fp_mul2(&tmp1, &Psum); + fp_mul2(&tmp0, &Pdif); + fp_add3(&tmp2, &tmp0, &tmp1); + fp_mul2(&Q.x, &tmp2); + fp_sub3(&tmp2, &tmp0, &tmp1); + fp_mul2(&Q.z, &tmp2); + + } + + + // point evaluation + fp_sq1(&Q.x); + fp_sq1(&Q.z); + fp_mul2(&P->x, &Q.x); + fp_mul2(&P->z, &Q.z); + + //compute Aed.x^k, Aed.z^k + exp_by_squaring_(&Aed.x, &Aed.z, k); + + //compute prod.x^8, prod.z^8 + fp_sq1(&prod.x); + fp_sq1(&prod.x); + fp_sq1(&prod.x); + fp_sq1(&prod.z); + fp_sq1(&prod.z); + fp_sq1(&prod.z); + + //compute image curve parameters + fp_mul2(&Aed.z, &prod.x); + fp_mul2(&Aed.x, &prod.z); + + //compute Montgomery params + fp_add3(&A->x, &Aed.x, &Aed.z); + fp_sub3(&A->z, &Aed.x, &Aed.z); + fp_add2(&A->x, &A->x); +} + diff --git a/mont.h b/mont.h new file mode 100644 index 0000000..36b3ad7 --- /dev/null +++ b/mont.h @@ -0,0 +1,19 @@ +#ifndef MONT_H +#define MONT_H + +#include "u512.h" +#include "fp.h" + +/* P^1 over fp. */ +typedef struct proj { + fp x; + fp z; +} proj; + +void xDBL(proj *Q, proj const *A, proj const *P); +void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ); +void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A); +void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k); +void xISOG(proj *A, proj *P, proj const *K, uint64_t k); + +#endif diff --git a/rng.c b/rng.c new file mode 100644 index 0000000..fc28c87 --- /dev/null +++ b/rng.c @@ -0,0 +1,18 @@ + +#include "rng.h" + +#include +#include +#include + +void randombytes(void *x, size_t l) +{ + static int fd = -1; + ssize_t n; + if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY))) + exit(1); + for (size_t i = 0; i < l; i += n) + if (0 >= (n = read(fd, (char *) x + i, l - i))) + exit(2); +} + diff --git a/rng.h b/rng.h new file mode 100644 index 0000000..30e01d0 --- /dev/null +++ b/rng.h @@ -0,0 +1,8 @@ +#ifndef RNG_H +#define RNG_H + +#include + +void randombytes(void *x, size_t l); + +#endif diff --git a/u512.h b/u512.h new file mode 100644 index 0000000..6eaae93 --- /dev/null +++ b/u512.h @@ -0,0 +1,22 @@ +#ifndef UINT_H +#define UINT_H + +#include +#include + +typedef struct u512 { + uint64_t c[8]; +} u512; + +extern const u512 u512_1; + +void u512_set(u512 *x, uint64_t y); + +bool u512_bit(u512 const *x, uint64_t k); + +bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */ +bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */ + +void u512_mul3_64(u512 *x, u512 const *y, uint64_t z); + +#endif diff --git a/u512.s b/u512.s new file mode 100644 index 0000000..779be3e --- /dev/null +++ b/u512.s @@ -0,0 +1,102 @@ + +.intel_syntax noprefix + +.section .rodata + +.global u512_1 +u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0 + + +.section .text + +.global u512_set +u512_set: + cld + mov rax, rsi + stosq + xor rax, rax + mov rcx, 7 + rep stosq + ret + + +.global u512_bit +u512_bit: + mov rcx, rsi + and rcx, 0x3f + shr rsi, 6 + mov rax, [rdi + 8*rsi] + shr rax, cl + and rax, 1 + ret + + +.global u512_add3 +u512_add3: + mov rax, [rsi + 0] + add rax, [rdx + 0] + mov [rdi + 0], rax + .set k, 1 + .rept 7 + mov rax, [rsi + 8*k] + adc rax, [rdx + 8*k] + mov [rdi + 8*k], rax + .set k, k+1 + .endr + setc al + movzx rax, al + ret + +.global u512_sub3 +u512_sub3: + mov rax, [rsi + 0] + sub rax, [rdx + 0] + mov [rdi + 0], rax + .set k, 1 + .rept 7 + mov rax, [rsi + 8*k] + sbb rax, [rdx + 8*k] + mov [rdi + 8*k], rax + .set k, k+1 + .endr + setc al + movzx rax, al + ret + + +.global u512_mul3_64 +u512_mul3_64: + + mulx r10, rax, [rsi + 0] + mov [rdi + 0], rax + + mulx r11, rax, [rsi + 8] + add rax, r10 + mov [rdi + 8], rax + + mulx r10, rax, [rsi + 16] + adcx rax, r11 + mov [rdi + 16], rax + + mulx r11, rax, [rsi + 24] + adcx rax, r10 + mov [rdi + 24], rax + + mulx r10, rax, [rsi + 32] + adcx rax, r11 + mov [rdi + 32],rax + + mulx r11, rax, [rsi + 40] + adcx rax, r10 + mov [rdi + 40],rax + + mulx r10, rax, [rsi + 48] + adcx rax, r11 + mov [rdi + 48],rax + + mulx r11, rax, [rsi + 56] + adcx rax, r10 + mov [rdi + 56],rax + + ret +