commit c00c37e0118993aa4e1dcf544d15c7ebb055b0f4 Author: Michael Meyer Date: Thu Aug 23 13:49:45 2018 +0200 c-implementation diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..1077519 --- /dev/null +++ b/Makefile @@ -0,0 +1,25 @@ +all: + @gcc \ + -Wall -Wextra \ + -O0 -funroll-loops \ + rng.c \ + u512.s fp.s \ + mont.c \ + csidh.c \ + main.c \ + -o main + +debug: + gcc \ + -Wall -Wextra \ + -g \ + rng.c \ + u512.s fp.s \ + mont.c \ + csidh.c \ + main.c \ + -o main + +clean: + rm -f main + diff --git a/bench.c b/bench.c new file mode 100644 index 0000000..fd0f64a --- /dev/null +++ b/bench.c @@ -0,0 +1,54 @@ + +#include +#include +#include +#include +#include + +#include "u512.h" +#include "fp.h" +#include "mont.h" +#include "csidh.h" + +#include + +static __inline__ uint64_t rdtsc(void) +{ + uint32_t hi, lo; + __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); + return lo | (uint64_t) hi << 32; +} + +unsigned long its = 10000; + +int main() +{ + clock_t t0, t1, time = 0; + uint64_t c0, c1, cycles = 0; + + private_key priv; + public_key pub = base; + + for (unsigned long i = 0; i < its; ++i) { + + csidh_private(&priv); + + t0 = clock(); + c0 = rdtsc(); + + /**************************************/ + assert(validate(&pub)); + action(&pub, &pub, &priv); + /**************************************/ + + c1 = rdtsc(); + t1 = clock(); + cycles += c1 - c0; + time += t1 - t0; + } + + printf("iterations: %lu\n", its); + printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its); + printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its); +} + diff --git a/csidh.c b/csidh.c new file mode 100644 index 0000000..25d897f --- /dev/null +++ b/csidh.c @@ -0,0 +1,220 @@ + +#include +#include + +#include "csidh.h" +#include "rng.h" + +/* specific to p, should perhaps be somewhere else */ +const unsigned primes[num_primes] = { + 3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, + 61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, + 139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, + 229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, + 317, 331, 337, 347, 349, 353, 359, 367, 373, 587, +}; + +const u512 four_sqrt_p = {{ + 0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964, +}}; + + +const public_key base = {0}; /* A = 0 */ + +void csidh_private(private_key *priv) +{ + memset(&priv->e, 0, sizeof(priv->e)); + for (size_t i = 0; i < num_primes; ) { + int8_t buf[64]; + randombytes(buf, sizeof(buf)); + for (size_t j = 0; j < sizeof(buf); ++j) { + if (buf[j] <= max_exponent && buf[j] >= -max_exponent) { + priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4; + if (++i >= num_primes) + break; + } + } + } +} + +/* compute [(p+1)/l] P for all l in our list of primes. */ +/* divide and conquer is much faster than doing it naively, + * but uses more memory. */ +static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper) +{ + assert(lower < upper); + + if (upper - lower == 1) + return; + + size_t mid = lower + (upper - lower + 1) / 2; + + u512 cl = u512_1, cu = u512_1; + for (size_t i = lower; i < mid; ++i) + u512_mul3_64(&cu, &cu, primes[i]); + for (size_t i = mid; i < upper; ++i) + u512_mul3_64(&cl, &cl, primes[i]); + + xMUL(&P[mid], A, &P[lower], &cu); + xMUL(&P[lower], A, &P[lower], &cl); + + cofactor_multiples(P, A, lower, mid); + cofactor_multiples(P, A, mid, upper); +} + +/* never accepts invalid keys. */ +bool validate(public_key const *in) +{ + const proj A = {in->A, fp_1}; + + do { + + proj P[num_primes]; + fp_random(&P->x); + P->z = fp_1; + + /* maximal 2-power in p+1 */ + xDBL(P, &A, P); + xDBL(P, &A, P); + + cofactor_multiples(P, &A, 0, num_primes); + + u512 order = u512_1; + + for (size_t i = num_primes - 1; i < num_primes; --i) { + + /* we only gain information if [(p+1)/l] P is non-zero */ + if (memcmp(&P[i].z, &fp_0, sizeof(fp))) { + + u512 tmp; + u512_set(&tmp, primes[i]); + xMUL(&P[i], &A, &P[i], &tmp); + + if (memcmp(&P[i].z, &fp_0, sizeof(fp))) + /* P does not have order dividing p+1. */ + return false; + + u512_mul3_64(&order, &order, primes[i]); + + if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */ + /* order > 4 sqrt(p), hence definitely supersingular */ + return true; + } + } + + /* P didn't have big enough order to prove supersingularity. */ + } while (1); +} + +/* compute x^3 + Ax^2 + x */ +static void montgomery_rhs(fp *rhs, fp const *A, fp const *x) +{ + fp tmp; + *rhs = *x; + fp_sq1(rhs); + fp_mul3(&tmp, A, x); + fp_add2(rhs, &tmp); + fp_add2(rhs, &fp_1); + fp_mul2(rhs, x); +} + +/* totally not constant-time. */ +void action(public_key *out, public_key const *in, private_key const *priv) +{ + u512 k[2]; + u512_set(&k[0], 4); /* maximal 2-power in p+1 */ + u512_set(&k[1], 4); /* maximal 2-power in p+1 */ + + uint8_t e[2][num_primes]; + + for (size_t i = 0; i < num_primes; ++i) { + + int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4; + + if (t > 0) { + e[0][i] = t; + e[1][i] = 0; + u512_mul3_64(&k[1], &k[1], primes[i]); + } + else if (t < 0) { + e[1][i] = -t; + e[0][i] = 0; + u512_mul3_64(&k[0], &k[0], primes[i]); + } + else { + e[0][i] = 0; + e[1][i] = 0; + u512_mul3_64(&k[0], &k[0], primes[i]); + u512_mul3_64(&k[1], &k[1], primes[i]); + } + } + + proj A = {in->A, fp_1}; + + bool done[2] = {false, false}; + + do { + + assert(!memcmp(&A.z, &fp_1, sizeof(fp))); + + proj P; + fp_random(&P.x); + P.z = fp_1; + + fp rhs; + montgomery_rhs(&rhs, &A.x, &P.x); + bool sign = !fp_issquare(&rhs); + + if (done[sign]) + continue; + + xMUL(&P, &A, &P, &k[sign]); + + done[sign] = true; + + for (size_t i = num_primes-1; i < num_primes; --i) { //changed loop direction + + if (e[sign][i]) { + + u512 cof = u512_1; + for (size_t j = i - 1; j < num_primes; --j) //changed loop direction + if (e[sign][j]) + u512_mul3_64(&cof, &cof, primes[j]); + + proj K; + xMUL(&K, &A, &P, &cof); + + if (memcmp(&K.z, &fp_0, sizeof(fp))) { + + xISOG(&A, &P, &K, primes[i]); + + if (!--e[sign][i]) + u512_mul3_64(&k[sign], &k[sign], primes[i]); + + } + + } + + done[sign] &= !e[sign][i]; + } + + fp_inv(&A.z); + fp_mul2(&A.x, &A.z); + A.z = fp_1; + + } while (!(done[0] && done[1])); + + out->A = A.x; +} + +/* includes public-key validation. */ +bool csidh(public_key *out, public_key const *in, private_key const *priv) +{ + if (!validate(in)) { + fp_random(&out->A); + return false; + } + action(out, in, priv); + return true; +} + diff --git a/csidh.h b/csidh.h new file mode 100644 index 0000000..52e9d54 --- /dev/null +++ b/csidh.h @@ -0,0 +1,26 @@ +#ifndef CSIDH_H +#define CSIDH_H + +#include "u512.h" +#include "fp.h" +#include "mont.h" + +/* specific to p, should perhaps be somewhere else */ +#define num_primes 74 +#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */ + + +typedef struct private_key { + int8_t e[(num_primes + 1) / 2]; /* packed int4_t */ +} private_key; + +typedef struct public_key { + fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */ +} public_key; + +extern const public_key base; + +void csidh_private(private_key *priv); +bool csidh(public_key *out, public_key const *in, private_key const *priv); + +#endif diff --git a/cycle.h b/cycle.h new file mode 100644 index 0000000..2652a04 --- /dev/null +++ b/cycle.h @@ -0,0 +1,514 @@ +/* + * Copyright (c) 2003, 2007-8 Matteo Frigo + * Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology + * + * Permission is hereby granted, free of charge, to any person obtaining + * a copy of this software and associated documentation files (the + * "Software"), to deal in the Software without restriction, including + * without limitation the rights to use, copy, modify, merge, publish, + * distribute, sublicense, and/or sell copies of the Software, and to + * permit persons to whom the Software is furnished to do so, subject to + * the following conditions: + * + * The above copyright notice and this permission notice shall be + * included in all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE + * LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION + * OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION + * WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. + * + */ + + +/* machine-dependent cycle counters code. Needs to be inlined. */ + +/***************************************************************************/ +/* To use the cycle counters in your code, simply #include "cycle.h" (this + file), and then use the functions/macros: + + ticks getticks(void); + + ticks is an opaque typedef defined below, representing the current time. + You extract the elapsed time between two calls to gettick() via: + + double elapsed(ticks t1, ticks t0); + + which returns a double-precision variable in arbitrary units. You + are not expected to convert this into human units like seconds; it + is intended only for *comparisons* of time intervals. + + (In order to use some of the OS-dependent timer routines like + Solaris' gethrtime, you need to paste the autoconf snippet below + into your configure.ac file and #include "config.h" before cycle.h, + or define the relevant macros manually if you are not using autoconf.) +*/ + +/***************************************************************************/ +/* This file uses macros like HAVE_GETHRTIME that are assumed to be + defined according to whether the corresponding function/type/header + is available on your system. The necessary macros are most + conveniently defined if you are using GNU autoconf, via the tests: + + dnl --------------------------------------------------------------------- + + AC_C_INLINE + AC_HEADER_TIME + AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) + + AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in ])],,[#if HAVE_SYS_TIME_H +#include +#endif]) + + AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time]) + + dnl Cray UNICOS _rtc() (real-time clock) intrinsic + AC_MSG_CHECKING([for _rtc intrinsic]) + rtc_ok=yes + AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H +#include +#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no]) + AC_MSG_RESULT($rtc_ok) + + dnl --------------------------------------------------------------------- +*/ + +/***************************************************************************/ + +#if TIME_WITH_SYS_TIME +# include +# include +#else +# if HAVE_SYS_TIME_H +# include +# else +# include +# endif +#endif + +#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \ +{ \ + return (double)t1 - (double)t0; \ +} + +/*----------------------------------------------------------------*/ +/* Solaris */ +#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) +typedef hrtime_t ticks; + +#define getticks gethrtime + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* AIX v. 4+ routines to read the real-time clock or time-base register */ +#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER) +typedef timebasestruct_t ticks; + +static __inline ticks getticks(void) +{ + ticks t; + read_real_time(&t, TIMEBASE_SZ); + return t; +} + +static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ +{ + time_base_to_time(&t1, TIMEBASE_SZ); + time_base_to_time(&t0, TIMEBASE_SZ); + return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + + ((double)t1.tb_low - (double)t0.tb_low)); +} + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PowerPC ``cycle'' counter using the time base register. + */ +#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned int tbl, tbu0, tbu1; + + do { + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); + __asm__ __volatile__ ("mftb %0" : "=r"(tbl)); + __asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); + } while (tbu0 != tbu1); + + return (((unsigned long long)tbu0) << 32) | tbl; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime, + from Carbon, requires no additional libraries to be linked). */ +#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER) +#include +typedef uint64_t ticks; +#define getticks mach_absolute_time +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * Pentium cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("rdtsc": "=A" (ret)); + /* no input, nothing else clobbered */ + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/* Visual C++ -- thanks to Morten Nissov for his help with this */ +#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER) +#include +typedef LARGE_INTEGER ticks; +#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ + +static __inline ticks getticks(void) +{ + ticks retval; + + __asm { + RDTSC + mov retval.HighPart, edx + mov retval.LowPart, eax + } + return retval; +} + +static __inline double elapsed(ticks t1, ticks t0) +{ + return (double)t1.QuadPart - (double)t0.QuadPart; +} + +#define HAVE_TICK_COUNTER +#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ +#endif + +/*----------------------------------------------------------------*/ +/* + * X86-64 cycle counter + */ +#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + unsigned a, d; + asm volatile("rdtsc" : "=a" (a), "=d" (d)); + return ((ticks)a) | (((ticks)d) << 32); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori. + NOTE: this code will fail to link unless you use the -Masmkeyword compiler + option (grrr). */ +#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; +static ticks getticks(void) +{ + asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); +} +INLINE_ELAPSED(__inline__) +#define HAVE_TICK_COUNTER +#endif + +/* Visual C++, courtesy of Dirk Michaelis */ +#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER) + +#include +#pragma intrinsic(__rdtsc) +typedef unsigned __int64 ticks; +#define getticks __rdtsc +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * IA64 cycle counter + */ + +/* intel's icc/ecc compiler */ +#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; +#include + +static __inline__ ticks getticks(void) +{ + return __getReg(_IA64_REG_AR_ITC); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* gcc */ +#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */ +#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER) +#include +typedef unsigned long ticks; + +static inline ticks getticks(void) +{ + ticks ret; + + ret = _Asm_mov_from_ar (_AREG_ITC); + return ret; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/* Microsoft Visual C++ */ +#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER) +typedef unsigned __int64 ticks; + +# ifdef __cplusplus +extern "C" +# endif +ticks __getReg(int whichReg); +#pragma intrinsic(__getReg) + +static __inline ticks getticks(void) +{ + volatile ticks temp; + temp = __getReg(3116); + return temp; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* + * PA-RISC cycle counter + */ +#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +# ifdef __GNUC__ +static __inline__ ticks getticks(void) +{ + ticks ret; + + __asm__ __volatile__("mfctl 16, %0": "=r" (ret)); + /* no input, nothing else clobbered */ + return ret; +} +# else +# include +static inline unsigned long getticks(void) +{ + register ticks ret; + _MFCTL(16, ret); + return ret; +} +# endif + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* S390, courtesy of James Treacy */ +#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long long ticks; + +static __inline__ ticks getticks(void) +{ + ticks cycles; + __asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc"); + return cycles; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) +/* + * The 32-bit cycle counter on alpha overflows pretty quickly, + * unfortunately. A 1GHz machine overflows in 4 seconds. + */ +typedef unsigned int ticks; + +static __inline__ ticks getticks(void) +{ + unsigned long cc; + __asm__ __volatile__ ("rpcc %0" : "=r"(cc)); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) +typedef unsigned long ticks; + +static __inline__ ticks getticks(void) +{ + ticks ret; + __asm__ __volatile__("rd %%tick, %0" : "=r" (ret)); + return ret; +} + +INLINE_ELAPSED(__inline__) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) +# include +typedef unsigned int ticks; + +static __inline ticks getticks(void) +{ + unsigned long cc; + cc = asm("rpcc %v0"); + return (cc & 0xFFFFFFFF); +} + +INLINE_ELAPSED(__inline) + +#define HAVE_TICK_COUNTER +#endif +/*----------------------------------------------------------------*/ +/* SGI/Irix */ +#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) +typedef struct timespec ticks; + +static inline ticks getticks(void) +{ + struct timespec t; + clock_gettime(CLOCK_SGI_CYCLE, &t); + return t; +} + +static inline double elapsed(ticks t1, ticks t0) +{ + return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 + + ((double)t1.tv_nsec - (double)t0.tv_nsec); +} +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* Cray UNICOS _rtc() intrinsic function */ +#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) +#ifdef HAVE_INTRINSICS_H +# include +#endif + +typedef long long ticks; + +#define getticks _rtc + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif + +/*----------------------------------------------------------------*/ +/* MIPS ZBus */ +#if HAVE_MIPS_ZBUS_TIMER +#if defined(__mips__) && !defined(HAVE_TICK_COUNTER) +#include +#include +#include + +typedef uint64_t ticks; + +static inline ticks getticks(void) +{ + static uint64_t* addr = 0; + + if (addr == 0) + { + uint32_t rq_addr = 0x10030000; + int fd; + int pgsize; + + pgsize = getpagesize(); + fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0); + if (fd < 0) { + perror("open"); + return NULL; + } + addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr); + close(fd); + if (addr == (uint64_t *)-1) { + perror("mmap"); + return NULL; + } + } + + return *addr; +} + +INLINE_ELAPSED(inline) + +#define HAVE_TICK_COUNTER +#endif +#endif /* HAVE_MIPS_ZBUS_TIMER */ + diff --git a/fp.h b/fp.h new file mode 100644 index 0000000..6cfa064 --- /dev/null +++ b/fp.h @@ -0,0 +1,37 @@ +#ifndef FP_H +#define FP_H + +#include "u512.h" + +/* fp is in the Montgomery domain, so interpreting that + as an integer should never make sense. + enable compiler warnings when mixing up u512 and fp. */ +typedef struct fp { + u512 x; +} fp; + +extern const fp fp_0; +extern const fp fp_1; + +void fp_set(fp *x, uint64_t y); +void fp_cswap(fp *x, fp *y, bool c); + +void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */ +void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */ + +void fp_add2(fp *x, fp const *y); +void fp_sub2(fp *x, fp const *y); +void fp_mul2(fp *x, fp const *y); + +void fp_add3(fp *x, fp const *y, fp const *z); +void fp_sub3(fp *x, fp const *y, fp const *z); +void fp_mul3(fp *x, fp const *y, fp const *z); + +void fp_sq1(fp *x); +void fp_sq2(fp *x, fp const *y); +void fp_inv(fp *x); +bool fp_issquare(fp const *x); + +void fp_random(fp *x); + +#endif diff --git a/fp.s b/fp.s new file mode 100644 index 0000000..ac65d94 --- /dev/null +++ b/fp.s @@ -0,0 +1,452 @@ + +.intel_syntax noprefix + +.section .rodata + +.set pbits, 511 +p: + .quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 + .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf + + +.global fp_0 +fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0 + +.global fp_1 +fp_1: /* 2^512 mod p */ + .quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1 + .quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80 + + +/* (2^512)^2 mod p */ +.r_squared_mod_p: + .quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1 + .quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e + +/* -p^-1 mod 2^64 */ +.inv_min_p_mod_r: + .quad 0x66c1301f632e294d + + +.section .text + +.global fp_copy +fp_copy: + cld + mov rcx, 8 + rep movsq + ret + +.global fp_set +fp_set: + push rdi + call u512_set + pop rdi + mov rsi, rdi + jmp fp_enc + +.global fp_cswap +fp_cswap: + movzx rax, dl + neg rax + .set k, 0 + .rept 8 + mov rcx, [rdi + 8*k] + mov rdx, [rsi + 8*k] + + mov r8, rcx + xor r8, rdx + and r8, rax + + xor rcx, r8 + xor rdx, r8 + + mov [rdi + 8*k], rcx + mov [rsi + 8*k], rdx + + .set k, k+1 + .endr + ret + +.reduce_once: + push rbp + mov rbp, rdi + + mov rdi, [rbp + 0] + sub rdi, [rip + p + 0] + mov rsi, [rbp + 8] + sbb rsi, [rip + p + 8] + mov rdx, [rbp + 16] + sbb rdx, [rip + p + 16] + mov rcx, [rbp + 24] + sbb rcx, [rip + p + 24] + mov r8, [rbp + 32] + sbb r8, [rip + p + 32] + mov r9, [rbp + 40] + sbb r9, [rip + p + 40] + mov r10, [rbp + 48] + sbb r10, [rip + p + 48] + mov r11, [rbp + 56] + sbb r11, [rip + p + 56] + + setnc al + movzx rax, al + neg rax + +.macro cswap2, r, m + xor \r, \m + and \r, rax + xor \m, \r +.endm + + cswap2 rdi, [rbp + 0] + cswap2 rsi, [rbp + 8] + cswap2 rdx, [rbp + 16] + cswap2 rcx, [rbp + 24] + cswap2 r8, [rbp + 32] + cswap2 r9, [rbp + 40] + cswap2 r10, [rbp + 48] + cswap2 r11, [rbp + 56] + + pop rbp + ret + +.global fp_add3 +fp_add3: + push rdi + call u512_add3 + pop rdi + jmp .reduce_once + +.global fp_add2 +fp_add2: + mov rdx, rdi + jmp fp_add3 + +.global fp_sub3 +fp_sub3: + push rdi + call u512_sub3 + pop rdi + xor rsi, rsi + xor rdx, rdx + xor rcx, rcx + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + test rax, rax + cmovnz rax, [rip + p + 0] + cmovnz rsi, [rip + p + 8] + cmovnz rdx, [rip + p + 16] + cmovnz rcx, [rip + p + 24] + cmovnz r8, [rip + p + 32] + cmovnz r9, [rip + p + 40] + cmovnz r10, [rip + p + 48] + cmovnz r11, [rip + p + 56] + add [rdi + 0], rax + adc [rdi + 8], rsi + adc [rdi + 16], rdx + adc [rdi + 24], rcx + adc [rdi + 32], r8 + adc [rdi + 40], r9 + adc [rdi + 48], r10 + adc [rdi + 56], r11 + ret + +.global fp_sub2 +fp_sub2: + mov rdx, rdi + xchg rsi, rdx + jmp fp_sub3 + + +/* Montgomery arithmetic */ + +.global fp_enc +fp_enc: + lea rdx, [rip + .r_squared_mod_p] + jmp fp_mul3 + +.global fp_dec +fp_dec: + lea rdx, [rip + u512_1] + jmp fp_mul3 + +.global fp_mul3 +fp_mul3: + push rbp + push rbx + push r12 + push r13 + push r14 + push r15 + + push rdi + + mov rdi, rsi + mov rsi, rdx + + xor r8, r8 + xor r9, r9 + xor r10, r10 + xor r11, r11 + xor r12, r12 + xor r13, r13 + xor r14, r14 + xor r15, r15 + xor rbp, rbp + + /* flags are already cleared */ + +.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8 + + mov rdx, [rsi + 0] + mulx rcx, rdx, [rdi + 8*\k] + add rdx, \r0 + mulx rcx, rdx, [rip + .inv_min_p_mod_r] + + xor rax, rax /* clear flags */ + + mulx rbx, rax, [rip + p + 0] + adox \r0, rax + + mulx rcx, rax, [rip + p + 8] + adcx \r1, rbx + adox \r1, rax + + mulx rbx, rax, [rip + p + 16] + adcx \r2, rcx + adox \r2, rax + + mulx rcx, rax, [rip + p + 24] + adcx \r3, rbx + adox \r3, rax + + mulx rbx, rax, [rip + p + 32] + adcx \r4, rcx + adox \r4, rax + + mulx rcx, rax, [rip + p + 40] + adcx \r5, rbx + adox \r5, rax + + mulx rbx, rax, [rip + p + 48] + adcx \r6, rcx + adox \r6, rax + + mulx rcx, rax, [rip + p + 56] + adcx \r7, rbx + adox \r7, rax + + mov rax, 0 + adcx \r8, rcx + adox \r8, rax + + + mov rdx, [rdi + 8*\k] + + xor rax, rax /* clear flags */ + + mulx rbx, rax, [rsi + 0] + adox \r0, rax + + mulx rcx, rax, [rsi + 8] + adcx \r1, rbx + adox \r1, rax + + mulx rbx, rax, [rsi + 16] + adcx \r2, rcx + adox \r2, rax + + mulx rcx, rax, [rsi + 24] + adcx \r3, rbx + adox \r3, rax + + mulx rbx, rax, [rsi + 32] + adcx \r4, rcx + adox \r4, rax + + mulx rcx, rax, [rsi + 40] + adcx \r5, rbx + adox \r5, rax + + mulx rbx, rax, [rsi + 48] + adcx \r6, rcx + adox \r6, rax + + mulx rcx, rax, [rsi + 56] + adcx \r7, rbx + adox \r7, rax + + mov rax, 0 + adcx \r8, rcx + adox \r8, rax + +.endm + + MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp + MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8 + MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9 + MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10 + MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11 + MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12 + MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13 + MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14 + + pop rdi + + mov [rdi + 0], rbp + mov [rdi + 8], r8 + mov [rdi + 16], r9 + mov [rdi + 24], r10 + mov [rdi + 32], r11 + mov [rdi + 40], r12 + mov [rdi + 48], r13 + mov [rdi + 56], r14 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + pop rbp + jmp .reduce_once + +.global fp_mul2 +fp_mul2: + mov rdx, rdi + jmp fp_mul3 + +.global fp_sq2 +fp_sq2: + /* TODO implement optimized Montgomery squaring */ + mov rdx, rsi + jmp fp_mul3 + +.global fp_sq1 +fp_sq1: + mov rsi, rdi + jmp fp_sq2 + +/* (obviously) not constant time in the exponent! */ +.fp_pow: + push rbx + mov rbx, rsi + push r12 + push r13 + push rdi + sub rsp, 64 + + mov rsi, rdi + mov rdi, rsp + call fp_copy + + mov rdi, [rsp + 64] + lea rsi, [rip + fp_1] + call fp_copy + +.macro POWSTEP, k + mov r13, [rbx + 8*\k] + xor r12, r12 + + 0: + test r13, 1 + jz 1f + + mov rdi, [rsp + 64] + mov rsi, rsp + call fp_mul2 + + 1: + mov rdi, rsp + call fp_sq1 + + shr r13 + + inc r12 + test r12, 64 + jz 0b +.endm + + POWSTEP 0 + POWSTEP 1 + POWSTEP 2 + POWSTEP 3 + POWSTEP 4 + POWSTEP 5 + POWSTEP 6 + POWSTEP 7 + + add rsp, 64+8 + pop r13 + pop r12 + pop rbx + ret + +.section .rodata +.p_minus_2: + .quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 + .quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf + +.section .text + +/* TODO use a better addition chain? */ +.global fp_inv +fp_inv: + lea rsi, [rip + .p_minus_2] + jmp .fp_pow + +.section .rodata +.p_minus_1_halves: + .quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83 + .quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df + +.section .text + +/* TODO use a better addition chain? */ +.global fp_issquare +fp_issquare: + push rdi + lea rsi, [rip + .p_minus_1_halves] + call .fp_pow + pop rdi + + xor rax, rax + .set k, 0 + .rept 8 + mov rsi, [rdi + 8*k] + xor rsi, [rip + fp_1 + 8*k] + or rax, rsi + .set k, k+1 + .endr + test rax, rax + setz al + movzx rax, al + ret + + +/* not constant time (but this shouldn't leak anything of importance) */ +.global fp_random +fp_random: + + push rdi + mov rsi, 64 + call randombytes + pop rdi + mov rax, 1 + shl rax, (pbits % 64) + dec rax + and [rdi + 56], rax + + .set k, 7 + .rept 8 + mov rax, [rip + p + 8*k] + cmp [rdi + 8*k], rax + jge fp_random + jl 0f + .set k, k-1 + .endr + 0: + ret + diff --git a/main.c b/main.c new file mode 100644 index 0000000..6649bfc --- /dev/null +++ b/main.c @@ -0,0 +1,99 @@ + +#include +#include +#include +#include +#include +#include + +#include "u512.h" +#include "fp.h" +#include "mont.h" +#include "csidh.h" + +void u512_print(u512 const *x) +{ + for (size_t i = 63; i < 64; --i) + printf("%02hhx", i[(unsigned char *) x->c]); +} + +void fp_print(fp const *x) +{ + u512 y; + fp_dec(&y, x); + u512_print(&y); +} + +int main() +{ + clock_t t0, t1; + + private_key priv_alice, priv_bob; + public_key pub_alice, pub_bob; + public_key shared_alice, shared_bob; + + printf("\n"); + + + t0 = clock(); + csidh_private(&priv_alice); + t1 = clock(); + + printf("Alice's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + for (size_t i = 0; i < sizeof(priv_alice); ++i) + printf("%02hhx", i[(uint8_t *) &priv_alice]); + printf("\n\n"); + + t0 = clock(); + csidh_private(&priv_bob); + t1 = clock(); + + printf("Bob's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + for (size_t i = 0; i < sizeof(priv_bob); ++i) + printf("%02hhx", i[(uint8_t *) &priv_bob]); + printf("\n\n"); + + + t0 = clock(); + assert(csidh(&pub_alice, &base, &priv_alice)); + t1 = clock(); + + printf("Alice's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&pub_alice.A); + printf("\n\n"); + + t0 = clock(); + assert(csidh(&pub_bob, &base, &priv_bob)); + t1 = clock(); + + printf("Bob's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&pub_bob.A); + printf("\n\n"); + + + t0 = clock(); + assert(csidh(&shared_alice, &pub_bob, &priv_alice)); + t1 = clock(); + + printf("Alice's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&shared_alice.A); + printf("\n\n"); + + t0 = clock(); + assert(csidh(&shared_bob, &pub_alice, &priv_bob)); + t1 = clock(); + + printf("Bob's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); + fp_print(&shared_bob.A); + printf("\n\n"); + + printf(" "); + if (memcmp(&shared_alice, &shared_bob, sizeof(public_key))) + printf("\x1b[31mNOT EQUAL!\x1b[0m\n"); + else + printf("\x1b[32mequal.\x1b[0m\n"); + printf("\n"); + + printf("\n"); +} + diff --git a/mont.c b/mont.c new file mode 100644 index 0000000..6950969 --- /dev/null +++ b/mont.c @@ -0,0 +1,203 @@ + +#include + +#include "mont.h" +#include "u512.h" + +void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24) +{ + fp tmp0, tmp1, tmp2; //requires precomputation of A24=(A+2C:4C) + + fp_add3(&tmp0, &P->x, &P->z); + fp_sub3(&tmp1, &P->x, &P->z); + fp_sq2(&R->x, &tmp0); + fp_sub3(&tmp2, &Q->x, &Q->z); + fp_add3(&S->x, &Q->x, &Q->z); + fp_mul2(&tmp0, &tmp2); + fp_sq2(&R->z, &tmp1); + fp_mul2(&tmp1, &S->x); + fp_sub3(&tmp2, &R->x, &R->z); + fp_mul2(&R->z, &A24->z); + fp_mul2(&R->x, &R->z); + fp_mul3(&S->x, &A24->x, &tmp2); + fp_sub3(&S->z, &tmp0, &tmp1); + fp_add2(&R->z, &S->x); + fp_add3(&S->x, &tmp0, &tmp1); + fp_mul2(&R->z, &tmp2); + fp_sq1(&S->z); + fp_sq1(&S->x); + fp_mul2(&S->z, &PQ->x); + fp_mul2(&S->x, &PQ->z); +} + +void xDBL(proj *Q, proj const *A, proj const *P) +{ + fp a, b, c; + fp_add3(&a, &P->x, &P->z); + fp_sq1(&a); + fp_sub3(&b, &P->x, &P->z); + fp_sq1(&b); + fp_sub3(&c, &a, &b); + fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */ + fp_mul2(&b, &A->z); + fp_mul3(&Q->x, &a, &b); + fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */ + fp_add2(&a, &A->x); + fp_mul2(&a, &c); + fp_add2(&a, &b); + fp_mul3(&Q->z, &a, &c); +} + +void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ) +{ + fp a, b, c, d; + fp_add3(&a, &P->x, &P->z); + fp_sub3(&b, &P->x, &P->z); + fp_add3(&c, &Q->x, &Q->z); + fp_sub3(&d, &Q->x, &Q->z); + fp_mul2(&a, &d); + fp_mul2(&b, &c); + fp_add3(&c, &a, &b); + fp_sub3(&d, &a, &b); + fp_sq1(&c); + fp_sq1(&d); + fp_mul3(&S->x, &PQ->z, &c); + fp_mul3(&S->z, &PQ->x, &d); +} + +/* Montgomery ladder. */ +/* P must not be the unique point of order 2. */ +/* not constant-time! */ +void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k) +{ + proj R = *P; + proj A24; + const proj Pcopy = *P; /* in case Q = P */ + + Q->x = fp_1; + Q->z = fp_0; + + fp_add3(&A24.x, &A->z, &A->z); //precomputation of A24=(A+2C:4C) + fp_add3(&A24.z, &A24.x, &A24.x); + fp_add2(&A24.x, &A->x); + + unsigned long i = 512; + while (--i && !u512_bit(k, i)); + + do { + + bool bit = u512_bit(k, i); + + if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ + //fp_cswap(&Q->x, &R.x, bit); + //fp_cswap(&Q->z, &R.z, bit); + + xDBLADD(Q, &R, Q, &R, &Pcopy, &A24); + + if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ + //fp_cswap(&Q->x, &R.x, bit); + //fp_cswap(&Q->z, &R.z, bit); + + } while (i--); +} + +//simultaneous square-and-multiply, computes x^exp and y^exp +void exp_by_squaring_(fp* x, fp* y, uint64_t exp) +{ + fp result1, result2; + fp_set(&result1, 1); + fp_set(&result2, 1); + + while (exp) + { + if (exp & 1){ + fp_mul2(&result1, x); + fp_mul2(&result2, y); + } + + fp_sq1(x); + fp_sq1(y); + exp >>= 1; + } + + fp_cswap(&result1, x, 1); + fp_cswap(&result2, y, 1); + +} + + +/* computes the isogeny with kernel point K of order k */ +/* returns the new curve coefficient A and the image of P */ +/* (obviously) not constant time in k */ +void xISOG(proj *A, proj *P, proj const *K, uint64_t k) +{ + assert (k >= 3); + assert (k % 2 == 1); + + fp tmp0, tmp1, tmp2, Psum, Pdif; + proj Q, Aed, prod; + + fp_add3(&Aed.z, &A->z, &A->z); //compute twisted Edwards curve coefficients + fp_add3(&Aed.x, &A->x, &Aed.z); + fp_sub3(&Aed.z, &A->x, &Aed.z); + + fp_add3(&Psum, &P->x, &P->z); //precomputations + fp_sub3(&Pdif, &P->x, &P->z); + + fp_sub3(&prod.x, &K->x, &K->z); + fp_add3(&prod.z, &K->x, &K->z); + + fp_mul3(&tmp1, &prod.x, &Psum); + fp_mul3(&tmp0, &prod.z, &Pdif); + fp_add3(&Q.x, &tmp0, &tmp1); + fp_sub3(&Q.z, &tmp0, &tmp1); + + proj M[3] = {*K}; + xDBL(&M[1], A, K); + + for (uint64_t i = 1; i < k / 2; ++i) { + + if (i >= 2) + xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]); + + fp_sub3(&tmp1, &M[i % 3].x, &M[i % 3].z); + fp_add3(&tmp0, &M[i % 3].x, &M[i % 3].z); + fp_mul2(&prod.x, &tmp1); + fp_mul2(&prod.z, &tmp0); + fp_mul2(&tmp1, &Psum); + fp_mul2(&tmp0, &Pdif); + fp_add3(&tmp2, &tmp0, &tmp1); + fp_mul2(&Q.x, &tmp2); + fp_sub3(&tmp2, &tmp0, &tmp1); + fp_mul2(&Q.z, &tmp2); + + } + + + // point evaluation + fp_sq1(&Q.x); + fp_sq1(&Q.z); + fp_mul2(&P->x, &Q.x); + fp_mul2(&P->z, &Q.z); + + //compute Aed.x^k, Aed.z^k + exp_by_squaring_(&Aed.x, &Aed.z, k); + + //compute prod.x^8, prod.z^8 + fp_sq1(&prod.x); + fp_sq1(&prod.x); + fp_sq1(&prod.x); + fp_sq1(&prod.z); + fp_sq1(&prod.z); + fp_sq1(&prod.z); + + //compute image curve parameters + fp_mul2(&Aed.z, &prod.x); + fp_mul2(&Aed.x, &prod.z); + + //compute Montgomery params + fp_add3(&A->x, &Aed.x, &Aed.z); + fp_sub3(&A->z, &Aed.x, &Aed.z); + fp_add2(&A->x, &A->x); +} + diff --git a/mont.h b/mont.h new file mode 100644 index 0000000..36b3ad7 --- /dev/null +++ b/mont.h @@ -0,0 +1,19 @@ +#ifndef MONT_H +#define MONT_H + +#include "u512.h" +#include "fp.h" + +/* P^1 over fp. */ +typedef struct proj { + fp x; + fp z; +} proj; + +void xDBL(proj *Q, proj const *A, proj const *P); +void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ); +void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A); +void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k); +void xISOG(proj *A, proj *P, proj const *K, uint64_t k); + +#endif diff --git a/rng.c b/rng.c new file mode 100644 index 0000000..fc28c87 --- /dev/null +++ b/rng.c @@ -0,0 +1,18 @@ + +#include "rng.h" + +#include +#include +#include + +void randombytes(void *x, size_t l) +{ + static int fd = -1; + ssize_t n; + if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY))) + exit(1); + for (size_t i = 0; i < l; i += n) + if (0 >= (n = read(fd, (char *) x + i, l - i))) + exit(2); +} + diff --git a/rng.h b/rng.h new file mode 100644 index 0000000..30e01d0 --- /dev/null +++ b/rng.h @@ -0,0 +1,8 @@ +#ifndef RNG_H +#define RNG_H + +#include + +void randombytes(void *x, size_t l); + +#endif diff --git a/u512.h b/u512.h new file mode 100644 index 0000000..6eaae93 --- /dev/null +++ b/u512.h @@ -0,0 +1,22 @@ +#ifndef UINT_H +#define UINT_H + +#include +#include + +typedef struct u512 { + uint64_t c[8]; +} u512; + +extern const u512 u512_1; + +void u512_set(u512 *x, uint64_t y); + +bool u512_bit(u512 const *x, uint64_t k); + +bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */ +bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */ + +void u512_mul3_64(u512 *x, u512 const *y, uint64_t z); + +#endif diff --git a/u512.s b/u512.s new file mode 100644 index 0000000..779be3e --- /dev/null +++ b/u512.s @@ -0,0 +1,102 @@ + +.intel_syntax noprefix + +.section .rodata + +.global u512_1 +u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0 + + +.section .text + +.global u512_set +u512_set: + cld + mov rax, rsi + stosq + xor rax, rax + mov rcx, 7 + rep stosq + ret + + +.global u512_bit +u512_bit: + mov rcx, rsi + and rcx, 0x3f + shr rsi, 6 + mov rax, [rdi + 8*rsi] + shr rax, cl + and rax, 1 + ret + + +.global u512_add3 +u512_add3: + mov rax, [rsi + 0] + add rax, [rdx + 0] + mov [rdi + 0], rax + .set k, 1 + .rept 7 + mov rax, [rsi + 8*k] + adc rax, [rdx + 8*k] + mov [rdi + 8*k], rax + .set k, k+1 + .endr + setc al + movzx rax, al + ret + +.global u512_sub3 +u512_sub3: + mov rax, [rsi + 0] + sub rax, [rdx + 0] + mov [rdi + 0], rax + .set k, 1 + .rept 7 + mov rax, [rsi + 8*k] + sbb rax, [rdx + 8*k] + mov [rdi + 8*k], rax + .set k, k+1 + .endr + setc al + movzx rax, al + ret + + +.global u512_mul3_64 +u512_mul3_64: + + mulx r10, rax, [rsi + 0] + mov [rdi + 0], rax + + mulx r11, rax, [rsi + 8] + add rax, r10 + mov [rdi + 8], rax + + mulx r10, rax, [rsi + 16] + adcx rax, r11 + mov [rdi + 16], rax + + mulx r11, rax, [rsi + 24] + adcx rax, r10 + mov [rdi + 24], rax + + mulx r10, rax, [rsi + 32] + adcx rax, r11 + mov [rdi + 32],rax + + mulx r11, rax, [rsi + 40] + adcx rax, r10 + mov [rdi + 40],rax + + mulx r10, rax, [rsi + 48] + adcx rax, r11 + mov [rdi + 48],rax + + mulx r11, rax, [rsi + 56] + adcx rax, r10 + mov [rdi + 56],rax + + ret +