@@ -0,0 +1,25 @@ | |||
all: | |||
@gcc \ | |||
-Wall -Wextra \ | |||
-O0 -funroll-loops \ | |||
rng.c \ | |||
u512.s fp.s \ | |||
mont.c \ | |||
csidh.c \ | |||
main.c \ | |||
-o main | |||
debug: | |||
gcc \ | |||
-Wall -Wextra \ | |||
-g \ | |||
rng.c \ | |||
u512.s fp.s \ | |||
mont.c \ | |||
csidh.c \ | |||
main.c \ | |||
-o main | |||
clean: | |||
rm -f main | |||
@@ -0,0 +1,54 @@ | |||
#include <stdlib.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
#include <time.h> | |||
#include <assert.h> | |||
#include "u512.h" | |||
#include "fp.h" | |||
#include "mont.h" | |||
#include "csidh.h" | |||
#include <inttypes.h> | |||
static __inline__ uint64_t rdtsc(void) | |||
{ | |||
uint32_t hi, lo; | |||
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); | |||
return lo | (uint64_t) hi << 32; | |||
} | |||
unsigned long its = 10000; | |||
int main() | |||
{ | |||
clock_t t0, t1, time = 0; | |||
uint64_t c0, c1, cycles = 0; | |||
private_key priv; | |||
public_key pub = base; | |||
for (unsigned long i = 0; i < its; ++i) { | |||
csidh_private(&priv); | |||
t0 = clock(); | |||
c0 = rdtsc(); | |||
/**************************************/ | |||
assert(validate(&pub)); | |||
action(&pub, &pub, &priv); | |||
/**************************************/ | |||
c1 = rdtsc(); | |||
t1 = clock(); | |||
cycles += c1 - c0; | |||
time += t1 - t0; | |||
} | |||
printf("iterations: %lu\n", its); | |||
printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its); | |||
printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its); | |||
} | |||
@@ -0,0 +1,220 @@ | |||
#include <string.h> | |||
#include <assert.h> | |||
#include "csidh.h" | |||
#include "rng.h" | |||
/* specific to p, should perhaps be somewhere else */ | |||
const unsigned primes[num_primes] = { | |||
3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, | |||
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137, | |||
139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227, | |||
229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313, | |||
317, 331, 337, 347, 349, 353, 359, 367, 373, 587, | |||
}; | |||
const u512 four_sqrt_p = {{ | |||
0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964, | |||
}}; | |||
const public_key base = {0}; /* A = 0 */ | |||
void csidh_private(private_key *priv) | |||
{ | |||
memset(&priv->e, 0, sizeof(priv->e)); | |||
for (size_t i = 0; i < num_primes; ) { | |||
int8_t buf[64]; | |||
randombytes(buf, sizeof(buf)); | |||
for (size_t j = 0; j < sizeof(buf); ++j) { | |||
if (buf[j] <= max_exponent && buf[j] >= -max_exponent) { | |||
priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4; | |||
if (++i >= num_primes) | |||
break; | |||
} | |||
} | |||
} | |||
} | |||
/* compute [(p+1)/l] P for all l in our list of primes. */ | |||
/* divide and conquer is much faster than doing it naively, | |||
* but uses more memory. */ | |||
static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper) | |||
{ | |||
assert(lower < upper); | |||
if (upper - lower == 1) | |||
return; | |||
size_t mid = lower + (upper - lower + 1) / 2; | |||
u512 cl = u512_1, cu = u512_1; | |||
for (size_t i = lower; i < mid; ++i) | |||
u512_mul3_64(&cu, &cu, primes[i]); | |||
for (size_t i = mid; i < upper; ++i) | |||
u512_mul3_64(&cl, &cl, primes[i]); | |||
xMUL(&P[mid], A, &P[lower], &cu); | |||
xMUL(&P[lower], A, &P[lower], &cl); | |||
cofactor_multiples(P, A, lower, mid); | |||
cofactor_multiples(P, A, mid, upper); | |||
} | |||
/* never accepts invalid keys. */ | |||
bool validate(public_key const *in) | |||
{ | |||
const proj A = {in->A, fp_1}; | |||
do { | |||
proj P[num_primes]; | |||
fp_random(&P->x); | |||
P->z = fp_1; | |||
/* maximal 2-power in p+1 */ | |||
xDBL(P, &A, P); | |||
xDBL(P, &A, P); | |||
cofactor_multiples(P, &A, 0, num_primes); | |||
u512 order = u512_1; | |||
for (size_t i = num_primes - 1; i < num_primes; --i) { | |||
/* we only gain information if [(p+1)/l] P is non-zero */ | |||
if (memcmp(&P[i].z, &fp_0, sizeof(fp))) { | |||
u512 tmp; | |||
u512_set(&tmp, primes[i]); | |||
xMUL(&P[i], &A, &P[i], &tmp); | |||
if (memcmp(&P[i].z, &fp_0, sizeof(fp))) | |||
/* P does not have order dividing p+1. */ | |||
return false; | |||
u512_mul3_64(&order, &order, primes[i]); | |||
if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */ | |||
/* order > 4 sqrt(p), hence definitely supersingular */ | |||
return true; | |||
} | |||
} | |||
/* P didn't have big enough order to prove supersingularity. */ | |||
} while (1); | |||
} | |||
/* compute x^3 + Ax^2 + x */ | |||
static void montgomery_rhs(fp *rhs, fp const *A, fp const *x) | |||
{ | |||
fp tmp; | |||
*rhs = *x; | |||
fp_sq1(rhs); | |||
fp_mul3(&tmp, A, x); | |||
fp_add2(rhs, &tmp); | |||
fp_add2(rhs, &fp_1); | |||
fp_mul2(rhs, x); | |||
} | |||
/* totally not constant-time. */ | |||
void action(public_key *out, public_key const *in, private_key const *priv) | |||
{ | |||
u512 k[2]; | |||
u512_set(&k[0], 4); /* maximal 2-power in p+1 */ | |||
u512_set(&k[1], 4); /* maximal 2-power in p+1 */ | |||
uint8_t e[2][num_primes]; | |||
for (size_t i = 0; i < num_primes; ++i) { | |||
int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4; | |||
if (t > 0) { | |||
e[0][i] = t; | |||
e[1][i] = 0; | |||
u512_mul3_64(&k[1], &k[1], primes[i]); | |||
} | |||
else if (t < 0) { | |||
e[1][i] = -t; | |||
e[0][i] = 0; | |||
u512_mul3_64(&k[0], &k[0], primes[i]); | |||
} | |||
else { | |||
e[0][i] = 0; | |||
e[1][i] = 0; | |||
u512_mul3_64(&k[0], &k[0], primes[i]); | |||
u512_mul3_64(&k[1], &k[1], primes[i]); | |||
} | |||
} | |||
proj A = {in->A, fp_1}; | |||
bool done[2] = {false, false}; | |||
do { | |||
assert(!memcmp(&A.z, &fp_1, sizeof(fp))); | |||
proj P; | |||
fp_random(&P.x); | |||
P.z = fp_1; | |||
fp rhs; | |||
montgomery_rhs(&rhs, &A.x, &P.x); | |||
bool sign = !fp_issquare(&rhs); | |||
if (done[sign]) | |||
continue; | |||
xMUL(&P, &A, &P, &k[sign]); | |||
done[sign] = true; | |||
for (size_t i = num_primes-1; i < num_primes; --i) { //changed loop direction | |||
if (e[sign][i]) { | |||
u512 cof = u512_1; | |||
for (size_t j = i - 1; j < num_primes; --j) //changed loop direction | |||
if (e[sign][j]) | |||
u512_mul3_64(&cof, &cof, primes[j]); | |||
proj K; | |||
xMUL(&K, &A, &P, &cof); | |||
if (memcmp(&K.z, &fp_0, sizeof(fp))) { | |||
xISOG(&A, &P, &K, primes[i]); | |||
if (!--e[sign][i]) | |||
u512_mul3_64(&k[sign], &k[sign], primes[i]); | |||
} | |||
} | |||
done[sign] &= !e[sign][i]; | |||
} | |||
fp_inv(&A.z); | |||
fp_mul2(&A.x, &A.z); | |||
A.z = fp_1; | |||
} while (!(done[0] && done[1])); | |||
out->A = A.x; | |||
} | |||
/* includes public-key validation. */ | |||
bool csidh(public_key *out, public_key const *in, private_key const *priv) | |||
{ | |||
if (!validate(in)) { | |||
fp_random(&out->A); | |||
return false; | |||
} | |||
action(out, in, priv); | |||
return true; | |||
} | |||
@@ -0,0 +1,26 @@ | |||
#ifndef CSIDH_H | |||
#define CSIDH_H | |||
#include "u512.h" | |||
#include "fp.h" | |||
#include "mont.h" | |||
/* specific to p, should perhaps be somewhere else */ | |||
#define num_primes 74 | |||
#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */ | |||
typedef struct private_key { | |||
int8_t e[(num_primes + 1) / 2]; /* packed int4_t */ | |||
} private_key; | |||
typedef struct public_key { | |||
fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */ | |||
} public_key; | |||
extern const public_key base; | |||
void csidh_private(private_key *priv); | |||
bool csidh(public_key *out, public_key const *in, private_key const *priv); | |||
#endif |
@@ -0,0 +1,514 @@ | |||
/* | |||
* Copyright (c) 2003, 2007-8 Matteo Frigo | |||
* Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology | |||
* | |||
* Permission is hereby granted, free of charge, to any person obtaining | |||
* a copy of this software and associated documentation files (the | |||
* "Software"), to deal in the Software without restriction, including | |||
* without limitation the rights to use, copy, modify, merge, publish, | |||
* distribute, sublicense, and/or sell copies of the Software, and to | |||
* permit persons to whom the Software is furnished to do so, subject to | |||
* the following conditions: | |||
* | |||
* The above copyright notice and this permission notice shall be | |||
* included in all copies or substantial portions of the Software. | |||
* | |||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, | |||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF | |||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND | |||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE | |||
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION | |||
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION | |||
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. | |||
* | |||
*/ | |||
/* machine-dependent cycle counters code. Needs to be inlined. */ | |||
/***************************************************************************/ | |||
/* To use the cycle counters in your code, simply #include "cycle.h" (this | |||
file), and then use the functions/macros: | |||
ticks getticks(void); | |||
ticks is an opaque typedef defined below, representing the current time. | |||
You extract the elapsed time between two calls to gettick() via: | |||
double elapsed(ticks t1, ticks t0); | |||
which returns a double-precision variable in arbitrary units. You | |||
are not expected to convert this into human units like seconds; it | |||
is intended only for *comparisons* of time intervals. | |||
(In order to use some of the OS-dependent timer routines like | |||
Solaris' gethrtime, you need to paste the autoconf snippet below | |||
into your configure.ac file and #include "config.h" before cycle.h, | |||
or define the relevant macros manually if you are not using autoconf.) | |||
*/ | |||
/***************************************************************************/ | |||
/* This file uses macros like HAVE_GETHRTIME that are assumed to be | |||
defined according to whether the corresponding function/type/header | |||
is available on your system. The necessary macros are most | |||
conveniently defined if you are using GNU autoconf, via the tests: | |||
dnl --------------------------------------------------------------------- | |||
AC_C_INLINE | |||
AC_HEADER_TIME | |||
AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h]) | |||
AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H | |||
#include <sys/time.h> | |||
#endif]) | |||
AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time]) | |||
dnl Cray UNICOS _rtc() (real-time clock) intrinsic | |||
AC_MSG_CHECKING([for _rtc intrinsic]) | |||
rtc_ok=yes | |||
AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H | |||
#include <intrinsics.h> | |||
#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no]) | |||
AC_MSG_RESULT($rtc_ok) | |||
dnl --------------------------------------------------------------------- | |||
*/ | |||
/***************************************************************************/ | |||
#if TIME_WITH_SYS_TIME | |||
# include <sys/time.h> | |||
# include <time.h> | |||
#else | |||
# if HAVE_SYS_TIME_H | |||
# include <sys/time.h> | |||
# else | |||
# include <time.h> | |||
# endif | |||
#endif | |||
#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \ | |||
{ \ | |||
return (double)t1 - (double)t0; \ | |||
} | |||
/*----------------------------------------------------------------*/ | |||
/* Solaris */ | |||
#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER) | |||
typedef hrtime_t ticks; | |||
#define getticks gethrtime | |||
INLINE_ELAPSED(inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* AIX v. 4+ routines to read the real-time clock or time-base register */ | |||
#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER) | |||
typedef timebasestruct_t ticks; | |||
static __inline ticks getticks(void) | |||
{ | |||
ticks t; | |||
read_real_time(&t, TIMEBASE_SZ); | |||
return t; | |||
} | |||
static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */ | |||
{ | |||
time_base_to_time(&t1, TIMEBASE_SZ); | |||
time_base_to_time(&t0, TIMEBASE_SZ); | |||
return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 + | |||
((double)t1.tb_low - (double)t0.tb_low)); | |||
} | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* | |||
* PowerPC ``cycle'' counter using the time base register. | |||
*/ | |||
#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long long ticks; | |||
static __inline__ ticks getticks(void) | |||
{ | |||
unsigned int tbl, tbu0, tbu1; | |||
do { | |||
__asm__ __volatile__ ("mftbu %0" : "=r"(tbu0)); | |||
__asm__ __volatile__ ("mftb %0" : "=r"(tbl)); | |||
__asm__ __volatile__ ("mftbu %0" : "=r"(tbu1)); | |||
} while (tbu0 != tbu1); | |||
return (((unsigned long long)tbu0) << 32) | tbl; | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime, | |||
from Carbon, requires no additional libraries to be linked). */ | |||
#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER) | |||
#include <mach/mach_time.h> | |||
typedef uint64_t ticks; | |||
#define getticks mach_absolute_time | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* | |||
* Pentium cycle counter | |||
*/ | |||
#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long long ticks; | |||
static __inline__ ticks getticks(void) | |||
{ | |||
ticks ret; | |||
__asm__ __volatile__("rdtsc": "=A" (ret)); | |||
/* no input, nothing else clobbered */ | |||
return ret; | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ | |||
#endif | |||
/* Visual C++ -- thanks to Morten Nissov for his help with this */ | |||
#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER) | |||
#include <windows.h> | |||
typedef LARGE_INTEGER ticks; | |||
#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */ | |||
static __inline ticks getticks(void) | |||
{ | |||
ticks retval; | |||
__asm { | |||
RDTSC | |||
mov retval.HighPart, edx | |||
mov retval.LowPart, eax | |||
} | |||
return retval; | |||
} | |||
static __inline double elapsed(ticks t1, ticks t0) | |||
{ | |||
return (double)t1.QuadPart - (double)t0.QuadPart; | |||
} | |||
#define HAVE_TICK_COUNTER | |||
#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */ | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* | |||
* X86-64 cycle counter | |||
*/ | |||
#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long long ticks; | |||
static __inline__ ticks getticks(void) | |||
{ | |||
unsigned a, d; | |||
asm volatile("rdtsc" : "=a" (a), "=d" (d)); | |||
return ((ticks)a) | (((ticks)d) << 32); | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori. | |||
NOTE: this code will fail to link unless you use the -Masmkeyword compiler | |||
option (grrr). */ | |||
#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long long ticks; | |||
static ticks getticks(void) | |||
{ | |||
asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; "); | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/* Visual C++, courtesy of Dirk Michaelis */ | |||
#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER) | |||
#include <intrin.h> | |||
#pragma intrinsic(__rdtsc) | |||
typedef unsigned __int64 ticks; | |||
#define getticks __rdtsc | |||
INLINE_ELAPSED(__inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* | |||
* IA64 cycle counter | |||
*/ | |||
/* intel's icc/ecc compiler */ | |||
#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long ticks; | |||
#include <ia64intrin.h> | |||
static __inline__ ticks getticks(void) | |||
{ | |||
return __getReg(_IA64_REG_AR_ITC); | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/* gcc */ | |||
#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long ticks; | |||
static __inline__ ticks getticks(void) | |||
{ | |||
ticks ret; | |||
__asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret)); | |||
return ret; | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */ | |||
#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER) | |||
#include <machine/sys/inline.h> | |||
typedef unsigned long ticks; | |||
static inline ticks getticks(void) | |||
{ | |||
ticks ret; | |||
ret = _Asm_mov_from_ar (_AREG_ITC); | |||
return ret; | |||
} | |||
INLINE_ELAPSED(inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/* Microsoft Visual C++ */ | |||
#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned __int64 ticks; | |||
# ifdef __cplusplus | |||
extern "C" | |||
# endif | |||
ticks __getReg(int whichReg); | |||
#pragma intrinsic(__getReg) | |||
static __inline ticks getticks(void) | |||
{ | |||
volatile ticks temp; | |||
temp = __getReg(3116); | |||
return temp; | |||
} | |||
INLINE_ELAPSED(inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* | |||
* PA-RISC cycle counter | |||
*/ | |||
#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long ticks; | |||
# ifdef __GNUC__ | |||
static __inline__ ticks getticks(void) | |||
{ | |||
ticks ret; | |||
__asm__ __volatile__("mfctl 16, %0": "=r" (ret)); | |||
/* no input, nothing else clobbered */ | |||
return ret; | |||
} | |||
# else | |||
# include <machine/inline.h> | |||
static inline unsigned long getticks(void) | |||
{ | |||
register ticks ret; | |||
_MFCTL(16, ret); | |||
return ret; | |||
} | |||
# endif | |||
INLINE_ELAPSED(inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* S390, courtesy of James Treacy */ | |||
#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long long ticks; | |||
static __inline__ ticks getticks(void) | |||
{ | |||
ticks cycles; | |||
__asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc"); | |||
return cycles; | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER) | |||
/* | |||
* The 32-bit cycle counter on alpha overflows pretty quickly, | |||
* unfortunately. A 1GHz machine overflows in 4 seconds. | |||
*/ | |||
typedef unsigned int ticks; | |||
static __inline__ ticks getticks(void) | |||
{ | |||
unsigned long cc; | |||
__asm__ __volatile__ ("rpcc %0" : "=r"(cc)); | |||
return (cc & 0xFFFFFFFF); | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER) | |||
typedef unsigned long ticks; | |||
static __inline__ ticks getticks(void) | |||
{ | |||
ticks ret; | |||
__asm__ __volatile__("rd %%tick, %0" : "=r" (ret)); | |||
return ret; | |||
} | |||
INLINE_ELAPSED(__inline__) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER) | |||
# include <c_asm.h> | |||
typedef unsigned int ticks; | |||
static __inline ticks getticks(void) | |||
{ | |||
unsigned long cc; | |||
cc = asm("rpcc %v0"); | |||
return (cc & 0xFFFFFFFF); | |||
} | |||
INLINE_ELAPSED(__inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* SGI/Irix */ | |||
#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER) | |||
typedef struct timespec ticks; | |||
static inline ticks getticks(void) | |||
{ | |||
struct timespec t; | |||
clock_gettime(CLOCK_SGI_CYCLE, &t); | |||
return t; | |||
} | |||
static inline double elapsed(ticks t1, ticks t0) | |||
{ | |||
return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 + | |||
((double)t1.tv_nsec - (double)t0.tv_nsec); | |||
} | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* Cray UNICOS _rtc() intrinsic function */ | |||
#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER) | |||
#ifdef HAVE_INTRINSICS_H | |||
# include <intrinsics.h> | |||
#endif | |||
typedef long long ticks; | |||
#define getticks _rtc | |||
INLINE_ELAPSED(inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
/*----------------------------------------------------------------*/ | |||
/* MIPS ZBus */ | |||
#if HAVE_MIPS_ZBUS_TIMER | |||
#if defined(__mips__) && !defined(HAVE_TICK_COUNTER) | |||
#include <sys/mman.h> | |||
#include <unistd.h> | |||
#include <fcntl.h> | |||
typedef uint64_t ticks; | |||
static inline ticks getticks(void) | |||
{ | |||
static uint64_t* addr = 0; | |||
if (addr == 0) | |||
{ | |||
uint32_t rq_addr = 0x10030000; | |||
int fd; | |||
int pgsize; | |||
pgsize = getpagesize(); | |||
fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0); | |||
if (fd < 0) { | |||
perror("open"); | |||
return NULL; | |||
} | |||
addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr); | |||
close(fd); | |||
if (addr == (uint64_t *)-1) { | |||
perror("mmap"); | |||
return NULL; | |||
} | |||
} | |||
return *addr; | |||
} | |||
INLINE_ELAPSED(inline) | |||
#define HAVE_TICK_COUNTER | |||
#endif | |||
#endif /* HAVE_MIPS_ZBUS_TIMER */ | |||
@@ -0,0 +1,37 @@ | |||
#ifndef FP_H | |||
#define FP_H | |||
#include "u512.h" | |||
/* fp is in the Montgomery domain, so interpreting that | |||
as an integer should never make sense. | |||
enable compiler warnings when mixing up u512 and fp. */ | |||
typedef struct fp { | |||
u512 x; | |||
} fp; | |||
extern const fp fp_0; | |||
extern const fp fp_1; | |||
void fp_set(fp *x, uint64_t y); | |||
void fp_cswap(fp *x, fp *y, bool c); | |||
void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */ | |||
void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */ | |||
void fp_add2(fp *x, fp const *y); | |||
void fp_sub2(fp *x, fp const *y); | |||
void fp_mul2(fp *x, fp const *y); | |||
void fp_add3(fp *x, fp const *y, fp const *z); | |||
void fp_sub3(fp *x, fp const *y, fp const *z); | |||
void fp_mul3(fp *x, fp const *y, fp const *z); | |||
void fp_sq1(fp *x); | |||
void fp_sq2(fp *x, fp const *y); | |||
void fp_inv(fp *x); | |||
bool fp_issquare(fp const *x); | |||
void fp_random(fp *x); | |||
#endif |
@@ -0,0 +1,452 @@ | |||
.intel_syntax noprefix | |||
.section .rodata | |||
.set pbits, 511 | |||
p: | |||
.quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 | |||
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf | |||
.global fp_0 | |||
fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0 | |||
.global fp_1 | |||
fp_1: /* 2^512 mod p */ | |||
.quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1 | |||
.quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80 | |||
/* (2^512)^2 mod p */ | |||
.r_squared_mod_p: | |||
.quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1 | |||
.quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e | |||
/* -p^-1 mod 2^64 */ | |||
.inv_min_p_mod_r: | |||
.quad 0x66c1301f632e294d | |||
.section .text | |||
.global fp_copy | |||
fp_copy: | |||
cld | |||
mov rcx, 8 | |||
rep movsq | |||
ret | |||
.global fp_set | |||
fp_set: | |||
push rdi | |||
call u512_set | |||
pop rdi | |||
mov rsi, rdi | |||
jmp fp_enc | |||
.global fp_cswap | |||
fp_cswap: | |||
movzx rax, dl | |||
neg rax | |||
.set k, 0 | |||
.rept 8 | |||
mov rcx, [rdi + 8*k] | |||
mov rdx, [rsi + 8*k] | |||
mov r8, rcx | |||
xor r8, rdx | |||
and r8, rax | |||
xor rcx, r8 | |||
xor rdx, r8 | |||
mov [rdi + 8*k], rcx | |||
mov [rsi + 8*k], rdx | |||
.set k, k+1 | |||
.endr | |||
ret | |||
.reduce_once: | |||
push rbp | |||
mov rbp, rdi | |||
mov rdi, [rbp + 0] | |||
sub rdi, [rip + p + 0] | |||
mov rsi, [rbp + 8] | |||
sbb rsi, [rip + p + 8] | |||
mov rdx, [rbp + 16] | |||
sbb rdx, [rip + p + 16] | |||
mov rcx, [rbp + 24] | |||
sbb rcx, [rip + p + 24] | |||
mov r8, [rbp + 32] | |||
sbb r8, [rip + p + 32] | |||
mov r9, [rbp + 40] | |||
sbb r9, [rip + p + 40] | |||
mov r10, [rbp + 48] | |||
sbb r10, [rip + p + 48] | |||
mov r11, [rbp + 56] | |||
sbb r11, [rip + p + 56] | |||
setnc al | |||
movzx rax, al | |||
neg rax | |||
.macro cswap2, r, m | |||
xor \r, \m | |||
and \r, rax | |||
xor \m, \r | |||
.endm | |||
cswap2 rdi, [rbp + 0] | |||
cswap2 rsi, [rbp + 8] | |||
cswap2 rdx, [rbp + 16] | |||
cswap2 rcx, [rbp + 24] | |||
cswap2 r8, [rbp + 32] | |||
cswap2 r9, [rbp + 40] | |||
cswap2 r10, [rbp + 48] | |||
cswap2 r11, [rbp + 56] | |||
pop rbp | |||
ret | |||
.global fp_add3 | |||
fp_add3: | |||
push rdi | |||
call u512_add3 | |||
pop rdi | |||
jmp .reduce_once | |||
.global fp_add2 | |||
fp_add2: | |||
mov rdx, rdi | |||
jmp fp_add3 | |||
.global fp_sub3 | |||
fp_sub3: | |||
push rdi | |||
call u512_sub3 | |||
pop rdi | |||
xor rsi, rsi | |||
xor rdx, rdx | |||
xor rcx, rcx | |||
xor r8, r8 | |||
xor r9, r9 | |||
xor r10, r10 | |||
xor r11, r11 | |||
test rax, rax | |||
cmovnz rax, [rip + p + 0] | |||
cmovnz rsi, [rip + p + 8] | |||
cmovnz rdx, [rip + p + 16] | |||
cmovnz rcx, [rip + p + 24] | |||
cmovnz r8, [rip + p + 32] | |||
cmovnz r9, [rip + p + 40] | |||
cmovnz r10, [rip + p + 48] | |||
cmovnz r11, [rip + p + 56] | |||
add [rdi + 0], rax | |||
adc [rdi + 8], rsi | |||
adc [rdi + 16], rdx | |||
adc [rdi + 24], rcx | |||
adc [rdi + 32], r8 | |||
adc [rdi + 40], r9 | |||
adc [rdi + 48], r10 | |||
adc [rdi + 56], r11 | |||
ret | |||
.global fp_sub2 | |||
fp_sub2: | |||
mov rdx, rdi | |||
xchg rsi, rdx | |||
jmp fp_sub3 | |||
/* Montgomery arithmetic */ | |||
.global fp_enc | |||
fp_enc: | |||
lea rdx, [rip + .r_squared_mod_p] | |||
jmp fp_mul3 | |||
.global fp_dec | |||
fp_dec: | |||
lea rdx, [rip + u512_1] | |||
jmp fp_mul3 | |||
.global fp_mul3 | |||
fp_mul3: | |||
push rbp | |||
push rbx | |||
push r12 | |||
push r13 | |||
push r14 | |||
push r15 | |||
push rdi | |||
mov rdi, rsi | |||
mov rsi, rdx | |||
xor r8, r8 | |||
xor r9, r9 | |||
xor r10, r10 | |||
xor r11, r11 | |||
xor r12, r12 | |||
xor r13, r13 | |||
xor r14, r14 | |||
xor r15, r15 | |||
xor rbp, rbp | |||
/* flags are already cleared */ | |||
.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8 | |||
mov rdx, [rsi + 0] | |||
mulx rcx, rdx, [rdi + 8*\k] | |||
add rdx, \r0 | |||
mulx rcx, rdx, [rip + .inv_min_p_mod_r] | |||
xor rax, rax /* clear flags */ | |||
mulx rbx, rax, [rip + p + 0] | |||
adox \r0, rax | |||
mulx rcx, rax, [rip + p + 8] | |||
adcx \r1, rbx | |||
adox \r1, rax | |||
mulx rbx, rax, [rip + p + 16] | |||
adcx \r2, rcx | |||
adox \r2, rax | |||
mulx rcx, rax, [rip + p + 24] | |||
adcx \r3, rbx | |||
adox \r3, rax | |||
mulx rbx, rax, [rip + p + 32] | |||
adcx \r4, rcx | |||
adox \r4, rax | |||
mulx rcx, rax, [rip + p + 40] | |||
adcx \r5, rbx | |||
adox \r5, rax | |||
mulx rbx, rax, [rip + p + 48] | |||
adcx \r6, rcx | |||
adox \r6, rax | |||
mulx rcx, rax, [rip + p + 56] | |||
adcx \r7, rbx | |||
adox \r7, rax | |||
mov rax, 0 | |||
adcx \r8, rcx | |||
adox \r8, rax | |||
mov rdx, [rdi + 8*\k] | |||
xor rax, rax /* clear flags */ | |||
mulx rbx, rax, [rsi + 0] | |||
adox \r0, rax | |||
mulx rcx, rax, [rsi + 8] | |||
adcx \r1, rbx | |||
adox \r1, rax | |||
mulx rbx, rax, [rsi + 16] | |||
adcx \r2, rcx | |||
adox \r2, rax | |||
mulx rcx, rax, [rsi + 24] | |||
adcx \r3, rbx | |||
adox \r3, rax | |||
mulx rbx, rax, [rsi + 32] | |||
adcx \r4, rcx | |||
adox \r4, rax | |||
mulx rcx, rax, [rsi + 40] | |||
adcx \r5, rbx | |||
adox \r5, rax | |||
mulx rbx, rax, [rsi + 48] | |||
adcx \r6, rcx | |||
adox \r6, rax | |||
mulx rcx, rax, [rsi + 56] | |||
adcx \r7, rbx | |||
adox \r7, rax | |||
mov rax, 0 | |||
adcx \r8, rcx | |||
adox \r8, rax | |||
.endm | |||
MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp | |||
MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8 | |||
MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9 | |||
MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10 | |||
MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11 | |||
MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12 | |||
MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13 | |||
MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14 | |||
pop rdi | |||
mov [rdi + 0], rbp | |||
mov [rdi + 8], r8 | |||
mov [rdi + 16], r9 | |||
mov [rdi + 24], r10 | |||
mov [rdi + 32], r11 | |||
mov [rdi + 40], r12 | |||
mov [rdi + 48], r13 | |||
mov [rdi + 56], r14 | |||
pop r15 | |||
pop r14 | |||
pop r13 | |||
pop r12 | |||
pop rbx | |||
pop rbp | |||
jmp .reduce_once | |||
.global fp_mul2 | |||
fp_mul2: | |||
mov rdx, rdi | |||
jmp fp_mul3 | |||
.global fp_sq2 | |||
fp_sq2: | |||
/* TODO implement optimized Montgomery squaring */ | |||
mov rdx, rsi | |||
jmp fp_mul3 | |||
.global fp_sq1 | |||
fp_sq1: | |||
mov rsi, rdi | |||
jmp fp_sq2 | |||
/* (obviously) not constant time in the exponent! */ | |||
.fp_pow: | |||
push rbx | |||
mov rbx, rsi | |||
push r12 | |||
push r13 | |||
push rdi | |||
sub rsp, 64 | |||
mov rsi, rdi | |||
mov rdi, rsp | |||
call fp_copy | |||
mov rdi, [rsp + 64] | |||
lea rsi, [rip + fp_1] | |||
call fp_copy | |||
.macro POWSTEP, k | |||
mov r13, [rbx + 8*\k] | |||
xor r12, r12 | |||
0: | |||
test r13, 1 | |||
jz 1f | |||
mov rdi, [rsp + 64] | |||
mov rsi, rsp | |||
call fp_mul2 | |||
1: | |||
mov rdi, rsp | |||
call fp_sq1 | |||
shr r13 | |||
inc r12 | |||
test r12, 64 | |||
jz 0b | |||
.endm | |||
POWSTEP 0 | |||
POWSTEP 1 | |||
POWSTEP 2 | |||
POWSTEP 3 | |||
POWSTEP 4 | |||
POWSTEP 5 | |||
POWSTEP 6 | |||
POWSTEP 7 | |||
add rsp, 64+8 | |||
pop r13 | |||
pop r12 | |||
pop rbx | |||
ret | |||
.section .rodata | |||
.p_minus_2: | |||
.quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507 | |||
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf | |||
.section .text | |||
/* TODO use a better addition chain? */ | |||
.global fp_inv | |||
fp_inv: | |||
lea rsi, [rip + .p_minus_2] | |||
jmp .fp_pow | |||
.section .rodata | |||
.p_minus_1_halves: | |||
.quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83 | |||
.quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df | |||
.section .text | |||
/* TODO use a better addition chain? */ | |||
.global fp_issquare | |||
fp_issquare: | |||
push rdi | |||
lea rsi, [rip + .p_minus_1_halves] | |||
call .fp_pow | |||
pop rdi | |||
xor rax, rax | |||
.set k, 0 | |||
.rept 8 | |||
mov rsi, [rdi + 8*k] | |||
xor rsi, [rip + fp_1 + 8*k] | |||
or rax, rsi | |||
.set k, k+1 | |||
.endr | |||
test rax, rax | |||
setz al | |||
movzx rax, al | |||
ret | |||
/* not constant time (but this shouldn't leak anything of importance) */ | |||
.global fp_random | |||
fp_random: | |||
push rdi | |||
mov rsi, 64 | |||
call randombytes | |||
pop rdi | |||
mov rax, 1 | |||
shl rax, (pbits % 64) | |||
dec rax | |||
and [rdi + 56], rax | |||
.set k, 7 | |||
.rept 8 | |||
mov rax, [rip + p + 8*k] | |||
cmp [rdi + 8*k], rax | |||
jge fp_random | |||
jl 0f | |||
.set k, k-1 | |||
.endr | |||
0: | |||
ret | |||
@@ -0,0 +1,99 @@ | |||
#include <stdlib.h> | |||
#include <stdio.h> | |||
#include <string.h> | |||
#include <unistd.h> | |||
#include <time.h> | |||
#include <assert.h> | |||
#include "u512.h" | |||
#include "fp.h" | |||
#include "mont.h" | |||
#include "csidh.h" | |||
void u512_print(u512 const *x) | |||
{ | |||
for (size_t i = 63; i < 64; --i) | |||
printf("%02hhx", i[(unsigned char *) x->c]); | |||
} | |||
void fp_print(fp const *x) | |||
{ | |||
u512 y; | |||
fp_dec(&y, x); | |||
u512_print(&y); | |||
} | |||
int main() | |||
{ | |||
clock_t t0, t1; | |||
private_key priv_alice, priv_bob; | |||
public_key pub_alice, pub_bob; | |||
public_key shared_alice, shared_bob; | |||
printf("\n"); | |||
t0 = clock(); | |||
csidh_private(&priv_alice); | |||
t1 = clock(); | |||
printf("Alice's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
for (size_t i = 0; i < sizeof(priv_alice); ++i) | |||
printf("%02hhx", i[(uint8_t *) &priv_alice]); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
csidh_private(&priv_bob); | |||
t1 = clock(); | |||
printf("Bob's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
for (size_t i = 0; i < sizeof(priv_bob); ++i) | |||
printf("%02hhx", i[(uint8_t *) &priv_bob]); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&pub_alice, &base, &priv_alice)); | |||
t1 = clock(); | |||
printf("Alice's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&pub_alice.A); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&pub_bob, &base, &priv_bob)); | |||
t1 = clock(); | |||
printf("Bob's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&pub_bob.A); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&shared_alice, &pub_bob, &priv_alice)); | |||
t1 = clock(); | |||
printf("Alice's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&shared_alice.A); | |||
printf("\n\n"); | |||
t0 = clock(); | |||
assert(csidh(&shared_bob, &pub_alice, &priv_bob)); | |||
t1 = clock(); | |||
printf("Bob's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC); | |||
fp_print(&shared_bob.A); | |||
printf("\n\n"); | |||
printf(" "); | |||
if (memcmp(&shared_alice, &shared_bob, sizeof(public_key))) | |||
printf("\x1b[31mNOT EQUAL!\x1b[0m\n"); | |||
else | |||
printf("\x1b[32mequal.\x1b[0m\n"); | |||
printf("\n"); | |||
printf("\n"); | |||
} | |||
@@ -0,0 +1,203 @@ | |||
#include <assert.h> | |||
#include "mont.h" | |||
#include "u512.h" | |||
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24) | |||
{ | |||
fp tmp0, tmp1, tmp2; //requires precomputation of A24=(A+2C:4C) | |||
fp_add3(&tmp0, &P->x, &P->z); | |||
fp_sub3(&tmp1, &P->x, &P->z); | |||
fp_sq2(&R->x, &tmp0); | |||
fp_sub3(&tmp2, &Q->x, &Q->z); | |||
fp_add3(&S->x, &Q->x, &Q->z); | |||
fp_mul2(&tmp0, &tmp2); | |||
fp_sq2(&R->z, &tmp1); | |||
fp_mul2(&tmp1, &S->x); | |||
fp_sub3(&tmp2, &R->x, &R->z); | |||
fp_mul2(&R->z, &A24->z); | |||
fp_mul2(&R->x, &R->z); | |||
fp_mul3(&S->x, &A24->x, &tmp2); | |||
fp_sub3(&S->z, &tmp0, &tmp1); | |||
fp_add2(&R->z, &S->x); | |||
fp_add3(&S->x, &tmp0, &tmp1); | |||
fp_mul2(&R->z, &tmp2); | |||
fp_sq1(&S->z); | |||
fp_sq1(&S->x); | |||
fp_mul2(&S->z, &PQ->x); | |||
fp_mul2(&S->x, &PQ->z); | |||
} | |||
void xDBL(proj *Q, proj const *A, proj const *P) | |||
{ | |||
fp a, b, c; | |||
fp_add3(&a, &P->x, &P->z); | |||
fp_sq1(&a); | |||
fp_sub3(&b, &P->x, &P->z); | |||
fp_sq1(&b); | |||
fp_sub3(&c, &a, &b); | |||
fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */ | |||
fp_mul2(&b, &A->z); | |||
fp_mul3(&Q->x, &a, &b); | |||
fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */ | |||
fp_add2(&a, &A->x); | |||
fp_mul2(&a, &c); | |||
fp_add2(&a, &b); | |||
fp_mul3(&Q->z, &a, &c); | |||
} | |||
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ) | |||
{ | |||
fp a, b, c, d; | |||
fp_add3(&a, &P->x, &P->z); | |||
fp_sub3(&b, &P->x, &P->z); | |||
fp_add3(&c, &Q->x, &Q->z); | |||
fp_sub3(&d, &Q->x, &Q->z); | |||
fp_mul2(&a, &d); | |||
fp_mul2(&b, &c); | |||
fp_add3(&c, &a, &b); | |||
fp_sub3(&d, &a, &b); | |||
fp_sq1(&c); | |||
fp_sq1(&d); | |||
fp_mul3(&S->x, &PQ->z, &c); | |||
fp_mul3(&S->z, &PQ->x, &d); | |||
} | |||
/* Montgomery ladder. */ | |||
/* P must not be the unique point of order 2. */ | |||
/* not constant-time! */ | |||
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k) | |||
{ | |||
proj R = *P; | |||
proj A24; | |||
const proj Pcopy = *P; /* in case Q = P */ | |||
Q->x = fp_1; | |||
Q->z = fp_0; | |||
fp_add3(&A24.x, &A->z, &A->z); //precomputation of A24=(A+2C:4C) | |||
fp_add3(&A24.z, &A24.x, &A24.x); | |||
fp_add2(&A24.x, &A->x); | |||
unsigned long i = 512; | |||
while (--i && !u512_bit(k, i)); | |||
do { | |||
bool bit = u512_bit(k, i); | |||
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ | |||
//fp_cswap(&Q->x, &R.x, bit); | |||
//fp_cswap(&Q->z, &R.z, bit); | |||
xDBLADD(Q, &R, Q, &R, &Pcopy, &A24); | |||
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */ | |||
//fp_cswap(&Q->x, &R.x, bit); | |||
//fp_cswap(&Q->z, &R.z, bit); | |||
} while (i--); | |||
} | |||
//simultaneous square-and-multiply, computes x^exp and y^exp | |||
void exp_by_squaring_(fp* x, fp* y, uint64_t exp) | |||
{ | |||
fp result1, result2; | |||
fp_set(&result1, 1); | |||
fp_set(&result2, 1); | |||
while (exp) | |||
{ | |||
if (exp & 1){ | |||
fp_mul2(&result1, x); | |||
fp_mul2(&result2, y); | |||
} | |||
fp_sq1(x); | |||
fp_sq1(y); | |||
exp >>= 1; | |||
} | |||
fp_cswap(&result1, x, 1); | |||
fp_cswap(&result2, y, 1); | |||
} | |||
/* computes the isogeny with kernel point K of order k */ | |||
/* returns the new curve coefficient A and the image of P */ | |||
/* (obviously) not constant time in k */ | |||
void xISOG(proj *A, proj *P, proj const *K, uint64_t k) | |||
{ | |||
assert (k >= 3); | |||
assert (k % 2 == 1); | |||
fp tmp0, tmp1, tmp2, Psum, Pdif; | |||
proj Q, Aed, prod; | |||
fp_add3(&Aed.z, &A->z, &A->z); //compute twisted Edwards curve coefficients | |||
fp_add3(&Aed.x, &A->x, &Aed.z); | |||
fp_sub3(&Aed.z, &A->x, &Aed.z); | |||
fp_add3(&Psum, &P->x, &P->z); //precomputations | |||
fp_sub3(&Pdif, &P->x, &P->z); | |||
fp_sub3(&prod.x, &K->x, &K->z); | |||
fp_add3(&prod.z, &K->x, &K->z); | |||
fp_mul3(&tmp1, &prod.x, &Psum); | |||
fp_mul3(&tmp0, &prod.z, &Pdif); | |||
fp_add3(&Q.x, &tmp0, &tmp1); | |||
fp_sub3(&Q.z, &tmp0, &tmp1); | |||
proj M[3] = {*K}; | |||
xDBL(&M[1], A, K); | |||
for (uint64_t i = 1; i < k / 2; ++i) { | |||
if (i >= 2) | |||
xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]); | |||
fp_sub3(&tmp1, &M[i % 3].x, &M[i % 3].z); | |||
fp_add3(&tmp0, &M[i % 3].x, &M[i % 3].z); | |||
fp_mul2(&prod.x, &tmp1); | |||
fp_mul2(&prod.z, &tmp0); | |||
fp_mul2(&tmp1, &Psum); | |||
fp_mul2(&tmp0, &Pdif); | |||
fp_add3(&tmp2, &tmp0, &tmp1); | |||
fp_mul2(&Q.x, &tmp2); | |||
fp_sub3(&tmp2, &tmp0, &tmp1); | |||
fp_mul2(&Q.z, &tmp2); | |||
} | |||
// point evaluation | |||
fp_sq1(&Q.x); | |||
fp_sq1(&Q.z); | |||
fp_mul2(&P->x, &Q.x); | |||
fp_mul2(&P->z, &Q.z); | |||
//compute Aed.x^k, Aed.z^k | |||
exp_by_squaring_(&Aed.x, &Aed.z, k); | |||
//compute prod.x^8, prod.z^8 | |||
fp_sq1(&prod.x); | |||
fp_sq1(&prod.x); | |||
fp_sq1(&prod.x); | |||
fp_sq1(&prod.z); | |||
fp_sq1(&prod.z); | |||
fp_sq1(&prod.z); | |||
//compute image curve parameters | |||
fp_mul2(&Aed.z, &prod.x); | |||
fp_mul2(&Aed.x, &prod.z); | |||
//compute Montgomery params | |||
fp_add3(&A->x, &Aed.x, &Aed.z); | |||
fp_sub3(&A->z, &Aed.x, &Aed.z); | |||
fp_add2(&A->x, &A->x); | |||
} | |||
@@ -0,0 +1,19 @@ | |||
#ifndef MONT_H | |||
#define MONT_H | |||
#include "u512.h" | |||
#include "fp.h" | |||
/* P^1 over fp. */ | |||
typedef struct proj { | |||
fp x; | |||
fp z; | |||
} proj; | |||
void xDBL(proj *Q, proj const *A, proj const *P); | |||
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ); | |||
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A); | |||
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k); | |||
void xISOG(proj *A, proj *P, proj const *K, uint64_t k); | |||
#endif |
@@ -0,0 +1,18 @@ | |||
#include "rng.h" | |||
#include <stdlib.h> | |||
#include <unistd.h> | |||
#include <fcntl.h> | |||
void randombytes(void *x, size_t l) | |||
{ | |||
static int fd = -1; | |||
ssize_t n; | |||
if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY))) | |||
exit(1); | |||
for (size_t i = 0; i < l; i += n) | |||
if (0 >= (n = read(fd, (char *) x + i, l - i))) | |||
exit(2); | |||
} | |||
@@ -0,0 +1,8 @@ | |||
#ifndef RNG_H | |||
#define RNG_H | |||
#include <stdlib.h> | |||
void randombytes(void *x, size_t l); | |||
#endif |
@@ -0,0 +1,22 @@ | |||
#ifndef UINT_H | |||
#define UINT_H | |||
#include <stdbool.h> | |||
#include <stdint.h> | |||
typedef struct u512 { | |||
uint64_t c[8]; | |||
} u512; | |||
extern const u512 u512_1; | |||
void u512_set(u512 *x, uint64_t y); | |||
bool u512_bit(u512 const *x, uint64_t k); | |||
bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */ | |||
bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */ | |||
void u512_mul3_64(u512 *x, u512 const *y, uint64_t z); | |||
#endif |
@@ -0,0 +1,102 @@ | |||
.intel_syntax noprefix | |||
.section .rodata | |||
.global u512_1 | |||
u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0 | |||
.section .text | |||
.global u512_set | |||
u512_set: | |||
cld | |||
mov rax, rsi | |||
stosq | |||
xor rax, rax | |||
mov rcx, 7 | |||
rep stosq | |||
ret | |||
.global u512_bit | |||
u512_bit: | |||
mov rcx, rsi | |||
and rcx, 0x3f | |||
shr rsi, 6 | |||
mov rax, [rdi + 8*rsi] | |||
shr rax, cl | |||
and rax, 1 | |||
ret | |||
.global u512_add3 | |||
u512_add3: | |||
mov rax, [rsi + 0] | |||
add rax, [rdx + 0] | |||
mov [rdi + 0], rax | |||
.set k, 1 | |||
.rept 7 | |||
mov rax, [rsi + 8*k] | |||
adc rax, [rdx + 8*k] | |||
mov [rdi + 8*k], rax | |||
.set k, k+1 | |||
.endr | |||
setc al | |||
movzx rax, al | |||
ret | |||
.global u512_sub3 | |||
u512_sub3: | |||
mov rax, [rsi + 0] | |||
sub rax, [rdx + 0] | |||
mov [rdi + 0], rax | |||
.set k, 1 | |||
.rept 7 | |||
mov rax, [rsi + 8*k] | |||
sbb rax, [rdx + 8*k] | |||
mov [rdi + 8*k], rax | |||
.set k, k+1 | |||
.endr | |||
setc al | |||
movzx rax, al | |||
ret | |||
.global u512_mul3_64 | |||
u512_mul3_64: | |||
mulx r10, rax, [rsi + 0] | |||
mov [rdi + 0], rax | |||
mulx r11, rax, [rsi + 8] | |||
add rax, r10 | |||
mov [rdi + 8], rax | |||
mulx r10, rax, [rsi + 16] | |||
adcx rax, r11 | |||
mov [rdi + 16], rax | |||
mulx r11, rax, [rsi + 24] | |||
adcx rax, r10 | |||
mov [rdi + 24], rax | |||
mulx r10, rax, [rsi + 32] | |||
adcx rax, r11 | |||
mov [rdi + 32],rax | |||
mulx r11, rax, [rsi + 40] | |||
adcx rax, r10 | |||
mov [rdi + 40],rax | |||
mulx r10, rax, [rsi + 48] | |||
adcx rax, r11 | |||
mov [rdi + 48],rax | |||
mulx r11, rax, [rsi + 56] | |||
adcx rax, r10 | |||
mov [rdi + 56],rax | |||
ret | |||