c-implementation
这个提交包含在:
当前提交
c00c37e011
25
Makefile
普通文件
25
Makefile
普通文件
@ -0,0 +1,25 @@
|
||||
all:
|
||||
@gcc \
|
||||
-Wall -Wextra \
|
||||
-O0 -funroll-loops \
|
||||
rng.c \
|
||||
u512.s fp.s \
|
||||
mont.c \
|
||||
csidh.c \
|
||||
main.c \
|
||||
-o main
|
||||
|
||||
debug:
|
||||
gcc \
|
||||
-Wall -Wextra \
|
||||
-g \
|
||||
rng.c \
|
||||
u512.s fp.s \
|
||||
mont.c \
|
||||
csidh.c \
|
||||
main.c \
|
||||
-o main
|
||||
|
||||
clean:
|
||||
rm -f main
|
||||
|
54
bench.c
普通文件
54
bench.c
普通文件
@ -0,0 +1,54 @@
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <time.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "u512.h"
|
||||
#include "fp.h"
|
||||
#include "mont.h"
|
||||
#include "csidh.h"
|
||||
|
||||
#include <inttypes.h>
|
||||
|
||||
static __inline__ uint64_t rdtsc(void)
|
||||
{
|
||||
uint32_t hi, lo;
|
||||
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
|
||||
return lo | (uint64_t) hi << 32;
|
||||
}
|
||||
|
||||
unsigned long its = 10000;
|
||||
|
||||
int main()
|
||||
{
|
||||
clock_t t0, t1, time = 0;
|
||||
uint64_t c0, c1, cycles = 0;
|
||||
|
||||
private_key priv;
|
||||
public_key pub = base;
|
||||
|
||||
for (unsigned long i = 0; i < its; ++i) {
|
||||
|
||||
csidh_private(&priv);
|
||||
|
||||
t0 = clock();
|
||||
c0 = rdtsc();
|
||||
|
||||
/**************************************/
|
||||
assert(validate(&pub));
|
||||
action(&pub, &pub, &priv);
|
||||
/**************************************/
|
||||
|
||||
c1 = rdtsc();
|
||||
t1 = clock();
|
||||
cycles += c1 - c0;
|
||||
time += t1 - t0;
|
||||
}
|
||||
|
||||
printf("iterations: %lu\n", its);
|
||||
printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its);
|
||||
printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its);
|
||||
}
|
||||
|
220
csidh.c
普通文件
220
csidh.c
普通文件
@ -0,0 +1,220 @@
|
||||
|
||||
#include <string.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "csidh.h"
|
||||
#include "rng.h"
|
||||
|
||||
/* specific to p, should perhaps be somewhere else */
|
||||
const unsigned primes[num_primes] = {
|
||||
3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
|
||||
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
|
||||
139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227,
|
||||
229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313,
|
||||
317, 331, 337, 347, 349, 353, 359, 367, 373, 587,
|
||||
};
|
||||
|
||||
const u512 four_sqrt_p = {{
|
||||
0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964,
|
||||
}};
|
||||
|
||||
|
||||
const public_key base = {0}; /* A = 0 */
|
||||
|
||||
void csidh_private(private_key *priv)
|
||||
{
|
||||
memset(&priv->e, 0, sizeof(priv->e));
|
||||
for (size_t i = 0; i < num_primes; ) {
|
||||
int8_t buf[64];
|
||||
randombytes(buf, sizeof(buf));
|
||||
for (size_t j = 0; j < sizeof(buf); ++j) {
|
||||
if (buf[j] <= max_exponent && buf[j] >= -max_exponent) {
|
||||
priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4;
|
||||
if (++i >= num_primes)
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/* compute [(p+1)/l] P for all l in our list of primes. */
|
||||
/* divide and conquer is much faster than doing it naively,
|
||||
* but uses more memory. */
|
||||
static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper)
|
||||
{
|
||||
assert(lower < upper);
|
||||
|
||||
if (upper - lower == 1)
|
||||
return;
|
||||
|
||||
size_t mid = lower + (upper - lower + 1) / 2;
|
||||
|
||||
u512 cl = u512_1, cu = u512_1;
|
||||
for (size_t i = lower; i < mid; ++i)
|
||||
u512_mul3_64(&cu, &cu, primes[i]);
|
||||
for (size_t i = mid; i < upper; ++i)
|
||||
u512_mul3_64(&cl, &cl, primes[i]);
|
||||
|
||||
xMUL(&P[mid], A, &P[lower], &cu);
|
||||
xMUL(&P[lower], A, &P[lower], &cl);
|
||||
|
||||
cofactor_multiples(P, A, lower, mid);
|
||||
cofactor_multiples(P, A, mid, upper);
|
||||
}
|
||||
|
||||
/* never accepts invalid keys. */
|
||||
bool validate(public_key const *in)
|
||||
{
|
||||
const proj A = {in->A, fp_1};
|
||||
|
||||
do {
|
||||
|
||||
proj P[num_primes];
|
||||
fp_random(&P->x);
|
||||
P->z = fp_1;
|
||||
|
||||
/* maximal 2-power in p+1 */
|
||||
xDBL(P, &A, P);
|
||||
xDBL(P, &A, P);
|
||||
|
||||
cofactor_multiples(P, &A, 0, num_primes);
|
||||
|
||||
u512 order = u512_1;
|
||||
|
||||
for (size_t i = num_primes - 1; i < num_primes; --i) {
|
||||
|
||||
/* we only gain information if [(p+1)/l] P is non-zero */
|
||||
if (memcmp(&P[i].z, &fp_0, sizeof(fp))) {
|
||||
|
||||
u512 tmp;
|
||||
u512_set(&tmp, primes[i]);
|
||||
xMUL(&P[i], &A, &P[i], &tmp);
|
||||
|
||||
if (memcmp(&P[i].z, &fp_0, sizeof(fp)))
|
||||
/* P does not have order dividing p+1. */
|
||||
return false;
|
||||
|
||||
u512_mul3_64(&order, &order, primes[i]);
|
||||
|
||||
if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */
|
||||
/* order > 4 sqrt(p), hence definitely supersingular */
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
/* P didn't have big enough order to prove supersingularity. */
|
||||
} while (1);
|
||||
}
|
||||
|
||||
/* compute x^3 + Ax^2 + x */
|
||||
static void montgomery_rhs(fp *rhs, fp const *A, fp const *x)
|
||||
{
|
||||
fp tmp;
|
||||
*rhs = *x;
|
||||
fp_sq1(rhs);
|
||||
fp_mul3(&tmp, A, x);
|
||||
fp_add2(rhs, &tmp);
|
||||
fp_add2(rhs, &fp_1);
|
||||
fp_mul2(rhs, x);
|
||||
}
|
||||
|
||||
/* totally not constant-time. */
|
||||
void action(public_key *out, public_key const *in, private_key const *priv)
|
||||
{
|
||||
u512 k[2];
|
||||
u512_set(&k[0], 4); /* maximal 2-power in p+1 */
|
||||
u512_set(&k[1], 4); /* maximal 2-power in p+1 */
|
||||
|
||||
uint8_t e[2][num_primes];
|
||||
|
||||
for (size_t i = 0; i < num_primes; ++i) {
|
||||
|
||||
int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4;
|
||||
|
||||
if (t > 0) {
|
||||
e[0][i] = t;
|
||||
e[1][i] = 0;
|
||||
u512_mul3_64(&k[1], &k[1], primes[i]);
|
||||
}
|
||||
else if (t < 0) {
|
||||
e[1][i] = -t;
|
||||
e[0][i] = 0;
|
||||
u512_mul3_64(&k[0], &k[0], primes[i]);
|
||||
}
|
||||
else {
|
||||
e[0][i] = 0;
|
||||
e[1][i] = 0;
|
||||
u512_mul3_64(&k[0], &k[0], primes[i]);
|
||||
u512_mul3_64(&k[1], &k[1], primes[i]);
|
||||
}
|
||||
}
|
||||
|
||||
proj A = {in->A, fp_1};
|
||||
|
||||
bool done[2] = {false, false};
|
||||
|
||||
do {
|
||||
|
||||
assert(!memcmp(&A.z, &fp_1, sizeof(fp)));
|
||||
|
||||
proj P;
|
||||
fp_random(&P.x);
|
||||
P.z = fp_1;
|
||||
|
||||
fp rhs;
|
||||
montgomery_rhs(&rhs, &A.x, &P.x);
|
||||
bool sign = !fp_issquare(&rhs);
|
||||
|
||||
if (done[sign])
|
||||
continue;
|
||||
|
||||
xMUL(&P, &A, &P, &k[sign]);
|
||||
|
||||
done[sign] = true;
|
||||
|
||||
for (size_t i = num_primes-1; i < num_primes; --i) { //changed loop direction
|
||||
|
||||
if (e[sign][i]) {
|
||||
|
||||
u512 cof = u512_1;
|
||||
for (size_t j = i - 1; j < num_primes; --j) //changed loop direction
|
||||
if (e[sign][j])
|
||||
u512_mul3_64(&cof, &cof, primes[j]);
|
||||
|
||||
proj K;
|
||||
xMUL(&K, &A, &P, &cof);
|
||||
|
||||
if (memcmp(&K.z, &fp_0, sizeof(fp))) {
|
||||
|
||||
xISOG(&A, &P, &K, primes[i]);
|
||||
|
||||
if (!--e[sign][i])
|
||||
u512_mul3_64(&k[sign], &k[sign], primes[i]);
|
||||
|
||||
}
|
||||
|
||||
}
|
||||
|
||||
done[sign] &= !e[sign][i];
|
||||
}
|
||||
|
||||
fp_inv(&A.z);
|
||||
fp_mul2(&A.x, &A.z);
|
||||
A.z = fp_1;
|
||||
|
||||
} while (!(done[0] && done[1]));
|
||||
|
||||
out->A = A.x;
|
||||
}
|
||||
|
||||
/* includes public-key validation. */
|
||||
bool csidh(public_key *out, public_key const *in, private_key const *priv)
|
||||
{
|
||||
if (!validate(in)) {
|
||||
fp_random(&out->A);
|
||||
return false;
|
||||
}
|
||||
action(out, in, priv);
|
||||
return true;
|
||||
}
|
||||
|
26
csidh.h
普通文件
26
csidh.h
普通文件
@ -0,0 +1,26 @@
|
||||
#ifndef CSIDH_H
|
||||
#define CSIDH_H
|
||||
|
||||
#include "u512.h"
|
||||
#include "fp.h"
|
||||
#include "mont.h"
|
||||
|
||||
/* specific to p, should perhaps be somewhere else */
|
||||
#define num_primes 74
|
||||
#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */
|
||||
|
||||
|
||||
typedef struct private_key {
|
||||
int8_t e[(num_primes + 1) / 2]; /* packed int4_t */
|
||||
} private_key;
|
||||
|
||||
typedef struct public_key {
|
||||
fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */
|
||||
} public_key;
|
||||
|
||||
extern const public_key base;
|
||||
|
||||
void csidh_private(private_key *priv);
|
||||
bool csidh(public_key *out, public_key const *in, private_key const *priv);
|
||||
|
||||
#endif
|
514
cycle.h
普通文件
514
cycle.h
普通文件
@ -0,0 +1,514 @@
|
||||
/*
|
||||
* Copyright (c) 2003, 2007-8 Matteo Frigo
|
||||
* Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
|
||||
*
|
||||
* Permission is hereby granted, free of charge, to any person obtaining
|
||||
* a copy of this software and associated documentation files (the
|
||||
* "Software"), to deal in the Software without restriction, including
|
||||
* without limitation the rights to use, copy, modify, merge, publish,
|
||||
* distribute, sublicense, and/or sell copies of the Software, and to
|
||||
* permit persons to whom the Software is furnished to do so, subject to
|
||||
* the following conditions:
|
||||
*
|
||||
* The above copyright notice and this permission notice shall be
|
||||
* included in all copies or substantial portions of the Software.
|
||||
*
|
||||
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
|
||||
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
|
||||
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
|
||||
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
|
||||
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
|
||||
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
|
||||
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
|
||||
*
|
||||
*/
|
||||
|
||||
|
||||
/* machine-dependent cycle counters code. Needs to be inlined. */
|
||||
|
||||
/***************************************************************************/
|
||||
/* To use the cycle counters in your code, simply #include "cycle.h" (this
|
||||
file), and then use the functions/macros:
|
||||
|
||||
ticks getticks(void);
|
||||
|
||||
ticks is an opaque typedef defined below, representing the current time.
|
||||
You extract the elapsed time between two calls to gettick() via:
|
||||
|
||||
double elapsed(ticks t1, ticks t0);
|
||||
|
||||
which returns a double-precision variable in arbitrary units. You
|
||||
are not expected to convert this into human units like seconds; it
|
||||
is intended only for *comparisons* of time intervals.
|
||||
|
||||
(In order to use some of the OS-dependent timer routines like
|
||||
Solaris' gethrtime, you need to paste the autoconf snippet below
|
||||
into your configure.ac file and #include "config.h" before cycle.h,
|
||||
or define the relevant macros manually if you are not using autoconf.)
|
||||
*/
|
||||
|
||||
/***************************************************************************/
|
||||
/* This file uses macros like HAVE_GETHRTIME that are assumed to be
|
||||
defined according to whether the corresponding function/type/header
|
||||
is available on your system. The necessary macros are most
|
||||
conveniently defined if you are using GNU autoconf, via the tests:
|
||||
|
||||
dnl ---------------------------------------------------------------------
|
||||
|
||||
AC_C_INLINE
|
||||
AC_HEADER_TIME
|
||||
AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
|
||||
|
||||
AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
|
||||
#include <sys/time.h>
|
||||
#endif])
|
||||
|
||||
AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
|
||||
|
||||
dnl Cray UNICOS _rtc() (real-time clock) intrinsic
|
||||
AC_MSG_CHECKING([for _rtc intrinsic])
|
||||
rtc_ok=yes
|
||||
AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
|
||||
#include <intrinsics.h>
|
||||
#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
|
||||
AC_MSG_RESULT($rtc_ok)
|
||||
|
||||
dnl ---------------------------------------------------------------------
|
||||
*/
|
||||
|
||||
/***************************************************************************/
|
||||
|
||||
#if TIME_WITH_SYS_TIME
|
||||
# include <sys/time.h>
|
||||
# include <time.h>
|
||||
#else
|
||||
# if HAVE_SYS_TIME_H
|
||||
# include <sys/time.h>
|
||||
# else
|
||||
# include <time.h>
|
||||
# endif
|
||||
#endif
|
||||
|
||||
#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
|
||||
{ \
|
||||
return (double)t1 - (double)t0; \
|
||||
}
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/* Solaris */
|
||||
#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef hrtime_t ticks;
|
||||
|
||||
#define getticks gethrtime
|
||||
|
||||
INLINE_ELAPSED(inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/* AIX v. 4+ routines to read the real-time clock or time-base register */
|
||||
#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef timebasestruct_t ticks;
|
||||
|
||||
static __inline ticks getticks(void)
|
||||
{
|
||||
ticks t;
|
||||
read_real_time(&t, TIMEBASE_SZ);
|
||||
return t;
|
||||
}
|
||||
|
||||
static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
|
||||
{
|
||||
time_base_to_time(&t1, TIMEBASE_SZ);
|
||||
time_base_to_time(&t0, TIMEBASE_SZ);
|
||||
return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 +
|
||||
((double)t1.tb_low - (double)t0.tb_low));
|
||||
}
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/*
|
||||
* PowerPC ``cycle'' counter using the time base register.
|
||||
*/
|
||||
#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long long ticks;
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
unsigned int tbl, tbu0, tbu1;
|
||||
|
||||
do {
|
||||
__asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
|
||||
__asm__ __volatile__ ("mftb %0" : "=r"(tbl));
|
||||
__asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
|
||||
} while (tbu0 != tbu1);
|
||||
|
||||
return (((unsigned long long)tbu0) << 32) | tbl;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
|
||||
from Carbon, requires no additional libraries to be linked). */
|
||||
#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
|
||||
#include <mach/mach_time.h>
|
||||
typedef uint64_t ticks;
|
||||
#define getticks mach_absolute_time
|
||||
INLINE_ELAPSED(__inline__)
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/*
|
||||
* Pentium cycle counter
|
||||
*/
|
||||
#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long long ticks;
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
ticks ret;
|
||||
|
||||
__asm__ __volatile__("rdtsc": "=A" (ret));
|
||||
/* no input, nothing else clobbered */
|
||||
return ret;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */
|
||||
#endif
|
||||
|
||||
/* Visual C++ -- thanks to Morten Nissov for his help with this */
|
||||
#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
|
||||
#include <windows.h>
|
||||
typedef LARGE_INTEGER ticks;
|
||||
#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
|
||||
|
||||
static __inline ticks getticks(void)
|
||||
{
|
||||
ticks retval;
|
||||
|
||||
__asm {
|
||||
RDTSC
|
||||
mov retval.HighPart, edx
|
||||
mov retval.LowPart, eax
|
||||
}
|
||||
return retval;
|
||||
}
|
||||
|
||||
static __inline double elapsed(ticks t1, ticks t0)
|
||||
{
|
||||
return (double)t1.QuadPart - (double)t0.QuadPart;
|
||||
}
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/*
|
||||
* X86-64 cycle counter
|
||||
*/
|
||||
#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long long ticks;
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
unsigned a, d;
|
||||
asm volatile("rdtsc" : "=a" (a), "=d" (d));
|
||||
return ((ticks)a) | (((ticks)d) << 32);
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
|
||||
NOTE: this code will fail to link unless you use the -Masmkeyword compiler
|
||||
option (grrr). */
|
||||
#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long long ticks;
|
||||
static ticks getticks(void)
|
||||
{
|
||||
asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; ");
|
||||
}
|
||||
INLINE_ELAPSED(__inline__)
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/* Visual C++, courtesy of Dirk Michaelis */
|
||||
#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
|
||||
|
||||
#include <intrin.h>
|
||||
#pragma intrinsic(__rdtsc)
|
||||
typedef unsigned __int64 ticks;
|
||||
#define getticks __rdtsc
|
||||
INLINE_ELAPSED(__inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/*
|
||||
* IA64 cycle counter
|
||||
*/
|
||||
|
||||
/* intel's icc/ecc compiler */
|
||||
#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long ticks;
|
||||
#include <ia64intrin.h>
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
return __getReg(_IA64_REG_AR_ITC);
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/* gcc */
|
||||
#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long ticks;
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
ticks ret;
|
||||
|
||||
__asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
|
||||
#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
|
||||
#include <machine/sys/inline.h>
|
||||
typedef unsigned long ticks;
|
||||
|
||||
static inline ticks getticks(void)
|
||||
{
|
||||
ticks ret;
|
||||
|
||||
ret = _Asm_mov_from_ar (_AREG_ITC);
|
||||
return ret;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/* Microsoft Visual C++ */
|
||||
#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned __int64 ticks;
|
||||
|
||||
# ifdef __cplusplus
|
||||
extern "C"
|
||||
# endif
|
||||
ticks __getReg(int whichReg);
|
||||
#pragma intrinsic(__getReg)
|
||||
|
||||
static __inline ticks getticks(void)
|
||||
{
|
||||
volatile ticks temp;
|
||||
temp = __getReg(3116);
|
||||
return temp;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/*
|
||||
* PA-RISC cycle counter
|
||||
*/
|
||||
#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long ticks;
|
||||
|
||||
# ifdef __GNUC__
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
ticks ret;
|
||||
|
||||
__asm__ __volatile__("mfctl 16, %0": "=r" (ret));
|
||||
/* no input, nothing else clobbered */
|
||||
return ret;
|
||||
}
|
||||
# else
|
||||
# include <machine/inline.h>
|
||||
static inline unsigned long getticks(void)
|
||||
{
|
||||
register ticks ret;
|
||||
_MFCTL(16, ret);
|
||||
return ret;
|
||||
}
|
||||
# endif
|
||||
|
||||
INLINE_ELAPSED(inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/* S390, courtesy of James Treacy */
|
||||
#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long long ticks;
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
ticks cycles;
|
||||
__asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
|
||||
return cycles;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
/*----------------------------------------------------------------*/
|
||||
#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
|
||||
/*
|
||||
* The 32-bit cycle counter on alpha overflows pretty quickly,
|
||||
* unfortunately. A 1GHz machine overflows in 4 seconds.
|
||||
*/
|
||||
typedef unsigned int ticks;
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
unsigned long cc;
|
||||
__asm__ __volatile__ ("rpcc %0" : "=r"(cc));
|
||||
return (cc & 0xFFFFFFFF);
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef unsigned long ticks;
|
||||
|
||||
static __inline__ ticks getticks(void)
|
||||
{
|
||||
ticks ret;
|
||||
__asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
|
||||
return ret;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline__)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
|
||||
# include <c_asm.h>
|
||||
typedef unsigned int ticks;
|
||||
|
||||
static __inline ticks getticks(void)
|
||||
{
|
||||
unsigned long cc;
|
||||
cc = asm("rpcc %v0");
|
||||
return (cc & 0xFFFFFFFF);
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(__inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
/*----------------------------------------------------------------*/
|
||||
/* SGI/Irix */
|
||||
#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
|
||||
typedef struct timespec ticks;
|
||||
|
||||
static inline ticks getticks(void)
|
||||
{
|
||||
struct timespec t;
|
||||
clock_gettime(CLOCK_SGI_CYCLE, &t);
|
||||
return t;
|
||||
}
|
||||
|
||||
static inline double elapsed(ticks t1, ticks t0)
|
||||
{
|
||||
return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
|
||||
((double)t1.tv_nsec - (double)t0.tv_nsec);
|
||||
}
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/* Cray UNICOS _rtc() intrinsic function */
|
||||
#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
|
||||
#ifdef HAVE_INTRINSICS_H
|
||||
# include <intrinsics.h>
|
||||
#endif
|
||||
|
||||
typedef long long ticks;
|
||||
|
||||
#define getticks _rtc
|
||||
|
||||
INLINE_ELAPSED(inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
|
||||
/*----------------------------------------------------------------*/
|
||||
/* MIPS ZBus */
|
||||
#if HAVE_MIPS_ZBUS_TIMER
|
||||
#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
|
||||
#include <sys/mman.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
typedef uint64_t ticks;
|
||||
|
||||
static inline ticks getticks(void)
|
||||
{
|
||||
static uint64_t* addr = 0;
|
||||
|
||||
if (addr == 0)
|
||||
{
|
||||
uint32_t rq_addr = 0x10030000;
|
||||
int fd;
|
||||
int pgsize;
|
||||
|
||||
pgsize = getpagesize();
|
||||
fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
|
||||
if (fd < 0) {
|
||||
perror("open");
|
||||
return NULL;
|
||||
}
|
||||
addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
|
||||
close(fd);
|
||||
if (addr == (uint64_t *)-1) {
|
||||
perror("mmap");
|
||||
return NULL;
|
||||
}
|
||||
}
|
||||
|
||||
return *addr;
|
||||
}
|
||||
|
||||
INLINE_ELAPSED(inline)
|
||||
|
||||
#define HAVE_TICK_COUNTER
|
||||
#endif
|
||||
#endif /* HAVE_MIPS_ZBUS_TIMER */
|
||||
|
37
fp.h
普通文件
37
fp.h
普通文件
@ -0,0 +1,37 @@
|
||||
#ifndef FP_H
|
||||
#define FP_H
|
||||
|
||||
#include "u512.h"
|
||||
|
||||
/* fp is in the Montgomery domain, so interpreting that
|
||||
as an integer should never make sense.
|
||||
enable compiler warnings when mixing up u512 and fp. */
|
||||
typedef struct fp {
|
||||
u512 x;
|
||||
} fp;
|
||||
|
||||
extern const fp fp_0;
|
||||
extern const fp fp_1;
|
||||
|
||||
void fp_set(fp *x, uint64_t y);
|
||||
void fp_cswap(fp *x, fp *y, bool c);
|
||||
|
||||
void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */
|
||||
void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */
|
||||
|
||||
void fp_add2(fp *x, fp const *y);
|
||||
void fp_sub2(fp *x, fp const *y);
|
||||
void fp_mul2(fp *x, fp const *y);
|
||||
|
||||
void fp_add3(fp *x, fp const *y, fp const *z);
|
||||
void fp_sub3(fp *x, fp const *y, fp const *z);
|
||||
void fp_mul3(fp *x, fp const *y, fp const *z);
|
||||
|
||||
void fp_sq1(fp *x);
|
||||
void fp_sq2(fp *x, fp const *y);
|
||||
void fp_inv(fp *x);
|
||||
bool fp_issquare(fp const *x);
|
||||
|
||||
void fp_random(fp *x);
|
||||
|
||||
#endif
|
452
fp.s
普通文件
452
fp.s
普通文件
@ -0,0 +1,452 @@
|
||||
|
||||
.intel_syntax noprefix
|
||||
|
||||
.section .rodata
|
||||
|
||||
.set pbits, 511
|
||||
p:
|
||||
.quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
|
||||
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
|
||||
|
||||
|
||||
.global fp_0
|
||||
fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
.global fp_1
|
||||
fp_1: /* 2^512 mod p */
|
||||
.quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
|
||||
.quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80
|
||||
|
||||
|
||||
/* (2^512)^2 mod p */
|
||||
.r_squared_mod_p:
|
||||
.quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
|
||||
.quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e
|
||||
|
||||
/* -p^-1 mod 2^64 */
|
||||
.inv_min_p_mod_r:
|
||||
.quad 0x66c1301f632e294d
|
||||
|
||||
|
||||
.section .text
|
||||
|
||||
.global fp_copy
|
||||
fp_copy:
|
||||
cld
|
||||
mov rcx, 8
|
||||
rep movsq
|
||||
ret
|
||||
|
||||
.global fp_set
|
||||
fp_set:
|
||||
push rdi
|
||||
call u512_set
|
||||
pop rdi
|
||||
mov rsi, rdi
|
||||
jmp fp_enc
|
||||
|
||||
.global fp_cswap
|
||||
fp_cswap:
|
||||
movzx rax, dl
|
||||
neg rax
|
||||
.set k, 0
|
||||
.rept 8
|
||||
mov rcx, [rdi + 8*k]
|
||||
mov rdx, [rsi + 8*k]
|
||||
|
||||
mov r8, rcx
|
||||
xor r8, rdx
|
||||
and r8, rax
|
||||
|
||||
xor rcx, r8
|
||||
xor rdx, r8
|
||||
|
||||
mov [rdi + 8*k], rcx
|
||||
mov [rsi + 8*k], rdx
|
||||
|
||||
.set k, k+1
|
||||
.endr
|
||||
ret
|
||||
|
||||
.reduce_once:
|
||||
push rbp
|
||||
mov rbp, rdi
|
||||
|
||||
mov rdi, [rbp + 0]
|
||||
sub rdi, [rip + p + 0]
|
||||
mov rsi, [rbp + 8]
|
||||
sbb rsi, [rip + p + 8]
|
||||
mov rdx, [rbp + 16]
|
||||
sbb rdx, [rip + p + 16]
|
||||
mov rcx, [rbp + 24]
|
||||
sbb rcx, [rip + p + 24]
|
||||
mov r8, [rbp + 32]
|
||||
sbb r8, [rip + p + 32]
|
||||
mov r9, [rbp + 40]
|
||||
sbb r9, [rip + p + 40]
|
||||
mov r10, [rbp + 48]
|
||||
sbb r10, [rip + p + 48]
|
||||
mov r11, [rbp + 56]
|
||||
sbb r11, [rip + p + 56]
|
||||
|
||||
setnc al
|
||||
movzx rax, al
|
||||
neg rax
|
||||
|
||||
.macro cswap2, r, m
|
||||
xor \r, \m
|
||||
and \r, rax
|
||||
xor \m, \r
|
||||
.endm
|
||||
|
||||
cswap2 rdi, [rbp + 0]
|
||||
cswap2 rsi, [rbp + 8]
|
||||
cswap2 rdx, [rbp + 16]
|
||||
cswap2 rcx, [rbp + 24]
|
||||
cswap2 r8, [rbp + 32]
|
||||
cswap2 r9, [rbp + 40]
|
||||
cswap2 r10, [rbp + 48]
|
||||
cswap2 r11, [rbp + 56]
|
||||
|
||||
pop rbp
|
||||
ret
|
||||
|
||||
.global fp_add3
|
||||
fp_add3:
|
||||
push rdi
|
||||
call u512_add3
|
||||
pop rdi
|
||||
jmp .reduce_once
|
||||
|
||||
.global fp_add2
|
||||
fp_add2:
|
||||
mov rdx, rdi
|
||||
jmp fp_add3
|
||||
|
||||
.global fp_sub3
|
||||
fp_sub3:
|
||||
push rdi
|
||||
call u512_sub3
|
||||
pop rdi
|
||||
xor rsi, rsi
|
||||
xor rdx, rdx
|
||||
xor rcx, rcx
|
||||
xor r8, r8
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
test rax, rax
|
||||
cmovnz rax, [rip + p + 0]
|
||||
cmovnz rsi, [rip + p + 8]
|
||||
cmovnz rdx, [rip + p + 16]
|
||||
cmovnz rcx, [rip + p + 24]
|
||||
cmovnz r8, [rip + p + 32]
|
||||
cmovnz r9, [rip + p + 40]
|
||||
cmovnz r10, [rip + p + 48]
|
||||
cmovnz r11, [rip + p + 56]
|
||||
add [rdi + 0], rax
|
||||
adc [rdi + 8], rsi
|
||||
adc [rdi + 16], rdx
|
||||
adc [rdi + 24], rcx
|
||||
adc [rdi + 32], r8
|
||||
adc [rdi + 40], r9
|
||||
adc [rdi + 48], r10
|
||||
adc [rdi + 56], r11
|
||||
ret
|
||||
|
||||
.global fp_sub2
|
||||
fp_sub2:
|
||||
mov rdx, rdi
|
||||
xchg rsi, rdx
|
||||
jmp fp_sub3
|
||||
|
||||
|
||||
/* Montgomery arithmetic */
|
||||
|
||||
.global fp_enc
|
||||
fp_enc:
|
||||
lea rdx, [rip + .r_squared_mod_p]
|
||||
jmp fp_mul3
|
||||
|
||||
.global fp_dec
|
||||
fp_dec:
|
||||
lea rdx, [rip + u512_1]
|
||||
jmp fp_mul3
|
||||
|
||||
.global fp_mul3
|
||||
fp_mul3:
|
||||
push rbp
|
||||
push rbx
|
||||
push r12
|
||||
push r13
|
||||
push r14
|
||||
push r15
|
||||
|
||||
push rdi
|
||||
|
||||
mov rdi, rsi
|
||||
mov rsi, rdx
|
||||
|
||||
xor r8, r8
|
||||
xor r9, r9
|
||||
xor r10, r10
|
||||
xor r11, r11
|
||||
xor r12, r12
|
||||
xor r13, r13
|
||||
xor r14, r14
|
||||
xor r15, r15
|
||||
xor rbp, rbp
|
||||
|
||||
/* flags are already cleared */
|
||||
|
||||
.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8
|
||||
|
||||
mov rdx, [rsi + 0]
|
||||
mulx rcx, rdx, [rdi + 8*\k]
|
||||
add rdx, \r0
|
||||
mulx rcx, rdx, [rip + .inv_min_p_mod_r]
|
||||
|
||||
xor rax, rax /* clear flags */
|
||||
|
||||
mulx rbx, rax, [rip + p + 0]
|
||||
adox \r0, rax
|
||||
|
||||
mulx rcx, rax, [rip + p + 8]
|
||||
adcx \r1, rbx
|
||||
adox \r1, rax
|
||||
|
||||
mulx rbx, rax, [rip + p + 16]
|
||||
adcx \r2, rcx
|
||||
adox \r2, rax
|
||||
|
||||
mulx rcx, rax, [rip + p + 24]
|
||||
adcx \r3, rbx
|
||||
adox \r3, rax
|
||||
|
||||
mulx rbx, rax, [rip + p + 32]
|
||||
adcx \r4, rcx
|
||||
adox \r4, rax
|
||||
|
||||
mulx rcx, rax, [rip + p + 40]
|
||||
adcx \r5, rbx
|
||||
adox \r5, rax
|
||||
|
||||
mulx rbx, rax, [rip + p + 48]
|
||||
adcx \r6, rcx
|
||||
adox \r6, rax
|
||||
|
||||
mulx rcx, rax, [rip + p + 56]
|
||||
adcx \r7, rbx
|
||||
adox \r7, rax
|
||||
|
||||
mov rax, 0
|
||||
adcx \r8, rcx
|
||||
adox \r8, rax
|
||||
|
||||
|
||||
mov rdx, [rdi + 8*\k]
|
||||
|
||||
xor rax, rax /* clear flags */
|
||||
|
||||
mulx rbx, rax, [rsi + 0]
|
||||
adox \r0, rax
|
||||
|
||||
mulx rcx, rax, [rsi + 8]
|
||||
adcx \r1, rbx
|
||||
adox \r1, rax
|
||||
|
||||
mulx rbx, rax, [rsi + 16]
|
||||
adcx \r2, rcx
|
||||
adox \r2, rax
|
||||
|
||||
mulx rcx, rax, [rsi + 24]
|
||||
adcx \r3, rbx
|
||||
adox \r3, rax
|
||||
|
||||
mulx rbx, rax, [rsi + 32]
|
||||
adcx \r4, rcx
|
||||
adox \r4, rax
|
||||
|
||||
mulx rcx, rax, [rsi + 40]
|
||||
adcx \r5, rbx
|
||||
adox \r5, rax
|
||||
|
||||
mulx rbx, rax, [rsi + 48]
|
||||
adcx \r6, rcx
|
||||
adox \r6, rax
|
||||
|
||||
mulx rcx, rax, [rsi + 56]
|
||||
adcx \r7, rbx
|
||||
adox \r7, rax
|
||||
|
||||
mov rax, 0
|
||||
adcx \r8, rcx
|
||||
adox \r8, rax
|
||||
|
||||
.endm
|
||||
|
||||
MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp
|
||||
MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8
|
||||
MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9
|
||||
MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10
|
||||
MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11
|
||||
MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12
|
||||
MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13
|
||||
MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14
|
||||
|
||||
pop rdi
|
||||
|
||||
mov [rdi + 0], rbp
|
||||
mov [rdi + 8], r8
|
||||
mov [rdi + 16], r9
|
||||
mov [rdi + 24], r10
|
||||
mov [rdi + 32], r11
|
||||
mov [rdi + 40], r12
|
||||
mov [rdi + 48], r13
|
||||
mov [rdi + 56], r14
|
||||
|
||||
pop r15
|
||||
pop r14
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbx
|
||||
pop rbp
|
||||
jmp .reduce_once
|
||||
|
||||
.global fp_mul2
|
||||
fp_mul2:
|
||||
mov rdx, rdi
|
||||
jmp fp_mul3
|
||||
|
||||
.global fp_sq2
|
||||
fp_sq2:
|
||||
/* TODO implement optimized Montgomery squaring */
|
||||
mov rdx, rsi
|
||||
jmp fp_mul3
|
||||
|
||||
.global fp_sq1
|
||||
fp_sq1:
|
||||
mov rsi, rdi
|
||||
jmp fp_sq2
|
||||
|
||||
/* (obviously) not constant time in the exponent! */
|
||||
.fp_pow:
|
||||
push rbx
|
||||
mov rbx, rsi
|
||||
push r12
|
||||
push r13
|
||||
push rdi
|
||||
sub rsp, 64
|
||||
|
||||
mov rsi, rdi
|
||||
mov rdi, rsp
|
||||
call fp_copy
|
||||
|
||||
mov rdi, [rsp + 64]
|
||||
lea rsi, [rip + fp_1]
|
||||
call fp_copy
|
||||
|
||||
.macro POWSTEP, k
|
||||
mov r13, [rbx + 8*\k]
|
||||
xor r12, r12
|
||||
|
||||
0:
|
||||
test r13, 1
|
||||
jz 1f
|
||||
|
||||
mov rdi, [rsp + 64]
|
||||
mov rsi, rsp
|
||||
call fp_mul2
|
||||
|
||||
1:
|
||||
mov rdi, rsp
|
||||
call fp_sq1
|
||||
|
||||
shr r13
|
||||
|
||||
inc r12
|
||||
test r12, 64
|
||||
jz 0b
|
||||
.endm
|
||||
|
||||
POWSTEP 0
|
||||
POWSTEP 1
|
||||
POWSTEP 2
|
||||
POWSTEP 3
|
||||
POWSTEP 4
|
||||
POWSTEP 5
|
||||
POWSTEP 6
|
||||
POWSTEP 7
|
||||
|
||||
add rsp, 64+8
|
||||
pop r13
|
||||
pop r12
|
||||
pop rbx
|
||||
ret
|
||||
|
||||
.section .rodata
|
||||
.p_minus_2:
|
||||
.quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
|
||||
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
|
||||
|
||||
.section .text
|
||||
|
||||
/* TODO use a better addition chain? */
|
||||
.global fp_inv
|
||||
fp_inv:
|
||||
lea rsi, [rip + .p_minus_2]
|
||||
jmp .fp_pow
|
||||
|
||||
.section .rodata
|
||||
.p_minus_1_halves:
|
||||
.quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83
|
||||
.quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df
|
||||
|
||||
.section .text
|
||||
|
||||
/* TODO use a better addition chain? */
|
||||
.global fp_issquare
|
||||
fp_issquare:
|
||||
push rdi
|
||||
lea rsi, [rip + .p_minus_1_halves]
|
||||
call .fp_pow
|
||||
pop rdi
|
||||
|
||||
xor rax, rax
|
||||
.set k, 0
|
||||
.rept 8
|
||||
mov rsi, [rdi + 8*k]
|
||||
xor rsi, [rip + fp_1 + 8*k]
|
||||
or rax, rsi
|
||||
.set k, k+1
|
||||
.endr
|
||||
test rax, rax
|
||||
setz al
|
||||
movzx rax, al
|
||||
ret
|
||||
|
||||
|
||||
/* not constant time (but this shouldn't leak anything of importance) */
|
||||
.global fp_random
|
||||
fp_random:
|
||||
|
||||
push rdi
|
||||
mov rsi, 64
|
||||
call randombytes
|
||||
pop rdi
|
||||
mov rax, 1
|
||||
shl rax, (pbits % 64)
|
||||
dec rax
|
||||
and [rdi + 56], rax
|
||||
|
||||
.set k, 7
|
||||
.rept 8
|
||||
mov rax, [rip + p + 8*k]
|
||||
cmp [rdi + 8*k], rax
|
||||
jge fp_random
|
||||
jl 0f
|
||||
.set k, k-1
|
||||
.endr
|
||||
0:
|
||||
ret
|
||||
|
99
main.c
普通文件
99
main.c
普通文件
@ -0,0 +1,99 @@
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <stdio.h>
|
||||
#include <string.h>
|
||||
#include <unistd.h>
|
||||
#include <time.h>
|
||||
#include <assert.h>
|
||||
|
||||
#include "u512.h"
|
||||
#include "fp.h"
|
||||
#include "mont.h"
|
||||
#include "csidh.h"
|
||||
|
||||
void u512_print(u512 const *x)
|
||||
{
|
||||
for (size_t i = 63; i < 64; --i)
|
||||
printf("%02hhx", i[(unsigned char *) x->c]);
|
||||
}
|
||||
|
||||
void fp_print(fp const *x)
|
||||
{
|
||||
u512 y;
|
||||
fp_dec(&y, x);
|
||||
u512_print(&y);
|
||||
}
|
||||
|
||||
int main()
|
||||
{
|
||||
clock_t t0, t1;
|
||||
|
||||
private_key priv_alice, priv_bob;
|
||||
public_key pub_alice, pub_bob;
|
||||
public_key shared_alice, shared_bob;
|
||||
|
||||
printf("\n");
|
||||
|
||||
|
||||
t0 = clock();
|
||||
csidh_private(&priv_alice);
|
||||
t1 = clock();
|
||||
|
||||
printf("Alice's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
|
||||
for (size_t i = 0; i < sizeof(priv_alice); ++i)
|
||||
printf("%02hhx", i[(uint8_t *) &priv_alice]);
|
||||
printf("\n\n");
|
||||
|
||||
t0 = clock();
|
||||
csidh_private(&priv_bob);
|
||||
t1 = clock();
|
||||
|
||||
printf("Bob's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
|
||||
for (size_t i = 0; i < sizeof(priv_bob); ++i)
|
||||
printf("%02hhx", i[(uint8_t *) &priv_bob]);
|
||||
printf("\n\n");
|
||||
|
||||
|
||||
t0 = clock();
|
||||
assert(csidh(&pub_alice, &base, &priv_alice));
|
||||
t1 = clock();
|
||||
|
||||
printf("Alice's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
|
||||
fp_print(&pub_alice.A);
|
||||
printf("\n\n");
|
||||
|
||||
t0 = clock();
|
||||
assert(csidh(&pub_bob, &base, &priv_bob));
|
||||
t1 = clock();
|
||||
|
||||
printf("Bob's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
|
||||
fp_print(&pub_bob.A);
|
||||
printf("\n\n");
|
||||
|
||||
|
||||
t0 = clock();
|
||||
assert(csidh(&shared_alice, &pub_bob, &priv_alice));
|
||||
t1 = clock();
|
||||
|
||||
printf("Alice's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
|
||||
fp_print(&shared_alice.A);
|
||||
printf("\n\n");
|
||||
|
||||
t0 = clock();
|
||||
assert(csidh(&shared_bob, &pub_alice, &priv_bob));
|
||||
t1 = clock();
|
||||
|
||||
printf("Bob's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
|
||||
fp_print(&shared_bob.A);
|
||||
printf("\n\n");
|
||||
|
||||
printf(" ");
|
||||
if (memcmp(&shared_alice, &shared_bob, sizeof(public_key)))
|
||||
printf("\x1b[31mNOT EQUAL!\x1b[0m\n");
|
||||
else
|
||||
printf("\x1b[32mequal.\x1b[0m\n");
|
||||
printf("\n");
|
||||
|
||||
printf("\n");
|
||||
}
|
||||
|
203
mont.c
普通文件
203
mont.c
普通文件
@ -0,0 +1,203 @@
|
||||
|
||||
#include <assert.h>
|
||||
|
||||
#include "mont.h"
|
||||
#include "u512.h"
|
||||
|
||||
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24)
|
||||
{
|
||||
fp tmp0, tmp1, tmp2; //requires precomputation of A24=(A+2C:4C)
|
||||
|
||||
fp_add3(&tmp0, &P->x, &P->z);
|
||||
fp_sub3(&tmp1, &P->x, &P->z);
|
||||
fp_sq2(&R->x, &tmp0);
|
||||
fp_sub3(&tmp2, &Q->x, &Q->z);
|
||||
fp_add3(&S->x, &Q->x, &Q->z);
|
||||
fp_mul2(&tmp0, &tmp2);
|
||||
fp_sq2(&R->z, &tmp1);
|
||||
fp_mul2(&tmp1, &S->x);
|
||||
fp_sub3(&tmp2, &R->x, &R->z);
|
||||
fp_mul2(&R->z, &A24->z);
|
||||
fp_mul2(&R->x, &R->z);
|
||||
fp_mul3(&S->x, &A24->x, &tmp2);
|
||||
fp_sub3(&S->z, &tmp0, &tmp1);
|
||||
fp_add2(&R->z, &S->x);
|
||||
fp_add3(&S->x, &tmp0, &tmp1);
|
||||
fp_mul2(&R->z, &tmp2);
|
||||
fp_sq1(&S->z);
|
||||
fp_sq1(&S->x);
|
||||
fp_mul2(&S->z, &PQ->x);
|
||||
fp_mul2(&S->x, &PQ->z);
|
||||
}
|
||||
|
||||
void xDBL(proj *Q, proj const *A, proj const *P)
|
||||
{
|
||||
fp a, b, c;
|
||||
fp_add3(&a, &P->x, &P->z);
|
||||
fp_sq1(&a);
|
||||
fp_sub3(&b, &P->x, &P->z);
|
||||
fp_sq1(&b);
|
||||
fp_sub3(&c, &a, &b);
|
||||
fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */
|
||||
fp_mul2(&b, &A->z);
|
||||
fp_mul3(&Q->x, &a, &b);
|
||||
fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */
|
||||
fp_add2(&a, &A->x);
|
||||
fp_mul2(&a, &c);
|
||||
fp_add2(&a, &b);
|
||||
fp_mul3(&Q->z, &a, &c);
|
||||
}
|
||||
|
||||
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ)
|
||||
{
|
||||
fp a, b, c, d;
|
||||
fp_add3(&a, &P->x, &P->z);
|
||||
fp_sub3(&b, &P->x, &P->z);
|
||||
fp_add3(&c, &Q->x, &Q->z);
|
||||
fp_sub3(&d, &Q->x, &Q->z);
|
||||
fp_mul2(&a, &d);
|
||||
fp_mul2(&b, &c);
|
||||
fp_add3(&c, &a, &b);
|
||||
fp_sub3(&d, &a, &b);
|
||||
fp_sq1(&c);
|
||||
fp_sq1(&d);
|
||||
fp_mul3(&S->x, &PQ->z, &c);
|
||||
fp_mul3(&S->z, &PQ->x, &d);
|
||||
}
|
||||
|
||||
/* Montgomery ladder. */
|
||||
/* P must not be the unique point of order 2. */
|
||||
/* not constant-time! */
|
||||
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k)
|
||||
{
|
||||
proj R = *P;
|
||||
proj A24;
|
||||
const proj Pcopy = *P; /* in case Q = P */
|
||||
|
||||
Q->x = fp_1;
|
||||
Q->z = fp_0;
|
||||
|
||||
fp_add3(&A24.x, &A->z, &A->z); //precomputation of A24=(A+2C:4C)
|
||||
fp_add3(&A24.z, &A24.x, &A24.x);
|
||||
fp_add2(&A24.x, &A->x);
|
||||
|
||||
unsigned long i = 512;
|
||||
while (--i && !u512_bit(k, i));
|
||||
|
||||
do {
|
||||
|
||||
bool bit = u512_bit(k, i);
|
||||
|
||||
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
|
||||
//fp_cswap(&Q->x, &R.x, bit);
|
||||
//fp_cswap(&Q->z, &R.z, bit);
|
||||
|
||||
xDBLADD(Q, &R, Q, &R, &Pcopy, &A24);
|
||||
|
||||
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
|
||||
//fp_cswap(&Q->x, &R.x, bit);
|
||||
//fp_cswap(&Q->z, &R.z, bit);
|
||||
|
||||
} while (i--);
|
||||
}
|
||||
|
||||
//simultaneous square-and-multiply, computes x^exp and y^exp
|
||||
void exp_by_squaring_(fp* x, fp* y, uint64_t exp)
|
||||
{
|
||||
fp result1, result2;
|
||||
fp_set(&result1, 1);
|
||||
fp_set(&result2, 1);
|
||||
|
||||
while (exp)
|
||||
{
|
||||
if (exp & 1){
|
||||
fp_mul2(&result1, x);
|
||||
fp_mul2(&result2, y);
|
||||
}
|
||||
|
||||
fp_sq1(x);
|
||||
fp_sq1(y);
|
||||
exp >>= 1;
|
||||
}
|
||||
|
||||
fp_cswap(&result1, x, 1);
|
||||
fp_cswap(&result2, y, 1);
|
||||
|
||||
}
|
||||
|
||||
|
||||
/* computes the isogeny with kernel point K of order k */
|
||||
/* returns the new curve coefficient A and the image of P */
|
||||
/* (obviously) not constant time in k */
|
||||
void xISOG(proj *A, proj *P, proj const *K, uint64_t k)
|
||||
{
|
||||
assert (k >= 3);
|
||||
assert (k % 2 == 1);
|
||||
|
||||
fp tmp0, tmp1, tmp2, Psum, Pdif;
|
||||
proj Q, Aed, prod;
|
||||
|
||||
fp_add3(&Aed.z, &A->z, &A->z); //compute twisted Edwards curve coefficients
|
||||
fp_add3(&Aed.x, &A->x, &Aed.z);
|
||||
fp_sub3(&Aed.z, &A->x, &Aed.z);
|
||||
|
||||
fp_add3(&Psum, &P->x, &P->z); //precomputations
|
||||
fp_sub3(&Pdif, &P->x, &P->z);
|
||||
|
||||
fp_sub3(&prod.x, &K->x, &K->z);
|
||||
fp_add3(&prod.z, &K->x, &K->z);
|
||||
|
||||
fp_mul3(&tmp1, &prod.x, &Psum);
|
||||
fp_mul3(&tmp0, &prod.z, &Pdif);
|
||||
fp_add3(&Q.x, &tmp0, &tmp1);
|
||||
fp_sub3(&Q.z, &tmp0, &tmp1);
|
||||
|
||||
proj M[3] = {*K};
|
||||
xDBL(&M[1], A, K);
|
||||
|
||||
for (uint64_t i = 1; i < k / 2; ++i) {
|
||||
|
||||
if (i >= 2)
|
||||
xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]);
|
||||
|
||||
fp_sub3(&tmp1, &M[i % 3].x, &M[i % 3].z);
|
||||
fp_add3(&tmp0, &M[i % 3].x, &M[i % 3].z);
|
||||
fp_mul2(&prod.x, &tmp1);
|
||||
fp_mul2(&prod.z, &tmp0);
|
||||
fp_mul2(&tmp1, &Psum);
|
||||
fp_mul2(&tmp0, &Pdif);
|
||||
fp_add3(&tmp2, &tmp0, &tmp1);
|
||||
fp_mul2(&Q.x, &tmp2);
|
||||
fp_sub3(&tmp2, &tmp0, &tmp1);
|
||||
fp_mul2(&Q.z, &tmp2);
|
||||
|
||||
}
|
||||
|
||||
|
||||
// point evaluation
|
||||
fp_sq1(&Q.x);
|
||||
fp_sq1(&Q.z);
|
||||
fp_mul2(&P->x, &Q.x);
|
||||
fp_mul2(&P->z, &Q.z);
|
||||
|
||||
//compute Aed.x^k, Aed.z^k
|
||||
exp_by_squaring_(&Aed.x, &Aed.z, k);
|
||||
|
||||
//compute prod.x^8, prod.z^8
|
||||
fp_sq1(&prod.x);
|
||||
fp_sq1(&prod.x);
|
||||
fp_sq1(&prod.x);
|
||||
fp_sq1(&prod.z);
|
||||
fp_sq1(&prod.z);
|
||||
fp_sq1(&prod.z);
|
||||
|
||||
//compute image curve parameters
|
||||
fp_mul2(&Aed.z, &prod.x);
|
||||
fp_mul2(&Aed.x, &prod.z);
|
||||
|
||||
//compute Montgomery params
|
||||
fp_add3(&A->x, &Aed.x, &Aed.z);
|
||||
fp_sub3(&A->z, &Aed.x, &Aed.z);
|
||||
fp_add2(&A->x, &A->x);
|
||||
}
|
||||
|
19
mont.h
普通文件
19
mont.h
普通文件
@ -0,0 +1,19 @@
|
||||
#ifndef MONT_H
|
||||
#define MONT_H
|
||||
|
||||
#include "u512.h"
|
||||
#include "fp.h"
|
||||
|
||||
/* P^1 over fp. */
|
||||
typedef struct proj {
|
||||
fp x;
|
||||
fp z;
|
||||
} proj;
|
||||
|
||||
void xDBL(proj *Q, proj const *A, proj const *P);
|
||||
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ);
|
||||
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A);
|
||||
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k);
|
||||
void xISOG(proj *A, proj *P, proj const *K, uint64_t k);
|
||||
|
||||
#endif
|
18
rng.c
普通文件
18
rng.c
普通文件
@ -0,0 +1,18 @@
|
||||
|
||||
#include "rng.h"
|
||||
|
||||
#include <stdlib.h>
|
||||
#include <unistd.h>
|
||||
#include <fcntl.h>
|
||||
|
||||
void randombytes(void *x, size_t l)
|
||||
{
|
||||
static int fd = -1;
|
||||
ssize_t n;
|
||||
if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY)))
|
||||
exit(1);
|
||||
for (size_t i = 0; i < l; i += n)
|
||||
if (0 >= (n = read(fd, (char *) x + i, l - i)))
|
||||
exit(2);
|
||||
}
|
||||
|
8
rng.h
普通文件
8
rng.h
普通文件
@ -0,0 +1,8 @@
|
||||
#ifndef RNG_H
|
||||
#define RNG_H
|
||||
|
||||
#include <stdlib.h>
|
||||
|
||||
void randombytes(void *x, size_t l);
|
||||
|
||||
#endif
|
22
u512.h
普通文件
22
u512.h
普通文件
@ -0,0 +1,22 @@
|
||||
#ifndef UINT_H
|
||||
#define UINT_H
|
||||
|
||||
#include <stdbool.h>
|
||||
#include <stdint.h>
|
||||
|
||||
typedef struct u512 {
|
||||
uint64_t c[8];
|
||||
} u512;
|
||||
|
||||
extern const u512 u512_1;
|
||||
|
||||
void u512_set(u512 *x, uint64_t y);
|
||||
|
||||
bool u512_bit(u512 const *x, uint64_t k);
|
||||
|
||||
bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */
|
||||
bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */
|
||||
|
||||
void u512_mul3_64(u512 *x, u512 const *y, uint64_t z);
|
||||
|
||||
#endif
|
102
u512.s
普通文件
102
u512.s
普通文件
@ -0,0 +1,102 @@
|
||||
|
||||
.intel_syntax noprefix
|
||||
|
||||
.section .rodata
|
||||
|
||||
.global u512_1
|
||||
u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0
|
||||
|
||||
|
||||
.section .text
|
||||
|
||||
.global u512_set
|
||||
u512_set:
|
||||
cld
|
||||
mov rax, rsi
|
||||
stosq
|
||||
xor rax, rax
|
||||
mov rcx, 7
|
||||
rep stosq
|
||||
ret
|
||||
|
||||
|
||||
.global u512_bit
|
||||
u512_bit:
|
||||
mov rcx, rsi
|
||||
and rcx, 0x3f
|
||||
shr rsi, 6
|
||||
mov rax, [rdi + 8*rsi]
|
||||
shr rax, cl
|
||||
and rax, 1
|
||||
ret
|
||||
|
||||
|
||||
.global u512_add3
|
||||
u512_add3:
|
||||
mov rax, [rsi + 0]
|
||||
add rax, [rdx + 0]
|
||||
mov [rdi + 0], rax
|
||||
.set k, 1
|
||||
.rept 7
|
||||
mov rax, [rsi + 8*k]
|
||||
adc rax, [rdx + 8*k]
|
||||
mov [rdi + 8*k], rax
|
||||
.set k, k+1
|
||||
.endr
|
||||
setc al
|
||||
movzx rax, al
|
||||
ret
|
||||
|
||||
.global u512_sub3
|
||||
u512_sub3:
|
||||
mov rax, [rsi + 0]
|
||||
sub rax, [rdx + 0]
|
||||
mov [rdi + 0], rax
|
||||
.set k, 1
|
||||
.rept 7
|
||||
mov rax, [rsi + 8*k]
|
||||
sbb rax, [rdx + 8*k]
|
||||
mov [rdi + 8*k], rax
|
||||
.set k, k+1
|
||||
.endr
|
||||
setc al
|
||||
movzx rax, al
|
||||
ret
|
||||
|
||||
|
||||
.global u512_mul3_64
|
||||
u512_mul3_64:
|
||||
|
||||
mulx r10, rax, [rsi + 0]
|
||||
mov [rdi + 0], rax
|
||||
|
||||
mulx r11, rax, [rsi + 8]
|
||||
add rax, r10
|
||||
mov [rdi + 8], rax
|
||||
|
||||
mulx r10, rax, [rsi + 16]
|
||||
adcx rax, r11
|
||||
mov [rdi + 16], rax
|
||||
|
||||
mulx r11, rax, [rsi + 24]
|
||||
adcx rax, r10
|
||||
mov [rdi + 24], rax
|
||||
|
||||
mulx r10, rax, [rsi + 32]
|
||||
adcx rax, r11
|
||||
mov [rdi + 32],rax
|
||||
|
||||
mulx r11, rax, [rsi + 40]
|
||||
adcx rax, r10
|
||||
mov [rdi + 40],rax
|
||||
|
||||
mulx r10, rax, [rsi + 48]
|
||||
adcx rax, r11
|
||||
mov [rdi + 48],rax
|
||||
|
||||
mulx r11, rax, [rsi + 56]
|
||||
adcx rax, r10
|
||||
mov [rdi + 56],rax
|
||||
|
||||
ret
|
||||
|
正在加载...
在新工单中引用
屏蔽一个用户