c-implementation

这个提交包含在:
Michael Meyer 2018-08-23 13:49:45 +02:00
当前提交 c00c37e011
共有 14 个文件被更改,包括 1799 次插入0 次删除

25
Makefile 普通文件
查看文件

@ -0,0 +1,25 @@
all:
@gcc \
-Wall -Wextra \
-O0 -funroll-loops \
rng.c \
u512.s fp.s \
mont.c \
csidh.c \
main.c \
-o main
debug:
gcc \
-Wall -Wextra \
-g \
rng.c \
u512.s fp.s \
mont.c \
csidh.c \
main.c \
-o main
clean:
rm -f main

54
bench.c 普通文件
查看文件

@ -0,0 +1,54 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <time.h>
#include <assert.h>
#include "u512.h"
#include "fp.h"
#include "mont.h"
#include "csidh.h"
#include <inttypes.h>
static __inline__ uint64_t rdtsc(void)
{
uint32_t hi, lo;
__asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi));
return lo | (uint64_t) hi << 32;
}
unsigned long its = 10000;
int main()
{
clock_t t0, t1, time = 0;
uint64_t c0, c1, cycles = 0;
private_key priv;
public_key pub = base;
for (unsigned long i = 0; i < its; ++i) {
csidh_private(&priv);
t0 = clock();
c0 = rdtsc();
/**************************************/
assert(validate(&pub));
action(&pub, &pub, &priv);
/**************************************/
c1 = rdtsc();
t1 = clock();
cycles += c1 - c0;
time += t1 - t0;
}
printf("iterations: %lu\n", its);
printf("clock cycles: %" PRIu64 "\n", (uint64_t) cycles / its);
printf("wall-clock time: %.3lf ms\n", 1000. * time / CLOCKS_PER_SEC / its);
}

220
csidh.c 普通文件
查看文件

@ -0,0 +1,220 @@
#include <string.h>
#include <assert.h>
#include "csidh.h"
#include "rng.h"
/* specific to p, should perhaps be somewhere else */
const unsigned primes[num_primes] = {
3, 5, 7, 11, 13, 17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59,
61, 67, 71, 73, 79, 83, 89, 97, 101, 103, 107, 109, 113, 127, 131, 137,
139, 149, 151, 157, 163, 167, 173, 179, 181, 191, 193, 197, 199, 211, 223, 227,
229, 233, 239, 241, 251, 257, 263, 269, 271, 277, 281, 283, 293, 307, 311, 313,
317, 331, 337, 347, 349, 353, 359, 367, 373, 587,
};
const u512 four_sqrt_p = {{
0x85e2579c786882cf, 0x4e3433657e18da95, 0x850ae5507965a0b3, 0xa15bc4e676475964,
}};
const public_key base = {0}; /* A = 0 */
void csidh_private(private_key *priv)
{
memset(&priv->e, 0, sizeof(priv->e));
for (size_t i = 0; i < num_primes; ) {
int8_t buf[64];
randombytes(buf, sizeof(buf));
for (size_t j = 0; j < sizeof(buf); ++j) {
if (buf[j] <= max_exponent && buf[j] >= -max_exponent) {
priv->e[i / 2] |= (buf[j] & 0xf) << i % 2 * 4;
if (++i >= num_primes)
break;
}
}
}
}
/* compute [(p+1)/l] P for all l in our list of primes. */
/* divide and conquer is much faster than doing it naively,
* but uses more memory. */
static void cofactor_multiples(proj *P, const proj *A, size_t lower, size_t upper)
{
assert(lower < upper);
if (upper - lower == 1)
return;
size_t mid = lower + (upper - lower + 1) / 2;
u512 cl = u512_1, cu = u512_1;
for (size_t i = lower; i < mid; ++i)
u512_mul3_64(&cu, &cu, primes[i]);
for (size_t i = mid; i < upper; ++i)
u512_mul3_64(&cl, &cl, primes[i]);
xMUL(&P[mid], A, &P[lower], &cu);
xMUL(&P[lower], A, &P[lower], &cl);
cofactor_multiples(P, A, lower, mid);
cofactor_multiples(P, A, mid, upper);
}
/* never accepts invalid keys. */
bool validate(public_key const *in)
{
const proj A = {in->A, fp_1};
do {
proj P[num_primes];
fp_random(&P->x);
P->z = fp_1;
/* maximal 2-power in p+1 */
xDBL(P, &A, P);
xDBL(P, &A, P);
cofactor_multiples(P, &A, 0, num_primes);
u512 order = u512_1;
for (size_t i = num_primes - 1; i < num_primes; --i) {
/* we only gain information if [(p+1)/l] P is non-zero */
if (memcmp(&P[i].z, &fp_0, sizeof(fp))) {
u512 tmp;
u512_set(&tmp, primes[i]);
xMUL(&P[i], &A, &P[i], &tmp);
if (memcmp(&P[i].z, &fp_0, sizeof(fp)))
/* P does not have order dividing p+1. */
return false;
u512_mul3_64(&order, &order, primes[i]);
if (u512_sub3(&tmp, &four_sqrt_p, &order)) /* returns borrow */
/* order > 4 sqrt(p), hence definitely supersingular */
return true;
}
}
/* P didn't have big enough order to prove supersingularity. */
} while (1);
}
/* compute x^3 + Ax^2 + x */
static void montgomery_rhs(fp *rhs, fp const *A, fp const *x)
{
fp tmp;
*rhs = *x;
fp_sq1(rhs);
fp_mul3(&tmp, A, x);
fp_add2(rhs, &tmp);
fp_add2(rhs, &fp_1);
fp_mul2(rhs, x);
}
/* totally not constant-time. */
void action(public_key *out, public_key const *in, private_key const *priv)
{
u512 k[2];
u512_set(&k[0], 4); /* maximal 2-power in p+1 */
u512_set(&k[1], 4); /* maximal 2-power in p+1 */
uint8_t e[2][num_primes];
for (size_t i = 0; i < num_primes; ++i) {
int8_t t = (int8_t) (priv->e[i / 2] << i % 2 * 4) >> 4;
if (t > 0) {
e[0][i] = t;
e[1][i] = 0;
u512_mul3_64(&k[1], &k[1], primes[i]);
}
else if (t < 0) {
e[1][i] = -t;
e[0][i] = 0;
u512_mul3_64(&k[0], &k[0], primes[i]);
}
else {
e[0][i] = 0;
e[1][i] = 0;
u512_mul3_64(&k[0], &k[0], primes[i]);
u512_mul3_64(&k[1], &k[1], primes[i]);
}
}
proj A = {in->A, fp_1};
bool done[2] = {false, false};
do {
assert(!memcmp(&A.z, &fp_1, sizeof(fp)));
proj P;
fp_random(&P.x);
P.z = fp_1;
fp rhs;
montgomery_rhs(&rhs, &A.x, &P.x);
bool sign = !fp_issquare(&rhs);
if (done[sign])
continue;
xMUL(&P, &A, &P, &k[sign]);
done[sign] = true;
for (size_t i = num_primes-1; i < num_primes; --i) { //changed loop direction
if (e[sign][i]) {
u512 cof = u512_1;
for (size_t j = i - 1; j < num_primes; --j) //changed loop direction
if (e[sign][j])
u512_mul3_64(&cof, &cof, primes[j]);
proj K;
xMUL(&K, &A, &P, &cof);
if (memcmp(&K.z, &fp_0, sizeof(fp))) {
xISOG(&A, &P, &K, primes[i]);
if (!--e[sign][i])
u512_mul3_64(&k[sign], &k[sign], primes[i]);
}
}
done[sign] &= !e[sign][i];
}
fp_inv(&A.z);
fp_mul2(&A.x, &A.z);
A.z = fp_1;
} while (!(done[0] && done[1]));
out->A = A.x;
}
/* includes public-key validation. */
bool csidh(public_key *out, public_key const *in, private_key const *priv)
{
if (!validate(in)) {
fp_random(&out->A);
return false;
}
action(out, in, priv);
return true;
}

26
csidh.h 普通文件
查看文件

@ -0,0 +1,26 @@
#ifndef CSIDH_H
#define CSIDH_H
#include "u512.h"
#include "fp.h"
#include "mont.h"
/* specific to p, should perhaps be somewhere else */
#define num_primes 74
#define max_exponent 5 /* (2*5+1)^74 is roughly 2^256 */
typedef struct private_key {
int8_t e[(num_primes + 1) / 2]; /* packed int4_t */
} private_key;
typedef struct public_key {
fp A; /* Montgomery coefficient: represents y^2 = x^3 + Ax^2 + x */
} public_key;
extern const public_key base;
void csidh_private(private_key *priv);
bool csidh(public_key *out, public_key const *in, private_key const *priv);
#endif

514
cycle.h 普通文件
查看文件

@ -0,0 +1,514 @@
/*
* Copyright (c) 2003, 2007-8 Matteo Frigo
* Copyright (c) 2003, 2007-8 Massachusetts Institute of Technology
*
* Permission is hereby granted, free of charge, to any person obtaining
* a copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sublicense, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
* LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
* OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
* WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
*/
/* machine-dependent cycle counters code. Needs to be inlined. */
/***************************************************************************/
/* To use the cycle counters in your code, simply #include "cycle.h" (this
file), and then use the functions/macros:
ticks getticks(void);
ticks is an opaque typedef defined below, representing the current time.
You extract the elapsed time between two calls to gettick() via:
double elapsed(ticks t1, ticks t0);
which returns a double-precision variable in arbitrary units. You
are not expected to convert this into human units like seconds; it
is intended only for *comparisons* of time intervals.
(In order to use some of the OS-dependent timer routines like
Solaris' gethrtime, you need to paste the autoconf snippet below
into your configure.ac file and #include "config.h" before cycle.h,
or define the relevant macros manually if you are not using autoconf.)
*/
/***************************************************************************/
/* This file uses macros like HAVE_GETHRTIME that are assumed to be
defined according to whether the corresponding function/type/header
is available on your system. The necessary macros are most
conveniently defined if you are using GNU autoconf, via the tests:
dnl ---------------------------------------------------------------------
AC_C_INLINE
AC_HEADER_TIME
AC_CHECK_HEADERS([sys/time.h c_asm.h intrinsics.h mach/mach_time.h])
AC_CHECK_TYPE([hrtime_t],[AC_DEFINE(HAVE_HRTIME_T, 1, [Define to 1 if hrtime_t is defined in <sys/time.h>])],,[#if HAVE_SYS_TIME_H
#include <sys/time.h>
#endif])
AC_CHECK_FUNCS([gethrtime read_real_time time_base_to_time clock_gettime mach_absolute_time])
dnl Cray UNICOS _rtc() (real-time clock) intrinsic
AC_MSG_CHECKING([for _rtc intrinsic])
rtc_ok=yes
AC_TRY_LINK([#ifdef HAVE_INTRINSICS_H
#include <intrinsics.h>
#endif], [_rtc()], [AC_DEFINE(HAVE__RTC,1,[Define if you have the UNICOS _rtc() intrinsic.])], [rtc_ok=no])
AC_MSG_RESULT($rtc_ok)
dnl ---------------------------------------------------------------------
*/
/***************************************************************************/
#if TIME_WITH_SYS_TIME
# include <sys/time.h>
# include <time.h>
#else
# if HAVE_SYS_TIME_H
# include <sys/time.h>
# else
# include <time.h>
# endif
#endif
#define INLINE_ELAPSED(INL) static INL double elapsed(ticks t1, ticks t0) \
{ \
return (double)t1 - (double)t0; \
}
/*----------------------------------------------------------------*/
/* Solaris */
#if defined(HAVE_GETHRTIME) && defined(HAVE_HRTIME_T) && !defined(HAVE_TICK_COUNTER)
typedef hrtime_t ticks;
#define getticks gethrtime
INLINE_ELAPSED(inline)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/* AIX v. 4+ routines to read the real-time clock or time-base register */
#if defined(HAVE_READ_REAL_TIME) && defined(HAVE_TIME_BASE_TO_TIME) && !defined(HAVE_TICK_COUNTER)
typedef timebasestruct_t ticks;
static __inline ticks getticks(void)
{
ticks t;
read_real_time(&t, TIMEBASE_SZ);
return t;
}
static __inline double elapsed(ticks t1, ticks t0) /* time in nanoseconds */
{
time_base_to_time(&t1, TIMEBASE_SZ);
time_base_to_time(&t0, TIMEBASE_SZ);
return (((double)t1.tb_high - (double)t0.tb_high) * 1.0e9 +
((double)t1.tb_low - (double)t0.tb_low));
}
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/*
* PowerPC ``cycle'' counter using the time base register.
*/
#if ((((defined(__GNUC__) && (defined(__powerpc__) || defined(__ppc__))) || (defined(__MWERKS__) && defined(macintosh)))) || (defined(__IBM_GCC_ASM) && (defined(__powerpc__) || defined(__ppc__)))) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long long ticks;
static __inline__ ticks getticks(void)
{
unsigned int tbl, tbu0, tbu1;
do {
__asm__ __volatile__ ("mftbu %0" : "=r"(tbu0));
__asm__ __volatile__ ("mftb %0" : "=r"(tbl));
__asm__ __volatile__ ("mftbu %0" : "=r"(tbu1));
} while (tbu0 != tbu1);
return (((unsigned long long)tbu0) << 32) | tbl;
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/* MacOS/Mach (Darwin) time-base register interface (unlike UpTime,
from Carbon, requires no additional libraries to be linked). */
#if defined(HAVE_MACH_ABSOLUTE_TIME) && defined(HAVE_MACH_MACH_TIME_H) && !defined(HAVE_TICK_COUNTER)
#include <mach/mach_time.h>
typedef uint64_t ticks;
#define getticks mach_absolute_time
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/*
* Pentium cycle counter
*/
#if (defined(__GNUC__) || defined(__ICC)) && defined(__i386__) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long long ticks;
static __inline__ ticks getticks(void)
{
ticks ret;
__asm__ __volatile__("rdtsc": "=A" (ret));
/* no input, nothing else clobbered */
return ret;
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */
#endif
/* Visual C++ -- thanks to Morten Nissov for his help with this */
#if _MSC_VER >= 1200 && _M_IX86 >= 500 && !defined(HAVE_TICK_COUNTER)
#include <windows.h>
typedef LARGE_INTEGER ticks;
#define RDTSC __asm __emit 0fh __asm __emit 031h /* hack for VC++ 5.0 */
static __inline ticks getticks(void)
{
ticks retval;
__asm {
RDTSC
mov retval.HighPart, edx
mov retval.LowPart, eax
}
return retval;
}
static __inline double elapsed(ticks t1, ticks t0)
{
return (double)t1.QuadPart - (double)t0.QuadPart;
}
#define HAVE_TICK_COUNTER
#define TIME_MIN 5000.0 /* unreliable pentium IV cycle counter */
#endif
/*----------------------------------------------------------------*/
/*
* X86-64 cycle counter
*/
#if (defined(__GNUC__) || defined(__ICC) || defined(__SUNPRO_C)) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long long ticks;
static __inline__ ticks getticks(void)
{
unsigned a, d;
asm volatile("rdtsc" : "=a" (a), "=d" (d));
return ((ticks)a) | (((ticks)d) << 32);
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/* PGI compiler, courtesy Cristiano Calonaci, Andrea Tarsi, & Roberto Gori.
NOTE: this code will fail to link unless you use the -Masmkeyword compiler
option (grrr). */
#if defined(__PGI) && defined(__x86_64__) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long long ticks;
static ticks getticks(void)
{
asm(" rdtsc; shl $0x20,%rdx; mov %eax,%eax; or %rdx,%rax; ");
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/* Visual C++, courtesy of Dirk Michaelis */
#if _MSC_VER >= 1400 && (defined(_M_AMD64) || defined(_M_X64)) && !defined(HAVE_TICK_COUNTER)
#include <intrin.h>
#pragma intrinsic(__rdtsc)
typedef unsigned __int64 ticks;
#define getticks __rdtsc
INLINE_ELAPSED(__inline)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/*
* IA64 cycle counter
*/
/* intel's icc/ecc compiler */
#if (defined(__EDG_VERSION) || defined(__ECC)) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long ticks;
#include <ia64intrin.h>
static __inline__ ticks getticks(void)
{
return __getReg(_IA64_REG_AR_ITC);
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/* gcc */
#if defined(__GNUC__) && defined(__ia64__) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long ticks;
static __inline__ ticks getticks(void)
{
ticks ret;
__asm__ __volatile__ ("mov %0=ar.itc" : "=r"(ret));
return ret;
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/* HP/UX IA64 compiler, courtesy Teresa L. Johnson: */
#if defined(__hpux) && defined(__ia64) && !defined(HAVE_TICK_COUNTER)
#include <machine/sys/inline.h>
typedef unsigned long ticks;
static inline ticks getticks(void)
{
ticks ret;
ret = _Asm_mov_from_ar (_AREG_ITC);
return ret;
}
INLINE_ELAPSED(inline)
#define HAVE_TICK_COUNTER
#endif
/* Microsoft Visual C++ */
#if defined(_MSC_VER) && defined(_M_IA64) && !defined(HAVE_TICK_COUNTER)
typedef unsigned __int64 ticks;
# ifdef __cplusplus
extern "C"
# endif
ticks __getReg(int whichReg);
#pragma intrinsic(__getReg)
static __inline ticks getticks(void)
{
volatile ticks temp;
temp = __getReg(3116);
return temp;
}
INLINE_ELAPSED(inline)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/*
* PA-RISC cycle counter
*/
#if defined(__hppa__) || defined(__hppa) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long ticks;
# ifdef __GNUC__
static __inline__ ticks getticks(void)
{
ticks ret;
__asm__ __volatile__("mfctl 16, %0": "=r" (ret));
/* no input, nothing else clobbered */
return ret;
}
# else
# include <machine/inline.h>
static inline unsigned long getticks(void)
{
register ticks ret;
_MFCTL(16, ret);
return ret;
}
# endif
INLINE_ELAPSED(inline)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/* S390, courtesy of James Treacy */
#if defined(__GNUC__) && defined(__s390__) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long long ticks;
static __inline__ ticks getticks(void)
{
ticks cycles;
__asm__("stck 0(%0)" : : "a" (&(cycles)) : "memory", "cc");
return cycles;
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
#if defined(__GNUC__) && defined(__alpha__) && !defined(HAVE_TICK_COUNTER)
/*
* The 32-bit cycle counter on alpha overflows pretty quickly,
* unfortunately. A 1GHz machine overflows in 4 seconds.
*/
typedef unsigned int ticks;
static __inline__ ticks getticks(void)
{
unsigned long cc;
__asm__ __volatile__ ("rpcc %0" : "=r"(cc));
return (cc & 0xFFFFFFFF);
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
#if defined(__GNUC__) && defined(__sparc_v9__) && !defined(HAVE_TICK_COUNTER)
typedef unsigned long ticks;
static __inline__ ticks getticks(void)
{
ticks ret;
__asm__ __volatile__("rd %%tick, %0" : "=r" (ret));
return ret;
}
INLINE_ELAPSED(__inline__)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
#if (defined(__DECC) || defined(__DECCXX)) && defined(__alpha) && defined(HAVE_C_ASM_H) && !defined(HAVE_TICK_COUNTER)
# include <c_asm.h>
typedef unsigned int ticks;
static __inline ticks getticks(void)
{
unsigned long cc;
cc = asm("rpcc %v0");
return (cc & 0xFFFFFFFF);
}
INLINE_ELAPSED(__inline)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/* SGI/Irix */
#if defined(HAVE_CLOCK_GETTIME) && defined(CLOCK_SGI_CYCLE) && !defined(HAVE_TICK_COUNTER)
typedef struct timespec ticks;
static inline ticks getticks(void)
{
struct timespec t;
clock_gettime(CLOCK_SGI_CYCLE, &t);
return t;
}
static inline double elapsed(ticks t1, ticks t0)
{
return ((double)t1.tv_sec - (double)t0.tv_sec) * 1.0E9 +
((double)t1.tv_nsec - (double)t0.tv_nsec);
}
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/* Cray UNICOS _rtc() intrinsic function */
#if defined(HAVE__RTC) && !defined(HAVE_TICK_COUNTER)
#ifdef HAVE_INTRINSICS_H
# include <intrinsics.h>
#endif
typedef long long ticks;
#define getticks _rtc
INLINE_ELAPSED(inline)
#define HAVE_TICK_COUNTER
#endif
/*----------------------------------------------------------------*/
/* MIPS ZBus */
#if HAVE_MIPS_ZBUS_TIMER
#if defined(__mips__) && !defined(HAVE_TICK_COUNTER)
#include <sys/mman.h>
#include <unistd.h>
#include <fcntl.h>
typedef uint64_t ticks;
static inline ticks getticks(void)
{
static uint64_t* addr = 0;
if (addr == 0)
{
uint32_t rq_addr = 0x10030000;
int fd;
int pgsize;
pgsize = getpagesize();
fd = open ("/dev/mem", O_RDONLY | O_SYNC, 0);
if (fd < 0) {
perror("open");
return NULL;
}
addr = mmap(0, pgsize, PROT_READ, MAP_SHARED, fd, rq_addr);
close(fd);
if (addr == (uint64_t *)-1) {
perror("mmap");
return NULL;
}
}
return *addr;
}
INLINE_ELAPSED(inline)
#define HAVE_TICK_COUNTER
#endif
#endif /* HAVE_MIPS_ZBUS_TIMER */

37
fp.h 普通文件
查看文件

@ -0,0 +1,37 @@
#ifndef FP_H
#define FP_H
#include "u512.h"
/* fp is in the Montgomery domain, so interpreting that
as an integer should never make sense.
enable compiler warnings when mixing up u512 and fp. */
typedef struct fp {
u512 x;
} fp;
extern const fp fp_0;
extern const fp fp_1;
void fp_set(fp *x, uint64_t y);
void fp_cswap(fp *x, fp *y, bool c);
void fp_enc(fp *x, u512 const *y); /* encode to Montgomery representation */
void fp_dec(u512 *x, fp const *y); /* decode from Montgomery representation */
void fp_add2(fp *x, fp const *y);
void fp_sub2(fp *x, fp const *y);
void fp_mul2(fp *x, fp const *y);
void fp_add3(fp *x, fp const *y, fp const *z);
void fp_sub3(fp *x, fp const *y, fp const *z);
void fp_mul3(fp *x, fp const *y, fp const *z);
void fp_sq1(fp *x);
void fp_sq2(fp *x, fp const *y);
void fp_inv(fp *x);
bool fp_issquare(fp const *x);
void fp_random(fp *x);
#endif

452
fp.s 普通文件
查看文件

@ -0,0 +1,452 @@
.intel_syntax noprefix
.section .rodata
.set pbits, 511
p:
.quad 0x1b81b90533c6c87b, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
.global fp_0
fp_0: .quad 0, 0, 0, 0, 0, 0, 0, 0
.global fp_1
fp_1: /* 2^512 mod p */
.quad 0xc8fc8df598726f0a, 0x7b1bc81750a6af95, 0x5d319e67c1e961b4, 0xb0aa7275301955f1
.quad 0x4a080672d9ba6c64, 0x97a5ef8a246ee77b, 0x06ea9e5d4383676a, 0x3496e2e117e0ec80
/* (2^512)^2 mod p */
.r_squared_mod_p:
.quad 0x36905b572ffc1724, 0x67086f4525f1f27d, 0x4faf3fbfd22370ca, 0x192ea214bcc584b1
.quad 0x5dae03ee2f5de3d0, 0x1e9248731776b371, 0xad5f166e20e4f52d, 0x4ed759aea6f3917e
/* -p^-1 mod 2^64 */
.inv_min_p_mod_r:
.quad 0x66c1301f632e294d
.section .text
.global fp_copy
fp_copy:
cld
mov rcx, 8
rep movsq
ret
.global fp_set
fp_set:
push rdi
call u512_set
pop rdi
mov rsi, rdi
jmp fp_enc
.global fp_cswap
fp_cswap:
movzx rax, dl
neg rax
.set k, 0
.rept 8
mov rcx, [rdi + 8*k]
mov rdx, [rsi + 8*k]
mov r8, rcx
xor r8, rdx
and r8, rax
xor rcx, r8
xor rdx, r8
mov [rdi + 8*k], rcx
mov [rsi + 8*k], rdx
.set k, k+1
.endr
ret
.reduce_once:
push rbp
mov rbp, rdi
mov rdi, [rbp + 0]
sub rdi, [rip + p + 0]
mov rsi, [rbp + 8]
sbb rsi, [rip + p + 8]
mov rdx, [rbp + 16]
sbb rdx, [rip + p + 16]
mov rcx, [rbp + 24]
sbb rcx, [rip + p + 24]
mov r8, [rbp + 32]
sbb r8, [rip + p + 32]
mov r9, [rbp + 40]
sbb r9, [rip + p + 40]
mov r10, [rbp + 48]
sbb r10, [rip + p + 48]
mov r11, [rbp + 56]
sbb r11, [rip + p + 56]
setnc al
movzx rax, al
neg rax
.macro cswap2, r, m
xor \r, \m
and \r, rax
xor \m, \r
.endm
cswap2 rdi, [rbp + 0]
cswap2 rsi, [rbp + 8]
cswap2 rdx, [rbp + 16]
cswap2 rcx, [rbp + 24]
cswap2 r8, [rbp + 32]
cswap2 r9, [rbp + 40]
cswap2 r10, [rbp + 48]
cswap2 r11, [rbp + 56]
pop rbp
ret
.global fp_add3
fp_add3:
push rdi
call u512_add3
pop rdi
jmp .reduce_once
.global fp_add2
fp_add2:
mov rdx, rdi
jmp fp_add3
.global fp_sub3
fp_sub3:
push rdi
call u512_sub3
pop rdi
xor rsi, rsi
xor rdx, rdx
xor rcx, rcx
xor r8, r8
xor r9, r9
xor r10, r10
xor r11, r11
test rax, rax
cmovnz rax, [rip + p + 0]
cmovnz rsi, [rip + p + 8]
cmovnz rdx, [rip + p + 16]
cmovnz rcx, [rip + p + 24]
cmovnz r8, [rip + p + 32]
cmovnz r9, [rip + p + 40]
cmovnz r10, [rip + p + 48]
cmovnz r11, [rip + p + 56]
add [rdi + 0], rax
adc [rdi + 8], rsi
adc [rdi + 16], rdx
adc [rdi + 24], rcx
adc [rdi + 32], r8
adc [rdi + 40], r9
adc [rdi + 48], r10
adc [rdi + 56], r11
ret
.global fp_sub2
fp_sub2:
mov rdx, rdi
xchg rsi, rdx
jmp fp_sub3
/* Montgomery arithmetic */
.global fp_enc
fp_enc:
lea rdx, [rip + .r_squared_mod_p]
jmp fp_mul3
.global fp_dec
fp_dec:
lea rdx, [rip + u512_1]
jmp fp_mul3
.global fp_mul3
fp_mul3:
push rbp
push rbx
push r12
push r13
push r14
push r15
push rdi
mov rdi, rsi
mov rsi, rdx
xor r8, r8
xor r9, r9
xor r10, r10
xor r11, r11
xor r12, r12
xor r13, r13
xor r14, r14
xor r15, r15
xor rbp, rbp
/* flags are already cleared */
.macro MULSTEP, k, r0, r1, r2, r3, r4, r5, r6, r7, r8
mov rdx, [rsi + 0]
mulx rcx, rdx, [rdi + 8*\k]
add rdx, \r0
mulx rcx, rdx, [rip + .inv_min_p_mod_r]
xor rax, rax /* clear flags */
mulx rbx, rax, [rip + p + 0]
adox \r0, rax
mulx rcx, rax, [rip + p + 8]
adcx \r1, rbx
adox \r1, rax
mulx rbx, rax, [rip + p + 16]
adcx \r2, rcx
adox \r2, rax
mulx rcx, rax, [rip + p + 24]
adcx \r3, rbx
adox \r3, rax
mulx rbx, rax, [rip + p + 32]
adcx \r4, rcx
adox \r4, rax
mulx rcx, rax, [rip + p + 40]
adcx \r5, rbx
adox \r5, rax
mulx rbx, rax, [rip + p + 48]
adcx \r6, rcx
adox \r6, rax
mulx rcx, rax, [rip + p + 56]
adcx \r7, rbx
adox \r7, rax
mov rax, 0
adcx \r8, rcx
adox \r8, rax
mov rdx, [rdi + 8*\k]
xor rax, rax /* clear flags */
mulx rbx, rax, [rsi + 0]
adox \r0, rax
mulx rcx, rax, [rsi + 8]
adcx \r1, rbx
adox \r1, rax
mulx rbx, rax, [rsi + 16]
adcx \r2, rcx
adox \r2, rax
mulx rcx, rax, [rsi + 24]
adcx \r3, rbx
adox \r3, rax
mulx rbx, rax, [rsi + 32]
adcx \r4, rcx
adox \r4, rax
mulx rcx, rax, [rsi + 40]
adcx \r5, rbx
adox \r5, rax
mulx rbx, rax, [rsi + 48]
adcx \r6, rcx
adox \r6, rax
mulx rcx, rax, [rsi + 56]
adcx \r7, rbx
adox \r7, rax
mov rax, 0
adcx \r8, rcx
adox \r8, rax
.endm
MULSTEP 0, r8, r9, r10, r11, r12, r13, r14, r15, rbp
MULSTEP 1, r9, r10, r11, r12, r13, r14, r15, rbp, r8
MULSTEP 2, r10, r11, r12, r13, r14, r15, rbp, r8, r9
MULSTEP 3, r11, r12, r13, r14, r15, rbp, r8, r9, r10
MULSTEP 4, r12, r13, r14, r15, rbp, r8, r9, r10, r11
MULSTEP 5, r13, r14, r15, rbp, r8, r9, r10, r11, r12
MULSTEP 6, r14, r15, rbp, r8, r9, r10, r11, r12, r13
MULSTEP 7, r15, rbp, r8, r9, r10, r11, r12, r13, r14
pop rdi
mov [rdi + 0], rbp
mov [rdi + 8], r8
mov [rdi + 16], r9
mov [rdi + 24], r10
mov [rdi + 32], r11
mov [rdi + 40], r12
mov [rdi + 48], r13
mov [rdi + 56], r14
pop r15
pop r14
pop r13
pop r12
pop rbx
pop rbp
jmp .reduce_once
.global fp_mul2
fp_mul2:
mov rdx, rdi
jmp fp_mul3
.global fp_sq2
fp_sq2:
/* TODO implement optimized Montgomery squaring */
mov rdx, rsi
jmp fp_mul3
.global fp_sq1
fp_sq1:
mov rsi, rdi
jmp fp_sq2
/* (obviously) not constant time in the exponent! */
.fp_pow:
push rbx
mov rbx, rsi
push r12
push r13
push rdi
sub rsp, 64
mov rsi, rdi
mov rdi, rsp
call fp_copy
mov rdi, [rsp + 64]
lea rsi, [rip + fp_1]
call fp_copy
.macro POWSTEP, k
mov r13, [rbx + 8*\k]
xor r12, r12
0:
test r13, 1
jz 1f
mov rdi, [rsp + 64]
mov rsi, rsp
call fp_mul2
1:
mov rdi, rsp
call fp_sq1
shr r13
inc r12
test r12, 64
jz 0b
.endm
POWSTEP 0
POWSTEP 1
POWSTEP 2
POWSTEP 3
POWSTEP 4
POWSTEP 5
POWSTEP 6
POWSTEP 7
add rsp, 64+8
pop r13
pop r12
pop rbx
ret
.section .rodata
.p_minus_2:
.quad 0x1b81b90533c6c879, 0xc2721bf457aca835, 0x516730cc1f0b4f25, 0xa7aac6c567f35507
.quad 0x5afbfcc69322c9cd, 0xb42d083aedc88c42, 0xfc8ab0d15e3e4c4a, 0x65b48e8f740f89bf
.section .text
/* TODO use a better addition chain? */
.global fp_inv
fp_inv:
lea rsi, [rip + .p_minus_2]
jmp .fp_pow
.section .rodata
.p_minus_1_halves:
.quad 0x8dc0dc8299e3643d, 0xe1390dfa2bd6541a, 0xa8b398660f85a792, 0xd3d56362b3f9aa83
.quad 0x2d7dfe63499164e6, 0x5a16841d76e44621, 0xfe455868af1f2625, 0x32da4747ba07c4df
.section .text
/* TODO use a better addition chain? */
.global fp_issquare
fp_issquare:
push rdi
lea rsi, [rip + .p_minus_1_halves]
call .fp_pow
pop rdi
xor rax, rax
.set k, 0
.rept 8
mov rsi, [rdi + 8*k]
xor rsi, [rip + fp_1 + 8*k]
or rax, rsi
.set k, k+1
.endr
test rax, rax
setz al
movzx rax, al
ret
/* not constant time (but this shouldn't leak anything of importance) */
.global fp_random
fp_random:
push rdi
mov rsi, 64
call randombytes
pop rdi
mov rax, 1
shl rax, (pbits % 64)
dec rax
and [rdi + 56], rax
.set k, 7
.rept 8
mov rax, [rip + p + 8*k]
cmp [rdi + 8*k], rax
jge fp_random
jl 0f
.set k, k-1
.endr
0:
ret

99
main.c 普通文件
查看文件

@ -0,0 +1,99 @@
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <time.h>
#include <assert.h>
#include "u512.h"
#include "fp.h"
#include "mont.h"
#include "csidh.h"
void u512_print(u512 const *x)
{
for (size_t i = 63; i < 64; --i)
printf("%02hhx", i[(unsigned char *) x->c]);
}
void fp_print(fp const *x)
{
u512 y;
fp_dec(&y, x);
u512_print(&y);
}
int main()
{
clock_t t0, t1;
private_key priv_alice, priv_bob;
public_key pub_alice, pub_bob;
public_key shared_alice, shared_bob;
printf("\n");
t0 = clock();
csidh_private(&priv_alice);
t1 = clock();
printf("Alice's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
for (size_t i = 0; i < sizeof(priv_alice); ++i)
printf("%02hhx", i[(uint8_t *) &priv_alice]);
printf("\n\n");
t0 = clock();
csidh_private(&priv_bob);
t1 = clock();
printf("Bob's private key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
for (size_t i = 0; i < sizeof(priv_bob); ++i)
printf("%02hhx", i[(uint8_t *) &priv_bob]);
printf("\n\n");
t0 = clock();
assert(csidh(&pub_alice, &base, &priv_alice));
t1 = clock();
printf("Alice's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
fp_print(&pub_alice.A);
printf("\n\n");
t0 = clock();
assert(csidh(&pub_bob, &base, &priv_bob));
t1 = clock();
printf("Bob's public key (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
fp_print(&pub_bob.A);
printf("\n\n");
t0 = clock();
assert(csidh(&shared_alice, &pub_bob, &priv_alice));
t1 = clock();
printf("Alice's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
fp_print(&shared_alice.A);
printf("\n\n");
t0 = clock();
assert(csidh(&shared_bob, &pub_alice, &priv_bob));
t1 = clock();
printf("Bob's shared secret (%7.3lf ms):\n ", 1000. * (t1 - t0) / CLOCKS_PER_SEC);
fp_print(&shared_bob.A);
printf("\n\n");
printf(" ");
if (memcmp(&shared_alice, &shared_bob, sizeof(public_key)))
printf("\x1b[31mNOT EQUAL!\x1b[0m\n");
else
printf("\x1b[32mequal.\x1b[0m\n");
printf("\n");
printf("\n");
}

203
mont.c 普通文件
查看文件

@ -0,0 +1,203 @@
#include <assert.h>
#include "mont.h"
#include "u512.h"
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A24)
{
fp tmp0, tmp1, tmp2; //requires precomputation of A24=(A+2C:4C)
fp_add3(&tmp0, &P->x, &P->z);
fp_sub3(&tmp1, &P->x, &P->z);
fp_sq2(&R->x, &tmp0);
fp_sub3(&tmp2, &Q->x, &Q->z);
fp_add3(&S->x, &Q->x, &Q->z);
fp_mul2(&tmp0, &tmp2);
fp_sq2(&R->z, &tmp1);
fp_mul2(&tmp1, &S->x);
fp_sub3(&tmp2, &R->x, &R->z);
fp_mul2(&R->z, &A24->z);
fp_mul2(&R->x, &R->z);
fp_mul3(&S->x, &A24->x, &tmp2);
fp_sub3(&S->z, &tmp0, &tmp1);
fp_add2(&R->z, &S->x);
fp_add3(&S->x, &tmp0, &tmp1);
fp_mul2(&R->z, &tmp2);
fp_sq1(&S->z);
fp_sq1(&S->x);
fp_mul2(&S->z, &PQ->x);
fp_mul2(&S->x, &PQ->z);
}
void xDBL(proj *Q, proj const *A, proj const *P)
{
fp a, b, c;
fp_add3(&a, &P->x, &P->z);
fp_sq1(&a);
fp_sub3(&b, &P->x, &P->z);
fp_sq1(&b);
fp_sub3(&c, &a, &b);
fp_add2(&b, &b); fp_add2(&b, &b); /* multiplication by 4 */
fp_mul2(&b, &A->z);
fp_mul3(&Q->x, &a, &b);
fp_add3(&a, &A->z, &A->z); /* multiplication by 2 */
fp_add2(&a, &A->x);
fp_mul2(&a, &c);
fp_add2(&a, &b);
fp_mul3(&Q->z, &a, &c);
}
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ)
{
fp a, b, c, d;
fp_add3(&a, &P->x, &P->z);
fp_sub3(&b, &P->x, &P->z);
fp_add3(&c, &Q->x, &Q->z);
fp_sub3(&d, &Q->x, &Q->z);
fp_mul2(&a, &d);
fp_mul2(&b, &c);
fp_add3(&c, &a, &b);
fp_sub3(&d, &a, &b);
fp_sq1(&c);
fp_sq1(&d);
fp_mul3(&S->x, &PQ->z, &c);
fp_mul3(&S->z, &PQ->x, &d);
}
/* Montgomery ladder. */
/* P must not be the unique point of order 2. */
/* not constant-time! */
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k)
{
proj R = *P;
proj A24;
const proj Pcopy = *P; /* in case Q = P */
Q->x = fp_1;
Q->z = fp_0;
fp_add3(&A24.x, &A->z, &A->z); //precomputation of A24=(A+2C:4C)
fp_add3(&A24.z, &A24.x, &A24.x);
fp_add2(&A24.x, &A->x);
unsigned long i = 512;
while (--i && !u512_bit(k, i));
do {
bool bit = u512_bit(k, i);
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
//fp_cswap(&Q->x, &R.x, bit);
//fp_cswap(&Q->z, &R.z, bit);
xDBLADD(Q, &R, Q, &R, &Pcopy, &A24);
if (bit) { proj T = *Q; *Q = R; R = T; } /* not constant-time */
//fp_cswap(&Q->x, &R.x, bit);
//fp_cswap(&Q->z, &R.z, bit);
} while (i--);
}
//simultaneous square-and-multiply, computes x^exp and y^exp
void exp_by_squaring_(fp* x, fp* y, uint64_t exp)
{
fp result1, result2;
fp_set(&result1, 1);
fp_set(&result2, 1);
while (exp)
{
if (exp & 1){
fp_mul2(&result1, x);
fp_mul2(&result2, y);
}
fp_sq1(x);
fp_sq1(y);
exp >>= 1;
}
fp_cswap(&result1, x, 1);
fp_cswap(&result2, y, 1);
}
/* computes the isogeny with kernel point K of order k */
/* returns the new curve coefficient A and the image of P */
/* (obviously) not constant time in k */
void xISOG(proj *A, proj *P, proj const *K, uint64_t k)
{
assert (k >= 3);
assert (k % 2 == 1);
fp tmp0, tmp1, tmp2, Psum, Pdif;
proj Q, Aed, prod;
fp_add3(&Aed.z, &A->z, &A->z); //compute twisted Edwards curve coefficients
fp_add3(&Aed.x, &A->x, &Aed.z);
fp_sub3(&Aed.z, &A->x, &Aed.z);
fp_add3(&Psum, &P->x, &P->z); //precomputations
fp_sub3(&Pdif, &P->x, &P->z);
fp_sub3(&prod.x, &K->x, &K->z);
fp_add3(&prod.z, &K->x, &K->z);
fp_mul3(&tmp1, &prod.x, &Psum);
fp_mul3(&tmp0, &prod.z, &Pdif);
fp_add3(&Q.x, &tmp0, &tmp1);
fp_sub3(&Q.z, &tmp0, &tmp1);
proj M[3] = {*K};
xDBL(&M[1], A, K);
for (uint64_t i = 1; i < k / 2; ++i) {
if (i >= 2)
xADD(&M[i % 3], &M[(i - 1) % 3], K, &M[(i - 2) % 3]);
fp_sub3(&tmp1, &M[i % 3].x, &M[i % 3].z);
fp_add3(&tmp0, &M[i % 3].x, &M[i % 3].z);
fp_mul2(&prod.x, &tmp1);
fp_mul2(&prod.z, &tmp0);
fp_mul2(&tmp1, &Psum);
fp_mul2(&tmp0, &Pdif);
fp_add3(&tmp2, &tmp0, &tmp1);
fp_mul2(&Q.x, &tmp2);
fp_sub3(&tmp2, &tmp0, &tmp1);
fp_mul2(&Q.z, &tmp2);
}
// point evaluation
fp_sq1(&Q.x);
fp_sq1(&Q.z);
fp_mul2(&P->x, &Q.x);
fp_mul2(&P->z, &Q.z);
//compute Aed.x^k, Aed.z^k
exp_by_squaring_(&Aed.x, &Aed.z, k);
//compute prod.x^8, prod.z^8
fp_sq1(&prod.x);
fp_sq1(&prod.x);
fp_sq1(&prod.x);
fp_sq1(&prod.z);
fp_sq1(&prod.z);
fp_sq1(&prod.z);
//compute image curve parameters
fp_mul2(&Aed.z, &prod.x);
fp_mul2(&Aed.x, &prod.z);
//compute Montgomery params
fp_add3(&A->x, &Aed.x, &Aed.z);
fp_sub3(&A->z, &Aed.x, &Aed.z);
fp_add2(&A->x, &A->x);
}

19
mont.h 普通文件
查看文件

@ -0,0 +1,19 @@
#ifndef MONT_H
#define MONT_H
#include "u512.h"
#include "fp.h"
/* P^1 over fp. */
typedef struct proj {
fp x;
fp z;
} proj;
void xDBL(proj *Q, proj const *A, proj const *P);
void xADD(proj *S, proj const *P, proj const *Q, proj const *PQ);
void xDBLADD(proj *R, proj *S, proj const *P, proj const *Q, proj const *PQ, proj const *A);
void xMUL(proj *Q, proj const *A, proj const *P, u512 const *k);
void xISOG(proj *A, proj *P, proj const *K, uint64_t k);
#endif

18
rng.c 普通文件
查看文件

@ -0,0 +1,18 @@
#include "rng.h"
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
void randombytes(void *x, size_t l)
{
static int fd = -1;
ssize_t n;
if (fd < 0 && 0 > (fd = open("/dev/urandom", O_RDONLY)))
exit(1);
for (size_t i = 0; i < l; i += n)
if (0 >= (n = read(fd, (char *) x + i, l - i)))
exit(2);
}

8
rng.h 普通文件
查看文件

@ -0,0 +1,8 @@
#ifndef RNG_H
#define RNG_H
#include <stdlib.h>
void randombytes(void *x, size_t l);
#endif

22
u512.h 普通文件
查看文件

@ -0,0 +1,22 @@
#ifndef UINT_H
#define UINT_H
#include <stdbool.h>
#include <stdint.h>
typedef struct u512 {
uint64_t c[8];
} u512;
extern const u512 u512_1;
void u512_set(u512 *x, uint64_t y);
bool u512_bit(u512 const *x, uint64_t k);
bool u512_add3(u512 *x, u512 const *y, u512 const *z); /* returns carry */
bool u512_sub3(u512 *x, u512 const *y, u512 const *z); /* returns borrow */
void u512_mul3_64(u512 *x, u512 const *y, uint64_t z);
#endif

102
u512.s 普通文件
查看文件

@ -0,0 +1,102 @@
.intel_syntax noprefix
.section .rodata
.global u512_1
u512_1: .quad 1, 0, 0, 0, 0, 0, 0, 0
.section .text
.global u512_set
u512_set:
cld
mov rax, rsi
stosq
xor rax, rax
mov rcx, 7
rep stosq
ret
.global u512_bit
u512_bit:
mov rcx, rsi
and rcx, 0x3f
shr rsi, 6
mov rax, [rdi + 8*rsi]
shr rax, cl
and rax, 1
ret
.global u512_add3
u512_add3:
mov rax, [rsi + 0]
add rax, [rdx + 0]
mov [rdi + 0], rax
.set k, 1
.rept 7
mov rax, [rsi + 8*k]
adc rax, [rdx + 8*k]
mov [rdi + 8*k], rax
.set k, k+1
.endr
setc al
movzx rax, al
ret
.global u512_sub3
u512_sub3:
mov rax, [rsi + 0]
sub rax, [rdx + 0]
mov [rdi + 0], rax
.set k, 1
.rept 7
mov rax, [rsi + 8*k]
sbb rax, [rdx + 8*k]
mov [rdi + 8*k], rax
.set k, k+1
.endr
setc al
movzx rax, al
ret
.global u512_mul3_64
u512_mul3_64:
mulx r10, rax, [rsi + 0]
mov [rdi + 0], rax
mulx r11, rax, [rsi + 8]
add rax, r10
mov [rdi + 8], rax
mulx r10, rax, [rsi + 16]
adcx rax, r11
mov [rdi + 16], rax
mulx r11, rax, [rsi + 24]
adcx rax, r10
mov [rdi + 24], rax
mulx r10, rax, [rsi + 32]
adcx rax, r11
mov [rdi + 32],rax
mulx r11, rax, [rsi + 40]
adcx rax, r10
mov [rdi + 40],rax
mulx r10, rax, [rsi + 48]
adcx rax, r11
mov [rdi + 48],rax
mulx r11, rax, [rsi + 56]
adcx rax, r10
mov [rdi + 56],rax
ret