@@ -0,0 +1,39 @@ | |||||
# Builds two binaries from the same C sources:
#   sidh_ifma    - links the real AVX-512 IFMA assembly (SRC_REAL)
#   sidh_standin - links generated copies of that assembly in which the
#                  vpmadd52luq / vpmadd52huq mnemonics are textually replaced
#                  by VFMADD231PD (presumably so the code can be timed on CPUs
#                  without IFMA - confirm; results of that binary are not
#                  expected to be meaningful).

# Remove a half-written target (e.g. a truncated *_standin.S) if its recipe fails.
.DELETE_ON_ERROR:
.PHONY: all clean

# NOTE(review): GNU make predefines CC (as "cc"), so this ?= normally has no
# effect and the clang test below only fires for `make CC=clang`. Left as-is to
# avoid changing which compiler existing builds pick up.
CC ?= clang
TARGET_OS := $(shell uname -s)

# Extra AVX-512 subsets, enabled on Darwin or when CC is exactly "clang".
AVX512_EXTRA := -mavx512bw -mavx512dq -mavx512ifma

CFLAGS = -std=c99 -O3 -g -D_AMD64_ -D__LINUX__ -mavx512f
ifeq ($(TARGET_OS),Darwin)
CFLAGS += $(AVX512_EXTRA)
else ifeq ($(CC),clang)
CFLAGS += $(AVX512_EXTRA)
endif
CFLAGS += -D_MULX_ -D_ADX_

SRC_REAL := fp2_751_ifma.S fp_751_ifma.S fp2_packed_751_ifma.S
SRC_STANDIN := $(SRC_REAL:.S=_standin.S)
SOURCES := ./sidh_ref/fp_x64_asm.S ./sidh_ref/fp_x64.c ./sidh_ref/P751.c ./sidh_ref/random/random.c ./sidh_ref/sha3/fips202.c P751_ifma.c
# Only the .c entries become .o files; the .S entry is kept as-is and handed
# straight to the compiler driver on the link line.
OBJECTS := $(SOURCES:.c=.o)
EXE_REAL := sidh_ifma
EXE_STANDIN := sidh_standin

all: $(SOURCES) $(SRC_STANDIN) $(SRC_REAL) $(EXE_REAL) $(EXE_STANDIN)

# Generate each stand-in assembly file from its real counterpart.
$(SRC_STANDIN): %_standin.S: %.S
	sed 's/vpmadd52luq/VFMADD231PD/; s/vpmadd52huq/VFMADD231PD/;' $< > $@

$(EXE_REAL): main.c ./sidh_ref/sidh.c $(OBJECTS) $(SRC_REAL)
	$(CC) main.c $(OBJECTS) $(SRC_REAL) $(CFLAGS) -o $@ -DREPEAT=1 -DOUTER_REPEAT=1

$(EXE_STANDIN): main.c ./sidh_ref/sidh.c $(OBJECTS) $(SRC_STANDIN)
	$(CC) main.c $(OBJECTS) $(SRC_STANDIN) $(CFLAGS) -o $@ -DREPEAT=20 -DOUTER_REPEAT=20

# Explicit object rule. The previous rule had the literal target ".o", so it
# never applied and objects were built by make's built-in %.o rule instead.
%.o: %.c
	$(CC) $(CPPFLAGS) $(CFLAGS) -c $< -o $@

clean:
	rm -f *.o ./sidh_ref/*.o $(EXE_REAL) $(EXE_STANDIN) $(SRC_STANDIN)
@@ -0,0 +1,817 @@ | |||||
#include <stdint.h> | |||||
#include <string.h> | |||||
// Field / protocol parameters.
#define NWORDS_FIELD 15        // 64-bit words per field element (red2norm reads them at 52-bit strides)
#define MAX_INT_POINTS_ALICE 8 // capacity of the saved-point arrays in Alice's key generation
#define MAX_INT_POINTS_BOB 10  // presumably the same capacity for Bob's side - not used in the visible code
#define ALICE 0                // AliceOrBob selector values
#define BOB 1
#define OALICE_BITS 372        // scalar bits processed by the ladder for Alice
#define OBOB_BITS 379          // scalar bits processed by the ladder for Bob
#define MAX_Alice 186          // number of rows walked by Alice's tree traversal
#define MAX_Bob 239
#define NBITS_FIELD 751
#define MAXBITS_FIELD 768      // 12 * 64: size in bits of one packed (radix-2^64) field element
// Fully parenthesised so the macro is safe inside any expression
// (e.g. `n / FP2_ENCODED_BYTES`), not only in `FP2_ENCODED_BYTES / 2`.
#define FP2_ENCODED_BYTES (2 * ((NBITS_FIELD + 7) / 8))
typedef uint64_t felm_t[NWORDS_FIELD]; // one field element
typedef felm_t f2elm_t[2];             // pair of field elements; index 0 / 1 are the two components
typedef struct
{
    f2elm_t X;
    f2elm_t Z;
} point_proj; // Point representation in projective XZ Montgomery coordinates.
typedef point_proj point_proj_t[1];
// Generator tables: five field elements each. init_basis() copies rows 0,1 -> XP[0],XP[1],
// row 2 -> XQ[0] (XQ[1] is set to zero), rows 3,4 -> XR[0],XR[1].
const uint64_t A_gen_ifma[5 * NWORDS_FIELD] = {
    0x000ceab50ad8bc0d, 0x0005e457b1c2fc08, 0x000cd6e1d7d710f5, 0x000ae8738d92953d, 0x000a7ebee8a3418a, 0x0008345f03f46fba, 0x0007cfe2616c9a28, 0x000b4be50c8b9e16, 0x00039b6799643b2e, 0x000597a7ff9d56d5, 0x00021d410d97fe0a, 0x000a4a92a8f2ad52, 0x00054508e42abde4, 0x000ebf7d0178c137, 0x00000000004a0a75,
    0x000d21582e4118ad, 0x0005df400ae6cc41, 0x000aec407c2ecb7c, 0x000de8e34b521432, 0x000761e2ab085167, 0x000bcaa6094b3c50, 0x000df9ddd71032cf, 0x00057d905265605f, 0x000f7dba2681f9d7, 0x0009e9732def416c, 0x0006f77956ce00ce, 0x000576fb3094772b, 0x000b2d166e2a949f, 0x0002f665c6588ea2, 0x0000000000337a25,
    0x00026279148626cd, 0x0006b5baead56fe5, 0x000ab911fad60dc9, 0x000401e137d0bf07, 0x0004d3e925216196, 0x0005e4cd09a33740, 0x00069e4af733c538, 0x000d1169f6821367, 0x000c64ecfc721111, 0x000ba56507cd0dc7, 0x000995e4ae04dfad, 0x0007b992deeceab8, 0x0007bccd256aff1e, 0x000207f5fde1824c, 0x0000000000345cc7,
    0x00041dffd19b3e7f, 0x000b48c18e0bb844, 0x000380584b4dea99, 0x0000692de648ad31, 0x000d72761b6dfaee, 0x0005c672c3058de6, 0x000cba26fdc22397, 0x000e15f9133d4bc3, 0x000d5ae123793466, 0x000bb494276e321d, 0x000c9c99fb74cd99, 0x0005da6e4fd03f75, 0x000b95feb24d0937, 0x000e6a307e03cd17, 0x000000000044ad2e,
    0x0007f1ec71be8c36, 0x00053859b1ed78c1, 0x000529ff824d6df7, 0x000633a10839b2a8, 0x00003e9e25fdea79, 0x000a8054df1762fc, 0x000034c6467c4708, 0x000acb63530b60ec, 0x0000c6fc8c19bf71, 0x0005aca92467c3cb, 0x000d42050ba154a2, 0x000b4d5baa4ab074, 0x00044ba4962ac622, 0x0002bbf250aa70e6, 0x0000000000457f51};
const uint64_t B_gen_ifma[5 * NWORDS_FIELD] = {
    0x0001ef867ab0bcb9, 0x0009a45c76cfb6d7, 0x0001f034a5fdd76e, 0x000038b1ee69194b, 0x000e7b18a7761f3f, 0x000a486a52c84cf6, 0x0005aa75466fcf01, 0x00044164f797233f, 0x000331aeaec77db1, 0x0005185f83d9a22f, 0x000e2d4dc94f5b17, 0x0000f7b3858b15a4, 0x000635ac44515c99, 0x000a5b14eaf4ee2e, 0x000000000048e907,
    0x0004e7c075cc3a24, 0x00004aa430a49203, 0x00094c8677baf00b, 0x000b3aae0c9a755c, 0x000c4b064e9ebb08, 0x000dd04e826c661d, 0x00061f01b223684e, 0x000d43bc8a6360b6, 0x00008c633a79ab30, 0x0008e0092fbd6f39, 0x0002b9ba797337f8, 0x000fcb3252ddaf84, 0x000467ded2ca9dce, 0x0006117350e479f4, 0x00000000001ae9d1,
    0x000ed7b96c4ab279, 0x000178486ef1a8c9, 0x000c2f4299429da5, 0x000aef4926f20cd5, 0x0003b2e2858b4716, 0x000bcc3cac3eeb68, 0x0003a600460dda2f, 0x00050e6650a24c9f, 0x0004cb60c61775f8, 0x00082b196ebc78b3, 0x000cc7fec8cce966, 0x000d9b778d801d65, 0x0005324630f74af3, 0x0009018193e7592e, 0x00000000003aef05,
    0x00033769d0f314ef, 0x000e2659d11c0d67, 0x000d133f084c3086, 0x0005e23d5da27bcb, 0x0008ec9a8d586402, 0x000c781b3b645bf3, 0x000c9fb03ee6426d, 0x000ddc7bb40b83e3, 0x000bb7b4ab585e3a, 0x0006c2672e53eeaf, 0x0000397a1e62b655, 0x0004ac383daab923, 0x0008eb1ecdd2f39e, 0x000f1516da469247, 0x00000000003693cf,
    0x0007d8f72bd956dc, 0x000e9934884ae37e, 0x0003c3edd2d504b3, 0x00005d14e7fa1ecb, 0x0007610ceb75d635, 0x000b4cac446b1112, 0x000c1f70caf255b4, 0x00057d3e324d2f36, 0x0006181c3bb1a700, 0x000db2f2916ccc40, 0x00021ee51d1c92f1, 0x000c07c22031c32a, 0x000e4310e5103473, 0x00069c1148de9ef5, 0x00000000004d1227};
// Field constant copied into the Z coordinate of freshly initialised points
// (presumably the value 1 in the Montgomery / radix-2^52 representation - confirm).
const uint64_t One[NWORDS_FIELD] = {
    0x00000000249ad67c, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0001f9800c542c00, 0x000b326488fe3b2a, 0x000e6176236db777, 0x000dd6e970232b83, 0x000d4d762277573f, 0x00054cd16c015f35, 0x0009fc72438c4fc7, 0x00000000001bf8f6};
// Presumably the value 2 in the same representation as One - confirm.
const uint64_t Two[NWORDS_FIELD] = {
    0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed};
// Fixed parameters for isogeny tree computation (defined in another translation unit).
extern const unsigned int strat_Alice[MAX_Alice - 1];
extern const unsigned int strat_Bob[MAX_Bob - 1];
// Counterpart of red2norm(), defined elsewhere; used by fp2_decode() to unpack 12-word values.
void norm2red(uint64_t *res, const uint64_t *a);
void red2norm(uint64_t out[12], const uint64_t in[15])
{ // Repack 15 limbs laid out at 52-bit strides into 12 contiguous 64-bit words.
    // Limb k is XORed in at bit offset 52*k; a limb that straddles a word boundary
    // contributes its upper part to the following word.
    uint64_t packed[12] = {0};
    unsigned int limb, w;

    for (limb = 0; limb < 15; limb++)
    {
        const unsigned int bitpos = 52 * limb;
        const unsigned int word = bitpos >> 6;
        const unsigned int shift = bitpos & 63;

        packed[word] ^= in[limb] << shift;
        // shift > 12 means a 52-bit limb crosses into the next word; the last limb
        // has no following word, so its overflow is dropped.
        if (shift > 12 && word + 1 < 12)
            packed[word + 1] ^= in[limb] >> (64 - shift);
    }
    // Written out only at the end so that `out` may be the same buffer as `in`.
    for (w = 0; w < 12; w++)
        out[w] = packed[w];
}
static void init_basis(const uint64_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR)
{ // Load the three basis x-coordinates from a table of five consecutive field elements.
    // Table layout: XP[0], XP[1], XQ[0], XR[0], XR[1]; XQ[1] is not stored and is set to zero.
    const size_t elem_bytes = sizeof(felm_t);
    const uint64_t *row = gen;

    memcpy(XP[0], row, elem_bytes);
    row += NWORDS_FIELD;
    memcpy(XP[1], row, elem_bytes);
    row += NWORDS_FIELD;
    memcpy(XQ[0], row, elem_bytes);
    row += NWORDS_FIELD;
    memset(XQ[1], 0, elem_bytes);
    memcpy(XR[0], row, elem_bytes);
    row += NWORDS_FIELD;
    memcpy(XR[1], row, elem_bytes);
}
void fp2_mul_ifma(f2elm_t res, const f2elm_t a, const f2elm_t b); | |||||
void fp2_mul_ifma_x2(f2elm_t res1, const f2elm_t a1, const f2elm_t b1, f2elm_t res2, const f2elm_t a2, const f2elm_t b2); | |||||
void fp2_sqr_ifma(f2elm_t res, const f2elm_t a); | |||||
void fp2_add(f2elm_t res, const f2elm_t a, const f2elm_t b); | |||||
void fp2_sub(f2elm_t res, const f2elm_t a, const f2elm_t b); | |||||
void fp2_swap(point_proj_t a, point_proj_t b, int swap); | |||||
void fp_mul_ifma(felm_t res, felm_t a, felm_t b); | |||||
void fp_add(felm_t res, const felm_t a, const felm_t b); | |||||
void fp_sub(felm_t res, const felm_t a, const felm_t b); | |||||
void to_mont_ifma(felm_t rp, const felm_t ap); | |||||
void from_mont_ifma(felm_t rp, const felm_t ap); | |||||
void red2norm(uint64_t out[12], const felm_t in); | |||||
#define fp2mul_mont(a, b, r) fp2_mul_ifma(r, a, b) | |||||
#define fp2sqr_mont(a, r) fp2_sqr_ifma(r, a) | |||||
#define fp2add(a, b, r) fp2_add(r, a, b) | |||||
#define fp2sub(a, b, r) fp2_sub(r, a, b) | |||||
#define fp2correction | |||||
#define fpsqr_mont(a, r) fp_mul_ifma(r, a, a) | |||||
#define fpmul_mont(a, b, r) fp_mul_ifma(r, a, b) | |||||
#define fpadd(a, b, r) fp_add(r, a, b) | |||||
#define fpsub(a, b, r) fp_sub(r, a, b) | |||||
void fpinv_chain_mont(felm_t a)
{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic.
    // The fixed addition chain is stored as data: each step squares the accumulator
    // `sqr` times and then multiplies it by table entry t[idx]; idx < 0 selects the
    // input a itself. The table is constant, so the control flow does not depend on a.
    static const struct
    {
        unsigned char sqr;
        signed char idx;
    } steps[] = {
        {6, 20}, {6, 24}, {6, 11}, {6, 8}, {8, 2}, {6, 23}, {6, 2}, {9, 2},
        {10, 15}, {8, 13}, {8, 26}, {8, 20}, {6, 11}, {6, 10}, {6, 14}, {6, 4},
        {10, 18}, {6, 1}, {7, 22}, {10, 6}, {7, 24}, {6, 9}, {8, 18}, {6, 17},
        {8, -1}, {10, 16}, {6, 7}, {6, 0}, {7, 12}, {7, 19}, {6, 22}, {6, 25},
        {7, 2}, {6, 10}, {7, 22}, {8, 18}, {6, 4}, {6, 14}, {7, 13}, {6, 5},
        {6, 23}, {6, 21}, {6, 2}, {7, 23}, {8, 12}, {6, 9}, {6, 3}, {7, 13},
        {7, 17}, {8, 26}, {8, 5}, {8, 8}, {6, 2}, {6, 11}, {7, 20}};
    felm_t t[27], sq, acc;
    unsigned int s, k, rep;

    // Precomputed table: t[0] = a * a^2, then every entry is the previous one times a^2.
    // Entries 3, 9, 21 and 25 receive one additional multiplication by a^2.
    fp_mul_ifma(sq, a, a);
    fp_mul_ifma(t[0], a, sq);
    for (k = 0; k < 26; k++)
    {
        const unsigned int next = k + 1;
        fp_mul_ifma(t[next], t[k], sq);
        if (next == 3 || next == 9 || next == 21 || next == 25)
            fp_mul_ifma(t[next], t[next], sq);
    }

    // Variable part of the chain.
    memcpy(acc, a, sizeof(felm_t));
    for (s = 0; s < sizeof(steps) / sizeof(steps[0]); s++)
    {
        uint64_t *factor = (steps[s].idx < 0) ? a : t[steps[s].idx];
        for (k = 0; k < steps[s].sqr; k++)
            fp_mul_ifma(acc, acc, acc);
        fp_mul_ifma(acc, factor, acc);
    }

    // Tail of the chain: 61 repetitions of (6 squarings, multiply by t[26]).
    for (rep = 0; rep < 61; rep++)
    {
        for (k = 0; k < 6; k++)
            fp_mul_ifma(acc, acc, acc);
        fp_mul_ifma(acc, t[26], acc);
    }
    memcpy(a, acc, sizeof(felm_t));
}
void fpinv_mont(felm_t a)
{ // Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p.
    // Runs the exponentiation chain on a copy, squares the result twice and
    // multiplies it back into a (in place).
    felm_t pw;
    unsigned int k;

    memcpy(pw, a, sizeof(felm_t));
    fpinv_chain_mont(pw);
    for (k = 0; k < 2; k++)
        fp_mul_ifma(pw, pw, pw);
    fp_mul_ifma(a, a, pw);
}
void fp2inv_mont(f2elm_t a)
{ // GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2).
    // Works in place on a.
    felm_t norm, sq1;
    felm_t zero = {0};

    fp_mul_ifma(norm, a[0], a[0]); // norm = a0^2
    fp_mul_ifma(sq1, a[1], a[1]);  // sq1  = a1^2
    fp_add(norm, norm, sq1);       // norm = a0^2 + a1^2
    fpinv_mont(norm);              // norm = (a0^2 + a1^2)^-1
    fp_sub(a[1], zero, a[1]);      // a1 = 0 - a1, i.e. a becomes a0 - i*a1
    fp_mul_ifma(a[0], a[0], norm); // scale both components by the inverted norm
    fp_mul_ifma(a[1], a[1], norm);
}
void inv_3_way_ifma(f2elm_t z1, f2elm_t z2, f2elm_t z3)
{ // 3-way simultaneous inversion with a single GF(p^2) inversion.
    // Input:  z1, z2, z3
    // Output: 1/z1, 1/z2, 1/z3 (inputs are overwritten).
    f2elm_t prod12, inv_all, inv12, inv1;

    fp2_mul_ifma(prod12, z1, z2);      // prod12  = z1*z2
    fp2_mul_ifma(inv_all, z3, prod12); // inv_all = z1*z2*z3
    fp2inv_mont(inv_all);              // inv_all = 1/(z1*z2*z3)
    fp2_mul_ifma(inv12, z3, inv_all);  // inv12   = 1/(z1*z2)
    // Paired multiply: inv1 = inv12*z2 = 1/z1 and z2 = inv12*z1 = 1/z2.
    // 1/z1 is parked in inv1 because z1 is still an operand of the second product.
    fp2_mul_ifma_x2(inv1, inv12, z2, z2, inv12, z1);
    fp2_mul_ifma(z3, prod12, inv_all); // z3 = (z1*z2)/(z1*z2*z3) = 1/z3
    memcpy(z1, inv1, sizeof(f2elm_t));
}
void xDBLADD_ifma(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24)
{ // Simultaneous doubling and differential addition.
    // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ), affine difference xPQ=x(P-Q)
    //        and Montgomery curve constant A24=(A+2)/4.
    // Output: P <- 2*P = (X2P:Z2P) and Q <- P+Q = (XQP:ZQP).
    // Independent multiplications are issued in pairs through fp2_mul_ifma_x2.
    f2elm_t u, v, w, s;

    fp2_add(u, P->X, P->Z);                              // u = XP+ZP
    fp2_sub(v, P->X, P->Z);                              // v = XP-ZP
    fp2_mul_ifma_x2(P->X, u, u, P->Z, v, v);             // XP = (XP+ZP)^2, ZP = (XP-ZP)^2
    fp2_add(w, Q->X, Q->Z);                              // w = XQ+ZQ
    fp2_sub(s, Q->X, Q->Z);                              // s = XQ-ZQ
    fp2_mul_ifma_x2(v, v, w, u, u, s);                   // v = (XP-ZP)*(XQ+ZQ), u = (XP+ZP)*(XQ-ZQ)
    fp2_sub(w, P->X, P->Z);                              // w = (XP+ZP)^2-(XP-ZP)^2
    fp2_sub(Q->Z, u, v);                                 // ZQ = u-v
    fp2_mul_ifma_x2(P->X, P->X, P->Z, Q->X, A24, w);     // XP = (XP+ZP)^2*(XP-ZP)^2, XQ = A24*w
    fp2_add(P->Z, Q->X, P->Z);                           // ZP = A24*w+(XP-ZP)^2
    fp2_add(Q->X, u, v);                                 // XQ = u+v
    fp2_mul_ifma_x2(Q->Z, Q->Z, Q->Z, Q->X, Q->X, Q->X); // ZQ = (u-v)^2, XQ = (u+v)^2
    fp2_mul_ifma_x2(P->Z, P->Z, w, Q->Z, Q->Z, xPQ);     // ZP = ZP*w, ZQ = xPQ*(u-v)^2
}
static void LADDER3PT_ifma(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint64_t *m, const unsigned int AliceOrBob, point_proj_t R)
{ // Three-point ladder driven by the low bits of the scalar m.
    // Inputs: x-coordinates xP, xQ, xPQ; scalar m; AliceOrBob selects how many bits of m
    //         are processed (OALICE_BITS for ALICE, OBOB_BITS otherwise).
    // Output: R. Presumably R = x(P + [m]Q) as in the SIDH reference ladder - confirm.
    // The curve constant A24 is fixed inside this function.
    point_proj_t R0 = {0}, R2 = {0};
    const f2elm_t A24 = {
        {0x00000000124d6b3e, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000fcc0062a1600, 0x000d9932447f1d95, 0x000f30bb11b6dbbb, 0x000eeb74b81195c1, 0x000ea6bb113bab9f, 0x000aa668b600af9a, 0x0004fe3921c627e3, 0x00000000000dfc7b},
        {0}};
    // The scalar is read one byte at a time. The key-generation caller in this file
    // passes a byte buffer cast to uint64_t*, so 64-bit loads could be misaligned and
    // the final load could reach past the bytes that hold the nbits scalar bits.
    // On a little-endian target (this code is x86-64 only) the extracted bit is the
    // same as (m[i >> 6] >> (i & 63)) & 1.
    const unsigned char *scalar = (const unsigned char *)m;
    const int nbits = (AliceOrBob == ALICE) ? OALICE_BITS : OBOB_BITS;
    int i, bit, swap, prevbit = 0;

    // Initializing points: R0 = (xQ : One), R2 = (xPQ : One), R = (xP : One).
    // The second components of R0->Z and R2->Z stay zero from the initialisers above.
    memcpy(R0->X, xQ, sizeof(f2elm_t));
    memcpy(R0->Z[0], One, sizeof(felm_t));
    memcpy(R2->X, xPQ, sizeof(f2elm_t));
    memcpy(R2->Z[0], One, sizeof(felm_t));
    memcpy(R->X, xP, sizeof(f2elm_t));
    memcpy(R->Z[0], One, sizeof(felm_t));
    memset(R->Z[1], 0, sizeof(felm_t));
    // Main loop
    for (i = 0; i < nbits; i++)
    {
        bit = (scalar[i >> 3] >> (i & 7)) & 1;
        swap = bit ^ prevbit; // swap only when the current bit differs from the previous one
        prevbit = bit;
        fp2_swap(R, R2, swap);
        xDBLADD_ifma(R0, R2, R->X, A24);
        fp2_mul_ifma(R2->X, R->Z, R2->X); // R2->X = R->Z * R2->X
    }
}
static void xDBL_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24)
{ // Doubling of a Montgomery point in projective coordinates (X:Z).
    // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C.
    // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2).
    // P is only read by the first two operations, so Q may be the same point as P.
    f2elm_t d, s, g;

    fp2_sub(d, P->X, P->Z);                        // d = X1-Z1
    fp2_add(s, P->X, P->Z);                        // s = X1+Z1
    fp2_mul_ifma_x2(d, d, d, s, s, s);             // d = (X1-Z1)^2, s = (X1+Z1)^2
    fp2_sub(g, s, d);                              // g = (X1+Z1)^2-(X1-Z1)^2
    fp2_mul_ifma_x2(Q->Z, d, C24, d, g, A24plus);  // Z2 = C24*(X1-Z1)^2, d = A24plus*g
    fp2_add(d, Q->Z, d);                           // d = A24plus*g + C24*(X1-Z1)^2
    fp2_mul_ifma_x2(Q->X, Q->Z, s, Q->Z, g, d);    // X2 = C24*(X1-Z1)^2*(X1+Z1)^2, Z2 = g*d
}
static void xDBLe_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e)
{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
    // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C.
    // Output: projective Montgomery x-coordinates Q <- (2^e)*P.
    // For e <= 0 the result is simply a copy of P.
    int i;
    // memmove rather than memcpy: the key-generation code calls this with Q == P,
    // and memcpy is undefined for overlapping objects.
    memmove(Q, P, sizeof(point_proj));
    for (i = 0; i < e; i++)
    {
        xDBL_ifma(Q, Q, A24plus, C24);
    }
}
static void xTPL_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus)
{ // Tripling of a Montgomery point in projective coordinates (X:Z).
    // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
    // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3).
    // Q is written only by the final paired product, so Q may be the same point as P.
    f2elm_t t0, t1, t2, t3, t4, t5, t6, t7, t8;

    fp2_sub(t0, P->X, P->Z);                            // t0 = X-Z
    fp2_add(t1, P->X, P->Z);                            // t1 = X+Z
    fp2_mul_ifma_x2(t2, t0, t0, t3, t1, t1);            // t2 = (X-Z)^2, t3 = (X+Z)^2
    fp2_mul_ifma_x2(t5, A24plus, t3, t6, A24minus, t2); // t5 = A24plus*(X+Z)^2, t6 = A24minus*(X-Z)^2
    fp2_mul_ifma_x2(t7, t3, t5, t8, t2, t6);            // t7 = A24plus*(X+Z)^4, t8 = A24minus*(X-Z)^4
    fp2_add(t4, t0, t1);                                // t4 = 2*X
    fp2_sub(t0, t1, t0);                                // t0 = 2*Z
    fp2_sqr_ifma(t1, t4);                               // t1 = 4*X^2
    fp2_sub(t1, t1, t3);                                // t1 = 4*X^2 - (X+Z)^2
    fp2_sub(t1, t1, t2);                                // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2
    fp2_sub(t7, t8, t7);                                // t7 = t8 - t7
    fp2_sub(t8, t5, t6);                                // t8 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
    fp2_mul_ifma(t1, t1, t8);                           // t1 = t1*t8
    fp2_add(t8, t7, t1);                                // t8 = t7 + t1
    fp2_sub(t1, t7, t1);                                // t1 = t7 - t1
    fp2_mul_ifma_x2(t8, t8, t8, t1, t1, t1);            // t8 = t8^2, t1 = t1^2
    fp2_mul_ifma_x2(Q->X, t4, t8, Q->Z, t1, t0);        // X3 = 2*X*t8, Z3 = 2*Z*t1
}
void xTPLe_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e)
{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
    // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
    // Output: projective Montgomery x-coordinates Q <- (3^e)*P.
    // For e <= 0 the result is simply a copy of P.
    int i;
    // memmove rather than memcpy so that calling with Q == P (as the doubling
    // counterpart xDBLe_ifma is called in this file) is well defined.
    memmove(Q, P, sizeof(point_proj));
    for (i = 0; i < e; i++)
    {
        xTPL_ifma(Q, Q, A24minus, A24plus);
    }
}
static void get_4_isog_ifma(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff)
{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
    // Input: projective point of order four P = (X4:Z4).
    // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C (written to
    //         A24plus and C24) and the 3 coefficients in coeff[0..2] that eval_4_isog_ifma() uses.
    fp2_sub(coeff[1], P->X, P->Z);                                       // coeff[1] = X4-Z4
    fp2_add(coeff[2], P->X, P->Z);                                       // coeff[2] = X4+Z4
    fp2_mul_ifma_x2(coeff[0], P->Z, P->Z, A24plus, P->X, P->X);          // coeff[0] = Z4^2, A24plus = X4^2
    fp2_add(coeff[0], coeff[0], coeff[0]);                               // coeff[0] = 2*Z4^2
    fp2_add(A24plus, A24plus, A24plus);                                  // A24plus = 2*X4^2
    fp2_mul_ifma_x2(C24, coeff[0], coeff[0], A24plus, A24plus, A24plus); // C24 = 4*Z4^4, A24plus = 4*X4^4
    fp2_add(coeff[0], coeff[0], coeff[0]);                               // coeff[0] = 4*Z4^2
}
static void eval_4_isog_ifma(point_proj_t P, f2elm_t *coeff)
{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
    // by the 3 coefficients in coeff (computed in the function get_4_isog_ifma()).
    // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
    // Output: the projective point P = phi(P) = (X:Z) in the codomain (P is updated in place).
    f2elm_t t0, t1;
    fp2add(P->X, P->Z, t0);                           // t0 = X+Z
    fp2sub(P->X, P->Z, t1);                           // t1 = X-Z
    fp2_mul_ifma_x2(P->X, t0, coeff[1], t0, t0, t1);  // X = (X+Z)*coeff[1], t0 = (X+Z)*(X-Z)
    fp2_mul_ifma_x2(P->Z, coeff[2], t1, t0, coeff[0], t0); // Z = (X-Z)*coeff[2], t0 = coeff[0]*(X+Z)*(X-Z)
    fp2add(P->X, P->Z, t1);                           // t1 = (X+Z)*coeff[1] + (X-Z)*coeff[2]
    fp2sub(P->X, P->Z, P->Z);                         // Z = (X+Z)*coeff[1] - (X-Z)*coeff[2]
    fp2_mul_ifma_x2(t1, t1, t1, P->Z, P->Z, P->Z);    // t1 = t1^2, Z = Z^2
    fp2add(t1, t0, P->X);                             // X = t1 + t0
    fp2sub(P->Z, t0, t0);                             // t0 = Z - t0
    fp2_mul_ifma_x2(P->X, P->X, t1, P->Z, P->Z, t0);  // Xfinal = X*t1, Zfinal = Z*t0
}
static void get_3_isog_ifma(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff)
{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
    // Input: projective point of order three P = (X3:Z3).
    // Output: the 3-isogenous Montgomery curve constants in A24minus / A24plus, and
    //         coeff[0] = X-Z, coeff[1] = X+Z for use by eval_3_isog_ifma().
    f2elm_t t0, t1, t2, t3, t4, t5;

    fp2_sub(coeff[0], P->X, P->Z);                                   // coeff0 = X-Z
    fp2_add(coeff[1], P->X, P->Z);                                   // coeff1 = X+Z
    fp2_mul_ifma_x2(t0, coeff[0], coeff[0], t1, coeff[1], coeff[1]); // t0 = (X-Z)^2, t1 = (X+Z)^2
    fp2_add(t2, t0, t1);                                             // t2 = (X+Z)^2 + (X-Z)^2
    fp2_add(t3, coeff[0], coeff[1]);                                 // t3 = 2*X
    fp2_sqr_ifma(t3, t3);                                            // t3 = 4*X^2
    fp2_sub(t3, t3, t2);                                             // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2
    fp2_add(t2, t1, t3);                                             // t2 = 4*X^2 - (X-Z)^2
    fp2_add(t3, t3, t0);                                             // t3 = 4*X^2 - (X+Z)^2
    fp2_add(t4, t0, t3);                                             // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2
    fp2_add(t4, t4, t4);                                             // t4 = 2*(4*X^2 - (X+Z)^2 + (X-Z)^2)
    fp2_add(t4, t1, t4);                                             // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2
    fp2_add(t5, t1, t2);                                             // t5 = 4*X^2 + (X+Z)^2 - (X-Z)^2
    fp2_add(t5, t5, t5);                                             // t5 = 2*(4*X^2 + (X+Z)^2 - (X-Z)^2)
    fp2_add(t5, t0, t5);                                             // t5 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2
    fp2_mul_ifma_x2(A24minus, t2, t4, t5, t5, t3);                   // A24minus = t2*t4, t5 = t5*t3
    fp2_sub(t0, t5, A24minus);                                       // t0 = t5 - A24minus
    fp2_add(A24plus, A24minus, t0);                                  // A24plus = A24minus + t0
}
static void eval_3_isog_ifma(point_proj_t Q, const f2elm_t *coeff)
{ // Evaluates a 3-isogeny at the projective point Q = (X:Z), using the two coefficients
    // in coeff (computed in the function get_3_isog_ifma()).
    // Input: projective point Q = (X:Z) and coeff[0], coeff[1].
    // Output: Q <- phi(Q), updated in place.
    f2elm_t p, m, q;

    fp2_add(p, Q->X, Q->Z);                           // p = X+Z
    fp2_sub(m, Q->X, Q->Z);                           // m = X-Z
    fp2_mul_ifma_x2(p, p, coeff[0], m, m, coeff[1]);  // p = coeff0*(X+Z), m = coeff1*(X-Z)
    fp2_add(q, p, m);                                 // q = coeff0*(X+Z) + coeff1*(X-Z)
    fp2_sub(p, m, p);                                 // p = coeff1*(X-Z) - coeff0*(X+Z)
    fp2_mul_ifma_x2(q, q, q, p, p, p);                // q = q^2, p = p^2
    fp2_mul_ifma_x2(Q->X, Q->X, q, Q->Z, Q->Z, p);    // X = X*q, Z = Z*p
}
static void fp2_encode(const f2elm_t x, unsigned char *enc) | |||||
{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes | |||||
unsigned int i; | |||||
f2elm_t tt; | |||||
uint64_t t[12 * 2]; | |||||
from_mont_ifma(tt[0], x[0]); | |||||
from_mont_ifma(tt[1], x[1]); | |||||
red2norm(t, tt[0]); | |||||
red2norm(&t[12], tt[1]); | |||||
for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) | |||||
{ | |||||
enc[i] = ((unsigned char *)t)[i]; | |||||
enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char *)t)[i + MAXBITS_FIELD / 8]; | |||||
} | |||||
} | |||||
static void fp2_decode(const unsigned char *enc, f2elm_t x)
{ // Inverse of fp2_encode(): parse an encoded GF(p^2) element and convert it to Montgomery representation.
    // Input: enc holds FP2_ENCODED_BYTES/2 bytes for each of the two components.
    // Output: x.
    unsigned int i;
    // Zero-initialised: only FP2_ENCODED_BYTES/2 of the MAXBITS_FIELD/8 bytes of each half
    // are filled from enc, and norm2red() is handed the full 12-word buffers, so the
    // unfilled top bytes must not be left indeterminate.
    uint64_t t[12 * 2] = {0};
    memset(x, 0, sizeof(f2elm_t));
    for (i = 0; i < FP2_ENCODED_BYTES / 2; i++)
    {
        ((unsigned char *)t)[i] = enc[i];
        ((unsigned char *)t)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2];
    }
    norm2red(x[0], t);
    norm2red(x[1], &t[12]);
    to_mont_ifma(x[0], x[0]);
    to_mont_ifma(x[1], x[1]);
}
int EphemeralKeyGeneration_A_ifma(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA) | |||||
{ // Alice's ephemeral public key generation | |||||
// Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. | |||||
// Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. | |||||
point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; | |||||
f2elm_t XPA, XQA, XRA, coeff[3]; | |||||
unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; | |||||
f2elm_t C24 = { | |||||
{0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed}, | |||||
{0}}; | |||||
f2elm_t A24plus = { | |||||
{0x00000000249ad67c, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0001f9800c542c00, 0x000b326488fe3b2a, 0x000e6176236db777, 0x000dd6e970232b83, 0x000d4d762277573f, 0x00054cd16c015f35, 0x0009fc72438c4fc7, 0x00000000001bf8f6}, | |||||
{0}}; | |||||
// Initialize basis points | |||||
init_basis(A_gen_ifma, XPA, XQA, XRA); | |||||
init_basis(B_gen_ifma, phiP->X, phiQ->X, phiR->X); | |||||
memcpy(phiP->Z, One, sizeof(felm_t)); | |||||
memcpy(phiQ->Z, One, sizeof(felm_t)); | |||||
memcpy(phiR->Z, One, sizeof(felm_t)); | |||||
// Retrieve kernel point | |||||
LADDER3PT_ifma(XPA, XQA, XRA, (uint64_t *)PrivateKeyA, ALICE, R); | |||||
// Traverse tree | |||||
index = 0; | |||||
for (row = 1; row < MAX_Alice; row++) | |||||
{ | |||||
while (index < MAX_Alice - row) | |||||
{ | |||||
memcpy(pts[npts]->X, R->X, sizeof(f2elm_t)); | |||||
memcpy(pts[npts]->Z, R->Z, sizeof(f2elm_t)); | |||||
pts_index[npts++] = index; | |||||
m = strat_Alice[ii++]; | |||||
xDBLe_ifma(R, R, A24plus, C24, (int)(2 * m)); | |||||
index += m; | |||||
} | |||||
get_4_isog_ifma(R, A24plus, C24, coeff); | |||||
for (i = 0; i < npts; i++) | |||||
{ | |||||
eval_4_isog_ifma(pts[i], coeff); | |||||
} | |||||
eval_4_isog_ifma(phiP, coeff); | |||||
eval_4_isog_ifma(phiQ, coeff); | |||||
eval_4_isog_ifma(phiR, coeff); | |||||
memcpy(R->X, pts[npts - 1]->X, sizeof(f2elm_t)); | |||||
memcpy(R->Z, pts[npts - 1]->Z, sizeof(f2elm_t)); | |||||
index = pts_index[npts - 1]; | |||||
npts -= 1; | |||||
} | |||||
get_4_isog_ifma(R, A24plus, C24, coeff); | |||||
eval_4_isog_ifma(phiP, coeff); | |||||
eval_4_isog_ifma(phiQ, coeff); | |||||
eval_4_isog_ifma(phiR, coeff); | |||||
inv_3_way_ifma(phiP->Z, phiQ->Z, phiR->Z); | |||||
fp2_mul_ifma_x2(phiP->X, phiP->X, phiP->Z, phiQ->X, phiQ->X, phiQ->Z); | |||||
//fp2mul_mont(phiP->X, phiP->Z, phiP->X); | |||||
//fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); | |||||
fp2mul_mont(phiR->X, phiR->Z, phiR->X); | |||||
// Format public key | |||||
fp2_encode(phiP->X, PublicKeyA); | |||||
fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); | |||||
fp2_encode(phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES); | |||||
return 0; | |||||
} | |||||
int EphemeralKeyGeneration_B_ifma(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB) | |||||
{ // Bob's ephemeral public key generation | |||||
// Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. | |||||
// Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. | |||||
point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; | |||||
f2elm_t XPB, XQB, XRB, coeff[3], A = {0}; | |||||
unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; | |||||
f2elm_t A24plus = {{0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed}, | |||||
{0}}; | |||||
f2elm_t A24minus = {{0x000fffffb6ca5307, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x0000ac8771e692ff, 0x000167add1f02031, 0x000aaabd12d63250, 0x000ca0c5879094e0, 0x0000b5598636c600, 0x0004fe180463c6f7, 0x0000268d39c8897b, 0x000000000037f3e8}, | |||||
{0}}; | |||||
uint64_t temp[12]; | |||||
uint64_t ifma_temp[15]; | |||||
// Initialize basis points | |||||
init_basis(B_gen_ifma, XPB, XQB, XRB); | |||||
init_basis(A_gen_ifma, phiP->X, phiQ->X, phiR->X); | |||||
memcpy(phiP->Z, One, sizeof(felm_t)); | |||||
memcpy(phiQ->Z, One, sizeof(felm_t)); | |||||
memcpy(phiR->Z, One, sizeof(felm_t)); | |||||
// Retrieve kernel point | |||||
LADDER3PT_ifma(XPB, XQB, XRB, (uint64_t *)PrivateKeyB, BOB, R); | |||||
// Traverse tree | |||||
index = 0; | |||||
for (row = 1; row < MAX_Bob; row++) | |||||
{ | |||||
while (index < MAX_Bob - row) | |||||
{ | |||||
memcpy(pts[npts]->X, R->X, sizeof(f2elm_t)); | |||||
memcpy(pts[npts]->Z, R->Z, sizeof(f2elm_t)); | |||||
pts_index[npts++] = index; | |||||
m = strat_Bob[ii++]; | |||||
xTPLe_ifma(R, R, A24minus, A24plus, (int)m); | |||||
index += m; | |||||
} | |||||
get_3_isog_ifma(R, A24minus, A24plus, coeff); | |||||
for (i = 0; i < npts; i++) | |||||
{ | |||||
eval_3_isog_ifma(pts[i], coeff); | |||||
} | |||||
eval_3_isog_ifma(phiP, coeff); | |||||
eval_3_isog_ifma(phiQ, coeff); | |||||
eval_3_isog_ifma(phiR, coeff); | |||||
memcpy(R->X, pts[npts - 1]->X, sizeof(f2elm_t)); | |||||
memcpy(R->Z, pts[npts - 1]->Z, sizeof(f2elm_t)); | |||||
index = pts_index[npts - 1]; | |||||
npts -= 1; | |||||
} | |||||
get_3_isog_ifma(R, A24minus, A24plus, coeff); | |||||
eval_3_isog_ifma(phiP, coeff); | |||||
eval_3_isog_ifma(phiQ, coeff); | |||||
eval_3_isog_ifma(phiR, coeff); | |||||
inv_3_way_ifma(phiP->Z, phiQ->Z, phiR->Z); | |||||
fp2mul_mont(phiP->X, phiP->Z, phiP->X); | |||||
fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); | |||||
fp2mul_mont(phiR->X, phiR->Z, phiR->X); | |||||
// Format public key | |||||
fp2_encode(phiP->X, PublicKeyB); | |||||
fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); | |||||
fp2_encode(phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES); | |||||
return 0; | |||||
} |
@@ -0,0 +1,34 @@ | |||||
## PQ SIDH/SIKE implementation using AVX512IFMA instructions | |||||
Using the AVX512IFMA instructions (vpmadd52luq and vpmadd52huq), which were
specifically designed for prime field arithmetic, allows a projected speedup of
up to 4X on supporting processors, when those become available.
### Current status | |||||
* Tested for correctness with Intel SDE | |||||
* EphemeralKeyGeneration_A and EphemeralKeyGeneration_B with P751 are implemented | |||||
* Using "standins": 3X performance gain on Xeon Gold (with two FMA units) | |||||
* Optimizations are 3-fold | |||||
* Finite field *𝔽~p~* multiplication by performing a single horizontal Montgomery multiplication | |||||
* Quadratic finite field *𝔽~p²~* multiplication and square by performing 3/4 horizontal Montgomery multiplications in parallel | |||||
* A pair of quadratic finite field *𝔽~p²~* multiplications (where applicable) by performing 8 vertical Montgomery multiplications in parallel | |||||
* AVX512 add/sub are also implemented | |||||
### How to test? | |||||
The Makefile generates two executables. sidh_ifma can be run with Intel SDE to
check for correctness. sidh_standin produces incorrect results, because it
replaces the IFMA instructions with FMA instructions, but it can be executed on
a machine with AVX512 support to estimate performance.
### TODO | |||||
* EphemeralSecretAgreement_A and EphemeralSecretAgreement_B | |||||
* SIKE | |||||
* P503 | |||||
* Using vertical representation throughout for greater speedups | |||||
### License | |||||
Available under the original [SIKE](https://github.com/Microsoft/PQCrypto-SIKE) license |
@@ -0,0 +1,916 @@ | |||||
/* Symbol-name helper: C_ABI(x) yields the assembler-level name of C symbol x,
   HIDDEN the platform directive for hidden visibility. */
#if defined(__APPLE__) | |||||
/* OS X's C ABI prefixes functions with underscore. */ | |||||
#define C_ABI(x) _ ## x | |||||
#define HIDDEN .private_extern | |||||
#else | |||||
#define C_ABI(x) x | |||||
#define HIDDEN .hidden | |||||
#endif | |||||
/* Constant tables. .LpermMask0 and .LshiftMask0 are not referenced by any
   routine visible in this part of the file - TODO confirm they are still needed. */
.p2align 6 | |||||
.LpermMask0: | |||||
.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25 | |||||
.LshiftMask0: | |||||
.quad 0,4,8,12,0,4,8,12 | |||||
/* 52 one-bits: selects the payload of one 52-bit limb stored in a 64-bit lane. */
.LandMask: | |||||
.quad 0xfffffffffffff | |||||
/* The modulus (loaded into M0/M1 by the multiply routines) as 15 limbs of at
   most 52 bits, one per qword, plus one zero qword so that it fills two
   64-byte vectors. */
.p2align 6 | |||||
.Lpoly: | |||||
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff | |||||
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff | |||||
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480 | |||||
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0 | |||||
/* Every limb of .Lpoly shifted left by 8 bits (256 * modulus, limb by limb).
   The routines add this to the minuend before a lane-wise vpsubq. */
.LpolyX: | |||||
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00 | |||||
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00 | |||||
.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000 | |||||
.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0 | |||||
/* Pointer arguments: result, first operand, second operand. */
#define felemR %rdi | |||||
#define felemA %rsi | |||||
#define felemB %rdx | |||||
/* Loop counter. */
#define itr %r10 | |||||
/* M0/M1: low 8 and high 8 qwords of .Lpoly; ZERO: all-zero vector;
   AND_MASK: .LandMask broadcast to every lane. */
#define M0 %zmm0 | |||||
#define M1 %zmm1 | |||||
#define ZERO %zmm2 | |||||
#define AND_MASK %zmm3 | |||||
/* A0/A1: the two 15-limb halves of operand A; suffix a = limbs 0-7, b = limbs 8-14. */
#define A0a %zmm4 | |||||
#define A0b %zmm5 | |||||
#define A1a %zmm6 | |||||
#define A1b %zmm7 | |||||
/* Four product accumulators, each split into a low (a) and high (b) vector. */
#define ACC0a %zmm8 | |||||
#define ACC0b %zmm9 | |||||
#define ACC1a %zmm10 | |||||
#define ACC1b %zmm11 | |||||
#define ACC2a %zmm12 | |||||
#define ACC2b %zmm13 | |||||
#define ACC3a %zmm14 | |||||
#define ACC3b %zmm15 | |||||
/* Broadcast limb of operand B for the current and the previous iteration. */
#define B0curr %zmm16 | |||||
#define B0prev %zmm17 | |||||
#define B1curr %zmm18 | |||||
#define B1prev %zmm19 | |||||
/* Broadcast lane 0 of each accumulator (multiplier for .Lpoly), current and previous. */
#define Y0curr %zmm20 | |||||
#define Y0prev %zmm21 | |||||
#define Y1curr %zmm22 | |||||
#define Y1prev %zmm23 | |||||
#define Y2curr %zmm24 | |||||
#define Y2prev %zmm25 | |||||
#define Y3curr %zmm26 | |||||
#define Y3prev %zmm27 | |||||
/* Per-accumulator temporaries (carry out of lane 0). */
#define T0 %zmm28 | |||||
#define T1 %zmm29 | |||||
#define T2 %zmm30 | |||||
#define T3 %zmm31 | |||||
############################################################################### | |||||
# fp2_mul_ifma(felemR, felemA, felemB)
#   felemR (%rdi): destination, felemA (%rsi) and felemB (%rdx): sources.
#   Every operand is accessed as two consecutive 15-qword halves (offsets 0
#   and 15*8), called A0/A1, B0/B1 and C0/C1 below.
#   Computes C0 = A0*B0 - A1*B1 and C1 = A0*B1 + A1*B0, where each of the four
#   products is built one limb of B at a time with vpmadd52luq/vpmadd52huq and
#   interleaved with multiples of .Lpoly so the low limb can be shifted out
#   (word-by-word Montgomery multiplication in radix 2^52).
#   All loads from felemA/felemB happen before the stores to felemR.
#   Clobbers %eax, %ecx, %r8, %r9, %r10, %rdx, %k1-%k5 and %zmm0-%zmm31.
.globl C_ABI(fp2_mul_ifma) | |||||
.p2align 6 | |||||
C_ABI(fp2_mul_ifma): | |||||
# k1 selects lane 0 only; k5 selects lanes 0-6 (the 7 limbs of a high half)
mov $1, %eax | |||||
kmovw %eax, %k1 | |||||
mov $0x7f, %eax | |||||
kmovw %eax, %k5 | |||||
vpbroadcastq .LandMask(%rip), AND_MASK | |||||
vpxorq ZERO, ZERO, ZERO | |||||
# Load A0 and A1; lane 7 of each high vector is zeroed by the k5 mask
vmovdqu64 64*0(felemA), A0a | |||||
vmovdqu64 64*1(felemA), A0b{%k5}{z} | |||||
vmovdqu64 15*8 + 64*0(felemA), A1a | |||||
vmovdqu64 15*8 + 64*1(felemA), A1b{%k5}{z} | |||||
# Load the modulus | |||||
vmovdqa64 64*0 + .Lpoly(%rip), M0 | |||||
vmovdqa64 64*1 + .Lpoly(%rip), M1 | |||||
# Prepare the accumulators | |||||
vpxorq ACC0a, ACC0a, ACC0a | |||||
vpxorq ACC0b, ACC0b, ACC0b | |||||
vpxorq ACC1a, ACC1a, ACC1a | |||||
vpxorq ACC1b, ACC1b, ACC1b | |||||
vpxorq ACC2a, ACC2a, ACC2a | |||||
vpxorq ACC2b, ACC2b, ACC2b | |||||
vpxorq ACC3a, ACC3a, ACC3a | |||||
vpxorq ACC3b, ACC3b, ACC3b | |||||
vpxorq T0, T0, T0 | |||||
vpxorq T1, T1, T1 | |||||
vpxorq T2, T2, T2 | |||||
vpxorq T3, T3, T3 | |||||
# First iteration | |||||
# Broadcast limb 0 of B0 and B1, then advance felemB to the next limb
vpbroadcastq (felemB), B0curr | |||||
vpbroadcastq 15*8(felemB), B1curr | |||||
lea 8(felemB), felemB | |||||
# Low 52 bits of the partial products: ACC0 = A0*B0, ACC1 = A1*B1, ACC2 = A1*B0, ACC3 = A0*B1
vpmadd52luq B0curr, A0a, ACC0a | |||||
vpmadd52luq B0curr, A0b, ACC0b | |||||
vpmadd52luq B1curr, A1a, ACC1a | |||||
vpmadd52luq B1curr, A1b, ACC1b | |||||
vpmadd52luq B0curr, A1a, ACC2a | |||||
vpmadd52luq B0curr, A1b, ACC2b | |||||
vpmadd52luq B1curr, A0a, ACC3a | |||||
vpmadd52luq B1curr, A0b, ACC3b | |||||
# Y = lane 0 of each accumulator broadcast to all lanes (the index vector is all zero).
# The low limb of .Lpoly is 2^52-1, so adding lo52(Y * modulus) clears the low 52 bits of lane 0.
vpermq ACC0a, ZERO, Y0curr | |||||
vpermq ACC1a, ZERO, Y1curr | |||||
vpermq ACC2a, ZERO, Y2curr | |||||
vpermq ACC3a, ZERO, Y3curr | |||||
vpmadd52luq Y0curr, M0, ACC0a | |||||
vpmadd52luq Y0curr, M1, ACC0b | |||||
vpmadd52luq Y1curr, M0, ACC1a | |||||
vpmadd52luq Y1curr, M1, ACC1b | |||||
vpmadd52luq Y2curr, M0, ACC2a | |||||
vpmadd52luq Y2curr, M1, ACC2b | |||||
vpmadd52luq Y3curr, M0, ACC3a | |||||
vpmadd52luq Y3curr, M1, ACC3b | |||||
# T = bits 52 and up of lane 0 (all other lanes zeroed): the carry that survives the shift below
vpsrlq $52, ACC0a, T0{%k1}{z} | |||||
vpsrlq $52, ACC1a, T1{%k1}{z} | |||||
vpsrlq $52, ACC2a, T2{%k1}{z} | |||||
vpsrlq $52, ACC3a, T3{%k1}{z} | |||||
# 14 more iterations, one per remaining limb of B
mov $14, itr | |||||
1: | |||||
# Shift the ACC in zmms right by a word | |||||
valignq $1, ACC0a, ACC0b, ACC0a | |||||
valignq $1, ACC0b, ZERO, ACC0b | |||||
valignq $1, ACC1a, ACC1b, ACC1a | |||||
valignq $1, ACC1b, ZERO, ACC1b | |||||
valignq $1, ACC2a, ACC2b, ACC2a | |||||
valignq $1, ACC2b, ZERO, ACC2b | |||||
valignq $1, ACC3a, ACC3b, ACC3a | |||||
valignq $1, ACC3b, ZERO, ACC3b | |||||
# Keep the previous B limb and Y values: their high product halves are added after the shift
vmovdqa64 B0curr, B0prev | |||||
vmovdqa64 B1curr, B1prev | |||||
vmovdqa64 Y0curr, Y0prev | |||||
vmovdqa64 Y1curr, Y1prev | |||||
vmovdqa64 Y2curr, Y2prev | |||||
vmovdqa64 Y3curr, Y3prev | |||||
vpbroadcastq (felemB), B0curr | |||||
vpbroadcastq 15*8(felemB), B1curr | |||||
lea 8(felemB), felemB | |||||
# High multiplications | |||||
vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0 | |||||
vpmadd52huq B0prev, A0b, ACC0b | |||||
vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1 | |||||
vpmadd52huq B1prev, A1b, ACC1b | |||||
vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0 | |||||
vpmadd52huq B0prev, A1b, ACC2b | |||||
vpmadd52huq B1prev, A0a, ACC3a # ACC3 = A0 * B1 | |||||
vpmadd52huq B1prev, A0b, ACC3b | |||||
vpmadd52huq Y0prev, M0, ACC0a | |||||
vpmadd52huq Y0prev, M1, ACC0b | |||||
vpmadd52huq Y1prev, M0, ACC1a | |||||
vpmadd52huq Y1prev, M1, ACC1b | |||||
vpmadd52huq Y2prev, M0, ACC2a | |||||
vpmadd52huq Y2prev, M1, ACC2b | |||||
vpmadd52huq Y3prev, M0, ACC3a | |||||
vpmadd52huq Y3prev, M1, ACC3b | |||||
# Low multiplications | |||||
vpmadd52luq B0curr, A0a, ACC0a | |||||
vpmadd52luq B0curr, A0b, ACC0b | |||||
vpmadd52luq B1curr, A1a, ACC1a | |||||
vpmadd52luq B1curr, A1b, ACC1b | |||||
vpmadd52luq B0curr, A1a, ACC2a | |||||
vpmadd52luq B0curr, A1b, ACC2b | |||||
vpmadd52luq B1curr, A0a, ACC3a | |||||
vpmadd52luq B1curr, A0b, ACC3b | |||||
# Add the carry saved from the lane that was shifted out
vpaddq T0, ACC0a, ACC0a | |||||
vpaddq T1, ACC1a, ACC1a | |||||
vpaddq T2, ACC2a, ACC2a | |||||
vpaddq T3, ACC3a, ACC3a | |||||
# New Y values and low halves of Y * modulus, as in the first iteration
vpermq ACC0a, ZERO, Y0curr | |||||
vpermq ACC1a, ZERO, Y1curr | |||||
vpermq ACC2a, ZERO, Y2curr | |||||
vpermq ACC3a, ZERO, Y3curr | |||||
vpmadd52luq Y0curr, M0, ACC0a | |||||
vpmadd52luq Y0curr, M1, ACC0b | |||||
vpmadd52luq Y1curr, M0, ACC1a | |||||
vpmadd52luq Y1curr, M1, ACC1b | |||||
vpmadd52luq Y2curr, M0, ACC2a | |||||
vpmadd52luq Y2curr, M1, ACC2b | |||||
vpmadd52luq Y3curr, M0, ACC3a | |||||
vpmadd52luq Y3curr, M1, ACC3b | |||||
vpsrlq $52, ACC0a, T0{%k1}{z} | |||||
vpsrlq $52, ACC1a, T1{%k1}{z} | |||||
vpsrlq $52, ACC2a, T2{%k1}{z} | |||||
vpsrlq $52, ACC3a, T3{%k1}{z} | |||||
dec itr | |||||
jne 1b | |||||
# Final shift by one limb and carry add after the last B limb
valignq $1, ACC0a, ACC0b, ACC0a | |||||
valignq $1, ACC0b, ZERO, ACC0b | |||||
valignq $1, ACC1a, ACC1b, ACC1a | |||||
valignq $1, ACC1b, ZERO, ACC1b | |||||
valignq $1, ACC2a, ACC2b, ACC2a | |||||
valignq $1, ACC2b, ZERO, ACC2b | |||||
valignq $1, ACC3a, ACC3b, ACC3a | |||||
valignq $1, ACC3b, ZERO, ACC3b | |||||
vpaddq T0, ACC0a, ACC0a | |||||
vpaddq T1, ACC1a, ACC1a | |||||
vpaddq T2, ACC2a, ACC2a | |||||
vpaddq T3, ACC3a, ACC3a | |||||
# The last high multiplications | |||||
vpmadd52huq B0curr, A0a, ACC0a | |||||
vpmadd52huq B0curr, A0b, ACC0b | |||||
vpmadd52huq B1curr, A1a, ACC1a | |||||
vpmadd52huq B1curr, A1b, ACC1b | |||||
vpmadd52huq B0curr, A1a, ACC2a | |||||
vpmadd52huq B0curr, A1b, ACC2b | |||||
vpmadd52huq B1curr, A0a, ACC3a | |||||
vpmadd52huq B1curr, A0b, ACC3b | |||||
vpmadd52huq Y0curr, M0, ACC0a | |||||
vpmadd52huq Y0curr, M1, ACC0b | |||||
vpmadd52huq Y1curr, M0, ACC1a | |||||
vpmadd52huq Y1curr, M1, ACC1b | |||||
vpmadd52huq Y2curr, M0, ACC2a | |||||
vpmadd52huq Y2curr, M1, ACC2b | |||||
vpmadd52huq Y3curr, M0, ACC3a | |||||
vpmadd52huq Y3curr, M1, ACC3b | |||||
# C0 = A0*B0 - A1*B1 | |||||
# C1 = A0*B1 + A1*B0 | |||||
# .LpolyX (the modulus limbs shifted left by 8) is added before the lane-wise subtraction
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a | |||||
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b | |||||
vpaddq ACC3a, ACC2a, ACC2a | |||||
vpaddq ACC3b, ACC2b, ACC2b | |||||
vpsubq ACC1a, ACC0a, ACC0a | |||||
vpsubq ACC1b, ACC0b, ACC0b | |||||
# Now 'normalize' the acc to 52 bit words | |||||
# A0/A1 are reused as carry vectors: bits 52 and up of every lane
vpsrlq $52, ACC0a, A0a | |||||
vpsrlq $52, ACC0b, A0b | |||||
vpsrlq $52, ACC2a, A1a | |||||
vpsrlq $52, ACC2b, A1b | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
vpandq AND_MASK, ACC2a, ACC2a | |||||
vpandq AND_MASK, ACC2b, ACC2b | |||||
# Move every carry up by one lane (lane 7 of the low vector feeds lane 0 of the high vector)
valignq $7, A0a, A0b, A0b | |||||
valignq $7, ZERO, A0a, A0a | |||||
valignq $7, A1a, A1b, A1b | |||||
valignq $7, ZERO, A1a, A1a | |||||
vpaddq A0a, ACC0a, ACC0a | |||||
vpaddq A0b, ACC0b, ACC0b | |||||
vpaddq A1a, ACC2a, ACC2a | |||||
vpaddq A1b, ACC2b, ACC2b | |||||
# Resolve the remaining single-bit carries of C0 with scalar arithmetic on lane masks:
# k1/k2 = lanes whose sum is below the added carry (unsigned), k3/k4 = lanes equal to AND_MASK.
// NOTE(review): each lane was masked to 52 bits and the added carry has at most 12 bits, so
// the unsigned less-than test looks like it can only fire on 64-bit wraparound; a lane that
// merely exceeds AND_MASK would be truncated by the final vpandq - confirm this is intended.
vpcmpuq $1, A0a, ACC0a, %k1 | |||||
vpcmpuq $1, A0b, ACC0b, %k2 | |||||
vpcmpuq $0, AND_MASK, ACC0a, %k3 | |||||
vpcmpuq $0, AND_MASK, ACC0b, %k4 | |||||
kmovb %k1, %eax | |||||
kmovb %k2, %ecx | |||||
kmovb %k3, %r8d | |||||
kmovb %k4, %r9d | |||||
# Shift the first mask pair up one lane (16 bits across %al/%cl), add the second pair so a
# carry ripples through lanes equal to AND_MASK, then xor to get the lanes receiving a carry
add %al, %al | |||||
adc %cl, %cl | |||||
add %r8b, %al | |||||
adc %r9b, %cl | |||||
xor %r8b, %al | |||||
xor %r9b, %cl | |||||
kmovb %eax, %k1 | |||||
kmovb %ecx, %k2 | |||||
# Subtracting 2^52-1 and masking to 52 bits adds 1 modulo 2^52 in the selected lanes
vpsubq AND_MASK, ACC0a, ACC0a{%k1} | |||||
vpsubq AND_MASK, ACC0b, ACC0b{%k2} | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
# Same carry resolution for C1 (held in ACC2)
vpcmpuq $1, A1a, ACC2a, %k1 | |||||
vpcmpuq $1, A1b, ACC2b, %k2 | |||||
vpcmpuq $0, AND_MASK, ACC2a, %k3 | |||||
vpcmpuq $0, AND_MASK, ACC2b, %k4 | |||||
kmovb %k1, %eax | |||||
kmovb %k2, %ecx | |||||
kmovb %k3, %r8d | |||||
kmovb %k4, %r9d | |||||
add %al, %al | |||||
adc %cl, %cl | |||||
add %r8b, %al | |||||
adc %r9b, %cl | |||||
xor %r8b, %al | |||||
xor %r9b, %cl | |||||
kmovb %eax, %k1 | |||||
kmovb %ecx, %k2 | |||||
vpsubq AND_MASK, ACC2a, ACC2a{%k1} | |||||
vpsubq AND_MASK, ACC2b, ACC2b{%k2} | |||||
vpandq AND_MASK, ACC2a, ACC2a | |||||
vpandq AND_MASK, ACC2b, ACC2b | |||||
# k1 is reloaded here, but the stores below use k5, which still holds 0x7f
mov $0x7f, %eax | |||||
kmovw %eax, %k1 | |||||
# Store C0 at felemR and C1 at felemR + 15*8 (8 + 7 qwords each)
vmovdqu64 ACC0a, 64*0(felemR) | |||||
vmovdqu64 ACC0b, 64*1(felemR){%k5} | |||||
vmovdqu64 ACC2a, 15*8 + 64*0(felemR) | |||||
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k5} | |||||
ret | |||||
############################################################################### | |||||
# fp2_sqr_ifma(felemR, felemA)
#   felemR (%rdi): destination, felemA (%rsi): source; both are accessed as two
#   consecutive 15-qword halves (offsets 0 and 15*8), called A0/A1 and C0/C1.
#   Computes C0 = A0*A0 - A1*A1 and C1 = 2*(A1*A0) with the same limb-by-limb
#   multiply-and-reduce scheme as fp2_mul_ifma, using three accumulators.
#   All loads from felemA happen before the stores to felemR.
#   Clobbers %eax, %ecx, %r8, %r9, %r10, %rsi, %k1-%k4 and the zmm registers
#   named below.
# Scratch registers for this routine; they alias names that are unused here.
#define ST0 ACC3a | |||||
#define ST1 ACC3b | |||||
#define ST2 Y3curr | |||||
.globl C_ABI(fp2_sqr_ifma) | |||||
.p2align 6 | |||||
C_ABI(fp2_sqr_ifma): | |||||
# k1 selects lane 0 only; k2 selects lanes 0-6 (the 7 limbs of a high half)
mov $1, %eax | |||||
kmovw %eax, %k1 | |||||
mov $0x7f, %eax | |||||
kmovw %eax, %k2 | |||||
vpbroadcastq .LandMask(%rip), AND_MASK | |||||
vpxorq ZERO, ZERO, ZERO | |||||
vmovdqu64 64*0(felemA), A0a | |||||
vmovdqu64 64*1(felemA), A0b{%k2}{z} | |||||
vmovdqu64 15*8 + 64*0(felemA), A1a | |||||
vmovdqu64 15*8 + 64*1(felemA), A1b{%k2}{z} | |||||
# Load the modulus | |||||
vmovdqa64 64*0 + .Lpoly(%rip), M0 | |||||
vmovdqa64 64*1 + .Lpoly(%rip), M1 | |||||
# Prepare the accumulators | |||||
vpxorq ACC0a, ACC0a, ACC0a | |||||
vpxorq ACC0b, ACC0b, ACC0b | |||||
vpxorq ACC1a, ACC1a, ACC1a | |||||
vpxorq ACC1b, ACC1b, ACC1b | |||||
vpxorq ACC2a, ACC2a, ACC2a | |||||
vpxorq ACC2b, ACC2b, ACC2b | |||||
vpxorq T0, T0, T0 | |||||
vpxorq T1, T1, T1 | |||||
vpxorq T2, T2, T2 | |||||
# First iteration | |||||
# The multiplier limbs are broadcast from felemA itself (the B registers hold limbs of A)
vpbroadcastq (felemA), B0curr | |||||
vpbroadcastq 15*8(felemA), B1curr | |||||
lea 8(felemA), felemA | |||||
# Low 52 bits of the partial products: ACC0 = A0*A0, ACC1 = A1*A1, ACC2 = A1*A0
vpmadd52luq B0curr, A0a, ACC0a | |||||
vpmadd52luq B0curr, A0b, ACC0b | |||||
vpmadd52luq B1curr, A1a, ACC1a | |||||
vpmadd52luq B1curr, A1b, ACC1b | |||||
vpmadd52luq B0curr, A1a, ACC2a | |||||
vpmadd52luq B0curr, A1b, ACC2b | |||||
# Y = lane 0 of each accumulator broadcast to all lanes (the index vector is all zero)
vpermq ACC0a, ZERO, Y0curr | |||||
vpermq ACC1a, ZERO, Y1curr | |||||
vpermq ACC2a, ZERO, Y2curr | |||||
vpmadd52luq Y0curr, M0, ACC0a | |||||
vpmadd52luq Y0curr, M1, ACC0b | |||||
vpmadd52luq Y1curr, M0, ACC1a | |||||
vpmadd52luq Y1curr, M1, ACC1b | |||||
vpmadd52luq Y2curr, M0, ACC2a | |||||
vpmadd52luq Y2curr, M1, ACC2b | |||||
# T = bits 52 and up of lane 0 (all other lanes zeroed)
vpsrlq $52, ACC0a, T0{%k1}{z} | |||||
vpsrlq $52, ACC1a, T1{%k1}{z} | |||||
vpsrlq $52, ACC2a, T2{%k1}{z} | |||||
# 14 more iterations, one per remaining limb
mov $14, itr | |||||
1: | |||||
# Shift the ACC in zmms right by a word | |||||
valignq $1, ACC0a, ACC0b, ACC0a | |||||
valignq $1, ACC0b, ZERO, ACC0b | |||||
valignq $1, ACC1a, ACC1b, ACC1a | |||||
valignq $1, ACC1b, ZERO, ACC1b | |||||
valignq $1, ACC2a, ACC2b, ACC2a | |||||
valignq $1, ACC2b, ZERO, ACC2b | |||||
# Fresh zeroed scratch for the low products of the high halves
vpxorq ST0, ST0, ST0 | |||||
vpxorq ST1, ST1, ST1 | |||||
vpxorq ST2, ST2, ST2 | |||||
vmovdqa64 B0curr, B0prev | |||||
vmovdqa64 B1curr, B1prev | |||||
vmovdqa64 Y0curr, Y0prev | |||||
vmovdqa64 Y1curr, Y1prev | |||||
vmovdqa64 Y2curr, Y2prev | |||||
vpbroadcastq (felemA), B0curr | |||||
vpbroadcastq 15*8(felemA), B1curr | |||||
lea 8(felemA), felemA | |||||
# High multiplications | |||||
vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0 | |||||
vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1 | |||||
vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0 | |||||
vpmadd52huq B0prev, A0b, ACC0b | |||||
vpmadd52huq B1prev, A1b, ACC1b | |||||
vpmadd52huq B0prev, A1b, ACC2b | |||||
# We really want to have 8 independent vpmadd instructions in the pipe | |||||
# so hi52(Y * M0) is accumulated into T (on top of the lane-0 carry) and folded in below
vpmadd52huq Y0prev, M0, T0 | |||||
vpmadd52huq Y1prev, M0, T1 | |||||
vpmadd52huq Y2prev, M0, T2 | |||||
vpmadd52huq Y0prev, M1, ACC0b | |||||
vpmadd52huq Y1prev, M1, ACC1b | |||||
vpmadd52huq Y2prev, M1, ACC2b | |||||
# Low multiplications | |||||
vpmadd52luq B0curr, A0a, ACC0a | |||||
vpmadd52luq B1curr, A1a, ACC1a | |||||
vpmadd52luq B0curr, A1a, ACC2a | |||||
vpmadd52luq B0curr, A0b, ST0 | |||||
vpmadd52luq B1curr, A1b, ST1 | |||||
vpmadd52luq B0curr, A1b, ST2 | |||||
vpaddq T0, ACC0a, ACC0a | |||||
vpaddq T1, ACC1a, ACC1a | |||||
vpaddq T2, ACC2a, ACC2a | |||||
vpermq ACC0a, ZERO, Y0curr | |||||
vpermq ACC1a, ZERO, Y1curr | |||||
vpermq ACC2a, ZERO, Y2curr | |||||
# Fold the scratch products into the high accumulator vectors
vpaddq ST0, ACC0b, ACC0b | |||||
vpaddq ST1, ACC1b, ACC1b | |||||
vpaddq ST2, ACC2b, ACC2b | |||||
vpmadd52luq Y0curr, M0, ACC0a | |||||
vpmadd52luq Y0curr, M1, ACC0b | |||||
vpmadd52luq Y1curr, M0, ACC1a | |||||
vpmadd52luq Y1curr, M1, ACC1b | |||||
vpmadd52luq Y2curr, M0, ACC2a | |||||
vpmadd52luq Y2curr, M1, ACC2b | |||||
vpsrlq $52, ACC0a, T0{%k1}{z} | |||||
vpsrlq $52, ACC1a, T1{%k1}{z} | |||||
vpsrlq $52, ACC2a, T2{%k1}{z} | |||||
dec itr | |||||
jne 1b | |||||
# Final shift by one limb and carry add after the last limb
valignq $1, ACC0a, ACC0b, ACC0a | |||||
valignq $1, ACC0b, ZERO, ACC0b | |||||
valignq $1, ACC1a, ACC1b, ACC1a | |||||
valignq $1, ACC1b, ZERO, ACC1b | |||||
valignq $1, ACC2a, ACC2b, ACC2a | |||||
valignq $1, ACC2b, ZERO, ACC2b | |||||
vpaddq T0, ACC0a, ACC0a | |||||
vpaddq T1, ACC1a, ACC1a | |||||
vpaddq T2, ACC2a, ACC2a | |||||
# The last high multiplications | |||||
vpmadd52huq B0curr, A0a, ACC0a | |||||
vpmadd52huq B0curr, A0b, ACC0b | |||||
vpmadd52huq B1curr, A1a, ACC1a | |||||
vpmadd52huq B1curr, A1b, ACC1b | |||||
vpmadd52huq B0curr, A1a, ACC2a | |||||
vpmadd52huq B0curr, A1b, ACC2b | |||||
vpmadd52huq Y0curr, M0, ACC0a | |||||
vpmadd52huq Y0curr, M1, ACC0b | |||||
vpmadd52huq Y1curr, M0, ACC1a | |||||
vpmadd52huq Y1curr, M1, ACC1b | |||||
vpmadd52huq Y2curr, M0, ACC2a | |||||
vpmadd52huq Y2curr, M1, ACC2b | |||||
# C0 = A0*B0 - A1*B1 | |||||
# C1 = A0*B1 + A1*B0 | |||||
# With B = A the cross term is ACC2 doubled; .LpolyX is added before the lane-wise subtraction
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a | |||||
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b | |||||
vpaddq ACC2a, ACC2a, ACC2a | |||||
vpaddq ACC2b, ACC2b, ACC2b | |||||
vpsubq ACC1a, ACC0a, ACC0a | |||||
vpsubq ACC1b, ACC0b, ACC0b | |||||
# Now 'normalize' the acc to 52 bit words | |||||
# A0/A1 are reused as carry vectors: bits 52 and up of every lane
vpsrlq $52, ACC0a, A0a | |||||
vpsrlq $52, ACC0b, A0b | |||||
vpsrlq $52, ACC2a, A1a | |||||
vpsrlq $52, ACC2b, A1b | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
vpandq AND_MASK, ACC2a, ACC2a | |||||
vpandq AND_MASK, ACC2b, ACC2b | |||||
# Move every carry up by one lane (lane 7 of the low vector feeds lane 0 of the high vector)
valignq $7, A0a, A0b, A0b | |||||
valignq $7, ZERO, A0a, A0a | |||||
valignq $7, A1a, A1b, A1b | |||||
valignq $7, ZERO, A1a, A1a | |||||
vpaddq A0a, ACC0a, ACC0a | |||||
vpaddq A0b, ACC0b, ACC0b | |||||
vpaddq A1a, ACC2a, ACC2a | |||||
vpaddq A1b, ACC2b, ACC2b | |||||
# Resolve the remaining single-bit carries of C0 with scalar arithmetic on lane masks:
# k1/k2 = lanes whose sum is below the added carry (unsigned), k3/k4 = lanes equal to AND_MASK.
// NOTE(review): each lane was masked to 52 bits and the added carry has at most 12 bits, so
// the unsigned less-than test looks like it can only fire on 64-bit wraparound; a lane that
// merely exceeds AND_MASK would be truncated by the final vpandq - confirm this is intended.
vpcmpuq $1, A0a, ACC0a, %k1 | |||||
vpcmpuq $1, A0b, ACC0b, %k2 | |||||
vpcmpuq $0, AND_MASK, ACC0a, %k3 | |||||
vpcmpuq $0, AND_MASK, ACC0b, %k4 | |||||
kmovb %k1, %eax | |||||
kmovb %k2, %ecx | |||||
kmovb %k3, %r8d | |||||
kmovb %k4, %r9d | |||||
# Shift the first mask pair up one lane, add the second pair so a carry ripples through
# lanes equal to AND_MASK, then xor to get the lanes receiving a carry
add %al, %al | |||||
adc %cl, %cl | |||||
add %r8b, %al | |||||
adc %r9b, %cl | |||||
xor %r8b, %al | |||||
xor %r9b, %cl | |||||
kmovb %eax, %k1 | |||||
kmovb %ecx, %k2 | |||||
# Subtracting 2^52-1 and masking to 52 bits adds 1 modulo 2^52 in the selected lanes
vpsubq AND_MASK, ACC0a, ACC0a{%k1} | |||||
vpsubq AND_MASK, ACC0b, ACC0b{%k2} | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
# Same carry resolution for C1 (held in ACC2)
vpcmpuq $1, A1a, ACC2a, %k1 | |||||
vpcmpuq $1, A1b, ACC2b, %k2 | |||||
vpcmpuq $0, AND_MASK, ACC2a, %k3 | |||||
vpcmpuq $0, AND_MASK, ACC2b, %k4 | |||||
kmovb %k1, %eax | |||||
kmovb %k2, %ecx | |||||
kmovb %k3, %r8d | |||||
kmovb %k4, %r9d | |||||
add %al, %al | |||||
adc %cl, %cl | |||||
add %r8b, %al | |||||
adc %r9b, %cl | |||||
xor %r8b, %al | |||||
xor %r9b, %cl | |||||
kmovb %eax, %k1 | |||||
kmovb %ecx, %k2 | |||||
vpsubq AND_MASK, ACC2a, ACC2a{%k1} | |||||
vpsubq AND_MASK, ACC2b, ACC2b{%k2} | |||||
vpandq AND_MASK, ACC2a, ACC2a | |||||
vpandq AND_MASK, ACC2b, ACC2b | |||||
# k1 = lanes 0-6 again, for the 7-qword tail stores
mov $0x7f, %eax | |||||
kmovw %eax, %k1 | |||||
# Store C0 at felemR and C1 at felemR + 15*8 (8 + 7 qwords each)
vmovdqu64 ACC0a, 64*0(felemR) | |||||
vmovdqu64 ACC0b, 64*1(felemR){%k1} | |||||
vmovdqu64 ACC2a, 15*8 + 64*0(felemR) | |||||
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1} | |||||
ret | |||||
############################################################################### | |||||
# fp2_sub(felemR, felemA, felemB)
#   felemR (%rdi): destination, felemA (%rsi) and felemB (%rdx): sources, each
#   accessed as two consecutive 15-qword halves (offsets 0 and 15*8).
#   Computes, lane by lane, A + .LpolyX - B for both halves (.LpolyX holds the
#   modulus limbs shifted left by 8 bits) and leaves the results in ACC0a/b and
#   ACC2a/b, then tail-jumps to fp2_normalize, which propagates the carries and
#   stores to felemR.
.globl C_ABI(fp2_sub)
.p2align 6
C_ABI(fp2_sub):
mov $1, %eax
kmovw %eax, %k1
# k2 selects lanes 0-6: the 7 qwords of each high vector
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemA), ACC1a
vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z}
vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemB), ACC3a
vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z}
# Add the shifted modulus first so that the lane-wise subtraction below starts from a larger minuend
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
vpaddq 64*0 + .LpolyX(%rip), ACC1a, ACC1a
vpaddq 64*1 + .LpolyX(%rip), ACC1b, ACC1b
vpsubq ACC2a, ACC0a, ACC0a
vpsubq ACC2b, ACC0b, ACC0b
# The second half goes to ACC2a/b, where fp2_normalize expects it
vpsubq ACC3a, ACC1a, ACC2a
vpsubq ACC3b, ACC1b, ACC2b
# The label is defined through C_ABI(), so the jump must use the same spelling;
# the bare name does not exist on platforms where C_ABI() adds an underscore.
jmp C_ABI(fp2_normalize)
############################################################################### | |||||
# fp2_add(felemR, felemA, felemB)
#   felemR (%rdi): destination, felemA (%rsi) and felemB (%rdx): sources, each
#   accessed as two consecutive 15-qword halves (offsets 0 and 15*8).
#   Adds the operands lane by lane, leaving the first half in ACC0a/b and the
#   second half in ACC2a/b, then falls through into fp2_normalize, which
#   propagates the carries and stores to felemR. No modulus is subtracted in
#   this routine.
.globl C_ABI(fp2_add) | |||||
.p2align 6 | |||||
C_ABI(fp2_add): | |||||
mov $1, %eax | |||||
kmovw %eax, %k1 | |||||
# k2 selects lanes 0-6: the 7 qwords of each high vector
mov $0x7f, %eax | |||||
kmovw %eax, %k2 | |||||
vmovdqu64 64*0(felemA), ACC0a | |||||
vmovdqu64 64*1(felemA), ACC0b{%k2}{z} | |||||
vmovdqu64 15*8 + 64*0(felemA), ACC1a | |||||
vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z} | |||||
vmovdqu64 64*0(felemB), ACC2a | |||||
vmovdqu64 64*1(felemB), ACC2b{%k2}{z} | |||||
vmovdqu64 15*8 + 64*0(felemB), ACC3a | |||||
vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z} | |||||
vpaddq ACC2a, ACC0a, ACC0a | |||||
vpaddq ACC2b, ACC0b, ACC0b | |||||
# The second half goes to ACC2a/b, where fp2_normalize expects it
vpaddq ACC3a, ACC1a, ACC2a | |||||
vpaddq ACC3b, ACC1b, ACC2b | |||||
// Fallthrough | |||||
############################################################################### | |||||
# fp2_normalize: shared tail of fp2_add (fall-through) and fp2_sub (jump).
#   Not declared .globl. Expects the two result halves in ACC0a/b and ACC2a/b
#   and the destination pointer in felemR (%rdi). Reduces every lane to 52
#   bits, pushing the excess into the next lane, and stores 15 qwords per half
#   at felemR and felemR + 15*8.
#   Clobbers %eax, %ecx, %r8, %r9, %k1-%k4, AND_MASK, ZERO and A0a-A1b.
.p2align 6 | |||||
C_ABI(fp2_normalize): | |||||
vpbroadcastq .LandMask(%rip), AND_MASK | |||||
vpxorq ZERO, ZERO, ZERO | |||||
# Now 'normalize' the acc to 52 bit words | |||||
# A0/A1 hold the carries: bits 52 and up of every lane
vpsrlq $52, ACC0a, A0a | |||||
vpsrlq $52, ACC0b, A0b | |||||
vpsrlq $52, ACC2a, A1a | |||||
vpsrlq $52, ACC2b, A1b | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
vpandq AND_MASK, ACC2a, ACC2a | |||||
vpandq AND_MASK, ACC2b, ACC2b | |||||
# Move every carry up by one lane (lane 7 of the low vector feeds lane 0 of the high vector)
valignq $7, A0a, A0b, A0b | |||||
valignq $7, ZERO, A0a, A0a | |||||
valignq $7, A1a, A1b, A1b | |||||
valignq $7, ZERO, A1a, A1a | |||||
vpaddq A0a, ACC0a, ACC0a | |||||
vpaddq A0b, ACC0b, ACC0b | |||||
vpaddq A1a, ACC2a, ACC2a | |||||
vpaddq A1b, ACC2b, ACC2b | |||||
# Resolve the remaining single-bit carries of the first half with scalar arithmetic on lane masks:
# k1/k2 = lanes whose sum is below the added carry (unsigned), k3/k4 = lanes equal to AND_MASK.
// NOTE(review): each lane was masked to 52 bits and the added carry has at most 12 bits, so
// the unsigned less-than test looks like it can only fire on 64-bit wraparound; a lane that
// merely exceeds AND_MASK would be truncated by the final vpandq - confirm this is intended.
vpcmpuq $1, A0a, ACC0a, %k1 | |||||
vpcmpuq $1, A0b, ACC0b, %k2 | |||||
vpcmpuq $0, AND_MASK, ACC0a, %k3 | |||||
vpcmpuq $0, AND_MASK, ACC0b, %k4 | |||||
kmovb %k1, %eax | |||||
kmovb %k2, %ecx | |||||
kmovb %k3, %r8d | |||||
kmovb %k4, %r9d | |||||
# Shift the first mask pair up one lane, add the second pair so a carry ripples through
# lanes equal to AND_MASK, then xor to get the lanes receiving a carry
add %al, %al | |||||
adc %cl, %cl | |||||
add %r8b, %al | |||||
adc %r9b, %cl | |||||
xor %r8b, %al | |||||
xor %r9b, %cl | |||||
kmovb %eax, %k1 | |||||
kmovb %ecx, %k2 | |||||
# Subtracting 2^52-1 and masking to 52 bits adds 1 modulo 2^52 in the selected lanes
vpsubq AND_MASK, ACC0a, ACC0a{%k1} | |||||
vpsubq AND_MASK, ACC0b, ACC0b{%k2} | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
# Same carry resolution for the second half (held in ACC2)
vpcmpuq $1, A1a, ACC2a, %k1 | |||||
vpcmpuq $1, A1b, ACC2b, %k2 | |||||
vpcmpuq $0, AND_MASK, ACC2a, %k3 | |||||
vpcmpuq $0, AND_MASK, ACC2b, %k4 | |||||
kmovb %k1, %eax | |||||
kmovb %k2, %ecx | |||||
kmovb %k3, %r8d | |||||
kmovb %k4, %r9d | |||||
add %al, %al | |||||
adc %cl, %cl | |||||
add %r8b, %al | |||||
adc %r9b, %cl | |||||
xor %r8b, %al | |||||
xor %r9b, %cl | |||||
kmovb %eax, %k1 | |||||
kmovb %ecx, %k2 | |||||
vpsubq AND_MASK, ACC2a, ACC2a{%k1} | |||||
vpsubq AND_MASK, ACC2b, ACC2b{%k2} | |||||
vpandq AND_MASK, ACC2a, ACC2a | |||||
vpandq AND_MASK, ACC2b, ACC2b | |||||
# k1 = lanes 0-6, for the 7-qword tail stores
mov $0x7f, %eax | |||||
kmovw %eax, %k1 | |||||
vmovdqu64 ACC0a, 64*0(felemR) | |||||
vmovdqu64 ACC0b, 64*1(felemR){%k1} | |||||
vmovdqu64 ACC2a, 15*8 + 64*0(felemR) | |||||
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1} | |||||
ret | |||||
############################################################################### | |||||
# fp2_swap(p1ptr, p2ptr, swap)
#   Branch-free conditional swap of the 4 * 15 qwords at p1ptr (%rdi) with the
#   4 * 15 qwords at p2ptr (%rsi). A mask is formed as 0 - swap (%rdx) and used
#   as a bitwise selector: where a mask bit is set the two buffers exchange
#   that bit, where it is clear both keep their own value.
#   Assumes swap is 0 (mask all zero, nothing changes) or 1 (mask all ones,
#   full swap) - TODO confirm against the C callers.
#   Both buffers are always read and always written back.
#   Clobbers %eax, %k2 and %zmm0-%zmm24.
#define p1ptr %rdi | |||||
#define p2ptr %rsi | |||||
#define swap %rdx | |||||
.globl C_ABI(fp2_swap) | |||||
.p2align 6 | |||||
C_ABI(fp2_swap): | |||||
# k2 selects lanes 0-6: the 7 qwords that follow the first 8 of each 15-qword group
mov $0x7f, %eax | |||||
kmovw %eax, %k2 | |||||
// TODO: get rid of the masks, not needed | |||||
# zmm0-zmm7 = the four 15-qword groups at p1ptr
vmovdqu64 64*0(p1ptr), %zmm0 | |||||
vmovdqu64 64*1(p1ptr), %zmm1{%k2}{z} | |||||
vmovdqu64 15*8 + 64*0(p1ptr), %zmm2 | |||||
vmovdqu64 15*8 + 64*1(p1ptr), %zmm3{%k2}{z} | |||||
vmovdqu64 2*15*8 + 64*0(p1ptr), %zmm4 | |||||
vmovdqu64 2*15*8 + 64*1(p1ptr), %zmm5{%k2}{z} | |||||
vmovdqu64 3*15*8 + 64*0(p1ptr), %zmm6 | |||||
vmovdqu64 3*15*8 + 64*1(p1ptr), %zmm7{%k2}{z} | |||||
# zmm8-zmm15 = the four 15-qword groups at p2ptr
vmovdqu64 64*0(p2ptr), %zmm8 | |||||
vmovdqu64 64*1(p2ptr), %zmm9{%k2}{z} | |||||
vmovdqu64 15*8 + 64*0(p2ptr), %zmm10 | |||||
vmovdqu64 15*8 + 64*1(p2ptr), %zmm11{%k2}{z} | |||||
vmovdqu64 2*15*8 + 64*0(p2ptr), %zmm12 | |||||
vmovdqu64 2*15*8 + 64*1(p2ptr), %zmm13{%k2}{z} | |||||
vmovdqu64 3*15*8 + 64*0(p2ptr), %zmm14 | |||||
vmovdqu64 3*15*8 + 64*1(p2ptr), %zmm15{%k2}{z} | |||||
# zmm16 = 0 - swap in every lane: the selection mask
vpxorq %zmm16, %zmm16, %zmm16 | |||||
vpbroadcastq swap, %zmm17 | |||||
vpsubq %zmm17, %zmm16, %zmm16 | |||||
# zmm17-zmm24 = copy of the p2ptr data, to become the new p2ptr contents
vmovdqa64 %zmm8, %zmm17 | |||||
vmovdqa64 %zmm9, %zmm18 | |||||
vmovdqa64 %zmm10, %zmm19 | |||||
vmovdqa64 %zmm11, %zmm20 | |||||
vmovdqa64 %zmm12, %zmm21 | |||||
vmovdqa64 %zmm13, %zmm22 | |||||
vmovdqa64 %zmm14, %zmm23 | |||||
vmovdqa64 %zmm15, %zmm24 | |||||
# Truth table 0xd8: result = mask ? p1 value : p2 value
vpternlogq $0xd8, %zmm16, %zmm0, %zmm17 | |||||
vpternlogq $0xd8, %zmm16, %zmm1, %zmm18 | |||||
vpternlogq $0xd8, %zmm16, %zmm2, %zmm19 | |||||
vpternlogq $0xd8, %zmm16, %zmm3, %zmm20 | |||||
vpternlogq $0xd8, %zmm16, %zmm4, %zmm21 | |||||
vpternlogq $0xd8, %zmm16, %zmm5, %zmm22 | |||||
vpternlogq $0xd8, %zmm16, %zmm6, %zmm23 | |||||
vpternlogq $0xd8, %zmm16, %zmm7, %zmm24 | |||||
# Truth table 0xe4: result = mask ? p2 value : p1 value
vpternlogq $0xe4, %zmm16, %zmm0, %zmm8 | |||||
vpternlogq $0xe4, %zmm16, %zmm1, %zmm9 | |||||
vpternlogq $0xe4, %zmm16, %zmm2, %zmm10 | |||||
vpternlogq $0xe4, %zmm16, %zmm3, %zmm11 | |||||
vpternlogq $0xe4, %zmm16, %zmm4, %zmm12 | |||||
vpternlogq $0xe4, %zmm16, %zmm5, %zmm13 | |||||
vpternlogq $0xe4, %zmm16, %zmm6, %zmm14 | |||||
vpternlogq $0xe4, %zmm16, %zmm7, %zmm15 | |||||
# Write back: zmm8-zmm15 to p1ptr, zmm17-zmm24 to p2ptr
vmovdqu64 %zmm8, 64*0(p1ptr) | |||||
vmovdqu64 %zmm9, 64*1(p1ptr){%k2} | |||||
vmovdqu64 %zmm10, 15*8 + 64*0(p1ptr) | |||||
vmovdqu64 %zmm11, 15*8 + 64*1(p1ptr){%k2} | |||||
vmovdqu64 %zmm12, 2*15*8 + 64*0(p1ptr) | |||||
vmovdqu64 %zmm13, 2*15*8 + 64*1(p1ptr){%k2} | |||||
vmovdqu64 %zmm14, 3*15*8 + 64*0(p1ptr) | |||||
vmovdqu64 %zmm15, 3*15*8 + 64*1(p1ptr){%k2} | |||||
vmovdqu64 %zmm17, 64*0(p2ptr) | |||||
vmovdqu64 %zmm18, 64*1(p2ptr){%k2} | |||||
vmovdqu64 %zmm19, 15*8 + 64*0(p2ptr) | |||||
vmovdqu64 %zmm20, 15*8 + 64*1(p2ptr){%k2} | |||||
vmovdqu64 %zmm21, 2*15*8 + 64*0(p2ptr) | |||||
vmovdqu64 %zmm22, 2*15*8 + 64*1(p2ptr){%k2} | |||||
vmovdqu64 %zmm23, 3*15*8 + 64*0(p2ptr) | |||||
vmovdqu64 %zmm24, 3*15*8 + 64*1(p2ptr){%k2} | |||||
ret | |||||
############################################################################### | |||||
# fp_add(felemR, felemA, felemB)
#   felemR (%rdi): destination, felemA (%rsi) and felemB (%rdx): sources of 15
#   qwords each. Adds the operands lane by lane into ACC0a/b, then falls
#   through into fp_normalize, which propagates the carries and stores to
#   %rdi. No modulus is subtracted in this routine.
.globl C_ABI(fp_add) | |||||
.p2align 6 | |||||
C_ABI(fp_add): | |||||
# k2 selects lanes 0-6: the 7 qwords of the high vector
mov $0x7f, %eax | |||||
kmovw %eax, %k2 | |||||
vmovdqu64 64*0(felemA), ACC0a | |||||
vmovdqu64 64*1(felemA), ACC0b{%k2}{z} | |||||
vmovdqu64 64*0(felemB), ACC2a | |||||
vmovdqu64 64*1(felemB), ACC2b{%k2}{z} | |||||
vpaddq ACC2a, ACC0a, ACC0a | |||||
vpaddq ACC2b, ACC0b, ACC0b | |||||
// Fallthrough | |||||
############################################################################### | |||||
// fp_normalize: shared tail of fp_add and fp_sub.
// Input:  ACC0a (8 lanes) and ACC0b (low 7 lanes) holding un-normalised
//         quadwords, %rdi = destination.
// Output: 8 + 7 quadwords stored at %rdi, each ANDed with AND_MASK.
// Clobbers %eax, %ecx, %r8d, %r9d, %k1-%k4, A0a/A0b, ZERO, AND_MASK.
// Not declared .globl; it is reached by fallthrough from fp_add and by a
// jump from fp_sub.
.p2align 6 | |||||
C_ABI(fp_normalize): | |||||
vpbroadcastq .LandMask(%rip), AND_MASK | |||||
vpxorq ZERO, ZERO, ZERO | |||||
# Now 'normalize' the acc to 52 bit words | |||||
// A0x = bits 52 and up of every lane (the per-lane carry out).
vpsrlq $52, ACC0a, A0a | |||||
vpsrlq $52, ACC0b, A0b | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
// Move every carry up by one lane: lane 7 of A0a enters lane 0 of A0b,
// and a zero enters lane 0 of A0a.
valignq $7, A0a, A0b, A0b | |||||
valignq $7, ZERO, A0a, A0a | |||||
vpaddq A0a, ACC0a, ACC0a | |||||
vpaddq A0b, ACC0b, ACC0b | |||||
// k1/k2: lanes where the sum is unsigned-less-than the carry that was added.
// k3/k4: lanes equal to AND_MASK (an incoming carry would ripple through).
// NOTE(review): ACC0x was just computed as (ACC0x & AND_MASK) + A0x, so
// ACC0x < A0x needs a 64-bit wrap-around; if AND_MASK is the 52-bit mask
// that looks unreachable - confirm the intended carry-generate test.
vpcmpuq $1, A0a, ACC0a, %k1 | |||||
vpcmpuq $1, A0b, ACC0b, %k2 | |||||
vpcmpuq $0, AND_MASK, ACC0a, %k3 | |||||
vpcmpuq $0, AND_MASK, ACC0b, %k4 | |||||
kmovb %k1, %eax | |||||
kmovb %k2, %ecx | |||||
kmovb %k3, %r8d | |||||
kmovb %k4, %r9d | |||||
// Resolve the ripple with scalar arithmetic on the 16-bit lane mask held
// as %cl:%al: shift the first mask left by one lane (add/adc), add the
// equal-to-mask bits so a carry runs through consecutive lanes, then xor
// with those bits to obtain the lanes that receive a carry.
add %al, %al | |||||
adc %cl, %cl | |||||
add %r8b, %al | |||||
adc %r9b, %cl | |||||
xor %r8b, %al | |||||
xor %r9b, %cl | |||||
kmovb %eax, %k1 | |||||
kmovb %ecx, %k2 | |||||
// Subtracting AND_MASK and masking again equals adding one modulo
// AND_MASK + 1 in the selected lanes.
vpsubq AND_MASK, ACC0a, ACC0a{%k1} | |||||
vpsubq AND_MASK, ACC0b, ACC0b{%k2} | |||||
vpandq AND_MASK, ACC0a, ACC0a | |||||
vpandq AND_MASK, ACC0b, ACC0b | |||||
// Store 8 + 7 quadwords; the masked store leaves the 16th slot untouched.
mov $0x7f, %eax | |||||
kmovw %eax, %k1 | |||||
vmovdqu64 ACC0a, 64*0(%rdi) | |||||
vmovdqu64 ACC0b, 64*1(%rdi){%k1} | |||||
ret | |||||
############################################################################### | |||||
// fp_sub: lane-wise (felemA + .LpolyX - felemB), finished by fp_normalize.
// Each operand is read as one full 64-byte vector plus a second vector of
// which only the low seven quadwords are loaded (mask 0x7f, other lanes
// zeroed). The .LpolyX constant is added before the subtraction so the
// per-lane differences stay non-negative (presumably .LpolyX is a multiple
// of the modulus - confirm against its definition). The result is stored by
// fp_normalize through %rdi.
.globl C_ABI(fp_sub)
.p2align 6
C_ABI(fp_sub):
// %k2 = 0x7f: low seven quadword lanes of the second vector.
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
vpsubq ACC2a, ACC0a, ACC0a
vpsubq ACC2b, ACC0b, ACC0b
// The label is defined as C_ABI(fp_normalize), so the jump target must go
// through C_ABI() as well; a bare fp_normalize is undefined where C_ABI()
// adds an underscore prefix.
jmp C_ABI(fp_normalize)
@@ -0,0 +1,523 @@ | |||||
// Symbol decoration and visibility directive per platform.
#if defined(__APPLE__) | |||||
/* OS X's C ABI prefixes functions with underscore. */ | |||||
#define C_ABI(x) _ ## x | |||||
#define HIDDEN .private_extern | |||||
#else | |||||
#define C_ABI(x) x | |||||
#define HIDDEN .hidden | |||||
#endif | |||||
// Accumulators: ACCn is %zmmn for n = 0..15.
#define ACC0 %zmm0 | |||||
#define ACC1 %zmm1 | |||||
#define ACC2 %zmm2 | |||||
#define ACC3 %zmm3 | |||||
#define ACC4 %zmm4 | |||||
#define ACC5 %zmm5 | |||||
#define ACC6 %zmm6 | |||||
#define ACC7 %zmm7 | |||||
#define ACC8 %zmm8 | |||||
#define ACC9 %zmm9 | |||||
#define ACC10 %zmm10 | |||||
#define ACC11 %zmm11 | |||||
#define ACC12 %zmm12 | |||||
#define ACC13 %zmm13 | |||||
#define ACC14 %zmm14 | |||||
#define ACC15 %zmm15 | |||||
// Multiplicand rows: An is %zmm(16+n) for n = 0..14.
#define A0 %zmm16 | |||||
#define A1 %zmm17 | |||||
#define A2 %zmm18 | |||||
#define A3 %zmm19 | |||||
#define A4 %zmm20 | |||||
#define A5 %zmm21 | |||||
#define A6 %zmm22 | |||||
#define A7 %zmm23 | |||||
#define A8 %zmm24 | |||||
#define A9 %zmm25 | |||||
#define A10 %zmm26 | |||||
#define A11 %zmm27 | |||||
#define A12 %zmm28 | |||||
#define A13 %zmm29 | |||||
#define A14 %zmm30 | |||||
// B holds the multiplier row of the current iteration and doubles as a
// scratch / permutation-index register.
#define B %zmm31 | |||||
// Pointer arguments of the three-operand entry point (amm_751_ifma_x2).
#define rptr %rdi | |||||
#define aptr %rsi | |||||
#define bptr %rdx | |||||
// Pointer arguments of the six-operand entry point (fp2_mul_ifma_x2).
#define r0ptr %rdi | |||||
#define a0ptr %rsi | |||||
#define b0ptr %rdx | |||||
#define r1ptr %rcx | |||||
#define a1ptr %r8 | |||||
#define b1ptr %r9 | |||||
// Scalar helper: loop counter and source of mask constants.
#define hlp %rax | |||||
// Constant tables, 64-byte aligned.
// .Lmask and .Lpoly share an address: the first quadword of .Lpoly is
// 0x000fffffffffffff, which the code also broadcasts as the 52-bit limb mask.
// .Lpoly: 15 limbs of at most 52 bits each plus a zero pad quadword
// (presumably the field prime in radix 2^52 - confirm against the C side).
.p2align 6 | |||||
.Lmask: | |||||
.Lpoly: | |||||
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff | |||||
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff | |||||
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480 | |||||
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0 | |||||
// .LpolyX: every limb of .Lpoly shifted left by 8 bits; added before a
// lane-wise subtraction in fp2_mul_ifma_x2.
.LpolyX: | |||||
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00 | |||||
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00 | |||||
.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000 | |||||
.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0 | |||||
// Quadword permutation indices used by fp2_mul_ifma_x2 to lay out the
// multiplier rows (.Lperm0) and the multiplicand rows (.Lperm1).
.Lperm0: | |||||
.quad 0,1,0,1,2,3,2,3 | |||||
.Lperm1: | |||||
.quad 4,5,5,4,6,7,7,6 | |||||
// fp2_mul_ifma_x2(res1, a1, b1, res2, a2, b2)
// Arguments arrive as r0ptr, a0ptr, b0ptr, r1ptr, a1ptr, b1ptr (see the
// register macros). Every operand consists of two 15-quadword components
// stored 15*8 bytes apart. The routine transposes the eight input
// components into a one-limb-per-register layout, runs do_mul_x2 on all
// eight lanes, combines the lane products into the two components of each
// result, carries, transposes back and stores two components at r0ptr and
// two at r1ptr.
// Clobbers %rax, %rdx, %k5, %k7 and all of %zmm0-%zmm31; the four other
// pointer registers are advanced by 15*8.
// TODO: avoid transposing every call by keeping data vertical throughout | |||||
// Performs 8 field multiplications in parallel | |||||
.globl C_ABI(fp2_mul_ifma_x2) | |||||
C_ABI(fp2_mul_ifma_x2): | |||||
// Frame: 960 bytes (15 rows of 64 bytes) of 64-byte-aligned scratch that
// later holds the multiplier rows read by do_mul_x2.
push %rbp | |||||
mov %rsp, %rbp | |||||
sub $960, %rsp | |||||
and $-64, %rsp | |||||
// %k5 = 0x7f: low seven quadwords of the second vector of a component.
mov $0x7f, %rax | |||||
kmovq %rax, %k5 | |||||
// Load a0[0] | |||||
vmovdqu64 0*64(a0ptr), %zmm0 | |||||
vmovdqu64 1*64(a0ptr), %zmm1{%k5}{z} | |||||
lea 15*8(a0ptr), a0ptr | |||||
// Load a0[1] | |||||
vmovdqu64 0*64(a0ptr), %zmm2 | |||||
vmovdqu64 1*64(a0ptr), %zmm3{%k5}{z} | |||||
// Load b0[0] | |||||
vmovdqu64 0*64(b0ptr), %zmm4 | |||||
vmovdqu64 1*64(b0ptr), %zmm5{%k5}{z} | |||||
lea 15*8(b0ptr), b0ptr | |||||
// Load b0[1] | |||||
vmovdqu64 0*64(b0ptr), %zmm6 | |||||
vmovdqu64 1*64(b0ptr), %zmm7{%k5}{z} | |||||
// Load a1[0] | |||||
vmovdqu64 0*64(a1ptr), %zmm8 | |||||
vmovdqu64 1*64(a1ptr), %zmm9{%k5}{z} | |||||
lea 15*8(a1ptr), a1ptr | |||||
// Load a1[1] | |||||
vmovdqu64 0*64(a1ptr), %zmm10 | |||||
vmovdqu64 1*64(a1ptr), %zmm11{%k5}{z} | |||||
// Load b1[0] | |||||
vmovdqu64 0*64(b1ptr), %zmm12 | |||||
vmovdqu64 1*64(b1ptr), %zmm13{%k5}{z} | |||||
lea 15*8(b1ptr), b1ptr | |||||
// Load b1[1] | |||||
vmovdqu64 0*64(b1ptr), %zmm14 | |||||
vmovdqu64 1*64(b1ptr), %zmm15{%k5}{z} | |||||
// Transpose | |||||
// Stage 1: interleave the two components of each operand quadword-wise.
// The trailing numbers on each line are the limb indices per lane.
vpunpcklqdq %zmm2, %zmm0, %zmm16 // 0 0 2 2 4 4 6 6 | |||||
vpunpckhqdq %zmm2, %zmm0, %zmm17 // 1 1 3 3 5 5 7 7 | |||||
vpunpcklqdq %zmm6, %zmm4, %zmm18 // 0 0 2 2 4 4 6 6 | |||||
vpunpckhqdq %zmm6, %zmm4, %zmm19 // 1 1 3 3 5 5 7 7 | |||||
vpunpcklqdq %zmm10, %zmm8, %zmm20 // 0 0 2 2 4 4 6 6 | |||||
vpunpckhqdq %zmm10, %zmm8, %zmm21 // 1 1 3 3 5 5 7 7 | |||||
vpunpcklqdq %zmm14, %zmm12, %zmm22 // 0 0 2 2 4 4 6 6 | |||||
vpunpckhqdq %zmm14, %zmm12, %zmm23 // 1 1 3 3 5 5 7 7 | |||||
vpunpcklqdq %zmm3, %zmm1, %zmm24 // 8 8 10 10 12 12 14 14 | |||||
vpunpckhqdq %zmm3, %zmm1, %zmm25 // 9 9 11 11 13 13 15 15 | |||||
vpunpcklqdq %zmm7, %zmm5, %zmm26 // 8 8 10 10 12 12 14 14 | |||||
vpunpckhqdq %zmm7, %zmm5, %zmm27 // 9 9 11 11 13 13 15 15 | |||||
vpunpcklqdq %zmm11, %zmm9, %zmm28 // 8 8 10 10 12 12 14 14 | |||||
vpunpckhqdq %zmm11, %zmm9, %zmm29 // 9 9 11 11 13 13 15 15 | |||||
vpunpcklqdq %zmm15, %zmm13, %zmm30 // 8 8 10 10 12 12 14 14 | |||||
vpunpckhqdq %zmm15, %zmm13, %zmm31 // 9 9 11 11 13 13 15 15 | |||||
// Stage 2: merge the first and second operand pair, 128 bits at a time.
vshufi64x2 $0x44, %zmm20, %zmm16, %zmm0 // 0 0 2 2 0 0 2 2 | |||||
vshufi64x2 $0x44, %zmm22, %zmm18, %zmm1 // 0 0 2 2 0 0 2 2 | |||||
vshufi64x2 $0xee, %zmm20, %zmm16, %zmm2 // 4 4 6 6 4 4 6 6 | |||||
vshufi64x2 $0xee, %zmm22, %zmm18, %zmm3 // 4 4 6 6 4 4 6 6 | |||||
vshufi64x2 $0x44, %zmm21, %zmm17, %zmm4 // 1 1 3 3 1 1 3 3 | |||||
vshufi64x2 $0x44, %zmm23, %zmm19, %zmm5 // 1 1 3 3 1 1 3 3 | |||||
vshufi64x2 $0xee, %zmm21, %zmm17, %zmm6 // 5 5 7 7 5 5 7 7 | |||||
vshufi64x2 $0xee, %zmm23, %zmm19, %zmm7 // 5 5 7 7 5 5 7 7 | |||||
vshufi64x2 $0x44, %zmm28, %zmm24, %zmm8 // 8 8 10 10 8 8 10 10 | |||||
vshufi64x2 $0x44, %zmm30, %zmm26, %zmm9 // 8 8 10 10 8 8 10 10 | |||||
vshufi64x2 $0xee, %zmm28, %zmm24, %zmm10 // 12 12 14 14 12 12 14 14 | |||||
vshufi64x2 $0xee, %zmm30, %zmm26, %zmm11 // 12 12 14 14 12 12 14 14 | |||||
vshufi64x2 $0x44, %zmm29, %zmm25, %zmm12 // 9 9 11 11 9 9 11 11 | |||||
vshufi64x2 $0x44, %zmm31, %zmm27, %zmm13 // 9 9 11 11 9 9 11 11 | |||||
vshufi64x2 $0xee, %zmm29, %zmm25, %zmm14 // 13 13 15 15 13 13 15 15 | |||||
vshufi64x2 $0xee, %zmm31, %zmm27, %zmm15 // 13 13 15 15 13 13 15 15 | |||||
// Stage 3: one limb index per register, %zmm16..%zmm30 (15 registers; the
// sixteenth combination is not formed).
vshufi64x2 $0x88, %zmm1, %zmm0, %zmm16 //0 | |||||
vshufi64x2 $0x88, %zmm5, %zmm4, %zmm17 //1 | |||||
vshufi64x2 $0xdd, %zmm1, %zmm0, %zmm18 // | |||||
vshufi64x2 $0xdd, %zmm5, %zmm4, %zmm19 | |||||
vshufi64x2 $0x88, %zmm3, %zmm2, %zmm20 | |||||
vshufi64x2 $0x88, %zmm7, %zmm6, %zmm21 | |||||
vshufi64x2 $0xdd, %zmm3, %zmm2, %zmm22 | |||||
vshufi64x2 $0xdd, %zmm7, %zmm6, %zmm23 | |||||
vshufi64x2 $0x88, %zmm9, %zmm8, %zmm24 | |||||
vshufi64x2 $0x88, %zmm13, %zmm12, %zmm25 | |||||
vshufi64x2 $0xdd, %zmm9, %zmm8, %zmm26 | |||||
vshufi64x2 $0xdd, %zmm13, %zmm12, %zmm27 | |||||
vshufi64x2 $0x88, %zmm11, %zmm10, %zmm28 | |||||
vshufi64x2 $0x88, %zmm15, %zmm14, %zmm29 | |||||
vshufi64x2 $0xdd, %zmm11, %zmm10, %zmm30 | |||||
// Permute each limb register with .Lperm0 and spill the 15 results to the
// stack scratch; they are the rows do_mul_x2 reads through bptr.
vmovdqa64 .Lperm0(%rip), %zmm31 | |||||
vpermq %zmm16, %zmm31, %zmm0 | |||||
vpermq %zmm17, %zmm31, %zmm1 | |||||
vpermq %zmm18, %zmm31, %zmm2 | |||||
vpermq %zmm19, %zmm31, %zmm3 | |||||
vpermq %zmm20, %zmm31, %zmm4 | |||||
vpermq %zmm21, %zmm31, %zmm5 | |||||
vpermq %zmm22, %zmm31, %zmm6 | |||||
vpermq %zmm23, %zmm31, %zmm7 | |||||
vpermq %zmm24, %zmm31, %zmm8 | |||||
vpermq %zmm25, %zmm31, %zmm9 | |||||
vpermq %zmm26, %zmm31, %zmm10 | |||||
vpermq %zmm27, %zmm31, %zmm11 | |||||
vpermq %zmm28, %zmm31, %zmm12 | |||||
vpermq %zmm29, %zmm31, %zmm13 | |||||
vpermq %zmm30, %zmm31, %zmm14 | |||||
.irp r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 | |||||
vmovdqu64 %zmm\r, \r*64(%rsp) | |||||
.endr | |||||
// Permute the same limb registers in place with .Lperm1; A0..A14 are the
// other multiplication operand expected by do_mul_x2.
vmovdqa64 .Lperm1(%rip), %zmm31 | |||||
vpermq %zmm16, %zmm31, A0 | |||||
vpermq %zmm17, %zmm31, A1 | |||||
vpermq %zmm18, %zmm31, A2 | |||||
vpermq %zmm19, %zmm31, A3 | |||||
vpermq %zmm20, %zmm31, A4 | |||||
vpermq %zmm21, %zmm31, A5 | |||||
vpermq %zmm22, %zmm31, A6 | |||||
vpermq %zmm23, %zmm31, A7 | |||||
vpermq %zmm24, %zmm31, A8 | |||||
vpermq %zmm25, %zmm31, A9 | |||||
vpermq %zmm26, %zmm31, A10 | |||||
vpermq %zmm27, %zmm31, A11 | |||||
vpermq %zmm28, %zmm31, A12 | |||||
vpermq %zmm29, %zmm31, A13 | |||||
vpermq %zmm30, %zmm31, A14 | |||||
// bptr (%rdx) now points at the spilled rows; the products come back in
// ACC0..ACC14.
lea (%rsp), bptr | |||||
call do_mul_x2 | |||||
// After parallel multiplication the layout is: | |||||
// A0[0] * B0[0], A0[1] * B0[1], A0[0] * B0[1], A0[1] * B0[0], A1[0] * B1[0], A1[1] * B1[1], A1[0] * B1[1], A1[1] * B1[0] | |||||
// We need to compute: | |||||
// A0[0] * B0[0] - A0[1] * B0[1], A0[0] * B0[1] + A0[1] * B0[0], A1[0] * B1[0] - A1[1] * B1[1], A1[0] * B1[1] + A1[1] * B1[0] | |||||
// An = ACCn with every odd lane moved down onto its even neighbour
// (byte shift by 8 inside each 128-bit lane).
vpsrldq $8, ACC0, A0 | |||||
vpsrldq $8, ACC1, A1 | |||||
vpsrldq $8, ACC2, A2 | |||||
vpsrldq $8, ACC3, A3 | |||||
vpsrldq $8, ACC4, A4 | |||||
vpsrldq $8, ACC5, A5 | |||||
vpsrldq $8, ACC6, A6 | |||||
vpsrldq $8, ACC7, A7 | |||||
vpsrldq $8, ACC8, A8 | |||||
vpsrldq $8, ACC9, A9 | |||||
vpsrldq $8, ACC10, A10 | |||||
vpsrldq $8, ACC11, A11 | |||||
vpsrldq $8, ACC12, A12 | |||||
vpsrldq $8, ACC13, A13 | |||||
vpsrldq $8, ACC14, A14 | |||||
// %k7 = 0x44 (lanes 2 and 6): sum of the two cross products.
mov $0x44, hlp | |||||
kmovq hlp, %k7 | |||||
vpaddq A0, ACC0, ACC0{%k7} | |||||
vpaddq A1, ACC1, ACC1{%k7} | |||||
vpaddq A2, ACC2, ACC2{%k7} | |||||
vpaddq A3, ACC3, ACC3{%k7} | |||||
vpaddq A4, ACC4, ACC4{%k7} | |||||
vpaddq A5, ACC5, ACC5{%k7} | |||||
vpaddq A6, ACC6, ACC6{%k7} | |||||
vpaddq A7, ACC7, ACC7{%k7} | |||||
vpaddq A8, ACC8, ACC8{%k7} | |||||
vpaddq A9, ACC9, ACC9{%k7} | |||||
vpaddq A10, ACC10, ACC10{%k7} | |||||
vpaddq A11, ACC11, ACC11{%k7} | |||||
vpaddq A12, ACC12, ACC12{%k7} | |||||
vpaddq A13, ACC13, ACC13{%k7} | |||||
vpaddq A14, ACC14, ACC14{%k7} | |||||
// %k7 = 0x11 (lanes 0 and 4): add limb n of .LpolyX first, then subtract
// the neighbouring product, so the per-limb difference is taken after the
// offset has been added.
mov $0x11, hlp | |||||
kmovq hlp, %k7 | |||||
vpaddq 0*8+.LpolyX(%rip){1to8}, ACC0, ACC0{%k7} | |||||
vpaddq 1*8+.LpolyX(%rip){1to8}, ACC1, ACC1{%k7} | |||||
vpaddq 2*8+.LpolyX(%rip){1to8}, ACC2, ACC2{%k7} | |||||
vpaddq 3*8+.LpolyX(%rip){1to8}, ACC3, ACC3{%k7} | |||||
vpaddq 4*8+.LpolyX(%rip){1to8}, ACC4, ACC4{%k7} | |||||
vpaddq 5*8+.LpolyX(%rip){1to8}, ACC5, ACC5{%k7} | |||||
vpaddq 6*8+.LpolyX(%rip){1to8}, ACC6, ACC6{%k7} | |||||
vpaddq 7*8+.LpolyX(%rip){1to8}, ACC7, ACC7{%k7} | |||||
vpaddq 8*8+.LpolyX(%rip){1to8}, ACC8, ACC8{%k7} | |||||
vpaddq 9*8+.LpolyX(%rip){1to8}, ACC9, ACC9{%k7} | |||||
vpaddq 10*8+.LpolyX(%rip){1to8}, ACC10, ACC10{%k7} | |||||
vpaddq 11*8+.LpolyX(%rip){1to8}, ACC11, ACC11{%k7} | |||||
vpaddq 12*8+.LpolyX(%rip){1to8}, ACC12, ACC12{%k7} | |||||
vpaddq 13*8+.LpolyX(%rip){1to8}, ACC13, ACC13{%k7} | |||||
vpaddq 14*8+.LpolyX(%rip){1to8}, ACC14, ACC14{%k7} | |||||
vpsubq A0, ACC0, ACC0{%k7} | |||||
vpsubq A1, ACC1, ACC1{%k7} | |||||
vpsubq A2, ACC2, ACC2{%k7} | |||||
vpsubq A3, ACC3, ACC3{%k7} | |||||
vpsubq A4, ACC4, ACC4{%k7} | |||||
vpsubq A5, ACC5, ACC5{%k7} | |||||
vpsubq A6, ACC6, ACC6{%k7} | |||||
vpsubq A7, ACC7, ACC7{%k7} | |||||
vpsubq A8, ACC8, ACC8{%k7} | |||||
vpsubq A9, ACC9, ACC9{%k7} | |||||
vpsubq A10, ACC10, ACC10{%k7} | |||||
vpsubq A11, ACC11, ACC11{%k7} | |||||
vpsubq A12, ACC12, ACC12{%k7} | |||||
vpsubq A13, ACC13, ACC13{%k7} | |||||
vpsubq A14, ACC14, ACC14{%k7} | |||||
// Sequential carry propagation: bits 52 and up of limb n are added to limb
// n+1, then limb n is masked with the first quadword of .Lpoly
// (0x000fffffffffffff). The last limb is only masked.
vpsrlq $52, ACC0, B | |||||
vpaddq B, ACC1, ACC1 | |||||
vpandq .Lpoly(%rip){1to8}, ACC0, ACC0 | |||||
vpsrlq $52, ACC1, B | |||||
vpaddq B, ACC2, ACC2 | |||||
vpandq .Lpoly(%rip){1to8}, ACC1, ACC1 | |||||
vpsrlq $52, ACC2, B | |||||
vpaddq B, ACC3, ACC3 | |||||
vpandq .Lpoly(%rip){1to8}, ACC2, ACC2 | |||||
vpsrlq $52, ACC3, B | |||||
vpaddq B, ACC4, ACC4 | |||||
vpandq .Lpoly(%rip){1to8}, ACC3, ACC3 | |||||
vpsrlq $52, ACC4, B | |||||
vpaddq B, ACC5, ACC5 | |||||
vpandq .Lpoly(%rip){1to8}, ACC4, ACC4 | |||||
vpsrlq $52, ACC5, B | |||||
vpaddq B, ACC6, ACC6 | |||||
vpandq .Lpoly(%rip){1to8}, ACC5, ACC5 | |||||
vpsrlq $52, ACC6, B | |||||
vpaddq B, ACC7, ACC7 | |||||
vpandq .Lpoly(%rip){1to8}, ACC6, ACC6 | |||||
vpsrlq $52, ACC7, B | |||||
vpaddq B, ACC8, ACC8 | |||||
vpandq .Lpoly(%rip){1to8}, ACC7, ACC7 | |||||
vpsrlq $52, ACC8, B | |||||
vpaddq B, ACC9, ACC9 | |||||
vpandq .Lpoly(%rip){1to8}, ACC8, ACC8 | |||||
vpsrlq $52, ACC9, B | |||||
vpaddq B, ACC10, ACC10 | |||||
vpandq .Lpoly(%rip){1to8}, ACC9, ACC9 | |||||
vpsrlq $52, ACC10, B | |||||
vpaddq B, ACC11, ACC11 | |||||
vpandq .Lpoly(%rip){1to8}, ACC10, ACC10 | |||||
vpsrlq $52, ACC11, B | |||||
vpaddq B, ACC12, ACC12 | |||||
vpandq .Lpoly(%rip){1to8}, ACC11, ACC11 | |||||
vpsrlq $52, ACC12, B | |||||
vpaddq B, ACC13, ACC13 | |||||
vpandq .Lpoly(%rip){1to8}, ACC12, ACC12 | |||||
vpsrlq $52, ACC13, B | |||||
vpaddq B, ACC14, ACC14 | |||||
vpandq .Lpoly(%rip){1to8}, ACC13, ACC13 | |||||
vpandq .Lpoly(%rip){1to8}, ACC14, ACC14 | |||||
// Transpose to horizontal | |||||
// Only the even lanes (0, 2, 4, 6) carry results; vpunpcklqdq pairs the
// even lanes of two consecutive limbs, the shuffles then gather each
// component into consecutive quadwords.
vpunpcklqdq ACC1, ACC0, ACC0 | |||||
vpunpcklqdq ACC3, ACC2, ACC1 | |||||
vpunpcklqdq ACC5, ACC4, ACC2 | |||||
vpunpcklqdq ACC7, ACC6, ACC3 | |||||
vpunpcklqdq ACC9, ACC8, ACC4 | |||||
vpunpcklqdq ACC11, ACC10, ACC5 | |||||
vpunpcklqdq ACC13, ACC12, ACC6 | |||||
vmovdqa64 ACC14, ACC7 | |||||
vshufi64x2 $0x44, ACC1, ACC0, A0 | |||||
vshufi64x2 $0x44, ACC3, ACC2, A1 | |||||
vshufi64x2 $0x44, ACC5, ACC4, A2 | |||||
vshufi64x2 $0x44, ACC7, ACC6, A3 | |||||
vshufi64x2 $0xee, ACC1, ACC0, A4 | |||||
vshufi64x2 $0xee, ACC3, ACC2, A5 | |||||
vshufi64x2 $0xee, ACC5, ACC4, A6 | |||||
vshufi64x2 $0xee, ACC7, ACC6, A7 | |||||
vshufi64x2 $0x88, A1, A0, ACC0 | |||||
vshufi64x2 $0x88, A3, A2, ACC1 | |||||
vshufi64x2 $0xdd, A1, A0, ACC2 | |||||
vshufi64x2 $0xdd, A3, A2, ACC3 | |||||
vshufi64x2 $0x88, A5, A4, ACC4 | |||||
vshufi64x2 $0x88, A7, A6, ACC5 | |||||
vshufi64x2 $0xdd, A5, A4, ACC6 | |||||
vshufi64x2 $0xdd, A7, A6, ACC7 | |||||
// Store both components of each result: 8 quadwords plus 7 under %k5,
// components 15*8 bytes apart.
vmovdqu64 ACC0, 0*64(r0ptr) | |||||
vmovdqu64 ACC1, 1*64(r0ptr){%k5} | |||||
lea 15*8(r0ptr), r0ptr | |||||
vmovdqu64 ACC2, 0*64(r0ptr) | |||||
vmovdqu64 ACC3, 1*64(r0ptr){%k5} | |||||
vmovdqu64 ACC4, 0*64(r1ptr) | |||||
vmovdqu64 ACC5, 1*64(r1ptr){%k5} | |||||
lea 15*8(r1ptr), r1ptr | |||||
vmovdqu64 ACC6, 0*64(r1ptr) | |||||
vmovdqu64 ACC7, 1*64(r1ptr){%k5} | |||||
// Release the aligned scratch area and restore the caller frame.
mov %rbp, %rsp | |||||
pop %rbp | |||||
ret | |||||
// amm_751_ifma_x2 / do_mul_x2
// amm_751_ifma_x2 loads 15 rows of 64 bytes from aptr into A0..A14 and
// falls into do_mul_x2. do_mul_x2 is also called directly by
// fp2_mul_ifma_x2 with A0..A14 already in registers.
// do_mul_x2 input:  A0..A14 (one limb per register, eight independent
//                   lanes), bptr = 15 rows of 64 bytes for the other operand.
// do_mul_x2 output: ACC0..ACC14, left in registers; nothing is stored
//                   through rptr in this code.
// Clobbers hlp (%rax), bptr (advanced by 15*64), B and ACC15.
// Performs 8 field multiplications in parallel | |||||
.globl C_ABI(amm_751_ifma_x2) | |||||
C_ABI(amm_751_ifma_x2): | |||||
vmovdqu64 0*64(aptr), A0 | |||||
vmovdqu64 1*64(aptr), A1 | |||||
vmovdqu64 2*64(aptr), A2 | |||||
vmovdqu64 3*64(aptr), A3 | |||||
vmovdqu64 4*64(aptr), A4 | |||||
vmovdqu64 5*64(aptr), A5 | |||||
vmovdqu64 6*64(aptr), A6 | |||||
vmovdqu64 7*64(aptr), A7 | |||||
vmovdqu64 8*64(aptr), A8 | |||||
vmovdqu64 9*64(aptr), A9 | |||||
vmovdqu64 10*64(aptr), A10 | |||||
vmovdqu64 11*64(aptr), A11 | |||||
vmovdqu64 12*64(aptr), A12 | |||||
vmovdqu64 13*64(aptr), A13 | |||||
vmovdqu64 14*64(aptr), A14 | |||||
do_mul_x2: | |||||
// Clear all sixteen accumulators.
vpxorq ACC0, ACC0, ACC0 | |||||
vpxorq ACC1, ACC1, ACC1 | |||||
vpxorq ACC2, ACC2, ACC2 | |||||
vpxorq ACC3, ACC3, ACC3 | |||||
vpxorq ACC4, ACC4, ACC4 | |||||
vpxorq ACC5, ACC5, ACC5 | |||||
vpxorq ACC6, ACC6, ACC6 | |||||
vpxorq ACC7, ACC7, ACC7 | |||||
vpxorq ACC8, ACC8, ACC8 | |||||
vpxorq ACC9, ACC9, ACC9 | |||||
vpxorq ACC10, ACC10, ACC10 | |||||
vpxorq ACC11, ACC11, ACC11 | |||||
vpxorq ACC12, ACC12, ACC12 | |||||
vpxorq ACC13, ACC13, ACC13 | |||||
vpxorq ACC14, ACC14, ACC14 | |||||
vpxorq ACC15, ACC15, ACC15 | |||||
// One iteration per row of the bptr operand (15 rows).
mov $15, hlp | |||||
1: | |||||
vmovdqu64 (bptr), B | |||||
lea 1*64(bptr), bptr | |||||
// Low 52 bits of A[j] * B accumulate into ACC[j] ...
vpmadd52luq A0, B, ACC0 | |||||
vpmadd52luq A1, B, ACC1 | |||||
vpmadd52luq A2, B, ACC2 | |||||
vpmadd52luq A3, B, ACC3 | |||||
vpmadd52luq A4, B, ACC4 | |||||
vpmadd52luq A5, B, ACC5 | |||||
vpmadd52luq A6, B, ACC6 | |||||
vpmadd52luq A7, B, ACC7 | |||||
vpmadd52luq A8, B, ACC8 | |||||
vpmadd52luq A9, B, ACC9 | |||||
vpmadd52luq A10, B, ACC10 | |||||
vpmadd52luq A11, B, ACC11 | |||||
vpmadd52luq A12, B, ACC12 | |||||
vpmadd52luq A13, B, ACC13 | |||||
vpmadd52luq A14, B, ACC14 | |||||
// ... and the high 52 bits into ACC[j+1].
vpmadd52huq A0, B, ACC1 | |||||
vpmadd52huq A1, B, ACC2 | |||||
vpmadd52huq A2, B, ACC3 | |||||
vpmadd52huq A3, B, ACC4 | |||||
vpmadd52huq A4, B, ACC5 | |||||
vpmadd52huq A5, B, ACC6 | |||||
vpmadd52huq A6, B, ACC7 | |||||
vpmadd52huq A7, B, ACC8 | |||||
vpmadd52huq A8, B, ACC9 | |||||
vpmadd52huq A9, B, ACC10 | |||||
vpmadd52huq A10, B, ACC11 | |||||
vpmadd52huq A11, B, ACC12 | |||||
vpmadd52huq A12, B, ACC13 | |||||
vpmadd52huq A13, B, ACC14 | |||||
vpmadd52huq A14, B, ACC15 | |||||
// Reduction step: B takes a copy of ACC0 and is multiplied by every limb
// of .Lpoly. The low products are added in while the accumulators are
// shifted down by one limb (ACC[j+1] -> ACC[j]); what is left of ACC0
// above bit 52 is folded into the new ACC0.
// NOTE(review): this looks like a word-serial Montgomery reduction that
// takes the quotient digit directly from ACC0 - confirm.
vmovdqa64 ACC0, B | |||||
vpmadd52luq 0*8 + .Lpoly(%rip){1to8}, B, ACC0 | |||||
vpsrlq $52, ACC0, ACC0 | |||||
vpmadd52luq 1*8 + .Lpoly(%rip){1to8}, B, ACC1 | |||||
vpaddq ACC1, ACC0, ACC0 | |||||
vpmadd52luq 2*8 + .Lpoly(%rip){1to8}, B, ACC2 | |||||
vmovdqa64 ACC2, ACC1 | |||||
vpmadd52luq 3*8 + .Lpoly(%rip){1to8}, B, ACC3 | |||||
vmovdqa64 ACC3, ACC2 | |||||
vpmadd52luq 4*8 + .Lpoly(%rip){1to8}, B, ACC4 | |||||
vmovdqa64 ACC4, ACC3 | |||||
vpmadd52luq 5*8 + .Lpoly(%rip){1to8}, B, ACC5 | |||||
vmovdqa64 ACC5, ACC4 | |||||
vpmadd52luq 6*8 + .Lpoly(%rip){1to8}, B, ACC6 | |||||
vmovdqa64 ACC6, ACC5 | |||||
vpmadd52luq 7*8 + .Lpoly(%rip){1to8}, B, ACC7 | |||||
vmovdqa64 ACC7, ACC6 | |||||
vpmadd52luq 8*8 + .Lpoly(%rip){1to8}, B, ACC8 | |||||
vmovdqa64 ACC8, ACC7 | |||||
vpmadd52luq 9*8 + .Lpoly(%rip){1to8}, B, ACC9 | |||||
vmovdqa64 ACC9, ACC8 | |||||
vpmadd52luq 10*8 + .Lpoly(%rip){1to8}, B, ACC10 | |||||
vmovdqa64 ACC10, ACC9 | |||||
vpmadd52luq 11*8 + .Lpoly(%rip){1to8}, B, ACC11 | |||||
vmovdqa64 ACC11, ACC10 | |||||
vpmadd52luq 12*8 + .Lpoly(%rip){1to8}, B, ACC12 | |||||
vmovdqa64 ACC12, ACC11 | |||||
vpmadd52luq 13*8 + .Lpoly(%rip){1to8}, B, ACC13 | |||||
vmovdqa64 ACC13, ACC12 | |||||
vpmadd52luq 14*8 + .Lpoly(%rip){1to8}, B, ACC14 | |||||
vmovdqa64 ACC14, ACC13 | |||||
vmovdqa64 ACC15, ACC14 | |||||
vpxorq ACC15, ACC15, ACC15 | |||||
// High halves of B * .Lpoly[j] go to the already shifted ACC[j].
vpmadd52huq 0*8 + .Lpoly(%rip){1to8}, B, ACC0 | |||||
vpmadd52huq 1*8 + .Lpoly(%rip){1to8}, B, ACC1 | |||||
vpmadd52huq 2*8 + .Lpoly(%rip){1to8}, B, ACC2 | |||||
vpmadd52huq 3*8 + .Lpoly(%rip){1to8}, B, ACC3 | |||||
vpmadd52huq 4*8 + .Lpoly(%rip){1to8}, B, ACC4 | |||||
vpmadd52huq 5*8 + .Lpoly(%rip){1to8}, B, ACC5 | |||||
vpmadd52huq 6*8 + .Lpoly(%rip){1to8}, B, ACC6 | |||||
vpmadd52huq 7*8 + .Lpoly(%rip){1to8}, B, ACC7 | |||||
vpmadd52huq 8*8 + .Lpoly(%rip){1to8}, B, ACC8 | |||||
vpmadd52huq 9*8 + .Lpoly(%rip){1to8}, B, ACC9 | |||||
vpmadd52huq 10*8 + .Lpoly(%rip){1to8}, B, ACC10 | |||||
vpmadd52huq 11*8 + .Lpoly(%rip){1to8}, B, ACC11 | |||||
vpmadd52huq 12*8 + .Lpoly(%rip){1to8}, B, ACC12 | |||||
vpmadd52huq 13*8 + .Lpoly(%rip){1to8}, B, ACC13 | |||||
vpmadd52huq 14*8 + .Lpoly(%rip){1to8}, B, ACC14 | |||||
dec hlp | |||||
jnz 1b | |||||
ret | |||||
@@ -0,0 +1,268 @@ | |||||
// Symbol decoration and visibility directive per platform: C_ABI(x) yields
// the assembler-level name of the C function x.
#if defined(__APPLE__) | |||||
/* OS X's C ABI prefixes functions with underscore. */ | |||||
#define C_ABI(x) _ ## x | |||||
#define HIDDEN .private_extern | |||||
#else | |||||
#define C_ABI(x) x | |||||
#define HIDDEN .hidden | |||||
#endif | |||||
// Constant tables used by norm2red and fp_mul_ifma.
.p2align 6 | |||||
// 16-bit word indices for vpermw in norm2red: four words (64 bits) per
// output quadword, starting at word 0, 3, 6, 9, 13, 16, 19, 22.
.LpermMask0: | |||||
.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25 | |||||
// Per-quadword right-shift counts applied after the word gather.
.LshiftMask0: | |||||
.quad 0,4,8,12,0,4,8,12 | |||||
// 52-bit limb mask (2^52 - 1).
.LandMask: | |||||
.quad 0xfffffffffffff | |||||
.p2align 6 | |||||
// Modulus limbs read by fp_mul_ifma: 15 limbs of at most 52 bits plus a
// zero pad quadword.
.Lpoly: | |||||
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff | |||||
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff | |||||
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480 | |||||
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0 | |||||
// Second operand used by to_mont_ifma (presumably R^2 mod p in 52-bit
// limbs - TODO confirm).
.LR2: | |||||
.quad 0x000dad40589641fd, 0x000452a233046449, 0x000edb010161a696, 0x00036941472e3fd8 | |||||
.quad 0x000e2082a2e7065e, 0x000904f8751f40bf, 0x0007fc814932cca8, 0x00033f174b08b2ee | |||||
.quad 0x0009814efb9f1375, 0x00099594a1afe512, 0x00043c75310de66d, 0x000197021a5b37b0 | |||||
.quad 0x000cc1a272e73959, 0x000a733d7c97cd76, 0x0000000000292ee8, 0 | |||||
// The value 1 as 16 quadwords; second operand used by from_mont_ifma.
.Lone: | |||||
.quad 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 | |||||
// norm2red(res = %rdi, a = %rsi)
// Repacks a little-endian bit string into 52-bit limbs, one limb per
// quadword. 96 bytes are read from a (64 bytes at offset 0 and 44 bytes at
// offset 52); 15 quadwords are written to res.
// For every output limb vpermw gathers the four 16-bit words that contain
// it (.LpermMask0), vpsrlvq shifts the limb down to bit 0 (.LshiftMask0) and
// the result is masked with .LandMask.
// Clobbers %eax, %k1, %k2, %zmm0-%zmm3 and %zmm10.
.globl C_ABI(norm2red)
.p2align 6
C_ABI(norm2red):
// %k1 selects 22 words (44 bytes) so the second load stops at byte 96.
mov $0x3FFFFF, %eax
kmovd %eax, %k1
// %k2 selects the seven quadwords stored from the second vector.
mov $0x7F, %eax
kmovd %eax, %k2
vmovdqa64 .LpermMask0(%rip), %zmm0
vmovdqa64 .LshiftMask0(%rip), %zmm1
vpbroadcastq .LandMask(%rip), %zmm10
// Eight limbs consume 8 * 52 bits = 52 bytes, hence the 52-byte stride.
vpermw 52*0(%rsi), %zmm0, %zmm2
vmovdqu16 52*1(%rsi), %zmm3{%k1}{z}
vpermw %zmm3, %zmm0, %zmm3
vpsrlvq %zmm1, %zmm2, %zmm2
vpsrlvq %zmm1, %zmm3, %zmm3
vpandq %zmm10, %zmm2, %zmm2
vpandq %zmm10, %zmm3, %zmm3
vmovdqu64 %zmm2, 64*0(%rdi)
vmovdqu64 %zmm3, 64*1(%rdi){%k2}
ret
// Register assignment for fp_mul_ifma and its two wrappers.
// Pointer arguments as they arrive from C.
#define res %rdi // uint64_t *rp, | |||||
#define a0 %rsi // const uint64_t *ap, | |||||
#define bpi %rdx // const uint64_t *bptr, | |||||
// Scalar working registers; t1 and t2 are callee-saved and are pushed and
// popped by fp_mul_ifma.
#define m0 %rcx | |||||
#define b_ptr %rax | |||||
#define acc0 %r9 | |||||
#define itr %r10 | |||||
#define t0 %r11 | |||||
#define t1 %r12 | |||||
#define t2 %r13 | |||||
// Vector registers: operand A (A0/A1), modulus (M0/M1), accumulators
// (ACC0/ACC1 with the per-iteration partial sums ACC0b/ACC1b), the broadcast
// multipliers of the current iteration (B_curr, Y_curr), scratch and
// constants.
#define A0 %zmm0 | |||||
#define A1 %zmm1 | |||||
#define M0 %zmm2 | |||||
#define M1 %zmm3 | |||||
#define ACC0 %zmm4 | |||||
#define ACC0_xmm %xmm4 | |||||
#define ACC1 %zmm5 | |||||
#define Y_curr %zmm6 | |||||
#define Y_prev %zmm7 | |||||
#define B_curr %zmm8 | |||||
#define B_prev %zmm9 | |||||
#define TMP %zmm10 | |||||
#define TMP_xmm %xmm10 | |||||
#define ZERO %zmm11 | |||||
#define AND_MASK %zmm12 | |||||
#define ACC0b %zmm13 | |||||
#define ACC1b %zmm14 | |||||
############################################################################### | |||||
// to_mont_ifma(rp, ap): points the third argument register at .LR2 and
// tail-jumps to fp_mul_ifma, i.e. stores fp_mul_ifma(ap, .LR2) at rp.
.globl C_ABI(to_mont_ifma) | |||||
.p2align 6 | |||||
C_ABI(to_mont_ifma): | |||||
leaq .LR2(%rip), bpi | |||||
jmp C_ABI(fp_mul_ifma) | |||||
############################################################################### | |||||
// from_mont_ifma(rp, ap): points the third argument register at .Lone and
// tail-jumps to fp_mul_ifma, i.e. stores fp_mul_ifma(ap, 1) at rp.
.globl C_ABI(from_mont_ifma) | |||||
.p2align 6 | |||||
C_ABI(from_mont_ifma): | |||||
leaq .Lone(%rip), bpi | |||||
jmp C_ABI(fp_mul_ifma) | |||||
############################################################################### | |||||
// fp_mul_ifma(rp = res, ap = a0, bp = bpi)
// Multiplies the 15-limb operands at ap and bp with an interleaved
// reduction by .Lpoly and stores 15 limbs of 52 bits at rp.
// Limb 0 of A and of the modulus is handled in scalar registers (mulx);
// limbs 1..14 live in two vectors each (8 + 6 lanes).
// Clobbers %rax, %rcx, %rdx, %rsi, %r8-%r11, %k1, %k2 and %zmm0-%zmm14;
// %rbx, %r12 and %r13 are saved and restored.
.globl C_ABI(fp_mul_ifma)
.p2align 6
C_ABI(fp_mul_ifma):
push %rbx
push %r12
push %r13
// %rdx is the implicit mulx source, so the B pointer moves to b_ptr.
mov bpi, b_ptr
// %k1 = lane 0 only, %k2 = low six lanes (limbs 9..14).
mov $1, t0
mov $0x3f, t1
kmovq t0, %k1
kmovq t1, %k2
vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO
# Load operands A into registers. A[0] is stored in ALU register, in order to compensate for the latency of IFMA when computing (A*B)[0] * K0
vmovdqu64 8*1+64*0(a0), A0
vmovdqu64 8*1+64*1(a0), A1{%k2}{z}
mov 8*0(a0), a0
# Load the modulii
mov .Lpoly(%rip), m0
vmovdqu64 8*1+64*0+.Lpoly(%rip), M0
vmovdqu64 8*1+64*1+.Lpoly(%rip), M1{%k2}{z}
# Prepare the accumulators
vpxorq ACC0, ACC0, ACC0
vpxorq ACC1, ACC1, ACC1
vpxorq B_curr, B_curr, B_curr
vpxorq Y_curr, Y_curr, Y_curr
xor acc0, acc0
// One iteration per limb of B.
mov $15, itr
1:
vpxorq ACC0b, ACC0b, ACC0b
vpxorq ACC1b, ACC1b, ACC1b
# High multiplications
// These use B_curr / Y_curr of the previous iteration (zero on the first).
vpmadd52huq B_curr, A0, ACC0b
vpmadd52huq B_curr, A1, ACC1b
vpmadd52huq Y_curr, M0, ACC0b
vpmadd52huq Y_curr, M1, ACC1b
# Shift the ACC in zmms right by a word
valignq $1, ACC0, ACC1, ACC0
valignq $1, ACC1, ZERO, ACC1
// Scalar lane: acc0 += A[0] * B[i]; the low 52 bits of acc0 become the
// reduction multiplier Y; acc0 += Y * poly[0]; then acc0 is shifted right
// by one limb with the high word t2 shifted in.
mov a0, %rdx
mulx (b_ptr), t0, t2
add t0, acc0
adc $0, t2
mov acc0, %rdx
and .LandMask(%rip), %rdx
vpbroadcastq %rdx, Y_curr
vpbroadcastq (b_ptr), B_curr
mulx m0, t0, t1
add t0, acc0
adc t1, t2
shrd $52, t2, acc0
# Low multiplications
vpmadd52luq B_curr, A0, ACC0b
vpmadd52luq B_curr, A1, ACC1b
vpmadd52luq Y_curr, M0, ACC0
vpmadd52luq Y_curr, M1, ACC1
vpaddq ACC0b, ACC0, ACC0
vpaddq ACC1b, ACC1, ACC1
// Lane 0 of the vector accumulator feeds the scalar accumulator.
vmovq ACC0_xmm, t0
add t0, acc0
lea 8(b_ptr), b_ptr
dec itr
jne 1b
// Put the scalar accumulator back into lane 0 and move A and the modulus
// up by one lane so the outstanding high products line up with ACC.
vmovq acc0, TMP_xmm
vmovdqa64 TMP, ACC0{%k1}
valignq $7, A0, A1, A1
valignq $7, ZERO, A0, A0
valignq $7, M0, M1, M1
valignq $7, ZERO, M0, M0
# The last high multiplications
vpmadd52huq B_curr, A0, ACC0
vpmadd52huq B_curr, A1, ACC1
vpmadd52huq Y_curr, M0, ACC0
vpmadd52huq Y_curr, M1, ACC1
# Now 'normalize' the result to 52 bit words
// Carries (bits 52 and up) move up one lane and are added in.
vpsrlq $52, ACC0, A0
vpsrlq $52, ACC1, A1
vpandq AND_MASK, ACC0, ACC0
vpandq AND_MASK, ACC1, ACC1
valignq $7, A0, A1, A1
valignq $7, ZERO, A0, A0
vpaddq A0, ACC0, ACC0
vpaddq A1, ACC1, ACC1
// NOTE(review): ACCx = (ACCx & AND_MASK) + Ax, so the unsigned less-than
// below needs a 64-bit wrap-around to fire - confirm the intended
// carry-generate test.
vpcmpuq $1, A0, ACC0, %k1
vpcmpuq $1, A1, ACC1, %k2
kmovb %k1, %eax
kmovb %k2, %ebx
// Shift the 16-bit lane mask %bl:%al left by one lane. The high half lives
// in %bl (loaded just above), so the carry out of %al must go into %bl.
add %al, %al
adc %bl, %bl
// Lanes equal to AND_MASK let an incoming carry ripple through.
vpcmpuq $0, AND_MASK, ACC0, %k1
vpcmpuq $0, AND_MASK, ACC1, %k2
kmovb %k1, %r8d
kmovb %k2, %r9d
add %r8b, %al
adc %r9b, %bl
xor %r8b, %al
xor %r9b, %bl
kmovb %eax, %k1
kmovb %ebx, %k2
// Subtracting AND_MASK and masking again adds one modulo 2^52.
vpsubq AND_MASK, ACC0, ACC0{%k1}
vpsubq AND_MASK, ACC1, ACC1{%k2}
vpandq AND_MASK, ACC0, ACC0
vpandq AND_MASK, ACC1, ACC1
// Store 8 + 7 limbs.
mov $0x7f, t0
kmovq t0, %k1
vmovdqu64 ACC0, 64*0(res)
vmovdqu64 ACC1, 64*1(res){%k1}
bail:
pop %r13
pop %r12
pop %rbx
ret
@@ -0,0 +1,218 @@ | |||||
#include <stdint.h> | |||||
#include <stdio.h> | |||||
#include <string.h> | |||||
#include "./sidh_ref/P751_internal.h" | |||||
#include "measurements.h" | |||||
/* Bit length of the prime; may be overridden on the compiler command line. */
#ifndef PRIME_BITS
#define PRIME_BITS 751
#endif
/* Number of 64-bit words / 52-bit limbs needed for PRIME_BITS bits. */
#define DIGITS_64 ((PRIME_BITS + 63) / 64)
#define DIGITS_52 ((PRIME_BITS + 51) / 52)
#define OALICE_BITS 372
#define OBOB_BITS 379
/* Secret-key sizes in bytes, rounded up. Fully parenthesised so the macros
 * can be used inside larger expressions (for example as a divisor). */
#define SECRETKEY_A_BYTES ((OALICE_BITS + 7) / 8)
#define SECRETKEY_B_BYTES ((OBOB_BITS + 7) / 8)
/* Masks that main() applies to the last byte of each secret key. */
#define MASK_ALICE 0x0F
#define MASK_BOB 0x03
/* A field element in radix 2^52, and a pair of them. */
typedef uint64_t num52[DIGITS_52];
typedef num52 felem[2];
/* Routines implemented outside this file (IFMA assembly and C glue). */
void fp2_mul_ifma(felem res, felem a, felem b);
void fp2_sqr_ifma(felem res, felem a);
void fp2_mul_ifma_x2(felem res1, const felem a1, const felem b1, felem res2, const felem a2, const felem b2);
void fp_mul_ifma(uint64_t *rp, const uint64_t *ap, const uint64_t *bp);
void to_mont_ifma(uint64_t *rp, const uint64_t *ap);
void from_mont_ifma(uint64_t *rp, const uint64_t *ap);
void red2norm(uint64_t out[12], const uint64_t in[15]);
void norm2red(uint64_t *res, const uint64_t *a);
int EphemeralKeyGeneration_A_ifma(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA);
int EphemeralKeyGeneration_B_ifma(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB);
/* Executes RDRAND into *rand and returns the carry flag captured by SETC
 * right after it (non-zero on success); the callers in this file retry
 * until the result is non-zero. */
int rdrand64_step(uint64_t *rand)
{
    unsigned char carry;

    __asm__ volatile("rdrand %0; setc %1"
                     : "=r"(*rand), "=qm"(carry));

    return (int)carry;
}
static void rand_750(uint64_t out[DIGITS_64]) | |||||
{ | |||||
for (int i = 0; i < DIGITS_64; i++) | |||||
{ | |||||
while (!rdrand64_step((uint64_t *)&out[i])) | |||||
; | |||||
} | |||||
out[DIGITS_64 - 1] &= ((1ULL << (PRIME_BITS - 64 * (DIGITS_64 - 1))) - 1); | |||||
} | |||||
/* Fills out[0..out_len-1] with random bytes; every byte is the low byte of
 * a fresh RDRAND word. */
static void rand_bytes(uint8_t *out, size_t out_len)
{
    /* size_t index: matches out_len and avoids a signed/unsigned compare. */
    for (size_t i = 0; i < out_len; i++)
    {
        uint64_t word;

        /* RDRAND may transiently fail; retry until it reports success. */
        while (!rdrand64_step(&word))
            ;
        out[i] = (uint8_t)word;
    }
}
/* Benchmark and self-check driver. Each do { } while (0) block is one
 * independent scenario with its own locals: it times an IFMA routine and
 * the reference routine with MEASURE (cycle count read back from
 * RDTSC_total_clk) and prints Success/Fail from a memcmp of the two
 * results. */
int main() | |||||
{ | |||||
/* i is not referenced directly in this function; MEASURE is defined in
 * measurements.h and its expansion is not visible here. */
int i; | |||||
/* Scenario 1: single field multiplication. */
do | |||||
{ | |||||
felm_t fa, fb, fr; | |||||
num52 r, a, b; | |||||
uint64_t res_ifma[DIGITS_64]; | |||||
rand_750(fa); | |||||
rand_750(fb); | |||||
/* IFMA path: repack to 52-bit limbs, convert to Montgomery form,
 * multiply, convert back and repack to 64-bit words. */
norm2red(a, (uint64_t *)fa); | |||||
norm2red(b, (uint64_t *)fb); | |||||
to_mont_ifma(a, a); | |||||
to_mont_ifma(b, b); | |||||
MEASURE({ fp_mul_ifma(r, a, b); }); | |||||
from_mont_ifma(r, r); | |||||
red2norm(res_ifma, r); | |||||
printf("Mont mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
/* Reference path on the same inputs; fa and fb are converted in place. */
to_mont(fa, fa); | |||||
to_mont(fb, fb); | |||||
MEASURE({ fpmul751_mont(fa, fb, fr); }); | |||||
from_mont(fr, fr); | |||||
printf("Mont mul ref Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP MUL Fail" | |||||
: "FP MUL Success"); | |||||
} while (0); | |||||
/* Scenario 2: two-component (FP2) multiplication, the dual multiplication
 * and squaring. */
do | |||||
{ | |||||
felem a, b, r, r2; | |||||
f2elm_t fa, fb, fr; | |||||
uint64_t res_ifma[2][DIGITS_64]; | |||||
rand_750(fa[0]); | |||||
rand_750(fa[1]); | |||||
rand_750(fb[0]); | |||||
rand_750(fb[1]); | |||||
norm2red(a[0], (uint64_t *)fa[0]); | |||||
norm2red(a[1], (uint64_t *)fa[1]); | |||||
norm2red(b[0], (uint64_t *)fb[0]); | |||||
norm2red(b[1], (uint64_t *)fb[1]); | |||||
to_mont_ifma(a[0], a[0]); | |||||
to_mont_ifma(a[1], a[1]); | |||||
to_mont_ifma(b[0], b[0]); | |||||
to_mont_ifma(b[1], b[1]); | |||||
MEASURE({ fp2_mul_ifma(r, a, b); }); | |||||
from_mont_ifma(r[0], r[0]); | |||||
from_mont_ifma(r[1], r[1]); | |||||
red2norm(res_ifma[0], r[0]); | |||||
red2norm(res_ifma[1], r[1]); | |||||
printf("Mont FP2 mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
to_mont(fa[0], fa[0]); | |||||
to_mont(fa[1], fa[1]); | |||||
to_mont(fb[0], fb[0]); | |||||
to_mont(fb[1], fb[1]); | |||||
MEASURE({ fp2mul751_mont(fa, fb, fr); }); | |||||
from_mont(fr[0], fr[0]); | |||||
from_mont(fr[1], fr[1]); | |||||
printf("Mont FP2 mul ref Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP2 MUL Fail" | |||||
: "FP2 MUL Success"); | |||||
/* Dual multiplication: the same a * b is computed into r and r2, and both
 * are compared against the reference product still held in fr. */
MEASURE({ fp2_mul_ifma_x2(r, a, b, r2, a, b); }); | |||||
from_mont_ifma(r[0], r[0]); | |||||
from_mont_ifma(r[1], r[1]); | |||||
red2norm(res_ifma[0], r[0]); | |||||
red2norm(res_ifma[1], r[1]); | |||||
printf("Dual Mont FP2 mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "Dual FP2 MUL 1/2 Fail" | |||||
: "Dual FP2 MUL 1/2 Success"); | |||||
from_mont_ifma(r2[0], r2[0]); | |||||
from_mont_ifma(r2[1], r2[1]); | |||||
red2norm(res_ifma[0], r2[0]); | |||||
red2norm(res_ifma[1], r2[1]); | |||||
printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "Dual FP2 MUL 2/2 Fail" | |||||
: "Dual FP2 MUL 2/2 Success"); | |||||
/* Squaring: fa was converted by to_mont above and is reused as the
 * reference input. */
MEASURE({ fp2_sqr_ifma(r, a); }); | |||||
from_mont_ifma(r[0], r[0]); | |||||
from_mont_ifma(r[1], r[1]); | |||||
red2norm(res_ifma[0], r[0]); | |||||
red2norm(res_ifma[1], r[1]); | |||||
printf("Mont FP2 sqr IFMA Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
MEASURE({ fp2sqr751_mont(fa, fr); }); | |||||
from_mont(fr[0], fr[0]); | |||||
from_mont(fr[1], fr[1]); | |||||
printf("Mont FP2 sqr ref Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP2 SQR Fail" | |||||
: "FP2 SQR Success"); | |||||
} while (0); | |||||
/* Scenario 3: ephemeral key generation for both parties; the reference
 * output goes to ct1, the IFMA output to ct2. */
do | |||||
{ | |||||
unsigned char ephemeralsk_alice[SECRETKEY_A_BYTES]; | |||||
unsigned char ephemeralsk_bob[SECRETKEY_B_BYTES]; | |||||
unsigned char ct1[564] = {0}; | |||||
unsigned char ct2[564] = {0}; | |||||
rand_bytes(ephemeralsk_alice, sizeof(ephemeralsk_alice)); | |||||
rand_bytes(ephemeralsk_bob, sizeof(ephemeralsk_bob)); | |||||
/* Clear the unused high bits of the last key byte. */
ephemeralsk_alice[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; | |||||
ephemeralsk_bob[SECRETKEY_B_BYTES - 1] &= MASK_BOB; | |||||
MEASURE({ EphemeralKeyGeneration_A(ephemeralsk_alice, ct1); }); | |||||
printf("Ref EphemeralKeyGeneration_A Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
MEASURE({ EphemeralKeyGeneration_A_ifma(ephemeralsk_alice, ct2); }); | |||||
printf("IFMA EphemeralKeyGeneration_A Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
printf("%s\n", memcmp(ct1, ct2, sizeof(ct1)) ? "EphemeralKeyGeneration_A Fail" | |||||
: "EphemeralKeyGeneration_A Success"); | |||||
MEASURE({ EphemeralKeyGeneration_B(ephemeralsk_bob, ct1); }); | |||||
printf("Ref EphemeralKeyGeneration_B Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
MEASURE({ EphemeralKeyGeneration_B_ifma(ephemeralsk_bob, ct2); }); | |||||
printf("IFMA EphemeralKeyGeneration_B Cycles/op: %.0f\n", RDTSC_total_clk); | |||||
printf("%s\n", memcmp(ct1, ct2, sizeof(ct1)) ? "EphemeralKeyGeneration_B Fail" | |||||
: "EphemeralKeyGeneration_B Success"); | |||||
} while (0); | |||||
} | |||||
@@ -0,0 +1,52 @@ | |||||
#ifndef MEASURE_H
#define MEASURE_H

/*
 * Cycle-count micro-benchmark helpers built on the x86 RDTSCP instruction.
 *
 * MEASURE(x) runs the statement(s) `x` WARMUP times untimed, then takes
 * OUTER_REPEAT timed samples of REPEAT iterations each.  The smallest
 * per-iteration cycle count over all samples is left in RDTSC_total_clk.
 *
 * REPEAT, OUTER_REPEAT and WARMUP may be overridden on the compiler command
 * line (e.g. -DREPEAT=20) before this header is included.
 */

/* Number of timed iterations per sample. */
#ifndef REPEAT
#define REPEAT 100
#endif

/* Number of samples; the minimum over all samples is reported. */
#ifndef OUTER_REPEAT
#define OUTER_REPEAT 10
#endif

/* Number of untimed warm-up iterations.  Fully parenthesized so the macro
 * behaves as a single operand inside larger expressions. */
#ifndef WARMUP
#define WARMUP ((REPEAT) / 4)
#endif

/* Scratch state shared by every expansion of RDTSC_MEASURE. */
unsigned long long RDTSC_start_clk, RDTSC_end_clk;
double RDTSC_total_clk; /* result: minimum cycles per iteration */
double RDTSC_TEMP_CLK;  /* cycles per iteration of the current sample */
int RDTSC_MEASURE_ITERATOR;
int RDTSC_OUTER_ITERATOR;

/*
 * Read the time-stamp counter with RDTSCP and return it as one integer.
 *
 * RDTSCP delivers the counter in EDX:EAX and overwrites ECX, hence the "rcx"
 * clobber.  The "memory" clobber keeps the compiler from moving memory
 * accesses of the measured code across the timestamp read.
 */
inline static unsigned long get_Clks(void)
{
    unsigned hi, lo;
    __asm__ __volatile__("rdtscp\n\t"
                         : "=a"(lo), "=d"(hi)
                         :
                         : "rcx", "memory");
    return ((unsigned long)lo) | (((unsigned long)hi) << 32);
}

/*
 * Benchmark `x`.  The expansion is wrapped in do { ... } while (0) so that
 * `MEASURE(...);` is exactly one statement and is safe inside unbraced
 * if/else bodies.  RDTSC_total_clk starts at the largest finite double
 * (DBL_MAX) so that the first sample always replaces it.
 */
#define RDTSC_MEASURE(x)                                                                                    \
    do                                                                                                      \
    {                                                                                                       \
        for (RDTSC_MEASURE_ITERATOR = 0; RDTSC_MEASURE_ITERATOR < (WARMUP); RDTSC_MEASURE_ITERATOR++)       \
        {                                                                                                   \
            {x};                                                                                            \
        }                                                                                                   \
        RDTSC_total_clk = 1.7976931348623157e+308;                                                          \
        for (RDTSC_OUTER_ITERATOR = 0; RDTSC_OUTER_ITERATOR < (OUTER_REPEAT); RDTSC_OUTER_ITERATOR++)       \
        {                                                                                                   \
            RDTSC_start_clk = get_Clks();                                                                   \
            for (RDTSC_MEASURE_ITERATOR = 0; RDTSC_MEASURE_ITERATOR < (REPEAT); RDTSC_MEASURE_ITERATOR++)   \
            {                                                                                               \
                {x};                                                                                        \
            }                                                                                               \
            RDTSC_end_clk = get_Clks();                                                                     \
            RDTSC_TEMP_CLK = (double)(RDTSC_end_clk - RDTSC_start_clk) / (REPEAT);                          \
            if (RDTSC_total_clk > RDTSC_TEMP_CLK)                                                           \
                RDTSC_total_clk = RDTSC_TEMP_CLK;                                                           \
        }                                                                                                   \
    } while (0)

#define MEASURE(x) RDTSC_MEASURE(x)

#endif
@@ -0,0 +1,122 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: supersingular isogeny parameters and generation of functions for P751 | |||||
*********************************************************************************************/ | |||||
#include "P751_internal.h" | |||||
// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: | |||||
// -------------------------------------------------------------------------------------------------- | |||||
// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). | |||||
// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. | |||||
// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. | |||||
// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. | |||||
// For example, a 751-bit field element is represented with Ceil(751 / 64) = 12 64-bit digits or Ceil(751 / 32) = 24 32-bit digits. | |||||
// | |||||
// Curve isogeny system "SIDHp751". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p751^2), where A=0, B=1, C=1 and p751 = 2^372*3^239-1 | |||||
// | |||||
const uint64_t p751[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEEAFFFFFFFFFFFFF, | |||||
0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; | |||||
const uint64_t p751p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000, | |||||
0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; | |||||
const uint64_t p751x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xDD5FFFFFFFFFFFFF, | |||||
0xC7D92D0A93F0F151, 0xB52B363427EF98ED, 0x109D30CFADD7D0ED, 0x0AC56A08B964AE90, 0x1C25213F2F75B8CD, 0x0000DFCBAA83EE38 }; | |||||
// Order of Alice's subgroup | |||||
const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; | |||||
// Order of Bob's subgroup | |||||
const uint64_t Bob_order[NWORDS64_ORDER] = { 0xC968549F878A8EEB, 0x59B1A13F7CC76E3E, 0xE9867D6EBE876DA9, 0x2B5045CB25748084, 0x2909F97BADC66856, 0x06FE5D541F71C0E1 }; | |||||
// Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p751^2), expressed in Montgomery representation | |||||
const uint64_t A_gen[5 * NWORDS64_FIELD] = { 0xC2FC08CEAB50AD8B, 0x1D7D710F55E457B1, 0xE8738D92953DCD6E, 0xBAA7EBEE8A3418AA, 0xC9A288345F03F46F, 0xC8D18D167CFE2616, | |||||
0x02043761F6B1C045, 0xAA1975E13180E7E9, 0x9E13D3FDC6690DE6, 0x3A024640A3A3BB4F, 0x4E5AD44E6ACBBDAE, 0x0000544BEB561DAD, // XPA0 | |||||
0xE6CC41D21582E411, 0x07C2ECB7C5DF400A, 0xE8E34B521432AEC4, 0x50761E2AB085167D, 0x032CFBCAA6094B3C, 0x6C522F5FDF9DDD71, | |||||
0x1319217DC3A1887D, 0xDC4FB25803353A86, 0x362C8D7B63A6AB09, 0x39DCDFBCE47EA488, 0x4C27C99A2C28D409, 0x00003CB0075527C4, // XPA1 | |||||
0xD56FE52627914862, 0x1FAD60DC96B5BAEA, 0x01E137D0BF07AB91, 0x404D3E9252161964, 0x3C5385E4CD09A337, 0x4476426769E4AF73, | |||||
0x9790C6DB989DFE33, 0xE06E1C04D2AA8B5E, 0x38C08185EDEA73B9, 0xAA41F678A4396CA6, 0x92B9259B2229E9A0, 0x00002F9326818BE0, // XQA0 | |||||
0x0BB84441DFFD19B3, 0x84B4DEA99B48C18E, 0x692DE648AD313805, 0xE6D72761B6DFAEE0, 0x223975C672C3058D, 0xA0FDE0C3CBA26FDC, | |||||
0xA5326132A922A3CA, 0xCA5E7F5D5EA96FA4, 0x127C7EFE33FFA8C6, 0x4749B1567E2A23C4, 0x2B7DF5B4AF413BFA, 0x0000656595B9623C, // XRA0 | |||||
0xED78C17F1EC71BE8, 0xF824D6DF753859B1, 0x33A10839B2A8529F, 0xFC03E9E25FDEA796, 0xC4708A8054DF1762, 0x4034F2EC034C6467, | |||||
0xABFB70FBF06ECC79, 0xDABE96636EC108B7, 0x49CBCFB090605FD3, 0x20B89711819A45A7, 0xFB8E1590B2B0F63E, 0x0000556A5F964AB2 }; // XRA1 | |||||
// Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p751^2), expressed in Montgomery representation | |||||
const uint64_t B_gen[5 * NWORDS64_FIELD] = { 0xCFB6D71EF867AB0B, 0x4A5FDD76E9A45C76, 0x38B1EE69194B1F03, 0xF6E7B18A7761F3F0, 0xFCF01A486A52C84C, 0xCBE2F63F5AA75466, | |||||
0x6487BCE837B5E4D6, 0x7747F5A8C622E9B8, 0x4CBFE1E4EE6AEBBA, 0x8A8616A13FA91512, 0x53DB980E1579E0A5, 0x000058FEBFF3BE69, // XPB0 | |||||
0xA492034E7C075CC3, 0x677BAF00B04AA430, 0x3AAE0C9A755C94C8, 0x1DC4B064E9EBB08B, 0x3684EDD04E826C66, 0x9BAA6CB661F01B22, | |||||
0x20285A00AD2EFE35, 0xDCE95ABD0497065F, 0x16C7FBB3778E3794, 0x26B3AC29CEF25AAF, 0xFB3C28A31A30AC1D, 0x000046ED190624EE, // XPB1 | |||||
0xF1A8C9ED7B96C4AB, 0x299429DA5178486E, 0xEF4926F20CD5C2F4, 0x683B2E2858B4716A, 0xDDA2FBCC3CAC3EEB, 0xEC055F9F3A600460, | |||||
0xD5A5A17A58C3848B, 0x4652D836F42EAED5, 0x2F2E71ED78B3A3B3, 0xA771C057180ADD1D, 0xC780A5D2D835F512, 0x0000114EA3B55AC1, // XQB0 | |||||
0x1C0D6733769D0F31, 0xF084C3086E2659D1, 0xE23D5DA27BCBD133, 0xF38EC9A8D5864025, 0x6426DC781B3B645B, 0x4B24E8E3C9FB03EE, | |||||
0x6432792F9D2CEA30, 0x7CC8E8B1AE76E857, 0x7F32BFB626BB8963, 0xB9F05995B48D7B74, 0x4D71200A7D67E042, 0x0000228457AF0637, // XRB0 | |||||
0x4AE37E7D8F72BD95, 0xDD2D504B3E993488, 0x5D14E7FA1ECB3C3E, 0x127610CEB75D6350, 0x255B4B4CAC446B11, 0x9EA12336C1F70CAF, | |||||
0x79FA68A2147BC2F8, 0x11E895CFDADBBC49, 0xE4B9D3C4D6356C18, 0x44B25856A67F951C, 0x5851541F61308D0B, 0x00002FFD994F7E4C }; // XRB1 | |||||
// Montgomery constant Montgomery_R2 = (2^768)^2 mod p751 | |||||
const uint64_t Montgomery_R2[NWORDS64_FIELD] = { 0x233046449DAD4058, 0xDB010161A696452A, 0x5E36941472E3FD8E, 0xF40BFE2082A2E706, 0x4932CCA8904F8751 ,0x1F735F1F1EE7FC81, | |||||
0xA24F4D80C1048E18, 0xB56C383CCDB607C5, 0x441DD47B735F9C90, 0x5673ED2C6A6AC82A, 0x06C905261132294B, 0x000041AD830F1F35 }; | |||||
// Value one in Montgomery representation | |||||
const uint64_t Montgomery_one[NWORDS64_FIELD] = { 0x00000000000249ad, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8310000000000000, | |||||
0x5527b1e4375c6c66, 0x697797bf3f4f24d0, 0xc89db7b2ac5c4e2e, 0x4ca4b439d2076956, 0x10f7926c7512c7e9, 0x00002d5b24bce5e2 }; | |||||
// Value (2^384)^2 mod 3^239 | |||||
const uint64_t Montgomery_Rprime[NWORDS64_ORDER] = { 0x1A55482318541298, 0x070A6370DFA12A03, 0xCB1658E0E3823A40, 0xB3B7384EB5DEF3F9, 0xCBCA952F7006EA33, 0x00569EF8EC94864C }; | |||||
// Value -(3^239)^-1 mod 2^384 | |||||
const uint64_t Montgomery_rprime[NWORDS64_ORDER] = { 0x48062A91D3AB563D, 0x6CE572751303C2F5, 0x5D1319F3F160EC9D, 0xE35554E8C2D5623A, 0xCA29300232BC79A5, 0x8AAD843D646D78C5 }; | |||||
// Value order_Bob/3 mod p751 | |||||
const uint64_t Border_div3[NWORDS_ORDER] = { 0xEDCD718A828384F9, 0x733B35BFD4427A14, 0xF88229CF94D7CF38, 0x63C56C990C7C2AD6, 0xB858A87E8F4222C7, 0x0254C9C6B525EAF5 }; | |||||
// Fixed parameters for isogeny tree computation | |||||
const unsigned int strat_Alice[MAX_Alice-1] = { | |||||
80, 48, 27, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, | |||||
1, 3, 2, 1, 1, 1, 1, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, | |||||
1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, | |||||
1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, | |||||
33, 20, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, | |||||
1, 1, 8, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, | |||||
1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 }; | |||||
const unsigned int strat_Bob[MAX_Bob-1] = { | |||||
112, 63, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, | |||||
1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, | |||||
1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 31, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, | |||||
1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, | |||||
2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 49, 31, 16, 8, 4, 2, | |||||
1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, | |||||
15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, | |||||
1, 1, 1, 21, 12, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 3, 2, 1, 1, 1, 1, | |||||
2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1 }; | |||||
// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions | |||||
#define fpcopy fpcopy751 | |||||
#define fpzero fpzero751 | |||||
#define fpadd fpadd751 | |||||
#define fpsub fpsub751 | |||||
#define fpneg fpneg751 | |||||
#define fpdiv2 fpdiv2_751 | |||||
#define fpcorrection fpcorrection751 | |||||
#define fpmul_mont fpmul751_mont | |||||
#define fpsqr_mont fpsqr751_mont | |||||
#define fpinv_mont fpinv751_mont | |||||
#define fpinv_chain_mont fpinv751_chain_mont | |||||
#define fpinv_mont_bingcd fpinv751_mont_bingcd | |||||
#define fp2copy fp2copy751 | |||||
#define fp2zero fp2zero751 | |||||
#define fp2add fp2add751 | |||||
#define fp2sub fp2sub751 | |||||
#define fp2neg fp2neg751 | |||||
#define fp2div2 fp2div2_751 | |||||
#define fp2correction fp2correction751 | |||||
#define fp2mul_mont fp2mul751_mont | |||||
#define fp2sqr_mont fp2sqr751_mont | |||||
#define fp2inv_mont fp2inv751_mont | |||||
#define fp2inv_mont_bingcd fp2inv751_mont_bingcd | |||||
#define fpequal_non_constant_time fpequal751_non_constant_time | |||||
#define mp_add_asm mp_add751_asm | |||||
#define mp_addx2_asm mp_add751x2_asm | |||||
#define mp_subx2_asm mp_sub751x2_asm | |||||
#include "fpx.c" | |||||
#include "ec_isogeny.c" | |||||
#include "sidh.c" | |||||
#include "sike.c" |
@@ -0,0 +1,255 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: internal header file for P751 | |||||
*********************************************************************************************/ | |||||
#ifndef __P751_INTERNAL_H__ | |||||
#define __P751_INTERNAL_H__ | |||||
#include "api.h" | |||||
#define NWORDS_FIELD 12   // Number of words of a 751-bit field element
#define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1

// Basic constants
#define NBITS_FIELD 751
#define MAXBITS_FIELD 768
#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements
#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64)             // Number of 64-bit words of a 751-bit field element
#define NBITS_ORDER 384
#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp.
#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64)         // Number of 64-bit words of a 384-bit element
#define MAXBITS_ORDER NBITS_ORDER
#define MAXWORDS_ORDER ((MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB].
#define ALICE 0
#define BOB 1
#define OALICE_BITS 372
#define OBOB_BITS 379
#define OBOB_EXPON 239
#define MASK_ALICE 0x0F
#define MASK_BOB 0x03
#define PRIME p751
#define PARAM_A 0
#define PARAM_C 1

// Fixed parameters for isogeny tree computation
#define MAX_INT_POINTS_ALICE 8
#define MAX_INT_POINTS_BOB 10
#define MAX_Alice 186
#define MAX_Bob 239
#define MSG_BYTES 32

// Byte lengths derived from the bit lengths above (rounded up to whole bytes).
// Each expansion is fully parenthesized so it acts as a single operand when
// used next to /, %, *, a cast or a unary operator.
#define SECRETKEY_A_BYTES ((OALICE_BITS + 7) / 8)
#define SECRETKEY_B_BYTES ((OBOB_BITS + 7) / 8)
#define FP2_ENCODED_BYTES (2 * ((NBITS_FIELD + 7) / 8))
// SIDH's basic element definitions and point representations | |||||
typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 751-bit field elements (768-bit max.) | |||||
typedef digit_t dfelm_t[2 * NWORDS_FIELD]; // Datatype for representing double-precision 2x751-bit field elements (2x768-bit max.) | |||||
typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements GF(p751^2) | |||||
typedef f2elm_t publickey_t[3]; // Datatype for representing public keys equivalent to three GF(p751^2) elements | |||||
typedef struct | |||||
{ | |||||
f2elm_t X; | |||||
f2elm_t Z; | |||||
} point_proj; // Point representation in projective XZ Montgomery coordinates. | |||||
typedef point_proj point_proj_t[1]; | |||||
/**************** Function prototypes ****************/ | |||||
/************* Multiprecision functions **************/ | |||||
// Copy wordsize digits, c = a, where lng(a) = nwords | |||||
void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords); | |||||
// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit | |||||
unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); | |||||
// 751-bit multiprecision addition, c = a+b | |||||
void mp_add751(const digit_t *a, const digit_t *b, digit_t *c); | |||||
void mp_add751_asm(const digit_t *a, const digit_t *b, digit_t *c); | |||||
//void mp_addmask751_asm(const digit_t* a, const digit_t mask, digit_t* c); | |||||
// 2x751-bit multiprecision addition, c = a+b | |||||
void mp_add751x2(const digit_t *a, const digit_t *b, digit_t *c); | |||||
void mp_add751x2_asm(const digit_t *a, const digit_t *b, digit_t *c); | |||||
// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit | |||||
unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); | |||||
digit_t mp_sub751x2_asm(const digit_t *a, const digit_t *b, digit_t *c); | |||||
// Multiprecision left shift | |||||
void mp_shiftleft(digit_t *x, unsigned int shift, const unsigned int nwords); | |||||
// Multiprecision right shift by one | |||||
void mp_shiftr1(digit_t *x, const unsigned int nwords); | |||||
// Multiprecision left shift by one
void mp_shiftl1(digit_t *x, const unsigned int nwords); | |||||
// Digit multiplication, digit * digit -> 2-digit result | |||||
void digit_x_digit(const digit_t a, const digit_t b, digit_t *c); | |||||
// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. | |||||
void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); | |||||
void multiply(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); | |||||
// Montgomery multiplication modulo the group order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] | |||||
void Montgomery_multiply_mod_order(const digit_t *ma, const digit_t *mb, digit_t *mc, const digit_t *order, const digit_t *Montgomery_rprime); | |||||
// (Non-constant time) Montgomery inversion modulo the curve order using a^(-1) = a^(order-2) mod order | |||||
//void Montgomery_inversion_mod_order(const digit_t* ma, digit_t* mc, const digit_t* order, const digit_t* Montgomery_rprime); | |||||
void Montgomery_inversion_mod_order_bingcd(const digit_t *a, digit_t *c, const digit_t *order, const digit_t *Montgomery_rprime, const digit_t *Montgomery_R2); | |||||
// Conversion of elements in Z_r to Montgomery representation, where the order r is up to 384 bits. | |||||
void to_Montgomery_mod_order(const digit_t *a, digit_t *mc, const digit_t *order, const digit_t *Montgomery_rprime, const digit_t *Montgomery_Rprime); | |||||
// Conversion of elements in Z_r from Montgomery to standard representation, where the order is up to 384 bits. | |||||
void from_Montgomery_mod_order(const digit_t *ma, digit_t *c, const digit_t *order, const digit_t *Montgomery_rprime); | |||||
// Inversion modulo Alice's order 2^372. | |||||
void inv_mod_orderA(const digit_t *a, digit_t *c); | |||||
/************ Field arithmetic functions *************/ | |||||
// Copy of a field element, c = a | |||||
void fpcopy751(const felm_t a, felm_t c); | |||||
// Zeroing a field element, a = 0 | |||||
void fpzero751(felm_t a); | |||||
// Non constant-time comparison of two field elements. If a = b return TRUE, otherwise, return FALSE | |||||
bool fpequal751_non_constant_time(const felm_t a, const felm_t b); | |||||
// Modular addition, c = a+b mod p751 | |||||
extern void fpadd751(const digit_t *a, const digit_t *b, digit_t *c); | |||||
extern void fpadd751_asm(const digit_t *a, const digit_t *b, digit_t *c); | |||||
// Modular subtraction, c = a-b mod p751 | |||||
extern void fpsub751(const digit_t *a, const digit_t *b, digit_t *c); | |||||
extern void fpsub751_asm(const digit_t *a, const digit_t *b, digit_t *c); | |||||
// Modular negation, a = -a mod p751 | |||||
extern void fpneg751(digit_t *a); | |||||
// Modular division by two, c = a/2 mod p751. | |||||
void fpdiv2_751(const digit_t *a, digit_t *c); | |||||
// Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. | |||||
void fpcorrection751(digit_t *a); | |||||
// 751-bit Montgomery reduction, c = a mod p | |||||
void rdc_mont(const digit_t *a, digit_t *c); | |||||
// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p751, where R=2^768 | |||||
void fpmul751_mont(const felm_t a, const felm_t b, felm_t c); | |||||
void mul751_asm(const felm_t a, const felm_t b, dfelm_t c); | |||||
void rdc751_asm(const dfelm_t ma, dfelm_t mc); | |||||
// Field squaring using Montgomery arithmetic, mc = ma^2*R^-1 mod p751, where R=2^768
void fpsqr751_mont(const felm_t ma, felm_t mc); | |||||
// Conversion to Montgomery representation | |||||
void to_mont(const felm_t a, felm_t mc); | |||||
// Conversion from Montgomery representation to standard representation | |||||
void from_mont(const felm_t ma, felm_t c); | |||||
// Field inversion, a = a^-1 in GF(p751) | |||||
void fpinv751_mont(felm_t a); | |||||
// Field inversion, a = a^-1 in GF(p751) using the binary GCD | |||||
void fpinv751_mont_bingcd(felm_t a); | |||||
// Chain to compute a^((p751-3)/4) using Montgomery arithmetic
void fpinv751_chain_mont(felm_t a); | |||||
/************ GF(p^2) arithmetic functions *************/ | |||||
// Copy of a GF(p751^2) element, c = a | |||||
void fp2copy751(const f2elm_t a, f2elm_t c); | |||||
// Zeroing a GF(p751^2) element, a = 0 | |||||
void fp2zero751(f2elm_t a); | |||||
// GF(p751^2) negation, a = -a in GF(p751^2) | |||||
void fp2neg751(f2elm_t a); | |||||
// GF(p751^2) addition, c = a+b in GF(p751^2) | |||||
extern void fp2add751(const f2elm_t a, const f2elm_t b, f2elm_t c); | |||||
// GF(p751^2) subtraction, c = a-b in GF(p751^2) | |||||
extern void fp2sub751(const f2elm_t a, const f2elm_t b, f2elm_t c); | |||||
// GF(p751^2) division by two, c = a/2 in GF(p751^2) | |||||
void fp2div2_751(const f2elm_t a, f2elm_t c); | |||||
// Modular correction, a = a in GF(p751^2) | |||||
void fp2correction751(f2elm_t a); | |||||
// GF(p751^2) squaring using Montgomery arithmetic, c = a^2 in GF(p751^2) | |||||
void fp2sqr751_mont(const f2elm_t a, f2elm_t c); | |||||
// GF(p751^2) multiplication using Montgomery arithmetic, c = a*b in GF(p751^2) | |||||
void fp2mul751_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); | |||||
// Conversion of a GF(p751^2) element to Montgomery representation | |||||
void to_fp2mont(const f2elm_t a, f2elm_t mc); | |||||
// Conversion of a GF(p751^2) element from Montgomery representation to standard representation | |||||
void from_fp2mont(const f2elm_t ma, f2elm_t c); | |||||
// GF(p751^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) | |||||
void fp2inv751_mont(f2elm_t a); | |||||
// GF(p751^2) inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p751) inversion done using the binary GCD | |||||
void fp2inv751_mont_bingcd(f2elm_t a); | |||||
// n-way Montgomery inversion | |||||
void mont_n_way_inv(const f2elm_t *vec, const int n, f2elm_t *out); | |||||
/************ Elliptic curve and isogeny functions *************/ | |||||
// Computes the j-invariant of a Montgomery curve with projective constant. | |||||
void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv); | |||||
// Simultaneous doubling and differential addition. | |||||
void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24); | |||||
// Doubling of a Montgomery point in projective coordinates (X:Z). | |||||
void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24); | |||||
// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. | |||||
void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e); | |||||
// Differential addition. | |||||
void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ); | |||||
// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. | |||||
void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff); | |||||
// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. | |||||
void eval_4_isog(point_proj_t P, f2elm_t *coeff); | |||||
// Tripling of a Montgomery point in projective coordinates (X:Z). | |||||
void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus); | |||||
// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. | |||||
void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e); | |||||
// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. | |||||
void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff); | |||||
// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. | |||||
void eval_3_isog(point_proj_t Q, const f2elm_t *coeff); | |||||
// 3-way simultaneous inversion | |||||
void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3); | |||||
// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. | |||||
void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); | |||||
#endif |
@@ -0,0 +1,214 @@ | |||||
/******************************************************************************************** | |||||
* SIDH: an efficient supersingular isogeny-based cryptography library for ephemeral | |||||
* Diffie-Hellman key exchange. | |||||
* | |||||
* Copyright (c) Microsoft Corporation. All rights reserved. | |||||
* | |||||
* | |||||
* Abstract: main header file | |||||
* | |||||
*********************************************************************************************/ | |||||
#ifndef __SIDH_H__ | |||||
#define __SIDH_H__ | |||||
#include <stdint.h> | |||||
#include <stdbool.h> | |||||
#include <stddef.h> | |||||
// Definition of operating system | |||||
#define OS_WIN 1 | |||||
#define OS_LINUX 2 | |||||
#define OS_TARGET OS_LINUX | |||||
#define COMPILER_VC 1 | |||||
#define COMPILER_GCC 2 | |||||
#define COMPILER_CLANG 3 | |||||
#define COMPILER COMPILER_GCC | |||||
// Definition of the targeted architecture and basic data types | |||||
#define TARGET_AMD64 1 | |||||
#define TARGET_x86 2 | |||||
#define TARGET_ARM 3 | |||||
#define TARGET_ARM64 4 | |||||
#define TARGET TARGET_AMD64 | |||||
#define RADIX 64 | |||||
typedef uint64_t digit_t; // Unsigned 64-bit digit | |||||
typedef int64_t sdigit_t; // Signed 64-bit digit | |||||
typedef uint32_t hdigit_t; // Unsigned 32-bit digit | |||||
#define NWORDS_FIELD 12 // Number of words of a 751-bit field element | |||||
#define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1 | |||||
#define RADIX64 64 | |||||
// Configuration sanity check: without GENERIC_IMPLEMENTATION only the AMD64
// and ARM64 targets are accepted.
#if !defined(GENERIC_IMPLEMENTATION) && (TARGET != TARGET_AMD64) && (TARGET != TARGET_ARM64)
#error -- "Unsupported configuration"
#endif

// Extended datatype support: choose the representation of uint128_t.
//  - generic build:               pair of 64-bit words
//  - AMD64/ARM64 Linux, GCC/Clang: compiler-provided 128-bit integer (mode TI),
//                                  advertised through UINT128_SUPPORT
//  - AMD64 Windows, Visual C:      pair of 64-bit words, advertised through
//                                  SCALAR_INTRIN_SUPPORT
#if defined(GENERIC_IMPLEMENTATION)
typedef uint64_t uint128_t[2];
#elif (TARGET == TARGET_AMD64 || TARGET == TARGET_ARM64) && (OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG)
#define UINT128_SUPPORT
typedef unsigned uint128_t __attribute__((mode(TI)));
#elif (TARGET == TARGET_AMD64) && (OS_TARGET == OS_WIN && COMPILER == COMPILER_VC)
#define SCALAR_INTRIN_SUPPORT
typedef uint64_t uint128_t[2];
#else
#error -- "Unsupported configuration"
#endif
// Basic constants | |||||
#define NBITS_FIELD 751 | |||||
#define MAXBITS_FIELD 768 | |||||
#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements | |||||
#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 751-bit field element | |||||
#define NBITS_ORDER 384 | |||||
#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. | |||||
#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 384-bit element | |||||
#define MAXBITS_ORDER NBITS_ORDER | |||||
#define MAXWORDS_ORDER ((MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB]. | |||||
// Basic constants for elliptic curve BigMont | |||||
#define BIGMONT_NBITS_ORDER 749 | |||||
#define BIGMONT_MAXBITS_ORDER 768 | |||||
#define BIGMONT_NWORDS_ORDER ((BIGMONT_NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of BigMont's subgroup order. | |||||
#define BIGMONT_MAXWORDS_ORDER ((BIGMONT_MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, BigMont_order]. | |||||
// Error-handling type. The numeric values are spelled out because the error
// message macros that follow must stay in the same order as these codes.
typedef enum {
    CRYPTO_SUCCESS = 0x00,
    CRYPTO_ERROR = 0x01,
    CRYPTO_ERROR_DURING_TEST = 0x02,
    CRYPTO_ERROR_UNKNOWN = 0x03,
    CRYPTO_ERROR_NOT_IMPLEMENTED = 0x04,
    CRYPTO_ERROR_NO_MEMORY = 0x05,
    CRYPTO_ERROR_INVALID_PARAMETER = 0x06,
    CRYPTO_ERROR_SHARED_KEY = 0x07,
    CRYPTO_ERROR_PUBLIC_KEY_VALIDATION = 0x08,
    CRYPTO_ERROR_TOO_MANY_ITERATIONS = 0x09,
    CRYPTO_ERROR_END_OF_LIST // Sentinel: one past the last real status code
} CRYPTO_STATUS;

// Number of real status codes (equals the sentinel's value)
#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_ERROR_END_OF_LIST)
// Definitions of the error messages | |||||
// NOTE: they must match the error codes above | |||||
#define CRYPTO_MSG_SUCCESS "CRYPTO_SUCCESS" | |||||
#define CRYPTO_MSG_ERROR "CRYPTO_ERROR" | |||||
#define CRYPTO_MSG_ERROR_DURING_TEST "CRYPTO_ERROR_DURING_TEST" | |||||
#define CRYPTO_MSG_ERROR_UNKNOWN "CRYPTO_ERROR_UNKNOWN" | |||||
#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED "CRYPTO_ERROR_NOT_IMPLEMENTED" | |||||
#define CRYPTO_MSG_ERROR_NO_MEMORY "CRYPTO_ERROR_NO_MEMORY" | |||||
#define CRYPTO_MSG_ERROR_INVALID_PARAMETER "CRYPTO_ERROR_INVALID_PARAMETER" | |||||
#define CRYPTO_MSG_ERROR_SHARED_KEY "CRYPTO_ERROR_SHARED_KEY" | |||||
#define CRYPTO_MSG_ERROR_PUBLIC_KEY_VALIDATION "CRYPTO_ERROR_PUBLIC_KEY_VALIDATION" | |||||
#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS "CRYPTO_ERROR_TOO_MANY_ITERATIONS" | |||||
// Definition of type random_bytes to implement callback functions outputting "nbytes" random values to "random_array" | |||||
typedef CRYPTO_STATUS (*RandomBytes)(unsigned int nbytes, unsigned char *random_array); | |||||
// Definition of type for curve isogeny system identifiers. Currently valid value is "SIDHp751" (see SIDH.h) | |||||
typedef char CurveIsogeny_ID[10]; | |||||
// Supersingular elliptic curve isogeny structures: | |||||
// This data struct contains the static curve isogeny data | |||||
typedef struct | |||||
{ | |||||
CurveIsogeny_ID CurveIsogeny; // Curve isogeny system identifier, base curve defined over GF(p^2) | |||||
unsigned int pwordbits; // Smallest multiple of 32 larger than the prime bitlength | |||||
unsigned int owordbits; // Smallest multiple of 32 larger than the order bitlength | |||||
unsigned int pbits; // Bitlength of the prime p | |||||
uint64_t prime[MAXWORDS_FIELD]; // Prime p | |||||
uint64_t A[MAXWORDS_FIELD]; // Base curve parameter "A" | |||||
uint64_t C[MAXWORDS_FIELD]; // Base curve parameter "C" | |||||
unsigned int oAbits; // Order bitlength for Alice | |||||
uint64_t Aorder[MAXWORDS_ORDER]; // Order of Alice's (sub)group | |||||
unsigned int oBbits; // Order bitlength for Bob | |||||
unsigned int eB; // Power of Bob's subgroup order (i.e., oB = 3^eB) | |||||
uint64_t Border[MAXWORDS_ORDER]; // Order of Bob's (sub)group | |||||
uint64_t PA[2 * MAXWORDS_FIELD]; // Alice's generator PA = (XPA,YPA), where XPA and YPA are defined over GF(p) | |||||
uint64_t PB[2 * MAXWORDS_FIELD]; // Bob's generator PB = (XPB,YPB), where XPB and YPB are defined over GF(p) | |||||
unsigned int BigMont_A24; // BigMont's curve parameter A24 = (A+2)/4 | |||||
uint64_t BigMont_order[BIGMONT_MAXWORDS_ORDER]; // BigMont's subgroup order | |||||
uint64_t Montgomery_R2[MAXWORDS_FIELD]; // Montgomery constant (2^W)^2 mod p, using a suitable value W | |||||
uint64_t Montgomery_pp[MAXWORDS_FIELD]; // Montgomery constant -p^-1 mod 2^W, using a suitable value W | |||||
uint64_t Montgomery_one[MAXWORDS_FIELD]; // Value one in Montgomery representation | |||||
} CurveIsogenyStaticData, *PCurveIsogenyStaticData; | |||||
// This data struct is initialized with the targeted curve isogeny system during setup | |||||
typedef struct | |||||
{ | |||||
CurveIsogeny_ID CurveIsogeny; // Curve isogeny system identifier, base curve defined over GF(p^2) | |||||
unsigned int pwordbits; // Closest multiple of 32 to prime bitlength | |||||
unsigned int owordbits; // Closest multiple of 32 to order bitlength | |||||
unsigned int pbits; // Bitlength of the prime p | |||||
digit_t *prime; // Prime p | |||||
digit_t *A; // Base curve parameter "A" | |||||
digit_t *C; // Base curve parameter "C" | |||||
unsigned int oAbits; // Order bitlength for Alice | |||||
digit_t *Aorder; // Order of Alice's (sub)group | |||||
unsigned int oBbits; // Order bitlength for Bob | |||||
unsigned int eB; // Power of Bob's subgroup order (i.e., oB = 3^eB) | |||||
digit_t *Border; // Order of Bob's (sub)group | |||||
digit_t *PA; // Alice's generator PA = (XPA,YPA), where XPA and YPA are defined over GF(p) | |||||
digit_t *PB; // Bob's generator PB = (XPB,YPB), where XPB and YPB are defined over GF(p) | |||||
unsigned int BigMont_A24; // BigMont's curve parameter A24 = (A+2)/4 | |||||
digit_t *BigMont_order; // BigMont's subgroup order | |||||
digit_t *Montgomery_R2; // Montgomery constant (2^W)^2 mod p, using a suitable value W | |||||
digit_t *Montgomery_pp; // Montgomery constant -p^-1 mod 2^W, using a suitable value W | |||||
digit_t *Montgomery_one; // Value one in Montgomery representation | |||||
RandomBytes RandomBytesFunction; // Function providing random bytes to generate nonces or secret keys | |||||
} CurveIsogenyStruct, *PCurveIsogenyStruct; | |||||
// Supported curve isogeny systems: | |||||
// "SIDHp751", base curve: supersingular elliptic curve E: y^2 = x^3 + x | |||||
extern CurveIsogenyStaticData CurveIsogeny_SIDHp751; | |||||
/******************** Function prototypes ***********************/ | |||||
/*************** Setup/initialization functions *****************/ | |||||
// Dynamic allocation of memory for curve isogeny structure. | |||||
// Returns NULL on error. | |||||
PCurveIsogenyStruct SIDH_curve_allocate(PCurveIsogenyStaticData CurveData); | |||||
// Initialize curve isogeny structure pCurveIsogeny with static data extracted from pCurveIsogenyData. | |||||
// This needs to be called after allocating memory for "pCurveIsogeny" using SIDH_curve_allocate(). | |||||
CRYPTO_STATUS SIDH_curve_initialize(PCurveIsogenyStruct pCurveIsogeny, RandomBytes RandomBytesFunction, PCurveIsogenyStaticData pCurveIsogenyData); | |||||
// Free memory for curve isogeny structure | |||||
void SIDH_curve_free(PCurveIsogenyStruct pCurveIsogeny); | |||||
// Output error/success message for a given CRYPTO_STATUS | |||||
const char *SIDH_get_error_message(CRYPTO_STATUS Status); | |||||
// Output random values in the range [1, order-1] in little endian format that can be used as private keys. | |||||
CRYPTO_STATUS random_mod_order(digit_t *random_digits, unsigned int AliceOrBob, PCurveIsogenyStruct pCurveIsogeny); | |||||
// Output random values in the range [1, BigMont_order-1] in little endian format that can be used as private keys | |||||
// to compute scalar multiplications using the elliptic curve BigMont. | |||||
CRYPTO_STATUS random_BigMont_mod_order(digit_t *random_digits, PCurveIsogenyStruct pCurveIsogeny); | |||||
// Clear "nwords" digits from memory | |||||
void clear_words(void *mem, digit_t nwords); | |||||
#endif |
@@ -0,0 +1,109 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: API header file for P751 | |||||
*********************************************************************************************/ | |||||
#ifndef __P751_API_H__ | |||||
#define __P751_API_H__ | |||||
#include "config.h" | |||||
/*********************** Key encapsulation mechanism API ***********************/

// Sizes, in bytes, of the KEM objects.
#define CRYPTO_SECRETKEYBYTES 644  // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes
#define CRYPTO_PUBLICKEYBYTES 564
#define CRYPTO_BYTES 24
#define CRYPTO_CIPHERTEXTBYTES 596 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes

// Algorithm name
#define CRYPTO_ALGNAME "SIKEp751"

// SIKE's key generation.
// It produces a private key sk and computes the public key pk.
// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes)
//          public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes)
int crypto_kem_keypair(unsigned char *pk, unsigned char *sk);

// SIKE's encapsulation.
// Input:   public key pk         (CRYPTO_PUBLICKEYBYTES = 564 bytes)
// Outputs: shared secret ss      (CRYPTO_BYTES = 24 bytes)
//          ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes)
int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);

// SIKE's decapsulation.
// Inputs:  secret key sk         (CRYPTO_SECRETKEYBYTES = 644 bytes)
//          ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes)
// Output:  shared secret ss      (CRYPTO_BYTES = 24 bytes)
int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
// Encoding of keys for KEM-based isogeny system "SIKEp751" (wire format): | |||||
// ---------------------------------------------------------------------- | |||||
// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). | |||||
// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. | |||||
// | |||||
// Private keys sk consist of the concatenation of a 32-byte random value, a value in the range [0, 2^378-1] and the public key pk. In the SIKE API, | |||||
// private keys are encoded in 644 octets in little endian format. | |||||
// Public keys pk consist of 3 elements in GF(p751^2). In the SIKE API, pk is encoded in 564 octets. | |||||
// Ciphertexts ct consist of the concatenation of a public key value and a 32-byte value. In the SIKE API, ct is encoded in 564 + 32 = 596 octets. | |||||
// Shared keys ss consist of a value of 24 octets. | |||||
/*********************** Ephemeral key exchange API ***********************/

// Sizes, in bytes, of the SIDH objects.
#define SIDH_SECRETKEYBYTES 48
#define SIDH_PUBLICKEYBYTES 564
#define SIDH_BYTES 188

// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys.
// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016.
// Extended version available at: http://eprint.iacr.org/2016/859

// Generation of Alice's secret key.
// Outputs random value in [0, 2^372 - 1] to be used as Alice's private key.
void random_mod_order_A(unsigned char *random_digits);

// Generation of Bob's secret key.
// Outputs random value in [0, 2^Floor(Log(2,3^239)) - 1] to be used as Bob's private key.
void random_mod_order_B(unsigned char *random_digits);

// Alice's ephemeral public key generation.
// Input:  a private key PrivateKeyA in the range [0, 2^372 - 1], stored in 47 bytes.
// Output: the public key PublicKeyA consisting of 3 GF(p751^2) elements encoded in 564 bytes.
int EphemeralKeyGeneration_A(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA);

// Bob's ephemeral public key generation.
// Input:  a private key PrivateKeyB in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes.
//         (PrivateKeyB is a const input; it is not generated by this function.)
// Output: the public key PublicKeyB consisting of 3 GF(p751^2) elements encoded in 564 bytes.
int EphemeralKeyGeneration_B(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB);

// Alice's ephemeral shared secret computation.
// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB.
// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^372 - 1], stored in 47 bytes.
//         Bob's PublicKeyB consists of 3 GF(p751^2) elements encoded in 564 bytes.
// Output: a shared secret SharedSecretA that consists of one element in GF(p751^2) encoded in 188 bytes.
int EphemeralSecretAgreement_A(const unsigned char *PrivateKeyA,
                               const unsigned char *PublicKeyB,
                               unsigned char *SharedSecretA);

// Bob's ephemeral shared secret computation.
// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA.
// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes.
//         Alice's PublicKeyA consists of 3 GF(p751^2) elements encoded in 564 bytes.
// Output: a shared secret SharedSecretB that consists of one element in GF(p751^2) encoded in 188 bytes.
int EphemeralSecretAgreement_B(const unsigned char *PrivateKeyB,
                               const unsigned char *PublicKeyA,
                               unsigned char *SharedSecretB);
// Encoding of keys for KEX-based isogeny system "SIDHp751" (wire format): | |||||
// ---------------------------------------------------------------------- | |||||
// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). | |||||
// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. | |||||
// | |||||
// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^372-1] and [0, 2^378-1], resp. In the SIDH API, private keys are encoded | |||||
// in 48 octets in little endian format. | |||||
// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p751^2). In the SIDH API, they are encoded in 564 octets. | |||||
// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p751^2). In the SIDH API, they are encoded in 188 octets. | |||||
#endif |
@@ -0,0 +1,128 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: configuration file and platform-dependent macros | |||||
*********************************************************************************************/ | |||||
#ifndef __CONFIG_H__ | |||||
#define __CONFIG_H__ | |||||
#include <stdint.h> | |||||
#include <stdbool.h> | |||||
#include <stddef.h> | |||||
// Definition of operating system | |||||
#define OS_LINUX 1 | |||||
#if defined(__LINUX__) // Linux OS | |||||
#define OS_TARGET OS_LINUX | |||||
#else | |||||
#error -- "Unsupported OS" | |||||
#endif | |||||
// Definition of compiler.
// __clang__ is tested first: Clang also defines __GNUC__ for GCC
// compatibility, so testing __GNUC__ first would classify Clang as GCC and
// leave COMPILER_CLANG unreachable.
#define COMPILER_GCC 1
#define COMPILER_CLANG 2

#if defined(__clang__) // Clang compiler
#define COMPILER COMPILER_CLANG
#elif defined(__GNUC__) // GNU GCC compiler
#define COMPILER COMPILER_GCC
#else
#error -- "Unsupported COMPILER"
#endif
// Definition of the targeted architecture and basic data types | |||||
#define TARGET_AMD64 1 | |||||
#if defined(_AMD64_) | |||||
#define TARGET TARGET_AMD64 | |||||
#define RADIX 64 | |||||
#define LOG2RADIX 6 | |||||
typedef uint64_t digit_t; // Unsigned 64-bit digit | |||||
#else | |||||
#error -- "Unsupported ARCHITECTURE" | |||||
#endif | |||||
#define RADIX64 64 | |||||
// Selection of implementation: optimized_fast with x64 assembly
#ifdef _OPTIMIZED_FAST_
#define OPTIMIZED_FAST_IMPLEMENTATION
#endif

// Extended datatype support: 128-bit unsigned integer (TImode)
#define UINT128_SUPPORT
typedef unsigned uint128_t __attribute__((mode(TI)));

// Conversion macros (all round up)
#define NBITS_TO_NBYTES(nbits) (((nbits)+7)/8)                                        // bits -> bytes
#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // bits -> computer words
#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t))      // bytes -> computer words

// Macro to avoid compiler warnings when detecting unreferenced parameters
#define UNREFERENCED_PARAMETER(PAR) ((void)(PAR))
/********************** Constant-time unsigned comparisons ***********************/ | |||||
// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise | |||||
static __inline unsigned int is_digit_nonzero_ct(digit_t x)
{ // Is x != 0? Returns 1 if so, 0 otherwise, without branching.
    const digit_t negated = 0 - x;      // two's-complement negation of x
    const digit_t merged = x | negated; // top bit is set iff x != 0

    return (unsigned int)(merged >> (RADIX - 1));
}
static __inline unsigned int is_digit_zero_ct(digit_t x)
{ // Is x = 0? Returns 1 if so, 0 otherwise: the complement of is_digit_nonzero_ct().
    const unsigned int nonzero = is_digit_nonzero_ct(x);

    return (unsigned int)(nonzero ^ 1);
}
static __inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
{ // Is x < y? Returns 1 if so, 0 otherwise, without branching.
    const digit_t differing = x ^ y;         // bits where x and y differ
    const digit_t wrapped = (x - y) ^ y;     // (x - y) is computed modulo 2^RADIX
    const digit_t verdict = x ^ (differing | wrapped);

    return (unsigned int)(verdict >> (RADIX - 1)); // answer is the top bit
}
/********************** Macros for platform-dependent operations **********************/ | |||||
// Digit multiplication: full 2*RADIX-bit product of two digits.
// "hi" is a pointer that receives the upper digit; "lo" is an lvalue that
// receives the lower digit.
#define MUL(multiplier, multiplicand, hi, lo)            \
    {                                                    \
        uint128_t tempReg = (uint128_t)(multiplier);     \
        tempReg *= (uint128_t)(multiplicand);            \
        *(hi) = (digit_t)(tempReg >> RADIX);             \
        (lo) = (digit_t)tempReg;                         \
    }

// Digit addition with carry: sumOut = addend1 + addend2 + carryIn (low digit),
// carryOut = the bits above the low digit.
#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \
    {                                                     \
        uint128_t tempReg = (uint128_t)(addend1);         \
        tempReg += (uint128_t)(addend2);                  \
        tempReg += (uint128_t)(carryIn);                  \
        (carryOut) = (digit_t)(tempReg >> RADIX);         \
        (sumOut) = (digit_t)tempReg;                      \
    }

// Digit subtraction with borrow: differenceOut = minuend - subtrahend - borrowIn
// (low digit), borrowOut = top bit of the 128-bit difference.
#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut)    \
    {                                                                    \
        uint128_t tempReg = (uint128_t)(minuend);                        \
        tempReg -= (uint128_t)(subtrahend);                              \
        tempReg -= (uint128_t)(borrowIn);                                \
        (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t)*8 - 1));   \
        (differenceOut) = (digit_t)tempReg;                              \
    }
// Digit shift right: shiftOut = low digit of (highIn:lowIn) >> shift.
// The DigitSize argument is not used by the expansion. The expansion already
// ends with a semicolon.
#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \
    (shiftOut) = ((lowIn) >> (shift))                     \
               ^ ((highIn) << (RADIX - (shift)));

// Digit shift left: shiftOut = high digit of (highIn:lowIn) << shift.
// The DigitSize argument is not used by the expansion. The expansion already
// ends with a semicolon.
#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \
    (shiftOut) = ((highIn) << (shift))                    \
               ^ ((lowIn) >> (RADIX - (shift)));
#endif |
@@ -0,0 +1,330 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: elliptic curve and isogeny functions | |||||
*********************************************************************************************/ | |||||
#include "P751_internal.h" | |||||
#include <stdio.h> | |||||
void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24)
{ // Doubling of a Montgomery point in projective coordinates (X:Z).
  // Input:  projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1,
  //         and Montgomery curve constants A+2C and 4C.
  // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2).
  // P is not read after the first two calls, so Q may be the same point as P.
    f2elm_t u, v;

    fp2sub(P->X, P->Z, u);        // u  = X1-Z1
    fp2add(P->X, P->Z, v);        // v  = X1+Z1
    fp2sqr_mont(u, u);            // u  = (X1-Z1)^2
    fp2sqr_mont(v, v);            // v  = (X1+Z1)^2

    fp2mul_mont(C24, u, Q->Z);    // Z2 = C24*(X1-Z1)^2
    fp2mul_mont(v, Q->Z, Q->X);   // X2 = C24*(X1-Z1)^2*(X1+Z1)^2

    fp2sub(v, u, v);              // v  = (X1+Z1)^2-(X1-Z1)^2
    fp2mul_mont(A24plus, v, u);   // u  = A24plus*v
    fp2add(Q->Z, u, Q->Z);        // Z2 = A24plus*v + C24*(X1-Z1)^2
    fp2mul_mont(Q->Z, v, Q->Z);   // Z2 = [A24plus*v + C24*(X1-Z1)^2]*v
}
void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e)
{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
  // Input:  projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP,
  //         and Montgomery curve constants A+2C and 4C.
  // Output: projective Montgomery x-coordinates Q <- (2^e)*P.
    int remaining = e;

    // Start from a copy of P (2 coordinates x 2 field elements each).
    copy_words((digit_t *)P, (digit_t *)Q, 2 * 2 * NWORDS_FIELD);

    while (remaining > 0)
    {
        xDBL(Q, Q, A24plus, C24); // in-place doubling
        remaining--;
    }
}
void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff)
{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
  // Input:  projective point of order four P = (X4:Z4).
  // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C
  //         (written to A24plus and C24) and the 3 coefficients coeff[0..2]
  //         that are used to evaluate the isogeny at a point in eval_4_isog().

    // Evaluation coefficients.
    fp2sub(P->X, P->Z, coeff[1]);          // coeff[1] = X4-Z4
    fp2add(P->X, P->Z, coeff[2]);          // coeff[2] = X4+Z4
    fp2sqr_mont(P->Z, coeff[0]);           // coeff[0] = Z4^2
    fp2add(coeff[0], coeff[0], coeff[0]);  // coeff[0] = 2*Z4^2

    // C24 is taken from the intermediate 2*Z4^2 before coeff[0] is doubled again.
    fp2sqr_mont(coeff[0], C24);            // C24 = 4*Z4^4
    fp2add(coeff[0], coeff[0], coeff[0]);  // coeff[0] = 4*Z4^2

    // Codomain constant A24plus.
    fp2sqr_mont(P->X, A24plus);            // A24plus = X4^2
    fp2add(A24plus, A24plus, A24plus);     // A24plus = 2*X4^2
    fp2sqr_mont(A24plus, A24plus);         // A24plus = 4*X4^4
}
void eval_4_isog(point_proj_t P, f2elm_t *coeff)
{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi
  // defined by the 3 coefficients in coeff (computed in the function get_4_isog()).
  // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
  // Output: the projective point P = phi(P) = (X:Z) in the codomain (P is overwritten).
  // Notation below: a = (X+Z)*coeff[1], b = (X-Z)*coeff[2], c = coeff[0]*(X+Z)*(X-Z).
    f2elm_t u, v;

    fp2add(P->X, P->Z, u);            // u = X+Z
    fp2sub(P->X, P->Z, v);            // v = X-Z
    fp2mul_mont(u, coeff[1], P->X);   // X = a
    fp2mul_mont(v, coeff[2], P->Z);   // Z = b
    fp2mul_mont(u, v, u);             // u = (X+Z)*(X-Z)
    fp2mul_mont(u, coeff[0], u);      // u = c

    fp2add(P->X, P->Z, v);            // v = a + b
    fp2sub(P->X, P->Z, P->Z);         // Z = a - b
    fp2sqr_mont(v, v);                // v = (a + b)^2
    fp2sqr_mont(P->Z, P->Z);          // Z = (a - b)^2

    fp2add(v, u, P->X);               // X = (a + b)^2 + c
    fp2sub(P->Z, u, u);               // u = (a - b)^2 - c
    fp2mul_mont(P->X, v, P->X);       // Xfinal = [(a + b)^2 + c]*(a + b)^2
    fp2mul_mont(P->Z, u, P->Z);       // Zfinal = (a - b)^2*[(a - b)^2 - c]
}
void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus)
{ // Tripling of a Montgomery point in projective coordinates (X:Z).
  // Input:  projective Montgomery x-coordinates P = (X:Z), where x=X/Z, and Montgomery
  //         curve constants A24plus = A+2C and A24minus = A-2C.
  // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3).
  // P is not read after the third call, so Q may be the same point as P.
    f2elm_t r0, r1, r2, r3, r4, r5, r6;

    fp2sub(P->X, P->Z, r0);          // r0 = X-Z
    fp2sqr_mont(r0, r2);             // r2 = (X-Z)^2
    fp2add(P->X, P->Z, r1);          // r1 = X+Z
    fp2sqr_mont(r1, r3);             // r3 = (X+Z)^2
    fp2add(r0, r1, r4);              // r4 = 2*X
    fp2sub(r1, r0, r0);              // r0 = 2*Z
    fp2sqr_mont(r4, r1);             // r1 = 4*X^2
    fp2sub(r1, r3, r1);              // r1 = 4*X^2 - (X+Z)^2
    fp2sub(r1, r2, r1);              // r1 = 4*X^2 - (X+Z)^2 - (X-Z)^2

    fp2mul_mont(r3, A24plus, r5);    // r5 = A24plus*(X+Z)^2
    fp2mul_mont(r3, r5, r3);         // r3 = A24plus*(X+Z)^4
    fp2mul_mont(A24minus, r2, r6);   // r6 = A24minus*(X-Z)^2
    fp2mul_mont(r2, r6, r2);         // r2 = A24minus*(X-Z)^4
    fp2sub(r2, r3, r3);              // r3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4
    fp2sub(r5, r6, r2);              // r2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
    fp2mul_mont(r1, r2, r1);         // r1 = r1*r2

    fp2add(r3, r1, r2);              // r2 = r3 + r1
    fp2sqr_mont(r2, r2);             // r2 = (r3 + r1)^2
    fp2mul_mont(r4, r2, Q->X);       // X3 = 2*X*(r3 + r1)^2

    fp2sub(r3, r1, r1);              // r1 = r3 - r1
    fp2sqr_mont(r1, r1);             // r1 = (r3 - r1)^2
    fp2mul_mont(r0, r1, Q->Z);       // Z3 = 2*Z*(r3 - r1)^2
}
void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e)
{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
  // Input:  projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP, and Montgomery
  //         curve constants A24plus = A+2C and A24minus = A-2C.
  // Output: projective Montgomery x-coordinates Q <- (3^e)*P.
    int remaining = e;

    // Start from a copy of P (2 coordinates x 2 field elements each).
    copy_words((digit_t *)P, (digit_t *)Q, 2 * 2 * NWORDS_FIELD);

    while (remaining > 0)
    {
        xTPL(Q, Q, A24minus, A24plus); // in-place tripling
        remaining--;
    }
}
void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff)
{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
  // Input:  projective point of order three P = (X3:Z3).
  // Output: the 3-isogenous Montgomery curve with projective coefficient A/C
  //         (written to A24minus and A24plus), plus coeff[0] = X-Z and coeff[1] = X+Z.
  // Notation below: a = (X-Z)^2, b = (X+Z)^2, c = 4*X^2.
    f2elm_t r0, r1, r2, r3, r4;

    fp2sub(P->X, P->Z, coeff[0]);      // coeff0 = X-Z
    fp2sqr_mont(coeff[0], r0);         // r0 = a
    fp2add(P->X, P->Z, coeff[1]);      // coeff1 = X+Z
    fp2sqr_mont(coeff[1], r1);         // r1 = b
    fp2add(r0, r1, r2);                // r2 = a + b
    fp2add(coeff[0], coeff[1], r3);    // r3 = 2*X
    fp2sqr_mont(r3, r3);               // r3 = c
    fp2sub(r3, r2, r3);                // r3 = c - a - b
    fp2add(r1, r3, r2);                // r2 = c - a
    fp2add(r3, r0, r3);                // r3 = c - b

    fp2add(r0, r3, r4);                // r4 = c - b + a
    fp2add(r4, r4, r4);                // r4 = 2*(c - b + a)
    fp2add(r1, r4, r4);                // r4 = 2*c - b + 2*a
    fp2mul_mont(r2, r4, A24minus);     // A24minus = (c - a)*(2*c - b + 2*a)

    fp2add(r1, r2, r4);                // r4 = c + b - a
    fp2add(r4, r4, r4);                // r4 = 2*(c + b - a)
    fp2add(r0, r4, r4);                // r4 = 2*c + 2*b - a
    fp2mul_mont(r3, r4, r4);           // r4 = (c - b)*(2*c + 2*b - a)

    fp2sub(r4, A24minus, r0);          // r0 = r4 - A24minus
    fp2add(A24minus, r0, A24plus);     // A24plus = A24minus + r0, i.e. (c - b)*(2*c + 2*b - a) in the field
}
void eval_3_isog(point_proj_t Q, const f2elm_t *coeff)
{ // Evaluates a 3-isogeny phi at the projective point Q = (X:Z), using the 2 coefficients
  // in coeff (computed in the function get_3_isog()).
  // Inputs: the coefficients coeff[0], coeff[1] and the projective point Q = (X:Z).
  // Output: the projective point Q <- phi(Q) (Q is overwritten).
    f2elm_t u, v, w;

    fp2add(Q->X, Q->Z, u);           // u = X+Z
    fp2sub(Q->X, Q->Z, v);           // v = X-Z
    fp2mul_mont(u, coeff[0], u);     // u = coeff0*(X+Z)
    fp2mul_mont(v, coeff[1], v);     // v = coeff1*(X-Z)

    fp2add(u, v, w);                 // w = coeff0*(X+Z) + coeff1*(X-Z)
    fp2sub(v, u, u);                 // u = coeff1*(X-Z) - coeff0*(X+Z)
    fp2sqr_mont(w, w);               // w = [coeff0*(X+Z) + coeff1*(X-Z)]^2
    fp2sqr_mont(u, u);               // u = [coeff1*(X-Z) - coeff0*(X+Z)]^2

    fp2mul_mont(Q->X, w, Q->X);      // Xfinal = X*w
    fp2mul_mont(Q->Z, u, Q->Z);      // Zfinal = Z*u
}
void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3)
{ // 3-way simultaneous inversion using a single field inversion.
  // Input:  z1,z2,z3
  // Output: 1/z1,1/z2,1/z3 (override inputs).
    f2elm_t z12, acc, inv12, inv1;

    fp2mul_mont(z1, z2, z12);        // z12   = z1*z2
    fp2mul_mont(z3, z12, acc);       // acc   = z1*z2*z3
    fp2inv_mont(acc);                // acc   = 1/(z1*z2*z3)
    fp2mul_mont(z3, acc, inv12);     // inv12 = 1/(z1*z2)
    fp2mul_mont(inv12, z2, inv1);    // inv1  = 1/z1

    // z1 is written last: it is still needed as an input for 1/z2.
    fp2mul_mont(inv12, z1, z2);      // z2 = 1/z2
    fp2mul_mont(z12, acc, z3);       // z3 = 1/z3
    fp2copy(inv1, z1);               // z1 = 1/z1
}
void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A)
{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the
  // Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
  // Input:  the x-coordinates xP, xQ, and xR of the points P, Q and R.
  // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x.
    f2elm_t prod, sum, one = {0};

    // one = 1 in Montgomery form: first component from Montgomery_one, second stays 0.
    fpcopy((digit_t *)&Montgomery_one, one[0]);

    fp2add(xP, xQ, sum);             // sum  = xP+xQ
    fp2mul_mont(xP, xQ, prod);       // prod = xP*xQ
    fp2mul_mont(xR, sum, A);         // A    = xR*(xP+xQ)
    fp2add(prod, A, A);              // A    = xP*xQ + xR*(xP+xQ)
    fp2mul_mont(prod, xR, prod);     // prod = xP*xQ*xR
    fp2sub(A, one, A);               // A    = xP*xQ + xR*(xP+xQ) - 1
    fp2add(prod, prod, prod);        // prod = 2*xP*xQ*xR
    fp2add(sum, xR, sum);            // sum  = xP+xQ+xR
    fp2add(prod, prod, prod);        // prod = 4*xP*xQ*xR
    fp2sqr_mont(A, A);               // A    = A^2
    fp2inv_mont(prod);               // prod = 1/(4*xP*xQ*xR)
    fp2mul_mont(A, prod, A);         // A    = A^2/(4*xP*xQ*xR)
    fp2sub(A, sum, A);               // Afinal = A - (xP+xQ+xR)
}
void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv)
{ // Computes the j-invariant of a Montgomery curve with projective constant.
  // Input:  A,C in GF(p^2).
  // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery
  //         curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) of B'*y^2=C*x^3+A*x^2+C*x.
    f2elm_t num, aux;

    // Denominator, accumulated in jinv.
    fp2sqr_mont(A, jinv);            // jinv = A^2
    fp2sqr_mont(C, aux);             // aux  = C^2
    fp2add(aux, aux, num);           // num  = 2*C^2
    fp2sub(jinv, num, num);          // num  = A^2 - 2*C^2
    fp2sub(num, aux, num);           // num  = A^2 - 3*C^2
    fp2sub(num, aux, jinv);          // jinv = A^2 - 4*C^2
    fp2sqr_mont(aux, aux);           // aux  = C^4
    fp2mul_mont(jinv, aux, jinv);    // jinv = C^4*(A^2 - 4*C^2)

    // Numerator, accumulated in num.
    fp2add(num, num, num);           // num  = 2*(A^2 - 3*C^2)
    fp2add(num, num, num);           // num  = 4*(A^2 - 3*C^2)
    fp2sqr_mont(num, aux);           // aux  = 16*(A^2 - 3*C^2)^2
    fp2mul_mont(num, aux, num);      // num  = 64*(A^2 - 3*C^2)^3
    fp2add(num, num, num);           // num  = 128*(A^2 - 3*C^2)^3
    fp2add(num, num, num);           // num  = 256*(A^2 - 3*C^2)^3

    fp2inv_mont(jinv);               // jinv = 1/jinv
    fp2mul_mont(jinv, num, jinv);    // jinv = num*jinv
}
void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24)
{ // Simultaneous doubling and differential addition.
  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ,
  //         affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4.
  // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and
  //         Q <- P+Q = (XQP:ZQP) such that x(Q+P)=XQP/ZQP. Both P and Q are overwritten.
    f2elm_t u, v, w;

    fp2add(P->X, P->Z, u);            // u  = XP+ZP
    fp2sub(P->X, P->Z, v);            // v  = XP-ZP
    fp2sqr_mont(u, P->X);             // XP = (XP+ZP)^2
    fp2sub(Q->X, Q->Z, w);            // w  = XQ-ZQ
    fp2correction(w);
    fp2add(Q->X, Q->Z, Q->X);         // XQ = XQ+ZQ
    fp2mul_mont(w, u, u);             // u  = (XP+ZP)*(XQ-ZQ)
    fp2sqr_mont(v, P->Z);             // ZP = (XP-ZP)^2
    fp2mul_mont(Q->X, v, v);          // v  = (XP-ZP)*(XQ+ZQ)
    fp2sub(P->X, P->Z, w);            // w  = (XP+ZP)^2-(XP-ZP)^2
    fp2mul_mont(P->X, P->Z, P->X);    // XP = (XP+ZP)^2*(XP-ZP)^2
    fp2mul_mont(w, A24, Q->X);        // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]
    fp2sub(u, v, Q->Z);               // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)
    fp2add(Q->X, P->Z, P->Z);         // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2
    fp2add(u, v, Q->X);               // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)
    fp2mul_mont(P->Z, w, P->Z);       // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]
    fp2sqr_mont(Q->Z, Q->Z);          // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
    fp2sqr_mont(Q->X, Q->X);          // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2
    fp2mul_mont(Q->Z, xPQ, Q->Z);     // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
}
static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option)
{ // Swap points using a mask instead of a branch.
  // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P
    unsigned int word, comp;
    digit_t delta;

    for (word = 0; word < NWORDS_FIELD; word++)
    {
        // comp selects the first (0) or second (1) component of each coordinate.
        for (comp = 0; comp < 2; comp++)
        {
            delta = option & (P->X[comp][word] ^ Q->X[comp][word]);
            P->X[comp][word] ^= delta;
            Q->X[comp][word] ^= delta;

            delta = option & (P->Z[comp][word] ^ Q->Z[comp][word]);
            P->Z[comp][word] ^= delta;
            Q->Z[comp][word] ^= delta;
        }
    }
}
static void LADDER3PT(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const digit_t *m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t A)
{ // Ladder over the bits of the scalar m, driven by swap_points() and xDBLADD().
  // Inputs: x-coordinates xP, xQ, xPQ; scalar m (its low OALICE_BITS or OBOB_BITS bits are
  //         processed, least significant first, depending on AliceOrBob); curve coefficient A.
  // Output: the projective point R.
  // NOTE(review): no swap is performed after the loop, so if the last processed bit is 1 the
  // final swap of R and R2 is never undone -- confirm this is intended.
    point_proj_t R0 = {0}, R2 = {0};
    f2elm_t A24 = {0};
    digit_t mask, word;
    int i, bit, swap, prevbit = 0;
    const int nbits = (AliceOrBob == ALICE) ? OALICE_BITS : OBOB_BITS;

    // Initializing constant
    fpcopy((digit_t *)&Montgomery_one, A24[0]);   // A24 = 1
    fp2add(A24, A24, A24);                        // A24 = 2
    fp2add(A, A24, A24);                          // A24 = A+2
    fp2div2(A24, A24);
    fp2div2(A24, A24);                            // A24 = (A+2)/4

    // Initializing points: R0 = (xQ:1), R2 = (xPQ:1), R = (xP:1)
    fp2copy(xQ, R0->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R0->Z);
    fp2copy(xPQ, R2->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R2->Z);
    fp2copy(xP, R->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R->Z);
    fpzero((digit_t *)(R->Z)[1]);                 // R is caller-provided, so clear the second component of Z explicitly

    // Main loop
    for (i = 0; i < nbits; i++)
    {
        word = m[i >> LOG2RADIX];
        bit = (int)((word >> (i & (RADIX - 1))) & 1);
        swap = bit ^ prevbit;                     // swap only when the bit changes
        prevbit = bit;
        mask = 0 - (digit_t)swap;                 // all-ones when swapping, zero otherwise

        swap_points(R, R2, mask);
        xDBLADD(R0, R2, R->X, A24);
        fp2mul_mont(R2->X, R->Z, R2->X);
    }
}
@@ -0,0 +1,867 @@ | |||||
/******************************************************************************************** | |||||
* SIDH: an efficient supersingular isogeny-based cryptography library for ephemeral | |||||
* Diffie-Hellman key exchange. | |||||
* | |||||
* Copyright (c) Microsoft Corporation. All rights reserved. | |||||
* | |||||
* | |||||
* Abstract: modular arithmetic optimized for x64 platforms | |||||
* | |||||
*********************************************************************************************/ | |||||
#include "P751_internal.h" | |||||
// Global constants | |||||
extern const uint64_t p751[NWORDS_FIELD]; | |||||
extern const uint64_t p751p1[NWORDS_FIELD]; | |||||
extern const uint64_t p751x2[NWORDS_FIELD]; | |||||
__inline void fpadd751(const digit_t *a, const digit_t *b, digit_t *c)
{ // Field addition on the redundant representation: c = a+b mod p751.
  // Inputs: a, b in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]
#if (OS_TARGET == OS_WIN)
    const digit_t *twice_p = (digit_t *)p751x2;
    unsigned int k, flag = 0;
    digit_t select;

    // Step 1: plain multiprecision sum, c = a + b
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        ADDC(flag, a[k], b[k], flag, c[k]);
    }

    // Step 2: c = c - 2*p751; the last borrow records whether the difference wrapped
    flag = 0;
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        SUBC(flag, c[k], twice_p[k], flag, c[k]);
    }

    // Step 3: add 2*p751 back through a mask derived from that borrow (no branch on data)
    select = 0 - (digit_t)flag;
    flag = 0;
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        ADDC(flag, c[k], twice_p[k] & select, flag, c[k]);
    }
#elif (OS_TARGET == OS_LINUX)
    // Assembly implementation performs the whole operation
    fpadd751_asm(a, b, c);
#endif
}
__inline void fpsub751(const digit_t *a, const digit_t *b, digit_t *c)
{ // Field subtraction on the redundant representation: c = a-b mod p751.
  // Inputs: a, b in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]
#if (OS_TARGET == OS_WIN)
    const digit_t *twice_p = (digit_t *)p751x2;
    unsigned int k, flag = 0;
    digit_t select;

    // Step 1: multiprecision difference, c = a - b
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        SUBC(flag, a[k], b[k], flag, c[k]);
    }

    // Step 2: if the subtraction borrowed, add 2*p751 back through a mask (no branch on data)
    select = 0 - (digit_t)flag;
    flag = 0;
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        ADDC(flag, c[k], twice_p[k] & select, flag, c[k]);
    }
#elif (OS_TARGET == OS_LINUX)
    // Assembly implementation performs the whole operation
    fpsub751_asm(a, b, c);
#endif
}
__inline void fpneg751(digit_t *a)
{ // In-place field negation: a = 2*p751 - a, i.e. -a mod p751 on the redundant representation.
  // Input/output: a in [0, 2*p751-1]
    const digit_t *twice_p = (digit_t *)p751x2;
    unsigned int k, bw = 0;

    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        SUBC(bw, twice_p[k], a[k], bw, a[k]);
    }
}
void fpdiv2_751(const digit_t *a, digit_t *c)
{ // Field halving: c = a/2 mod p751.
  // Input : a in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]
    const digit_t *modulus = (digit_t *)p751;
    const digit_t odd_mask = 0 - (digit_t)(a[0] & 1); // all-ones when a is odd, zero when a is even
    unsigned int k, cy = 0;

    // c = a + (p751 if a is odd, else 0), then shift right by one bit
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        ADDC(cy, a[k], modulus[k] & odd_mask, cy, c[k]);
    }
    mp_shiftr1(c, NWORDS_FIELD);
}
void fpcorrection751(digit_t *a)
{ // Final reduction: brings a field element a from [0, 2*p751-1] down to [0, p751-1], in place.
    const digit_t *modulus = (digit_t *)p751;
    unsigned int k, flag = 0;
    digit_t select;

    // a = a - p751
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        SUBC(flag, a[k], modulus[k], flag, a[k]);
    }

    // Undo the subtraction through a mask when it borrowed
    select = 0 - (digit_t)flag;
    flag = 0;
    for (k = 0; k != NWORDS_FIELD; ++k)
    {
        ADDC(flag, a[k], modulus[k] & select, flag, a[k]);
    }
}
void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) | |||||
{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
// The nwords argument is not used: the OS_WIN branch below is unrolled for 12-word
// operands (a[0..11], b[0..11]) and writes the 24 words c[0..23]; the OS_LINUX branch
// forwards to mul751_asm(a, b, c).
// OS_WIN branch layout: one group of MULADD128 calls per output word ("column" k holds
// every product a[i]*b[j] with i+j = k). uv is the running two-word accumulator and t
// collects the carries reported by MULADD128; after a column, the low word of uv is
// stored to c[k] and (uv, t) is shifted down by one word.
// NOTE(review): MULADD128 is defined outside this file; presumably
// MULADD128(x, y, in, carry, out) computes out = in + x*y with the carry-out in carry -- confirm.
UNREFERENCED_PARAMETER(nwords); | |||||
#if (OS_TARGET == OS_WIN) | |||||
digit_t t = 0; | |||||
uint128_t uv = {0}; | |||||
unsigned int carry = 0; | |||||
// Column 0
MULADD128(a[0], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[0] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 1
MULADD128(a[0], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[1] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 2
MULADD128(a[0], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[2] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 3
MULADD128(a[0], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[3] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 4
MULADD128(a[0], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[4] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 5
MULADD128(a[0], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[5] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 6
MULADD128(a[0], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[6] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 7
MULADD128(a[0], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[7] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 8
MULADD128(a[0], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[8] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 9
MULADD128(a[0], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[9] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 10
MULADD128(a[0], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[10] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 11
MULADD128(a[0], b[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[1], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[1], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[11], b[0], uv, carry, uv); | |||||
t += carry; | |||||
c[11] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 12
MULADD128(a[1], b[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[11], b[1], uv, carry, uv); | |||||
t += carry; | |||||
c[12] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 13
MULADD128(a[11], b[2], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[2], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[13] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 14
MULADD128(a[11], b[3], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[3], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[14] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 15
MULADD128(a[11], b[4], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[4], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[15] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 16
MULADD128(a[11], b[5], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[5], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[16] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 17
MULADD128(a[11], b[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[6], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[17] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 18
MULADD128(a[11], b[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[7], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[18] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 19
MULADD128(a[11], b[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[8], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[19] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 20
MULADD128(a[11], b[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[9], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[20] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 21
MULADD128(a[11], b[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(a[10], b[11], uv, carry, uv); | |||||
t += carry; | |||||
c[21] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
// Column 22: single product; both accumulator words become the top two result words
MULADD128(a[11], b[11], uv, carry, uv); | |||||
c[22] = uv[0]; | |||||
c[23] = uv[1]; | |||||
#elif (OS_TARGET == OS_LINUX) | |||||
// Assembly implementation performs the whole multiplication
mul751_asm(a, b, c); | |||||
#endif | |||||
} | |||||
void rdc_mont(const dfelm_t ma, felm_t mc) | |||||
{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p751.
// mc = ma*R^-1 mod p751x2, where R = 2^768.
// If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1].
// ma is assumed to be in Montgomery representation.
// OS_WIN branch layout: unrolled for 12-word mc and 24-word ma. Only words 5..11 of
// p751p1 are referenced (NOTE(review): presumably the lower words of p751p1 are zero -- confirm).
// Columns 0..4 copy ma straight into mc; columns 5..11 fill mc[5..11]; columns 12..23
// then overwrite mc[0..11] in place, each word being rewritten only after its last use
// as a multiplier. uv is the two-word accumulator and t collects carries between columns.
// The OS_LINUX branch forwards to rdc751_asm(ma, mc).
#if (OS_TARGET == OS_WIN) | |||||
unsigned int carry; | |||||
digit_t t = 0; | |||||
uint128_t uv = {0}; | |||||
// Columns 0..4: no multiplications are performed, the words are copied
mc[0] = ma[0]; | |||||
mc[1] = ma[1]; | |||||
mc[2] = ma[2]; | |||||
mc[3] = ma[3]; | |||||
mc[4] = ma[4]; | |||||
// Column 5
MUL128(mc[0], ((digit_t *)p751p1)[5], uv); | |||||
ADDC(0, uv[0], ma[5], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
mc[5] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = 0; | |||||
// Column 6 (the carry of the first MULADD128 is not accumulated into t here)
MULADD128(mc[0], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
MULADD128(mc[1], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[6], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[6] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 7
MULADD128(mc[0], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[1], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[2], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[7], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[7] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 8
MULADD128(mc[0], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[1], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[2], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[3], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[8], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[8] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 9
MULADD128(mc[0], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[1], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[2], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[3], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[4], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[9], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[9] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 10
MULADD128(mc[0], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[1], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[2], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[3], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[4], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[5], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[10], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[10] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 11
MULADD128(mc[0], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[1], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[2], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[3], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[4], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[5], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[6], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[11], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[11] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 12: from here on the result words overwrite mc[0], mc[1], ...
MULADD128(mc[1], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[2], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[3], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[4], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[5], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[6], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[7], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[12], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[0] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 13
MULADD128(mc[2], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[3], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[4], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[5], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[6], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[7], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[8], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[13], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[1] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 14
MULADD128(mc[3], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[4], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[5], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[6], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[7], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[8], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[9], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[14], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[2] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 15
MULADD128(mc[4], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[5], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[6], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[7], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[8], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[9], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[10], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[15], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[3] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 16
MULADD128(mc[5], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[6], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[7], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[8], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[9], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[10], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[11], ((digit_t *)p751p1)[5], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[16], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[4] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 17
MULADD128(mc[6], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[7], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[8], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[9], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[10], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[11], ((digit_t *)p751p1)[6], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[17], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[5] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 18
MULADD128(mc[7], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[8], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[9], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[10], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[11], ((digit_t *)p751p1)[7], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[18], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[6] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 19
MULADD128(mc[8], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[9], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[10], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[11], ((digit_t *)p751p1)[8], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[19], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[7] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 20
MULADD128(mc[9], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[10], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[11], ((digit_t *)p751p1)[9], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[20], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[8] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Column 21
MULADD128(mc[10], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
MULADD128(mc[11], ((digit_t *)p751p1)[10], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[21], carry, uv[0]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
t += carry; | |||||
mc[9] = uv[0]; | |||||
uv[0] = uv[1]; | |||||
uv[1] = t; | |||||
t = 0; | |||||
// Columns 22 and 23: last product, then ma[22] and ma[23] are folded in to give mc[10] and mc[11]
MULADD128(mc[11], ((digit_t *)p751p1)[11], uv, carry, uv); | |||||
t += carry; | |||||
ADDC(0, uv[0], ma[22], carry, mc[10]); | |||||
ADDC(carry, uv[1], 0, carry, uv[1]); | |||||
ADDC(0, uv[1], ma[23], carry, mc[11]); | |||||
#elif (OS_TARGET == OS_LINUX) | |||||
// Assembly implementation performs the whole reduction
rdc751_asm(ma, mc); | |||||
#endif | |||||
} | |||||
@@ -0,0 +1,474 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: core functions over GF(p) and GF(p^2) | |||||
*********************************************************************************************/ | |||||
#include "P751_internal.h" | |||||
__inline void fpcopy(const felm_t a, felm_t c)
{ // Field element copy: the NWORDS_FIELD words of a are written to c, lowest index first.
    unsigned int k = 0;

    while (k < NWORDS_FIELD)
    {
        c[k] = a[k];
        k++;
    }
}
__inline void fpzero(felm_t a)
{ // Clears a field element: every one of the NWORDS_FIELD words of a is set to 0.
    unsigned int k = 0;

    while (k < NWORDS_FIELD)
    {
        a[k] = 0;
        k++;
    }
}
void to_mont(const felm_t a, felm_t mc)
{ // Converts a to Montgomery representation:
  // mc = a*R^2*R^(-1) mod p = a*R mod p, for a in [0, p-1].
  // R^2 mod p is taken from the global constant "Montgomery_R2".
    digit_t *r_squared = (digit_t *)&Montgomery_R2;

    fpmul_mont(a, r_squared, mc);
}
void from_mont(const felm_t ma, felm_t c)
{ // Converts ma from Montgomery representation back to standard representation:
  // c = ma*R^(-1) mod p = a mod p, for ma in [0, p-1].
    digit_t unit[NWORDS_FIELD] = {1}; // the integer 1: lowest word is 1, all remaining words are 0

    fpmul_mont(ma, unit, c); // a Montgomery product with 1 strips one factor of R
    fpcorrection(c);
}
void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords)
{ // Word-wise copy, c = a, for operands of nwords digits; words are written lowest index first.
    unsigned int k = 0;

    while (k < nwords)
    {
        c[k] = a[k];
        k++;
    }
}
void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc)
{ // Field multiplication, c = a*b mod p: full-width product followed by Montgomery reduction.
    dfelm_t wide = {0}; // double-length product buffer

    mp_mul(ma, mb, wide, NWORDS_FIELD);
    rdc_mont(wide, mc);
}
void fpsqr_mont(const felm_t ma, felm_t mc)
{ // Field squaring, c = a^2 mod p: multiplies ma by itself, then applies Montgomery reduction.
    dfelm_t wide = {0}; // double-length product buffer

    mp_mul(ma, ma, wide, NWORDS_FIELD);
    rdc_mont(wide, mc);
}
void fpinv_mont(felm_t a)
{ // In-place field inversion using Montgomery arithmetic, a = a^(-1)*R mod p.
    felm_t acc;
    unsigned int round;

    fpcopy(a, acc);
    fpinv_chain_mont(acc); // runs the exponentiation chain on the copy
    for (round = 0; round < 2; round++)
    { // two further squarings of the chain result
        fpsqr_mont(acc, acc);
    }
    fpmul_mont(a, acc, a); // final multiplication by the original input
}
void fp2copy(const f2elm_t a, f2elm_t c)
{ // GF(p^2) copy, c = a: both components are copied, component 0 first.
    unsigned int part;

    for (part = 0; part < 2; part++)
    {
        fpcopy(a[part], c[part]);
    }
}
void fp2zero(f2elm_t a)
{ // GF(p^2) zeroing, a = 0: both components are cleared, component 0 first.
    unsigned int part;

    for (part = 0; part < 2; part++)
    {
        fpzero(a[part]);
    }
}
void fp2neg(f2elm_t a)
{ // In-place GF(p^2) negation, a = -a: each component is negated, component 0 first.
    unsigned int part;

    for (part = 0; part < 2; part++)
    {
        fpneg(a[part]);
    }
}
__inline void fp2add(const f2elm_t a, const f2elm_t b, f2elm_t c)
{ // GF(p^2) addition, c = a+b, performed component by component (component 0 first).
    unsigned int part;

    for (part = 0; part < 2; part++)
    {
        fpadd(a[part], b[part], c[part]);
    }
}
__inline void fp2sub(const f2elm_t a, const f2elm_t b, f2elm_t c)
{ // GF(p^2) subtraction, c = a-b, performed component by component (component 0 first).
    unsigned int part;

    for (part = 0; part < 2; part++)
    {
        fpsub(a[part], b[part], c[part]);
    }
}
void fp2div2(const f2elm_t a, f2elm_t c)
{ // GF(p^2) halving, c = a/2, performed component by component (component 0 first).
    unsigned int part;

    for (part = 0; part < 2; part++)
    {
        fpdiv2(a[part], c[part]);
    }
}
void fp2correction(f2elm_t a)
{ // In-place modular correction of a GF(p^2) element: fpcorrection is applied to each component.
    unsigned int part;

    for (part = 0; part < 2; part++)
    {
        fpcorrection(a[part]);
    }
}
__inline static void mp_addfast(const digit_t *a, const digit_t *b, digit_t *c)
{
    // Multiprecision addition c = a+b; the work is delegated entirely to the
    // assembly routine mp_add_asm.
    mp_add_asm(a, b, c);
}
__inline static void mp_addfastx2(const digit_t *a, const digit_t *b, digit_t *c)
{
    // Double-length multiprecision addition c = a+b; the work is delegated entirely
    // to the assembly routine mp_addx2_asm.
    mp_addx2_asm(a, b, c);
}
void fp2sqr_mont(const f2elm_t a, f2elm_t c)
{ // Montgomery squaring in GF(p^2): c = a^2.
  // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1]
  // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
    felm_t sum, diff, dbl;

    // All three temporaries are formed before c is written
    mp_addfast(a[0], a[1], sum);  // sum  = a0+a1
    fpsub(a[0], a[1], diff);      // diff = a0-a1
    mp_addfast(a[0], a[0], dbl);  // dbl  = 2*a0

    fpmul_mont(sum, diff, c[0]);  // c0 = (a0+a1)*(a0-a1)
    fpmul_mont(dbl, a[1], c[1]);  // c1 = 2*a0*a1
}
__inline unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{ // Multiprecision subtraction c = a-b over nwords digits.
  // Returns the borrow left over after the last word.
    unsigned int bw = 0;
    unsigned int k = 0;

    while (k < nwords)
    {
        SUBC(bw, a[k], b[k], bw, c[k]);
        k++;
    }
    return bw;
}
__inline static digit_t mp_subfast(const digit_t *a, const digit_t *b, digit_t *c)
{
    // Multiprecision subtraction c = a-b on operands of 2*NWORDS_FIELD words,
    // delegated to the assembly routine mp_subx2_asm.
    // The returned mask is 0xFF..F when c < 0 and 0x00..0 otherwise.
    const digit_t sign_mask = mp_subx2_asm(a, b, c);

    return sign_mask;
}
void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c)
{ // Montgomery multiplication in GF(p^2): c = a*b.
  // Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1]
  // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
    felm_t sum_a, sum_b;
    dfelm_t w0, w1, w2;
    digit_t neg_mask;
    unsigned int k, cy = 0;

    // All reads of a and b happen here, before anything is stored to c
    mp_mul(a[0], b[0], w0, NWORDS_FIELD); // w0 = a0*b0
    mp_mul(a[1], b[1], w1, NWORDS_FIELD); // w1 = a1*b1
    mp_addfast(a[0], a[1], sum_a);        // sum_a = a0+a1
    mp_addfast(b[0], b[1], sum_b);        // sum_b = b0+b1

    // Real part: w2 = a0*b0 - a1*b1; neg_mask is 0xFF..F when w2 < 0, else 0x00..0
    neg_mask = mp_subfast(w0, w1, w2);
    for (k = NWORDS_FIELD; k != 2 * NWORDS_FIELD; ++k)
    { // masked addition of PRIME into the upper half of w2
        ADDC(cy, w2[k], ((digit_t *)PRIME)[k - NWORDS_FIELD] & neg_mask, cy, w2[k]);
    }
    rdc_mont(w2, c[0]);                   // c0 = a0*b0 - a1*b1

    // Imaginary part: (a0+a1)*(b0+b1) - a0*b0 - a1*b1 = a1*b0 + a0*b1
    mp_addfastx2(w0, w1, w0);             // w0 = a0*b0 + a1*b1
    mp_mul(sum_a, sum_b, w1, NWORDS_FIELD); // w1 = (a0+a1)*(b0+b1)
    mp_subfast(w1, w0, w1);               // w1 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
    rdc_mont(w1, c[1]);                   // c1 = a1*b0 + a0*b1
}
void fpinv_chain_mont(felm_t a)
{
    // Addition chain computing a^(p-3)/4 with Montgomery arithmetic; the
    // result overwrites a.
    //
    // The chain is a fixed sequence of steps "square tt nsqr times, then
    // multiply tt by a table entry". Each step is one {nsqr, idx} pair below;
    // idx selects t[idx], and idx == -1 selects the input a itself.
    static const struct
    {
        unsigned char nsqr;
        signed char idx;
    } chain[] = {
        {6, 20}, {6, 24}, {6, 11}, {6, 8}, {8, 2}, {6, 23}, {6, 2}, {9, 2},
        {10, 15}, {8, 13}, {8, 26}, {8, 20}, {6, 11}, {6, 10}, {6, 14}, {6, 4},
        {10, 18}, {6, 1}, {7, 22}, {10, 6}, {7, 24}, {6, 9}, {8, 18}, {6, 17},
        {8, -1}, {10, 16}, {6, 7}, {6, 0}, {7, 12}, {7, 19}, {6, 22}, {6, 25},
        {7, 2}, {6, 10}, {7, 22}, {8, 18}, {6, 4}, {6, 14}, {7, 13}, {6, 5},
        {6, 23}, {6, 21}, {6, 2}, {7, 23}, {8, 12}, {6, 9}, {6, 3}, {7, 13},
        {7, 17}, {8, 26}, {8, 5}, {8, 8}, {6, 2}, {6, 11}, {7, 20}
    };
    const unsigned int nsteps = (unsigned int)(sizeof(chain) / sizeof(chain[0]));
    unsigned int step, rep, s;
    felm_t t[27], tt;

    // Precomputed table. tt = a^2 and t[0] = a*tt; each later entry is the
    // previous one times tt, except that t[3], t[9], t[21] and t[25] are
    // multiplied by tt once more in place before being used.
    fpsqr_mont(a, tt);
    fpmul_mont(a, tt, t[0]);
    for (s = 0; s <= 25; s++)
    {
        if (s == 3 || s == 9 || s == 21 || s == 25)
            fpmul_mont(t[s], tt, t[s]);
        fpmul_mont(t[s], tt, t[s + 1]);
    }

    // Main chain, starting from tt = a.
    fpcopy(a, tt);
    for (step = 0; step < nsteps; step++)
    {
        for (s = 0; s < chain[step].nsqr; s++)
            fpsqr_mont(tt, tt);
        fpmul_mont(chain[step].idx < 0 ? a : t[chain[step].idx], tt, tt);
    }

    // Tail: 61 repetitions of "6 squarings, then multiply by t[26]".
    for (rep = 0; rep < 61; rep++)
    {
        for (s = 0; s < 6; s++)
            fpsqr_mont(tt, tt);
        fpmul_mont(t[26], tt, tt);
    }

    fpcopy(tt, a);
}
void fp2inv_mont(f2elm_t a)
{
    // In-place inversion in GF(p^2) with Montgomery arithmetic:
    //   a <- (a0 - i*a1) / (a0^2 + a1^2)
    felm_t norm, im_sq;

    fpsqr_mont(a[0], norm);         // norm  = a0^2
    fpsqr_mont(a[1], im_sq);        // im_sq = a1^2
    fpadd(norm, im_sq, norm);       // norm  = a0^2 + a1^2
    fpinv_mont(norm);               // norm  = (a0^2 + a1^2)^-1

    fpneg(a[1]);                    // a = a0 - i*a1
    fpmul_mont(a[0], norm, a[0]);   // scale both components by the inverse norm
    fpmul_mont(a[1], norm, a[1]);
}
void to_fp2mont(const f2elm_t a, f2elm_t mc)
{
    // Converts a GF(p^2) element to Montgomery representation by applying
    // to_mont to each of its two components:
    //   mc_i = a_i*R^2*R^(-1) = a_i*R
    unsigned int part;

    for (part = 0; part < 2; part++)
        to_mont(a[part], mc[part]);
}
void from_fp2mont(const f2elm_t ma, f2elm_t c)
{
    // Converts a GF(p^2) element from Montgomery to standard representation
    // by applying from_mont to each of its two components:
    //   c_i = ma_i*R^(-1) = a_i
    unsigned int part;

    for (part = 0; part < 2; part++)
        from_mont(ma[part], c[part]);
}
__inline unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{
    // Digit-by-digit addition c = a + b over nwords digits.
    // The carry leaving the most significant digit is returned.
    unsigned int cy = 0;
    unsigned int k = 0;

    while (k < nwords)
    {
        ADDC(cy, a[k], b[k], cy, c[k]); // carry out of digit k feeds digit k+1
        k++;
    }
    return cy;
}
void mp_shiftleft(digit_t *x, unsigned int shift, const unsigned int nwords)
{
    // Multiprecision left shift of the nwords-digit value x by "shift" bits, in place.
    // The shift is split into a whole-digit move followed by a sub-digit bit shift.
    //
    // Fixes relative to the previous version:
    //  - a shift that is an exact multiple of RADIX is now handled entirely as a
    //    digit move; previously shift == RADIX reached SHIFTL / "<<=" with a
    //    full-width shift count, which is undefined behaviour in C;
    //  - a digit move of nwords or more clears x instead of letting
    //    "nwords - j" wrap around;
    //  - nwords == 0 is a no-op instead of letting "nwords - 1" wrap around.
    unsigned int i, j = 0;

    if (nwords == 0)
        return;

    while (shift >= RADIX)
    {
        j += 1;
        shift -= RADIX;
    }
    if (j > nwords)
        j = nwords; // everything is shifted out

    // Move whole digits upwards, starting from the most significant one.
    for (i = 0; i < nwords - j; i++)
        x[nwords - 1 - i] = x[nwords - 1 - i - j];
    // Clear the j vacated low digits.
    for (i = nwords - j; i < nwords; i++)
        x[nwords - 1 - i] = 0;

    // Remaining bit shift, 0 < shift < RADIX.
    if (shift != 0)
    {
        for (j = nwords - 1; j > 0; j--)
            SHIFTL(x[j], x[j - 1], shift, x[j], RADIX);
        x[0] <<= shift;
    }
}
void mp_shiftr1(digit_t *x, const unsigned int nwords)
{
    // Shifts the nwords-digit value x right by one bit, in place.
    const unsigned int top = nwords - 1;
    unsigned int k = 0;

    // Each digit below the top one is combined with its upper neighbour.
    while (k < top)
    {
        SHIFTR(x[k + 1], x[k], 1, x[k], RADIX);
        k++;
    }
    x[top] >>= 1;
}
void mp_shiftl1(digit_t *x, const unsigned int nwords)
{
    // Shifts the nwords-digit value x left by one bit, in place.
    int k = nwords - 1;

    // Walk from the top digit down, combining each digit with its lower neighbour.
    while (k > 0)
    {
        SHIFTL(x[k], x[k - 1], 1, x[k], RADIX);
        k--;
    }
    x[0] <<= 1;
}
@@ -0,0 +1,43 @@ | |||||
/******************************************************************************************** | |||||
* Hardware-based random number generation function using /dev/urandom | |||||
*********************************************************************************************/ | |||||
#include "random.h" | |||||
#include <stdlib.h> | |||||
#include <unistd.h> | |||||
#include <fcntl.h> | |||||
static int lock = -1; | |||||
static __inline void delay(unsigned int count)
{
    // Crude busy-wait: counts "count" down to zero.
    // The counter is volatile so that an optimizing compiler cannot delete the
    // otherwise side-effect-free loop, which would remove the back-off
    // entirely from the callers' retry loops.
    volatile unsigned int spin = count;

    while (spin--) {}
}
int randombytes(unsigned char* random_array, unsigned long long nbytes) | |||||
{ // Generation of "nbytes" of random values | |||||
int r, n = (int)nbytes, count = 0; | |||||
if (lock == -1) { | |||||
do { | |||||
lock = open("/dev/urandom", O_RDONLY); | |||||
if (lock == -1) { | |||||
delay(0xFFFFF); | |||||
} | |||||
} while (lock == -1); | |||||
} | |||||
while (n > 0) { | |||||
do { | |||||
r = read(lock, random_array+count, n); | |||||
if (r == -1) { | |||||
delay(0xFFFF); | |||||
} | |||||
} while (r == -1); | |||||
count += r; | |||||
n -= r; | |||||
} | |||||
return 0; | |||||
} |
@@ -0,0 +1,9 @@ | |||||
#ifndef __RANDOM_H__ | |||||
#define __RANDOM_H__ | |||||
// Generate random bytes and output the result to random_array | |||||
int randombytes(unsigned char* random_array, unsigned long long nbytes); | |||||
#endif |
@@ -0,0 +1,573 @@ | |||||
/******************************************************************************************** | |||||
* SHA3-derived functions: SHAKE and cSHAKE | |||||
* | |||||
* Based on the public domain implementation in crypto_hash/keccakc512/simple/ | |||||
* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer | |||||
* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202 | |||||
* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe | |||||
* | |||||
* See NIST Special Publication 800-185 for more information: | |||||
* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf | |||||
* | |||||
*********************************************************************************************/ | |||||
#include <stdint.h> | |||||
#include <assert.h> | |||||
#include "fips202.h" | |||||
#define NROUNDS 24 | |||||
// Rotate the 64-bit value a left by offset bits. offset must be in 1..63:
// an offset of 0 or 64 would produce a full-width shift.
// Both arguments are parenthesized so that compound expressions expand correctly.
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
static uint64_t load64(const unsigned char *x)
{
    // Reads 8 bytes from x as a little-endian 64-bit value (x[0] is the
    // least significant byte).
    uint64_t word = 0;
    int k;

    // Start with the most significant byte and shift it up as the rest are added.
    for (k = 7; k >= 0; --k)
        word = (word << 8) | x[k];

    return word;
}
static void store64(uint8_t *x, uint64_t u)
{
    // Writes u to x[0..7] in little-endian byte order (x[0] receives the
    // least significant byte).
    unsigned int bit;

    for (bit = 0; bit < 64; bit += 8)
        x[bit >> 3] = (uint8_t)(u >> bit);
}
// Per-round constants; entry r is XORed into lane 0 of the state in round r
// of KeccakF1600_StatePermute.
static const uint64_t KeccakF_RoundConstants[NROUNDS] =
{
    UINT64_C(0x0000000000000001), UINT64_C(0x0000000000008082),
    UINT64_C(0x800000000000808a), UINT64_C(0x8000000080008000),
    UINT64_C(0x000000000000808b), UINT64_C(0x0000000080000001),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008009),
    UINT64_C(0x000000000000008a), UINT64_C(0x0000000000000088),
    UINT64_C(0x0000000080008009), UINT64_C(0x000000008000000a),
    UINT64_C(0x000000008000808b), UINT64_C(0x800000000000008b),
    UINT64_C(0x8000000000008089), UINT64_C(0x8000000000008003),
    UINT64_C(0x8000000000008002), UINT64_C(0x8000000000000080),
    UINT64_C(0x000000000000800a), UINT64_C(0x800000008000000a),
    UINT64_C(0x8000000080008081), UINT64_C(0x8000000000008080),
    UINT64_C(0x0000000080000001), UINT64_C(0x8000000080008008)
};
void KeccakF1600_StatePermute(uint64_t * state) | |||||
{ | |||||
int round; | |||||
uint64_t Aba, Abe, Abi, Abo, Abu; | |||||
uint64_t Aga, Age, Agi, Ago, Agu; | |||||
uint64_t Aka, Ake, Aki, Ako, Aku; | |||||
uint64_t Ama, Ame, Ami, Amo, Amu; | |||||
uint64_t Asa, Ase, Asi, Aso, Asu; | |||||
uint64_t BCa, BCe, BCi, BCo, BCu; | |||||
uint64_t Da, De, Di, Do, Du; | |||||
uint64_t Eba, Ebe, Ebi, Ebo, Ebu; | |||||
uint64_t Ega, Ege, Egi, Ego, Egu; | |||||
uint64_t Eka, Eke, Eki, Eko, Eku; | |||||
uint64_t Ema, Eme, Emi, Emo, Emu; | |||||
uint64_t Esa, Ese, Esi, Eso, Esu; | |||||
//copyFromState(A, state) | |||||
Aba = state[ 0]; | |||||
Abe = state[ 1]; | |||||
Abi = state[ 2]; | |||||
Abo = state[ 3]; | |||||
Abu = state[ 4]; | |||||
Aga = state[ 5]; | |||||
Age = state[ 6]; | |||||
Agi = state[ 7]; | |||||
Ago = state[ 8]; | |||||
Agu = state[ 9]; | |||||
Aka = state[10]; | |||||
Ake = state[11]; | |||||
Aki = state[12]; | |||||
Ako = state[13]; | |||||
Aku = state[14]; | |||||
Ama = state[15]; | |||||
Ame = state[16]; | |||||
Ami = state[17]; | |||||
Amo = state[18]; | |||||
Amu = state[19]; | |||||
Asa = state[20]; | |||||
Ase = state[21]; | |||||
Asi = state[22]; | |||||
Aso = state[23]; | |||||
Asu = state[24]; | |||||
for( round = 0; round < NROUNDS; round += 2 ) | |||||
{ | |||||
// prepareTheta | |||||
BCa = Aba^Aga^Aka^Ama^Asa; | |||||
BCe = Abe^Age^Ake^Ame^Ase; | |||||
BCi = Abi^Agi^Aki^Ami^Asi; | |||||
BCo = Abo^Ago^Ako^Amo^Aso; | |||||
BCu = Abu^Agu^Aku^Amu^Asu; | |||||
//thetaRhoPiChiIotaPrepareTheta(round , A, E) | |||||
Da = BCu^ROL(BCe, 1); | |||||
De = BCa^ROL(BCi, 1); | |||||
Di = BCe^ROL(BCo, 1); | |||||
Do = BCi^ROL(BCu, 1); | |||||
Du = BCo^ROL(BCa, 1); | |||||
Aba ^= Da; | |||||
BCa = Aba; | |||||
Age ^= De; | |||||
BCe = ROL(Age, 44); | |||||
Aki ^= Di; | |||||
BCi = ROL(Aki, 43); | |||||
Amo ^= Do; | |||||
BCo = ROL(Amo, 21); | |||||
Asu ^= Du; | |||||
BCu = ROL(Asu, 14); | |||||
Eba = BCa ^((~BCe)& BCi ); | |||||
Eba ^= (uint64_t)KeccakF_RoundConstants[round]; | |||||
Ebe = BCe ^((~BCi)& BCo ); | |||||
Ebi = BCi ^((~BCo)& BCu ); | |||||
Ebo = BCo ^((~BCu)& BCa ); | |||||
Ebu = BCu ^((~BCa)& BCe ); | |||||
Abo ^= Do; | |||||
BCa = ROL(Abo, 28); | |||||
Agu ^= Du; | |||||
BCe = ROL(Agu, 20); | |||||
Aka ^= Da; | |||||
BCi = ROL(Aka, 3); | |||||
Ame ^= De; | |||||
BCo = ROL(Ame, 45); | |||||
Asi ^= Di; | |||||
BCu = ROL(Asi, 61); | |||||
Ega = BCa ^((~BCe)& BCi ); | |||||
Ege = BCe ^((~BCi)& BCo ); | |||||
Egi = BCi ^((~BCo)& BCu ); | |||||
Ego = BCo ^((~BCu)& BCa ); | |||||
Egu = BCu ^((~BCa)& BCe ); | |||||
Abe ^= De; | |||||
BCa = ROL(Abe, 1); | |||||
Agi ^= Di; | |||||
BCe = ROL(Agi, 6); | |||||
Ako ^= Do; | |||||
BCi = ROL(Ako, 25); | |||||
Amu ^= Du; | |||||
BCo = ROL(Amu, 8); | |||||
Asa ^= Da; | |||||
BCu = ROL(Asa, 18); | |||||
Eka = BCa ^((~BCe)& BCi ); | |||||
Eke = BCe ^((~BCi)& BCo ); | |||||
Eki = BCi ^((~BCo)& BCu ); | |||||
Eko = BCo ^((~BCu)& BCa ); | |||||
Eku = BCu ^((~BCa)& BCe ); | |||||
Abu ^= Du; | |||||
BCa = ROL(Abu, 27); | |||||
Aga ^= Da; | |||||
BCe = ROL(Aga, 36); | |||||
Ake ^= De; | |||||
BCi = ROL(Ake, 10); | |||||
Ami ^= Di; | |||||
BCo = ROL(Ami, 15); | |||||
Aso ^= Do; | |||||
BCu = ROL(Aso, 56); | |||||
Ema = BCa ^((~BCe)& BCi ); | |||||
Eme = BCe ^((~BCi)& BCo ); | |||||
Emi = BCi ^((~BCo)& BCu ); | |||||
Emo = BCo ^((~BCu)& BCa ); | |||||
Emu = BCu ^((~BCa)& BCe ); | |||||
Abi ^= Di; | |||||
BCa = ROL(Abi, 62); | |||||
Ago ^= Do; | |||||
BCe = ROL(Ago, 55); | |||||
Aku ^= Du; | |||||
BCi = ROL(Aku, 39); | |||||
Ama ^= Da; | |||||
BCo = ROL(Ama, 41); | |||||
Ase ^= De; | |||||
BCu = ROL(Ase, 2); | |||||
Esa = BCa ^((~BCe)& BCi ); | |||||
Ese = BCe ^((~BCi)& BCo ); | |||||
Esi = BCi ^((~BCo)& BCu ); | |||||
Eso = BCo ^((~BCu)& BCa ); | |||||
Esu = BCu ^((~BCa)& BCe ); | |||||
// prepareTheta | |||||
BCa = Eba^Ega^Eka^Ema^Esa; | |||||
BCe = Ebe^Ege^Eke^Eme^Ese; | |||||
BCi = Ebi^Egi^Eki^Emi^Esi; | |||||
BCo = Ebo^Ego^Eko^Emo^Eso; | |||||
BCu = Ebu^Egu^Eku^Emu^Esu; | |||||
//thetaRhoPiChiIotaPrepareTheta(round+1, E, A) | |||||
Da = BCu^ROL(BCe, 1); | |||||
De = BCa^ROL(BCi, 1); | |||||
Di = BCe^ROL(BCo, 1); | |||||
Do = BCi^ROL(BCu, 1); | |||||
Du = BCo^ROL(BCa, 1); | |||||
Eba ^= Da; | |||||
BCa = Eba; | |||||
Ege ^= De; | |||||
BCe = ROL(Ege, 44); | |||||
Eki ^= Di; | |||||
BCi = ROL(Eki, 43); | |||||
Emo ^= Do; | |||||
BCo = ROL(Emo, 21); | |||||
Esu ^= Du; | |||||
BCu = ROL(Esu, 14); | |||||
Aba = BCa ^((~BCe)& BCi ); | |||||
Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; | |||||
Abe = BCe ^((~BCi)& BCo ); | |||||
Abi = BCi ^((~BCo)& BCu ); | |||||
Abo = BCo ^((~BCu)& BCa ); | |||||
Abu = BCu ^((~BCa)& BCe ); | |||||
Ebo ^= Do; | |||||
BCa = ROL(Ebo, 28); | |||||
Egu ^= Du; | |||||
BCe = ROL(Egu, 20); | |||||
Eka ^= Da; | |||||
BCi = ROL(Eka, 3); | |||||
Eme ^= De; | |||||
BCo = ROL(Eme, 45); | |||||
Esi ^= Di; | |||||
BCu = ROL(Esi, 61); | |||||
Aga = BCa ^((~BCe)& BCi ); | |||||
Age = BCe ^((~BCi)& BCo ); | |||||
Agi = BCi ^((~BCo)& BCu ); | |||||
Ago = BCo ^((~BCu)& BCa ); | |||||
Agu = BCu ^((~BCa)& BCe ); | |||||
Ebe ^= De; | |||||
BCa = ROL(Ebe, 1); | |||||
Egi ^= Di; | |||||
BCe = ROL(Egi, 6); | |||||
Eko ^= Do; | |||||
BCi = ROL(Eko, 25); | |||||
Emu ^= Du; | |||||
BCo = ROL(Emu, 8); | |||||
Esa ^= Da; | |||||
BCu = ROL(Esa, 18); | |||||
Aka = BCa ^((~BCe)& BCi ); | |||||
Ake = BCe ^((~BCi)& BCo ); | |||||
Aki = BCi ^((~BCo)& BCu ); | |||||
Ako = BCo ^((~BCu)& BCa ); | |||||
Aku = BCu ^((~BCa)& BCe ); | |||||
Ebu ^= Du; | |||||
BCa = ROL(Ebu, 27); | |||||
Ega ^= Da; | |||||
BCe = ROL(Ega, 36); | |||||
Eke ^= De; | |||||
BCi = ROL(Eke, 10); | |||||
Emi ^= Di; | |||||
BCo = ROL(Emi, 15); | |||||
Eso ^= Do; | |||||
BCu = ROL(Eso, 56); | |||||
Ama = BCa ^((~BCe)& BCi ); | |||||
Ame = BCe ^((~BCi)& BCo ); | |||||
Ami = BCi ^((~BCo)& BCu ); | |||||
Amo = BCo ^((~BCu)& BCa ); | |||||
Amu = BCu ^((~BCa)& BCe ); | |||||
Ebi ^= Di; | |||||
BCa = ROL(Ebi, 62); | |||||
Ego ^= Do; | |||||
BCe = ROL(Ego, 55); | |||||
Eku ^= Du; | |||||
BCi = ROL(Eku, 39); | |||||
Ema ^= Da; | |||||
BCo = ROL(Ema, 41); | |||||
Ese ^= De; | |||||
BCu = ROL(Ese, 2); | |||||
Asa = BCa ^((~BCe)& BCi ); | |||||
Ase = BCe ^((~BCi)& BCo ); | |||||
Asi = BCi ^((~BCo)& BCu ); | |||||
Aso = BCo ^((~BCu)& BCa ); | |||||
Asu = BCu ^((~BCa)& BCe ); | |||||
} | |||||
//copyToState(state, A) | |||||
state[ 0] = Aba; | |||||
state[ 1] = Abe; | |||||
state[ 2] = Abi; | |||||
state[ 3] = Abo; | |||||
state[ 4] = Abu; | |||||
state[ 5] = Aga; | |||||
state[ 6] = Age; | |||||
state[ 7] = Agi; | |||||
state[ 8] = Ago; | |||||
state[ 9] = Agu; | |||||
state[10] = Aka; | |||||
state[11] = Ake; | |||||
state[12] = Aki; | |||||
state[13] = Ako; | |||||
state[14] = Aku; | |||||
state[15] = Ama; | |||||
state[16] = Ame; | |||||
state[17] = Ami; | |||||
state[18] = Amo; | |||||
state[19] = Amu; | |||||
state[20] = Asa; | |||||
state[21] = Ase; | |||||
state[22] = Asi; | |||||
state[23] = Aso; | |||||
state[24] = Asu; | |||||
#undef round | |||||
} | |||||
#include <string.h> | |||||
#define MIN(a, b) ((a) < (b) ? (a) : (b)) | |||||
static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen, unsigned char p)
{
    // Absorbs the mlen-byte message m into the state s using rate r (bytes).
    // Full blocks are XORed into the state and followed by a permutation.
    // The final partial block is padded with the byte p right after the
    // message and with bit 0x80 in the block's last byte, then XORed into
    // the state WITHOUT a trailing permutation.
    // The scratch buffer holds 200 bytes, so r must not exceed 200.
    unsigned char last[200];
    const unsigned int lanes = r / 8;
    unsigned long long k;

    // Whole blocks.
    for (; mlen >= r; mlen -= r, m += r)
    {
        for (k = 0; k < lanes; ++k)
            s[k] ^= load64(m + 8 * k);
        KeccakF1600_StatePermute(s);
    }

    // Final block: leftover message bytes, then zeros, then padding.
    for (k = 0; k < r; ++k)
        last[k] = (k < mlen) ? m[k] : 0;
    last[mlen] = p;
    last[r - 1] |= 128;

    for (k = 0; k < lanes; ++k)
        s[k] ^= load64(last + 8 * k);
}
static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r)
{
    // Produces nblocks output blocks of r bytes each into h.
    // For every block the state is permuted first, then its leading r/8
    // lanes are written out in little-endian byte order.
    const unsigned int lanes = r >> 3;
    unsigned int lane;

    for (; nblocks > 0; --nblocks, h += r)
    {
        KeccakF1600_StatePermute(s);
        for (lane = 0; lane < lanes; lane++)
            store64(h + 8 * lane, s[lane]);
    }
}
/********** SHAKE128 ***********/ | |||||
void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen) | |||||
{ | |||||
keccak_absorb(s, SHAKE128_RATE, input, inputByteLen, 0x1F); | |||||
} | |||||
void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) | |||||
{ | |||||
keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); | |||||
} | |||||
void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) | |||||
{ | |||||
uint64_t s[25] = {0}; | |||||
unsigned char t[SHAKE128_RATE]; | |||||
unsigned long long nblocks = outlen/SHAKE128_RATE; | |||||
size_t i; | |||||
/* Absorb input */ | |||||
keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F); | |||||
/* Squeeze output */ | |||||
keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); | |||||
output += nblocks*SHAKE128_RATE; | |||||
outlen -= nblocks*SHAKE128_RATE; | |||||
if (outlen) | |||||
{ | |||||
keccak_squeezeblocks(t, 1, s, SHAKE128_RATE); | |||||
for (i = 0; i < outlen; i++) | |||||
output[i] = t[i]; | |||||
} | |||||
} | |||||
/********** cSHAKE128 ***********/ | |||||
void cshake128_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen) | |||||
{ | |||||
unsigned char *sep = (unsigned char*)s; | |||||
unsigned int i; | |||||
for (i = 0; i < 25; i++) | |||||
s[i] = 0; | |||||
/* Absorb customization (domain-separation) string */ | |||||
sep[0] = 0x01; | |||||
sep[1] = 0xa8; | |||||
sep[2] = 0x01; | |||||
sep[3] = 0x00; | |||||
sep[4] = 0x01; | |||||
sep[5] = 16; // fixed bitlen of cstm | |||||
sep[6] = cstm & 0xff; | |||||
sep[7] = cstm >> 8; | |||||
KeccakF1600_StatePermute(s); | |||||
/* Absorb input */ | |||||
keccak_absorb(s, SHAKE128_RATE, in, inlen, 0x04); | |||||
} | |||||
void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) | |||||
{ | |||||
keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); | |||||
} | |||||
void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen) | |||||
{ | |||||
uint64_t s[25]; | |||||
unsigned char t[SHAKE128_RATE]; | |||||
unsigned int i; | |||||
cshake128_simple_absorb(s, cstm, in, inlen); | |||||
/* Squeeze output */ | |||||
keccak_squeezeblocks(output, outlen/SHAKE128_RATE, s, SHAKE128_RATE); | |||||
output += (outlen/SHAKE128_RATE)*SHAKE128_RATE; | |||||
if (outlen%SHAKE128_RATE) | |||||
{ | |||||
keccak_squeezeblocks(t, 1, s, SHAKE128_RATE); | |||||
for (i = 0; i < outlen%SHAKE128_RATE; i++) | |||||
output[i] = t[i]; | |||||
} | |||||
} | |||||
/********** SHAKE256 ***********/ | |||||
void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen) | |||||
{ | |||||
keccak_absorb(s, SHAKE256_RATE, input, inputByteLen, 0x1F); | |||||
} | |||||
void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) | |||||
{ | |||||
keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); | |||||
} | |||||
void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) | |||||
{ | |||||
uint64_t s[25]; | |||||
unsigned char t[SHAKE256_RATE]; | |||||
unsigned long long nblocks = outlen/SHAKE256_RATE; | |||||
size_t i; | |||||
for (i = 0; i < 25; ++i) | |||||
s[i] = 0; | |||||
/* Absorb input */ | |||||
keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F); | |||||
/* Squeeze output */ | |||||
keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); | |||||
output += nblocks*SHAKE256_RATE; | |||||
outlen -= nblocks*SHAKE256_RATE; | |||||
if (outlen) | |||||
{ | |||||
keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); | |||||
for (i = 0; i < outlen; i++) | |||||
output[i] = t[i]; | |||||
} | |||||
} | |||||
/********** cSHAKE256 ***********/ | |||||
void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen) | |||||
{ | |||||
unsigned char *sep = (unsigned char*)s; | |||||
unsigned int i; | |||||
for (i = 0; i < 25; i++) | |||||
s[i] = 0; | |||||
/* Absorb customization (domain-separation) string */ | |||||
sep[0] = 0x01; | |||||
sep[1] = 0x88; | |||||
sep[2] = 0x01; | |||||
sep[3] = 0x00; | |||||
sep[4] = 0x01; | |||||
sep[5] = 16; // fixed bitlen of cstm | |||||
sep[6] = cstm & 0xff; | |||||
sep[7] = cstm >> 8; | |||||
KeccakF1600_StatePermute(s); | |||||
/* Absorb input */ | |||||
keccak_absorb(s, SHAKE256_RATE, in, inlen, 0x04); | |||||
} | |||||
void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) | |||||
{ | |||||
keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); | |||||
} | |||||
void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen) | |||||
{ | |||||
uint64_t s[25]; | |||||
unsigned char t[SHAKE256_RATE]; | |||||
unsigned int i; | |||||
cshake256_simple_absorb(s, cstm, in, inlen); | |||||
/* Squeeze output */ | |||||
keccak_squeezeblocks(output, outlen/SHAKE256_RATE, s, SHAKE256_RATE); | |||||
output += (outlen/SHAKE256_RATE)*SHAKE256_RATE; | |||||
if(outlen%SHAKE256_RATE) | |||||
{ | |||||
keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); | |||||
for (i = 0; i < outlen%SHAKE256_RATE; i++) | |||||
output[i] = t[i]; | |||||
} | |||||
} |
@@ -0,0 +1,27 @@ | |||||
#ifndef FIPS202_H | |||||
#define FIPS202_H | |||||
#include <stdint.h> | |||||
#define SHAKE128_RATE 168 | |||||
#define SHAKE256_RATE 136 | |||||
void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); | |||||
void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); | |||||
void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); | |||||
void cshake128_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); | |||||
void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); | |||||
void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); | |||||
void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); | |||||
void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); | |||||
void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); | |||||
void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); | |||||
void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); | |||||
void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); | |||||
#endif |
@@ -0,0 +1,345 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) | |||||
*********************************************************************************************/ | |||||
#include "P751_internal.h" | |||||
#include "random/random.h" | |||||
#include <stdio.h> | |||||
static void clear_words(void *mem, digit_t nwords)
{
    // Zeroes nwords digits starting at mem.
    // The stores go through a volatile pointer so the compiler is told not to
    // optimize the clearing away.
    volatile digit_t *cursor = mem;
    unsigned int idx = 0;

    while (idx < nwords)
    {
        cursor[idx] = 0;
        idx++;
    }
}
static void init_basis(digit_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR)
{
    // Loads the basis x-coordinates XP, XQ, XR from the packed array gen.
    // gen holds five consecutive field elements of NWORDS_FIELD digits each:
    //   XP[0], XP[1], XQ[0], XR[0], XR[1]
    // XQ[1] is not stored in gen and is set to zero.
    digit_t *src = gen;

    fpcopy(src, XP[0]);
    src += NWORDS_FIELD;
    fpcopy(src, XP[1]);
    src += NWORDS_FIELD;

    fpcopy(src, XQ[0]);
    src += NWORDS_FIELD;
    fpzero(XQ[1]);

    fpcopy(src, XR[0]);
    src += NWORDS_FIELD;
    fpcopy(src, XR[1]);
}
static void fp2_encode(const f2elm_t x, unsigned char *enc) | |||||
{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes | |||||
unsigned int i; | |||||
f2elm_t t; | |||||
from_fp2mont(x, t); | |||||
for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) | |||||
{ | |||||
enc[i] = ((unsigned char *)t)[i]; | |||||
enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char *)t)[i + MAXBITS_FIELD / 8]; | |||||
} | |||||
} | |||||
static void fp2_decode(const unsigned char *enc, f2elm_t x)
{
    // Parses the byte string produced by fp2_encode back into a GF(p^2)
    // element and converts it to Montgomery representation.
    unsigned char *raw = (unsigned char *)x;
    unsigned int k;

    // Clear both components so the bytes not present in enc are zero.
    for (k = 0; k < 2 * (MAXBITS_FIELD / 8); k++)
        raw[k] = 0;

    // first component
    for (k = 0; k < FP2_ENCODED_BYTES / 2; k++)
        raw[k] = enc[k];

    // second component, placed MAXBITS_FIELD / 8 bytes into the element
    for (k = 0; k < FP2_ENCODED_BYTES / 2; k++)
        raw[k + MAXBITS_FIELD / 8] = enc[k + FP2_ENCODED_BYTES / 2];

    to_fp2mont(x, x);
}
void random_mod_order_A(unsigned char *random_digits) | |||||
{ // Generation of Alice's secret key | |||||
// Outputs random value in [0, 2^eA - 1] | |||||
unsigned long long nbytes = NBITS_TO_NBYTES(OALICE_BITS); | |||||
clear_words((void *)random_digits, MAXWORDS_ORDER); | |||||
randombytes(random_digits, nbytes); | |||||
random_digits[nbytes - 1] &= MASK_ALICE; // Masking last byte | |||||
} | |||||
void random_mod_order_B(unsigned char *random_digits) | |||||
{ // Generation of Bob's secret key | |||||
// Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] | |||||
unsigned long long nbytes = NBITS_TO_NBYTES(OBOB_BITS - 1); | |||||
clear_words((void *)random_digits, MAXWORDS_ORDER); | |||||
randombytes(random_digits, nbytes); | |||||
random_digits[nbytes - 1] &= MASK_BOB; // Masking last byte | |||||
} | |||||
int EphemeralKeyGeneration_A(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA) | |||||
{ // Alice's ephemeral public key generation | |||||
// Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. | |||||
// Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. | |||||
point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; | |||||
f2elm_t XPA, XQA, XRA, coeff[3], A24plus = {0}, C24 = {0}, A = {0}; | |||||
unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; | |||||
// Initialize basis points | |||||
init_basis((digit_t *)A_gen, XPA, XQA, XRA); | |||||
init_basis((digit_t *)B_gen, phiP->X, phiQ->X, phiR->X); | |||||
fpcopy((digit_t *)&Montgomery_one, (phiP->Z)[0]); | |||||
fpcopy((digit_t *)&Montgomery_one, (phiQ->Z)[0]); | |||||
fpcopy((digit_t *)&Montgomery_one, (phiR->Z)[0]); | |||||
// Initialize constants | |||||
fpcopy((digit_t *)&Montgomery_one, A24plus[0]); | |||||
fp2add(A24plus, A24plus, C24); | |||||
uint64_t temp[12]; | |||||
uint64_t ifma_temp[15]; | |||||
// Retrieve kernel point | |||||
LADDER3PT(XPA, XQA, XRA, (digit_t *)PrivateKeyA, ALICE, R, A); | |||||
// Traverse tree | |||||
index = 0; | |||||
for (row = 1; row < MAX_Alice; row++) | |||||
{ | |||||
while (index < MAX_Alice - row) | |||||
{ | |||||
fp2copy(R->X, pts[npts]->X); | |||||
fp2copy(R->Z, pts[npts]->Z); | |||||
pts_index[npts++] = index; | |||||
m = strat_Alice[ii++]; | |||||
xDBLe(R, R, A24plus, C24, (int)(2 * m)); | |||||
index += m; | |||||
} | |||||
get_4_isog(R, A24plus, C24, coeff); | |||||
for (i = 0; i < npts; i++) | |||||
{ | |||||
eval_4_isog(pts[i], coeff); | |||||
} | |||||
eval_4_isog(phiP, coeff); | |||||
eval_4_isog(phiQ, coeff); | |||||
eval_4_isog(phiR, coeff); | |||||
fp2copy(pts[npts - 1]->X, R->X); | |||||
fp2copy(pts[npts - 1]->Z, R->Z); | |||||
index = pts_index[npts - 1]; | |||||
npts -= 1; | |||||
} | |||||
get_4_isog(R, A24plus, C24, coeff); | |||||
eval_4_isog(phiP, coeff); | |||||
eval_4_isog(phiQ, coeff); | |||||
eval_4_isog(phiR, coeff); | |||||
inv_3_way(phiP->Z, phiQ->Z, phiR->Z); | |||||
fp2mul_mont(phiP->X, phiP->Z, phiP->X); | |||||
fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); | |||||
fp2mul_mont(phiR->X, phiR->Z, phiR->X); | |||||
// Format public key | |||||
fp2_encode(phiP->X, PublicKeyA); | |||||
fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); | |||||
fp2_encode(phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES); | |||||
return 0; | |||||
} | |||||
int EphemeralKeyGeneration_B(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB) | |||||
{ // Bob's ephemeral public key generation | |||||
// Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. | |||||
// Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. | |||||
point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; | |||||
f2elm_t XPB, XQB, XRB, coeff[3], A24plus = {0}, A24minus = {0}, A = {0}; | |||||
unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; | |||||
// Initialize basis points | |||||
init_basis((digit_t *)B_gen, XPB, XQB, XRB); | |||||
init_basis((digit_t *)A_gen, phiP->X, phiQ->X, phiR->X); | |||||
fpcopy((digit_t *)&Montgomery_one, (phiP->Z)[0]); | |||||
fpcopy((digit_t *)&Montgomery_one, (phiQ->Z)[0]); | |||||
fpcopy((digit_t *)&Montgomery_one, (phiR->Z)[0]); | |||||
// Initialize constants | |||||
fpcopy((digit_t *)&Montgomery_one, A24plus[0]); | |||||
fp2add(A24plus, A24plus, A24plus); | |||||
fp2copy(A24plus, A24minus); | |||||
fp2neg(A24minus); | |||||
// Retrieve kernel point | |||||
LADDER3PT(XPB, XQB, XRB, (digit_t *)PrivateKeyB, BOB, R, A); | |||||
// Traverse tree | |||||
index = 0; | |||||
for (row = 1; row < MAX_Bob; row++) | |||||
{ | |||||
while (index < MAX_Bob - row) | |||||
{ | |||||
fp2copy(R->X, pts[npts]->X); | |||||
fp2copy(R->Z, pts[npts]->Z); | |||||
pts_index[npts++] = index; | |||||
m = strat_Bob[ii++]; | |||||
xTPLe(R, R, A24minus, A24plus, (int)m); | |||||
index += m; | |||||
} | |||||
get_3_isog(R, A24minus, A24plus, coeff); | |||||
for (i = 0; i < npts; i++) | |||||
{ | |||||
eval_3_isog(pts[i], coeff); | |||||
} | |||||
eval_3_isog(phiP, coeff); | |||||
eval_3_isog(phiQ, coeff); | |||||
eval_3_isog(phiR, coeff); | |||||
fp2copy(pts[npts - 1]->X, R->X); | |||||
fp2copy(pts[npts - 1]->Z, R->Z); | |||||
index = pts_index[npts - 1]; | |||||
npts -= 1; | |||||
} | |||||
get_3_isog(R, A24minus, A24plus, coeff); | |||||
eval_3_isog(phiP, coeff); | |||||
eval_3_isog(phiQ, coeff); | |||||
eval_3_isog(phiR, coeff); | |||||
inv_3_way(phiP->Z, phiQ->Z, phiR->Z); | |||||
fp2mul_mont(phiP->X, phiP->Z, phiP->X); | |||||
fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); | |||||
fp2mul_mont(phiR->X, phiR->Z, phiR->X); | |||||
// Format public key | |||||
fp2_encode(phiP->X, PublicKeyB); | |||||
fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); | |||||
fp2_encode(phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES); | |||||
return 0; | |||||
} | |||||
int EphemeralSecretAgreement_A(const unsigned char *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA) | |||||
{ // Alice's ephemeral shared secret computation | |||||
// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB | |||||
// Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1]. | |||||
// Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. | |||||
// Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes. | |||||
point_proj_t R, pts[MAX_INT_POINTS_ALICE]; | |||||
f2elm_t coeff[3], PKB[3], jinv; | |||||
f2elm_t A24plus = {0}, C24 = {0}, A = {0}; | |||||
unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; | |||||
// Initialize images of Bob's basis | |||||
fp2_decode(PublicKeyB, PKB[0]); | |||||
fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, PKB[1]); | |||||
fp2_decode(PublicKeyB + 2 * FP2_ENCODED_BYTES, PKB[2]); | |||||
// Initialize constants | |||||
get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? | |||||
fpadd((digit_t *)&Montgomery_one, (digit_t *)&Montgomery_one, C24[0]); | |||||
fp2add(A, C24, A24plus); | |||||
fpadd(C24[0], C24[0], C24[0]); | |||||
// Retrieve kernel point | |||||
LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t *)PrivateKeyA, ALICE, R, A); | |||||
// Traverse tree | |||||
index = 0; | |||||
for (row = 1; row < MAX_Alice; row++) | |||||
{ | |||||
while (index < MAX_Alice - row) | |||||
{ | |||||
fp2copy(R->X, pts[npts]->X); | |||||
fp2copy(R->Z, pts[npts]->Z); | |||||
pts_index[npts++] = index; | |||||
m = strat_Alice[ii++]; | |||||
xDBLe(R, R, A24plus, C24, (int)(2 * m)); | |||||
index += m; | |||||
} | |||||
get_4_isog(R, A24plus, C24, coeff); | |||||
for (i = 0; i < npts; i++) | |||||
{ | |||||
eval_4_isog(pts[i], coeff); | |||||
} | |||||
fp2copy(pts[npts - 1]->X, R->X); | |||||
fp2copy(pts[npts - 1]->Z, R->Z); | |||||
index = pts_index[npts - 1]; | |||||
npts -= 1; | |||||
} | |||||
get_4_isog(R, A24plus, C24, coeff); | |||||
fp2div2(C24, C24); | |||||
fp2sub(A24plus, C24, A24plus); | |||||
fp2div2(C24, C24); | |||||
j_inv(A24plus, C24, jinv); | |||||
fp2_encode(jinv, SharedSecretA); // Format shared secret | |||||
return 0; | |||||
} | |||||
int EphemeralSecretAgreement_B(const unsigned char *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB) | |||||
{ // Bob's ephemeral shared secret computation | |||||
// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA | |||||
// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1]. | |||||
// Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. | |||||
// Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded by removing leading 0 bytes. | |||||
point_proj_t R, pts[MAX_INT_POINTS_BOB]; | |||||
f2elm_t coeff[3], PKB[3], jinv; | |||||
f2elm_t A24plus = {0}, A24minus = {0}, A = {0}; | |||||
unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; | |||||
// Initialize images of Alice's basis | |||||
fp2_decode(PublicKeyA, PKB[0]); | |||||
fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, PKB[1]); | |||||
fp2_decode(PublicKeyA + 2 * FP2_ENCODED_BYTES, PKB[2]); | |||||
// Initialize constants | |||||
get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? | |||||
fpadd((digit_t *)&Montgomery_one, (digit_t *)&Montgomery_one, A24minus[0]); | |||||
fp2add(A, A24minus, A24plus); | |||||
fp2sub(A, A24minus, A24minus); | |||||
// Retrieve kernel point | |||||
LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t *)PrivateKeyB, BOB, R, A); | |||||
// Traverse tree | |||||
index = 0; | |||||
for (row = 1; row < MAX_Bob; row++) | |||||
{ | |||||
while (index < MAX_Bob - row) | |||||
{ | |||||
fp2copy(R->X, pts[npts]->X); | |||||
fp2copy(R->Z, pts[npts]->Z); | |||||
pts_index[npts++] = index; | |||||
m = strat_Bob[ii++]; | |||||
xTPLe(R, R, A24minus, A24plus, (int)m); | |||||
index += m; | |||||
} | |||||
get_3_isog(R, A24minus, A24plus, coeff); | |||||
for (i = 0; i < npts; i++) | |||||
{ | |||||
eval_3_isog(pts[i], coeff); | |||||
} | |||||
fp2copy(pts[npts - 1]->X, R->X); | |||||
fp2copy(pts[npts - 1]->Z, R->Z); | |||||
index = pts_index[npts - 1]; | |||||
npts -= 1; | |||||
} | |||||
get_3_isog(R, A24minus, A24plus, coeff); | |||||
fp2add(A24plus, A24minus, A); | |||||
fp2add(A, A, A); | |||||
fp2sub(A24plus, A24minus, A24plus); | |||||
j_inv(A, A24plus, jinv); | |||||
fp2_encode(jinv, SharedSecretB); // Format shared secret | |||||
return 0; | |||||
} |
@@ -0,0 +1,99 @@ | |||||
/******************************************************************************************** | |||||
* Supersingular Isogeny Key Encapsulation Library | |||||
* | |||||
* Abstract: supersingular isogeny key encapsulation (SIKE) protocol | |||||
*********************************************************************************************/ | |||||
#include <string.h> | |||||
#include "P751_internal.h" | |||||
#include "sha3/fips202.h" | |||||
int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) | |||||
{ // SIKE's key generation | |||||
// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) | |||||
// public key pk (CRYPTO_PUBLICKEYBYTES bytes) | |||||
// Generate lower portion of secret key sk <- s||SK | |||||
randombytes(sk, MSG_BYTES); | |||||
random_mod_order_B(sk + MSG_BYTES); | |||||
// Generate public key pk | |||||
EphemeralKeyGeneration_B(sk + MSG_BYTES, pk); | |||||
// Append public key pk to secret key sk | |||||
memcpy(&sk[MSG_BYTES + SECRETKEY_B_BYTES], pk, CRYPTO_PUBLICKEYBYTES); | |||||
return 0; | |||||
} | |||||
int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) | |||||
{ // SIKE's encapsulation | |||||
// Input: public key pk (CRYPTO_PUBLICKEYBYTES bytes) | |||||
// Outputs: shared secret ss (CRYPTO_BYTES bytes) | |||||
// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) | |||||
const uint16_t G = 0; | |||||
const uint16_t H = 1; | |||||
const uint16_t P = 2; | |||||
unsigned char ephemeralsk[SECRETKEY_A_BYTES]; | |||||
unsigned char jinvariant[FP2_ENCODED_BYTES]; | |||||
unsigned char h[MSG_BYTES]; | |||||
unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES]; | |||||
unsigned int i; | |||||
// Generate ephemeralsk <- G(m||pk) mod oA | |||||
randombytes(temp, MSG_BYTES); | |||||
memcpy(&temp[MSG_BYTES], pk, CRYPTO_PUBLICKEYBYTES); | |||||
cshake256_simple(ephemeralsk, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES); | |||||
ephemeralsk[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; | |||||
// Encrypt | |||||
EphemeralKeyGeneration_A(ephemeralsk, ct); | |||||
EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant); | |||||
cshake256_simple(h, MSG_BYTES, P, jinvariant, FP2_ENCODED_BYTES); | |||||
for (i = 0; i < MSG_BYTES; i++) ct[i + CRYPTO_PUBLICKEYBYTES] = temp[i] ^ h[i]; | |||||
// Generate shared secret ss <- H(m||ct) | |||||
memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); | |||||
cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES); | |||||
return 0; | |||||
} | |||||
int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) | |||||
{ // SIKE's decapsulation | |||||
// Input: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) | |||||
// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) | |||||
// Outputs: shared secret ss (CRYPTO_BYTES bytes) | |||||
const uint16_t G = 0; | |||||
const uint16_t H = 1; | |||||
const uint16_t P = 2; | |||||
unsigned char ephemeralsk_[SECRETKEY_A_BYTES]; | |||||
unsigned char jinvariant_[FP2_ENCODED_BYTES]; | |||||
unsigned char h_[MSG_BYTES]; | |||||
unsigned char c0_[CRYPTO_PUBLICKEYBYTES]; | |||||
unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES]; | |||||
unsigned int i; | |||||
// Decrypt | |||||
EphemeralSecretAgreement_B(sk + MSG_BYTES, ct, jinvariant_); | |||||
cshake256_simple(h_, MSG_BYTES, P, jinvariant_, FP2_ENCODED_BYTES); | |||||
for (i = 0; i < MSG_BYTES; i++) temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i]; | |||||
// Generate ephemeralsk_ <- G(m||pk) mod oA | |||||
memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES); | |||||
cshake256_simple(ephemeralsk_, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES); | |||||
ephemeralsk_[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; | |||||
// Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) | |||||
EphemeralKeyGeneration_A(ephemeralsk_, c0_); | |||||
if (memcmp(c0_, ct, CRYPTO_PUBLICKEYBYTES) != 0) { | |||||
memcpy(temp, sk, MSG_BYTES); | |||||
} | |||||
memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); | |||||
cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES); | |||||
return 0; | |||||
} |