From 8335e684f6838dc90587a350e1e779b22813cbb0 Mon Sep 17 00:00:00 2001 From: Kris Kwiatkowski Date: Mon, 8 Jun 2020 09:08:41 +0100 Subject: [PATCH] works on amazon --- Makefile | 39 + P751_ifma.c | 817 ++++++++++++ README.md | 34 + fp2_751_ifma.S | 916 +++++++++++++ fp2_packed_751_ifma.S | 523 ++++++++ fp_751_ifma.S | 268 ++++ main.c | 218 ++++ measurements.h | 52 + sidh_ref/P751.c | 122 ++ sidh_ref/P751_internal.h | 255 ++++ sidh_ref/SIDH.h | 214 +++ sidh_ref/api.h | 109 ++ sidh_ref/config.h | 128 ++ sidh_ref/ec_isogeny.c | 330 +++++ sidh_ref/fp_x64.c | 867 +++++++++++++ sidh_ref/fp_x64_asm.S | 2644 ++++++++++++++++++++++++++++++++++++++ sidh_ref/fpx.c | 474 +++++++ sidh_ref/random/random.c | 43 + sidh_ref/random/random.h | 9 + sidh_ref/random/random.o | Bin 0 -> 5344 bytes sidh_ref/sha3/fips202.c | 573 +++++++++ sidh_ref/sha3/fips202.h | 27 + sidh_ref/sha3/fips202.o | Bin 0 -> 98152 bytes sidh_ref/sidh.c | 345 +++++ sidh_ref/sike.c | 99 ++ 25 files changed, 9106 insertions(+) create mode 100644 Makefile create mode 100644 P751_ifma.c create mode 100644 README.md create mode 100644 fp2_751_ifma.S create mode 100644 fp2_packed_751_ifma.S create mode 100644 fp_751_ifma.S create mode 100644 main.c create mode 100644 measurements.h create mode 100644 sidh_ref/P751.c create mode 100644 sidh_ref/P751_internal.h create mode 100644 sidh_ref/SIDH.h create mode 100644 sidh_ref/api.h create mode 100644 sidh_ref/config.h create mode 100644 sidh_ref/ec_isogeny.c create mode 100644 sidh_ref/fp_x64.c create mode 100644 sidh_ref/fp_x64_asm.S create mode 100644 sidh_ref/fpx.c create mode 100644 sidh_ref/random/random.c create mode 100644 sidh_ref/random/random.h create mode 100644 sidh_ref/random/random.o create mode 100644 sidh_ref/sha3/fips202.c create mode 100644 sidh_ref/sha3/fips202.h create mode 100644 sidh_ref/sha3/fips202.o create mode 100644 sidh_ref/sidh.c create mode 100644 sidh_ref/sike.c diff --git a/Makefile b/Makefile new file mode 100644 index 0000000..8a4e6ce --- 
/dev/null +++ b/Makefile @@ -0,0 +1,39 @@ +CC?=clang + +TARGET_OS=$(shell uname -s) + +ifeq ($(TARGET_OS),Darwin) + CFLAGS= -std=c99 -O3 -g -D_AMD64_ -D__LINUX__ -mavx512f -mavx512bw -mavx512dq -mavx512ifma +else + ifeq ($(CC),clang) + CFLAGS= -std=c99 -O3 -g -D_AMD64_ -D__LINUX__ -mavx512f -mavx512bw -mavx512dq -mavx512ifma + else + CFLAGS= -std=c99 -O3 -g -D_AMD64_ -D__LINUX__ -mavx512f + endif +endif + +CFLAGS+=-D_MULX_ -D_ADX_ + +SRC_REAL=fp2_751_ifma.S fp_751_ifma.S fp2_packed_751_ifma.S +SRC_STANDIN=$(SRC_REAL:.S=_standin.S) +SOURCES=./sidh_ref/fp_x64_asm.S ./sidh_ref/fp_x64.c ./sidh_ref/P751.c ./sidh_ref/random/random.c ./sidh_ref/sha3/fips202.c P751_ifma.c +OBJECTS=$(SOURCES:.c=.o) +EXE_REAL=sidh_ifma +EXE_STANDIN=sidh_standin + +all: $(SOURCES) $(SRC_STANDIN) $(SRC_REAL) $(EXE_REAL) $(EXE_STANDIN) + +$(SRC_STANDIN): %_standin.S: %.S + cat $< | sed 's/vpmadd52luq/VFMADD231PD/; s/vpmadd52huq/VFMADD231PD/;' > $@ + +$(EXE_REAL): main.c ./sidh_ref/sidh.c $(OBJECTS) $(SRC_REAL) + $(CC) main.c $(OBJECTS) $(SRC_REAL) $(CFLAGS) -o $@ -DREPEAT=1 -DOUTER_REPEAT=1 + +$(EXE_STANDIN): main.c ./sidh_ref/sidh.c $(OBJECTS) $(SRC_STANDIN) + $(CC) main.c $(OBJECTS) $(SRC_STANDIN) $(CFLAGS) -o $@ -DREPEAT=20 -DOUTER_REPEAT=20 + +.o: ./sidh_ref/sidh.c + $(CC) $(CFLAGS) $< -o $@ + +clean: + rm -f *.o ./sidh_ref/*.o $(EXE_REAL) $(EXE_STANDIN) $(SRC_STANDIN) diff --git a/P751_ifma.c b/P751_ifma.c new file mode 100644 index 0000000..02c618b --- /dev/null +++ b/P751_ifma.c @@ -0,0 +1,817 @@ +#include +#include + +#define NWORDS_FIELD 15 +#define MAX_INT_POINTS_ALICE 8 +#define MAX_INT_POINTS_BOB 10 + +#define ALICE 0 +#define BOB 1 +#define OALICE_BITS 372 +#define OBOB_BITS 379 + +#define MAX_Alice 186 +#define MAX_Bob 239 + +#define NBITS_FIELD 751 +#define MAXBITS_FIELD 768 +#define FP2_ENCODED_BYTES 2 * ((NBITS_FIELD + 7) / 8) + +typedef uint64_t felm_t[NWORDS_FIELD]; +typedef felm_t f2elm_t[2]; + +typedef struct +{ + f2elm_t X; + f2elm_t Z; +} point_proj; // Point 
representation in projective XZ Montgomery coordinates. + +typedef point_proj point_proj_t[1]; + +const uint64_t A_gen_ifma[5 * NWORDS_FIELD] = { + 0x000ceab50ad8bc0d, 0x0005e457b1c2fc08, 0x000cd6e1d7d710f5, 0x000ae8738d92953d, 0x000a7ebee8a3418a, 0x0008345f03f46fba, 0x0007cfe2616c9a28, 0x000b4be50c8b9e16, 0x00039b6799643b2e, 0x000597a7ff9d56d5, 0x00021d410d97fe0a, 0x000a4a92a8f2ad52, 0x00054508e42abde4, 0x000ebf7d0178c137, 0x00000000004a0a75, + 0x000d21582e4118ad, 0x0005df400ae6cc41, 0x000aec407c2ecb7c, 0x000de8e34b521432, 0x000761e2ab085167, 0x000bcaa6094b3c50, 0x000df9ddd71032cf, 0x00057d905265605f, 0x000f7dba2681f9d7, 0x0009e9732def416c, 0x0006f77956ce00ce, 0x000576fb3094772b, 0x000b2d166e2a949f, 0x0002f665c6588ea2, 0x0000000000337a25, + 0x00026279148626cd, 0x0006b5baead56fe5, 0x000ab911fad60dc9, 0x000401e137d0bf07, 0x0004d3e925216196, 0x0005e4cd09a33740, 0x00069e4af733c538, 0x000d1169f6821367, 0x000c64ecfc721111, 0x000ba56507cd0dc7, 0x000995e4ae04dfad, 0x0007b992deeceab8, 0x0007bccd256aff1e, 0x000207f5fde1824c, 0x0000000000345cc7, + 0x00041dffd19b3e7f, 0x000b48c18e0bb844, 0x000380584b4dea99, 0x0000692de648ad31, 0x000d72761b6dfaee, 0x0005c672c3058de6, 0x000cba26fdc22397, 0x000e15f9133d4bc3, 0x000d5ae123793466, 0x000bb494276e321d, 0x000c9c99fb74cd99, 0x0005da6e4fd03f75, 0x000b95feb24d0937, 0x000e6a307e03cd17, 0x000000000044ad2e, + 0x0007f1ec71be8c36, 0x00053859b1ed78c1, 0x000529ff824d6df7, 0x000633a10839b2a8, 0x00003e9e25fdea79, 0x000a8054df1762fc, 0x000034c6467c4708, 0x000acb63530b60ec, 0x0000c6fc8c19bf71, 0x0005aca92467c3cb, 0x000d42050ba154a2, 0x000b4d5baa4ab074, 0x00044ba4962ac622, 0x0002bbf250aa70e6, 0x0000000000457f51}; + +const uint64_t B_gen_ifma[5 * NWORDS_FIELD] = { + 0x0001ef867ab0bcb9, 0x0009a45c76cfb6d7, 0x0001f034a5fdd76e, 0x000038b1ee69194b, 0x000e7b18a7761f3f, 0x000a486a52c84cf6, 0x0005aa75466fcf01, 0x00044164f797233f, 0x000331aeaec77db1, 0x0005185f83d9a22f, 0x000e2d4dc94f5b17, 0x0000f7b3858b15a4, 0x000635ac44515c99, 0x000a5b14eaf4ee2e, 
0x000000000048e907, + 0x0004e7c075cc3a24, 0x00004aa430a49203, 0x00094c8677baf00b, 0x000b3aae0c9a755c, 0x000c4b064e9ebb08, 0x000dd04e826c661d, 0x00061f01b223684e, 0x000d43bc8a6360b6, 0x00008c633a79ab30, 0x0008e0092fbd6f39, 0x0002b9ba797337f8, 0x000fcb3252ddaf84, 0x000467ded2ca9dce, 0x0006117350e479f4, 0x00000000001ae9d1, + 0x000ed7b96c4ab279, 0x000178486ef1a8c9, 0x000c2f4299429da5, 0x000aef4926f20cd5, 0x0003b2e2858b4716, 0x000bcc3cac3eeb68, 0x0003a600460dda2f, 0x00050e6650a24c9f, 0x0004cb60c61775f8, 0x00082b196ebc78b3, 0x000cc7fec8cce966, 0x000d9b778d801d65, 0x0005324630f74af3, 0x0009018193e7592e, 0x00000000003aef05, + 0x00033769d0f314ef, 0x000e2659d11c0d67, 0x000d133f084c3086, 0x0005e23d5da27bcb, 0x0008ec9a8d586402, 0x000c781b3b645bf3, 0x000c9fb03ee6426d, 0x000ddc7bb40b83e3, 0x000bb7b4ab585e3a, 0x0006c2672e53eeaf, 0x0000397a1e62b655, 0x0004ac383daab923, 0x0008eb1ecdd2f39e, 0x000f1516da469247, 0x00000000003693cf, + 0x0007d8f72bd956dc, 0x000e9934884ae37e, 0x0003c3edd2d504b3, 0x00005d14e7fa1ecb, 0x0007610ceb75d635, 0x000b4cac446b1112, 0x000c1f70caf255b4, 0x00057d3e324d2f36, 0x0006181c3bb1a700, 0x000db2f2916ccc40, 0x00021ee51d1c92f1, 0x000c07c22031c32a, 0x000e4310e5103473, 0x00069c1148de9ef5, 0x00000000004d1227}; + +const uint64_t One[NWORDS_FIELD] = { + 0x00000000249ad67c, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0001f9800c542c00, 0x000b326488fe3b2a, 0x000e6176236db777, 0x000dd6e970232b83, 0x000d4d762277573f, 0x00054cd16c015f35, 0x0009fc72438c4fc7, 0x00000000001bf8f6}; + +const uint64_t Two[NWORDS_FIELD] = { + 0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed}; + +// Fixed parameters for isogeny tree computation +extern const unsigned int 
strat_Alice[MAX_Alice - 1]; +extern const unsigned int strat_Bob[MAX_Bob - 1]; + +void norm2red(uint64_t *res, const uint64_t *a); +void red2norm(uint64_t out[12], const uint64_t in[15]) +{ + out[0] = in[0] ^ in[1] << 52; + + out[1] = in[1] >> 12 ^ in[2] << 40; + out[2] = in[2] >> 24 ^ in[3] << 28; + out[3] = in[3] >> 36 ^ in[4] << 16; + out[4] = in[4] >> 48 ^ in[5] << 4 ^ in[6] << 56; + + out[5] = in[6] >> 8 ^ in[7] << 44; + out[6] = in[7] >> 20 ^ in[8] << 32; + out[7] = in[8] >> 32 ^ in[9] << 20; + out[8] = in[9] >> 44 ^ in[10] << 8 ^ in[11] << 60; + + out[9] = in[11] >> 4 ^ in[12] << 48; + out[10] = in[12] >> 16 ^ in[13] << 36; + out[11] = in[13] >> 28 ^ in[14] << 24; +} + +static void init_basis(const uint64_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) +{ // Initialization of basis points + + memcpy(XP[0], &gen[0 * NWORDS_FIELD], sizeof(felm_t)); + memcpy(XP[1], &gen[1 * NWORDS_FIELD], sizeof(felm_t)); + + memcpy(XQ[0], &gen[2 * NWORDS_FIELD], sizeof(felm_t)); + memset(XQ[1], 0, sizeof(felm_t)); + + memcpy(XR[0], &gen[3 * NWORDS_FIELD], sizeof(felm_t)); + memcpy(XR[1], &gen[4 * NWORDS_FIELD], sizeof(felm_t)); +} + +void fp2_mul_ifma(f2elm_t res, const f2elm_t a, const f2elm_t b); +void fp2_mul_ifma_x2(f2elm_t res1, const f2elm_t a1, const f2elm_t b1, f2elm_t res2, const f2elm_t a2, const f2elm_t b2); +void fp2_sqr_ifma(f2elm_t res, const f2elm_t a); +void fp2_add(f2elm_t res, const f2elm_t a, const f2elm_t b); +void fp2_sub(f2elm_t res, const f2elm_t a, const f2elm_t b); + +void fp2_swap(point_proj_t a, point_proj_t b, int swap); + +void fp_mul_ifma(felm_t res, felm_t a, felm_t b); +void fp_add(felm_t res, const felm_t a, const felm_t b); +void fp_sub(felm_t res, const felm_t a, const felm_t b); + +void to_mont_ifma(felm_t rp, const felm_t ap); +void from_mont_ifma(felm_t rp, const felm_t ap); + +void red2norm(uint64_t out[12], const felm_t in); + +#define fp2mul_mont(a, b, r) fp2_mul_ifma(r, a, b) +#define fp2sqr_mont(a, r) fp2_sqr_ifma(r, a) +#define fp2add(a, 
b, r) fp2_add(r, a, b) +#define fp2sub(a, b, r) fp2_sub(r, a, b) +#define fp2correction + +#define fpsqr_mont(a, r) fp_mul_ifma(r, a, a) +#define fpmul_mont(a, b, r) fp_mul_ifma(r, a, b) + +#define fpadd(a, b, r) fp_add(r, a, b) +#define fpsub(a, b, r) fp_sub(r, a, b) + +void fpinv_chain_mont(felm_t a) +{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic. + unsigned int i, j; + felm_t t[27], tt; + + // Precomputed table + fpsqr_mont(a, tt); + fpmul_mont(a, tt, t[0]); + fpmul_mont(t[0], tt, t[1]); + fpmul_mont(t[1], tt, t[2]); + fpmul_mont(t[2], tt, t[3]); + fpmul_mont(t[3], tt, t[3]); + for (i = 3; i <= 8; i++) + fpmul_mont(t[i], tt, t[i + 1]); + fpmul_mont(t[9], tt, t[9]); + for (i = 9; i <= 20; i++) + fpmul_mont(t[i], tt, t[i + 1]); + fpmul_mont(t[21], tt, t[21]); + for (i = 21; i <= 24; i++) + fpmul_mont(t[i], tt, t[i + 1]); + fpmul_mont(t[25], tt, t[25]); + fpmul_mont(t[25], tt, t[26]); + + memcpy(tt, a, sizeof(felm_t)); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 9; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[15], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + 
fpmul_mont(t[10], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[4], tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[17], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[16], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[7], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[19], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[25], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[4], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + 
fpmul_mont(t[5], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[21], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[3], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[17], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (j = 0; j < 61; j++) + { + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + } + memcpy(a, tt, sizeof(felm_t)); +} + +void fpinv_mont(felm_t a) +{ // Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. + felm_t tt; + memcpy(tt, a, sizeof(felm_t)); + fpinv_chain_mont(tt); + fpsqr_mont(tt, tt); + fpsqr_mont(tt, tt); + fpmul_mont(a, tt, a); +} + +void fp2inv_mont(f2elm_t a) +{ // GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). 
+ f2elm_t t1; + felm_t zero = {0}; + fpsqr_mont(a[0], t1[0]); // t10 = a0^2 + fpsqr_mont(a[1], t1[1]); // t11 = a1^2 + fpadd(t1[0], t1[1], t1[0]); // t10 = a0^2+a1^2 + fpinv_mont(t1[0]); // t10 = (a0^2+a1^2)^-1 + fp_sub(a[1], zero, a[1]); // a = a0-i*a1 + fpmul_mont(a[0], t1[0], a[0]); + fpmul_mont(a[1], t1[0], a[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 +} + +void inv_3_way_ifma(f2elm_t z1, f2elm_t z2, f2elm_t z3) +{ // 3-way simultaneous inversion + // Input: z1,z2,z3 + // Output: 1/z1,1/z2,1/z3 (override inputs). + f2elm_t t0, t1, t2, t3; + + fp2mul_mont(z1, z2, t0); // t0 = z1*z2 + fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3 + fp2inv_mont(t1); // t1 = 1/(z1*z2*z3) + fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2) + fp2_mul_ifma_x2(t3, t2, z2, z2, t2, z1); + //fp2mul_mont(t2, z2, t3); // t3 = 1/z1 + //fp2mul_mont(t2, z1, z2); // z2 = 1/z2 + fp2mul_mont(t0, t1, z3); // z3 = 1/z3 + memcpy(z1, t3, sizeof(f2elm_t)); +} + +void xDBLADD_ifma(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24) +{ // Simultaneous doubling and differential addition. + // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. + // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. 
+ f2elm_t t0, t1, t2, t3; + + fp2add(P->X, P->Z, t0); // t0 = XP+ZP + fp2sub(P->X, P->Z, t1); // t1 = XP-ZP + + fp2_mul_ifma_x2(P->X, t0, t0, P->Z, t1, t1); + //fp2sqr_mont(t0, P->X); // XP = (XP+ZP)^2 + //fp2sqr_mont(t1, P->Z); // ZP = (XP-ZP)^2 + + fp2add(Q->X, Q->Z, t2); // XQ = XQ+ZQ + fp2sub(Q->X, Q->Z, t3); // t2 = XQ-ZQ + + fp2_mul_ifma_x2(t1, t1, t2, t0, t0, t3); + //fp2mul_mont(t2, t1, t1); // t1 = (XP-ZP)*(XQ+ZQ) + //fp2mul_mont(t3, t0, t0); // t0 = (XP+ZP)*(XQ-ZQ) + + fp2sub(P->X, P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2 + fp2sub(t0, t1, Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) + + fp2_mul_ifma_x2(P->X, P->X, P->Z, Q->X, A24, t2); + //fp2mul_mont(P->X, P->Z, P->X); // XP = (XP+ZP)^2*(XP-ZP)^2 + //fp2mul_mont(t2, A24, Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] + + fp2add(Q->X, P->Z, P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 + fp2add(t0, t1, Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) + + fp2_mul_ifma_x2(Q->Z, Q->Z, Q->Z, Q->X, Q->X, Q->X); + //fp2sqr_mont(Q->Z, Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 + //fp2sqr_mont(Q->X, Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 + + fp2_mul_ifma_x2(P->Z, P->Z, t2, Q->Z, Q->Z, xPQ); + //fp2mul_mont(P->Z, t2, P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] + //fp2mul_mont(Q->Z, xPQ, Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 +} + +static void LADDER3PT_ifma(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint64_t *m, const unsigned int AliceOrBob, point_proj_t R) +{ + point_proj_t R0 = {0}, R2 = {0}; + const f2elm_t A24 = { + {0x00000000124d6b3e, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000fcc0062a1600, 0x000d9932447f1d95, 0x000f30bb11b6dbbb, 0x000eeb74b81195c1, 0x000ea6bb113bab9f, 0x000aa668b600af9a, 0x0004fe3921c627e3, 0x00000000000dfc7b}, + {0}}; + + uint64_t mask; + int i, nbits, bit, swap, prevbit = 0; + + if (AliceOrBob == ALICE) + { + nbits = 
OALICE_BITS; + } + else + { + nbits = OBOB_BITS; + } + + // Initializing points + memcpy(R0->X, xQ, sizeof(f2elm_t)); + memcpy(R0->Z[0], One, sizeof(felm_t)); + + memcpy(R2->X, xPQ, sizeof(f2elm_t)); + memcpy(R2->Z[0], One, sizeof(felm_t)); + + memcpy(R->X, xP, sizeof(f2elm_t)); + memcpy(R->Z[0], One, sizeof(felm_t)); + memset(R->Z[1], 0, sizeof(felm_t)); + + // Main loop + for (i = 0; i < nbits; i++) + { + bit = (m[i >> 6] >> (i & (64 - 1))) & 1; + swap = bit ^ prevbit; + prevbit = bit; + fp2_swap(R, R2, swap); + + xDBLADD_ifma(R0, R2, R->X, A24); + fp2_mul_ifma(R2->X, R->Z, R2->X); + } +} + +static void xDBL_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24) +{ // Doubling of a Montgomery point in projective coordinates (X:Z). + // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. + // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). + f2elm_t t0, t1, t2; + + fp2sub(P->X, P->Z, t0); // t0 = X1-Z1 + fp2add(P->X, P->Z, t1); // t1 = X1+Z1 + + fp2_mul_ifma_x2(t0, t0, t0, t1, t1, t1); + //fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2 + //fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2 + + fp2sub(t1, t0, t2); // t1 = (X1+Z1)^2-(X1-Z1)^2 + + fp2_mul_ifma_x2(Q->Z, t0, C24, t0, t2, A24plus); + //fp2mul_mont(C24, t0, Q->Z); // Z2 = C24*(X1-Z1)^2 + //fp2mul_mont(A24plus, t2, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + + fp2add(Q->Z, t0, t0); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 + + fp2_mul_ifma_x2(Q->X, Q->Z, t1, Q->Z, t2, t0); + //fp2mul_mont(t1, Q->Z, Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2 + //fp2mul_mont(t0, t2, Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] +} + +static void xDBLe_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e) +{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. 
+ // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. + // Output: projective Montgomery x-coordinates Q <- (2^e)*P. + int i; + + memcpy(Q, P, sizeof(point_proj)); + + for (i = 0; i < e; i++) + { + xDBL_ifma(Q, Q, A24plus, C24); + } +} + +static void xTPL_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus) +{ // Tripling of a Montgomery point in projective coordinates (X:Z). + // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). + f2elm_t t0, t1, t2, t3, t4, t5, t6, t7, t8; + + fp2sub(P->X, P->Z, t0); // t0 = X-Z + fp2add(P->X, P->Z, t1); // t1 = X+Z + fp2_mul_ifma_x2(t2, t0, t0, t3, t1, t1); + //fp2sqr_mont(t0, t2); // t2 = (X-Z)^2 + //fp2sqr_mont(t1, t3); // t3 = (X+Z)^2 + fp2_mul_ifma_x2(t5, A24plus, t3, t6, A24minus, t2); + //fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2 + //fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2 + fp2_mul_ifma_x2(t7, t3, t5, t8, t2, t6); + //fp2mul_mont(t3, t5, t7); // t3 = A24plus*(X+Z)^3 + //fp2mul_mont(t2, t6, t8); // t2 = A24minus*(X-Z)^3 + fp2add(t0, t1, t4); // t4 = 2*X + fp2sub(t1, t0, t0); // t0 = 2*Z + fp2sqr_mont(t4, t1); // t1 = 4*X^2 + fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2 + fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 + fp2sub(t8, t7, t7); // t3 = A24minus*(X-Z)^3 - coeff*(X+Z)^3 + fp2sub(t5, t6, t8); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 + fp2mul_mont(t1, t8, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + fp2add(t7, t1, t8); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - coeff*(X+Z)^3 + fp2sub(t7, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + fp2_mul_ifma_x2(t8, t8, t8, t1, t1, 
t1); + //fp2sqr_mont(t8, t8); // t2 = t2^2 + //fp2sqr_mont(t1, t1); // t1 = t1^2 + fp2_mul_ifma_x2(Q->X, t4, t8, Q->Z, t1, t0); + //fp2mul_mont(t4, t8, Q->X); // X3 = 2*X*t2 + //fp2mul_mont(t0, t1, Q->Z); // Z3 = 2*Z*t1 +} + +void xTPLe_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e) +{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. + // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + // Output: projective Montgomery x-coordinates Q <- (3^e)*P. + int i; + + memcpy(Q, P, sizeof(point_proj)); + + for (i = 0; i < e; i++) + { + xTPL_ifma(Q, Q, A24minus, A24plus); + } +} + +static void get_4_isog_ifma(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff) +{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. + // Input: projective point of order four P = (X4:Z4). + // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients + // that are used to evaluate the isogeny at a point in eval_4_isog(). 
+ + fp2sub(P->X, P->Z, coeff[1]); // coeff[1] = X4-Z4 + fp2add(P->X, P->Z, coeff[2]); // coeff[2] = X4+Z4 + + fp2_mul_ifma_x2(coeff[0], P->Z, P->Z, A24plus, P->X, P->X); + //fp2sqr_mont(P->Z, coeff[0]); // coeff[0] = Z4^2 + //fp2sqr_mont(P->X, A24plus); // A24plus = X4^2 + + fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 2*Z4^2 + fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2 + + fp2_mul_ifma_x2(C24, coeff[0], coeff[0], A24plus, A24plus, A24plus); + //fp2sqr_mont(coeff[0], C24); // C24 = 4*Z4^4 + //fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4 + + fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 4*Z4^2 +} + +static void eval_4_isog_ifma(point_proj_t P, f2elm_t *coeff) +{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined + // by the 3 coefficients in coeff (computed in the function get_4_isog()). + // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). + // Output: the projective point P = phi(P) = (X:Z) in the codomain. 
+ f2elm_t t0, t1, t2; + + fp2add(P->X, P->Z, t0); // t0 = X+Z + fp2sub(P->X, P->Z, t1); // t1 = X-Z + + fp2_mul_ifma_x2(P->X, t0, coeff[1], t0, t0, t1); + //fp2mul_mont(t0, coeff[1], P->X); // X = (X+Z)*coeff[1] + //fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z) + + fp2_mul_ifma_x2(P->Z, coeff[2], t1, t0, coeff[0], t0); + //fp2mul_mont(t1, coeff[2], P->Z); // Z = (X-Z)*coeff[2] + //fp2mul_mont(t0, coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z) + + fp2add(P->X, P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] + fp2sub(P->X, P->Z, P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] + + fp2_mul_ifma_x2(t1, t1, t1, P->Z, P->Z, P->Z); + //fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 + //fp2sqr_mont(P->Z, P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 + + fp2add(t1, t0, P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 + fp2sub(P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) + + fp2_mul_ifma_x2(P->X, P->X, t1, P->Z, P->Z, t0); + //fp2mul_mont(P->X, t1, P->X); // Xfinal + //fp2mul_mont(P->Z, t0, P->Z); // Zfinal +} + +static void get_3_isog_ifma(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff) +{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. + // Input: projective point of order three P = (X3:Z3). + // Output: the 3-isogenous Montgomery curve with projective coefficient A/C. 
+ f2elm_t t0, t1, t2, t3, t4, t5; + + fp2sub(P->X, P->Z, coeff[0]); // coeff0 = X-Z + fp2add(P->X, P->Z, coeff[1]); // coeff1 = X+Z + fp2_mul_ifma_x2(t0, coeff[0], coeff[0], t1, coeff[1], coeff[1]); + //fp2sqr_mont(coeff[0], t0); // t0 = (X-Z)^2 + //fp2sqr_mont(coeff[1], t1); // t1 = (X+Z)^2 + fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2 + fp2add(coeff[0], coeff[1], t3); // t3 = 2*X + fp2sqr_mont(t3, t3); // t3 = 4*X^2 + fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2 + fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2 + fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2 + fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 + fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) + fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 + fp2add(t1, t2, t5); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 + fp2add(t5, t5, t5); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) + fp2add(t0, t5, t5); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 + fp2_mul_ifma_x2(A24minus, t2, t4, t5, t5, t3); + // fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] + // fp2mul_mont(t3, t5, t5); // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] + fp2sub(t5, A24minus, t0); // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] + fp2add(A24minus, t0, A24plus); // A24plus = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 +} + +static void eval_3_isog_ifma(point_proj_t Q, const f2elm_t *coeff) +{ // Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and + // a point P with 2 coefficients in coeff (computed in the function get_3_isog()). + // Inputs: projective points P = (X3:Z3) and Q = (X:Z). + // Output: the projective point Q <- phi(Q) = (X3:Z3). 
+ f2elm_t t0, t1, t2; + + fp2add(Q->X, Q->Z, t0); // t0 = X+Z + fp2sub(Q->X, Q->Z, t1); // t1 = X-Z + fp2_mul_ifma_x2(t0, t0, coeff[0], t1, t1, coeff[1]); + //fp2mul_mont(t0, coeff[0], t0); // t0 = coeff0*(X+Z) + //fp2mul_mont(t1, coeff[1], t1); // t1 = coeff1*(X-Z) + fp2add(t0, t1, t2); // t2 = coeff0*(X-Z) + coeff1*(X+Z) + fp2sub(t1, t0, t0); // t0 = coeff0*(X-Z) - coeff1*(X+Z) + fp2_mul_ifma_x2(t2, t2, t2, t0, t0, t0); + //fp2sqr_mont(t2, t2); // t2 = [coeff0*(X-Z) + coeff1*(X+Z)]^2 + //fp2sqr_mont(t0, t0); // t1 = [coeff0*(X-Z) - coeff1*(X+Z)]^2 + fp2_mul_ifma_x2(Q->X, Q->X, t2, Q->Z, Q->Z, t0); + //fp2mul_mont(Q->X, t2, Q->X); // X3final = X*[coeff0*(X-Z) + coeff1*(X+Z)]^2 + //fp2mul_mont(Q->Z, t0, Q->Z); // Z3final = Z*[coeff0*(X-Z) - coeff1*(X+Z)]^2 +} + +static void fp2_encode(const f2elm_t x, unsigned char *enc) +{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes + unsigned int i; + f2elm_t tt; + uint64_t t[12 * 2]; + + from_mont_ifma(tt[0], x[0]); + from_mont_ifma(tt[1], x[1]); + + red2norm(t, tt[0]); + red2norm(&t[12], tt[1]); + + for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) + { + enc[i] = ((unsigned char *)t)[i]; + enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char *)t)[i + MAXBITS_FIELD / 8]; + } +} + +static void fp2_decode(const unsigned char *enc, f2elm_t x) +{ + unsigned int i; + uint64_t t[12 * 2]; + + memset(x, 0, sizeof(f2elm_t)); + for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) + { + ((unsigned char *)t)[i] = enc[i]; + ((unsigned char *)t)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2]; + } + + norm2red(x[0], t); + norm2red(x[1], &t[12]); + to_mont_ifma(x[0], x[0]); + to_mont_ifma(x[1], x[1]); +} + +int EphemeralKeyGeneration_A_ifma(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA) +{ // Alice's ephemeral public key generation + // Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. 
+ // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. + point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; + f2elm_t XPA, XQA, XRA, coeff[3]; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + f2elm_t C24 = { + {0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed}, + {0}}; + + f2elm_t A24plus = { + {0x00000000249ad67c, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0001f9800c542c00, 0x000b326488fe3b2a, 0x000e6176236db777, 0x000dd6e970232b83, 0x000d4d762277573f, 0x00054cd16c015f35, 0x0009fc72438c4fc7, 0x00000000001bf8f6}, + {0}}; + + // Initialize basis points + init_basis(A_gen_ifma, XPA, XQA, XRA); + init_basis(B_gen_ifma, phiP->X, phiQ->X, phiR->X); + memcpy(phiP->Z, One, sizeof(felm_t)); + memcpy(phiQ->Z, One, sizeof(felm_t)); + memcpy(phiR->Z, One, sizeof(felm_t)); + + // Retrieve kernel point + LADDER3PT_ifma(XPA, XQA, XRA, (uint64_t *)PrivateKeyA, ALICE, R); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Alice; row++) + { + while (index < MAX_Alice - row) + { + memcpy(pts[npts]->X, R->X, sizeof(f2elm_t)); + memcpy(pts[npts]->Z, R->Z, sizeof(f2elm_t)); + pts_index[npts++] = index; + m = strat_Alice[ii++]; + xDBLe_ifma(R, R, A24plus, C24, (int)(2 * m)); + index += m; + } + get_4_isog_ifma(R, A24plus, C24, coeff); + + for (i = 0; i < npts; i++) + { + eval_4_isog_ifma(pts[i], coeff); + } + eval_4_isog_ifma(phiP, coeff); + eval_4_isog_ifma(phiQ, coeff); + eval_4_isog_ifma(phiR, coeff); + + memcpy(R->X, pts[npts - 1]->X, sizeof(f2elm_t)); + memcpy(R->Z, pts[npts - 1]->Z, sizeof(f2elm_t)); + index = 
pts_index[npts - 1]; + npts -= 1; + } + + get_4_isog_ifma(R, A24plus, C24, coeff); + eval_4_isog_ifma(phiP, coeff); + eval_4_isog_ifma(phiQ, coeff); + eval_4_isog_ifma(phiR, coeff); + + inv_3_way_ifma(phiP->Z, phiQ->Z, phiR->Z); + fp2_mul_ifma_x2(phiP->X, phiP->X, phiP->Z, phiQ->X, phiQ->X, phiQ->Z); + //fp2mul_mont(phiP->X, phiP->Z, phiP->X); + //fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + fp2_encode(phiP->X, PublicKeyA); + fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); + fp2_encode(phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES); + + return 0; +} + +int EphemeralKeyGeneration_B_ifma(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB) +{ // Bob's ephemeral public key generation + // Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. + // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. + point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; + f2elm_t XPB, XQB, XRB, coeff[3], A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + f2elm_t A24plus = {{0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed}, + {0}}; + + f2elm_t A24minus = {{0x000fffffb6ca5307, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x0000ac8771e692ff, 0x000167add1f02031, 0x000aaabd12d63250, 0x000ca0c5879094e0, 0x0000b5598636c600, 0x0004fe180463c6f7, 0x0000268d39c8897b, 0x000000000037f3e8}, + {0}}; + + uint64_t temp[12]; + uint64_t ifma_temp[15]; + // Initialize basis points + init_basis(B_gen_ifma, XPB, XQB, XRB); + init_basis(A_gen_ifma, phiP->X, 
phiQ->X, phiR->X); + memcpy(phiP->Z, One, sizeof(felm_t)); + memcpy(phiQ->Z, One, sizeof(felm_t)); + memcpy(phiR->Z, One, sizeof(felm_t)); + + // Retrieve kernel point + LADDER3PT_ifma(XPB, XQB, XRB, (uint64_t *)PrivateKeyB, BOB, R); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Bob; row++) + { + while (index < MAX_Bob - row) + { + memcpy(pts[npts]->X, R->X, sizeof(f2elm_t)); + memcpy(pts[npts]->Z, R->Z, sizeof(f2elm_t)); + pts_index[npts++] = index; + m = strat_Bob[ii++]; + xTPLe_ifma(R, R, A24minus, A24plus, (int)m); + index += m; + } + get_3_isog_ifma(R, A24minus, A24plus, coeff); + + for (i = 0; i < npts; i++) + { + eval_3_isog_ifma(pts[i], coeff); + } + eval_3_isog_ifma(phiP, coeff); + eval_3_isog_ifma(phiQ, coeff); + eval_3_isog_ifma(phiR, coeff); + + memcpy(R->X, pts[npts - 1]->X, sizeof(f2elm_t)); + memcpy(R->Z, pts[npts - 1]->Z, sizeof(f2elm_t)); + + index = pts_index[npts - 1]; + npts -= 1; + } + + get_3_isog_ifma(R, A24minus, A24plus, coeff); + eval_3_isog_ifma(phiP, coeff); + eval_3_isog_ifma(phiQ, coeff); + eval_3_isog_ifma(phiR, coeff); + + inv_3_way_ifma(phiP->Z, phiQ->Z, phiR->Z); + fp2mul_mont(phiP->X, phiP->Z, phiP->X); + fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + fp2_encode(phiP->X, PublicKeyB); + fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); + fp2_encode(phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES); + + return 0; +} diff --git a/README.md b/README.md new file mode 100644 index 0000000..c080adf --- /dev/null +++ b/README.md @@ -0,0 +1,34 @@ +## PQ SIDH/SIKE implementation using AVX512IFMA instructions + +Using the AVX512IFMA (vpmadd52luq and vpmadd52huq) specifically designed for +prime field arithmetic allows a projected speedup of up to 4X on supporting +processors, when those become available. 
+
+### Current status
+
+* Tested for correctness with Intel SDE
+* EphemeralKeyGeneration_A and EphemeralKeyGeneration_B with P751 are implemented
+* Using "standins": 3X performance gain on Xeon Gold (with two FMA units)
+* Optimizations are 3-fold
+  * Finite field *𝔽~p~* multiplication by performing a single horizontal Montgomery multiplication
+  * Quadratic finite field *𝔽~p²~* multiplication and square by performing 3/4 horizontal Montgomery multiplications in parallel
+  * A pair of quadratic finite field *𝔽~p²~* multiplications (where applicable) by performing 8 vertical Montgomery multiplications in parallel
+  * AVX512 add/sub are also implemented
+
+### How to test?
+
+The Makefile generates two executables: sidh_ifma can be run with Intel SDE to
+check for correctness. sidh_standin produces incorrect results, because it
+replaces the IFMA instructions with FMA instructions and can be executed on a
+machine with AVX512 support to estimate performance.
+
+### TODO
+
+* EphemeralSecretAgreement_A and EphemeralSecretAgreement_B
+* SIKE
+* P503
+* Using vertical representation throughout for greater speedups
+
+### License
+
+Available under the original [SIKE](https://github.com/Microsoft/PQCrypto-SIKE) license
diff --git a/fp2_751_ifma.S b/fp2_751_ifma.S
new file mode 100644
index 0000000..f6e7ea4
--- /dev/null
+++ b/fp2_751_ifma.S
@@ -0,0 +1,916 @@
+
+#if defined(__APPLE__)
+/* OS X's C ABI prefixes functions with underscore. 
*/ +#define C_ABI(x) _ ## x +#define HIDDEN .private_extern +#else +#define C_ABI(x) x +#define HIDDEN .hidden +#endif + +.p2align 6 +.LpermMask0: +.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25 +.LshiftMask0: +.quad 0,4,8,12,0,4,8,12 +.LandMask: +.quad 0xfffffffffffff + +.p2align 6 +.Lpoly: +.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff +.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff +.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480 +.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0 + +.LpolyX: +.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00 +.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00 +.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000 +.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0 + +#define felemR %rdi +#define felemA %rsi +#define felemB %rdx + +#define itr %r10 + +#define M0 %zmm0 +#define M1 %zmm1 +#define ZERO %zmm2 +#define AND_MASK %zmm3 + +#define A0a %zmm4 +#define A0b %zmm5 + +#define A1a %zmm6 +#define A1b %zmm7 + +#define ACC0a %zmm8 +#define ACC0b %zmm9 +#define ACC1a %zmm10 +#define ACC1b %zmm11 +#define ACC2a %zmm12 +#define ACC2b %zmm13 +#define ACC3a %zmm14 +#define ACC3b %zmm15 + +#define B0curr %zmm16 +#define B0prev %zmm17 +#define B1curr %zmm18 +#define B1prev %zmm19 + +#define Y0curr %zmm20 +#define Y0prev %zmm21 +#define Y1curr %zmm22 +#define Y1prev %zmm23 +#define Y2curr %zmm24 +#define Y2prev %zmm25 +#define Y3curr %zmm26 +#define Y3prev %zmm27 + +#define T0 %zmm28 +#define T1 %zmm29 +#define T2 %zmm30 +#define T3 %zmm31 + +############################################################################### +.globl C_ABI(fp2_mul_ifma) +.p2align 6 +C_ABI(fp2_mul_ifma): + + mov $1, %eax + kmovw %eax, %k1 + mov $0x7f, %eax + kmovw 
%eax, %k5 + + vpbroadcastq .LandMask(%rip), AND_MASK + vpxorq ZERO, ZERO, ZERO + + vmovdqu64 64*0(felemA), A0a + vmovdqu64 64*1(felemA), A0b{%k5}{z} + vmovdqu64 15*8 + 64*0(felemA), A1a + vmovdqu64 15*8 + 64*1(felemA), A1b{%k5}{z} + + # Load the modulus + vmovdqa64 64*0 + .Lpoly(%rip), M0 + vmovdqa64 64*1 + .Lpoly(%rip), M1 + + # Prepare the accumulators + vpxorq ACC0a, ACC0a, ACC0a + vpxorq ACC0b, ACC0b, ACC0b + vpxorq ACC1a, ACC1a, ACC1a + vpxorq ACC1b, ACC1b, ACC1b + vpxorq ACC2a, ACC2a, ACC2a + vpxorq ACC2b, ACC2b, ACC2b + vpxorq ACC3a, ACC3a, ACC3a + vpxorq ACC3b, ACC3b, ACC3b + vpxorq T0, T0, T0 + vpxorq T1, T1, T1 + vpxorq T2, T2, T2 + vpxorq T3, T3, T3 + + # First iteration + vpbroadcastq (felemB), B0curr + vpbroadcastq 15*8(felemB), B1curr + lea 8(felemB), felemB + + vpmadd52luq B0curr, A0a, ACC0a + vpmadd52luq B0curr, A0b, ACC0b + vpmadd52luq B1curr, A1a, ACC1a + vpmadd52luq B1curr, A1b, ACC1b + vpmadd52luq B0curr, A1a, ACC2a + vpmadd52luq B0curr, A1b, ACC2b + vpmadd52luq B1curr, A0a, ACC3a + vpmadd52luq B1curr, A0b, ACC3b + + vpermq ACC0a, ZERO, Y0curr + vpermq ACC1a, ZERO, Y1curr + vpermq ACC2a, ZERO, Y2curr + vpermq ACC3a, ZERO, Y3curr + + vpmadd52luq Y0curr, M0, ACC0a + vpmadd52luq Y0curr, M1, ACC0b + vpmadd52luq Y1curr, M0, ACC1a + vpmadd52luq Y1curr, M1, ACC1b + vpmadd52luq Y2curr, M0, ACC2a + vpmadd52luq Y2curr, M1, ACC2b + vpmadd52luq Y3curr, M0, ACC3a + vpmadd52luq Y3curr, M1, ACC3b + + vpsrlq $52, ACC0a, T0{%k1}{z} + vpsrlq $52, ACC1a, T1{%k1}{z} + vpsrlq $52, ACC2a, T2{%k1}{z} + vpsrlq $52, ACC3a, T3{%k1}{z} + + mov $14, itr + +1: + # Shift the ACC in zmms right by a word + valignq $1, ACC0a, ACC0b, ACC0a + valignq $1, ACC0b, ZERO, ACC0b + valignq $1, ACC1a, ACC1b, ACC1a + valignq $1, ACC1b, ZERO, ACC1b + valignq $1, ACC2a, ACC2b, ACC2a + valignq $1, ACC2b, ZERO, ACC2b + valignq $1, ACC3a, ACC3b, ACC3a + valignq $1, ACC3b, ZERO, ACC3b + + vmovdqa64 B0curr, B0prev + vmovdqa64 B1curr, B1prev + vmovdqa64 Y0curr, Y0prev + vmovdqa64 Y1curr, Y1prev + 
vmovdqa64 Y2curr, Y2prev + vmovdqa64 Y3curr, Y3prev + + vpbroadcastq (felemB), B0curr + vpbroadcastq 15*8(felemB), B1curr + lea 8(felemB), felemB + + # High multiplications + vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0 + vpmadd52huq B0prev, A0b, ACC0b + vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1 + vpmadd52huq B1prev, A1b, ACC1b + vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0 + vpmadd52huq B0prev, A1b, ACC2b + vpmadd52huq B1prev, A0a, ACC3a # ACC3 = A0 * B1 + vpmadd52huq B1prev, A0b, ACC3b + + vpmadd52huq Y0prev, M0, ACC0a + vpmadd52huq Y0prev, M1, ACC0b + vpmadd52huq Y1prev, M0, ACC1a + vpmadd52huq Y1prev, M1, ACC1b + vpmadd52huq Y2prev, M0, ACC2a + vpmadd52huq Y2prev, M1, ACC2b + vpmadd52huq Y3prev, M0, ACC3a + vpmadd52huq Y3prev, M1, ACC3b + # Low multiplications + vpmadd52luq B0curr, A0a, ACC0a + vpmadd52luq B0curr, A0b, ACC0b + vpmadd52luq B1curr, A1a, ACC1a + vpmadd52luq B1curr, A1b, ACC1b + vpmadd52luq B0curr, A1a, ACC2a + vpmadd52luq B0curr, A1b, ACC2b + vpmadd52luq B1curr, A0a, ACC3a + vpmadd52luq B1curr, A0b, ACC3b + + vpaddq T0, ACC0a, ACC0a + vpaddq T1, ACC1a, ACC1a + vpaddq T2, ACC2a, ACC2a + vpaddq T3, ACC3a, ACC3a + vpermq ACC0a, ZERO, Y0curr + vpermq ACC1a, ZERO, Y1curr + vpermq ACC2a, ZERO, Y2curr + vpermq ACC3a, ZERO, Y3curr + + vpmadd52luq Y0curr, M0, ACC0a + vpmadd52luq Y0curr, M1, ACC0b + vpmadd52luq Y1curr, M0, ACC1a + vpmadd52luq Y1curr, M1, ACC1b + vpmadd52luq Y2curr, M0, ACC2a + vpmadd52luq Y2curr, M1, ACC2b + vpmadd52luq Y3curr, M0, ACC3a + vpmadd52luq Y3curr, M1, ACC3b + + vpsrlq $52, ACC0a, T0{%k1}{z} + vpsrlq $52, ACC1a, T1{%k1}{z} + vpsrlq $52, ACC2a, T2{%k1}{z} + vpsrlq $52, ACC3a, T3{%k1}{z} + + dec itr + jne 1b + valignq $1, ACC0a, ACC0b, ACC0a + valignq $1, ACC0b, ZERO, ACC0b + valignq $1, ACC1a, ACC1b, ACC1a + valignq $1, ACC1b, ZERO, ACC1b + valignq $1, ACC2a, ACC2b, ACC2a + valignq $1, ACC2b, ZERO, ACC2b + valignq $1, ACC3a, ACC3b, ACC3a + valignq $1, ACC3b, ZERO, ACC3b + vpaddq T0, ACC0a, ACC0a + vpaddq T1, ACC1a, 
ACC1a + vpaddq T2, ACC2a, ACC2a + vpaddq T3, ACC3a, ACC3a + + # The last high multiplications + vpmadd52huq B0curr, A0a, ACC0a + vpmadd52huq B0curr, A0b, ACC0b + vpmadd52huq B1curr, A1a, ACC1a + vpmadd52huq B1curr, A1b, ACC1b + vpmadd52huq B0curr, A1a, ACC2a + vpmadd52huq B0curr, A1b, ACC2b + vpmadd52huq B1curr, A0a, ACC3a + vpmadd52huq B1curr, A0b, ACC3b + + vpmadd52huq Y0curr, M0, ACC0a + vpmadd52huq Y0curr, M1, ACC0b + vpmadd52huq Y1curr, M0, ACC1a + vpmadd52huq Y1curr, M1, ACC1b + vpmadd52huq Y2curr, M0, ACC2a + vpmadd52huq Y2curr, M1, ACC2b + vpmadd52huq Y3curr, M0, ACC3a + vpmadd52huq Y3curr, M1, ACC3b + + # C0 = A0*B0 - A1*B1 + # C1 = A0*B1 + A1*B0 + vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a + vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b + + vpaddq ACC3a, ACC2a, ACC2a + vpaddq ACC3b, ACC2b, ACC2b + + vpsubq ACC1a, ACC0a, ACC0a + vpsubq ACC1b, ACC0b, ACC0b + # Now 'normalize' the acc to 52 bit words + vpsrlq $52, ACC0a, A0a + vpsrlq $52, ACC0b, A0b + + vpsrlq $52, ACC2a, A1a + vpsrlq $52, ACC2b, A1b + + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + vpandq AND_MASK, ACC2a, ACC2a + vpandq AND_MASK, ACC2b, ACC2b + + valignq $7, A0a, A0b, A0b + valignq $7, ZERO, A0a, A0a + valignq $7, A1a, A1b, A1b + valignq $7, ZERO, A1a, A1a + + vpaddq A0a, ACC0a, ACC0a + vpaddq A0b, ACC0b, ACC0b + vpaddq A1a, ACC2a, ACC2a + vpaddq A1b, ACC2b, ACC2b + + vpcmpuq $1, A0a, ACC0a, %k1 + vpcmpuq $1, A0b, ACC0b, %k2 + vpcmpuq $0, AND_MASK, ACC0a, %k3 + vpcmpuq $0, AND_MASK, ACC0b, %k4 + + kmovb %k1, %eax + kmovb %k2, %ecx + kmovb %k3, %r8d + kmovb %k4, %r9d + + add %al, %al + adc %cl, %cl + + add %r8b, %al + adc %r9b, %cl + + xor %r8b, %al + xor %r9b, %cl + + kmovb %eax, %k1 + kmovb %ecx, %k2 + + vpsubq AND_MASK, ACC0a, ACC0a{%k1} + vpsubq AND_MASK, ACC0b, ACC0b{%k2} + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + + vpcmpuq $1, A1a, ACC2a, %k1 + vpcmpuq $1, A1b, ACC2b, %k2 + vpcmpuq $0, AND_MASK, ACC2a, %k3 + vpcmpuq $0, AND_MASK, ACC2b, %k4 + + kmovb 
%k1, %eax + kmovb %k2, %ecx + kmovb %k3, %r8d + kmovb %k4, %r9d + + add %al, %al + adc %cl, %cl + add %r8b, %al + adc %r9b, %cl + xor %r8b, %al + xor %r9b, %cl + kmovb %eax, %k1 + kmovb %ecx, %k2 + + vpsubq AND_MASK, ACC2a, ACC2a{%k1} + vpsubq AND_MASK, ACC2b, ACC2b{%k2} + vpandq AND_MASK, ACC2a, ACC2a + vpandq AND_MASK, ACC2b, ACC2b + + mov $0x7f, %eax + kmovw %eax, %k1 + + vmovdqu64 ACC0a, 64*0(felemR) + vmovdqu64 ACC0b, 64*1(felemR){%k5} + vmovdqu64 ACC2a, 15*8 + 64*0(felemR) + vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k5} + ret + +############################################################################### + +#define ST0 ACC3a +#define ST1 ACC3b +#define ST2 Y3curr + +.globl C_ABI(fp2_sqr_ifma) +.p2align 6 +C_ABI(fp2_sqr_ifma): + + mov $1, %eax + kmovw %eax, %k1 + mov $0x7f, %eax + kmovw %eax, %k2 + + vpbroadcastq .LandMask(%rip), AND_MASK + vpxorq ZERO, ZERO, ZERO + + vmovdqu64 64*0(felemA), A0a + vmovdqu64 64*1(felemA), A0b{%k2}{z} + vmovdqu64 15*8 + 64*0(felemA), A1a + vmovdqu64 15*8 + 64*1(felemA), A1b{%k2}{z} + + # Load the modulus + vmovdqa64 64*0 + .Lpoly(%rip), M0 + vmovdqa64 64*1 + .Lpoly(%rip), M1 + + # Prepare the accumulators + vpxorq ACC0a, ACC0a, ACC0a + vpxorq ACC0b, ACC0b, ACC0b + vpxorq ACC1a, ACC1a, ACC1a + vpxorq ACC1b, ACC1b, ACC1b + vpxorq ACC2a, ACC2a, ACC2a + vpxorq ACC2b, ACC2b, ACC2b + vpxorq T0, T0, T0 + vpxorq T1, T1, T1 + vpxorq T2, T2, T2 + + # First iteration + vpbroadcastq (felemA), B0curr + vpbroadcastq 15*8(felemA), B1curr + lea 8(felemA), felemA + + vpmadd52luq B0curr, A0a, ACC0a + vpmadd52luq B0curr, A0b, ACC0b + vpmadd52luq B1curr, A1a, ACC1a + vpmadd52luq B1curr, A1b, ACC1b + vpmadd52luq B0curr, A1a, ACC2a + vpmadd52luq B0curr, A1b, ACC2b + + vpermq ACC0a, ZERO, Y0curr + vpermq ACC1a, ZERO, Y1curr + vpermq ACC2a, ZERO, Y2curr + + vpmadd52luq Y0curr, M0, ACC0a + vpmadd52luq Y0curr, M1, ACC0b + vpmadd52luq Y1curr, M0, ACC1a + vpmadd52luq Y1curr, M1, ACC1b + vpmadd52luq Y2curr, M0, ACC2a + vpmadd52luq Y2curr, M1, ACC2b + + 
vpsrlq $52, ACC0a, T0{%k1}{z} + vpsrlq $52, ACC1a, T1{%k1}{z} + vpsrlq $52, ACC2a, T2{%k1}{z} + + mov $14, itr + +1: + # Shift the ACC in zmms right by a word + valignq $1, ACC0a, ACC0b, ACC0a + valignq $1, ACC0b, ZERO, ACC0b + valignq $1, ACC1a, ACC1b, ACC1a + valignq $1, ACC1b, ZERO, ACC1b + valignq $1, ACC2a, ACC2b, ACC2a + valignq $1, ACC2b, ZERO, ACC2b + + vpxorq ST0, ST0, ST0 + vpxorq ST1, ST1, ST1 + vpxorq ST2, ST2, ST2 + + vmovdqa64 B0curr, B0prev + vmovdqa64 B1curr, B1prev + vmovdqa64 Y0curr, Y0prev + vmovdqa64 Y1curr, Y1prev + vmovdqa64 Y2curr, Y2prev + + vpbroadcastq (felemA), B0curr + vpbroadcastq 15*8(felemA), B1curr + lea 8(felemA), felemA + + # High multiplications + vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0 + vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1 + vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0 + vpmadd52huq B0prev, A0b, ACC0b + vpmadd52huq B1prev, A1b, ACC1b + vpmadd52huq B0prev, A1b, ACC2b + # We really want to have 8 independent vpmadd instructions in the pipe + vpmadd52huq Y0prev, M0, T0 + vpmadd52huq Y1prev, M0, T1 + vpmadd52huq Y2prev, M0, T2 + + vpmadd52huq Y0prev, M1, ACC0b + vpmadd52huq Y1prev, M1, ACC1b + vpmadd52huq Y2prev, M1, ACC2b + # Low multiplications + vpmadd52luq B0curr, A0a, ACC0a + vpmadd52luq B1curr, A1a, ACC1a + vpmadd52luq B0curr, A1a, ACC2a + + vpmadd52luq B0curr, A0b, ST0 + vpmadd52luq B1curr, A1b, ST1 + vpmadd52luq B0curr, A1b, ST2 + + vpaddq T0, ACC0a, ACC0a + vpaddq T1, ACC1a, ACC1a + vpaddq T2, ACC2a, ACC2a + vpermq ACC0a, ZERO, Y0curr + vpermq ACC1a, ZERO, Y1curr + vpermq ACC2a, ZERO, Y2curr + vpaddq ST0, ACC0b, ACC0b + vpaddq ST1, ACC1b, ACC1b + vpaddq ST2, ACC2b, ACC2b + + vpmadd52luq Y0curr, M0, ACC0a + vpmadd52luq Y0curr, M1, ACC0b + vpmadd52luq Y1curr, M0, ACC1a + vpmadd52luq Y1curr, M1, ACC1b + vpmadd52luq Y2curr, M0, ACC2a + vpmadd52luq Y2curr, M1, ACC2b + + vpsrlq $52, ACC0a, T0{%k1}{z} + vpsrlq $52, ACC1a, T1{%k1}{z} + vpsrlq $52, ACC2a, T2{%k1}{z} + + dec itr + jne 1b + valignq $1, ACC0a, 
ACC0b, ACC0a + valignq $1, ACC0b, ZERO, ACC0b + valignq $1, ACC1a, ACC1b, ACC1a + valignq $1, ACC1b, ZERO, ACC1b + valignq $1, ACC2a, ACC2b, ACC2a + valignq $1, ACC2b, ZERO, ACC2b + vpaddq T0, ACC0a, ACC0a + vpaddq T1, ACC1a, ACC1a + vpaddq T2, ACC2a, ACC2a + + # The last high multiplications + vpmadd52huq B0curr, A0a, ACC0a + vpmadd52huq B0curr, A0b, ACC0b + vpmadd52huq B1curr, A1a, ACC1a + vpmadd52huq B1curr, A1b, ACC1b + vpmadd52huq B0curr, A1a, ACC2a + vpmadd52huq B0curr, A1b, ACC2b + + vpmadd52huq Y0curr, M0, ACC0a + vpmadd52huq Y0curr, M1, ACC0b + vpmadd52huq Y1curr, M0, ACC1a + vpmadd52huq Y1curr, M1, ACC1b + vpmadd52huq Y2curr, M0, ACC2a + vpmadd52huq Y2curr, M1, ACC2b + + # C0 = A0*B0 - A1*B1 + # C1 = A0*B1 + A1*B0 + vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a + vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b + + vpaddq ACC2a, ACC2a, ACC2a + vpaddq ACC2b, ACC2b, ACC2b + + vpsubq ACC1a, ACC0a, ACC0a + vpsubq ACC1b, ACC0b, ACC0b + + # Now 'normalize' the acc to 52 bit words + vpsrlq $52, ACC0a, A0a + vpsrlq $52, ACC0b, A0b + vpsrlq $52, ACC2a, A1a + vpsrlq $52, ACC2b, A1b + + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + vpandq AND_MASK, ACC2a, ACC2a + vpandq AND_MASK, ACC2b, ACC2b + + valignq $7, A0a, A0b, A0b + valignq $7, ZERO, A0a, A0a + valignq $7, A1a, A1b, A1b + valignq $7, ZERO, A1a, A1a + + vpaddq A0a, ACC0a, ACC0a + vpaddq A0b, ACC0b, ACC0b + vpaddq A1a, ACC2a, ACC2a + vpaddq A1b, ACC2b, ACC2b + + vpcmpuq $1, A0a, ACC0a, %k1 + vpcmpuq $1, A0b, ACC0b, %k2 + vpcmpuq $0, AND_MASK, ACC0a, %k3 + vpcmpuq $0, AND_MASK, ACC0b, %k4 + + kmovb %k1, %eax + kmovb %k2, %ecx + kmovb %k3, %r8d + kmovb %k4, %r9d + + add %al, %al + adc %cl, %cl + add %r8b, %al + adc %r9b, %cl + xor %r8b, %al + xor %r9b, %cl + kmovb %eax, %k1 + kmovb %ecx, %k2 + + vpsubq AND_MASK, ACC0a, ACC0a{%k1} + vpsubq AND_MASK, ACC0b, ACC0b{%k2} + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + + vpcmpuq $1, A1a, ACC2a, %k1 + vpcmpuq $1, A1b, ACC2b, %k2 + vpcmpuq $0, 
AND_MASK, ACC2a, %k3 + vpcmpuq $0, AND_MASK, ACC2b, %k4 + + kmovb %k1, %eax + kmovb %k2, %ecx + kmovb %k3, %r8d + kmovb %k4, %r9d + + add %al, %al + adc %cl, %cl + add %r8b, %al + adc %r9b, %cl + xor %r8b, %al + xor %r9b, %cl + kmovb %eax, %k1 + kmovb %ecx, %k2 + + vpsubq AND_MASK, ACC2a, ACC2a{%k1} + vpsubq AND_MASK, ACC2b, ACC2b{%k2} + vpandq AND_MASK, ACC2a, ACC2a + vpandq AND_MASK, ACC2b, ACC2b + + mov $0x7f, %eax + kmovw %eax, %k1 + + vmovdqu64 ACC0a, 64*0(felemR) + vmovdqu64 ACC0b, 64*1(felemR){%k1} + vmovdqu64 ACC2a, 15*8 + 64*0(felemR) + vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1} + ret + +############################################################################### +.globl C_ABI(fp2_sub) +.p2align 6 +C_ABI(fp2_sub): + + mov $1, %eax + kmovw %eax, %k1 + mov $0x7f, %eax + kmovw %eax, %k2 + + vmovdqu64 64*0(felemA), ACC0a + vmovdqu64 64*1(felemA), ACC0b{%k2}{z} + vmovdqu64 15*8 + 64*0(felemA), ACC1a + vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z} + + vmovdqu64 64*0(felemB), ACC2a + vmovdqu64 64*1(felemB), ACC2b{%k2}{z} + vmovdqu64 15*8 + 64*0(felemB), ACC3a + vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z} + + vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a + vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b + vpaddq 64*0 + .LpolyX(%rip), ACC1a, ACC1a + vpaddq 64*1 + .LpolyX(%rip), ACC1b, ACC1b + + vpsubq ACC2a, ACC0a, ACC0a + vpsubq ACC2b, ACC0b, ACC0b + vpsubq ACC3a, ACC1a, ACC2a + vpsubq ACC3b, ACC1b, ACC2b + + jmp fp2_normalize +############################################################################### +.globl C_ABI(fp2_add) +.p2align 6 +C_ABI(fp2_add): + + mov $1, %eax + kmovw %eax, %k1 + mov $0x7f, %eax + kmovw %eax, %k2 + + vmovdqu64 64*0(felemA), ACC0a + vmovdqu64 64*1(felemA), ACC0b{%k2}{z} + vmovdqu64 15*8 + 64*0(felemA), ACC1a + vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z} + + vmovdqu64 64*0(felemB), ACC2a + vmovdqu64 64*1(felemB), ACC2b{%k2}{z} + vmovdqu64 15*8 + 64*0(felemB), ACC3a + vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z} + + vpaddq ACC2a, ACC0a, ACC0a 
+ vpaddq ACC2b, ACC0b, ACC0b + vpaddq ACC3a, ACC1a, ACC2a + vpaddq ACC3b, ACC1b, ACC2b + + // Fallthrough +############################################################################### +.p2align 6 +C_ABI(fp2_normalize): + + vpbroadcastq .LandMask(%rip), AND_MASK + vpxorq ZERO, ZERO, ZERO + + # Now 'normalize' the acc to 52 bit words + vpsrlq $52, ACC0a, A0a + vpsrlq $52, ACC0b, A0b + vpsrlq $52, ACC2a, A1a + vpsrlq $52, ACC2b, A1b + + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + vpandq AND_MASK, ACC2a, ACC2a + vpandq AND_MASK, ACC2b, ACC2b + + valignq $7, A0a, A0b, A0b + valignq $7, ZERO, A0a, A0a + valignq $7, A1a, A1b, A1b + valignq $7, ZERO, A1a, A1a + + vpaddq A0a, ACC0a, ACC0a + vpaddq A0b, ACC0b, ACC0b + vpaddq A1a, ACC2a, ACC2a + vpaddq A1b, ACC2b, ACC2b + + vpcmpuq $1, A0a, ACC0a, %k1 + vpcmpuq $1, A0b, ACC0b, %k2 + vpcmpuq $0, AND_MASK, ACC0a, %k3 + vpcmpuq $0, AND_MASK, ACC0b, %k4 + + kmovb %k1, %eax + kmovb %k2, %ecx + kmovb %k3, %r8d + kmovb %k4, %r9d + + add %al, %al + adc %cl, %cl + add %r8b, %al + adc %r9b, %cl + xor %r8b, %al + xor %r9b, %cl + kmovb %eax, %k1 + kmovb %ecx, %k2 + + vpsubq AND_MASK, ACC0a, ACC0a{%k1} + vpsubq AND_MASK, ACC0b, ACC0b{%k2} + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + + vpcmpuq $1, A1a, ACC2a, %k1 + vpcmpuq $1, A1b, ACC2b, %k2 + vpcmpuq $0, AND_MASK, ACC2a, %k3 + vpcmpuq $0, AND_MASK, ACC2b, %k4 + + kmovb %k1, %eax + kmovb %k2, %ecx + kmovb %k3, %r8d + kmovb %k4, %r9d + + add %al, %al + adc %cl, %cl + add %r8b, %al + adc %r9b, %cl + xor %r8b, %al + xor %r9b, %cl + kmovb %eax, %k1 + kmovb %ecx, %k2 + + vpsubq AND_MASK, ACC2a, ACC2a{%k1} + vpsubq AND_MASK, ACC2b, ACC2b{%k2} + vpandq AND_MASK, ACC2a, ACC2a + vpandq AND_MASK, ACC2b, ACC2b + + mov $0x7f, %eax + kmovw %eax, %k1 + + vmovdqu64 ACC0a, 64*0(felemR) + vmovdqu64 ACC0b, 64*1(felemR){%k1} + vmovdqu64 ACC2a, 15*8 + 64*0(felemR) + vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1} + + ret + + 
+############################################################################### +#define p1ptr %rdi +#define p2ptr %rsi +#define swap %rdx +.globl C_ABI(fp2_swap) +.p2align 6 +C_ABI(fp2_swap): + + mov $0x7f, %eax + kmovw %eax, %k2 + // TODO: get rid of the masks, not needed + vmovdqu64 64*0(p1ptr), %zmm0 + vmovdqu64 64*1(p1ptr), %zmm1{%k2}{z} + vmovdqu64 15*8 + 64*0(p1ptr), %zmm2 + vmovdqu64 15*8 + 64*1(p1ptr), %zmm3{%k2}{z} + vmovdqu64 2*15*8 + 64*0(p1ptr), %zmm4 + vmovdqu64 2*15*8 + 64*1(p1ptr), %zmm5{%k2}{z} + vmovdqu64 3*15*8 + 64*0(p1ptr), %zmm6 + vmovdqu64 3*15*8 + 64*1(p1ptr), %zmm7{%k2}{z} + + vmovdqu64 64*0(p2ptr), %zmm8 + vmovdqu64 64*1(p2ptr), %zmm9{%k2}{z} + vmovdqu64 15*8 + 64*0(p2ptr), %zmm10 + vmovdqu64 15*8 + 64*1(p2ptr), %zmm11{%k2}{z} + vmovdqu64 2*15*8 + 64*0(p2ptr), %zmm12 + vmovdqu64 2*15*8 + 64*1(p2ptr), %zmm13{%k2}{z} + vmovdqu64 3*15*8 + 64*0(p2ptr), %zmm14 + vmovdqu64 3*15*8 + 64*1(p2ptr), %zmm15{%k2}{z} + + vpxorq %zmm16, %zmm16, %zmm16 + vpbroadcastq swap, %zmm17 + vpsubq %zmm17, %zmm16, %zmm16 + + vmovdqa64 %zmm8, %zmm17 + vmovdqa64 %zmm9, %zmm18 + vmovdqa64 %zmm10, %zmm19 + vmovdqa64 %zmm11, %zmm20 + vmovdqa64 %zmm12, %zmm21 + vmovdqa64 %zmm13, %zmm22 + vmovdqa64 %zmm14, %zmm23 + vmovdqa64 %zmm15, %zmm24 + + vpternlogq $0xd8, %zmm16, %zmm0, %zmm17 + vpternlogq $0xd8, %zmm16, %zmm1, %zmm18 + vpternlogq $0xd8, %zmm16, %zmm2, %zmm19 + vpternlogq $0xd8, %zmm16, %zmm3, %zmm20 + vpternlogq $0xd8, %zmm16, %zmm4, %zmm21 + vpternlogq $0xd8, %zmm16, %zmm5, %zmm22 + vpternlogq $0xd8, %zmm16, %zmm6, %zmm23 + vpternlogq $0xd8, %zmm16, %zmm7, %zmm24 + + vpternlogq $0xe4, %zmm16, %zmm0, %zmm8 + vpternlogq $0xe4, %zmm16, %zmm1, %zmm9 + vpternlogq $0xe4, %zmm16, %zmm2, %zmm10 + vpternlogq $0xe4, %zmm16, %zmm3, %zmm11 + vpternlogq $0xe4, %zmm16, %zmm4, %zmm12 + vpternlogq $0xe4, %zmm16, %zmm5, %zmm13 + vpternlogq $0xe4, %zmm16, %zmm6, %zmm14 + vpternlogq $0xe4, %zmm16, %zmm7, %zmm15 + + + vmovdqu64 %zmm8, 64*0(p1ptr) + vmovdqu64 %zmm9, 64*1(p1ptr){%k2} 
+ vmovdqu64 %zmm10, 15*8 + 64*0(p1ptr) + vmovdqu64 %zmm11, 15*8 + 64*1(p1ptr){%k2} + vmovdqu64 %zmm12, 2*15*8 + 64*0(p1ptr) + vmovdqu64 %zmm13, 2*15*8 + 64*1(p1ptr){%k2} + vmovdqu64 %zmm14, 3*15*8 + 64*0(p1ptr) + vmovdqu64 %zmm15, 3*15*8 + 64*1(p1ptr){%k2} + + vmovdqu64 %zmm17, 64*0(p2ptr) + vmovdqu64 %zmm18, 64*1(p2ptr){%k2} + vmovdqu64 %zmm19, 15*8 + 64*0(p2ptr) + vmovdqu64 %zmm20, 15*8 + 64*1(p2ptr){%k2} + vmovdqu64 %zmm21, 2*15*8 + 64*0(p2ptr) + vmovdqu64 %zmm22, 2*15*8 + 64*1(p2ptr){%k2} + vmovdqu64 %zmm23, 3*15*8 + 64*0(p2ptr) + vmovdqu64 %zmm24, 3*15*8 + 64*1(p2ptr){%k2} + + ret +############################################################################### +.globl C_ABI(fp_add) +.p2align 6 +C_ABI(fp_add): + + mov $0x7f, %eax + kmovw %eax, %k2 + + vmovdqu64 64*0(felemA), ACC0a + vmovdqu64 64*1(felemA), ACC0b{%k2}{z} + + vmovdqu64 64*0(felemB), ACC2a + vmovdqu64 64*1(felemB), ACC2b{%k2}{z} + + vpaddq ACC2a, ACC0a, ACC0a + vpaddq ACC2b, ACC0b, ACC0b + + // Fallthrough +############################################################################### +.p2align 6 +C_ABI(fp_normalize): + + vpbroadcastq .LandMask(%rip), AND_MASK + vpxorq ZERO, ZERO, ZERO + + # Now 'normalize' the acc to 52 bit words + vpsrlq $52, ACC0a, A0a + vpsrlq $52, ACC0b, A0b + + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + + valignq $7, A0a, A0b, A0b + valignq $7, ZERO, A0a, A0a + + vpaddq A0a, ACC0a, ACC0a + vpaddq A0b, ACC0b, ACC0b + + vpcmpuq $1, A0a, ACC0a, %k1 + vpcmpuq $1, A0b, ACC0b, %k2 + vpcmpuq $0, AND_MASK, ACC0a, %k3 + vpcmpuq $0, AND_MASK, ACC0b, %k4 + + kmovb %k1, %eax + kmovb %k2, %ecx + kmovb %k3, %r8d + kmovb %k4, %r9d + + add %al, %al + adc %cl, %cl + + add %r8b, %al + adc %r9b, %cl + + xor %r8b, %al + xor %r9b, %cl + + kmovb %eax, %k1 + kmovb %ecx, %k2 + + vpsubq AND_MASK, ACC0a, ACC0a{%k1} + vpsubq AND_MASK, ACC0b, ACC0b{%k2} + vpandq AND_MASK, ACC0a, ACC0a + vpandq AND_MASK, ACC0b, ACC0b + + mov $0x7f, %eax + kmovw %eax, %k1 + + vmovdqu64 ACC0a, 
64*0(%rdi) + vmovdqu64 ACC0b, 64*1(%rdi){%k1} + + ret + +############################################################################### +.globl C_ABI(fp_sub) +.p2align 6 +C_ABI(fp_sub): + + mov $0x7f, %eax + kmovw %eax, %k2 + + vmovdqu64 64*0(felemA), ACC0a + vmovdqu64 64*1(felemA), ACC0b{%k2}{z} + + vmovdqu64 64*0(felemB), ACC2a + vmovdqu64 64*1(felemB), ACC2b{%k2}{z} + + vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a + vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b + + vpsubq ACC2a, ACC0a, ACC0a + vpsubq ACC2b, ACC0b, ACC0b + + jmp fp_normalize + diff --git a/fp2_packed_751_ifma.S b/fp2_packed_751_ifma.S new file mode 100644 index 0000000..80ae629 --- /dev/null +++ b/fp2_packed_751_ifma.S @@ -0,0 +1,523 @@ +#if defined(__APPLE__) +/* OS X's C ABI prefixes functions with underscore. */ +#define C_ABI(x) _ ## x +#define HIDDEN .private_extern +#else +#define C_ABI(x) x +#define HIDDEN .hidden +#endif + +#define ACC0 %zmm0 +#define ACC1 %zmm1 +#define ACC2 %zmm2 +#define ACC3 %zmm3 +#define ACC4 %zmm4 +#define ACC5 %zmm5 +#define ACC6 %zmm6 +#define ACC7 %zmm7 +#define ACC8 %zmm8 +#define ACC9 %zmm9 +#define ACC10 %zmm10 +#define ACC11 %zmm11 +#define ACC12 %zmm12 +#define ACC13 %zmm13 +#define ACC14 %zmm14 +#define ACC15 %zmm15 + +#define A0 %zmm16 +#define A1 %zmm17 +#define A2 %zmm18 +#define A3 %zmm19 +#define A4 %zmm20 +#define A5 %zmm21 +#define A6 %zmm22 +#define A7 %zmm23 +#define A8 %zmm24 +#define A9 %zmm25 +#define A10 %zmm26 +#define A11 %zmm27 +#define A12 %zmm28 +#define A13 %zmm29 +#define A14 %zmm30 + +#define B %zmm31 + +#define rptr %rdi +#define aptr %rsi +#define bptr %rdx + +#define r0ptr %rdi +#define a0ptr %rsi +#define b0ptr %rdx + +#define r1ptr %rcx +#define a1ptr %r8 +#define b1ptr %r9 + +#define hlp %rax + +.p2align 6 +.Lmask: +.Lpoly: +.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff +.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff +.quad 0x0007cc76e3ec9685, 
0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480 +.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0 + +.LpolyX: +.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00 +.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00 +.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000 +.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0 + +.Lperm0: +.quad 0,1,0,1,2,3,2,3 + +.Lperm1: +.quad 4,5,5,4,6,7,7,6 + +// TODO: avoid transposing every call by keeping data vertical throughout + +// Performs 8 field multiplications in parallel +.globl C_ABI(fp2_mul_ifma_x2) +C_ABI(fp2_mul_ifma_x2): + + push %rbp + mov %rsp, %rbp + sub $960, %rsp + and $-64, %rsp + + mov $0x7f, %rax + kmovq %rax, %k5 + + // Load a0[0] + vmovdqu64 0*64(a0ptr), %zmm0 + vmovdqu64 1*64(a0ptr), %zmm1{%k5}{z} + lea 15*8(a0ptr), a0ptr + // Load a0[1] + vmovdqu64 0*64(a0ptr), %zmm2 + vmovdqu64 1*64(a0ptr), %zmm3{%k5}{z} + // Load b0[0] + vmovdqu64 0*64(b0ptr), %zmm4 + vmovdqu64 1*64(b0ptr), %zmm5{%k5}{z} + lea 15*8(b0ptr), b0ptr + // Load b0[1] + vmovdqu64 0*64(b0ptr), %zmm6 + vmovdqu64 1*64(b0ptr), %zmm7{%k5}{z} + // Load a1[0] + vmovdqu64 0*64(a1ptr), %zmm8 + vmovdqu64 1*64(a1ptr), %zmm9{%k5}{z} + lea 15*8(a1ptr), a1ptr + // Load a1[1] + vmovdqu64 0*64(a1ptr), %zmm10 + vmovdqu64 1*64(a1ptr), %zmm11{%k5}{z} + // Load b1[0] + vmovdqu64 0*64(b1ptr), %zmm12 + vmovdqu64 1*64(b1ptr), %zmm13{%k5}{z} + lea 15*8(b1ptr), b1ptr + // Load b1[1] + vmovdqu64 0*64(b1ptr), %zmm14 + vmovdqu64 1*64(b1ptr), %zmm15{%k5}{z} + // Transpose + vpunpcklqdq %zmm2, %zmm0, %zmm16 // 0 0 2 2 4 4 6 6 + vpunpckhqdq %zmm2, %zmm0, %zmm17 // 1 1 3 3 5 5 7 7 + vpunpcklqdq %zmm6, %zmm4, %zmm18 // 0 0 2 2 4 4 6 6 + vpunpckhqdq %zmm6, %zmm4, %zmm19 // 1 1 3 3 5 5 7 7 + vpunpcklqdq %zmm10, %zmm8, %zmm20 // 0 0 2 2 4 4 6 6 + vpunpckhqdq %zmm10, %zmm8, %zmm21 // 1 1 3 3 5 5 7 7 + vpunpcklqdq %zmm14, %zmm12, %zmm22 
// 0 0 2 2 4 4 6 6 + vpunpckhqdq %zmm14, %zmm12, %zmm23 // 1 1 3 3 5 5 7 7 + + vpunpcklqdq %zmm3, %zmm1, %zmm24 // 8 8 10 10 12 12 14 14 + vpunpckhqdq %zmm3, %zmm1, %zmm25 // 9 9 11 11 13 13 15 15 + vpunpcklqdq %zmm7, %zmm5, %zmm26 // 8 8 10 10 12 12 14 14 + vpunpckhqdq %zmm7, %zmm5, %zmm27 // 9 9 11 11 13 13 15 15 + vpunpcklqdq %zmm11, %zmm9, %zmm28 // 8 8 10 10 12 12 14 14 + vpunpckhqdq %zmm11, %zmm9, %zmm29 // 9 9 11 11 13 13 15 15 + vpunpcklqdq %zmm15, %zmm13, %zmm30 // 8 8 10 10 12 12 14 14 + vpunpckhqdq %zmm15, %zmm13, %zmm31 // 9 9 11 11 13 13 15 15 + + vshufi64x2 $0x44, %zmm20, %zmm16, %zmm0 // 0 0 2 2 0 0 2 2 + vshufi64x2 $0x44, %zmm22, %zmm18, %zmm1 // 0 0 2 2 0 0 2 2 + vshufi64x2 $0xee, %zmm20, %zmm16, %zmm2 // 4 4 6 6 4 4 6 6 + vshufi64x2 $0xee, %zmm22, %zmm18, %zmm3 // 4 4 6 6 4 4 6 6 + + vshufi64x2 $0x44, %zmm21, %zmm17, %zmm4 // 1 1 3 3 1 1 3 3 + vshufi64x2 $0x44, %zmm23, %zmm19, %zmm5 // 1 1 3 3 1 1 3 3 + vshufi64x2 $0xee, %zmm21, %zmm17, %zmm6 // 5 5 7 7 5 5 7 7 + vshufi64x2 $0xee, %zmm23, %zmm19, %zmm7 // 5 5 7 7 5 5 7 7 + + vshufi64x2 $0x44, %zmm28, %zmm24, %zmm8 // 8 8 10 10 8 8 10 10 + vshufi64x2 $0x44, %zmm30, %zmm26, %zmm9 // 8 8 10 10 8 8 10 10 + vshufi64x2 $0xee, %zmm28, %zmm24, %zmm10 // 12 12 14 14 12 12 14 14 + vshufi64x2 $0xee, %zmm30, %zmm26, %zmm11 // 12 12 14 14 12 12 14 14 + + vshufi64x2 $0x44, %zmm29, %zmm25, %zmm12 // 9 9 11 11 9 9 11 11 + vshufi64x2 $0x44, %zmm31, %zmm27, %zmm13 // 9 9 11 11 9 9 11 11 + vshufi64x2 $0xee, %zmm29, %zmm25, %zmm14 // 13 13 15 15 13 13 15 15 + vshufi64x2 $0xee, %zmm31, %zmm27, %zmm15 // 13 13 15 15 13 13 15 15 + + vshufi64x2 $0x88, %zmm1, %zmm0, %zmm16 //0 + vshufi64x2 $0x88, %zmm5, %zmm4, %zmm17 //1 + vshufi64x2 $0xdd, %zmm1, %zmm0, %zmm18 // + vshufi64x2 $0xdd, %zmm5, %zmm4, %zmm19 + vshufi64x2 $0x88, %zmm3, %zmm2, %zmm20 + vshufi64x2 $0x88, %zmm7, %zmm6, %zmm21 + vshufi64x2 $0xdd, %zmm3, %zmm2, %zmm22 + vshufi64x2 $0xdd, %zmm7, %zmm6, %zmm23 + vshufi64x2 $0x88, %zmm9, %zmm8, %zmm24 + vshufi64x2 
$0x88, %zmm13, %zmm12, %zmm25 + vshufi64x2 $0xdd, %zmm9, %zmm8, %zmm26 + vshufi64x2 $0xdd, %zmm13, %zmm12, %zmm27 + vshufi64x2 $0x88, %zmm11, %zmm10, %zmm28 + vshufi64x2 $0x88, %zmm15, %zmm14, %zmm29 + vshufi64x2 $0xdd, %zmm11, %zmm10, %zmm30 + + vmovdqa64 .Lperm0(%rip), %zmm31 + vpermq %zmm16, %zmm31, %zmm0 + vpermq %zmm17, %zmm31, %zmm1 + vpermq %zmm18, %zmm31, %zmm2 + vpermq %zmm19, %zmm31, %zmm3 + vpermq %zmm20, %zmm31, %zmm4 + vpermq %zmm21, %zmm31, %zmm5 + vpermq %zmm22, %zmm31, %zmm6 + vpermq %zmm23, %zmm31, %zmm7 + vpermq %zmm24, %zmm31, %zmm8 + vpermq %zmm25, %zmm31, %zmm9 + vpermq %zmm26, %zmm31, %zmm10 + vpermq %zmm27, %zmm31, %zmm11 + vpermq %zmm28, %zmm31, %zmm12 + vpermq %zmm29, %zmm31, %zmm13 + vpermq %zmm30, %zmm31, %zmm14 + + .irp r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14 + vmovdqu64 %zmm\r, \r*64(%rsp) + .endr + + vmovdqa64 .Lperm1(%rip), %zmm31 + vpermq %zmm16, %zmm31, A0 + vpermq %zmm17, %zmm31, A1 + vpermq %zmm18, %zmm31, A2 + vpermq %zmm19, %zmm31, A3 + vpermq %zmm20, %zmm31, A4 + vpermq %zmm21, %zmm31, A5 + vpermq %zmm22, %zmm31, A6 + vpermq %zmm23, %zmm31, A7 + vpermq %zmm24, %zmm31, A8 + vpermq %zmm25, %zmm31, A9 + vpermq %zmm26, %zmm31, A10 + vpermq %zmm27, %zmm31, A11 + vpermq %zmm28, %zmm31, A12 + vpermq %zmm29, %zmm31, A13 + vpermq %zmm30, %zmm31, A14 + + lea (%rsp), bptr + call do_mul_x2 + + // After parallel multiplication the layout is: + // A0[0] * B0[0], A0[1] * B0[1], A0[0] * B0[1], A0[1] * B0[0], A1[0] * B1[0], A1[1] * B1[1], A1[0] * B1[1], A1[1] * B1[0] + // We need to compute: + // A0[0] * B0[0] - A0[1] * B0[1], A0[0] * B0[1] + A0[1] * B0[0], A1[0] * B1[0] - A0[1] * B1[1], A1[0] * B1[1] + A1[1] * B1[0] + vpsrldq $8, ACC0, A0 + vpsrldq $8, ACC1, A1 + vpsrldq $8, ACC2, A2 + vpsrldq $8, ACC3, A3 + vpsrldq $8, ACC4, A4 + vpsrldq $8, ACC5, A5 + vpsrldq $8, ACC6, A6 + vpsrldq $8, ACC7, A7 + vpsrldq $8, ACC8, A8 + vpsrldq $8, ACC9, A9 + vpsrldq $8, ACC10, A10 + vpsrldq $8, ACC11, A11 + vpsrldq $8, ACC12, A12 + vpsrldq $8, ACC13, A13 + 
vpsrldq $8, ACC14, A14 + + mov $0x44, hlp + kmovq hlp, %k7 + + vpaddq A0, ACC0, ACC0{%k7} + vpaddq A1, ACC1, ACC1{%k7} + vpaddq A2, ACC2, ACC2{%k7} + vpaddq A3, ACC3, ACC3{%k7} + vpaddq A4, ACC4, ACC4{%k7} + vpaddq A5, ACC5, ACC5{%k7} + vpaddq A6, ACC6, ACC6{%k7} + vpaddq A7, ACC7, ACC7{%k7} + vpaddq A8, ACC8, ACC8{%k7} + vpaddq A9, ACC9, ACC9{%k7} + vpaddq A10, ACC10, ACC10{%k7} + vpaddq A11, ACC11, ACC11{%k7} + vpaddq A12, ACC12, ACC12{%k7} + vpaddq A13, ACC13, ACC13{%k7} + vpaddq A14, ACC14, ACC14{%k7} + + mov $0x11, hlp + kmovq hlp, %k7 + + vpaddq 0*8+.LpolyX(%rip){1to8}, ACC0, ACC0{%k7} + vpaddq 1*8+.LpolyX(%rip){1to8}, ACC1, ACC1{%k7} + vpaddq 2*8+.LpolyX(%rip){1to8}, ACC2, ACC2{%k7} + vpaddq 3*8+.LpolyX(%rip){1to8}, ACC3, ACC3{%k7} + vpaddq 4*8+.LpolyX(%rip){1to8}, ACC4, ACC4{%k7} + vpaddq 5*8+.LpolyX(%rip){1to8}, ACC5, ACC5{%k7} + vpaddq 6*8+.LpolyX(%rip){1to8}, ACC6, ACC6{%k7} + vpaddq 7*8+.LpolyX(%rip){1to8}, ACC7, ACC7{%k7} + vpaddq 8*8+.LpolyX(%rip){1to8}, ACC8, ACC8{%k7} + vpaddq 9*8+.LpolyX(%rip){1to8}, ACC9, ACC9{%k7} + vpaddq 10*8+.LpolyX(%rip){1to8}, ACC10, ACC10{%k7} + vpaddq 11*8+.LpolyX(%rip){1to8}, ACC11, ACC11{%k7} + vpaddq 12*8+.LpolyX(%rip){1to8}, ACC12, ACC12{%k7} + vpaddq 13*8+.LpolyX(%rip){1to8}, ACC13, ACC13{%k7} + vpaddq 14*8+.LpolyX(%rip){1to8}, ACC14, ACC14{%k7} + + vpsubq A0, ACC0, ACC0{%k7} + vpsubq A1, ACC1, ACC1{%k7} + vpsubq A2, ACC2, ACC2{%k7} + vpsubq A3, ACC3, ACC3{%k7} + vpsubq A4, ACC4, ACC4{%k7} + vpsubq A5, ACC5, ACC5{%k7} + vpsubq A6, ACC6, ACC6{%k7} + vpsubq A7, ACC7, ACC7{%k7} + vpsubq A8, ACC8, ACC8{%k7} + vpsubq A9, ACC9, ACC9{%k7} + vpsubq A10, ACC10, ACC10{%k7} + vpsubq A11, ACC11, ACC11{%k7} + vpsubq A12, ACC12, ACC12{%k7} + vpsubq A13, ACC13, ACC13{%k7} + vpsubq A14, ACC14, ACC14{%k7} + vpsrlq $52, ACC0, B + vpaddq B, ACC1, ACC1 + vpandq .Lpoly(%rip){1to8}, ACC0, ACC0 + + vpsrlq $52, ACC1, B + vpaddq B, ACC2, ACC2 + vpandq .Lpoly(%rip){1to8}, ACC1, ACC1 + + vpsrlq $52, ACC2, B + vpaddq B, ACC3, ACC3 + vpandq 
.Lpoly(%rip){1to8}, ACC2, ACC2 + + vpsrlq $52, ACC3, B + vpaddq B, ACC4, ACC4 + vpandq .Lpoly(%rip){1to8}, ACC3, ACC3 + + vpsrlq $52, ACC4, B + vpaddq B, ACC5, ACC5 + vpandq .Lpoly(%rip){1to8}, ACC4, ACC4 + + vpsrlq $52, ACC5, B + vpaddq B, ACC6, ACC6 + vpandq .Lpoly(%rip){1to8}, ACC5, ACC5 + + vpsrlq $52, ACC6, B + vpaddq B, ACC7, ACC7 + vpandq .Lpoly(%rip){1to8}, ACC6, ACC6 + + vpsrlq $52, ACC7, B + vpaddq B, ACC8, ACC8 + vpandq .Lpoly(%rip){1to8}, ACC7, ACC7 + + vpsrlq $52, ACC8, B + vpaddq B, ACC9, ACC9 + vpandq .Lpoly(%rip){1to8}, ACC8, ACC8 + + vpsrlq $52, ACC9, B + vpaddq B, ACC10, ACC10 + vpandq .Lpoly(%rip){1to8}, ACC9, ACC9 + + vpsrlq $52, ACC10, B + vpaddq B, ACC11, ACC11 + vpandq .Lpoly(%rip){1to8}, ACC10, ACC10 + + vpsrlq $52, ACC11, B + vpaddq B, ACC12, ACC12 + vpandq .Lpoly(%rip){1to8}, ACC11, ACC11 + + vpsrlq $52, ACC12, B + vpaddq B, ACC13, ACC13 + vpandq .Lpoly(%rip){1to8}, ACC12, ACC12 + + vpsrlq $52, ACC13, B + vpaddq B, ACC14, ACC14 + vpandq .Lpoly(%rip){1to8}, ACC13, ACC13 + + vpandq .Lpoly(%rip){1to8}, ACC14, ACC14 + + // Transpose to horizontal + vpunpcklqdq ACC1, ACC0, ACC0 + vpunpcklqdq ACC3, ACC2, ACC1 + vpunpcklqdq ACC5, ACC4, ACC2 + vpunpcklqdq ACC7, ACC6, ACC3 + vpunpcklqdq ACC9, ACC8, ACC4 + vpunpcklqdq ACC11, ACC10, ACC5 + vpunpcklqdq ACC13, ACC12, ACC6 + vmovdqa64 ACC14, ACC7 + + vshufi64x2 $0x44, ACC1, ACC0, A0 + vshufi64x2 $0x44, ACC3, ACC2, A1 + vshufi64x2 $0x44, ACC5, ACC4, A2 + vshufi64x2 $0x44, ACC7, ACC6, A3 + + vshufi64x2 $0xee, ACC1, ACC0, A4 + vshufi64x2 $0xee, ACC3, ACC2, A5 + vshufi64x2 $0xee, ACC5, ACC4, A6 + vshufi64x2 $0xee, ACC7, ACC6, A7 + + vshufi64x2 $0x88, A1, A0, ACC0 + vshufi64x2 $0x88, A3, A2, ACC1 + vshufi64x2 $0xdd, A1, A0, ACC2 + vshufi64x2 $0xdd, A3, A2, ACC3 + + vshufi64x2 $0x88, A5, A4, ACC4 + vshufi64x2 $0x88, A7, A6, ACC5 + vshufi64x2 $0xdd, A5, A4, ACC6 + vshufi64x2 $0xdd, A7, A6, ACC7 + + vmovdqu64 ACC0, 0*64(r0ptr) + vmovdqu64 ACC1, 1*64(r0ptr){%k5} + lea 15*8(r0ptr), r0ptr + vmovdqu64 ACC2, 
0*64(r0ptr) + vmovdqu64 ACC3, 1*64(r0ptr){%k5} + + vmovdqu64 ACC4, 0*64(r1ptr) + vmovdqu64 ACC5, 1*64(r1ptr){%k5} + lea 15*8(r1ptr), r1ptr + vmovdqu64 ACC6, 0*64(r1ptr) + vmovdqu64 ACC7, 1*64(r1ptr){%k5} + + mov %rbp, %rsp + pop %rbp +ret + +// Performs 8 field multiplications in parallel +.globl C_ABI(amm_751_ifma_x2) +C_ABI(amm_751_ifma_x2): + + vmovdqu64 0*64(aptr), A0 + vmovdqu64 1*64(aptr), A1 + vmovdqu64 2*64(aptr), A2 + vmovdqu64 3*64(aptr), A3 + vmovdqu64 4*64(aptr), A4 + vmovdqu64 5*64(aptr), A5 + vmovdqu64 6*64(aptr), A6 + vmovdqu64 7*64(aptr), A7 + vmovdqu64 8*64(aptr), A8 + vmovdqu64 9*64(aptr), A9 + vmovdqu64 10*64(aptr), A10 + vmovdqu64 11*64(aptr), A11 + vmovdqu64 12*64(aptr), A12 + vmovdqu64 13*64(aptr), A13 + vmovdqu64 14*64(aptr), A14 +do_mul_x2: + vpxorq ACC0, ACC0, ACC0 + vpxorq ACC1, ACC1, ACC1 + vpxorq ACC2, ACC2, ACC2 + vpxorq ACC3, ACC3, ACC3 + vpxorq ACC4, ACC4, ACC4 + vpxorq ACC5, ACC5, ACC5 + vpxorq ACC6, ACC6, ACC6 + vpxorq ACC7, ACC7, ACC7 + vpxorq ACC8, ACC8, ACC8 + vpxorq ACC9, ACC9, ACC9 + vpxorq ACC10, ACC10, ACC10 + vpxorq ACC11, ACC11, ACC11 + vpxorq ACC12, ACC12, ACC12 + vpxorq ACC13, ACC13, ACC13 + vpxorq ACC14, ACC14, ACC14 + vpxorq ACC15, ACC15, ACC15 + + mov $15, hlp + +1: + vmovdqu64 (bptr), B + lea 1*64(bptr), bptr + + vpmadd52luq A0, B, ACC0 + vpmadd52luq A1, B, ACC1 + vpmadd52luq A2, B, ACC2 + vpmadd52luq A3, B, ACC3 + vpmadd52luq A4, B, ACC4 + vpmadd52luq A5, B, ACC5 + vpmadd52luq A6, B, ACC6 + vpmadd52luq A7, B, ACC7 + vpmadd52luq A8, B, ACC8 + vpmadd52luq A9, B, ACC9 + vpmadd52luq A10, B, ACC10 + vpmadd52luq A11, B, ACC11 + vpmadd52luq A12, B, ACC12 + vpmadd52luq A13, B, ACC13 + vpmadd52luq A14, B, ACC14 + + vpmadd52huq A0, B, ACC1 + vpmadd52huq A1, B, ACC2 + vpmadd52huq A2, B, ACC3 + vpmadd52huq A3, B, ACC4 + vpmadd52huq A4, B, ACC5 + vpmadd52huq A5, B, ACC6 + vpmadd52huq A6, B, ACC7 + vpmadd52huq A7, B, ACC8 + vpmadd52huq A8, B, ACC9 + vpmadd52huq A9, B, ACC10 + vpmadd52huq A10, B, ACC11 + vpmadd52huq A11, B, ACC12 + 
vpmadd52huq A12, B, ACC13 + vpmadd52huq A13, B, ACC14 + vpmadd52huq A14, B, ACC15 + + vmovdqa64 ACC0, B + + vpmadd52luq 0*8 + .Lpoly(%rip){1to8}, B, ACC0 + vpsrlq $52, ACC0, ACC0 + vpmadd52luq 1*8 + .Lpoly(%rip){1to8}, B, ACC1 + vpaddq ACC1, ACC0, ACC0 + vpmadd52luq 2*8 + .Lpoly(%rip){1to8}, B, ACC2 + vmovdqa64 ACC2, ACC1 + vpmadd52luq 3*8 + .Lpoly(%rip){1to8}, B, ACC3 + vmovdqa64 ACC3, ACC2 + vpmadd52luq 4*8 + .Lpoly(%rip){1to8}, B, ACC4 + vmovdqa64 ACC4, ACC3 + vpmadd52luq 5*8 + .Lpoly(%rip){1to8}, B, ACC5 + vmovdqa64 ACC5, ACC4 + vpmadd52luq 6*8 + .Lpoly(%rip){1to8}, B, ACC6 + vmovdqa64 ACC6, ACC5 + vpmadd52luq 7*8 + .Lpoly(%rip){1to8}, B, ACC7 + vmovdqa64 ACC7, ACC6 + vpmadd52luq 8*8 + .Lpoly(%rip){1to8}, B, ACC8 + vmovdqa64 ACC8, ACC7 + vpmadd52luq 9*8 + .Lpoly(%rip){1to8}, B, ACC9 + vmovdqa64 ACC9, ACC8 + vpmadd52luq 10*8 + .Lpoly(%rip){1to8}, B, ACC10 + vmovdqa64 ACC10, ACC9 + vpmadd52luq 11*8 + .Lpoly(%rip){1to8}, B, ACC11 + vmovdqa64 ACC11, ACC10 + vpmadd52luq 12*8 + .Lpoly(%rip){1to8}, B, ACC12 + vmovdqa64 ACC12, ACC11 + vpmadd52luq 13*8 + .Lpoly(%rip){1to8}, B, ACC13 + vmovdqa64 ACC13, ACC12 + vpmadd52luq 14*8 + .Lpoly(%rip){1to8}, B, ACC14 + vmovdqa64 ACC14, ACC13 + vmovdqa64 ACC15, ACC14 + vpxorq ACC15, ACC15, ACC15 + + vpmadd52huq 0*8 + .Lpoly(%rip){1to8}, B, ACC0 + vpmadd52huq 1*8 + .Lpoly(%rip){1to8}, B, ACC1 + vpmadd52huq 2*8 + .Lpoly(%rip){1to8}, B, ACC2 + vpmadd52huq 3*8 + .Lpoly(%rip){1to8}, B, ACC3 + vpmadd52huq 4*8 + .Lpoly(%rip){1to8}, B, ACC4 + vpmadd52huq 5*8 + .Lpoly(%rip){1to8}, B, ACC5 + vpmadd52huq 6*8 + .Lpoly(%rip){1to8}, B, ACC6 + vpmadd52huq 7*8 + .Lpoly(%rip){1to8}, B, ACC7 + vpmadd52huq 8*8 + .Lpoly(%rip){1to8}, B, ACC8 + vpmadd52huq 9*8 + .Lpoly(%rip){1to8}, B, ACC9 + vpmadd52huq 10*8 + .Lpoly(%rip){1to8}, B, ACC10 + vpmadd52huq 11*8 + .Lpoly(%rip){1to8}, B, ACC11 + vpmadd52huq 12*8 + .Lpoly(%rip){1to8}, B, ACC12 + vpmadd52huq 13*8 + .Lpoly(%rip){1to8}, B, ACC13 + vpmadd52huq 14*8 + .Lpoly(%rip){1to8}, B, ACC14 + + dec hlp + jnz 
1b + + ret \ No newline at end of file diff --git a/fp_751_ifma.S b/fp_751_ifma.S new file mode 100644 index 0000000..21d7ccb --- /dev/null +++ b/fp_751_ifma.S @@ -0,0 +1,268 @@ +#if defined(__APPLE__) +/* OS X's C ABI prefixes functions with underscore. */ +#define C_ABI(x) _ ## x +#define HIDDEN .private_extern +#else +#define C_ABI(x) x +#define HIDDEN .hidden +#endif + +.p2align 6 +.LpermMask0: +.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25 +.LshiftMask0: +.quad 0,4,8,12,0,4,8,12 +.LandMask: +.quad 0xfffffffffffff + +.p2align 6 +.Lpoly: +.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff +.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff +.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480 +.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0 + +.LR2: +.quad 0x000dad40589641fd, 0x000452a233046449, 0x000edb010161a696, 0x00036941472e3fd8 +.quad 0x000e2082a2e7065e, 0x000904f8751f40bf, 0x0007fc814932cca8, 0x00033f174b08b2ee +.quad 0x0009814efb9f1375, 0x00099594a1afe512, 0x00043c75310de66d, 0x000197021a5b37b0 +.quad 0x000cc1a272e73959, 0x000a733d7c97cd76, 0x0000000000292ee8, 0 + +.Lone: +.quad 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0 + +.globl C_ABI(norm2red) +.p2align 6 +C_ABI(norm2red): + mov $0x3FFFFF, %eax + kmovd %eax, %k1 + mov $0x7F, %eax + kmovd %eax, %k2 + + vmovdqa64 .LpermMask0(%rip), %zmm0 + vmovdqa64 .LshiftMask0(%rip), %zmm1 + vpbroadcastq .LandMask(%rip), %zmm10 + + vpermw 52*0(%rsi), %zmm0, %zmm2 + vmovdqu16 52*1(%rsi), %zmm3{%k1}{z} + vpermw %zmm3, %zmm0, %zmm3 + + vpsrlvq %zmm1, %zmm2, %zmm2 + vpsrlvq %zmm1, %zmm3, %zmm3 + vpsrlvq %zmm1, %zmm4, %zmm4 + + vpandq %zmm10, %zmm2, %zmm2 + vpandq %zmm10, %zmm3, %zmm3 + vpandq %zmm10, %zmm4, %zmm4 + + vmovdqu64 %zmm2, 64*0(%rdi) + vmovdqu64 %zmm3, 64*1(%rdi){%k2} + ret + + +#define res %rdi // uint64_t *rp, +#define a0 %rsi // const uint64_t *ap, 
+#define bpi %rdx // const uint64_t *bptr, +#define m0 %rcx + +#define b_ptr %rax + +#define acc0 %r9 + +#define itr %r10 +#define t0 %r11 +#define t1 %r12 +#define t2 %r13 + +#define A0 %zmm0 +#define A1 %zmm1 + +#define M0 %zmm2 +#define M1 %zmm3 + +#define ACC0 %zmm4 +#define ACC0_xmm %xmm4 +#define ACC1 %zmm5 + +#define Y_curr %zmm6 +#define Y_prev %zmm7 +#define B_curr %zmm8 +#define B_prev %zmm9 + +#define TMP %zmm10 +#define TMP_xmm %xmm10 + +#define ZERO %zmm11 +#define AND_MASK %zmm12 + +#define ACC0b %zmm13 +#define ACC1b %zmm14 + +############################################################################### +.globl C_ABI(to_mont_ifma) +.p2align 6 +C_ABI(to_mont_ifma): + leaq .LR2(%rip), bpi + jmp C_ABI(fp_mul_ifma) +############################################################################### +.globl C_ABI(from_mont_ifma) +.p2align 6 +C_ABI(from_mont_ifma): + leaq .Lone(%rip), bpi + jmp C_ABI(fp_mul_ifma) +############################################################################### +.globl C_ABI(fp_mul_ifma) +.p2align 6 +C_ABI(fp_mul_ifma): + + push %rbx + push %r12 + push %r13 + + mov bpi, b_ptr + + mov $1, t0 + mov $0x3f, t1 + kmovq t0, %k1 + kmovq t1, %k2 + + vpbroadcastq .LandMask(%rip), AND_MASK + vpxorq ZERO, ZERO, ZERO + + # Load operands A into registers. 
A[0] is stored in ALU register, in order to compensate for the latency of IFMA when computing (A*B)[0] * K0 + vmovdqu64 8*1+64*0(a0), A0 + vmovdqu64 8*1+64*1(a0), A1{%k2}{z} + mov 8*0(a0), a0 + + # Load the modulii + mov .Lpoly(%rip), m0 + vmovdqu64 8*1+64*0+.Lpoly(%rip), M0 + vmovdqu64 8*1+64*1+.Lpoly(%rip), M1{%k2}{z} + + # Prepare the accumulators + vpxorq ACC0, ACC0, ACC0 + vpxorq ACC1, ACC1, ACC1 + vpxorq B_curr, B_curr, B_curr + vpxorq Y_curr, Y_curr, Y_curr + xor acc0, acc0 + + mov $15, itr +1: + vpxorq ACC0b, ACC0b, ACC0b + vpxorq ACC1b, ACC1b, ACC1b + + # High multiplications + vpmadd52huq B_curr, A0, ACC0b + vpmadd52huq B_curr, A1, ACC1b + + vpmadd52huq Y_curr, M0, ACC0b + vpmadd52huq Y_curr, M1, ACC1b + + # Shift the ACC in zmms right by a word + valignq $1, ACC0, ACC1, ACC0 + valignq $1, ACC1, ZERO, ACC1 + mov a0, %rdx + + mulx (b_ptr), t0, t2 + add t0, acc0 + adc $0, t2 + + mov acc0, %rdx + and .LandMask(%rip), %rdx + + vpbroadcastq %rdx, Y_curr + vpbroadcastq (b_ptr), B_curr + + mulx m0, t0, t1 + add t0, acc0 + adc t1, t2 + + shrd $52, t2, acc0 + + # Low multiplications + vpmadd52luq B_curr, A0, ACC0b + vpmadd52luq B_curr, A1, ACC1b + + vpmadd52luq Y_curr, M0, ACC0 + vpmadd52luq Y_curr, M1, ACC1 + + vpaddq ACC0b, ACC0, ACC0 + vpaddq ACC1b, ACC1, ACC1 + + vmovq ACC0_xmm, t0 + add t0, acc0 + + lea 8(b_ptr), b_ptr + dec itr + jne 1b + + vmovq acc0, TMP_xmm + + vmovdqa64 TMP, ACC0{%k1} + + valignq $7, A0, A1, A1 + valignq $7, ZERO, A0, A0 + + valignq $7, M0, M1, M1 + valignq $7, ZERO, M0, M0 + + # The last high multiplications + vpmadd52huq B_curr, A0, ACC0 + vpmadd52huq B_curr, A1, ACC1 + + vpmadd52huq Y_curr, M0, ACC0 + vpmadd52huq Y_curr, M1, ACC1 + + # Now 'normalize' the result to 52 bit words + vpsrlq $52, ACC0, A0 + vpsrlq $52, ACC1, A1 + + vpandq AND_MASK, ACC0, ACC0 + vpandq AND_MASK, ACC1, ACC1 + + valignq $7, A0, A1, A1 + valignq $7, ZERO, A0, A0 + + vpaddq A0, ACC0, ACC0 + vpaddq A1, ACC1, ACC1 + + vpcmpuq $1, A0, ACC0, %k1 + vpcmpuq $1, A1, 
ACC1, %k2 + + kmovb %k1, %eax + kmovb %k2, %ebx + + add %al, %al + adc %cl, %cl + + vpcmpuq $0, AND_MASK, ACC0, %k1 + vpcmpuq $0, AND_MASK, ACC1, %k2 + + kmovb %k1, %r8d + kmovb %k2, %r9d + + add %r8b, %al + adc %r9b, %bl + + xor %r8b, %al + xor %r9b, %bl + + kmovb %eax, %k1 + kmovb %ebx, %k2 + + vpsubq AND_MASK, ACC0, ACC0{%k1} + vpsubq AND_MASK, ACC1, ACC1{%k2} + + vpandq AND_MASK, ACC0, ACC0 + vpandq AND_MASK, ACC1, ACC1 + + mov $0x7f, t0 + kmovq t0, %k1 + + vmovdqu64 ACC0, 64*0(res) + vmovdqu64 ACC1, 64*1(res){%k1} + bail: + pop %r13 + pop %r12 + pop %rbx + ret diff --git a/main.c b/main.c new file mode 100644 index 0000000..3e85e83 --- /dev/null +++ b/main.c @@ -0,0 +1,218 @@ +#include +#include +#include + +#include "./sidh_ref/P751_internal.h" + +#include "measurements.h" + +#ifndef PRIME_BITS +#define PRIME_BITS 751 +#endif + +#define DIGITS_64 ((PRIME_BITS + 63) / 64) +#define DIGITS_52 ((PRIME_BITS + 51) / 52) + +#define OALICE_BITS 372 +#define OBOB_BITS 379 +#define SECRETKEY_A_BYTES (OALICE_BITS + 7) / 8 +#define SECRETKEY_B_BYTES (OBOB_BITS + 7) / 8 + +#define MASK_ALICE 0x0F +#define MASK_BOB 0x03 + +typedef uint64_t num52[DIGITS_52]; +typedef num52 felem[2]; + +void fp2_mul_ifma(felem res, felem a, felem b); +void fp2_sqr_ifma(felem res, felem a); +void fp2_mul_ifma_x2(felem res1, const felem a1, const felem b1, felem res2, const felem a2, const felem b2); +void fp_mul_ifma(uint64_t *rp, const uint64_t *ap, const uint64_t *bp); +void to_mont_ifma(uint64_t *rp, const uint64_t *ap); +void from_mont_ifma(uint64_t *rp, const uint64_t *ap); + +void red2norm(uint64_t out[12], const uint64_t in[15]); +void norm2red(uint64_t *res, const uint64_t *a); + +int EphemeralKeyGeneration_A_ifma(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA); +int EphemeralKeyGeneration_B_ifma(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB); + +int rdrand64_step(uint64_t *rand) +{ + unsigned char ok; + __asm__ volatile("rdrand %0; setc %1" + : "=r"(*rand), 
"=qm"(ok)); + return (int)ok; +} + +static void rand_750(uint64_t out[DIGITS_64]) +{ + for (int i = 0; i < DIGITS_64; i++) + { + while (!rdrand64_step((uint64_t *)&out[i])) + ; + } + + out[DIGITS_64 - 1] &= ((1ULL << (PRIME_BITS - 64 * (DIGITS_64 - 1))) - 1); +} + +static void rand_bytes(uint8_t *out, size_t out_len) +{ + uint64_t temp; + for (int i = 0; i < out_len; i++) + { + while (!rdrand64_step((uint64_t *)&temp)) + ; + out[i] = temp; + } +} + +int main() +{ + + int i; + + do + { + felm_t fa, fb, fr; + num52 r, a, b; + uint64_t res_ifma[DIGITS_64]; + + rand_750(fa); + rand_750(fb); + + norm2red(a, (uint64_t *)fa); + norm2red(b, (uint64_t *)fb); + to_mont_ifma(a, a); + to_mont_ifma(b, b); + + MEASURE({ fp_mul_ifma(r, a, b); }); + + from_mont_ifma(r, r); + red2norm(res_ifma, r); + printf("Mont mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk); + + to_mont(fa, fa); + to_mont(fb, fb); + MEASURE({ fpmul751_mont(fa, fb, fr); }); + from_mont(fr, fr); + printf("Mont mul ref Cycles/op: %.0f\n", RDTSC_total_clk); + + printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? 
"FP MUL Fail" + : "FP MUL Success"); + } while (0); + + do + { + felem a, b, r, r2; + f2elm_t fa, fb, fr; + uint64_t res_ifma[2][DIGITS_64]; + + rand_750(fa[0]); + rand_750(fa[1]); + rand_750(fb[0]); + rand_750(fb[1]); + norm2red(a[0], (uint64_t *)fa[0]); + norm2red(a[1], (uint64_t *)fa[1]); + norm2red(b[0], (uint64_t *)fb[0]); + norm2red(b[1], (uint64_t *)fb[1]); + to_mont_ifma(a[0], a[0]); + to_mont_ifma(a[1], a[1]); + to_mont_ifma(b[0], b[0]); + to_mont_ifma(b[1], b[1]); + + MEASURE({ fp2_mul_ifma(r, a, b); }); + + from_mont_ifma(r[0], r[0]); + from_mont_ifma(r[1], r[1]); + red2norm(res_ifma[0], r[0]); + red2norm(res_ifma[1], r[1]); + printf("Mont FP2 mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk); + + to_mont(fa[0], fa[0]); + to_mont(fa[1], fa[1]); + to_mont(fb[0], fb[0]); + to_mont(fb[1], fb[1]); + + MEASURE({ fp2mul751_mont(fa, fb, fr); }); + + from_mont(fr[0], fr[0]); + from_mont(fr[1], fr[1]); + printf("Mont FP2 mul ref Cycles/op: %.0f\n", RDTSC_total_clk); + + printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP2 MUL Fail" + : "FP2 MUL Success"); + + MEASURE({ fp2_mul_ifma_x2(r, a, b, r2, a, b); }); + + from_mont_ifma(r[0], r[0]); + from_mont_ifma(r[1], r[1]); + red2norm(res_ifma[0], r[0]); + red2norm(res_ifma[1], r[1]); + + printf("Dual Mont FP2 mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk); + + printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "Dual FP2 MUL 1/2 Fail" + : "Dual FP2 MUL 1/2 Success"); + + from_mont_ifma(r2[0], r2[0]); + from_mont_ifma(r2[1], r2[1]); + red2norm(res_ifma[0], r2[0]); + red2norm(res_ifma[1], r2[1]); + + printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? 
"Dual FP2 MUL 2/2 Fail" + : "Dual FP2 MUL 2/2 Success"); + + MEASURE({ fp2_sqr_ifma(r, a); }); + + from_mont_ifma(r[0], r[0]); + from_mont_ifma(r[1], r[1]); + red2norm(res_ifma[0], r[0]); + red2norm(res_ifma[1], r[1]); + printf("Mont FP2 sqr IFMA Cycles/op: %.0f\n", RDTSC_total_clk); + + MEASURE({ fp2sqr751_mont(fa, fr); }); + + from_mont(fr[0], fr[0]); + from_mont(fr[1], fr[1]); + printf("Mont FP2 sqr ref Cycles/op: %.0f\n", RDTSC_total_clk); + + printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP2 SQR Fail" + : "FP2 SQR Success"); + + } while (0); + + do + { + unsigned char ephemeralsk_alice[SECRETKEY_A_BYTES]; + unsigned char ephemeralsk_bob[SECRETKEY_B_BYTES]; + unsigned char ct1[564] = {0}; + unsigned char ct2[564] = {0}; + rand_bytes(ephemeralsk_alice, sizeof(ephemeralsk_alice)); + rand_bytes(ephemeralsk_bob, sizeof(ephemeralsk_bob)); + ephemeralsk_alice[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; + ephemeralsk_bob[SECRETKEY_B_BYTES - 1] &= MASK_BOB; + + MEASURE({ EphemeralKeyGeneration_A(ephemeralsk_alice, ct1); }); + + printf("Ref EphemeralKeyGeneration_A Cycles/op: %.0f\n", RDTSC_total_clk); + + MEASURE({ EphemeralKeyGeneration_A_ifma(ephemeralsk_alice, ct2); }); + + printf("IFMA EphemeralKeyGeneration_A Cycles/op: %.0f\n", RDTSC_total_clk); + + printf("%s\n", memcmp(ct1, ct2, sizeof(ct1)) ? "EphemeralKeyGeneration_A Fail" + : "EphemeralKeyGeneration_A Success"); + + MEASURE({ EphemeralKeyGeneration_B(ephemeralsk_bob, ct1); }); + + printf("Ref EphemeralKeyGeneration_B Cycles/op: %.0f\n", RDTSC_total_clk); + + MEASURE({ EphemeralKeyGeneration_B_ifma(ephemeralsk_bob, ct2); }); + + printf("IFMA EphemeralKeyGeneration_B Cycles/op: %.0f\n", RDTSC_total_clk); + + printf("%s\n", memcmp(ct1, ct2, sizeof(ct1)) ? 
"EphemeralKeyGeneration_B Fail" + : "EphemeralKeyGeneration_B Success"); + } while (0); +} diff --git a/measurements.h b/measurements.h new file mode 100644 index 0000000..184a2f3 --- /dev/null +++ b/measurements.h @@ -0,0 +1,52 @@ + +#ifndef MEASURE_H +#define MEASURE_H + +#ifndef REPEAT +#define REPEAT 100 +#endif + +#ifndef OUTER_REPEAT +#define OUTER_REPEAT 10 +#endif + +#ifndef WARMUP +#define WARMUP REPEAT / 4 +#endif + +unsigned long long RDTSC_start_clk, RDTSC_end_clk; +double RDTSC_total_clk; +double RDTSC_TEMP_CLK; +int RDTSC_MEASURE_ITERATOR; +int RDTSC_OUTER_ITERATOR; + +inline static unsigned long get_Clks(void) +{ + unsigned hi, lo; + __asm__ __volatile__("rdtscp\n\t" + : "=a"(lo), "=d"(hi)::"rcx"); + return ((unsigned long)lo) ^ (((unsigned long)hi) << 32); +} + +#define RDTSC_MEASURE(x) \ + for (RDTSC_MEASURE_ITERATOR = 0; RDTSC_MEASURE_ITERATOR < WARMUP; RDTSC_MEASURE_ITERATOR++) \ + { \ + {x}; \ + } \ + RDTSC_total_clk = 1.7976931348623157e+308; \ + for (RDTSC_OUTER_ITERATOR = 0; RDTSC_OUTER_ITERATOR < OUTER_REPEAT; RDTSC_OUTER_ITERATOR++) \ + { \ + RDTSC_start_clk = get_Clks(); \ + for (RDTSC_MEASURE_ITERATOR = 0; RDTSC_MEASURE_ITERATOR < REPEAT; RDTSC_MEASURE_ITERATOR++) \ + { \ + {x}; \ + } \ + RDTSC_end_clk = get_Clks(); \ + RDTSC_TEMP_CLK = (double)(RDTSC_end_clk - RDTSC_start_clk) / REPEAT; \ + if (RDTSC_total_clk > RDTSC_TEMP_CLK) \ + RDTSC_total_clk = RDTSC_TEMP_CLK; \ + } + +#define MEASURE(x) RDTSC_MEASURE(x) + +#endif diff --git a/sidh_ref/P751.c b/sidh_ref/P751.c new file mode 100644 index 0000000..e1924ba --- /dev/null +++ b/sidh_ref/P751.c @@ -0,0 +1,122 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny parameters and generation of functions for P751 +*********************************************************************************************/ + +#include "P751_internal.h" + + +// Encoding of field 
elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: +// -------------------------------------------------------------------------------------------------- +// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). +// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. +// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. +// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. +// For example, a 751-bit field element is represented with Ceil(751 / 64) = 12 64-bit digits or Ceil(751 / 32) = 24 32-bit digits. + +// +// Curve isogeny system "SIDHp751". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p751^2), where A=0, B=1, C=1 and p751 = 2^372*3^239-1 +// + +const uint64_t p751[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEEAFFFFFFFFFFFFF, + 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; +const uint64_t p751p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000, + 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; +const uint64_t p751x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xDD5FFFFFFFFFFFFF, + 0xC7D92D0A93F0F151, 0xB52B363427EF98ED, 0x109D30CFADD7D0ED, 0x0AC56A08B964AE90, 0x1C25213F2F75B8CD, 0x0000DFCBAA83EE38 }; +// Order of Alice's subgroup +const uint64_t Alice_order[NWORDS64_ORDER] = { 
0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; +// Order of Bob's subgroup +const uint64_t Bob_order[NWORDS64_ORDER] = { 0xC968549F878A8EEB, 0x59B1A13F7CC76E3E, 0xE9867D6EBE876DA9, 0x2B5045CB25748084, 0x2909F97BADC66856, 0x06FE5D541F71C0E1 }; +// Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p751^2), expressed in Montgomery representation +const uint64_t A_gen[5 * NWORDS64_FIELD] = { 0xC2FC08CEAB50AD8B, 0x1D7D710F55E457B1, 0xE8738D92953DCD6E, 0xBAA7EBEE8A3418AA, 0xC9A288345F03F46F, 0xC8D18D167CFE2616, + 0x02043761F6B1C045, 0xAA1975E13180E7E9, 0x9E13D3FDC6690DE6, 0x3A024640A3A3BB4F, 0x4E5AD44E6ACBBDAE, 0x0000544BEB561DAD, // XPA0 + 0xE6CC41D21582E411, 0x07C2ECB7C5DF400A, 0xE8E34B521432AEC4, 0x50761E2AB085167D, 0x032CFBCAA6094B3C, 0x6C522F5FDF9DDD71, + 0x1319217DC3A1887D, 0xDC4FB25803353A86, 0x362C8D7B63A6AB09, 0x39DCDFBCE47EA488, 0x4C27C99A2C28D409, 0x00003CB0075527C4, // XPA1 + 0xD56FE52627914862, 0x1FAD60DC96B5BAEA, 0x01E137D0BF07AB91, 0x404D3E9252161964, 0x3C5385E4CD09A337, 0x4476426769E4AF73, + 0x9790C6DB989DFE33, 0xE06E1C04D2AA8B5E, 0x38C08185EDEA73B9, 0xAA41F678A4396CA6, 0x92B9259B2229E9A0, 0x00002F9326818BE0, // XQA0 + 0x0BB84441DFFD19B3, 0x84B4DEA99B48C18E, 0x692DE648AD313805, 0xE6D72761B6DFAEE0, 0x223975C672C3058D, 0xA0FDE0C3CBA26FDC, + 0xA5326132A922A3CA, 0xCA5E7F5D5EA96FA4, 0x127C7EFE33FFA8C6, 0x4749B1567E2A23C4, 0x2B7DF5B4AF413BFA, 0x0000656595B9623C, // XRA0 + 0xED78C17F1EC71BE8, 0xF824D6DF753859B1, 0x33A10839B2A8529F, 0xFC03E9E25FDEA796, 0xC4708A8054DF1762, 0x4034F2EC034C6467, + 0xABFB70FBF06ECC79, 0xDABE96636EC108B7, 0x49CBCFB090605FD3, 0x20B89711819A45A7, 0xFB8E1590B2B0F63E, 0x0000556A5F964AB2 }; // XRA1 +// Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p751^2), expressed in Montgomery representation +const uint64_t B_gen[5 * NWORDS64_FIELD] = { 0xCFB6D71EF867AB0B, 0x4A5FDD76E9A45C76, 0x38B1EE69194B1F03, 0xF6E7B18A7761F3F0, 
0xFCF01A486A52C84C, 0xCBE2F63F5AA75466, + 0x6487BCE837B5E4D6, 0x7747F5A8C622E9B8, 0x4CBFE1E4EE6AEBBA, 0x8A8616A13FA91512, 0x53DB980E1579E0A5, 0x000058FEBFF3BE69, // XPB0 + 0xA492034E7C075CC3, 0x677BAF00B04AA430, 0x3AAE0C9A755C94C8, 0x1DC4B064E9EBB08B, 0x3684EDD04E826C66, 0x9BAA6CB661F01B22, + 0x20285A00AD2EFE35, 0xDCE95ABD0497065F, 0x16C7FBB3778E3794, 0x26B3AC29CEF25AAF, 0xFB3C28A31A30AC1D, 0x000046ED190624EE, // XPB1 + 0xF1A8C9ED7B96C4AB, 0x299429DA5178486E, 0xEF4926F20CD5C2F4, 0x683B2E2858B4716A, 0xDDA2FBCC3CAC3EEB, 0xEC055F9F3A600460, + 0xD5A5A17A58C3848B, 0x4652D836F42EAED5, 0x2F2E71ED78B3A3B3, 0xA771C057180ADD1D, 0xC780A5D2D835F512, 0x0000114EA3B55AC1, // XQB0 + 0x1C0D6733769D0F31, 0xF084C3086E2659D1, 0xE23D5DA27BCBD133, 0xF38EC9A8D5864025, 0x6426DC781B3B645B, 0x4B24E8E3C9FB03EE, + 0x6432792F9D2CEA30, 0x7CC8E8B1AE76E857, 0x7F32BFB626BB8963, 0xB9F05995B48D7B74, 0x4D71200A7D67E042, 0x0000228457AF0637, // XRB0 + 0x4AE37E7D8F72BD95, 0xDD2D504B3E993488, 0x5D14E7FA1ECB3C3E, 0x127610CEB75D6350, 0x255B4B4CAC446B11, 0x9EA12336C1F70CAF, + 0x79FA68A2147BC2F8, 0x11E895CFDADBBC49, 0xE4B9D3C4D6356C18, 0x44B25856A67F951C, 0x5851541F61308D0B, 0x00002FFD994F7E4C }; // XRB1 +// Montgomery constant Montgomery_R2 = (2^768)^2 mod p751 +const uint64_t Montgomery_R2[NWORDS64_FIELD] = { 0x233046449DAD4058, 0xDB010161A696452A, 0x5E36941472E3FD8E, 0xF40BFE2082A2E706, 0x4932CCA8904F8751 ,0x1F735F1F1EE7FC81, + 0xA24F4D80C1048E18, 0xB56C383CCDB607C5, 0x441DD47B735F9C90, 0x5673ED2C6A6AC82A, 0x06C905261132294B, 0x000041AD830F1F35 }; +// Value one in Montgomery representation +const uint64_t Montgomery_one[NWORDS64_FIELD] = { 0x00000000000249ad, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8310000000000000, + 0x5527b1e4375c6c66, 0x697797bf3f4f24d0, 0xc89db7b2ac5c4e2e, 0x4ca4b439d2076956, 0x10f7926c7512c7e9, 0x00002d5b24bce5e2 }; +// Value (2^384)^2 mod 3^239 +const uint64_t Montgomery_Rprime[NWORDS64_ORDER] = { 0x1A55482318541298, 0x070A6370DFA12A03, 
0xCB1658E0E3823A40, 0xB3B7384EB5DEF3F9, 0xCBCA952F7006EA33, 0x00569EF8EC94864C }; +// Value -(3^239)^-1 mod 2^384 +const uint64_t Montgomery_rprime[NWORDS64_ORDER] = { 0x48062A91D3AB563D, 0x6CE572751303C2F5, 0x5D1319F3F160EC9D, 0xE35554E8C2D5623A, 0xCA29300232BC79A5, 0x8AAD843D646D78C5 }; +// Value order_Bob/3 mod p751 +const uint64_t Border_div3[NWORDS_ORDER] = { 0xEDCD718A828384F9, 0x733B35BFD4427A14, 0xF88229CF94D7CF38, 0x63C56C990C7C2AD6, 0xB858A87E8F4222C7, 0x0254C9C6B525EAF5 }; + + +// Fixed parameters for isogeny tree computation +const unsigned int strat_Alice[MAX_Alice-1] = { +80, 48, 27, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, +1, 3, 2, 1, 1, 1, 1, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, +1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, +1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, +33, 20, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, +1, 1, 8, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, +1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 }; + +const unsigned int strat_Bob[MAX_Bob-1] = { +112, 63, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, +1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, +1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 31, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, +1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, +2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 49, 31, 16, 8, 4, 2, +1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, +15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, +1, 1, 1, 21, 12, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 3, 2, 1, 1, 1, 1, +2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1 }; + +// Setting up macro defines and including 
GF(p), GF(p^2), curve, isogeny and kex functions + +#define fpcopy fpcopy751 +#define fpzero fpzero751 +#define fpadd fpadd751 +#define fpsub fpsub751 +#define fpneg fpneg751 +#define fpdiv2 fpdiv2_751 +#define fpcorrection fpcorrection751 +#define fpmul_mont fpmul751_mont +#define fpsqr_mont fpsqr751_mont +#define fpinv_mont fpinv751_mont +#define fpinv_chain_mont fpinv751_chain_mont +#define fpinv_mont_bingcd fpinv751_mont_bingcd +#define fp2copy fp2copy751 +#define fp2zero fp2zero751 +#define fp2add fp2add751 +#define fp2sub fp2sub751 +#define fp2neg fp2neg751 +#define fp2div2 fp2div2_751 +#define fp2correction fp2correction751 +#define fp2mul_mont fp2mul751_mont +#define fp2sqr_mont fp2sqr751_mont +#define fp2inv_mont fp2inv751_mont +#define fp2inv_mont_bingcd fp2inv751_mont_bingcd +#define fpequal_non_constant_time fpequal751_non_constant_time +#define mp_add_asm mp_add751_asm +#define mp_addx2_asm mp_add751x2_asm +#define mp_subx2_asm mp_sub751x2_asm + +#include "fpx.c" +#include "ec_isogeny.c" +#include "sidh.c" +#include "sike.c" \ No newline at end of file diff --git a/sidh_ref/P751_internal.h b/sidh_ref/P751_internal.h new file mode 100644 index 0000000..6517151 --- /dev/null +++ b/sidh_ref/P751_internal.h @@ -0,0 +1,255 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: internal header file for P751 +*********************************************************************************************/ + +#ifndef __P751_INTERNAL_H__ +#define __P751_INTERNAL_H__ + +#include "api.h" + +#define NWORDS_FIELD 12 // Number of words of a 751-bit field element +#define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1 + +// Basic constants + +#define NBITS_FIELD 751 +#define MAXBITS_FIELD 768 +#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. 
number of words to represent field elements +#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 751-bit field element +#define NBITS_ORDER 384 +#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. +#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 384-bit element +#define MAXBITS_ORDER NBITS_ORDER +#define MAXWORDS_ORDER ((MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB]. +#define ALICE 0 +#define BOB 1 +#define OALICE_BITS 372 +#define OBOB_BITS 379 +#define OBOB_EXPON 239 +#define MASK_ALICE 0x0F +#define MASK_BOB 0x03 +#define PRIME p751 +#define PARAM_A 0 +#define PARAM_C 1 +// Fixed parameters for isogeny tree computation +#define MAX_INT_POINTS_ALICE 8 +#define MAX_INT_POINTS_BOB 10 +#define MAX_Alice 186 +#define MAX_Bob 239 +#define MSG_BYTES 32 +#define SECRETKEY_A_BYTES (OALICE_BITS + 7) / 8 +#define SECRETKEY_B_BYTES (OBOB_BITS + 7) / 8 +#define FP2_ENCODED_BYTES 2 * ((NBITS_FIELD + 7) / 8) + +// SIDH's basic element definitions and point representations + +typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 751-bit field elements (768-bit max.) +typedef digit_t dfelm_t[2 * NWORDS_FIELD]; // Datatype for representing double-precision 2x751-bit field elements (2x768-bit max.) +typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements GF(p751^2) +typedef f2elm_t publickey_t[3]; // Datatype for representing public keys equivalent to three GF(p751^2) elements + +typedef struct +{ + f2elm_t X; + f2elm_t Z; +} point_proj; // Point representation in projective XZ Montgomery coordinates. 
+typedef point_proj point_proj_t[1]; + +/**************** Function prototypes ****************/ +/************* Multiprecision functions **************/ + +// Copy wordsize digits, c = a, where lng(a) = nwords +void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords); + +// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit +unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); + +// 751-bit multiprecision addition, c = a+b +void mp_add751(const digit_t *a, const digit_t *b, digit_t *c); +void mp_add751_asm(const digit_t *a, const digit_t *b, digit_t *c); +//void mp_addmask751_asm(const digit_t* a, const digit_t mask, digit_t* c); + +// 2x751-bit multiprecision addition, c = a+b +void mp_add751x2(const digit_t *a, const digit_t *b, digit_t *c); +void mp_add751x2_asm(const digit_t *a, const digit_t *b, digit_t *c); + +// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit +unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); +digit_t mp_sub751x2_asm(const digit_t *a, const digit_t *b, digit_t *c); + +// Multiprecision left shift +void mp_shiftleft(digit_t *x, unsigned int shift, const unsigned int nwords); + +// Multiprecision right shift by one +void mp_shiftr1(digit_t *x, const unsigned int nwords); + +// Multiprecision left right shift by one +void mp_shiftl1(digit_t *x, const unsigned int nwords); + +// Digit multiplication, digit * digit -> 2-digit result +void digit_x_digit(const digit_t a, const digit_t b, digit_t *c); + +// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. 
+void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); + +void multiply(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords); + +// Montgomery multiplication modulo the group order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1] +void Montgomery_multiply_mod_order(const digit_t *ma, const digit_t *mb, digit_t *mc, const digit_t *order, const digit_t *Montgomery_rprime); + +// (Non-constant time) Montgomery inversion modulo the curve order using a^(-1) = a^(order-2) mod order +//void Montgomery_inversion_mod_order(const digit_t* ma, digit_t* mc, const digit_t* order, const digit_t* Montgomery_rprime); + +void Montgomery_inversion_mod_order_bingcd(const digit_t *a, digit_t *c, const digit_t *order, const digit_t *Montgomery_rprime, const digit_t *Montgomery_R2); + +// Conversion of elements in Z_r to Montgomery representation, where the order r is up to 384 bits. +void to_Montgomery_mod_order(const digit_t *a, digit_t *mc, const digit_t *order, const digit_t *Montgomery_rprime, const digit_t *Montgomery_Rprime); + +// Conversion of elements in Z_r from Montgomery to standard representation, where the order is up to 384 bits. +void from_Montgomery_mod_order(const digit_t *ma, digit_t *c, const digit_t *order, const digit_t *Montgomery_rprime); + +// Inversion modulo Alice's order 2^372. +void inv_mod_orderA(const digit_t *a, digit_t *c); + +/************ Field arithmetic functions *************/ + +// Copy of a field element, c = a +void fpcopy751(const felm_t a, felm_t c); + +// Zeroing a field element, a = 0 +void fpzero751(felm_t a); + +// Non constant-time comparison of two field elements. 
If a = b return TRUE, otherwise, return FALSE +bool fpequal751_non_constant_time(const felm_t a, const felm_t b); + +// Modular addition, c = a+b mod p751 +extern void fpadd751(const digit_t *a, const digit_t *b, digit_t *c); +extern void fpadd751_asm(const digit_t *a, const digit_t *b, digit_t *c); + +// Modular subtraction, c = a-b mod p751 +extern void fpsub751(const digit_t *a, const digit_t *b, digit_t *c); +extern void fpsub751_asm(const digit_t *a, const digit_t *b, digit_t *c); + +// Modular negation, a = -a mod p751 +extern void fpneg751(digit_t *a); + +// Modular division by two, c = a/2 mod p751. +void fpdiv2_751(const digit_t *a, digit_t *c); + +// Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. +void fpcorrection751(digit_t *a); + +// 751-bit Montgomery reduction, c = a mod p +void rdc_mont(const digit_t *a, digit_t *c); + +// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p751, where R=2^768 +void fpmul751_mont(const felm_t a, const felm_t b, felm_t c); +void mul751_asm(const felm_t a, const felm_t b, dfelm_t c); +void rdc751_asm(const dfelm_t ma, dfelm_t mc); + +// Field squaring using Montgomery arithmetic, c = a*b*R^-1 mod p751, where R=2^768 +void fpsqr751_mont(const felm_t ma, felm_t mc); + +// Conversion to Montgomery representation +void to_mont(const felm_t a, felm_t mc); + +// Conversion from Montgomery representation to standard representation +void from_mont(const felm_t ma, felm_t c); + +// Field inversion, a = a^-1 in GF(p751) +void fpinv751_mont(felm_t a); + +// Field inversion, a = a^-1 in GF(p751) using the binary GCD +void fpinv751_mont_bingcd(felm_t a); + +// Chain to compute (p751-3)/4 using Montgomery arithmetic +void fpinv751_chain_mont(felm_t a); + +/************ GF(p^2) arithmetic functions *************/ + +// Copy of a GF(p751^2) element, c = a +void fp2copy751(const f2elm_t a, f2elm_t c); + +// Zeroing a GF(p751^2) element, a = 0 +void fp2zero751(f2elm_t a); + +// GF(p751^2) 
negation, a = -a in GF(p751^2) +void fp2neg751(f2elm_t a); + +// GF(p751^2) addition, c = a+b in GF(p751^2) +extern void fp2add751(const f2elm_t a, const f2elm_t b, f2elm_t c); + +// GF(p751^2) subtraction, c = a-b in GF(p751^2) +extern void fp2sub751(const f2elm_t a, const f2elm_t b, f2elm_t c); + +// GF(p751^2) division by two, c = a/2 in GF(p751^2) +void fp2div2_751(const f2elm_t a, f2elm_t c); + +// Modular correction, a = a in GF(p751^2) +void fp2correction751(f2elm_t a); + +// GF(p751^2) squaring using Montgomery arithmetic, c = a^2 in GF(p751^2) +void fp2sqr751_mont(const f2elm_t a, f2elm_t c); + +// GF(p751^2) multiplication using Montgomery arithmetic, c = a*b in GF(p751^2) +void fp2mul751_mont(const f2elm_t a, const f2elm_t b, f2elm_t c); + +// Conversion of a GF(p751^2) element to Montgomery representation +void to_fp2mont(const f2elm_t a, f2elm_t mc); + +// Conversion of a GF(p751^2) element from Montgomery representation to standard representation +void from_fp2mont(const f2elm_t ma, f2elm_t c); + +// GF(p751^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2) +void fp2inv751_mont(f2elm_t a); + +// GF(p751^2) inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p751) inversion done using the binary GCD +void fp2inv751_mont_bingcd(f2elm_t a); + +// n-way Montgomery inversion +void mont_n_way_inv(const f2elm_t *vec, const int n, f2elm_t *out); + +/************ Elliptic curve and isogeny functions *************/ + +// Computes the j-invariant of a Montgomery curve with projective constant. +void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv); + +// Simultaneous doubling and differential addition. +void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24); + +// Doubling of a Montgomery point in projective coordinates (X:Z). +void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24); + +// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. 
+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e); + +// Differential addition. +void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ); + +// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. +void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff); + +// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. +void eval_4_isog(point_proj_t P, f2elm_t *coeff); + +// Tripling of a Montgomery point in projective coordinates (X:Z). +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus); + +// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. +void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e); + +// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. +void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff); + +// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. +void eval_3_isog(point_proj_t Q, const f2elm_t *coeff); + +// 3-way simultaneous inversion +void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3); + +// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. +void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); + +#endif diff --git a/sidh_ref/SIDH.h b/sidh_ref/SIDH.h new file mode 100644 index 0000000..6877e8a --- /dev/null +++ b/sidh_ref/SIDH.h @@ -0,0 +1,214 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny-based cryptography library for ephemeral +* Diffie-Hellman key exchange. 
+* +* Copyright (c) Microsoft Corporation. All rights reserved. +* +* +* Abstract: main header file +* +*********************************************************************************************/ + +#ifndef __SIDH_H__ +#define __SIDH_H__ + +#include +#include +#include + +// Definition of operating system + +#define OS_WIN 1 +#define OS_LINUX 2 + +#define OS_TARGET OS_LINUX + +#define COMPILER_VC 1 +#define COMPILER_GCC 2 +#define COMPILER_CLANG 3 + +#define COMPILER COMPILER_GCC + +// Definition of the targeted architecture and basic data types + +#define TARGET_AMD64 1 +#define TARGET_x86 2 +#define TARGET_ARM 3 +#define TARGET_ARM64 4 + +#define TARGET TARGET_AMD64 +#define RADIX 64 +typedef uint64_t digit_t; // Unsigned 64-bit digit +typedef int64_t sdigit_t; // Signed 64-bit digit +typedef uint32_t hdigit_t; // Unsigned 32-bit digit +#define NWORDS_FIELD 12 // Number of words of a 751-bit field element +#define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1 + +#define RADIX64 64 + +// Selection of generic, portable implementation + +// Unsupported configurations + +#if (TARGET != TARGET_AMD64) && (TARGET != TARGET_ARM64) && !defined(GENERIC_IMPLEMENTATION) +#error-- "Unsupported configuration" +#endif + +// Extended datatype support + +#if defined(GENERIC_IMPLEMENTATION) +typedef uint64_t uint128_t[2]; +#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) +#define UINT128_SUPPORT +typedef unsigned uint128_t __attribute__((mode(TI))); +#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) +#define UINT128_SUPPORT +typedef unsigned uint128_t __attribute__((mode(TI))); +#elif (TARGET == TARGET_AMD64) && (OS_TARGET == OS_WIN && COMPILER == COMPILER_VC) +#define SCALAR_INTRIN_SUPPORT +typedef uint64_t uint128_t[2]; +#else +#error-- "Unsupported configuration" +#endif + +// Basic constants + +#define 
NBITS_FIELD 751 +#define MAXBITS_FIELD 768 +#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements +#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 751-bit field element +#define NBITS_ORDER 384 +#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp. +#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 384-bit element +#define MAXBITS_ORDER NBITS_ORDER +#define MAXWORDS_ORDER ((MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB]. + +// Basic constants for elliptic curve BigMont + +#define BIGMONT_NBITS_ORDER 749 +#define BIGMONT_MAXBITS_ORDER 768 +#define BIGMONT_NWORDS_ORDER ((BIGMONT_NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of BigMont's subgroup order. +#define BIGMONT_MAXWORDS_ORDER ((BIGMONT_MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, BigMont_order]. 
+ +// Definitions of the error-handling type and error codes + +typedef enum { + CRYPTO_SUCCESS, // 0x00 + CRYPTO_ERROR, // 0x01 + CRYPTO_ERROR_DURING_TEST, // 0x02 + CRYPTO_ERROR_UNKNOWN, // 0x03 + CRYPTO_ERROR_NOT_IMPLEMENTED, // 0x04 + CRYPTO_ERROR_NO_MEMORY, // 0x05 + CRYPTO_ERROR_INVALID_PARAMETER, // 0x06 + CRYPTO_ERROR_SHARED_KEY, // 0x07 + CRYPTO_ERROR_PUBLIC_KEY_VALIDATION, // 0x08 + CRYPTO_ERROR_TOO_MANY_ITERATIONS, // 0x09 + CRYPTO_ERROR_END_OF_LIST +} CRYPTO_STATUS; + +#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_ERROR_END_OF_LIST) + +// Definitions of the error messages +// NOTE: they must match the error codes above + +#define CRYPTO_MSG_SUCCESS "CRYPTO_SUCCESS" +#define CRYPTO_MSG_ERROR "CRYPTO_ERROR" +#define CRYPTO_MSG_ERROR_DURING_TEST "CRYPTO_ERROR_DURING_TEST" +#define CRYPTO_MSG_ERROR_UNKNOWN "CRYPTO_ERROR_UNKNOWN" +#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED "CRYPTO_ERROR_NOT_IMPLEMENTED" +#define CRYPTO_MSG_ERROR_NO_MEMORY "CRYPTO_ERROR_NO_MEMORY" +#define CRYPTO_MSG_ERROR_INVALID_PARAMETER "CRYPTO_ERROR_INVALID_PARAMETER" +#define CRYPTO_MSG_ERROR_SHARED_KEY "CRYPTO_ERROR_SHARED_KEY" +#define CRYPTO_MSG_ERROR_PUBLIC_KEY_VALIDATION "CRYPTO_ERROR_PUBLIC_KEY_VALIDATION" +#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS "CRYPTO_ERROR_TOO_MANY_ITERATIONS" + +// Definition of type random_bytes to implement callback functions outputting "nbytes" random values to "random_array" +typedef CRYPTO_STATUS (*RandomBytes)(unsigned int nbytes, unsigned char *random_array); + +// Definition of type for curve isogeny system identifiers. 
Currently valid value is "SIDHp751" (see SIDH.h) +typedef char CurveIsogeny_ID[10]; + +// Supersingular elliptic curve isogeny structures: + +// This data struct contains the static curve isogeny data +typedef struct +{ + CurveIsogeny_ID CurveIsogeny; // Curve isogeny system identifier, base curve defined over GF(p^2) + unsigned int pwordbits; // Smallest multiple of 32 larger than the prime bitlength + unsigned int owordbits; // Smallest multiple of 32 larger than the order bitlength + unsigned int pbits; // Bitlength of the prime p + uint64_t prime[MAXWORDS_FIELD]; // Prime p + uint64_t A[MAXWORDS_FIELD]; // Base curve parameter "A" + uint64_t C[MAXWORDS_FIELD]; // Base curve parameter "C" + unsigned int oAbits; // Order bitlength for Alice + uint64_t Aorder[MAXWORDS_ORDER]; // Order of Alice's (sub)group + unsigned int oBbits; // Order bitlength for Bob + unsigned int eB; // Power of Bob's subgroup order (i.e., oB = 3^eB) + uint64_t Border[MAXWORDS_ORDER]; // Order of Bob's (sub)group + uint64_t PA[2 * MAXWORDS_FIELD]; // Alice's generator PA = (XPA,YPA), where XPA and YPA are defined over GF(p) + uint64_t PB[2 * MAXWORDS_FIELD]; // Bob's generator PB = (XPB,YPB), where XPB and YPB are defined over GF(p) + unsigned int BigMont_A24; // BigMont's curve parameter A24 = (A+2)/4 + uint64_t BigMont_order[BIGMONT_MAXWORDS_ORDER]; // BigMont's subgroup order + uint64_t Montgomery_R2[MAXWORDS_FIELD]; // Montgomery constant (2^W)^2 mod p, using a suitable value W + uint64_t Montgomery_pp[MAXWORDS_FIELD]; // Montgomery constant -p^-1 mod 2^W, using a suitable value W + uint64_t Montgomery_one[MAXWORDS_FIELD]; // Value one in Montgomery representation +} CurveIsogenyStaticData, *PCurveIsogenyStaticData; + +// This data struct is initialized with the targeted curve isogeny system during setup +typedef struct +{ + CurveIsogeny_ID CurveIsogeny; // Curve isogeny system identifier, base curve defined over GF(p^2) + unsigned int pwordbits; // Closest multiple of 32 to prime 
bitlength + unsigned int owordbits; // Closest multiple of 32 to order bitlength + unsigned int pbits; // Bitlength of the prime p + digit_t *prime; // Prime p + digit_t *A; // Base curve parameter "A" + digit_t *C; // Base curve parameter "C" + unsigned int oAbits; // Order bitlength for Alice + digit_t *Aorder; // Order of Alice's (sub)group + unsigned int oBbits; // Order bitlength for Bob + unsigned int eB; // Power of Bob's subgroup order (i.e., oB = 3^eB) + digit_t *Border; // Order of Bob's (sub)group + digit_t *PA; // Alice's generator PA = (XPA,YPA), where XPA and YPA are defined over GF(p) + digit_t *PB; // Bob's generator PB = (XPB,YPB), where XPB and YPB are defined over GF(p) + unsigned int BigMont_A24; // BigMont's curve parameter A24 = (A+2)/4 + digit_t *BigMont_order; // BigMont's subgroup order + digit_t *Montgomery_R2; // Montgomery constant (2^W)^2 mod p, using a suitable value W + digit_t *Montgomery_pp; // Montgomery constant -p^-1 mod 2^W, using a suitable value W + digit_t *Montgomery_one; // Value one in Montgomery representation + RandomBytes RandomBytesFunction; // Function providing random bytes to generate nonces or secret keys +} CurveIsogenyStruct, *PCurveIsogenyStruct; + +// Supported curve isogeny systems: + +// "SIDHp751", base curve: supersingular elliptic curve E: y^2 = x^3 + x +extern CurveIsogenyStaticData CurveIsogeny_SIDHp751; + +/******************** Function prototypes ***********************/ +/*************** Setup/initialization functions *****************/ + +// Dynamic allocation of memory for curve isogeny structure. +// Returns NULL on error. +PCurveIsogenyStruct SIDH_curve_allocate(PCurveIsogenyStaticData CurveData); + +// Initialize curve isogeny structure pCurveIsogeny with static data extracted from pCurveIsogenyData. +// This needs to be called after allocating memory for "pCurveIsogeny" using SIDH_curve_allocate(). 
+CRYPTO_STATUS SIDH_curve_initialize(PCurveIsogenyStruct pCurveIsogeny, RandomBytes RandomBytesFunction, PCurveIsogenyStaticData pCurveIsogenyData); + +// Free memory for curve isogeny structure +void SIDH_curve_free(PCurveIsogenyStruct pCurveIsogeny); + +// Output error/success message for a given CRYPTO_STATUS +const char *SIDH_get_error_message(CRYPTO_STATUS Status); + +// Output random values in the range [1, order-1] in little endian format that can be used as private keys. +CRYPTO_STATUS random_mod_order(digit_t *random_digits, unsigned int AliceOrBob, PCurveIsogenyStruct pCurveIsogeny); + +// Output random values in the range [1, BigMont_order-1] in little endian format that can be used as private keys +// to compute scalar multiplications using the elliptic curve BigMont. +CRYPTO_STATUS random_BigMont_mod_order(digit_t *random_digits, PCurveIsogenyStruct pCurveIsogeny); + +// Clear "nwords" digits from memory +void clear_words(void *mem, digit_t nwords); + +#endif diff --git a/sidh_ref/api.h b/sidh_ref/api.h new file mode 100644 index 0000000..5de9daf --- /dev/null +++ b/sidh_ref/api.h @@ -0,0 +1,109 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: API header file for P751 +*********************************************************************************************/ + +#ifndef __P751_API_H__ +#define __P751_API_H__ + +#include "config.h" + + +/*********************** Key encapsulation mechanism API ***********************/ + +#define CRYPTO_SECRETKEYBYTES 644 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes +#define CRYPTO_PUBLICKEYBYTES 564 +#define CRYPTO_BYTES 24 +#define CRYPTO_CIPHERTEXTBYTES 596 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes + +// Algorithm name +#define CRYPTO_ALGNAME "SIKEp751" + +// SIKE's key generation +// It produces a private key sk and computes the public key pk. 
+// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes) +// public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes) +int crypto_kem_keypair(unsigned char *pk, unsigned char *sk); + +// SIKE's encapsulation +// Input: public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes) +// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes) +// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes) +int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk); + +// SIKE's decapsulation +// Input: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes) +// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes) +// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes) +int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + + +// Encoding of keys for KEM-based isogeny system "SIKEp751" (wire format): +// ---------------------------------------------------------------------- +// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). +// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. +// +// Private keys sk consist of the concatenation of a 32-byte random value, a value in the range [0, 2^378-1] and the public key pk. In the SIKE API, +// private keys are encoded in 644 octets in little endian format. +// Public keys pk consist of 3 elements in GF(p751^2). In the SIKE API, pk is encoded in 564 octets. +// Ciphertexts ct consist of the concatenation of a public key value and a 32-byte value. In the SIKE API, ct is encoded in 564 + 32 = 596 octets. +// Shared keys ss consist of a value of 24 octets. + + +/*********************** Ephemeral key exchange API ***********************/ + +#define SIDH_SECRETKEYBYTES 48 +#define SIDH_PUBLICKEYBYTES 564 +#define SIDH_BYTES 188 + +// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. 
It is NOT secure to use it with static keys. +// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. +// Extended version available at: http://eprint.iacr.org/2016/859 + +// Generation of Alice's secret key +// Outputs random value in [0, 2^372 - 1] to be used as Alice's private key +void random_mod_order_A(unsigned char* random_digits); + +// Generation of Bob's secret key +// Outputs random value in [0, 2^Floor(Log(2,3^239)) - 1] to be used as Bob's private key +void random_mod_order_B(unsigned char* random_digits); + +// Alice's ephemeral public key generation +// Input: a private key PrivateKeyA in the range [0, 2^372 - 1], stored in 47 bytes. +// Output: the public key PublicKeyA consisting of 3 GF(p751^2) elements encoded in 564 bytes. +int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); + +// Bob's ephemeral key-pair generation +// It produces a private key PrivateKeyB and computes the public key PublicKeyB. +// The private key is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes. +// The public key consists of 3 GF(p751^2) elements encoded in 564 bytes. +int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); + +// Alice's ephemeral shared secret computation +// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB +// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^372 - 1], stored in 47 bytes. +// Bob's PublicKeyB consists of 3 GF(p751^2) elements encoded in 564 bytes. +// Output: a shared secret SharedSecretA that consists of one element in GF(p751^2) encoded in 188 bytes. 
+int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); + +// Bob's ephemeral shared secret computation +// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA +// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes. +// Alice's PublicKeyA consists of 3 GF(p751^2) elements encoded in 564 bytes. +// Output: a shared secret SharedSecretB that consists of one element in GF(p751^2) encoded in 188 bytes. +int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); + + +// Encoding of keys for KEX-based isogeny system "SIDHp751" (wire format): +// ---------------------------------------------------------------------- +// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). +// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. +// +// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^372-1] and [0, 2^378-1], resp. In the SIDH API, private keys are encoded +// in 48 octets in little endian format. +// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p751^2). In the SIDH API, they are encoded in 564 octets. +// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p751^2). In the SIDH API, they are encoded in 188 octets. 
+ + +#endif diff --git a/sidh_ref/config.h b/sidh_ref/config.h new file mode 100644 index 0000000..b64962d --- /dev/null +++ b/sidh_ref/config.h @@ -0,0 +1,128 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: configuration file and platform-dependent macros +*********************************************************************************************/ + +#ifndef __CONFIG_H__ +#define __CONFIG_H__ + +#include +#include +#include + + +// Definition of operating system + +#define OS_LINUX 1 + +#if defined(__LINUX__) // Linux OS + #define OS_TARGET OS_LINUX +#else + #error -- "Unsupported OS" +#endif + + +// Definition of compiler + +#define COMPILER_GCC 1 +#define COMPILER_CLANG 2 + +#if defined(__GNUC__) // GNU GCC compiler + #define COMPILER COMPILER_GCC +#elif defined(__clang__) // Clang compiler + #define COMPILER COMPILER_CLANG +#else + #error -- "Unsupported COMPILER" +#endif + + +// Definition of the targeted architecture and basic data types + +#define TARGET_AMD64 1 + +#if defined(_AMD64_) + #define TARGET TARGET_AMD64 + #define RADIX 64 + #define LOG2RADIX 6 + typedef uint64_t digit_t; // Unsigned 64-bit digit +#else + #error -- "Unsupported ARCHITECTURE" +#endif + +#define RADIX64 64 + + +// Selection of implementation: optimized_fast with x64 assembly + +#if defined(_OPTIMIZED_FAST_) + #define OPTIMIZED_FAST_IMPLEMENTATION +#endif + + +// Extended datatype support + +#define UINT128_SUPPORT +typedef unsigned uint128_t __attribute__((mode(TI))); + + +// Macro definitions + +#define NBITS_TO_NBYTES(nbits) (((nbits)+7)/8) // Conversion macro from number of bits to number of bytes +#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words +#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of 
bytes to number of computer words + +// Macro to avoid compiler warnings when detecting unreferenced parameters +#define UNREFERENCED_PARAMETER(PAR) ((void)(PAR)) + + +/********************** Constant-time unsigned comparisons ***********************/ + +// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise + +static __inline unsigned int is_digit_nonzero_ct(digit_t x) +{ // Is x != 0? + return (unsigned int)((x | (0-x)) >> (RADIX-1)); +} + +static __inline unsigned int is_digit_zero_ct(digit_t x) +{ // Is x = 0? + return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); +} + +static __inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) +{ // Is x < y? + return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX-1)); +} + + +/********************** Macros for platform-dependent operations **********************/ + +// Digit multiplication +#define MUL(multiplier, multiplicand, hi, lo) \ + { uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \ + *(hi) = (digit_t)(tempReg >> RADIX); \ + (lo) = (digit_t)tempReg; } + +// Digit addition with carry +#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ + { uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \ + (carryOut) = (digit_t)(tempReg >> RADIX); \ + (sumOut) = (digit_t)tempReg; } + +// Digit subtraction with borrow +#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ + { uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \ + (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t)*8 - 1)); \ + (differenceOut) = (digit_t)tempReg; } + +// Digit shift right +#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift))); + +// Digit shift left +#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift))); + + +#endif diff 
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: elliptic curve and isogeny functions
*********************************************************************************************/

#include "P751_internal.h"

// NOTE(review): the include below lost its header name (extraction artifact).
// The upstream reference file needs no system include here, so it may simply
// be removable -- TODO: restore the intended target or delete the line.
#include

void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24)
{ // Doubling of a Montgomery point in projective coordinates (X:Z).
  // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C.
  // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2).
  // Q may alias P: P is fully read before Q is written.
    f2elm_t t0, t1;

    fp2sub(P->X, P->Z, t0);       // t0 = X1-Z1
    fp2add(P->X, P->Z, t1);       // t1 = X1+Z1
    fp2sqr_mont(t0, t0);          // t0 = (X1-Z1)^2
    fp2sqr_mont(t1, t1);          // t1 = (X1+Z1)^2
    fp2mul_mont(C24, t0, Q->Z);   // Z2 = C24*(X1-Z1)^2
    fp2mul_mont(t1, Q->Z, Q->X);  // X2 = C24*(X1-Z1)^2*(X1+Z1)^2
    fp2sub(t1, t0, t1);           // t1 = (X1+Z1)^2-(X1-Z1)^2
    fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2]
    fp2add(Q->Z, t0, Q->Z);       // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2
    fp2mul_mont(Q->Z, t1, Q->Z);  // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2]
}

void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e)
{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
  // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C.
  // Output: projective Montgomery x-coordinates Q <- (2^e)*P.  For e <= 0, Q is simply a copy of P.
    int i;

    copy_words((digit_t *)P, (digit_t *)Q, 2 * 2 * NWORDS_FIELD); // Q <- P (2 coordinates x 2 GF(p) components each)

    for (i = 0; i < e; i++)
    {
        xDBL(Q, Q, A24plus, C24);
    }
}

void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff)
{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
  // Input: projective point of order four P = (X4:Z4).
  // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients
  //         coeff[0..2] that are used to evaluate the isogeny at a point in eval_4_isog().

    fp2sub(P->X, P->Z, coeff[1]);          // coeff[1] = X4-Z4
    fp2add(P->X, P->Z, coeff[2]);          // coeff[2] = X4+Z4
    fp2sqr_mont(P->Z, coeff[0]);           // coeff[0] = Z4^2
    fp2add(coeff[0], coeff[0], coeff[0]);  // coeff[0] = 2*Z4^2
    fp2sqr_mont(coeff[0], C24);            // C24 = 4*Z4^4
    fp2add(coeff[0], coeff[0], coeff[0]);  // coeff[0] = 4*Z4^2
    fp2sqr_mont(P->X, A24plus);            // A24plus = X4^2
    fp2add(A24plus, A24plus, A24plus);     // A24plus = 2*X4^2
    fp2sqr_mont(A24plus, A24plus);         // A24plus = 4*X4^4
}

void eval_4_isog(point_proj_t P, f2elm_t *coeff)
{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
  // by the 3 coefficients in coeff (computed in the function get_4_isog()).
  // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
  // Output: the projective point P = phi(P) = (X:Z) in the codomain (updated in place).
    f2elm_t t0, t1;

    fp2add(P->X, P->Z, t0);          // t0 = X+Z
    fp2sub(P->X, P->Z, t1);          // t1 = X-Z
    fp2mul_mont(t0, coeff[1], P->X); // X = (X+Z)*coeff[1]
    fp2mul_mont(t1, coeff[2], P->Z); // Z = (X-Z)*coeff[2]
    fp2mul_mont(t0, t1, t0);         // t0 = (X+Z)*(X-Z)
    fp2mul_mont(t0, coeff[0], t0);   // t0 = coeff[0]*(X+Z)*(X-Z)
    fp2add(P->X, P->Z, t1);          // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1]
    fp2sub(P->X, P->Z, P->Z);        // Z = (X+Z)*coeff[1] - (X-Z)*coeff[2]
    fp2sqr_mont(t1, t1);             // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
    fp2sqr_mont(P->Z, P->Z);         // Z = [(X+Z)*coeff[1] - (X-Z)*coeff[2]]^2
    fp2add(t1, t0, P->X);            // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
    fp2sub(P->Z, t0, t0);            // t0 = [(X+Z)*coeff[1] - (X-Z)*coeff[2]]^2 - coeff[0]*(X+Z)*(X-Z)
    fp2mul_mont(P->X, t1, P->X);     // Xfinal
    fp2mul_mont(P->Z, t0, P->Z);     // Zfinal
}

void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus)
{ // Tripling of a Montgomery point in projective coordinates (X:Z).
  // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
  // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3).  Q may alias P.
    f2elm_t t0, t1, t2, t3, t4, t5, t6;

    fp2sub(P->X, P->Z, t0);        // t0 = X-Z
    fp2sqr_mont(t0, t2);           // t2 = (X-Z)^2
    fp2add(P->X, P->Z, t1);        // t1 = X+Z
    fp2sqr_mont(t1, t3);           // t3 = (X+Z)^2
    fp2add(t0, t1, t4);            // t4 = 2*X
    fp2sub(t1, t0, t0);            // t0 = 2*Z
    fp2sqr_mont(t4, t1);           // t1 = 4*X^2
    fp2sub(t1, t3, t1);            // t1 = 4*X^2 - (X+Z)^2
    fp2sub(t1, t2, t1);            // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2
    fp2mul_mont(t3, A24plus, t5);  // t5 = A24plus*(X+Z)^2
    fp2mul_mont(t3, t5, t3);       // t3 = A24plus*(X+Z)^4
    fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2
    fp2mul_mont(t2, t6, t2);       // t2 = A24minus*(X-Z)^4
    fp2sub(t2, t3, t3);            // t3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4
    fp2sub(t5, t6, t2);            // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
    fp2mul_mont(t1, t2, t1);       // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
    fp2add(t3, t1, t2);            // t2 = t3 + t1
    fp2sqr_mont(t2, t2);           // t2 = t2^2
    fp2mul_mont(t4, t2, Q->X);     // X3 = 2*X*t2
    fp2sub(t3, t1, t1);            // t1 = t3 - t1
    fp2sqr_mont(t1, t1);           // t1 = t1^2
    fp2mul_mont(t0, t1, Q->Z);     // Z3 = 2*Z*t1
}

void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e)
{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
  // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
  // Output: projective Montgomery x-coordinates Q <- (3^e)*P.  For e <= 0, Q is simply a copy of P.
    int i;

    copy_words((digit_t *)P, (digit_t *)Q, 2 * 2 * NWORDS_FIELD); // Q <- P

    for (i = 0; i < e; i++)
    {
        xTPL(Q, Q, A24minus, A24plus);
    }
}

void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff)
{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
  // Input: projective point of order three P = (X3:Z3).
  // Output: the 3-isogenous Montgomery curve with projective coefficients A24minus/A24plus,
  //         and the 2 coefficients coeff[0..1] used by eval_3_isog().
    f2elm_t t0, t1, t2, t3, t4;

    fp2sub(P->X, P->Z, coeff[0]);    // coeff0 = X-Z
    fp2sqr_mont(coeff[0], t0);       // t0 = (X-Z)^2
    fp2add(P->X, P->Z, coeff[1]);    // coeff1 = X+Z
    fp2sqr_mont(coeff[1], t1);       // t1 = (X+Z)^2
    fp2add(t0, t1, t2);              // t2 = (X+Z)^2 + (X-Z)^2
    fp2add(coeff[0], coeff[1], t3);  // t3 = 2*X
    fp2sqr_mont(t3, t3);             // t3 = 4*X^2
    fp2sub(t3, t2, t3);              // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2
    fp2add(t1, t3, t2);              // t2 = 4*X^2 - (X-Z)^2
    fp2add(t3, t0, t3);              // t3 = 4*X^2 - (X+Z)^2
    fp2add(t0, t3, t4);              // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2
    fp2add(t4, t4, t4);              // t4 = 2*[4*X^2 - (X+Z)^2 + (X-Z)^2]
    fp2add(t1, t4, t4);              // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2
    fp2mul_mont(t2, t4, A24minus);   // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
    fp2add(t1, t2, t4);              // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2
    fp2add(t4, t4, t4);              // t4 = 2*[4*X^2 + (X+Z)^2 - (X-Z)^2]
    fp2add(t0, t4, t4);              // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2
    fp2mul_mont(t3, t4, t4);         // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
    fp2sub(t4, A24minus, t0);        // t0 = t4 - A24minus
    fp2add(A24minus, t0, A24plus);   // A24plus = A24minus + t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
}

void eval_3_isog(point_proj_t Q, const f2elm_t *coeff)
{ // Evaluates the 3-isogeny at a point, given the 2 coefficients in coeff
  // (computed in the function get_3_isog()).
  // Inputs: the isogeny coefficients coeff[0..1] and the projective point Q = (X:Z).
  // Output: the projective point Q <- phi(Q) = (X3:Z3) (updated in place).
    f2elm_t t0, t1, t2;

    fp2add(Q->X, Q->Z, t0);        // t0 = X+Z
    fp2sub(Q->X, Q->Z, t1);        // t1 = X-Z
    fp2mul_mont(t0, coeff[0], t0); // t0 = coeff0*(X+Z)
    fp2mul_mont(t1, coeff[1], t1); // t1 = coeff1*(X-Z)
    fp2add(t0, t1, t2);            // t2 = coeff0*(X+Z) + coeff1*(X-Z)
    fp2sub(t1, t0, t0);            // t0 = coeff1*(X-Z) - coeff0*(X+Z)
    fp2sqr_mont(t2, t2);           // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2
    fp2sqr_mont(t0, t0);           // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2
    fp2mul_mont(Q->X, t2, Q->X);   // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2
    fp2mul_mont(Q->Z, t0, Q->Z);   // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2
}

void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3)
{ // 3-way simultaneous inversion (one shared fp2inv_mont for all three elements)
  // Input:  z1,z2,z3
  // Output: 1/z1,1/z2,1/z3 (override inputs).
    f2elm_t t0, t1, t2, t3;

    fp2mul_mont(z1, z2, t0); // t0 = z1*z2
    fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3
    fp2inv_mont(t1);         // t1 = 1/(z1*z2*z3)
    fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2)
    fp2mul_mont(t2, z2, t3); // t3 = 1/z1
    fp2mul_mont(t2, z1, z2); // z2 = 1/z2
    fp2mul_mont(t0, t1, z3); // z3 = 1/z3
    fp2copy(t3, z1);         // z1 = 1/z1
}

void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A)
{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
  // Input:  the x-coordinates xP, xQ, and xR of the points P, Q and R.
  // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x.
    f2elm_t t0, t1, one = {0};

    fpcopy((digit_t *)&Montgomery_one, one[0]); // one = 1 in Montgomery representation
    fp2add(xP, xQ, t1);       // t1 = xP+xQ
    fp2mul_mont(xP, xQ, t0);  // t0 = xP*xQ
    fp2mul_mont(xR, t1, A);   // A = xR*t1
    fp2add(t0, A, A);         // A = A+t0
    fp2mul_mont(t0, xR, t0);  // t0 = t0*xR
    fp2sub(A, one, A);        // A = A-1
    fp2add(t0, t0, t0);       // t0 = t0+t0
    fp2add(t1, xR, t1);       // t1 = t1+xR
    fp2add(t0, t0, t0);       // t0 = t0+t0 (t0 = 4*xP*xQ*xR)
    fp2sqr_mont(A, A);        // A = A^2
    fp2inv_mont(t0);          // t0 = 1/t0
    fp2mul_mont(A, t0, A);    // A = A*t0
    fp2sub(A, t1, A);         // Afinal = A-t1
}

void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv)
{ // Computes the j-invariant of a Montgomery curve with projective constant.
  // Input: A,C in GF(p^2).
  // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x.
    f2elm_t t0, t1;

    fp2sqr_mont(A, jinv);        // jinv = A^2
    fp2sqr_mont(C, t1);          // t1 = C^2
    fp2add(t1, t1, t0);          // t0 = t1+t1
    fp2sub(jinv, t0, t0);        // t0 = jinv-t0
    fp2sub(t0, t1, t0);          // t0 = t0-t1
    fp2sub(t0, t1, jinv);        // jinv = t0-t1 (= A^2-4*C^2)
    fp2sqr_mont(t1, t1);         // t1 = t1^2
    fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1 (denominator)
    fp2add(t0, t0, t0);          // t0 = t0+t0
    fp2add(t0, t0, t0);          // t0 = t0+t0
    fp2sqr_mont(t0, t1);         // t1 = t0^2
    fp2mul_mont(t0, t1, t0);     // t0 = t0*t1
    fp2add(t0, t0, t0);          // t0 = t0+t0
    fp2add(t0, t0, t0);          // t0 = t0+t0 (numerator)
    fp2inv_mont(jinv);           // jinv = 1/jinv
    fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv
}

void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24)
{ // Simultaneous doubling and differential addition.
  // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4.
  // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that x(Q+P)=XQP/ZQP.
    f2elm_t t0, t1, t2;

    fp2add(P->X, P->Z, t0);        // t0 = XP+ZP
    fp2sub(P->X, P->Z, t1);        // t1 = XP-ZP
    fp2sqr_mont(t0, P->X);         // XP = (XP+ZP)^2
    fp2sub(Q->X, Q->Z, t2);        // t2 = XQ-ZQ
    fp2correction(t2);             // normalize t2 to [0, p-1] before multiplying
    fp2add(Q->X, Q->Z, Q->X);      // XQ = XQ+ZQ
    fp2mul_mont(t2, t0, t0);       // t0 = (XP+ZP)*(XQ-ZQ)
    fp2sqr_mont(t1, P->Z);         // ZP = (XP-ZP)^2
    fp2mul_mont(Q->X, t1, t1);     // t1 = (XP-ZP)*(XQ+ZQ)
    fp2sub(P->X, P->Z, t2);        // t2 = (XP+ZP)^2-(XP-ZP)^2
    fp2mul_mont(P->X, P->Z, P->X); // XP = (XP+ZP)^2*(XP-ZP)^2
    fp2mul_mont(t2, A24, Q->X);    // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]
    fp2sub(t0, t1, Q->Z);          // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)
    fp2add(Q->X, P->Z, P->Z);      // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2
    fp2add(t0, t1, Q->X);          // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)
    fp2mul_mont(P->Z, t2, P->Z);   // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]
    fp2sqr_mont(Q->Z, Q->Z);       // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
    fp2sqr_mont(Q->X, Q->X);       // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2
    fp2mul_mont(Q->Z, xPQ, Q->Z);  // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
}

static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option)
{ // Constant-time conditional swap of two projective points (branch-free XOR masking).
  // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P.
  // option must be either 0 or the all-ones mask; any other value corrupts both points.
    digit_t temp;
    unsigned int i;

    for (i = 0; i < NWORDS_FIELD; i++)
    {
        temp = option & (P->X[0][i] ^ Q->X[0][i]);
        P->X[0][i] = temp ^ P->X[0][i];
        Q->X[0][i] = temp ^ Q->X[0][i];
        temp = option & (P->Z[0][i] ^ Q->Z[0][i]);
        P->Z[0][i] = temp ^ P->Z[0][i];
        Q->Z[0][i] = temp ^ Q->Z[0][i];
        temp = option & (P->X[1][i] ^ Q->X[1][i]);
        P->X[1][i] = temp ^ P->X[1][i];
        Q->X[1][i] = temp ^ Q->X[1][i];
        temp = option & (P->Z[1][i] ^ Q->Z[1][i]);
        P->Z[1][i] = temp ^ P->Z[1][i];
        Q->Z[1][i] = temp ^ Q->Z[1][i];
    }
}

static void LADDER3PT(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const digit_t *m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t A)
{ // Three-point Montgomery ladder: computes R <- P + [m]Q from the affine
  // x-coordinates xP, xQ and xPQ = x(P-Q), scanning the scalar m bit by bit
  // with a constant-time conditional swap per iteration.
  // AliceOrBob selects the scalar bit-length (OALICE_BITS or OBOB_BITS).
    point_proj_t R0 = {0}, R2 = {0};
    f2elm_t A24 = {0};
    digit_t mask;
    int i, nbits, bit, swap, prevbit = 0;

    if (AliceOrBob == ALICE)
    {
        nbits = OALICE_BITS;
    }
    else
    {
        nbits = OBOB_BITS;
    }

    // Initializing constant
    fpcopy((digit_t *)&Montgomery_one, A24[0]);
    fp2add(A24, A24, A24);
    fp2add(A, A24, A24);
    fp2div2(A24, A24);
    fp2div2(A24, A24); // A24 = (A+2)/4

    // Initializing points: R0 = (xQ:1), R2 = (xPQ:1), R = (xP:1)
    fp2copy(xQ, R0->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R0->Z);
    fp2copy(xPQ, R2->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R2->Z);
    fp2copy(xP, R->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R->Z);
    fpzero((digit_t *)(R->Z)[1]); // R may be uninitialized on entry: clear the imaginary part of R->Z

    // Main loop
    for (i = 0; i < nbits; i++)
    {
        bit = (m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1;
        swap = bit ^ prevbit;           // swap only when the bit changes
        prevbit = bit;
        mask = 0 - (digit_t)swap;       // 0 or all-ones, as required by swap_points()

        swap_points(R, R2, mask);
        xDBLADD(R0, R2, R->X, A24);
        fp2mul_mont(R2->X, R->Z, R2->X);
    }
}
/********************************************************************************************
* SIDH: an efficient supersingular isogeny-based cryptography library for ephemeral
* Diffie-Hellman key exchange.
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
*
* Abstract: modular arithmetic optimized for x64 platforms
*
*********************************************************************************************/

// NOTE(review): config.h defines only OS_LINUX and never defines OS_WIN, so every
// "#if (OS_TARGET == OS_WIN)" branch below is dead code under this configuration
// (the preprocessor evaluates the undefined OS_WIN as 0).  The branches appear to
// be kept as a portable reference for the assembly routines -- TODO confirm.

#include "P751_internal.h"

// Global constants (defined in P751.c)
extern const uint64_t p751[NWORDS_FIELD];   // the prime p751
extern const uint64_t p751p1[NWORDS_FIELD]; // p751 + 1 (low 5 words are zero; only words 5..11 are used below)
extern const uint64_t p751x2[NWORDS_FIELD]; // 2*p751

__inline void fpadd751(const digit_t *a, const digit_t *b, digit_t *c)
{ // Modular addition, c = a+b mod p751.
  // Inputs: a, b in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]

#if (OS_TARGET == OS_WIN)
    unsigned int i, carry = 0;
    digit_t mask;

    // c = a + b
    for (i = 0; i < NWORDS_FIELD; i++)
    {
        ADDC(carry, a[i], b[i], carry, c[i]);
    }

    // c = c - 2*p751; a borrow means the subtraction went negative
    carry = 0;
    for (i = 0; i < NWORDS_FIELD; i++)
    {
        SUBC(carry, c[i], ((digit_t *)p751x2)[i], carry, c[i]);
    }
    mask = 0 - (digit_t)carry;

    // Conditionally add 2*p751 back (constant-time, mask-selected)
    carry = 0;
    for (i = 0; i < NWORDS_FIELD; i++)
    {
        ADDC(carry, c[i], ((digit_t *)p751x2)[i] & mask, carry, c[i]);
    }

#elif (OS_TARGET == OS_LINUX)

    fpadd751_asm(a, b, c);

#endif
}

__inline void fpsub751(const digit_t *a, const digit_t *b, digit_t *c)
{ // Modular subtraction, c = a-b mod p751.
  // Inputs: a, b in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]

#if (OS_TARGET == OS_WIN)
    unsigned int i, borrow = 0;
    digit_t mask;

    // c = a - b
    for (i = 0; i < NWORDS_FIELD; i++)
    {
        SUBC(borrow, a[i], b[i], borrow, c[i]);
    }
    mask = 0 - (digit_t)borrow;

    // Conditionally add 2*p751 if the subtraction borrowed (constant-time)
    borrow = 0;
    for (i = 0; i < NWORDS_FIELD; i++)
    {
        ADDC(borrow, c[i], ((digit_t *)p751x2)[i] & mask, borrow, c[i]);
    }

#elif (OS_TARGET == OS_LINUX)

    fpsub751_asm(a, b, c);

#endif
}

__inline void fpneg751(digit_t *a)
{ // Modular negation, a = -a mod p751, computed as 2*p751 - a.
  // Input/output: a in [0, 2*p751-1]
    unsigned int i, borrow = 0;

    for (i = 0; i < NWORDS_FIELD; i++)
    {
        SUBC(borrow, ((digit_t *)p751x2)[i], a[i], borrow, a[i]);
    }
}

void fpdiv2_751(const digit_t *a, digit_t *c)
{ // Modular division by two, c = a/2 mod p751.
  // Input : a in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]
    unsigned int i, carry = 0;
    digit_t mask;

    mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p751 (makes the value even without changing it mod p751)
    for (i = 0; i < NWORDS_FIELD; i++)
    {
        ADDC(carry, a[i], ((digit_t *)p751)[i] & mask, carry, c[i]);
    }

    mp_shiftr1(c, NWORDS_FIELD); // c >>= 1
}

void fpcorrection751(digit_t *a)
{ // Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1].
  // Constant-time: subtract p751, then conditionally add it back on borrow.
    unsigned int i, borrow = 0;
    digit_t mask;

    for (i = 0; i < NWORDS_FIELD; i++)
    {
        SUBC(borrow, a[i], ((digit_t *)p751)[i], borrow, a[i]);
    }
    mask = 0 - (digit_t)borrow;

    borrow = 0;
    for (i = 0; i < NWORDS_FIELD; i++)
    {
        ADDC(borrow, a[i], ((digit_t *)p751)[i] & mask, borrow, a[i]);
    }
}

void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
  // The portable path below is a fully unrolled 12x12-word comba (product-scanning)
  // multiplication: for each output word c[k] it accumulates all partial products
  // a[i]*b[j] with i+j = k into the double-word accumulator uv plus the spill word t.
  // nwords is ignored: the unrolling hard-codes 12 words (NWORDS_FIELD for p751).

    UNREFERENCED_PARAMETER(nwords);

#if (OS_TARGET == OS_WIN)
    // NOTE(review): this branch indexes uv as uv[0]/uv[1], which assumes the
    // Windows build defines uint128_t as a two-digit array -- under the Linux
    // config.h (mode(TI) integer) it would not compile; dead code here.
    digit_t t = 0;
    uint128_t uv = {0};
    unsigned int carry = 0;

    MULADD128(a[0], b[0], uv, carry, uv);
    t += carry;
    c[0] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[0], uv, carry, uv);
    t += carry;
    c[1] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[0], uv, carry, uv);
    t += carry;
    c[2] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[0], uv, carry, uv);
    t += carry;
    c[3] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[0], uv, carry, uv);
    t += carry;
    c[4] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[0], uv, carry, uv);
    t += carry;
    c[5] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[0], uv, carry, uv);
    t += carry;
    c[6] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[0], uv, carry, uv);
    t += carry;
    c[7] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[0], uv, carry, uv);
    t += carry;
    c[8] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[0], uv, carry, uv);
    t += carry;
    c[9] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[0], uv, carry, uv);
    t += carry;
    c[10] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[0], b[11], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[1], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[1], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[11], b[0], uv, carry, uv);
    t += carry;
    c[11] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    // Upper half of the product: columns 12..23 (fewer partial products per column)
    MULADD128(a[1], b[11], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[11], b[1], uv, carry, uv);
    t += carry;
    c[12] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[2], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[2], b[11], uv, carry, uv);
    t += carry;
    c[13] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[3], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[3], b[11], uv, carry, uv);
    t += carry;
    c[14] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[4], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[4], b[11], uv, carry, uv);
    t += carry;
    c[15] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[5], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[5], b[11], uv, carry, uv);
    t += carry;
    c[16] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[6], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[6], b[11], uv, carry, uv);
    t += carry;
    c[17] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[7], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[7], b[11], uv, carry, uv);
    t += carry;
    c[18] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[8], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[8], b[11], uv, carry, uv);
    t += carry;
    c[19] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[9], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[9], b[11], uv, carry, uv);
    t += carry;
    c[20] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(a[11], b[10], uv, carry, uv);
    t += carry;
    MULADD128(a[10], b[11], uv, carry, uv);
    t += carry;
    c[21] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;

    MULADD128(a[11], b[11], uv, carry, uv);
    c[22] = uv[0];
    c[23] = uv[1];

#elif (OS_TARGET == OS_LINUX)

    mul751_asm(a, b, c);

#endif
}

void rdc_mont(const dfelm_t ma, felm_t mc)
{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p751.
  // mc = ma*R^-1 mod p751x2, where R = 2^768.
  // If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1].
  // ma is assumed to be in Montgomery representation.
  // The portable path exploits that the low 5 words of p751+1 are zero, so only
  // p751p1[5..11] contribute partial products; mc doubles as the working quotient.

#if (OS_TARGET == OS_WIN)
    unsigned int carry;
    digit_t t = 0;
    uint128_t uv = {0};

    // Low words need no multiplication (p751p1[0..4] == 0)
    mc[0] = ma[0];
    mc[1] = ma[1];
    mc[2] = ma[2];
    mc[3] = ma[3];
    mc[4] = ma[4];
    MUL128(mc[0], ((digit_t *)p751p1)[5], uv);
    ADDC(0, uv[0], ma[5], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    mc[5] = uv[0];
    uv[0] = uv[1];
    uv[1] = 0;

    // NOTE(review): unlike every later column, only one "t += carry" is folded
    // in after the two MULADD128s here -- TODO verify against the upstream
    // reference that no carry can be lost in this column.
    MULADD128(mc[0], ((digit_t *)p751p1)[6], uv, carry, uv);
    MULADD128(mc[1], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[6], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[6] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[7], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[7] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[8], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[8] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[9], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[9] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[10], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[10] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[0], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[1], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[11], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[11] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    // From here on the low quotient words are consumed and mc[0..11] is reused
    // for the reduced result words.
    MULADD128(mc[1], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[2], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[12], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[0] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[2], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[3], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[8], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[13], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[1] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[3], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[4], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[8], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[9], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[14], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[2] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[4], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[5], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[8], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[9], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[10], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[15], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[3] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[5], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[6], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[8], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[9], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[10], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    MULADD128(mc[11], ((digit_t *)p751p1)[5], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[16], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[4] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[6], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[7], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[8], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[9], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[10], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    MULADD128(mc[11], ((digit_t *)p751p1)[6], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[17], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[5] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[7], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[8], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[9], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[10], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    MULADD128(mc[11], ((digit_t *)p751p1)[7], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[18], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[6] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[8], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[9], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[10], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    MULADD128(mc[11], ((digit_t *)p751p1)[8], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[19], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[7] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[9], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[10], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    MULADD128(mc[11], ((digit_t *)p751p1)[9], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[20], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[8] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[10], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    MULADD128(mc[11], ((digit_t *)p751p1)[10], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[21], carry, uv[0]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    t += carry;
    mc[9] = uv[0];
    uv[0] = uv[1];
    uv[1] = t;
    t = 0;

    MULADD128(mc[11], ((digit_t *)p751p1)[11], uv, carry, uv);
    t += carry;
    ADDC(0, uv[0], ma[22], carry, mc[10]);
    ADDC(carry, uv[1], 0, carry, uv[1]);
    ADDC(0, uv[1], ma[23], carry, mc[11]);

#elif (OS_TARGET == OS_LINUX)

    rdc751_asm(ma, mc);

#endif
}
0xFFFFFFFFFFFFFFFE +#define p751x2_1 0xFFFFFFFFFFFFFFFF +#define p751x2_5 0xDD5FFFFFFFFFFFFF +#define p751x2_6 0xC7D92D0A93F0F151 +#define p751x2_7 0xB52B363427EF98ED +#define p751x2_8 0x109D30CFADD7D0ED +#define p751x2_9 0x0AC56A08B964AE90 +#define p751x2_10 0x1C25213F2F75B8CD +#define p751x2_11 0x0000DFCBAA83EE38 + + +.text +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global fpadd751_asm +fpadd751_asm: + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rcx, [reg_p1+64] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + adc rcx, [reg_p2+64] + mov rax, [reg_p1+72] + adc rax, [reg_p2+72] + mov [reg_p3+72], rax + mov rax, [reg_p1+80] + adc rax, [reg_p2+80] + mov [reg_p3+80], rax + mov rax, [reg_p1+88] + adc rax, [reg_p2+88] + mov [reg_p3+88], rax + + movq rax, p751x2_0 + sub r8, rax + movq rax, p751x2_1 + sbb r9, rax + sbb r10, rax + sbb r11, rax + sbb r12, rax + movq rax, p751x2_5 + sbb r13, rax + movq rax, p751x2_6 + sbb r14, rax + movq rax, p751x2_7 + sbb r15, rax + movq rax, p751x2_8 + sbb rcx, rax + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rcx + mov r8, [reg_p3+72] + mov r9, [reg_p3+80] + mov r10, [reg_p3+88] + movq rax, p751x2_9 + sbb r8, rax + movq rax, p751x2_10 + sbb r9, rax + movq rax, p751x2_11 + sbb r10, rax + mov [reg_p3+72], r8 + mov [reg_p3+80], r9 + mov [reg_p3+88], r10 + movq rax, 0 + sbb rax, 0 + + mov rsi, p751x2_0 + and 
rsi, rax + mov r8, p751x2_1 + and r8, rax + movq r9, p751x2_5 + and r9, rax + movq r10, p751x2_6 + and r10, rax + movq r11, p751x2_7 + and r11, rax + movq r12, p751x2_8 + and r12, rax + movq r13, p751x2_9 + and r13, rax + movq r14, p751x2_10 + and r14, rax + movq r15, p751x2_11 + and r15, rax + + mov rax, [reg_p3] + add rax, rsi + mov [reg_p3], rax + mov rax, [reg_p3+8] + adc rax, r8 + mov [reg_p3+8], rax + mov rax, [reg_p3+16] + adc rax, r8 + mov [reg_p3+16], rax + mov rax, [reg_p3+24] + adc rax, r8 + mov [reg_p3+24], rax + mov rax, [reg_p3+32] + adc rax, r8 + mov [reg_p3+32], rax + mov rax, [reg_p3+40] + adc rax, r9 + mov [reg_p3+40], rax + mov rax, [reg_p3+48] + adc rax, r10 + mov [reg_p3+48], rax + mov rax, [reg_p3+56] + adc rax, r11 + mov [reg_p3+56], rax + mov rax, [reg_p3+64] + adc rax, r12 + mov [reg_p3+64], rax + mov rax, [reg_p3+72] + adc rax, r13 + mov [reg_p3+72], rax + mov rax, [reg_p3+80] + adc rax, r14 + mov [reg_p3+80], rax + mov rax, [reg_p3+88] + adc rax, r15 + mov [reg_p3+88], rax + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global fpsub751_asm +fpsub751_asm: + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rcx, [reg_p1+64] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + sbb rcx, [reg_p2+64] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rcx + mov rax, [reg_p1+72] 
+ sbb rax, [reg_p2+72] + mov [reg_p3+72], rax + mov rax, [reg_p1+80] + sbb rax, [reg_p2+80] + mov [reg_p3+80], rax + mov rax, [reg_p1+88] + sbb rax, [reg_p2+88] + mov [reg_p3+88], rax + movq rax, 0 + sbb rax, 0 + + mov rsi, p751x2_0 + and rsi, rax + mov r8, p751x2_1 + and r8, rax + movq r9, p751x2_5 + and r9, rax + movq r10, p751x2_6 + and r10, rax + movq r11, p751x2_7 + and r11, rax + movq r12, p751x2_8 + and r12, rax + movq r13, p751x2_9 + and r13, rax + movq r14, p751x2_10 + and r14, rax + movq r15, p751x2_11 + and r15, rax + + mov rax, [reg_p3] + add rax, rsi + mov [reg_p3], rax + mov rax, [reg_p3+8] + adc rax, r8 + mov [reg_p3+8], rax + mov rax, [reg_p3+16] + adc rax, r8 + mov [reg_p3+16], rax + mov rax, [reg_p3+24] + adc rax, r8 + mov [reg_p3+24], rax + mov rax, [reg_p3+32] + adc rax, r8 + mov [reg_p3+32], rax + mov rax, [reg_p3+40] + adc rax, r9 + mov [reg_p3+40], rax + mov rax, [reg_p3+48] + adc rax, r10 + mov [reg_p3+48], rax + mov rax, [reg_p3+56] + adc rax, r11 + mov [reg_p3+56], rax + mov rax, [reg_p3+64] + adc rax, r12 + mov [reg_p3+64], rax + mov rax, [reg_p3+72] + adc rax, r13 + mov [reg_p3+72], rax + mov rax, [reg_p3+80] + adc rax, r14 + mov [reg_p3+80], rax + mov rax, [reg_p3+88] + adc rax, r15 + mov [reg_p3+88], rax + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + + #ifdef _MULX_ + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: memory pointer C +// Temps: stack pointer for 15 64-bit values, regs T0:T7 +///////////////////////////////////////////////////////////////// +#if _ADX_ +.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 + mov rdx, \M0 + mulx \T0, \T1, \M1 + mulx \T2, \T3, 8\M1 + mov \C, \T1 // C0_final + xor rax, rax + mulx \T4, \T5, 16\M1 + adox \T0, \T3 + adox \T2, \T5 + mulx \T1, \T3, 24\M1 + adox \T4, \T3 + mulx \T5, \T6, 32\M1 + adox \T1, \T6 + mulx \T3, \T7, 40\M1 + adox \T5, \T7 + adox \T3, rax + + 
mov rdx, 8\M0 + mulx \T6, \T7, \M1 + adcx \T0, \T7 + mov 8\C, \T0 // C1_final + adcx \T2, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adcx \T4, \T6 + mulx \T0, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adcx \T0, \T1 + mulx \T1, \T7, 24\M1 + adcx \T1, \T5 + mulx \T5, \T6, 32\M1 + adcx \T3, \T5 + mulx \T5, rdx, 40\M1 + adcx \T5, rax + + xor rax, rax + adox \T2, \S + adox \T4, 8\S + adox \T0, \T7 + adox \T1, \T6 + adox \T3, rdx + adox \T5, rax + + mov rdx, 16\M0 + mulx \T6, \T7, \M1 + adcx \T2, \T7 + mov 16\C, \T2 // C2_final + adcx \T4, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adcx \T0, \T6 + mulx \T2, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adcx \T1, \T2 + mulx \T2, \T7, 24\M1 + adcx \T3, \T2 + mulx \T2, \T6, 32\M1 + adcx \T5, \T2 + mulx \T2, rdx, 40\M1 + adcx \T2, rax + + xor rax, rax + adox \T4, \S + adox \T0, 8\S + adox \T1, \T7 + adox \T3, \T6 + adox \T5, rdx + adox \T2, rax + + mov rdx, 24\M0 + mulx \T6, \T7, \M1 + adcx \T4, \T7 + mov 24\C, \T4 // C3_final + adcx \T0, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adcx \T1, \T6 + mulx \T4, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adcx \T3, \T4 + mulx \T4, \T7, 24\M1 + adcx \T5, \T4 + mulx \T4, \T6, 32\M1 + adcx \T2, \T4 + mulx \T4, rdx, 40\M1 + adcx \T4, rax + + xor rax, rax + adox \T0, \S + adox \T1, 8\S + adox \T3, \T7 + adox \T5, \T6 + adox \T2, rdx + adox \T4, rax + + mov rdx, 32\M0 + mulx \T6, \T7, \M1 + adcx \T0, \T7 + mov 32\C, \T0 // C4_final + adcx \T1, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adcx \T3, \T6 + mulx \T0, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adcx \T5, \T0 + mulx \T0, \T7, 24\M1 + adcx \T2, \T0 + mulx \T0, \T6, 32\M1 + adcx \T4, \T0 + mulx \T0, rdx, 40\M1 + adcx \T0, rax + + xor rax, rax + adox \T1, \S + adox \T3, 8\S + adox \T5, \T7 + adox \T2, \T6 + adox \T4, rdx + adox \T0, rax + + mov rdx, 40\M0 + mulx \T6, \T7, \M1 + adcx \T1, \T7 + mov 40\C, \T1 // C5_final + adcx \T3, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adcx \T5, \T6 + 
mulx \T1, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adcx \T2, \T1 + mulx \T1, \T7, 24\M1 + adcx \T4, \T1 + mulx \T1, \T6, 32\M1 + adcx \T0, \T1 + mulx \T1, rdx, 40\M1 + adcx \T1, rax + + add \T3, \S + adc \T5, 8\S + adc \T2, \T7 + adc \T4, \T6 + adc \T0, rdx + adc \T1, 0 + mov 48\C, \T3 + mov 56\C, \T5 + mov 64\C, \T2 + mov 72\C, \T4 + mov 80\C, \T0 + mov 88\C, \T1 +.endm + +#else + +.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 + mov rdx, \M0 + mulx \T0, \T1, \M1 + mulx \T2, \T3, 8\M1 + mov \C, \T1 // C0_final + mulx \T4, \T5, 16\M1 + add \T0, \T3 + adc \T2, \T5 + mulx \T1, \T3, 24\M1 + adc \T4, \T3 + mulx \T5, \T6, 32\M1 + adc \T1, \T6 + mulx \T3, \T7, 40\M1 + adc \T5, \T7 + adc \T3, 0 + + mov rdx, 8\M0 + mulx \T6, \T7, \M1 + add \T0, \T7 + mov 8\C, \T0 // C1_final + adc \T2, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T4, \T6 + mulx \T0, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T0, \T1 + mulx \T1, rax, 24\M1 + adc \T1, \T5 + mulx \T5, \T7, 32\M1 + adc \T3, \T5 + mulx \T5, \T6, 40\M1 + adc \T5, 0 + + add \T2, \S + adc \T4, 8\S + adc \T0, rax + adc \T1, \T7 + adc \T3, \T6 + adc \T5, 0 + + mov rdx, 16\M0 + mulx \T6, \T7, \M1 + add \T2, \T7 + mov 16\C, \T2 // C2_final + adc \T4, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T0, \T6 + mulx \T2, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T1, \T2 + mulx \T2, rax, 24\M1 + adc \T3, \T2 + mulx \T2, \T7, 32\M1 + adc \T5, \T2 + mulx \T2, \T6, 40\M1 + adc \T2, 0 + + add \T4, \S + adc \T0, 8\S + adc \T1, rax + adc \T3, \T7 + adc \T5, \T6 + adc \T2, 0 + + mov rdx, 24\M0 + mulx \T6, \T7, \M1 + add \T4, \T7 + mov 24\C, \T4 // C3_final + adc \T0, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T1, \T6 + mulx \T4, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T3, \T4 + mulx \T4, rax, 24\M1 + adc \T5, \T4 + mulx \T4, \T7, 32\M1 + adc \T2, \T4 + mulx \T4, \T6, 40\M1 + adc \T4, 0 + + add \T0, \S + adc \T1, 8\S + adc \T3, rax + adc \T5, \T7 + adc \T2, \T6 + adc \T4, 0 + + 
mov rdx, 32\M0 + mulx \T6, \T7, \M1 + add \T0, \T7 + mov 32\C, \T0 // C4_final + adc \T1, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T3, \T6 + mulx \T0, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T5, \T0 + mulx \T0, rax, 24\M1 + adc \T2, \T0 + mulx \T0, \T7, 32\M1 + adc \T4, \T0 + mulx \T0, \T6, 40\M1 + adc \T0, 0 + + add \T1, \S + adc \T3, 8\S + adc \T5, rax + adc \T2, \T7 + adc \T4, \T6 + adc \T0, 0 + + mov rdx, 40\M0 + mulx \T6, \T7, \M1 + add \T1, \T7 + mov 40\C, \T1 // C5_final + adc \T3, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T5, \T6 + mulx \T1, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T2, \T1 + mulx \T1, rax, 24\M1 + adc \T4, \T1 + mulx \T1, \T7, 32\M1 + adc \T0, \T1 + mulx \T1, \T6, 40\M1 + adc \T1, 0 + + add \T3, \S + mov 48\C, \T3 + adc \T5, 8\S + mov 56\C, \T5 + adc \T2, rax + mov 64\C, \T2 + adc \T4, \T7 + mov 72\C, \T4 + adc \T0, \T6 + mov 80\C, \T0 + adc \T1, 0 + mov 88\C, \T1 +.endm +#endif + + +//***************************************************************************** +// 751-bit multiplication using Karatsuba (one level), schoolbook (two levels) +//***************************************************************************** +.global mul751_asm +mul751_asm: + push r12 + push r13 + push r14 + push r15 + mov rcx, reg_p3 + + // [rsp] <- AH + AL, rax <- mask + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + push rbx + push rbp + sub rsp, 152 + add r8, [reg_p1+48] + adc r9, [reg_p1+56] + adc r10, [reg_p1+64] + adc r11, [reg_p1+72] + adc r12, [reg_p1+80] + adc r13, [reg_p1+88] + sbb rax, 0 + mov [rsp], r8 + mov [rsp+8], r9 + mov [rsp+16], r10 + mov [rsp+24], r11 + mov [rsp+32], r12 + mov [rsp+40], r13 + + // [rsp+48] <- BH + BL, rdx <- mask + xor rdx, rdx + mov r8, [reg_p2] + mov r9, [reg_p2+8] + mov rbx, [reg_p2+16] + mov rbp, [reg_p2+24] + mov r14, [reg_p2+32] + mov r15, [reg_p2+40] + add r8, [reg_p2+48] + 
adc r9, [reg_p2+56] + adc rbx, [reg_p2+64] + adc rbp, [reg_p2+72] + adc r14, [reg_p2+80] + adc r15, [reg_p2+88] + sbb rdx, 0 + mov [rsp+48], r8 + mov [rsp+56], r9 + mov [rsp+64], rbx + mov [rsp+72], rbp + mov [rsp+80], r14 + mov [rsp+88], r15 + + // [rcx] <- masked (BH + BL) + and r8, rax + and r9, rax + and rbx, rax + and rbp, rax + and r14, rax + and r15, rax + mov [rcx], r8 + mov [rcx+8], r9 + mov [rcx+16], rbx ///// + mov [rcx+24], rbp ///// + + // r8-r13 <- masked (AH + AL) + mov r8, [rsp] + mov r9, [rsp+8] + and r8, rdx + and r9, rdx + and r10, rdx + and r11, rdx + and r12, rdx + and r13, rdx + + // [rsp+96] <- masked (AH + AL) + masked (AH + AL) + mov rax, [rcx] + mov rdx, [rcx+8] + add r8, rax + adc r9, rdx + adc r10, rbx + adc r11, rbp + adc r12, r14 + adc r13, r15 + mov [rsp+96], r8 + mov [rsp+104], r9 + mov [rsp+112], r10 + mov [rsp+120], r11 + + // [rcx] <- AL x BL + MUL384_SCHOOL [reg_p1], [reg_p2], [rcx], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 // Result C0-C5 + + // [rcx+96] <- (AH+AL) x (BH+BL), low part + MUL384_SCHOOL [rsp], [rsp+48], [rcx+96], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 + + // [rsp] <- AH x BH + MUL384_SCHOOL [reg_p1+48], [reg_p2+48], [rsp], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 + + // r8-r13 <- (AH+AL) x (BH+BL), final step + mov r8, [rsp+96] + mov r9, [rsp+104] + mov r10, [rsp+112] + mov r11, [rsp+120] + mov rax, [rcx+144] + add r8, rax + mov rax, [rcx+152] + adc r9, rax + mov rax, [rcx+160] + adc r10, rax + mov rax, [rcx+168] + adc r11, rax + mov rax, [rcx+176] + adc r12, rax + mov rax, [rcx+184] + adc r13, rax + + // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL + mov rdi, [rcx+96] + sub rdi, [rcx] + mov rdx, [rcx+104] + sbb rdx, [rcx+8] + mov rbx, [rcx+112] + sbb rbx, [rcx+16] + mov rbp, [rcx+120] + sbb rbp, [rcx+24] + mov r14, [rcx+128] + sbb r14, [rcx+32] + mov r15, [rcx+136] + sbb r15, [rcx+40] + sbb r8, [rcx+48] + sbb r9, [rcx+56] + sbb r10, [rcx+64] + sbb r11, [rcx+72] + sbb r12, 
[rcx+80] + sbb r13, [rcx+88] + + // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + sub rdi, [rsp] + sbb rdx, [rsp+8] + sbb rbx, [rsp+16] + sbb rbp, [rsp+24] + sbb r14, [rsp+32] + sbb r15, [rsp+40] + sbb r8, [rsp+48] + sbb r9, [rsp+56] + sbb r10, [rsp+64] + sbb r11, [rsp+72] + sbb r12, [rsp+80] + sbb r13, [rsp+88] + + mov rax, [rcx+48] + add rax, rdi + mov [rcx+48], rax // Result C6-C11 + mov rax, [rcx+56] + adc rax, rdx + mov [rcx+56], rax + mov rax, [rcx+64] + adc rax, rbx + mov [rcx+64], rax + mov rax, [rcx+72] + adc rax, rbp + mov [rcx+72], rax + mov rax, [rcx+80] + adc rax, r14 + mov [rcx+80], rax + mov rax, [rcx+88] + adc rax, r15 + mov [rcx+88], rax + mov rax, [rsp] + adc r8, rax + mov [rcx+96], r8 // Result C8-C15 + mov rax, [rsp+8] + adc r9, rax + mov [rcx+104], r9 + mov rax, [rsp+16] + adc r10, rax + mov [rcx+112], r10 + mov rax, [rsp+24] + adc r11, rax + mov [rcx+120], r11 + mov rax, [rsp+32] + adc r12, rax + mov [rcx+128], r12 + mov rax, [rsp+40] + adc r13, rax + mov [rcx+136], r13 + mov r8, [rsp+48] + mov r9, [rsp+56] + mov r10, [rsp+64] + mov r11, [rsp+72] + mov r12, [rsp+80] + mov r13, [rsp+88] + adc r8, 0 + adc r9, 0 + adc r10, 0 + adc r11, 0 + adc r12, 0 + adc r13, 0 + add rsp, 152 + mov [rcx+144], r8 + mov [rcx+152], r9 + mov [rcx+160], r10 + mov [rcx+168], r11 + mov [rcx+176], r12 + mov [rcx+184], r13 + + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#else + +//*********************************************************************** +// Integer multiplication +// Based on Karatsuba method +// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] +// NOTE: a=c or b=c are not allowed +//*********************************************************************** +.global mul751_asm +mul751_asm: + push r12 + push r13 + push r14 + mov rcx, reg_p3 + + // rcx[0-5] <- AH+AL + xor rax, rax + mov r8, [reg_p1+48] + mov r9, [reg_p1+56] + mov r10, [reg_p1+64] + mov r11, [reg_p1+72] + mov r12, [reg_p1+80] + mov r13, [reg_p1+88] 
+ add r8, [reg_p1] + adc r9, [reg_p1+8] + adc r10, [reg_p1+16] + adc r11, [reg_p1+24] + adc r12, [reg_p1+32] + adc r13, [reg_p1+40] + push r15 + mov [rcx], r8 + mov [rcx+8], r9 + mov [rcx+16], r10 + mov [rcx+24], r11 + mov [rcx+32], r12 + mov [rcx+40], r13 + sbb rax, 0 + sub rsp, 96 // Allocating space in stack + + // rcx[6-11] <- BH+BL + xor rdx, rdx + mov r8, [reg_p2+48] + mov r9, [reg_p2+56] + mov r10, [reg_p2+64] + mov r11, [reg_p2+72] + mov r12, [reg_p2+80] + mov r13, [reg_p2+88] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + mov [rcx+48], r8 + mov [rcx+56], r9 + mov [rcx+64], r10 + mov [rcx+72], r11 + mov [rcx+80], r12 + mov [rcx+88], r13 + sbb rdx, 0 + mov [rsp+80], rax + mov [rsp+88], rdx + + // (rsp[0-8],r10,r8,r9) <- (AH+AL)*(BH+BL) + mov r11, [rcx] + mov rax, r8 + mul r11 + mov [rsp], rax // c0 + mov r14, rdx + + xor r15, r15 + mov rax, r9 + mul r11 + xor r9, r9 + add r14, rax + adc r9, rdx + + mov r12, [rcx+8] + mov rax, r8 + mul r12 + add r14, rax + mov [rsp+8], r14 // c1 + adc r9, rdx + adc r15, 0 + + xor r8, r8 + mov rax, r10 + mul r11 + add r9, rax + mov r13, [rcx+48] + adc r15, rdx + adc r8, 0 + + mov rax, [rcx+16] + mul r13 + add r9, rax + adc r15, rdx + mov rax, [rcx+56] + adc r8, 0 + + mul r12 + add r9, rax + mov [rsp+16], r9 // c2 + adc r15, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx+72] + mul r11 + add r15, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+24] + mul r13 + add r15, rax + adc r8, rdx + adc r9, 0 + + mov rax, r10 + mul r12 + add r15, rax + adc r8, rdx + adc r9, 0 + + mov r14, [rcx+16] + mov rax, [rcx+56] + mul r14 + add r15, rax + mov [rsp+24], r15 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [rcx+80] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+64] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [rcx+48] + mov rax, [rcx+32] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, 
[rcx+72] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r13, [rcx+24] + mov rax, [rcx+56] + mul r13 + add r8, rax + mov [rsp+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [rcx+88] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+64] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+72] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+40] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+80] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r15, [rcx+32] + mov rax, [rcx+56] + mul r15 + add r9, rax + mov [rsp+40], r9 // c5 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx+64] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+88] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+80] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r11, [rcx+40] + mov rax, [rcx+56] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+72] + mul r13 + add r10, rax + mov [rsp+48], r10 // c6 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [rcx+88] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+64] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+80] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+72] + mul r15 + add r8, rax + mov [rsp+56], r8 // c7 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [rcx+72] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+80] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+88] + mul r13 + add r9, rax + mov [rsp+64], r9 // c8 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx+88] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+80] + mul r11 + add r10, rax // c9 + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+88] + mul r11 + add r8, rax // c10 + adc r9, rdx // c11 + + mov rax, [rsp+88] + mov rdx, [rcx] + and r12, rax + and r14, rax + 
and rdx, rax + and r13, rax + and r15, rax + and r11, rax + mov rax, [rsp+48] + add rdx, rax + mov rax, [rsp+56] + adc r12, rax + mov rax, [rsp+64] + adc r14, rax + adc r13, r10 + adc r15, r8 + adc r11, r9 + mov rax, [rsp+80] + mov [rsp+48], rdx + mov [rsp+56], r12 + mov [rsp+64], r14 + mov [rsp+72], r13 + mov [rsp+80], r15 + mov [rsp+88], r11 + + mov r8, [rcx+48] + mov r9, [rcx+56] + mov r10, [rcx+64] + mov r11, [rcx+72] + mov r12, [rcx+80] + mov r13, [rcx+88] + and r8, rax + and r9, rax + and r10, rax + and r11, rax + and r12, rax + and r13, rax + mov rax, [rsp+48] + add r8, rax + mov rax, [rsp+56] + adc r9, rax + mov rax, [rsp+64] + adc r10, rax + mov rax, [rsp+72] + adc r11, rax + mov rax, [rsp+80] + adc r12, rax + mov rax, [rsp+88] + adc r13, rax + mov [rsp+48], r8 + mov [rsp+56], r9 + mov [rsp+72], r11 + + // rcx[0-11] <- AL*BL + mov r11, [reg_p1] + mov rax, [reg_p2] + mul r11 + xor r9, r9 + mov [rcx], rax // c0 + mov [rsp+64], r10 + mov r8, rdx + + mov rax, [reg_p2+8] + mul r11 + xor r10, r10 + add r8, rax + mov [rsp+80], r12 + adc r9, rdx + + mov r12, [reg_p1+8] + mov rax, [reg_p2] + mul r12 + add r8, rax + mov [rcx+8], r8 // c1 + adc r9, rdx + mov [rsp+88], r13 + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+16] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2] + mov rax, [reg_p1+16] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+8] + mul r12 + add r9, rax + mov [rcx+16], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+24] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p1+24] + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+16] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p1+16] + mov rax, [reg_p2+8] + mul r14 + add r10, rax + mov [rcx+24], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+32] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+16] + mul r14 + add r8, rax + adc 
r9, rdx + adc r10, 0 + + mov rax, [reg_p1+32] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+24] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r13, [reg_p1+24] + mov rax, [reg_p2+8] + mul r13 + add r8, rax + mov [rcx+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+40] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+16] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+24] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r11, [reg_p1+40] + mov rax, [reg_p2] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+32] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r15, [reg_p1+32] + mov rax, [reg_p2+8] + mul r15 + add r9, rax + mov [rcx+40], r9 // c5 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+16] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+40] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+32] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+8] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+24] + mul r13 + add r10, rax + mov [rcx+48], r10 // c6 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+40] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+16] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+32] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+24] + mul r15 + add r8, rax + mov [rcx+56], r8 // c7 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+24] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+32] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+40] + mul r13 + add r9, rax + mov [rcx+64], r9 // c8 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+40] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+32] + mul r11 + add r10, rax 
+ mov [rcx+72], r10 // c9 + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+40] + mul r11 + add r8, rax + mov [rcx+80], r8 // c10 + adc r9, rdx + mov [rcx+88], r9 // c11 + + // rcx[12-23] <- AH*BH + mov r11, [reg_p1+48] + mov rax, [reg_p2+48] + mul r11 + xor r9, r9 + mov [rcx+96], rax // c0 + mov r8, rdx + + mov rax, [reg_p2+56] + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+56] + mov rax, [reg_p2+48] + mul r12 + add r8, rax + mov [rcx+104], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+64] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2+48] + mov rax, [reg_p1+64] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+56] + mul r12 + add r9, rax + mov [rcx+112], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+72] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p1+72] + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+64] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p1+64] + mov rax, [reg_p2+56] + mul r14 + add r10, rax + mov [rcx+120], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+80] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+64] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p1+80] + mov rax, r13 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+72] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r13, [reg_p1+72] + mov rax, [reg_p2+56] + mul r13 + add r8, rax + mov [rcx+128], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+88] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+64] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+72] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r11, [reg_p1+88] + mov rax, [reg_p2+48] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+80] + mul r12 + 
add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+56] + mul r15 + add r9, rax + mov [rcx+136], r9 // c5 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+64] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+88] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+80] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+56] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+72] + mul r13 + add r10, rax + mov [rcx+144], r10 // c6 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+88] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+64] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+80] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+72] + mul r15 + add r8, rax + mov [rcx+152], r8 // c7 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+72] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+80] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+88] + mul r13 + add r9, rax + mov [rcx+160], r9 // c8 + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+88] + mul r15 + add r10, rax + adc r8, rdx + + mov rax, [reg_p2+80] + mul r11 + add r10, rax + mov [rcx+168], r10 // c9 + adc r8, rdx + + mov rax, [reg_p2+88] + mul r11 + add r8, rax + mov [rcx+176], r8 // c10 + adc rdx, 0 + mov [rcx+184], rdx // c11 + + // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL + mov r8, [rsp] + sub r8, [rcx] + mov r9, [rsp+8] + sbb r9, [rcx+8] + mov r10, [rsp+16] + sbb r10, [rcx+16] + mov r11, [rsp+24] + sbb r11, [rcx+24] + mov r12, [rsp+32] + sbb r12, [rcx+32] + mov r13, [rsp+40] + sbb r13, [rcx+40] + mov r14, [rsp+48] + sbb r14, [rcx+48] + mov r15, [rsp+56] + sbb r15, [rcx+56] + mov rax, [rsp+64] + sbb rax, [rcx+64] + mov rdx, [rsp+72] + sbb rdx, [rcx+72] + mov rdi, [rsp+80] + sbb rdi, [rcx+80] + mov rsi, [rsp+88] + sbb rsi, [rcx+88] + mov [rsp], rsi + + // 
[r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH + mov rsi, [rcx+96] + sub r8, rsi + mov rsi, [rcx+104] + sbb r9, rsi + mov rsi, [rcx+112] + sbb r10, rsi + mov rsi, [rcx+120] + sbb r11, rsi + mov rsi, [rcx+128] + sbb r12, rsi + mov rsi, [rcx+136] + sbb r13, rsi + mov rsi, [rcx+144] + sbb r14, rsi + mov rsi, [rcx+152] + sbb r15, rsi + mov rsi, [rcx+160] + sbb rax, rsi + mov rsi, [rcx+168] + sbb rdx, rsi + mov rsi, [rcx+176] + sbb rdi, rsi + mov rsi, [rsp] + sbb rsi, [rcx+184] + + // Final result + add r8, [rcx+48] + mov [rcx+48], r8 + adc r9, [rcx+56] + mov [rcx+56], r9 + adc r10, [rcx+64] + mov [rcx+64], r10 + adc r11, [rcx+72] + mov [rcx+72], r11 + adc r12, [rcx+80] + mov [rcx+80], r12 + adc r13, [rcx+88] + mov [rcx+88], r13 + adc r14, [rcx+96] + mov [rcx+96], r14 + adc r15, [rcx+104] + mov [rcx+104], r15 + adc rax, [rcx+112] + mov [rcx+112], rax + adc rdx, [rcx+120] + mov [rcx+120], rdx + adc rdi, [rcx+128] + mov [rcx+128], rdi + adc rsi, [rcx+136] + mov [rcx+136], rsi + mov rax, [rcx+144] + adc rax, 0 + mov [rcx+144], rax + mov rax, [rcx+152] + adc rax, 0 + mov [rcx+152], rax + mov rax, [rcx+160] + adc rax, 0 + mov [rcx+160], rax + mov rax, [rcx+168] + adc rax, 0 + mov [rcx+168], rax + mov rax, [rcx+176] + adc rax, 0 + mov [rcx+176], rax + mov rax, [rcx+184] + adc rax, 0 + mov [rcx+184], rax + + add rsp, 96 // Restoring space in stack + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#endif + + +//*********************************************************************** +// Montgomery reduction +// Based on comba method +// Operation: c [reg_p2] = a [reg_p1] +// NOTE: a=c is not allowed +//*********************************************************************** +.global rdc751_asm +rdc751_asm: + push r12 + push r13 + push r14 + push r15 + + mov r11, [reg_p1] + movq rax, p751p1_5 + mul r11 + xor r8, r8 + add rax, [reg_p1+40] + mov [reg_p2+40], rax // z5 + adc r8, rdx + + xor r9, r9 + movq rax, p751p1_6 + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx 
+ + mov r12, [reg_p1+8] + movq rax, p751p1_5 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+48] + mov [reg_p2+48], r8 // z6 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_7 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p1+16] + movq rax, p751p1_5 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+56] + mov [reg_p2+56], r9 // z7 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_8 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p1+24] + movq rax, p751p1_5 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+64] + mov [reg_p2+64], r10 // z8 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_9 + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_6 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p1+32] + movq rax, p751p1_5 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+72] + mov [reg_p2+72], r8 // z9 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_10 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_7 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rcx, [reg_p2+40] + movq rax, p751p1_5 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+80] + mov [reg_p2+80], r9 // z10 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_11 + mul r11 
+ add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_9 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_8 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r11, [reg_p2+48] + movq rax, p751p1_5 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+88] + mov [reg_p2+88], r10 // z11 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_9 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_6 + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r12, [reg_p2+56] + movq rax, p751p1_5 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+96] + mov [reg_p2], r8 // z0 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_11 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_10 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_7 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2+64] + movq rax, p751p1_5 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+104] + mov [reg_p2+8], r9 // z1 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_11 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r15 + add r10, rax + adc r8, rdx + 
adc r9, 0 + + movq rax, p751p1_9 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_8 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p2+72] + movq rax, p751p1_5 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+112] + mov [reg_p2+16], r10 // z2 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_9 + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_6 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p2+80] + movq rax, p751p1_5 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+120] + mov [reg_p2+24], r8 // z3 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_11 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_10 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_7 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rcx, [reg_p2+88] + movq rax, p751p1_5 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+128] + mov [reg_p2+32], r9 // z4 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_11 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_9 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, 
p751p1_8 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+136] + mov [reg_p2+40], r10 // z5 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_9 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+144] + mov [reg_p2+48], r8 // z6 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_11 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_10 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+152] + mov [reg_p2+56], r9 // z7 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_11 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_9 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+160] + mov [reg_p2+64], r10 // z8 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+168] // z9 + mov [reg_p2+72], r8 // z9 + adc r9, 0 + adc r10, 0 + + movq rax, p751p1_11 + mul rcx + add r9, rax + adc r10, rdx + add r9, [reg_p1+176] // z10 + mov [reg_p2+80], r9 // z10 + adc r10, 0 + add r10, [reg_p1+184] // z11 + mov [reg_p2+88], r10 // z11 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + 
+//*********************************************************************** +// 751-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global mp_add751_asm +mp_add751_asm: + push r12 + push r13 + push r14 + push r15 + push rbx + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rax, [reg_p1+64] + mov rbx, [reg_p1+72] + mov rcx, [reg_p1+80] + mov rdi, [reg_p1+88] + + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + adc rax, [reg_p2+64] + adc rbx, [reg_p2+72] + adc rcx, [reg_p2+80] + adc rdi, [reg_p2+88] + + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rax + mov [reg_p3+72], rbx + mov [reg_p3+80], rcx + mov [reg_p3+88], rdi + + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// 2x751-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global mp_add751x2_asm +mp_add751x2_asm: + push r12 + push r13 + push r14 + push r15 + push rbx + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rax, [reg_p1+64] + mov rbx, [reg_p1+72] + mov rcx, [reg_p1+80] + + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + adc rax, [reg_p2+64] 
+ adc rbx, [reg_p2+72] + adc rcx, [reg_p2+80] + + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rax + mov [reg_p3+72], rbx + mov [reg_p3+80], rcx + mov rax, [reg_p1+88] + adc rax, [reg_p2+88] + mov [reg_p3+88], rax + + mov r8, [reg_p1+96] + mov r9, [reg_p1+104] + mov r10, [reg_p1+112] + mov r11, [reg_p1+120] + mov r12, [reg_p1+128] + mov r13, [reg_p1+136] + mov r14, [reg_p1+144] + mov r15, [reg_p1+152] + mov rax, [reg_p1+160] + mov rbx, [reg_p1+168] + mov rcx, [reg_p1+176] + mov rdi, [reg_p1+184] + + adc r8, [reg_p2+96] + adc r9, [reg_p2+104] + adc r10, [reg_p2+112] + adc r11, [reg_p2+120] + adc r12, [reg_p2+128] + adc r13, [reg_p2+136] + adc r14, [reg_p2+144] + adc r15, [reg_p2+152] + adc rax, [reg_p2+160] + adc rbx, [reg_p2+168] + adc rcx, [reg_p2+176] + adc rdi, [reg_p2+184] + + mov [reg_p3+96], r8 + mov [reg_p3+104], r9 + mov [reg_p3+112], r10 + mov [reg_p3+120], r11 + mov [reg_p3+128], r12 + mov [reg_p3+136], r13 + mov [reg_p3+144], r14 + mov [reg_p3+152], r15 + mov [reg_p3+160], rax + mov [reg_p3+168], rbx + mov [reg_p3+176], rcx + mov [reg_p3+184], rdi + + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// 2x751-bit multiprecision subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
Returns borrow mask +//*********************************************************************** +.global mp_sub751x2_asm +mp_sub751x2_asm: + push r12 + push r13 + push r14 + push r15 + push rbx + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rax, [reg_p1+64] + mov rbx, [reg_p1+72] + mov rcx, [reg_p1+80] + + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + sbb rax, [reg_p2+64] + sbb rbx, [reg_p2+72] + sbb rcx, [reg_p2+80] + + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rax + mov [reg_p3+72], rbx + mov [reg_p3+80], rcx + mov rax, [reg_p1+88] + sbb rax, [reg_p2+88] + mov [reg_p3+88], rax + + mov r8, [reg_p1+96] + mov r9, [reg_p1+104] + mov r10, [reg_p1+112] + mov r11, [reg_p1+120] + mov r12, [reg_p1+128] + mov r13, [reg_p1+136] + mov r14, [reg_p1+144] + mov r15, [reg_p1+152] + mov rax, [reg_p1+160] + mov rbx, [reg_p1+168] + mov rcx, [reg_p1+176] + mov rdi, [reg_p1+184] + + sbb r8, [reg_p2+96] + sbb r9, [reg_p2+104] + sbb r10, [reg_p2+112] + sbb r11, [reg_p2+120] + sbb r12, [reg_p2+128] + sbb r13, [reg_p2+136] + sbb r14, [reg_p2+144] + sbb r15, [reg_p2+152] + sbb rax, [reg_p2+160] + sbb rbx, [reg_p2+168] + sbb rcx, [reg_p2+176] + sbb rdi, [reg_p2+184] + + mov [reg_p3+96], r8 + mov [reg_p3+104], r9 + mov [reg_p3+112], r10 + mov [reg_p3+120], r11 + mov [reg_p3+128], r12 + mov [reg_p3+136], r13 + mov [reg_p3+144], r14 + mov [reg_p3+152], r15 + mov [reg_p3+160], rax + mov rax, 0 + sbb rax, 0 + mov [reg_p3+168], rbx + mov [reg_p3+176], rcx + mov [reg_p3+184], rdi + + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret diff --git a/sidh_ref/fpx.c b/sidh_ref/fpx.c 
new file mode 100644 index 0000000..b4156b2 --- /dev/null +++ b/sidh_ref/fpx.c @@ -0,0 +1,474 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ + +#include "P751_internal.h" + +__inline void fpcopy(const felm_t a, felm_t c) +{ // Copy a field element, c = a. + unsigned int i; + + for (i = 0; i < NWORDS_FIELD; i++) + c[i] = a[i]; +} + +__inline void fpzero(felm_t a) +{ // Zero a field element, a = 0. + unsigned int i; + + for (i = 0; i < NWORDS_FIELD; i++) + a[i] = 0; +} + +void to_mont(const felm_t a, felm_t mc) +{ // Conversion to Montgomery representation, + // mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. + // The Montgomery constant R^2 mod p is the global value "Montgomery_R2". + + fpmul_mont(a, (digit_t *)&Montgomery_R2, mc); +} + +void from_mont(const felm_t ma, felm_t c) +{ // Conversion from Montgomery representation to standard representation, + // c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. + digit_t one[NWORDS_FIELD] = {0}; + + one[0] = 1; + fpmul_mont(ma, one, c); + fpcorrection(c); +} + +void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords) +{ // Copy wordsize digits, c = a, where lng(a) = nwords. + unsigned int i; + + for (i = 0; i < nwords; i++) + { + c[i] = a[i]; + } +} + +void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) +{ // Multiprecision multiplication, c = a*b mod p. + dfelm_t temp = {0}; + + mp_mul(ma, mb, temp, NWORDS_FIELD); + rdc_mont(temp, mc); +} + +void fpsqr_mont(const felm_t ma, felm_t mc) +{ // Multiprecision squaring, c = a^2 mod p. + dfelm_t temp = {0}; + + mp_mul(ma, ma, temp, NWORDS_FIELD); + rdc_mont(temp, mc); +} + +void fpinv_mont(felm_t a) +{ // Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. 
+ felm_t tt; + + fpcopy(a, tt); + fpinv_chain_mont(tt); + fpsqr_mont(tt, tt); + fpsqr_mont(tt, tt); + fpmul_mont(a, tt, a); +} + +void fp2copy(const f2elm_t a, f2elm_t c) +{ // Copy a GF(p^2) element, c = a. + fpcopy(a[0], c[0]); + fpcopy(a[1], c[1]); +} + +void fp2zero(f2elm_t a) +{ // Zero a GF(p^2) element, a = 0. + fpzero(a[0]); + fpzero(a[1]); +} + +void fp2neg(f2elm_t a) +{ // GF(p^2) negation, a = -a in GF(p^2). + fpneg(a[0]); + fpneg(a[1]); +} + +__inline void fp2add(const f2elm_t a, const f2elm_t b, f2elm_t c) +{ // GF(p^2) addition, c = a+b in GF(p^2). + fpadd(a[0], b[0], c[0]); + fpadd(a[1], b[1], c[1]); +} + +__inline void fp2sub(const f2elm_t a, const f2elm_t b, f2elm_t c) +{ // GF(p^2) subtraction, c = a-b in GF(p^2). + fpsub(a[0], b[0], c[0]); + fpsub(a[1], b[1], c[1]); +} + +void fp2div2(const f2elm_t a, f2elm_t c) +{ // GF(p^2) division by two, c = a/2 in GF(p^2). + fpdiv2(a[0], c[0]); + fpdiv2(a[1], c[1]); +} + +void fp2correction(f2elm_t a) +{ // Modular correction, a = a in GF(p^2). + fpcorrection(a[0]); + fpcorrection(a[1]); +} + +__inline static void mp_addfast(const digit_t *a, const digit_t *b, digit_t *c) +{ // Multiprecision addition, c = a+b. + + mp_add_asm(a, b, c); +} + +__inline static void mp_addfastx2(const digit_t *a, const digit_t *b, digit_t *c) +{ // Double-length multiprecision addition, c = a+b. + + mp_addx2_asm(a, b, c); +} + +void fp2sqr_mont(const f2elm_t a, f2elm_t c) +{ // GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). 
+ // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] + // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] + felm_t t1, t2, t3; + + mp_addfast(a[0], a[1], t1); // t1 = a0+a1 + fpsub(a[0], a[1], t2); // t2 = a0-a1 + mp_addfast(a[0], a[0], t3); // t3 = 2a0 + fpmul_mont(t1, t2, c[0]); // c0 = (a0+a1)(a0-a1) + fpmul_mont(t3, a[1], c[1]); // c1 = 2a0*a1 +} + +__inline unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) +{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. + unsigned int i, borrow = 0; + + for (i = 0; i < nwords; i++) + { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + return borrow; +} + +__inline static digit_t mp_subfast(const digit_t *a, const digit_t *b, digit_t *c) +{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. + // If c < 0 then returns mask = 0xFF..F, else mask = 0x00..0 + + return mp_subx2_asm(a, b, c); +} + +void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) +{ // GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). + // Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] + // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] + felm_t t1, t2; + dfelm_t tt1, tt2, tt3; + digit_t mask; + unsigned int i, borrow = 0; + + mp_mul(a[0], b[0], tt1, NWORDS_FIELD); // tt1 = a0*b0 + mp_mul(a[1], b[1], tt2, NWORDS_FIELD); // tt2 = a1*b1 + mp_addfast(a[0], a[1], t1); // t1 = a0+a1 + mp_addfast(b[0], b[1], t2); // t2 = b0+b1 + mask = mp_subfast(tt1, tt2, tt3); // tt3 = a0*b0 - a1*b1. 
If tt3 < 0 then mask = 0xFF..F, else if tt3 >= 0 then mask = 0x00..0 + for (i = 0; i < NWORDS_FIELD; i++) + { + ADDC(borrow, tt3[NWORDS_FIELD + i], ((digit_t *)PRIME)[i] & mask, borrow, tt3[NWORDS_FIELD + i]); + } + rdc_mont(tt3, c[0]); // c[0] = a0*b0 - a1*b1 + mp_addfastx2(tt1, tt2, tt1); // tt1 = a0*b0 + a1*b1 + mp_mul(t1, t2, tt2, NWORDS_FIELD); // tt2 = (a0+a1)*(b0+b1) + mp_subfast(tt2, tt1, tt2); // tt2 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 + rdc_mont(tt2, c[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 + + //a1*b0+a0*b1 +} + +void fpinv_chain_mont(felm_t a) +{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic. + unsigned int i, j; + felm_t t[27], tt; + + // Precomputed table + fpsqr_mont(a, tt); + fpmul_mont(a, tt, t[0]); + fpmul_mont(t[0], tt, t[1]); + fpmul_mont(t[1], tt, t[2]); + fpmul_mont(t[2], tt, t[3]); + fpmul_mont(t[3], tt, t[3]); + for (i = 3; i <= 8; i++) + fpmul_mont(t[i], tt, t[i + 1]); + fpmul_mont(t[9], tt, t[9]); + for (i = 9; i <= 20; i++) + fpmul_mont(t[i], tt, t[i + 1]); + fpmul_mont(t[21], tt, t[21]); + for (i = 21; i <= 24; i++) + fpmul_mont(t[i], tt, t[i + 1]); + fpmul_mont(t[25], tt, t[25]); + fpmul_mont(t[25], tt, t[26]); + + fpcopy(a, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 9; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[15], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 8; i++) + 
fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[4], tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[17], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 10; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[16], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[7], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[19], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[25], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) + 
fpsqr_mont(tt, tt); + fpmul_mont(t[4], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[21], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[3], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[17], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 8; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 7; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (j = 0; j < 61; j++) + { + for (i = 0; i < 6; i++) + fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + } + fpcopy(tt, a); +} + +void fp2inv_mont(f2elm_t a) +{ // GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). 
+ f2elm_t t1; + + fpsqr_mont(a[0], t1[0]); // t10 = a0^2 + fpsqr_mont(a[1], t1[1]); // t11 = a1^2 + fpadd(t1[0], t1[1], t1[0]); // t10 = a0^2+a1^2 + fpinv_mont(t1[0]); // t10 = (a0^2+a1^2)^-1 + fpneg(a[1]); // a = a0-i*a1 + fpmul_mont(a[0], t1[0], a[0]); + fpmul_mont(a[1], t1[0], a[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 +} + +void to_fp2mont(const f2elm_t a, f2elm_t mc) +{ // Conversion of a GF(p^2) element to Montgomery representation, + // mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). + + to_mont(a[0], mc[0]); + to_mont(a[1], mc[1]); +} + +void from_fp2mont(const f2elm_t ma, f2elm_t c) +{ // Conversion of a GF(p^2) element from Montgomery representation to standard representation, + // c_i = ma_i*R^(-1) = a_i in GF(p^2). + + from_mont(ma[0], c[0]); + from_mont(ma[1], c[1]); +} + +__inline unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords) +{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit. + unsigned int i, carry = 0; + + for (i = 0; i < nwords; i++) + { + ADDC(carry, a[i], b[i], carry, c[i]); + } + + return carry; +} + +void mp_shiftleft(digit_t *x, unsigned int shift, const unsigned int nwords) +{ + unsigned int i, j = 0; + + while (shift > RADIX) + { + j += 1; + shift -= RADIX; + } + + for (i = 0; i < nwords - j; i++) + x[nwords - 1 - i] = x[nwords - 1 - i - j]; + for (i = nwords - j; i < nwords; i++) + x[nwords - 1 - i] = 0; + if (shift != 0) + { + for (j = nwords - 1; j > 0; j--) + SHIFTL(x[j], x[j - 1], shift, x[j], RADIX); + x[0] <<= shift; + } +} + +void mp_shiftr1(digit_t *x, const unsigned int nwords) +{ // Multiprecision right shift by one. + unsigned int i; + + for (i = 0; i < nwords - 1; i++) + { + SHIFTR(x[i + 1], x[i], 1, x[i], RADIX); + } + x[nwords - 1] >>= 1; +} + +void mp_shiftl1(digit_t *x, const unsigned int nwords) +{ // Multiprecision left shift by one. 
+ int i; + + for (i = nwords - 1; i > 0; i--) + { + SHIFTL(x[i], x[i - 1], 1, x[i], RADIX); + } + x[0] <<= 1; +} \ No newline at end of file diff --git a/sidh_ref/random/random.c b/sidh_ref/random/random.c new file mode 100644 index 0000000..1f6ce73 --- /dev/null +++ b/sidh_ref/random/random.c @@ -0,0 +1,43 @@ +/******************************************************************************************** +* Hardware-based random number generation function using /dev/urandom +*********************************************************************************************/ + +#include "random.h" +#include +#include +#include +static int lock = -1; + + +static __inline void delay(unsigned int count) +{ + while (count--) {} +} + + +int randombytes(unsigned char* random_array, unsigned long long nbytes) +{ // Generation of "nbytes" of random values + int r, n = (int)nbytes, count = 0; + + if (lock == -1) { + do { + lock = open("/dev/urandom", O_RDONLY); + if (lock == -1) { + delay(0xFFFFF); + } + } while (lock == -1); + } + + while (n > 0) { + do { + r = read(lock, random_array+count, n); + if (r == -1) { + delay(0xFFFF); + } + } while (r == -1); + count += r; + n -= r; + } + + return 0; +} \ No newline at end of file diff --git a/sidh_ref/random/random.h b/sidh_ref/random/random.h new file mode 100644 index 0000000..ab456bf --- /dev/null +++ b/sidh_ref/random/random.h @@ -0,0 +1,9 @@ +#ifndef __RANDOM_H__ +#define __RANDOM_H__ + + +// Generate random bytes and output the result to random_array +int randombytes(unsigned char* random_array, unsigned long long nbytes); + + +#endif diff --git a/sidh_ref/random/random.o b/sidh_ref/random/random.o new file mode 100644 index 0000000000000000000000000000000000000000..cda51faae5f5e4f5b168d342753b48fd6384e5fe GIT binary patch literal 5344 zcmbtXU2Ggz6~1?7U9Ypw#Zn!w53*#N-c7!OT^1 zIgBh4iJvENAQWn(#bltqZYJ>x9@)nkA(v$Vr@f z9&}WnG@8jl8)K-mAqT&N&=5Y5tgCXj=Af7_3%OfHq=9Bd@b~Jdiv+KqEOzTr@mD9M43GXj11QuglC9 
zBIp9Mo!FOo75`(BW;#nuzQUgRxtBQDQML+3nyAn z1)|%-@>1#akxTZ#W7dPCZ(KZLZyvP=@7VR%lN*Pgvl5WF)(BP|^}v?axaJ2CxLI8d z{IY{crqZbUnf01i&Rh!{Px_@+Gt;bI^^4V&x+hM3Vab`Bo_40(1MZ}=e=_^I$w#tN z&P3f?-*_aue+9VKTHqgB^#ebwmQWFvD#tbsO-)Q4bS74vi8BYB30TIl()6@Y)9A|H znOzs$tTe)w(*^0uEN`~_W@iDViWds?(CVTWhTf(qHP&Ed(V}aO(p3@qURea%pO&v2 zjs2i)fgLCo{b0QsHUd%hYY^jXd-*)vt@sI zdM&%pD4GzXbUNfyUlejw@vhk+C@X`qd`3zW%!GQLy!m)&n2dhG~0sv2D}YVsIO#?bW5VO=ov5tuCD{q6;j- z^=Hub_3HSsk0kQI{T`0hnY)Dpcyv{gEe=!nz>nMm?{o`w!rjHWNrLn-FP>UFGn-p1 zo;i8){CuHU$jvU!7xjyv`hfc8fY{hhaMwS?k0+qRz$2OXyoS|D7y+K`FA;9qH)1$e zhwX0=j^_ev#Q5`sn{m_!6X+JGYs&E2Nj6lN9-e* zCy5{s{Q6P!5Ihl&zr(2DJ{7;$5jW+B0mGQ4KfkBw6DQ0Srz{AF?T>@Nvw!q!mGtb- zBlcb9<00N6;Tc0|h^lVA>h~Je??qY|)(`i{tlv4n%$S(E%7@)h8lvh`CZ_x;?@tY- zAqwpCCWii>>BFZGe2-~qyX}*p=(YYs)F9X&*N-JJ*MAeRUimLle!lO)Op$4O9nfC+ zE0mwlDdxw1Y3BbnU>K9<39O>GDF0<6A&WeIAFO-%=cwtq6psh?;raUEuEjD3gle?SWz1+14p-nDd8 f(Ji9ebZO5-h-SEM3}eh*{@ +#include +#include "fips202.h" + +#define NROUNDS 24 +#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset))) + + +static uint64_t load64(const unsigned char *x) +{ + unsigned long long r = 0, i; + + for (i = 0; i < 8; ++i) { + r |= (unsigned long long)x[i] << 8 * i; + } + return r; +} + + +static void store64(uint8_t *x, uint64_t u) +{ + unsigned int i; + + for (i = 0; i < 8; ++i) { + x[i] = u; + u >>= 8; + } +} + + +static const uint64_t KeccakF_RoundConstants[NROUNDS] = +{ + (uint64_t)0x0000000000000001ULL, + (uint64_t)0x0000000000008082ULL, + (uint64_t)0x800000000000808aULL, + (uint64_t)0x8000000080008000ULL, + (uint64_t)0x000000000000808bULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008009ULL, + (uint64_t)0x000000000000008aULL, + (uint64_t)0x0000000000000088ULL, + (uint64_t)0x0000000080008009ULL, + (uint64_t)0x000000008000000aULL, + (uint64_t)0x000000008000808bULL, + (uint64_t)0x800000000000008bULL, + (uint64_t)0x8000000000008089ULL, + (uint64_t)0x8000000000008003ULL, + (uint64_t)0x8000000000008002ULL, + 
(uint64_t)0x8000000000000080ULL, + (uint64_t)0x000000000000800aULL, + (uint64_t)0x800000008000000aULL, + (uint64_t)0x8000000080008081ULL, + (uint64_t)0x8000000000008080ULL, + (uint64_t)0x0000000080000001ULL, + (uint64_t)0x8000000080008008ULL +}; + + +void KeccakF1600_StatePermute(uint64_t * state) +{ + int round; + + uint64_t Aba, Abe, Abi, Abo, Abu; + uint64_t Aga, Age, Agi, Ago, Agu; + uint64_t Aka, Ake, Aki, Ako, Aku; + uint64_t Ama, Ame, Ami, Amo, Amu; + uint64_t Asa, Ase, Asi, Aso, Asu; + uint64_t BCa, BCe, BCi, BCo, BCu; + uint64_t Da, De, Di, Do, Du; + uint64_t Eba, Ebe, Ebi, Ebo, Ebu; + uint64_t Ega, Ege, Egi, Ego, Egu; + uint64_t Eka, Eke, Eki, Eko, Eku; + uint64_t Ema, Eme, Emi, Emo, Emu; + uint64_t Esa, Ese, Esi, Eso, Esu; + + //copyFromState(A, state) + Aba = state[ 0]; + Abe = state[ 1]; + Abi = state[ 2]; + Abo = state[ 3]; + Abu = state[ 4]; + Aga = state[ 5]; + Age = state[ 6]; + Agi = state[ 7]; + Ago = state[ 8]; + Agu = state[ 9]; + Aka = state[10]; + Ake = state[11]; + Aki = state[12]; + Ako = state[13]; + Aku = state[14]; + Ama = state[15]; + Ame = state[16]; + Ami = state[17]; + Amo = state[18]; + Amu = state[19]; + Asa = state[20]; + Ase = state[21]; + Asi = state[22]; + Aso = state[23]; + Asu = state[24]; + + for( round = 0; round < NROUNDS; round += 2 ) + { + // prepareTheta + BCa = Aba^Aga^Aka^Ama^Asa; + BCe = Abe^Age^Ake^Ame^Ase; + BCi = Abi^Agi^Aki^Ami^Asi; + BCo = Abo^Ago^Ako^Amo^Aso; + BCu = Abu^Agu^Aku^Amu^Asu; + + //thetaRhoPiChiIotaPrepareTheta(round , A, E) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Aba ^= Da; + BCa = Aba; + Age ^= De; + BCe = ROL(Age, 44); + Aki ^= Di; + BCi = ROL(Aki, 43); + Amo ^= Do; + BCo = ROL(Amo, 21); + Asu ^= Du; + BCu = ROL(Asu, 14); + Eba = BCa ^((~BCe)& BCi ); + Eba ^= (uint64_t)KeccakF_RoundConstants[round]; + Ebe = BCe ^((~BCi)& BCo ); + Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); 
+ + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; 
+ BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; + state[ 9] = Agu; + state[10] = Aka; + state[11] = Ake; + state[12] = Aki; + state[13] = Ako; + state[14] = Aku; + state[15] = Ama; + state[16] = Ame; + state[17] = Ami; + state[18] = Amo; + state[19] = Amu; + state[20] = Asa; + state[21] = Ase; + state[22] = Asi; + state[23] = Aso; + state[24] = Asu; + + #undef round +} + +#include +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) + + +static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen, unsigned char p) +{ + unsigned long long i; + unsigned char t[200]; + + while (mlen >= r) + { + for (i = 0; i < r / 8; ++i) + s[i] ^= load64(m + 8 * i); + + KeccakF1600_StatePermute(s); + mlen -= r; + m += r; + } + + for (i = 0; i < r; ++i) + t[i] = 0; + for (i = 0; i < mlen; ++i) + t[i] = m[i]; + t[i] = p; + t[r - 1] |= 128; + for (i = 0; i < r / 8; ++i) + s[i] ^= load64(t + 8 * i); +} + + +static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r) +{ + unsigned int i; + + while(nblocks > 0) + { + KeccakF1600_StatePermute(s); + for (i = 0; i < (r>>3); i++) + { + store64(h+8*i, s[i]); + } + h += r; + nblocks--; + } +} + + +/********** SHAKE128 ***********/ + +void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen) +{ + keccak_absorb(s, SHAKE128_RATE, input, inputByteLen, 0x1F); +} + + +void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) +{ + keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); +} + + +void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25] = {0}; + unsigned char t[SHAKE128_RATE]; + unsigned long long nblocks = outlen/SHAKE128_RATE; + size_t i; + + /* Absorb input */ + keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F); + + /* Squeeze output */ + keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); + + output += nblocks*SHAKE128_RATE; + outlen -= nblocks*SHAKE128_RATE; + + if (outlen) + { + keccak_squeezeblocks(t, 1, s, SHAKE128_RATE); + for (i = 0; i < outlen; i++) + output[i] = t[i]; + } +} + + +/********** cSHAKE128 ***********/ + +void cshake128_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen) +{ + unsigned char *sep = (unsigned char*)s; + unsigned int i; + + for 
(i = 0; i < 25; i++) + s[i] = 0; + + /* Absorb customization (domain-separation) string */ + sep[0] = 0x01; + sep[1] = 0xa8; + sep[2] = 0x01; + sep[3] = 0x00; + sep[4] = 0x01; + sep[5] = 16; // fixed bitlen of cstm + sep[6] = cstm & 0xff; + sep[7] = cstm >> 8; + + KeccakF1600_StatePermute(s); + + /* Absorb input */ + keccak_absorb(s, SHAKE128_RATE, in, inlen, 0x04); +} + + +void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) +{ + keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE); +} + + +void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE128_RATE]; + unsigned int i; + + cshake128_simple_absorb(s, cstm, in, inlen); + + /* Squeeze output */ + keccak_squeezeblocks(output, outlen/SHAKE128_RATE, s, SHAKE128_RATE); + output += (outlen/SHAKE128_RATE)*SHAKE128_RATE; + + if (outlen%SHAKE128_RATE) + { + keccak_squeezeblocks(t, 1, s, SHAKE128_RATE); + for (i = 0; i < outlen%SHAKE128_RATE; i++) + output[i] = t[i]; + } +} + + +/********** SHAKE256 ***********/ + +void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen) +{ + keccak_absorb(s, SHAKE256_RATE, input, inputByteLen, 0x1F); +} + + +void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) +{ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); +} + + +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE256_RATE]; + unsigned long long nblocks = outlen/SHAKE256_RATE; + size_t i; + + for (i = 0; i < 25; ++i) + s[i] = 0; + + /* Absorb input */ + keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F); + + /* Squeeze output */ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); + + output += nblocks*SHAKE256_RATE; + outlen -= nblocks*SHAKE256_RATE; + + if 
(outlen) + { + keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); + for (i = 0; i < outlen; i++) + output[i] = t[i]; + } +} + + +/********** cSHAKE256 ***********/ + +void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen) +{ + unsigned char *sep = (unsigned char*)s; + unsigned int i; + + for (i = 0; i < 25; i++) + s[i] = 0; + + /* Absorb customization (domain-separation) string */ + sep[0] = 0x01; + sep[1] = 0x88; + sep[2] = 0x01; + sep[3] = 0x00; + sep[4] = 0x01; + sep[5] = 16; // fixed bitlen of cstm + sep[6] = cstm & 0xff; + sep[7] = cstm >> 8; + + KeccakF1600_StatePermute(s); + + /* Absorb input */ + keccak_absorb(s, SHAKE256_RATE, in, inlen, 0x04); +} + + +void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) +{ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); +} + + +void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE256_RATE]; + unsigned int i; + + cshake256_simple_absorb(s, cstm, in, inlen); + + /* Squeeze output */ + keccak_squeezeblocks(output, outlen/SHAKE256_RATE, s, SHAKE256_RATE); + output += (outlen/SHAKE256_RATE)*SHAKE256_RATE; + + if(outlen%SHAKE256_RATE) + { + keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); + for (i = 0; i < outlen%SHAKE256_RATE; i++) + output[i] = t[i]; + } +} \ No newline at end of file diff --git a/sidh_ref/sha3/fips202.h b/sidh_ref/sha3/fips202.h new file mode 100644 index 0000000..a98c542 --- /dev/null +++ b/sidh_ref/sha3/fips202.h @@ -0,0 +1,27 @@ +#ifndef FIPS202_H +#define FIPS202_H + +#include + + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 + +void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); +void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void shake128(unsigned char *output, unsigned long long 
outlen, const unsigned char *input, unsigned long long inlen); + +void cshake128_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); +void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); + +void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); +void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); + +void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); +void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); + + +#endif diff --git a/sidh_ref/sha3/fips202.o b/sidh_ref/sha3/fips202.o new file mode 100644 index 0000000000000000000000000000000000000000..2cdc2762ae09ed5a0eb19c38b1451f8dcc1902a0 GIT binary patch literal 98152 zcmeEv31HO4_5W@bF%cv|QG=o+x(bL#mViQQTLNfzHPM7iK-72OF z&6_uG<~#FdzO&2J%Zsnb2nGZE3kJR#@F;QvfsqqEZg0cx9moxw7AU&5XmHV>qJe`3 zM62H^kIw2CopnvmXnb;bNHjh?9F5MpF*6!34le;aDx5js&{@DE00jaH0D23k1t=A;Iy&p?<zl-+SApqMc(JU?1EPFg%rOui#Lr+8Lzcwup2ak#d4*5q(h(B^2QL8SIaBkQAu zcZ4?<#}7pdC!?|WSX~m?BSNVBebKilS~whbL41W|?=Frch1e-nkBs(t_L4B*E`&NsCJ4Ykii}N?f z_N|N7tqK)KcE|Exj?R*vkM|RY^$V{kiL5I|d5GT{?ir1|AC2sf#b1op)wYX8R>kV- zI>h1)uqYJUx1l)xeo1^oasFDtk*CoH!)XvQe_2WVgJ@(~EPq`r@^nf3tytZ%lS(2R zV(~RGF?<|6oxdu!uhFz@X-RxVaU51l)th6H)y1=JMc=`w6~!=WNi6bSaePhjtajm& z$UCvf)@a?*_HcQuuC}FV{!+Nh_+xc({wmm-Ssd8{JEKs!q&SXhHW>fB8qHr?5_t|b z6vtl@PGo6u{uX$nI1bJE^-<8MZhuMSuE?MP@9rJ6gB$wBk4# zzdBZz%!ut8gJ%SpCHX6izZy#-7^SaC@1koOtq-Mx 
z^TB>G7GD;PuZTs~nl6WdVr6Y{{?=IJeY83j{}`2I#rCa@#y6CR7dFJ;viyTl87Es& zNG6a+qVljr^S4GL??(5nk4B(yO*DVK3==$&{laq%uKP`#8QWD>R2PYO^C8Y`^1M|R& zXkkB@EM&3*r?-6cl#h+Y5tFw&8rfDHUymB1C>(h<*0@}rXFPI|P0_}sXlF4VlL-2I zM@a;&+ff|fR1$v=BT=5Z7)D~$Za9-kOolm8MtEIIRw@kr)3 z$wO9vhhCh!5SB=%EP>0-BNvrt=T*tV#0%%cWK01aqUdIQ^6KjFupWxV>oGX+e{usS}K6(v>Fl8OUxRJ~y zGHu8pXb@X4g6d>oVYcjtu^l-m^Jktst}BXVB3LRy@_=FrXc>!aD~UHq0~_mLa3+Qc z90MP=kLE9r#cO4fv*QV~$vzl?+>-ncqWQp>g!f0mj9_ZQM1{(VXLSe{=c7bpab#af zexpnUJ;f;)1?F+tBlGcou?}N-O>txwY&ZiB=@3KbMh?c}7@6b3yD`z6fQcbn9=#kH zn7yU>@%_bk0x|zRCnlLOR`*j!}ULa-9tS zIP#v)D9L|Eq$1ChU{c9HhzUPh-7j2|BmI{De027SvX+^ryCm|FSplN)%fixESk5s= z#(H5GVR_8Hx7C3|hh$_8k>>* zg|i!+T;X7@>pKBNo)C`!bQVwmP#|ClKu-a+0KEmQ0Eh}$4NxjzBfya2c=a^16!!~f z7RP7F(p?fI{KN%2CJL=eZX2^EQ+OjvliMoRgY|Z@Zp;gr5Z^28E6TxIncOp`QB=QE zUj)|Z+5ck3gXN8-r#fU#t*KXxQAQ<+c^CRcr$BX);o=0%^UeRE*w^#;!vZ`-NCmUzZQp z`5FU#6;Gq~ttiw0wVP4z1_;C|U(BrEGO=K9^2v(Y`aTm2>gsQvSol_QW#y~FFWd`W zA*|_DxiK@jzsqCch0<{=D>vCv^+gl!Iw&G5q}X0t?BLenV%tIau=*kW zR9MC`GRKAUOon{8Vue!d;qYpy1L|H+zE;^J4O$$oRaTQH5B?qD0z=D(6&BQloCV?C zTH>YTw#t`8$;08z%39;egOBjU(DGr01+78Oyl@V74xkDelY7twMz3r!gtanV9_zO- zUo^skR+gyu%OaQJ#yXM*PQCGAB88_|NgaO&(AfJvNm{5Y1`*I<7 zzZn9^*qfWI+kpuyCI+$uCJA{6woL4S@sARD0{4RXVIHJEb#k9@l=pZ;ROJ}q&R~Af z8FHUExsS}q6D4bmNCD{g3Lx?k+%pi}EEW4?VD#khXc6iOAzP-W6`FY0Q(-gk>PGEC zK!OLg#|9{QDlB6QBD;p8XB)+9$XSb)+6Ps1rDU^dEuS{ydR4+1?6tFFvWKRtG$!pF?$meQH1@Lh4OnF-*)iFZ95UWkEP z591L|FdHKPN+o-qdE#xiXjwDgnaummlPsBNp;c_AHIw<2c|s+#vt$A=53B;iIsns* z-8xyK9P)Z1Z?8CG2gcTi#%){Oq}QVs=ICC2QM2Jh#?-=kLeV#u!gx*~m24k33py1#NC#%0kCtP{P&k9{VJ zO=Y|8z+^zSeS*O5Rk=IYaqUB4xp|HYuUHk3?d-T%JRG;ca&Mm40~0!$Y&GPF#g~PB z_CR2!nZvQx?&>FlSgbU~P@QLr;Xo-kYKGzTHXJpPV*r(@t@t0r)Uz`D0Gqu9^aZO(~iHGq4c zjE!h~Pc*AesA?m#IX9Yx0BxyoYgs;*FQVpGNC^wp?)#S)L(R90ExzM0s)i?b!64qHQbw3&f^>Dsg%CeZLW+ z*Q?pp(>^$KC^p*++E}mp?CQ~=i{pDsWR|TcRV|{`4LA*X4gbBJ z7mcq1`y)9Rh&N_GJ{YGM@&wdnKYqo2SO|C4$zzeV0#ctwvzCK@5N8x9j*wW0ZA+tY-&YlZFSp1??-4V5v5H6PtvpspS;w^CQ504g(-J$FOV_tZgFwCyQ?Q9O*wKzOS*soQ*2(|0%4g&iW 
z6qp%aC^_8Fpr&WBp3MwDB3!fA!RAH+d|6!VZe=B;58tgIUX*Q)-N@+HrESymb|+DX zdboe7FN$D$v!~+OWWL#7*li2;2_HjbBljq}tLcJ@i=?V;*iO7y@j9XevqKtXHeTND z12VGFsc1TEvtsrsbDQ2u_Ck$wo9e8I+>wB@&zg6jq7KBf$(0p*AhI5dR+;Wqc4C{e z5({O)0ofkx#8PTvtHPZ^VzY;p8X@gs8>d4|JS0(ZAB~C+VYhHAmODu6?xfGe!p%;o za3gXGu(RTZ11lA3nShN&sg%J+&*l`q0hQRs812YkV7CF_Nl5K&Hr#HiV6T+N=yByi zzKWp1?0~STmx{6Ll@%K~vcD;K9}Pv^KcR~(MPP2Q;u5`Mxw=VQf=sqa(XbX+G$%?l1Z!L< zjEy36*P602|3c0NN74p#pmyELltPe4U@FW5l)=_U49JmjE?b*`0NI4}rUU}>>9PqC zW~q;fok=FbcnRAlOEeP{Bl}G3fv|tfj^_;*i@J{+M408}!JUWafx&Wc*Wr1HGi;uO z3x{W+4i+1onupK>dH66J5NTc>ng`GJSdo-A!CO6^<$y-?%O=4?Jq}6Ch{r$_fb{JZ zq!zXWdxc!d{@4y9=xZ|B+8FG4@+3QT2chjjRNq;$UAIec2<6(Q1C_gFjP}6(225>C z(U@J6Wh{=dJ5RH1vVokKkFDblX_1YdccRm@Z-u>Q2M#-Sm|dd{>6<{Vl8qS-V|K_! z)to{go|V~mw~Q;8Altna;8X zYlO-p)hJYFQmcis<+e=h@q$o2OUHPqxH*v|xhS-puB% zRrXPvr9Eh?Y?w@0yKj-Q^K?&ErosD8k;|i;-NH1P@gNzpt1=4A*5B;Mu~4XX*<`JS z6edOS{c5mflfiI#B#zklZ_I#z%D4aiGgNN+5pktlnzA1ZAvDZ+Wg%Ya_ z^@dPlilN>TO6oS$7NKNshc!-zf)UR_4R4|ywwrfKT_)BZh3#Cq?54zoH)ZPA?X$Ew zmpt2CSpcw)RyjM43=bP;TUND;TmgC3*G8@}%~xqf!9g1*yyxX|ueMvK@oh(Q73F+^$e z!u5vSg4h<}+x_9B&F!qY4K}wx$>o+rJZXeifs92E(qJr!u9dcM4#K@m6U*<4oqyd_!j8jw7!_P2PWux!OmHkU1a*^sWquh?8x{Ho1m#hWb2ieCdM z70X6KD&ArXqs3+zNQ*ZM%U1ll$wl06`q}v74O;-MHr@WFA+c|;RIG)D*OFNk?}2H9YfPeYeDBWpQG4=$d0m0HQcnK2RWwR1FG&zOFsgV!1=h`%ElYCLfq2w)1+r1A?MTG<&>T;0DyK%#u~&AjCuDX&XXrfH0?$4;r77az5o#BPH0&i zUT8)9?*Jsz9jY2J2|>VgQIR?Df6A_c5{%cX5_gziU=I8bm!E7;{ax&24*cz!2}?A- zM@!r}ygjQwTz;}W_4fy~($L*7zy-80RbzHR=B>fu@{{d>zYEjMTLT+8NeLUDLEStX zK!PP*R``P1{RldQOj!T`@s7!cW@cY9&mO1l$Lp2RSa(`r@zKDo7Hd9YrvrD%Ii1Hu zKqb))#@{Giy4Oe~1i67Y8kT(rD#3X>0tORWnGCG_{tOdknT+0tXPAJ?WJtu}Q)OZ= zlTm(nh7H3Ii!}OA>Zi*IHO#QvBSAQuk6=T9X#|IfXn@3MSbKA@Ls*lH?061?u?Gh? 
zwBdOlwc`Z|Zt-qyhwO=L#Ni$B+mOnLiK1Mi1oEGNvQfug8{9~2oafYiEW&xJ(}sUm zXJ@hU4X4k<-3L?-5gu%M&O}Yxm@EmB`pB`iyq1w{#LfBgxI#Z(hT_l}xd@;o<~HwV zOxzB+m+`dPvu-JXcOENue1Mq-!Bu@-dzfxQaL8er5f|Wq8mFHUSDK9k&W#c0O4w-R zY~a6TVvkqri?VjSFLk~wvU${kNfKI6rN;RZpBhnHJYNPID%)hOm5{Umi;$=&Fy`5} zMMgJj*_3=p+uekAB8G*WdEJ8wX4;2Z+LULF0ko&o5<}l_Cl2fY?vTg`Qk|vZRmifz zmMdeoi4qZ8$I1&h89yjeAVQ+;G5dOv-W8|!6m_p6?%hHxrQC*Itk{J~c1A#FpWBR3U2`j>+3@re$L%Td#5?GR;H4>>u%N zTQZ~W3TgBc7#fX>>5$2j0-wscY}RrS!lAOBF?%hLX7%Rr zax~IX4uea2?H)g4TaRYWY1_iS|I~BZ`@^z3!SFlcIqk}6h2cq_)9oR~~Ips|~r-f=4;x%_zYwE{`iDBhJiM^O3 z^&dN@b*H`vfUr({!RNGY^x#yonIl`X^5D=@mSCJkqlfk2Hu>ZZytvR+_7I{5zU&tJ z0KifieUIvR_yx1)8KKvaK-GOSWA~y(pAyAWYsy zp=8&vOxp5_P_k86h9E%gwqe|6SR|P_sPbhT!fBlU272W16I;!;A-iPj%@@-VnrSyI zl4(!F#DjTMY$G9a&e@qf8-Gawyxb8SQa!9SRSSN8_dy@6j0JaTSF(X+YbXC{n2^ z94*;WA+FGr?Yh7OkQZRDj6srzcthR~%R#L@V9k*Pqt?WQalG`D_!5V+Gs8T@)x%lz zA7nBSY>CY9aJii6)v6wtj|}11vsx2FiV}NPYc7gNu05-jGrXmeYtL%s-9O^E6;O|} zLw7qQ^(fEAiSad0cEsS@)^DA>SnnN(-n%z@=B39YfW()UaD8{sfN1=~C@$*H{`oY> z;Y&+8xJ)}TbRVH-ZPDjP8_ zgO!aqlt7W9p-r^_7@woM;{8rpB~H^`#K~W;ilvy>aAvl;VS5eG3n()^_9c?Su34aYv(xfa5$I4e zc&&g+=Hjq(Ze=5+G8;au0ek^4r(su3a(d+kKq+F3>yy=Wr_`K_Xh#Mp+Gz^yG))l& zfrh=N%5~TuOLI)wK5e_cn;hDN(>XB%`&(Rdj#oB9B+zisdLa)Q@kC6oe8F({)+8r3 z;YFt{9_)(Oe_54loX}cd(MI7;#?BJ9dS$_Ih4-Rv(KS?>5yX{f)ZN+CC(c>d*;*}i z3m3QPrLwhTPi#6|*K}lgVrwXd+ATXzN;i@%rD2tLuurh7SYouPDdt#a9mZ?>%5^=h zQZWv@@XE%Xo>?-nywEc_zhPyK5i(V*MX~(KwJIh<{^OeD0o*h9sswq+RtuvG>bH#P zdI*}=V)CurV5>8l)<9F2$~9hv8^9Rdu&M@wLvAjNlbdO16Fwu(E_q^J^eV(M*wkd( ziE5tq8k+|tD~<1Q$h$}UFLzC(zb4*=4sR^SyXju#o9c^3l|PqkTfQOLIC58gQLmB9 zG1HeH0DC0vjEsb%ZW@X6>oMgxV(wLrlb~MUmz6g_9C=0M+pvm{T$SA4wkxQS+W>CI z30(QM6R6(+n!VNzzj#^);v({h`gEy%oRJ?uW#>M^T55#*@_n=+Y4%{Ws_a= zfZVb}C9;t=l^9?uvDfOPG%7I{>x?Q?Vs6kG0+pD{bF9Q%ofCqU$UV852ju=7Dv{G} zQ;7kl5_@4zD=}B*Sc$nC$4bn#I9BoltHxFGfZU0zsg`?isKj1tgS!kcmDsCrT8X&~ z$4bntH&$XUys;8<-AxF#MDDiLJRo=8P{~3hrX>cLN*=N->#mD zec2*VH3M+u;8oQ3P|bjY(dwPS?3rSBaeUWSB!S{yuV+_(U%pr~+YE$QuVi-h458y2 
zqqBb^ii+dg?LDzsC*WpRX0&=K4n3Z$iQ*#KyP3UfMjrqLm;;ClYXWltKejLr5MLz| zZf8K41h@@D<=ujs=pk?pMtdzA|FALa-U=LbM62tg_8r;#%VZVxzV9mE|Bb~D+0}G~ zSxsZ{cX4a2J&eRRW902H8#7&eO#eSW?!&8yW=#}2*12-T!^RK z>V}NFS@V>Q)@8v){sQ|#A$840fd%%U7HcpFy$dICZ*MflEZ!)6zqtu_Z2dmAeji)E zP4p#k(EsH1J34EoxyowRaWmh`gn}vl-(Q36`YRPozzkwmWV04y)xSN*R8^x@F?6VT z;NM)4YpTtybyPXcRVgd81RD58rQB96#|~R=d!p z<^kWLEpp{G5BOGY32JH{NLjr_Y;=up2^W{th{?#ymuu`)H?P&>p~4O7X@?YnOA?MX%P>aQi-(jV)l^n)5e*@@;hybTM3m8*fJj5RN->oeFktBL{4VI8>Sup z4e5Hb3wy&B>(jpLTiA-pHF@hx*`OdV3tzIu{tLUJF_qk7g0Uelnc1+XM#>(t3YG@D zJ|_*rmUMchsOCm?H8vVuT8BjeHl~M{@@mtqVA`hC)GOOa*(9Qw>&-UupxMuNPSIBH zZ0IaDsHE|7XU~SzQb)7F>_*pP?Py!MUbct0N(g6whBFFm6U7-|$iCB}Ts zeV=!d2c$RYp#rf^iyP~3cJSZ5h2Qp|3$UjQi?wE>jE6W?5*KqYCUs_n`l8 zD#zASHlsf-$I17|P098n-$<@3KagCBo#d0{AKESHlOu6Q^rn$=u?b>zvLl^Xj(R7K zY(Q&<hA!nzfwh{BTTK6#*gRr0{d zrG~Lk82h`}dmvmGWs=Y>}}N@{!BQBvD8j*{A*C1q-RUMT;nI4}I16+msP1=-pf zS<2M5#!*t+T1QE3>l`JuttVw_dqF6!ak3Gg7v5k6P}@d9wzd~p%GCCfqolSbM@elj zJ4$MMg_NmnlTiLuZ(jISD}dTI3$nGnW|^SGlUp1mwY}~rsqGC%No{YEGPS)8O7>(> zQ-H4|A^$DOpN(&C$leV9Ew7C9-kWSOzWm;u*)y}Sea(LS1TZpcGP7rXwExhd_|EJn z2A+R#M0{Z9gWI-&f$v?(k2v&>&YplB?b~>7pBdkVAJjMjXk6z0Xcwd#aIqlTr6H)_ zdfN6V%FQ|TV_c-$>U`lNXrp4w$|pL`E7(Md$CgY ztGPeC4{5_pa|j(ox(DPUr0*dW`U6crM9Kw!yIJO=@mFQ5zVpxW)$!flkv0ymqxY`C z`E~Zp1P-lb=Z_HI?96d!Er$Rg&xN{x99oZ<SO2CSWfIgOy05cY8i-Mp)i}xi_l~R0FJj@${4>w1VY+`ZvAgMvBrV)Q4g%;MElr@weMx_u@h^z0R2xtvA)=xSl=Kg*2f=yV!c20#QJS8 z=6uoT)x57f?jOGU%5$-Qlr3-Z?B7mS-b7dL%_Qo=@2bF@tt!25d4D#4@)9jYFd}}$17x@TF?KNq+wQMdC;5#1j+F0MqGue zZ->jupEWm^fky637L_l>TjG!Ka(QiXLfh&IxZsJG%K!zo7~e1c%bwl#FNS{vBYqI4l8dlUx960z7u&bY`?kp2$6Yv(G}m4MQS3-n8E*hk52A^8#zxR2ROL}o3R2&LvS?6I$}d{JLtIkp~PhW-Ek z^_BnK>w#GcWKFQkfcfrqAtR|T;un8iYt2F%%Jxxl=HA6^;xzMa}@ z%sihX%fyjqcv&juqg-eIN|JVQ5>nr)!S^4KHov;a*C6ga2h;swld8O7iLZ^;?Koam z0nCWe;6{11V(wNh8-Eh+R?2kiyIYCLm6M{*zO!-3#7t?sB`lVUm74+V#may=Q~ivK zl?8TAv|s=AE>@bU(wiLriHns|Ki2+Z7c0%Z&wuY?rTEfBWB*?+R+`%Wi!W9l!G_dx zs{dXWD{VLZAHP^B(=KLOT+pix0ND$Abpkbv{`W3c%0H)u8wN5>;;pybtBeJc)gNcZ 
zH(oWn-MiV<@>7P@ueTU~`8h*F9TEXtZ_lPpO=9T{IdOXa!U{Q^ETFP4aMv5yX4z$N6A@{c(+-_!B3#- zlWoI{sxSC{Ud`vdpLhBn((dOSJ74+V_jT}N@$|8H8f(k3^A)*ZX1-_qKlXfu_q`5! z_q}l0Oa3iKd$MBgd&T1WqC8wN=PHQ$vE<2hd%5T(H}P?yOK$kf#V)ztUxQRG_6seS z_$8I=`@HD2g%`bK|M14Iaq8p_c4R|>Z3s?m%w@0j*)xMUslob(O$C4~GjGc7g&eKaKTGQ#sm^Y4uJ!6iE3WS}LvOe}$(6 z0xbgamuZ^X+F;+u)@qN)$T?n!j7*yZ@dXf28D>WiYH{4CJI5iqeoDOj~&g2S~Khk+A^qn?4&}!73qO4O&HpN?427(u6 zT!`Gz+}%@Jj|&7Z%J_<*HcvirR3Lb<>A*JCT3nR|f{~2f@OSp3%Fq54!Gi>I6y!*m zwt4OF(FGqTQN^FoRr1gEoE#jKad#kqEc>^pEqNyg2WC_vzbs=iKKlp7Ic-{{^l-3y zMipFjN>A{!FVf8Hvk7t)7%NWgqg0za4b3Xy&~F7tik52dPnm=dD}TQebLNTJYBD<3 zZ_NTTa-geMMy}K!G&Y?gHf7Tc`PD7+N8zKI6oL!0#U%O57PI8<^O+VnJyS-kj=9q_ zM}cT5LsNR!jLO_VOOYrc{q4XRAOpd3GlEmV>!5@TG-4SwXJW*hn;{aYLKNCktuun_ zG`F?MwPjjo$Ur<(24QP62+0;zCsy7zIuLA=Ayvp=%eJ$HeE{Ay&$AfO(>(xd;9@)A z-whuA0K6OIh-UubBi_rG5x+a=AMtyF9PwvKb;0+Xu8uJ3{oo4W=J~t&LU#2h!B&9T zGWg{0CV0*ike}{(>(CKl}Nh@KK+S$Nmk|>}=cw4Q6)!wa%-_U{Lzhj{c`0?+J!*f>6s`aK0&IrbVU}|A1b-3i1>b!^!|fcX`d7_!)A3=C@sXzE zS;0A|%Z`~@zCk$44Z>Oe>G)nZX1wWG)=!y^?@gJG?@bvY_oh$BYT{>1$LE-#Fg6%0 zK}~0l3*ay4u`?#0^7l5Lc=-!1kf8Kj17jw<0zA)< zS5dfA(akpnf+Z4l$sfqu4kuB+KliS~$?=Dg6;O4xr>X*~{3KNQNvQIZP~{_UhpKBl zRkuTxpM)wu2~~a)s(j=(p{mqV^-ZYqlThU+p~_D}m5;21sxnViB~^IC!xwm$~YM8sgiNvC!xwuLY1F{DjzB1;95_W zi~~OjRelnx{3KNQNEru1JXJCd{3KNQNvQIZQ02%#XL;&^`N1FJN$oNL$N-4si~94S z>6~dIa(d>q`0Oe$c&cOtWeSrLr^@7UJ|>@BiDRXN0f7r-(&FEy(<}-ukU%thm}X_) zNicz6rh-$ytKh=<0HL|ls&a1ySkz_#>h=Ex%Y#Ej=VIlX91^m2**<0N7#w#D(vIk~q6E5O5omv=!06iRun_OB~6+KF3=W zT=1HhvsJUQKOnH1s6gOi{~lM$1eXVMWpBIaC>vWlg8D#8{+c^kk?#H%!jvgS;RXJq zC(p5?(F@r;V#;=hPVN|r2UDm14Ud`pU9?ibVUGPebfh{qZReNb*aA5&;U5oif(x>T zDLZ=v!B_?9PVJOooLcj5Ikh7WKK})$st1E@gXgw6xmBpm8Ex7IPdYva6HZ1r(5lr} z!NUBZ5aXl8sew?NFSYt|s~9@*xbp&%vjMiPAVf8Czuxnzt-Lq zJFZiE(ce4wM}68$i4%^qg2%;9&On_wJPBn&yOOP;h9uOS=#dy$X9goD_YV#_ZAh!3 z$BC-!;{p+*B&QX)*tB*BjUJBA2%>cnW8lf(nGppFVP5#@RQ zN<~q=U$-#Mwif@OPkXWUoQr~VsjWEdbbBCyWKn6OEm0g)g{BUFW*^LgQ69{2~F;(u@6F7YGy_Rs_#d`2_nw{YcYbU=ZPn 
zRW>UaxPkCX%E}5%31rrsRTVfpK~_w&!hxxQeckt6_`!vl{HoVknZ5U2crKWMOV8C( zl)g$!o%@-k-VY8sJ3;YcEp}c2$3sW2&6K134b`?uKKseyyNb_ezg6RX(v2IP8wV27 z=3wAPLYmV;%ukS_1rYQUTa>Q8jOtlRbd@EJRPY{WSAW8@{H?ypLa@+?SmOR7jl zpeU_Rex>!is%b``AVJ(`k~KIZ5J?dK+VQ$2h)??#rx0+*)Q z!RO&-*T}h>CnreF)<-xP=uh|~WueO;Na}e{knn}W1W9f31PQ&tD;iV#>a7`;bDb8< z2=q=6vt)A)epk!w*EAR?CHzEL#|1_QdY+pg8FHqT5uTtdk?ehLf@E7yk}%gN89g^a zvZE(Sc%HIaLT6^)xe3x2dZL8g{i2=EO_2VwCra4IFIsSJf^@0lj|pUgPns@l=INm4 zC?Y*QC9PkU>40f9m4mP#K|I>=A_?LRj@K;hc3KPWic}Adnf|$pS81mE~I>$$Ch$KHvIC>0fB6&$+%+{9!HjdDnN!KQ_iT$!9-VJ;&Ot&u71t^?j=Q z+Bi23B%H0bV%a8~qpX&JN<2{=5~S%%EV+biu{h|cfm6|0h2ahfinGC;B!W143}g$k zK!bV+CJ^pa*5@>V;+@o?&_qZde@;z4_gbjVJ<~b4jBto!4J2e`%`fKxEpV|b*q@LJ z5OWavb)^SBWvXMK(^W>uZVU#lCuA3$b@cvB>&lLw5z8jng&fhSlaTg*R-LIHAF3_l z^B+P~pEiZ03N;hpP^8*<==vj7<%!%t_R>JsP$Yj z(w3Han)pG-izJ8-IbOE}@m-@>wlG0F_IC0vNf6VX<^mjcDPCCVW6x_d(V6Y`O> z0S>8-?g8vi$Oq91lDgOvB;+y4n*Z{ElA~0S@W8o+#mbzbFD3(!X+i z8{m+p3!4dW=sAisAK;wvP1{qqXetAkc!lFd62uqX$${A|LHu*aD@+hi7)`lL62u$> zGJt6vJ5%ZK`AK3O%U|%w9{#p7qCeq9x@2JOAoS}>U-i?C7(yey>U3RC z=+~8g+_6CIp#p>+gdtCt=S+*{X(>-6ZP}v;wP9)@WU1!kBNlvE>%fGA;|@|R8w{kK z!L(N!I82WsLf#Cd}#SC}B~a4mV4B#7B0i7AM;x<~Yq1Th~~kz<4Te2IO;S?X1> zkN8x_>y{wq(;=}3F&_hoJ%~BpCH5euqb2qr<_H#Bh&fip7GjP{iGzqa{=^ny4m7ca zm?KJTA!bK57YA{vUS4kfjtRYXa)Q){%EdS%Jam{Ksjs;(s*KQE+C-z(?tT8?28&zW zXj9oM5KnZxq6G2Vj@KtaJgA&y`zDB&I$pm7F)e9sO8%+Z@oP;nB@_NmSvt{>{0}Et zM);&p5)%!{T2GR2xla-k4aw&`Ny7EY;zUFGWlxmwb-ySk8q(W6QNlfbQA{+XTc~?m z2JA#bnq7hcKzNFZ!c&Cpm8G5{ndc!Z#_kt0~MYktae&U3BRo@^%TkPI>|D^ANVBUDUu63 zNy12e1)>EY4_EZq=aja6p z50#~!BKe7vEF%o*BQKKh6v-2mjSeAf>yw11NalKygdLSdPmw;)6D7RRFA7hQ?(T^a ze%UVyPm#Xb@vWywb5cVS2yb*+O9^jLmU@chH=JY{;TWGJJVkPXCrLQfCkaoHyw8&) zoUJT+iu62Bl<+5hQFw~<&plDX-}pu0Dbh8hHC5stwwjBX)Jx1|OX?-&0wpmAF_$8VIf%LDNX$XZ#YJKcVtQNbBR)l+U9p9j z3xn7~%q2iF&Eldz|Ru@}{Iem*Q#GH@C7Gh4wVhb^+Ua^IEfvdM$f|%2(_=TAB zrr1Kv2~un!=8Pw{5Oc~CTZlh#^+poJoW{g2#JS}hLxl-q&P-wpF{dE0g_v`S*h0)n zMQkDF^dWvBp5W^3mLTSoAbug{voE#~^C1^oi1`4EEw)9z&&w<>)D#C`gl+oTEF6Fl zUal=#+x 
zk-Fw9u4~E&%N=VVAuVeD;4Gj8&U6L)6H)@qN0GQ|!V#uTT z`bg2tX5RdDQypK`RL=T{7du|x1Tj_0SsyXgH@C68LUmlDDYi?51C*r!8_A(gvW&3Y zCy9WK0NqCCni=HImW@XV+q_=ybgnRs= z@D%9}JW;}r{i5&`=|hfhJw=*>t(nj^bqw3HzAID_{c6wpNb!NOXMLntNwa5thx8%6 z^egtUlh}axCyp0M5C^_Wxo!#K(T-P`AikqFwNo+vOM^)t5U_M`B zAMsFE7VjAqk8r$h31U7S61x!dF_74WnB!ey7h<|r>?7s~7F&opRwa%ie#U1DF~^_S zLd=0Cwh(hfi7mwJ$mZfGt^}v(a*7=bPUcAUb&G5nA=h6kNa}l@AR$)_D`(U9!mNfMr;EKW3}&-X+LBYsg#G^BfaqJ&@bi(;Z7 z-QV%;L_?Zgg2ocw;E)gZ!bZm`C0ws8^%TjMon#r|>pn?%isW`r zl5met5}qQt-;*Rfq%3;M(fS~vms=`4QBxxdPmylti4vaS7lo%tU*!1KQ>6L$qBg>> zIjyCHeU+u2B6+2gEF&!QNy1Ykhj@~NH~J*uDUu^RNy2X`i=HAq&J!j4mR}T}B0a+s zC5-z;;VIG!9N&71G$%E9itzVNYboKQ%2H2}T;e3l2>;=egr`V0c#?$A`Xu2glIuK4 z!X{3?eqJ-Q0qVN>y-JU4nKEEhD9=7cPMA%5BEizJA* zJ6^X0@qWiEOc1x!0V}Z>aXZKBks#(+5?hGB=HvqS4xUz)_;&G1G zJwePVNc=*~5iGV4b5aq%5OewvTZkK69o-Vd>m08zLCj}g{6fr!Tx=ocQzW*eKEVsS zTh8w^#aSQWVr4adFE72|z~&03-x^CVSk_#@^jl*zy=BlMm`cbW2+0V{?M*SJAr+^r zc%OMpJ1RIwQ=DHAdd0@$TfH{FPSa3e?juu4T$W)oar#1dg|b=%CI|8ogO9TU_h@)AvttHY@0>sZO@?1xG@f3)6FmPibN0VfwVH zUus?Ndvy}}>YP@US|=AUy$B<960#oDnM=IG>*ty(61O>bjSI}Jse-%6sn#55B`i{_ zGXoS}q9*^H0p)9TB3E8?<>e7pCVC_tbuZyXI7pxYKpscwxhIa_-ig zVBk8!Nsd)Y$X^V|46yuKZN)~fHH5y_%t>p_Eoyp&rf3ZzTY%PdCZ+*s&HO47`~jfM zz*u3UHRRmuOuvpW?pVVJf1)g}d@KF2tTz17tiW(WUu)*4wMM@nfE}v+Lu&}x0<@-p zm@|g7#!Vkd9cw5{_?%+IoR!Gojz+scM=_Xfut1fjx=&u*1NG zRV0pc9vnh=p7S8&THXz7u23r;^{gTES+g+BgX7h%iJHO-gfuD?7+>pVVBc)8sES0y z|FA8R0YT2CnuFm@_+`f`BRuS3HRXqEGe)`Q3?=k6XVDRxiTmBy!>Uy)olVY@ZlGTGoLbiQ zcv}@-`4IXntffKD!WoMAB;w?X@M(Q0?Hj>P3DRra!&=H(^DE68b9L-&~~|tFG1F zzup@KguY5Q9qk9P9V#gXv_;_0R*U~H>>P%KZ%e|p|1L1l%>c`=GE}asjUv{p!ffc;rM*R(h z^j1qVz~^Wguli3EGE~P1C(P7_WCZRB5Vuj@@d!ii3BWgGeAzi-7~$2avOClKzzERU z!@Ai+Fo*EBZeF}DUD0(;LrQyC8cWzB*YAH<%h2z;tKL(qHaq7HC#3T*2Yr@)+{$28 zoVB5S+EfyZ_%R7gn`+kUeQDlci0tZ0_r^e0!kHG8?5`qGr|mK!?|vD&XFp4@|<)CCzv~7#Nmb0i$hK4}aoyi5u{P2pP_u zCcD9wCK8>C6|&RZjrfqw9R;{8JmLP95l{hrN z2(M9A3(RJDiTjn>=mdunzUlW0{0!e2O6e?rD17I}Doe zX&5#fgA5!n?GO%iBWVbuFfQ(BB;i!o^JS|PcJz9R@FHckGajrAbdrf&Y+r(gB)+V< 
zkGO&ZwkaIw`lyufPGzAvh@Hm-I-xmy3fQ$jbj1b{zV2AVGPPsIyFN=>6xcg|bzL!l zkRc$tVp>&8GgwEXC9-VGJliS3Rh0i=1$@dW`3cuM!w4B@qdAyH5NH>A&EXQ*%5_B< zAw!2?U??HG1QQNnsq39Vgfo<7ADY!lt#^X`3A?-A7}iJOgC1+A!mm2F4kdh0StsJL z^fiX;T34c!kijW@fN&K)h>#XYtxhA)jxB&anFL|ZhOs;_a$z=j=Bb?~^ zzHz(qddIq6{)I~UJJtE7KOqlSWf8_RxJahMg`5*7 z!!!!4aAiyPDU3V64x(Q_bgY5=Y0z&wC#FRoi`7#blXBx9HDmA&)qMfq`dhsjs?H{ZmW*T}$!Y4=3*U z-9+=(*_BEQ7&!=9YL6p2d}Qk9UbItfptf^8Z8cLl<5z1pA)vMEaDp7p`*^Gg=~Pen zNOcu@bB&W)P^ggl@_wODNB+~i2r-t1MoaOy{@=~J(X7|&ST z667$@z}id5UO`TXii0-7U%ChFMndXE8+TWc;9`lxh zm$nV1D(;2NURs~uR+=8KD`tWWuCh2XC8TehY^u$C&(Kv}(aeQrits!N&067l?g4MC z+f_e74(DaKkdPq)T&TiFGUs-kmQmNDv_9p^!)+&o_NSc$eXJVfr*Gx2nPVS{g!Czb zp3PPC+*xk6974#+8s;NVY;KftE5%p~olSVT+R+lbL77oVGkAcgHPrj0w(~suYOCnW zg>LW+A><(@?5nlI+0$jw2-S4DE1b44GPGn+iK&W^g9(0GGL?jHipD-bCWF*Usgc7T z#phR%*zG-n^Q}+G;Zlz|i;ypS&_ESFQeBL!P}jyP5|?O~qWDI898L}c2CT8E8h@@X zYvIPxU_!6!h7&Sm!JsD`?mB)lzB!!kp4}Qa6lU3~(O$w;d1MVId|p}2zu`~UHITZl zX-8c*5I&$R?5OeMA3s-WjVDOB-mwM}4micCvt>z5S1x7|!tXfNK*Hy=4b99o=?y4j z1Agj^xSo(Ew+KvG6m$)E#;H$>-_q6hr+U9*)4xyhT3g%PJ5I7`2EL0=ywCB962vo3 zrd*!{@rl|1`FRv#c7fc_9j%_)s;S(1m~)oZvqMvtfMGF{EFlCNe~atqg>wv z@jo1|UxJt(XzpGx+doL#if*44bRO93achFk2e+#aAd8>ZpcnRfT>QKS{lFm!S^T^P zsj2F*P@rl;f+Oz@Hx338UaB4o1;(ZS3-i4*9Cf&`nBqlUK&R80@g3uYcVy=NE= zGblg7+_O~*hsuQKJ60)S#Iec<`#M%BAsq-^96vu%DHsuVM%2&<=KkL4>QDG*$0{X! 
z%CX7_pL48ILiQJo;D9|(y9GwfbwX;rWlmQa;nR*)O1Q?c$_U?ZtWrXbMi{}Nc$tnb81axZVj+!S?jSb` z%Ls=#Rw?04j#Wk||HOm*VTB;%FozMTgB+*Cv|(I8hb2dSwvKwpOw_SEZi31%?-Z4R zYQj9nDkVHeSsi_vCK*k=!}-)XCXkLBJ*Bz2@dMp8%WLnC$Yso`@WbrAExkUEI@ zkVqY=Pl?o#`bLBJQmO6+zDy0r$4oj(nm_t9jiZKXU8fdWcQ**22x22w}rIFLG0dtK^C?RKdnwcn+V z)Q*=rQhQ$NNbP5-BekQYj?|u(I#Rn@>PYQtse?WHZS7g9gP2_`brAp3$@NGOa|lQs zDMKKzTIEm9w>el^34hR?tl@+YDy#XOVY;q?)D?E>Zy+o<->OGAj(-K?Yf9avDfVrI zV;yS<;m3!SCDo-X4ZeY}n_~?m{IP4+4OZOyOm9FL8}KD(#Px(U89PJlNJ#z2sZZM( zrmOEy^?t{u?+jOVu{P3WvO~+&md(*rrt@A}_jfcEIbzn(+%G>piRts+|^b_m~^aC!gY={h>)IYX7+8)PxM)bbLhKBn(xfa zqkg1=WL6-ihh$bDri*0eA*Q=zR-nJwUa5na4VF5H*Ch}rz+W*!dK2|8G9 zG~_*u7IApDx{x06hWTY$5^LggwGn#kO%R0^9R!eNe8MmWl`N(pHgmhXSXD)C{e z4dTO8BgBWPR)`N%&1lYt3$L`6J*g>XZ^EprY!+UV5T4>#g9uMmR`U}&)xYf24Yzhd5IQ5#FpZTJznJUT|P@1=G7Dy3~{-nDQx4Zg7%igs=D{@%3nuuX~b&+kKMwdNj%1o+RNuW%29Lq(AjU z30tVY_?3IoCn#TbZVKD_Me+4$(x*7S{USMOb`w-|R*{IN@DyPWWvQn~e%VQu5%%#( z!c!!po+RNlK1q0rAO5p!n^&V@D%Auj&D6hnu8af znyn%~bXrRZA5@llisV8kSw{G4pCmj*^7o!3;bT5Yc#7m-JW0YjWzkclmwTdw&-q2+ zDbnjaQNkv_C_F`aljB=Yk$%%t@t(qu9jlb^kh0WMj&$=;*>MV6YbuiP6v>l3Ny7F% zNqCB6M^BRQJY~^Sq%ZJ92@CzA@D%A@o+x2&zbHIKx}W1)Pm$(ChcR`nij+I8rG&RC zOFc#M8&0x}aEwn9o+5dVCrMc4lZ2;8&h#V+XDN%GB0bj=C7kaUg{MgW+!H1Ijb9X= zBK>>Ex1J)+=^vhYpW57l)IX^3IHrNI@-G~c%yN1%4{y6h59FqRJr{YL!At!w-@OX8} zd7l#;xHu@_9LMz;T)evE^UQ$@E?!-7>@xTO7q2c!vFpsQg|YKK)=q4VYg1DK9mWLC z&RoG&byjR4=7K7=5Od`eTZlPm#1;-2u7zR?F;_jYg_!G`*h0*eOl%?Mk|nkf zb8Rx)cx@-w9kGR&!&mx^gO@9i*hkE@M(iWzsv`CgbNvweh`CUReZ*W5#6Ds^G-4kg z8P4%y3o&PNv4xoPw%9_<8Cq;1=2R@U5OdBITZs8kiY*c?P5#nl+}Bz{9cvnC%b|J z3H=4r_o^)TGgq*T@VAaNn2^Tc=z!3#K7IR2^~0U6{)CGiYd9fwVsRw&t4}{VpnAIU z3-)TQnz-MYI*4%i1lym@cSm}`fz1_6?~e3>Wh^*YYs40vaF{b<3f^pz@|%|4BmWgO zxW@nP9_uxH6^8JS$_fQ0Pfd{M=h`})FsdvgM`DO4m-@h^x6}1q>q=TNfRNT;#mCzb zQk$K*0|;MttWrW+Xv>lsCX4ncM{^>9q2F?0dP%+1i94e%bk-R0AH;Sq$1B zx50KPy-(9gC)<49vO2DvrFkQ2meLzF?Q@FFPyJk_{5Lh_O^=SJTgi(xeMi&xHI1KP z^XF*Vs)MCZ()1}!mudR7Gj0AAny%8cQPZ&A80w{IOw(&My;;*ynvT(Qs-`nE<;|(r 
zmFBII;T>)LyaBRaY2MbjI?v`0*7ONYE!X?Z)$0kf&b7SnYx>lAmR_dmtWKJ*X+~#D zXK8wxre|qdq3NBPo~Q3^FVysYP5FMj^Z8ca1WmVVYPsI$*sd1O)|78amtSo8FKF5( zV(C*f{j#QgH2s^V^_mXqX61)#x>eI%npPIt{7ITVtLYX^w`;mz(?goJ>~71o(X@l6 z=V*Gqrrk8{>+V2B4aJe=Vn2@gnkD8T~>9t`kMfSdEwLE6Nu%`T{I`=AlrG7uGzoz_WRa5^h(p39TA7(jWO)u0mrs;K> zmTSs?j*I_P)=zJ+<@akExY5!hG`&;Pb(+4cX`gZ{ccrG$n=GCB`5zUQ^Mt0gnl@_s zlBTa}x?9r^H9e?l@^-7|SxtxEky?|bmnr{QO*d%zo~A9nY2~ss`7u<|8CFf2V4ChNgL%Ml`)d(=Tb-Pt%yDjE@<+GHzx3m>O-T z+H;TEIZ4x*n#MJKP}6rbO)Zb_H=4hYrY$u+u_{f{FotRVjhc?sbfTs+G`&yL`I`Pz z(_d-&%_&y@cuiXzp}yOdKVH)+O&`>Bv8In|x=hozW?232YI>x2cJiqb|9%U?!QjVj zPQ+(0CF2Jic=5#iQ=L<&N>U-e!qAM%{dI#Vxm7JfY_$ zH(t{H!aGLYIex;06?cxm_!1`<7%^&eWw(NE7u_NnE63eB>dtW&jg){J_}1G-0EfiR zEO6{!Xk{Q6YP+FtXylclwnK)N4hv1Va%kDG&^=cU?H`&jWa!+v-~ayif8YPENuwu) z#*Z^OnU|TYVNd-&kkKL(YL$7Jq(7C9h4S$e`M6F#u9c7J?&E2PzA9+2d<>J1f$}j_ zKE5U&gXF^~xJJ;I<>M>z5xU1z)$eew$+}c{rgR@crh?D}BW}1w!Zkuhytj}&oum=z zFJx$8)|K`LgRhvTNxZ0XKFFZj77$8*NYH_K@h0gj)`cb-TrGI6e0U?ZIbNuZt!J!F4x@$}eJ5CZ?zRG1BMj3%k>*J|y2p*5 zjS1IFLDOL4cGH)ptT&PkY2>aowM;T~n(@EVl{N*9+5y5d%y`yNDZ9x)Hy zFv&L___`p|6w?M>S`5R>^7_EoXYBQmQI^8;solz=)+xJY6+CH+Y@kc_1{-ntOuL zXJk_hGP2*3+6)RUt+FK+2gex`PO|+p+72o^^O~n3R9|JY{uZ<)mQ@W2Ee(!X8EQ4; zDy-#KHIrhly=t!c`@zJ4Z)FBUf1NT;;CC|! zH<0yEkgSY{JviHgH6FZCL^Hl(v{oDT4AJ_Tuqrt7E0qJpqDxHKZv-ntCuFY37|^ad z^v5YPLX8=jKLfZdv^FE-30V94DenpV8>;v{35uG75jgdk}L^lPiL!&ZJ^1Jta&%M8O?#);MGt}v$iJZ@4Uira{gUMX35Wc>BX z%>3J`uN``A#ceCEt-vSAYhSocn3)$N7yN7Aymshq75HB!#E|p<+B+98DXJ@vS9cB2 zDBBY7fr?2hAfU`J!(-$%$^e2R$Xj8=&CoOSG$Zq3x|@eD9;3*vJXcuxl!srwEHOKv ztWh_hu8^24sAMM^#r2V(5#40nsQBEh#yzL*|IC~@(_=O<>zDn$z4P@{)vwO|-*ZpZ zy>+{)x+*d`Se={`?$?vouqoT~O!&f{`;$A8JNE7m_obCPlKYc$-rr9phIdgAnr>cQ zIyzaBOvxXusPy;Tyt-sB;bYlwY03MvYH#X9YHw;+6WK?4aE9zENx}XL;k5tVo6|+9 zq9(tY%D6Z@DD+K#Mr2bE9S*_Hmgt<+QHjJNMf)2!Zf*?>R~&KT!0;jhlQ_{7gk|u? zT}{b3ks?|uO`O;m%(PkFU)q#xqQEsaB|m?3|FMlUdxW=3_+Ka}T$@ZuCMAEw(?Lu! 
ze|o$(<=^gy?+S&|el(mLnmlC`4H=6jQ<^$&b4jGw-?t*<2OY%aj*~Q*i^Rc zOqH~bccue0<=w3EtKFAg3&NI`t8t%`y`7OQemzC1`xw7IO{4sW{Ob>o%=pV!H#XkdxMrT;Gm__jYsE?b=iV5ajtTew z|JF&{s$J`R&D#_n(96GP#aGonxz}>dl-uj%{mDO{_W8IU-Wm#V7r*b+Q0T)SMeh6_ z?N2xQpWVWpdvV=7Zjt{B-TV8ANYNkAz3-Y6**qu0ojv`dwin*w`u8KFOG`4;Rx&B- zdujdz_rKx3A^(n0_@dB3|Girxv7is;F67U)58k!Pf6M1_P)Ge_GrfUjUucuRQumW{ z(HqWf=_i?RKPlU5?jP7se%X!kDmu!d651N>FE}24L8ta$=)gzRr&R`h+D_`z{Dw#m z+o%2DmdF_YpF(4JxE;Re|L|UUD6}b@+sl9SE88o-OwY4x6}2)t+nh?*H6#j$d326> zQKDeT2(K=au1?f?gBRA-BnB_3jxQKo-%xjbq9UCdoT^%sD66WhiPPERcsk)_=xlP~ zu(C8|u4_ogXvs@e-H;$qRa>7)dliA`kl}@8sj8a#>O@(*JXP0F9vG#U*C*N;c#9Gh z74bzacJQezwK$VV+>j`*uB%v-qU`Ft_~6zv+{oP4*!e&e6Cc{Hu7xI7WrR3BZh1N} zgLI#iTZ!^2*9cipq;+YExCo+Qb5{x~?`E1g9dE zuJHuNDi+2YyavupHDW_Y0_)&kizNrPx-PzeHd7nMP8Srm#J|nDcCl|;Z`m@=xi&c` zO}#oce&onlVcyWZ{MeBEf=lv;7Zk>FYvN0m4KEl{NptB;ZK5cds7*9fRq%?2iiJgV z6(YBASS&Xg%bhkfmP^+ciYi8qY!^MZqwzk0FxGI45zB(Y{QR<6yl+iUG}L5h7o)wA z_at%G?g_GkK+Bsne*CD|fC-86s(7udKyJZ^ix(uyM`j8Jni_fD>A2<*x?!dlTGl;u zZnrKGes>99a_^0G!OJ^voUFvoxF ztU#l77f0w>0M}a!<28K$f5dj3Z7{>Iof?a`?k)X5RI*2-x@p3 zh}QdOb%_or{$Vs${9~%vGgPyqTs3bBeI_9PHoniQHBMAF#EI&LIL(f(%<<3ea!R&9 zm7Gf12hfK)r62{j*OA`3aB;NP8uq6#CtM!wRgAc@?+3r6?|dDM(~E6=r?T4lel`1E z`USpoeC2`BI=%}~#&&+LXKQ?~Xo>Gt6yL?2I*hM8R(S*m=!@ffZS=VB*LBJo-(rdH z2k_^A65m*GZP4@n7~iLpyYrEMZCl^7$CrG6{0n?1xegQW207k)pJy}h`NC(G z?bi6DWD~m|^6+3*nm-7G3uqdqHS(O$pnVhLeUo`}jAkeWZPv|FsjWId!Nm7E>&2FN z-|NycuV;L(t6prG_q|hE=Jky4aV^>>-@^@*W|i;Z#>sY^_w@G3_j7ll-yhw>J@iV^{_nb+4{44N%;PM>bdqsyg9HpQy zqv?0)6X@XHs-VpfI|MHGR*BAW@cNlFLn-K2X!;&~SZ8{isppA5e5IfHeDI*1(fVHC z9`02M+CK5~!N=kr5%FgvxZF!3I&;D2+cy<+0`6sPPe&9aJb>)=BJ;#^dg!zcgf5jgWVZu+RVB2i2X$!@SPaQu>G0J{=*n8 zP_jCwoS&VK#yzT5=NQ@xb1q8!`Gw-z&l#gz^|e2AZ8%6)XO-ewXN%%mXZHA39nIG( zuKj#gajpMfXxFyBlP0(7Yn^F|Yn?+gTJ5zzZ=)Zu`u%6M+H3tn#kKwf#kGD8?!mMA zMT%?v)rxEV+vC~&*{Ha-FRg5~*Zx$Yzp~r+M#Z(xor-In`>M0+zp1#c*TuN^j?2~h z_n@D&@p(&et@DoJT4(>#?EXBDe%0#t#64y<4*e9@`lA)sahQ&K$*lec#kK#>DX#V9 
zNXsJwJqnQH5OY4}_TYjj$&pt0_Z;@*Ckn!QfuD^j9d3fY28-T6%&=li~6rwnoF{rAliJ zN4MrZV0aqK4;elRu`u=d~nj(??U`PQ(X2L zMq&6R<;s5Z7mBm}BrNw(oVRZ^md{h1?RR5&km2%jsshDXr#F_zD9$?aHnv6JJmD(I zJ*z2Y&!gQb6TGx$!!yZR(XN#5|sant2K2z#Ee#J{3{dCG9P zm-a)$<^+=eC#J zEBu1e=X%wnUT-SS{#SRRf0W)ad^b95`L52{wa#b8{u$WkAl?$^*{E+HaMt1WvKjk_ zye>%ii{SH&z4QY~!{rP2t%hGmWpKL3aB1%k8ZPbkdBZ=3Kd*pGz2tk*`^H}U{GH+) z=i?Y>%bR>o_B&K8ovpDNk{>e-ZZ66lZ&h z=e3HneP0~3Yr!QBr5Hc`wXy#l>}9`}di7y3O52V7L~wa2nAlf=|9uDLK4EkY!~U?b zm)~s2n}JwAXzw_Py{U9Ge@AhaejEB98U4Euhwf-k5}$8Zb8YE-47k)QjZ+@Q9oWme zc*OpUz$wowGxqY^kfhQN+9~R_RB?9wD(I{<{B{hEZc|+AY*3ta_QHNExYTzK>MJkF z6)yXUyu*&`#rxZC^ds_uGvU&X{#});_s=l;8?j#uovz?guAJ|X7vxDi53v}fJfkD+ zYoy{_-`~Ps-Zm=bp2h`HDlzs$(Xgi)f3Ax=`P}v;Vvw<|F?g#aX8kI>Qa$4lXaAl=wUWKFQdB0zO4?UG6+^@$=o8R5+!S z;qn{4orcSA_;wrpderMt#kpPu{H2Q0lZMX*KWg|=@Mjed+As89R-E-;fc+cbQZM=K z;M>Zcb>xk0pDNBeh4@0&3H^=4`EI_>MCla8wN6jPSto=qa_4}HKUd5OWV~U9dpM79 zozXdhFM{z7?3Wn(;&Yt-3S%$7LA+hrv;Xr^|GN}t|BpcDUc;MWPXF%|*E;tr&N>6n zclHm1OB^Do?<>ll+t+bt?)}Pe=^s8cT#oPEFuvgJ#X1*oL6rI!F8#nL!=+vSrQs;f zTL3Qp2Nxtr|3+gkey&lR<51I=z45#|6lXuBANyOy+5XCYY)#`m#o1o^^WBQG{fP_N z+Vh$eXM3sFBZ{+qL4UUPykm;9y~Oi%#o7L5*nbEvafsuK_t!Y^l%&0U0{e3eKW~7u z>tpy>@Ii)GfER$7P&cnHmH+Nuv!r15LIs4xjd--ksN6Mc4--PjB zXN+@sHT(ZCbWSt;u3=99bj7vKd5W{n8N;2ujJqWcb5Y+>uoo`-Rk_mP{d2pM@oEj1 zc6Wo}vOjM&{Bev=wt-8za((Oj%AWl%9pMUoTyfnV4=c|0e*^oYinG0puU=G~?VpDI ztKj0#9>lpD8&Trz!a7g$0wwwWAzbP^)!1)v=H7C{S0N4$8NLSm0J!)A8SkgYz7Q*3 zR-9*ZM!FTp6=(lfupp&hE6%f{VgH|omw`v{Jwx`-UZVmT?<~Wg1HaJlNw|(O-0(2k z;{?N-!Dku1v&fZOZum@`SY2%RaqyLfKX94T*=YF1V;z@g@<@G;g6}onTG!yyr1DUmpl7mhCc#6-taRg zI{TT1-v}NzJT}SM*BibHe1+j}lsNkhhOeIN_zuHoUgh{6!>gw{F3(Vs`0T_9WqGEO z@SlOdV07dmtZx`TcDmC)Y4{TG5Wd%n{vPn|hW{4)8-`cTaQb5e`55L;!yl(il z;O`l}0zBx%DXVOkZ-JlI-RTMc0Q~EQkHbJP*YGXiV+{XC@X3aEo#V>8*6<8?mEr#k zUZ*%_1EC#;--mwwC^#?jp+)ZR`8Q?HUH*W%fsFT_;q>GEAjx?PiO=v-$9)W>g>M1x zX82L?zTo1IJOgczvPY8&3K*j}`*{@KUy2pi_E##-_G{gZINmJ7PbqW$e-m8%k!O@u zDtp#{*O_}u6leYW<8H-`ifj8@6=(ZF<<5S+;g>FOd<(evBQJX1Yj|_a*&jAK<0>8h 
zso@*IUow1J(%Jvg@cY5vHvBE{9Gs^SKjm3v-N1Q&;5fuEp6RVP$6?Y!SMK?WYkPUO zaLaxL`=QF7bsnp7I+ucrKk_WHiOQb!<$BgNiU&H^JDquoYkPS{p7`@R>}!-g>nvO3 zbkgAB&t(@mzTR+otIvH#$E$Ys|6q6-_yNPWf&b9(kHMcY{F)l4e+*pw{QOGCk-f&yQv}{wC~&%l&qt)7^U6UZZcqvf+1w_cZ)D@ZN?G!G-yLhHnPX2N!>K zHMnxGFkBvJoiSYQ3tVUP--Z5G!w2HJq1;y{^(_bgP6zrAC>@S}bHw?1NO6w;lhAp} z@C#Qs`{xYb11`_XWPh~&YsP-kN~iN{aEaS55x0}do^|9_+VB}JpVarxm2UY|!#`f_ zcz4CM&e@7{edjhhdwGGj_#+Qt?(9|v<0~#(5{4CYho~Ee2w#gC%VXet_`o3~Ak~J; zcxIpB%`pc)FkH?vM7TgoqA&N+USYVLZ`@|M^xIDwF8yH+oECk#{xaKexvsR(aJi56 zKEvgD@h65$yYB0^t(UZy1%}IgGG1P4c}+TAPSbRQn=X{OhD3EdFP&JH_VO0Q({V4a zJeBhD8tUAFo959EfeH$VvMv2MC6lDz3{}=qj$l4sUfz&cB1`m}r#6=HhInlRJu?Vt>g=Liu@tTB-!I$|Tqs+W% zWq%f}v6!*XK$HDj?9QbbKD9Si0Hz10x*cXEucXAyiwU|HuNOl~7t)70HXEZL4Y*b2 z+D@HnD;q?!tog_0FQB}viQSCmTpEk=4+_9EW|sN15$DJ7qIKZ0CRPXKMa&r8;(8tD~zxHG+f6m=5-+B`mYseJSr!)!GPdEIbc z<792OWnpah${=UC@=RW8O*a3Xv?`nb-IqE`X(#MIAFr(c-zLVEHcr=#clnQYu>Hi3 z2Rg_R?57#n}htC@w~M?3TE8&<|6+-)3BxfqI;NTvc+!z;yxNvc$ctR*GG2l5c{B3I=I=_(xJZP4S9X=lUxo4mNdL@YaD6UY{!PoA LKsm=f2KQst- literal 0 HcmV?d00001 diff --git a/sidh_ref/sidh.c b/sidh_ref/sidh.c new file mode 100644 index 0000000..ddcaf95 --- /dev/null +++ b/sidh_ref/sidh.c @@ -0,0 +1,345 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) +*********************************************************************************************/ + +#include "P751_internal.h" +#include "random/random.h" + +#include +static void clear_words(void *mem, digit_t nwords) +{ // Clear digits from memory. "nwords" indicates the number of digits to be zeroed. + // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. 
+ unsigned int i; + volatile digit_t *v = mem; + + for (i = 0; i < nwords; i++) + { + v[i] = 0; + } +} + +static void init_basis(digit_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) +{ // Initialization of basis points + + fpcopy(gen, XP[0]); + fpcopy(gen + NWORDS_FIELD, XP[1]); + fpcopy(gen + 2 * NWORDS_FIELD, XQ[0]); + fpzero(XQ[1]); + fpcopy(gen + 3 * NWORDS_FIELD, XR[0]); + fpcopy(gen + 4 * NWORDS_FIELD, XR[1]); +} + +static void fp2_encode(const f2elm_t x, unsigned char *enc) +{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes + unsigned int i; + f2elm_t t; + + from_fp2mont(x, t); + for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) + { + enc[i] = ((unsigned char *)t)[i]; + enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char *)t)[i + MAXBITS_FIELD / 8]; + } +} + +static void fp2_decode(const unsigned char *enc, f2elm_t x) +{ // Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation + unsigned int i; + + for (i = 0; i < 2 * (MAXBITS_FIELD / 8); i++) + ((unsigned char *)x)[i] = 0; + for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) + { + ((unsigned char *)x)[i] = enc[i]; + ((unsigned char *)x)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2]; + } + to_fp2mont(x, x); +} + +void random_mod_order_A(unsigned char *random_digits) +{ // Generation of Alice's secret key + // Outputs random value in [0, 2^eA - 1] + unsigned long long nbytes = NBITS_TO_NBYTES(OALICE_BITS); + + clear_words((void *)random_digits, MAXWORDS_ORDER); + randombytes(random_digits, nbytes); + random_digits[nbytes - 1] &= MASK_ALICE; // Masking last byte +} + +void random_mod_order_B(unsigned char *random_digits) +{ // Generation of Bob's secret key + // Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] + unsigned long long nbytes = NBITS_TO_NBYTES(OBOB_BITS - 1); + + clear_words((void *)random_digits, MAXWORDS_ORDER); + randombytes(random_digits, nbytes); + random_digits[nbytes - 1] &= MASK_BOB; // 
Masking last byte +} + +int EphemeralKeyGeneration_A(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA) +{ // Alice's ephemeral public key generation + // Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. + // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. + point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; + f2elm_t XPA, XQA, XRA, coeff[3], A24plus = {0}, C24 = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize basis points + init_basis((digit_t *)A_gen, XPA, XQA, XRA); + init_basis((digit_t *)B_gen, phiP->X, phiQ->X, phiR->X); + + fpcopy((digit_t *)&Montgomery_one, (phiP->Z)[0]); + fpcopy((digit_t *)&Montgomery_one, (phiQ->Z)[0]); + fpcopy((digit_t *)&Montgomery_one, (phiR->Z)[0]); + + // Initialize constants + fpcopy((digit_t *)&Montgomery_one, A24plus[0]); + fp2add(A24plus, A24plus, C24); + + uint64_t temp[12]; + uint64_t ifma_temp[15]; + + // Retrieve kernel point + LADDER3PT(XPA, XQA, XRA, (digit_t *)PrivateKeyA, ALICE, R, A); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Alice; row++) + { + while (index < MAX_Alice - row) + { + fp2copy(R->X, pts[npts]->X); + fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = strat_Alice[ii++]; + xDBLe(R, R, A24plus, C24, (int)(2 * m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (i = 0; i < npts; i++) + { + eval_4_isog(pts[i], coeff); + } + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + fp2copy(pts[npts - 1]->X, R->X); + fp2copy(pts[npts - 1]->Z, R->Z); + index = pts_index[npts - 1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + fp2mul_mont(phiP->X, phiP->Z, phiP->X); + fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + 
fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + fp2_encode(phiP->X, PublicKeyA); + fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); + fp2_encode(phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES); + + return 0; +} + +int EphemeralKeyGeneration_B(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB) +{ // Bob's ephemeral public key generation + // Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. + // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. + point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; + f2elm_t XPB, XQB, XRB, coeff[3], A24plus = {0}, A24minus = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize basis points + init_basis((digit_t *)B_gen, XPB, XQB, XRB); + init_basis((digit_t *)A_gen, phiP->X, phiQ->X, phiR->X); + fpcopy((digit_t *)&Montgomery_one, (phiP->Z)[0]); + fpcopy((digit_t *)&Montgomery_one, (phiQ->Z)[0]); + fpcopy((digit_t *)&Montgomery_one, (phiR->Z)[0]); + + // Initialize constants + fpcopy((digit_t *)&Montgomery_one, A24plus[0]); + fp2add(A24plus, A24plus, A24plus); + fp2copy(A24plus, A24minus); + fp2neg(A24minus); + + // Retrieve kernel point + LADDER3PT(XPB, XQB, XRB, (digit_t *)PrivateKeyB, BOB, R, A); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Bob; row++) + { + while (index < MAX_Bob - row) + { + fp2copy(R->X, pts[npts]->X); + fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = strat_Bob[ii++]; + xTPLe(R, R, A24minus, A24plus, (int)m); + index += m; + } + get_3_isog(R, A24minus, A24plus, coeff); + + for (i = 0; i < npts; i++) + { + eval_3_isog(pts[i], coeff); + } + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + fp2copy(pts[npts - 1]->X, R->X); + fp2copy(pts[npts - 1]->Z, R->Z); + index = pts_index[npts - 1]; + npts -= 1; + } + + get_3_isog(R, A24minus, A24plus, 
coeff); + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + fp2mul_mont(phiP->X, phiP->Z, phiP->X); + fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + fp2_encode(phiP->X, PublicKeyB); + fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); + fp2_encode(phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES); + + return 0; +} + +int EphemeralSecretAgreement_A(const unsigned char *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA) +{ // Alice's ephemeral shared secret computation + // It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB + // Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1]. + // Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. + // Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes. + point_proj_t R, pts[MAX_INT_POINTS_ALICE]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = {0}, C24 = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize images of Bob's basis + fp2_decode(PublicKeyB, PKB[0]); + fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, PKB[1]); + fp2_decode(PublicKeyB + 2 * FP2_ENCODED_BYTES, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? 
+ fpadd((digit_t *)&Montgomery_one, (digit_t *)&Montgomery_one, C24[0]); + fp2add(A, C24, A24plus); + fpadd(C24[0], C24[0], C24[0]); + + // Retrieve kernel point + LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t *)PrivateKeyA, ALICE, R, A); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Alice; row++) + { + while (index < MAX_Alice - row) + { + fp2copy(R->X, pts[npts]->X); + fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = strat_Alice[ii++]; + xDBLe(R, R, A24plus, C24, (int)(2 * m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (i = 0; i < npts; i++) + { + eval_4_isog(pts[i], coeff); + } + + fp2copy(pts[npts - 1]->X, R->X); + fp2copy(pts[npts - 1]->Z, R->Z); + index = pts_index[npts - 1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + fp2div2(C24, C24); + fp2sub(A24plus, C24, A24plus); + fp2div2(C24, C24); + j_inv(A24plus, C24, jinv); + fp2_encode(jinv, SharedSecretA); // Format shared secret + + return 0; +} + +int EphemeralSecretAgreement_B(const unsigned char *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB) +{ // Bob's ephemeral shared secret computation + // It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA + // Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1]. + // Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. + // Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded by removing leading 0 bytes. 
+ point_proj_t R, pts[MAX_INT_POINTS_BOB]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = {0}, A24minus = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize images of Alice's basis + fp2_decode(PublicKeyA, PKB[0]); + fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, PKB[1]); + fp2_decode(PublicKeyA + 2 * FP2_ENCODED_BYTES, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? + fpadd((digit_t *)&Montgomery_one, (digit_t *)&Montgomery_one, A24minus[0]); + fp2add(A, A24minus, A24plus); + fp2sub(A, A24minus, A24minus); + + // Retrieve kernel point + LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t *)PrivateKeyB, BOB, R, A); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Bob; row++) + { + while (index < MAX_Bob - row) + { + fp2copy(R->X, pts[npts]->X); + fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = strat_Bob[ii++]; + xTPLe(R, R, A24minus, A24plus, (int)m); + index += m; + } + get_3_isog(R, A24minus, A24plus, coeff); + + for (i = 0; i < npts; i++) + { + eval_3_isog(pts[i], coeff); + } + + fp2copy(pts[npts - 1]->X, R->X); + fp2copy(pts[npts - 1]->Z, R->Z); + index = pts_index[npts - 1]; + npts -= 1; + } + + get_3_isog(R, A24minus, A24plus, coeff); + fp2add(A24plus, A24minus, A); + fp2add(A, A, A); + fp2sub(A24plus, A24minus, A24plus); + j_inv(A, A24plus, jinv); + fp2_encode(jinv, SharedSecretB); // Format shared secret + + return 0; +} \ No newline at end of file diff --git a/sidh_ref/sike.c b/sidh_ref/sike.c new file mode 100644 index 0000000..e6d4042 --- /dev/null +++ b/sidh_ref/sike.c @@ -0,0 +1,99 @@ +/******************************************************************************************** +* Supersingular Isogeny Key Encapsulation Library +* +* Abstract: supersingular isogeny key encapsulation (SIKE) protocol +*********************************************************************************************/ + +#include 
+#include "P751_internal.h" +#include "sha3/fips202.h" + + +int crypto_kem_keypair(unsigned char *pk, unsigned char *sk) +{ // SIKE's key generation + // Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) + // public key pk (CRYPTO_PUBLICKEYBYTES bytes) + + // Generate lower portion of secret key sk <- s||SK + randombytes(sk, MSG_BYTES); + random_mod_order_B(sk + MSG_BYTES); + + // Generate public key pk + EphemeralKeyGeneration_B(sk + MSG_BYTES, pk); + + // Append public key pk to secret key sk + memcpy(&sk[MSG_BYTES + SECRETKEY_B_BYTES], pk, CRYPTO_PUBLICKEYBYTES); + + return 0; +} + + +int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk) +{ // SIKE's encapsulation + // Input: public key pk (CRYPTO_PUBLICKEYBYTES bytes) + // Outputs: shared secret ss (CRYPTO_BYTES bytes) + // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) + const uint16_t G = 0; + const uint16_t H = 1; + const uint16_t P = 2; + unsigned char ephemeralsk[SECRETKEY_A_BYTES]; + unsigned char jinvariant[FP2_ENCODED_BYTES]; + unsigned char h[MSG_BYTES]; + unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES]; + unsigned int i; + + // Generate ephemeralsk <- G(m||pk) mod oA + randombytes(temp, MSG_BYTES); + memcpy(&temp[MSG_BYTES], pk, CRYPTO_PUBLICKEYBYTES); + cshake256_simple(ephemeralsk, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES); + ephemeralsk[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; + + // Encrypt + EphemeralKeyGeneration_A(ephemeralsk, ct); + EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant); + cshake256_simple(h, MSG_BYTES, P, jinvariant, FP2_ENCODED_BYTES); + for (i = 0; i < MSG_BYTES; i++) ct[i + CRYPTO_PUBLICKEYBYTES] = temp[i] ^ h[i]; + + // Generate shared secret ss <- H(m||ct) + memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); + cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES); + + return 0; +} + + +int 
crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk) +{ // SIKE's decapsulation + // Input: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) + // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) + // Outputs: shared secret ss (CRYPTO_BYTES bytes) + const uint16_t G = 0; + const uint16_t H = 1; + const uint16_t P = 2; + unsigned char ephemeralsk_[SECRETKEY_A_BYTES]; + unsigned char jinvariant_[FP2_ENCODED_BYTES]; + unsigned char h_[MSG_BYTES]; + unsigned char c0_[CRYPTO_PUBLICKEYBYTES]; + unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES]; + unsigned int i; + + // Decrypt + EphemeralSecretAgreement_B(sk + MSG_BYTES, ct, jinvariant_); + cshake256_simple(h_, MSG_BYTES, P, jinvariant_, FP2_ENCODED_BYTES); + for (i = 0; i < MSG_BYTES; i++) temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i]; + + // Generate ephemeralsk_ <- G(m||pk) mod oA + memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES); + cshake256_simple(ephemeralsk_, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES); + ephemeralsk_[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; + + // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) + EphemeralKeyGeneration_A(ephemeralsk_, c0_); + if (memcmp(c0_, ct, CRYPTO_PUBLICKEYBYTES) != 0) { + memcpy(temp, sk, MSG_BYTES); + } + memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); + cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES); + + return 0; +} \ No newline at end of file