瀏覽代碼

works on amazon

master
Henry Case 4 年之前
當前提交
8335e684f6
共有 25 個文件被更改,包括 9106 次插入0 次删除
  1. +39
    -0
      Makefile
  2. +817
    -0
      P751_ifma.c
  3. +34
    -0
      README.md
  4. +916
    -0
      fp2_751_ifma.S
  5. +523
    -0
      fp2_packed_751_ifma.S
  6. +268
    -0
      fp_751_ifma.S
  7. +218
    -0
      main.c
  8. +52
    -0
      measurements.h
  9. +122
    -0
      sidh_ref/P751.c
  10. +255
    -0
      sidh_ref/P751_internal.h
  11. +214
    -0
      sidh_ref/SIDH.h
  12. +109
    -0
      sidh_ref/api.h
  13. +128
    -0
      sidh_ref/config.h
  14. +330
    -0
      sidh_ref/ec_isogeny.c
  15. +867
    -0
      sidh_ref/fp_x64.c
  16. +2644
    -0
      sidh_ref/fp_x64_asm.S
  17. +474
    -0
      sidh_ref/fpx.c
  18. +43
    -0
      sidh_ref/random/random.c
  19. +9
    -0
      sidh_ref/random/random.h
  20. 二進制
     
  21. +573
    -0
      sidh_ref/sha3/fips202.c
  22. +27
    -0
      sidh_ref/sha3/fips202.h
  23. 二進制
     
  24. +345
    -0
      sidh_ref/sidh.c
  25. +99
    -0
      sidh_ref/sike.c

+ 39
- 0
Makefile 查看文件

@@ -0,0 +1,39 @@
# Compiler: defaults to clang; override with CC=... in the environment or on the command line.
CC?=clang

# Kernel name reported by `uname -s`; used below to select compiler flags.
TARGET_OS=$(shell uname -s)

# Darwin builds and clang builds enable the AVX-512 F/BW/DQ/IFMA extensions;
# any other compiler is only given -mavx512f.
ifeq ($(TARGET_OS),Darwin)
CFLAGS= -std=c99 -O3 -g -D_AMD64_ -D__LINUX__ -mavx512f -mavx512bw -mavx512dq -mavx512ifma
else
ifeq ($(CC),clang)
CFLAGS= -std=c99 -O3 -g -D_AMD64_ -D__LINUX__ -mavx512f -mavx512bw -mavx512dq -mavx512ifma
else
CFLAGS= -std=c99 -O3 -g -D_AMD64_ -D__LINUX__ -mavx512f
endif
endif

# Feature macros passed to every compilation (presumably consumed by the sidh_ref sources - TODO confirm).
CFLAGS+=-D_MULX_ -D_ADX_

# SRC_REAL:    the IFMA assembly sources.
# SRC_STANDIN: generated copies of SRC_REAL named *_standin.S (see the static pattern rule below).
# OBJECTS:     SOURCES with every .c name mapped to .o; the .S entry is left unchanged by the substitution.
SRC_REAL=fp2_751_ifma.S fp_751_ifma.S fp2_packed_751_ifma.S
SRC_STANDIN=$(SRC_REAL:.S=_standin.S)
SOURCES=./sidh_ref/fp_x64_asm.S ./sidh_ref/fp_x64.c ./sidh_ref/P751.c ./sidh_ref/random/random.c ./sidh_ref/sha3/fips202.c P751_ifma.c
OBJECTS=$(SOURCES:.c=.o)
EXE_REAL=sidh_ifma
EXE_STANDIN=sidh_standin

# Default goal: generate the stand-in sources and link both executables.
all: $(SOURCES) $(SRC_STANDIN) $(SRC_REAL) $(EXE_REAL) $(EXE_STANDIN)

# Create each *_standin.S from its real counterpart by rewriting the first
# vpmadd52luq and the first vpmadd52huq mnemonic on every line to VFMADD231PD
# (the sed expressions carry no /g flag).
$(SRC_STANDIN): %_standin.S: %.S
cat $< | sed 's/vpmadd52luq/VFMADD231PD/; s/vpmadd52huq/VFMADD231PD/;' > $@

# Executable using the real IFMA sources, built with REPEAT=1 and OUTER_REPEAT=1
# (the meaning of these macros is defined in main.c, not shown here).
$(EXE_REAL): main.c ./sidh_ref/sidh.c $(OBJECTS) $(SRC_REAL)
$(CC) main.c $(OBJECTS) $(SRC_REAL) $(CFLAGS) -o $@ -DREPEAT=1 -DOUTER_REPEAT=1

# Executable using the generated stand-in sources, built with REPEAT=20 and OUTER_REPEAT=20.
$(EXE_STANDIN): main.c ./sidh_ref/sidh.c $(OBJECTS) $(SRC_STANDIN)
$(CC) main.c $(OBJECTS) $(SRC_STANDIN) $(CFLAGS) -o $@ -DREPEAT=20 -DOUTER_REPEAT=20

# NOTE(review): the target of this rule is the literal name ".o" and the recipe has no -c;
# it does not look like a working object-file rule - confirm what was intended.
.o: ./sidh_ref/sidh.c
$(CC) $(CFLAGS) $< -o $@

# Remove object files, both executables and the generated stand-in sources.
clean:
rm -f *.o ./sidh_ref/*.o $(EXE_REAL) $(EXE_STANDIN) $(SRC_STANDIN)

+ 817
- 0
P751_ifma.c 查看文件

@@ -0,0 +1,817 @@
#include <stdint.h>
#include <string.h>

// Number of 64-bit limbs in one field element (see felm_t).
#define NWORDS_FIELD 15
// Capacity of the stacks of intermediate points kept during the isogeny tree traversals.
#define MAX_INT_POINTS_ALICE 8
#define MAX_INT_POINTS_BOB 10

// Party selectors passed to LADDER3PT_ifma().
#define ALICE 0
#define BOB 1
// Number of private-key bits processed by the three-point ladder for each party.
#define OALICE_BITS 372
#define OBOB_BITS 379

// Number of rows in each party's isogeny tree; the strategy tables hold MAX_* - 1 entries.
#define MAX_Alice 186
#define MAX_Bob 239

// Field size in bits, and the size of the 12-word (768-bit) buffer used when (de)serializing.
#define NBITS_FIELD 751
#define MAXBITS_FIELD 768
// Bytes in one encoded GF(p^2) element: two field elements of ceil(NBITS_FIELD / 8) bytes each.
// The replacement list is fully parenthesized so the macro can be used safely in any
// expression (e.g. as a divisor or after a unary/binary operator of higher precedence).
#define FP2_ENCODED_BYTES (2 * ((NBITS_FIELD + 7) / 8))

// Field element: NWORDS_FIELD 64-bit limbs.
typedef uint64_t felm_t[NWORDS_FIELD];
// Quadratic-extension element: a pair of field elements (index 0 and index 1;
// fp2inv_mont treats them as the real and imaginary parts a0 + i*a1).
typedef felm_t f2elm_t[2];

typedef struct
{
f2elm_t X;
f2elm_t Z;
} point_proj; // Point representation in projective XZ Montgomery coordinates.

// One-element array of point_proj, so that a variable of this type decays to a
// pointer when passed to a function and members are reached with `->`.
typedef point_proj point_proj_t[1];

// Basis table consumed by init_basis(): five rows of NWORDS_FIELD limbs each.
// Rows 0-1 are copied into XP, row 2 into XQ[0] (XQ[1] is zeroed), rows 3-4 into XR.
// Used as the ladder basis in EphemeralKeyGeneration_A_ifma and as the points pushed
// through the isogeny in EphemeralKeyGeneration_B_ifma.
// NOTE(review): values are presumably in Montgomery form with 52-bit limbs - confirm.
const uint64_t A_gen_ifma[5 * NWORDS_FIELD] = {
0x000ceab50ad8bc0d, 0x0005e457b1c2fc08, 0x000cd6e1d7d710f5, 0x000ae8738d92953d, 0x000a7ebee8a3418a, 0x0008345f03f46fba, 0x0007cfe2616c9a28, 0x000b4be50c8b9e16, 0x00039b6799643b2e, 0x000597a7ff9d56d5, 0x00021d410d97fe0a, 0x000a4a92a8f2ad52, 0x00054508e42abde4, 0x000ebf7d0178c137, 0x00000000004a0a75,
0x000d21582e4118ad, 0x0005df400ae6cc41, 0x000aec407c2ecb7c, 0x000de8e34b521432, 0x000761e2ab085167, 0x000bcaa6094b3c50, 0x000df9ddd71032cf, 0x00057d905265605f, 0x000f7dba2681f9d7, 0x0009e9732def416c, 0x0006f77956ce00ce, 0x000576fb3094772b, 0x000b2d166e2a949f, 0x0002f665c6588ea2, 0x0000000000337a25,
0x00026279148626cd, 0x0006b5baead56fe5, 0x000ab911fad60dc9, 0x000401e137d0bf07, 0x0004d3e925216196, 0x0005e4cd09a33740, 0x00069e4af733c538, 0x000d1169f6821367, 0x000c64ecfc721111, 0x000ba56507cd0dc7, 0x000995e4ae04dfad, 0x0007b992deeceab8, 0x0007bccd256aff1e, 0x000207f5fde1824c, 0x0000000000345cc7,
0x00041dffd19b3e7f, 0x000b48c18e0bb844, 0x000380584b4dea99, 0x0000692de648ad31, 0x000d72761b6dfaee, 0x0005c672c3058de6, 0x000cba26fdc22397, 0x000e15f9133d4bc3, 0x000d5ae123793466, 0x000bb494276e321d, 0x000c9c99fb74cd99, 0x0005da6e4fd03f75, 0x000b95feb24d0937, 0x000e6a307e03cd17, 0x000000000044ad2e,
0x0007f1ec71be8c36, 0x00053859b1ed78c1, 0x000529ff824d6df7, 0x000633a10839b2a8, 0x00003e9e25fdea79, 0x000a8054df1762fc, 0x000034c6467c4708, 0x000acb63530b60ec, 0x0000c6fc8c19bf71, 0x0005aca92467c3cb, 0x000d42050ba154a2, 0x000b4d5baa4ab074, 0x00044ba4962ac622, 0x0002bbf250aa70e6, 0x0000000000457f51};

// Basis table consumed by init_basis(): five rows of NWORDS_FIELD limbs each.
// Rows 0-1 are copied into XP, row 2 into XQ[0] (XQ[1] is zeroed), rows 3-4 into XR.
// Used as the ladder basis in EphemeralKeyGeneration_B_ifma and as the points pushed
// through the isogeny in EphemeralKeyGeneration_A_ifma.
// NOTE(review): values are presumably in Montgomery form with 52-bit limbs - confirm.
const uint64_t B_gen_ifma[5 * NWORDS_FIELD] = {
0x0001ef867ab0bcb9, 0x0009a45c76cfb6d7, 0x0001f034a5fdd76e, 0x000038b1ee69194b, 0x000e7b18a7761f3f, 0x000a486a52c84cf6, 0x0005aa75466fcf01, 0x00044164f797233f, 0x000331aeaec77db1, 0x0005185f83d9a22f, 0x000e2d4dc94f5b17, 0x0000f7b3858b15a4, 0x000635ac44515c99, 0x000a5b14eaf4ee2e, 0x000000000048e907,
0x0004e7c075cc3a24, 0x00004aa430a49203, 0x00094c8677baf00b, 0x000b3aae0c9a755c, 0x000c4b064e9ebb08, 0x000dd04e826c661d, 0x00061f01b223684e, 0x000d43bc8a6360b6, 0x00008c633a79ab30, 0x0008e0092fbd6f39, 0x0002b9ba797337f8, 0x000fcb3252ddaf84, 0x000467ded2ca9dce, 0x0006117350e479f4, 0x00000000001ae9d1,
0x000ed7b96c4ab279, 0x000178486ef1a8c9, 0x000c2f4299429da5, 0x000aef4926f20cd5, 0x0003b2e2858b4716, 0x000bcc3cac3eeb68, 0x0003a600460dda2f, 0x00050e6650a24c9f, 0x0004cb60c61775f8, 0x00082b196ebc78b3, 0x000cc7fec8cce966, 0x000d9b778d801d65, 0x0005324630f74af3, 0x0009018193e7592e, 0x00000000003aef05,
0x00033769d0f314ef, 0x000e2659d11c0d67, 0x000d133f084c3086, 0x0005e23d5da27bcb, 0x0008ec9a8d586402, 0x000c781b3b645bf3, 0x000c9fb03ee6426d, 0x000ddc7bb40b83e3, 0x000bb7b4ab585e3a, 0x0006c2672e53eeaf, 0x0000397a1e62b655, 0x0004ac383daab923, 0x0008eb1ecdd2f39e, 0x000f1516da469247, 0x00000000003693cf,
0x0007d8f72bd956dc, 0x000e9934884ae37e, 0x0003c3edd2d504b3, 0x00005d14e7fa1ecb, 0x0007610ceb75d635, 0x000b4cac446b1112, 0x000c1f70caf255b4, 0x00057d3e324d2f36, 0x0006181c3bb1a700, 0x000db2f2916ccc40, 0x00021ee51d1c92f1, 0x000c07c22031c32a, 0x000e4310e5103473, 0x00069c1148de9ef5, 0x00000000004d1227};

// Field element copied into Z[0] wherever a point is initialized with Z = 1
// (LADDER3PT_ifma and both key-generation routines).
// NOTE(review): presumably the Montgomery representation of 1 in the 52-bit limb layout - confirm.
const uint64_t One[NWORDS_FIELD] = {
0x00000000249ad67c, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0001f9800c542c00, 0x000b326488fe3b2a, 0x000e6176236db777, 0x000dd6e970232b83, 0x000d4d762277573f, 0x00054cd16c015f35, 0x0009fc72438c4fc7, 0x00000000001bf8f6};

// Field-element constant; its limbs coincide with the C24 initializer in
// EphemeralKeyGeneration_A_ifma and the A24plus initializer in EphemeralKeyGeneration_B_ifma.
// It is not referenced by name in the part of the file visible here.
// NOTE(review): presumably the Montgomery representation of 2 - confirm.
const uint64_t Two[NWORDS_FIELD] = {
0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed};

// Fixed parameters for isogeny tree computation.
// Defined in another translation unit; the key-generation routines read one entry
// per stored intermediate point to decide how many doublings/triplings to apply next.
extern const unsigned int strat_Alice[MAX_Alice - 1];
extern const unsigned int strat_Bob[MAX_Bob - 1];

// Defined elsewhere. fp2_decode() calls it with a 12-word input and a felm_t output.
// NOTE(review): presumably the inverse of red2norm() (64-bit words -> 52-bit limbs) - confirm.
void norm2red(uint64_t *res, const uint64_t *a);
void red2norm(uint64_t out[12], const uint64_t in[15])
{ // Repack fifteen limbs laid out at 52-bit strides into twelve contiguous 64-bit words.
  // Limb k contributes at bit offset 52*k of the output; contributions are combined with XOR.
    uint64_t packed[12] = {0};
    unsigned int k;

    for (k = 0; k < 15; k++)
    {
        const unsigned int offset = 52 * k;
        const unsigned int word = offset / 64;
        const unsigned int shift = offset % 64;

        packed[word] ^= in[k] << shift;

        // A 52-bit limb straddles a word boundary only when shift + 52 > 64;
        // the last limb has no following word to spill into.
        if (shift > 12 && word + 1 < 12)
        {
            packed[word + 1] ^= in[k] >> (64 - shift);
        }
    }

    // Written out only at the end so the result does not depend on out/in overlapping.
    for (k = 0; k < 12; k++)
    {
        out[k] = packed[k];
    }
}

static void init_basis(const uint64_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR)
{ // Load the three basis x-coordinates from a five-row generator table.
  // Row layout: XP[0], XP[1], XQ[0], XR[0], XR[1]; XQ[1] is not stored and is set to zero.
    const size_t row_bytes = sizeof(felm_t);
    const uint64_t *row = gen;

    memcpy(XP[0], row, row_bytes);
    row += NWORDS_FIELD;
    memcpy(XP[1], row, row_bytes);
    row += NWORDS_FIELD;

    memcpy(XQ[0], row, row_bytes);
    memset(XQ[1], 0, row_bytes);
    row += NWORDS_FIELD;

    memcpy(XR[0], row, row_bytes);
    row += NWORDS_FIELD;
    memcpy(XR[1], row, row_bytes);
}

// Arithmetic primitives declared here and defined elsewhere (not visible in this file;
// presumably the *_ifma.S assembly sources listed in the Makefile - TODO confirm).
// All of them take the destination as the FIRST argument.

// GF(p^2) arithmetic: res = a*b, res = a^2, res = a+b, res = a-b.
void fp2_mul_ifma(f2elm_t res, const f2elm_t a, const f2elm_t b);
// Two GF(p^2) products in one call: res1 = a1*b1 and res2 = a2*b2.
// NOTE(review): callers in this file pass outputs that alias inputs of the other product
// (e.g. res2 == b1); this assumes all inputs are read before any output is written - confirm.
void fp2_mul_ifma_x2(f2elm_t res1, const f2elm_t a1, const f2elm_t b1, f2elm_t res2, const f2elm_t a2, const f2elm_t b2);
void fp2_sqr_ifma(f2elm_t res, const f2elm_t a);
void fp2_add(f2elm_t res, const f2elm_t a, const f2elm_t b);
void fp2_sub(f2elm_t res, const f2elm_t a, const f2elm_t b);

// Swap of two projective points controlled by `swap` (called from LADDER3PT_ifma with 0 or 1).
void fp2_swap(point_proj_t a, point_proj_t b, int swap);

// GF(p) arithmetic: res = a*b, res = a+b, res = a-b.
void fp_mul_ifma(felm_t res, felm_t a, felm_t b);
void fp_add(felm_t res, const felm_t a, const felm_t b);
void fp_sub(felm_t res, const felm_t a, const felm_t b);

// Conversion into and out of Montgomery representation (rp = converted ap).
void to_mont_ifma(felm_t rp, const felm_t ap);
void from_mont_ifma(felm_t rp, const felm_t ap);

// Redeclaration of red2norm() (defined above) using the felm_t spelling of its input.
void red2norm(uint64_t out[12], const felm_t in);

// Adapters keeping the reference library's (operands..., result) argument order:
// each macro moves the trailing result argument to the front of the underlying call.
#define fp2mul_mont(a, b, r) fp2_mul_ifma(r, a, b)
#define fp2sqr_mont(a, r) fp2_sqr_ifma(r, a)
#define fp2add(a, b, r) fp2_add(r, a, b)
#define fp2sub(a, b, r) fp2_sub(r, a, b)
// Expands to nothing.
#define fp2correction

// Field squaring is performed as a general multiplication of a by itself.
#define fpsqr_mont(a, r) fp_mul_ifma(r, a, a)
#define fpmul_mont(a, b, r) fp_mul_ifma(r, a, b)

#define fpadd(a, b, r) fp_add(r, a, b)
#define fpsub(a, b, r) fp_sub(r, a, b)

void fpinv_chain_mont(felm_t a)
{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic. The input is overwritten with the result.
  //
  // The chain is data-driven: each step squares the accumulator `squarings` times and then
  // multiplies it by one entry of a table of precomputed odd powers of a (or by a itself).
    enum { USE_INPUT = -1 }; // window value selecting the input a instead of a table entry
    static const struct
    {
        unsigned char squarings;
        signed char window;
    } chain[] = {
        {6, 20}, {6, 24}, {6, 11}, {6, 8}, {8, 2}, {6, 23}, {6, 2}, {9, 2}, {10, 15}, {8, 13},
        {8, 26}, {8, 20}, {6, 11}, {6, 10}, {6, 14}, {6, 4}, {10, 18}, {6, 1}, {7, 22}, {10, 6},
        {7, 24}, {6, 9}, {8, 18}, {6, 17}, {8, USE_INPUT}, {10, 16}, {6, 7}, {6, 0}, {7, 12}, {7, 19},
        {6, 22}, {6, 25}, {7, 2}, {6, 10}, {7, 22}, {8, 18}, {6, 4}, {6, 14}, {7, 13}, {6, 5},
        {6, 23}, {6, 21}, {6, 2}, {7, 23}, {8, 12}, {6, 9}, {6, 3}, {7, 13}, {7, 17}, {8, 26},
        {8, 5}, {8, 8}, {6, 2}, {6, 11}, {7, 20}};
    const unsigned int nsteps = (unsigned int)(sizeof(chain) / sizeof(chain[0]));
    felm_t t[27], tt;
    unsigned int step, rep, i;

    // Precomputed table: tt = a^2, then each entry is the previous one times a^2.
    // The in-place multiplications (t[3], t[9], t[21], t[25]) skip one odd power each.
    fpsqr_mont(a, tt);
    fpmul_mont(a, tt, t[0]);
    for (i = 0; i <= 2; i++)
    {
        fpmul_mont(t[i], tt, t[i + 1]);
    }
    fpmul_mont(t[3], tt, t[3]);
    for (i = 3; i <= 8; i++)
    {
        fpmul_mont(t[i], tt, t[i + 1]);
    }
    fpmul_mont(t[9], tt, t[9]);
    for (i = 9; i <= 20; i++)
    {
        fpmul_mont(t[i], tt, t[i + 1]);
    }
    fpmul_mont(t[21], tt, t[21]);
    for (i = 21; i <= 24; i++)
    {
        fpmul_mont(t[i], tt, t[i + 1]);
    }
    fpmul_mont(t[25], tt, t[25]);
    fpmul_mont(t[25], tt, t[26]);

    // Accumulator starts at a.
    memcpy(tt, a, sizeof(felm_t));

    // Irregular part of the exponent.
    for (step = 0; step < nsteps; step++)
    {
        uint64_t *factor = (chain[step].window == USE_INPUT) ? a : t[chain[step].window];

        for (i = 0; i < chain[step].squarings; i++)
        {
            fpsqr_mont(tt, tt);
        }
        fpmul_mont(factor, tt, tt);
    }

    // Regular tail: 61 repetitions of (6 squarings, multiply by t[26]).
    for (rep = 0; rep < 61; rep++)
    {
        for (i = 0; i < 6; i++)
        {
            fpsqr_mont(tt, tt);
        }
        fpmul_mont(t[26], tt, tt);
    }

    memcpy(a, tt, sizeof(felm_t));
}

void fpinv_mont(felm_t a)
{ // Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p.
  // Runs the exponentiation chain on a copy, squares the result twice, and
  // multiplies by the original input; the product overwrites a.
    felm_t acc;
    int k;

    memcpy(acc, a, sizeof(felm_t));
    fpinv_chain_mont(acc);
    for (k = 0; k < 2; k++)
    {
        fpsqr_mont(acc, acc);
    }
    fpmul_mont(a, acc, a);
}

void fp2inv_mont(f2elm_t a)
{ // GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2).
  // The input is overwritten with its inverse.
    felm_t zero = {0};
    felm_t norm, im_sq;

    fpsqr_mont(a[0], norm);        // norm  = a0^2
    fpsqr_mont(a[1], im_sq);       // im_sq = a1^2
    fpadd(norm, im_sq, norm);      // norm  = a0^2 + a1^2
    fpinv_mont(norm);              // norm  = (a0^2 + a1^2)^-1

    fp_sub(a[1], zero, a[1]);      // a1 = 0 - a1, i.e. a becomes its conjugate a0 - i*a1

    fpmul_mont(a[0], norm, a[0]);
    fpmul_mont(a[1], norm, a[1]);  // a = (a0 - i*a1) * (a0^2 + a1^2)^-1
}

void inv_3_way_ifma(f2elm_t z1, f2elm_t z2, f2elm_t z3)
{ // 3-way simultaneous inversion: one GF(p^2) inversion shared by three elements.
  // Input:  z1, z2, z3
  // Output: 1/z1, 1/z2, 1/z3 (the inputs are overwritten).
    f2elm_t z12, inv_all, inv_z12, inv_z1;

    fp2mul_mont(z1, z2, z12);          // z12     = z1*z2
    fp2mul_mont(z3, z12, inv_all);     // inv_all = z1*z2*z3
    fp2inv_mont(inv_all);              // inv_all = 1/(z1*z2*z3)
    fp2mul_mont(z3, inv_all, inv_z12); // inv_z12 = 1/(z1*z2)

    // Paired products: inv_z1 = inv_z12*z2 = 1/z1 and z2 = inv_z12*z1 = 1/z2.
    // 1/z1 goes to a temporary because z1 is still an input of the second product.
    fp2_mul_ifma_x2(inv_z1, inv_z12, z2, z2, inv_z12, z1);

    fp2mul_mont(z12, inv_all, z3);     // z3 = (z1*z2)/(z1*z2*z3) = 1/z3
    memcpy(z1, inv_z1, sizeof(f2elm_t));
}

void xDBLADD_ifma(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24)
{ // Simultaneous doubling and differential addition.
  // Input:  projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ,
  //         affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4.
  // Output: P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that x(Q+P)=XQP/ZQP.
  // Independent products are issued in pairs through fp2_mul_ifma_x2.
    f2elm_t u, v, w, qd;

    fp2add(P->X, P->Z, u);  // u = XP+ZP
    fp2sub(P->X, P->Z, v);  // v = XP-ZP

    // XP = (XP+ZP)^2, ZP = (XP-ZP)^2
    fp2_mul_ifma_x2(P->X, u, u, P->Z, v, v);

    fp2add(Q->X, Q->Z, w);  // w  = XQ+ZQ
    fp2sub(Q->X, Q->Z, qd); // qd = XQ-ZQ

    // v = (XP-ZP)*(XQ+ZQ), u = (XP+ZP)*(XQ-ZQ)
    fp2_mul_ifma_x2(v, v, w, u, u, qd);

    fp2sub(P->X, P->Z, w);  // w  = (XP+ZP)^2-(XP-ZP)^2
    fp2sub(u, v, Q->Z);     // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)

    // XP = (XP+ZP)^2*(XP-ZP)^2, XQ = A24*[(XP+ZP)^2-(XP-ZP)^2]
    fp2_mul_ifma_x2(P->X, P->X, P->Z, Q->X, A24, w);

    fp2add(Q->X, P->Z, P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2
    fp2add(u, v, Q->X);       // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)

    // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2, XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2
    fp2_mul_ifma_x2(Q->Z, Q->Z, Q->Z, Q->X, Q->X, Q->X);

    // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2]
    // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2
    fp2_mul_ifma_x2(P->Z, P->Z, w, Q->Z, Q->Z, xPQ);
}

static void LADDER3PT_ifma(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const uint64_t *m, const unsigned int AliceOrBob, point_proj_t R)
{ // Three-point ladder driven by the bits of the scalar m, least-significant bit first.
  // Input:  affine x-coordinates xP, xQ, xPQ, the scalar m as an array of 64-bit words
  //         (OALICE_BITS bits are consumed when AliceOrBob == ALICE, OBOB_BITS otherwise).
  // Output: projective point written to R.
  // The curve constant A24 is hard-coded below rather than derived from a curve coefficient.
    point_proj_t R0 = {0}, R2 = {0};
    const f2elm_t A24 = {
        {0x00000000124d6b3e, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000fcc0062a1600, 0x000d9932447f1d95, 0x000f30bb11b6dbbb, 0x000eeb74b81195c1, 0x000ea6bb113bab9f, 0x000aa668b600af9a, 0x0004fe3921c627e3, 0x00000000000dfc7b},
        {0}};
    const int nbits = (AliceOrBob == ALICE) ? OALICE_BITS : OBOB_BITS;
    int i, bit, swap, prevbit = 0;

    // Initializing points: R0 = (xQ : 1), R2 = (xPQ : 1), R = (xP : 1).
    // R0 and R2 were zero-initialized, so only Z[0] needs to be set for them.
    memcpy(R0->X, xQ, sizeof(f2elm_t));
    memcpy(R0->Z[0], One, sizeof(felm_t));

    memcpy(R2->X, xPQ, sizeof(f2elm_t));
    memcpy(R2->Z[0], One, sizeof(felm_t));

    // R is caller-provided storage, so both halves of Z are written explicitly.
    memcpy(R->X, xP, sizeof(f2elm_t));
    memcpy(R->Z[0], One, sizeof(felm_t));
    memset(R->Z[1], 0, sizeof(felm_t));

    // Main loop
    for (i = 0; i < nbits; i++)
    {
        bit = (m[i >> 6] >> (i & (64 - 1))) & 1;
        // Swap only when the current bit differs from the previous one.
        swap = bit ^ prevbit;
        prevbit = bit;
        fp2_swap(R, R2, swap);

        xDBLADD_ifma(R0, R2, R->X, A24);
        fp2_mul_ifma(R2->X, R->Z, R2->X);
    }
    // NOTE(review): no swap is applied after the loop, so if the last processed bit is 1
    // R and R2 presumably remain exchanged on return - confirm this matches the reference.
}

static void xDBL_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24)
{ // Doubling of a Montgomery point in projective coordinates (X:Z).
  // Input:  projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1, and Montgomery curve constants A+2C and 4C.
  // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2).
  // P is only read before Q is first written, so the call is usable with Q == P.
    f2elm_t dsq, ssq, gap;

    fp2sub(P->X, P->Z, dsq); // dsq = X1-Z1
    fp2add(P->X, P->Z, ssq); // ssq = X1+Z1

    // dsq = (X1-Z1)^2, ssq = (X1+Z1)^2
    fp2_mul_ifma_x2(dsq, dsq, dsq, ssq, ssq, ssq);

    fp2sub(ssq, dsq, gap);   // gap = (X1+Z1)^2-(X1-Z1)^2

    // Z2 = C24*(X1-Z1)^2, dsq = A24plus*[(X1+Z1)^2-(X1-Z1)^2]
    fp2_mul_ifma_x2(Q->Z, dsq, C24, dsq, gap, A24plus);

    fp2add(Q->Z, dsq, dsq);  // dsq = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2

    // X2 = C24*(X1-Z1)^2*(X1+Z1)^2
    // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2]
    fp2_mul_ifma_x2(Q->X, Q->Z, ssq, Q->Z, gap, dsq);
}

static void xDBLe_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e)
{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
  // Input:  projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP, and Montgomery curve constants A+2C and 4C.
  // Output: projective Montgomery x-coordinates Q <- (2^e)*P.
  // P and Q may be the same object (the key-generation code calls this in place).
    int i;

    // memcpy on overlapping objects is undefined, so skip the copy for in-place calls.
    if (Q != P)
    {
        memcpy(Q, P, sizeof(point_proj));
    }

    for (i = 0; i < e; i++)
    {
        xDBL_ifma(Q, Q, A24plus, C24);
    }
}

static void xTPL_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus)
{ // Tripling of a Montgomery point in projective coordinates (X:Z).
// Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
// Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3).
// Independent products are issued in pairs through fp2_mul_ifma_x2 (res1 = a1*b1, res2 = a2*b2).
// P is only read before Q is written, so the call is usable with Q == P (as xTPLe_ifma does).
f2elm_t t0, t1, t2, t3, t4, t5, t6, t7, t8;

fp2sub(P->X, P->Z, t0); // t0 = X-Z
fp2add(P->X, P->Z, t1); // t1 = X+Z
fp2_mul_ifma_x2(t2, t0, t0, t3, t1, t1);
// t2 = (X-Z)^2
// t3 = (X+Z)^2
fp2_mul_ifma_x2(t5, A24plus, t3, t6, A24minus, t2);
// t5 = A24plus*(X+Z)^2
// t6 = A24minus*(X-Z)^2
fp2_mul_ifma_x2(t7, t3, t5, t8, t2, t6);
// t7 = A24plus*(X+Z)^4
// t8 = A24minus*(X-Z)^4
fp2add(t0, t1, t4); // t4 = 2*X
fp2sub(t1, t0, t0); // t0 = 2*Z
fp2sqr_mont(t4, t1); // t1 = 4*X^2
fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2
fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2
fp2sub(t8, t7, t7); // t7 = t8 - t7 (difference of the two products above)
fp2sub(t5, t6, t8); // t8 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
fp2mul_mont(t1, t8, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
fp2add(t7, t1, t8); // t8 = t7 + t1
fp2sub(t7, t1, t1); // t1 = t7 - t1
fp2_mul_ifma_x2(t8, t8, t8, t1, t1, t1);
// t8 = t8^2
// t1 = t1^2
fp2_mul_ifma_x2(Q->X, t4, t8, Q->Z, t1, t0);
// X3 = 2*X*t8
// Z3 = 2*Z*t1
}

void xTPLe_ifma(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e)
{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
  // Input:  projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP, and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
  // Output: projective Montgomery x-coordinates Q <- (3^e)*P.
  // P and Q may be the same object (the key-generation code calls this in place).
    int i;

    // memcpy on overlapping objects is undefined, so skip the copy for in-place calls.
    if (Q != P)
    {
        memcpy(Q, P, sizeof(point_proj));
    }

    for (i = 0; i < e; i++)
    {
        xTPL_ifma(Q, Q, A24minus, A24plus);
    }
}

static void get_4_isog_ifma(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff)
{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
// Input: projective point of order four P = (X4:Z4).
// Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients
// that are used to evaluate the isogeny at a point in eval_4_isog_ifma().
// A24plus, C24 and coeff[0..2] are pure outputs: all are overwritten, none is read before being written.

fp2sub(P->X, P->Z, coeff[1]); // coeff[1] = X4-Z4
fp2add(P->X, P->Z, coeff[2]); // coeff[2] = X4+Z4

fp2_mul_ifma_x2(coeff[0], P->Z, P->Z, A24plus, P->X, P->X);
// coeff[0] = Z4^2
// A24plus = X4^2

fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 2*Z4^2
fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2

fp2_mul_ifma_x2(C24, coeff[0], coeff[0], A24plus, A24plus, A24plus);
// C24 = 4*Z4^4
// A24plus = 4*X4^4

// Doubled only after C24 has been computed from the 2*Z4^2 value.
fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 4*Z4^2
}

static void eval_4_isog_ifma(point_proj_t P, f2elm_t *coeff)
{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
  // by the 3 coefficients in coeff (computed in the function get_4_isog_ifma()).
  // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
  // Output: the projective point P = phi(P) = (X:Z) in the codomain (P is updated in place).
    f2elm_t u, v;

    fp2add(P->X, P->Z, u); // u = X+Z
    fp2sub(P->X, P->Z, v); // v = X-Z

    // X = (X+Z)*coeff[1], u = (X+Z)*(X-Z)
    fp2_mul_ifma_x2(P->X, u, coeff[1], u, u, v);

    // Z = (X-Z)*coeff[2], u = coeff[0]*(X+Z)*(X-Z)
    fp2_mul_ifma_x2(P->Z, coeff[2], v, u, coeff[0], u);

    fp2add(P->X, P->Z, v);    // v = (X-Z)*coeff[2] + (X+Z)*coeff[1]
    fp2sub(P->X, P->Z, P->Z); // Z = (X+Z)*coeff[1] - (X-Z)*coeff[2]

    // v = v^2, Z = Z^2 (the sign of the difference above is irrelevant after squaring)
    fp2_mul_ifma_x2(v, v, v, P->Z, P->Z, P->Z);

    fp2add(v, u, P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
    fp2sub(P->Z, u, u); // u = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z)

    // Xfinal = X*v, Zfinal = Z*u
    fp2_mul_ifma_x2(P->X, P->X, v, P->Z, P->Z, u);
}

static void get_3_isog_ifma(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff)
{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
// Input: projective point of order three P = (X3:Z3).
// Output: the 3-isogenous Montgomery curve with projective coefficient A/C.
// Also writes coeff[0] = X-Z and coeff[1] = X+Z, which eval_3_isog_ifma() consumes.
// A24minus and A24plus are pure outputs here (overwritten, never read as inputs).
f2elm_t t0, t1, t2, t3, t4, t5;

fp2sub(P->X, P->Z, coeff[0]); // coeff0 = X-Z
fp2add(P->X, P->Z, coeff[1]); // coeff1 = X+Z
fp2_mul_ifma_x2(t0, coeff[0], coeff[0], t1, coeff[1], coeff[1]);
// t0 = (X-Z)^2
// t1 = (X+Z)^2
fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2
fp2add(coeff[0], coeff[1], t3); // t3 = 2*X
fp2sqr_mont(t3, t3); // t3 = 4*X^2
fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2
fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2
fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2
fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2
fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2)
fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2
fp2add(t1, t2, t5); // t5 = 4*X^2 + (X+Z)^2 - (X-Z)^2
fp2add(t5, t5, t5); // t5 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2)
fp2add(t0, t5, t5); // t5 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2
fp2_mul_ifma_x2(A24minus, t2, t4, t5, t5, t3);
// A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
// t5 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
fp2sub(t5, A24minus, t0); // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
fp2add(A24minus, t0, A24plus); // A24plus = A24minus + t0, i.e. equal in value to t5
}

static void eval_3_isog_ifma(point_proj_t Q, const f2elm_t *coeff)
{ // Evaluates a 3-isogeny at the projective point Q = (X:Z), using the 2 coefficients
  // in coeff (computed in the function get_3_isog_ifma()).
  // Inputs: coeff[0], coeff[1] and the projective point Q = (X:Z).
  // Output: Q <- phi(Q), updated in place.
    f2elm_t a, b, s;

    fp2add(Q->X, Q->Z, a); // a = X+Z
    fp2sub(Q->X, Q->Z, b); // b = X-Z

    // a = coeff0*(X+Z), b = coeff1*(X-Z)
    fp2_mul_ifma_x2(a, a, coeff[0], b, b, coeff[1]);

    fp2add(a, b, s); // s = coeff0*(X+Z) + coeff1*(X-Z)
    fp2sub(b, a, a); // a = coeff1*(X-Z) - coeff0*(X+Z)

    // s = s^2, a = a^2
    fp2_mul_ifma_x2(s, s, s, a, a, a);

    // Xfinal = X*s, Zfinal = Z*a
    fp2_mul_ifma_x2(Q->X, Q->X, s, Q->Z, Q->Z, a);
}

static void fp2_encode(const f2elm_t x, unsigned char *enc)
{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes.
  // Writes FP2_ENCODED_BYTES bytes to enc: the first half holds x[0], the second half x[1].
  // The bytes are taken from the in-memory image of 64-bit words, so the layout follows host byte order.
    const size_t half = FP2_ENCODED_BYTES / 2;
    f2elm_t std;
    uint64_t words[12 * 2];
    const unsigned char *raw = (const unsigned char *)words;

    from_mont_ifma(std[0], x[0]);
    from_mont_ifma(std[1], x[1]);

    // Each component is repacked into its own 12-word (MAXBITS_FIELD-bit) slot.
    red2norm(words, std[0]);
    red2norm(&words[12], std[1]);

    // Keep only the low `half` bytes of each slot.
    memcpy(enc, raw, half);
    memcpy(enc + half, raw + MAXBITS_FIELD / 8, half);
}

static void fp2_decode(const unsigned char *enc, f2elm_t x)
{ // Parses a byte-encoded GF(p^2) element and converts it to Montgomery representation.
  // Input:  enc, FP2_ENCODED_BYTES bytes laid out as written by fp2_encode()
  //         (first half for x[0], second half for x[1]).
  // Output: x, the decoded element.
    const size_t half = FP2_ENCODED_BYTES / 2;
    // Zero-initialized: only `half` bytes of each 12-word slot come from enc, and the
    // remaining high bytes must not reach norm2red() as indeterminate values.
    uint64_t t[12 * 2] = {0};
    unsigned char *raw = (unsigned char *)t;

    memset(x, 0, sizeof(f2elm_t));

    memcpy(raw, enc, half);
    memcpy(raw + MAXBITS_FIELD / 8, enc + half, half);

    norm2red(x[0], t);
    norm2red(x[1], &t[12]);
    to_mont_ifma(x[0], x[0]);
    to_mont_ifma(x[1], x[1]);
}

int EphemeralKeyGeneration_A_ifma(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA)
{ // Alice's ephemeral public key generation
// Input: a private key PrivateKeyA in the range [0, 2^eA - 1].
// Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes.
point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE];
f2elm_t XPA, XQA, XRA, coeff[3];
unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0;

f2elm_t C24 = {
{0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed},
{0}};

f2elm_t A24plus = {
{0x00000000249ad67c, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0001f9800c542c00, 0x000b326488fe3b2a, 0x000e6176236db777, 0x000dd6e970232b83, 0x000d4d762277573f, 0x00054cd16c015f35, 0x0009fc72438c4fc7, 0x00000000001bf8f6},
{0}};

// Initialize basis points
init_basis(A_gen_ifma, XPA, XQA, XRA);
init_basis(B_gen_ifma, phiP->X, phiQ->X, phiR->X);
memcpy(phiP->Z, One, sizeof(felm_t));
memcpy(phiQ->Z, One, sizeof(felm_t));
memcpy(phiR->Z, One, sizeof(felm_t));

// Retrieve kernel point
LADDER3PT_ifma(XPA, XQA, XRA, (uint64_t *)PrivateKeyA, ALICE, R);

// Traverse tree
index = 0;
for (row = 1; row < MAX_Alice; row++)
{
while (index < MAX_Alice - row)
{
memcpy(pts[npts]->X, R->X, sizeof(f2elm_t));
memcpy(pts[npts]->Z, R->Z, sizeof(f2elm_t));
pts_index[npts++] = index;
m = strat_Alice[ii++];
xDBLe_ifma(R, R, A24plus, C24, (int)(2 * m));
index += m;
}
get_4_isog_ifma(R, A24plus, C24, coeff);

for (i = 0; i < npts; i++)
{
eval_4_isog_ifma(pts[i], coeff);
}
eval_4_isog_ifma(phiP, coeff);
eval_4_isog_ifma(phiQ, coeff);
eval_4_isog_ifma(phiR, coeff);

memcpy(R->X, pts[npts - 1]->X, sizeof(f2elm_t));
memcpy(R->Z, pts[npts - 1]->Z, sizeof(f2elm_t));
index = pts_index[npts - 1];
npts -= 1;
}

get_4_isog_ifma(R, A24plus, C24, coeff);
eval_4_isog_ifma(phiP, coeff);
eval_4_isog_ifma(phiQ, coeff);
eval_4_isog_ifma(phiR, coeff);

inv_3_way_ifma(phiP->Z, phiQ->Z, phiR->Z);
fp2_mul_ifma_x2(phiP->X, phiP->X, phiP->Z, phiQ->X, phiQ->X, phiQ->Z);
//fp2mul_mont(phiP->X, phiP->Z, phiP->X);
//fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X);
fp2mul_mont(phiR->X, phiR->Z, phiR->X);

// Format public key
fp2_encode(phiP->X, PublicKeyA);
fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES);
fp2_encode(phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES);

return 0;
}

int EphemeralKeyGeneration_B_ifma(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB)
{ // Bob's ephemeral public key generation
    // Input:  a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1].
    // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes.
    //         PublicKeyB must have room for 3 * FP2_ENCODED_BYTES bytes.
    // Returns 0 unconditionally.
    point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB];
    f2elm_t XPB, XQB, XRB, coeff[3];
    unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0;

    // Curve constants handed to the tripling and 3-isogeny routines. Each has
    // fifteen limbs in the first component and a zero second component.
    // NOTE(review): presumably the starting curve's A24+/A24- values in the
    // 52-bit-limb IFMA representation - confirm against the generator script.
    f2elm_t A24plus = {{0x000000004935acf8, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0003f30018a85800, 0x000664c911fc7654, 0x000cc2ec46db6eef, 0x000badd2e0465707, 0x000a9aec44eeae7f, 0x000a99a2d802be6b, 0x0003f8e487189f8e, 0x000000000037f1ed},
                       {0}};

    f2elm_t A24minus = {{0x000fffffb6ca5307, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x0000ac8771e692ff, 0x000167add1f02031, 0x000aaabd12d63250, 0x000ca0c5879094e0, 0x0000b5598636c600, 0x0004fe180463c6f7, 0x0000268d39c8897b, 0x000000000037f3e8},
                        {0}};

    // Initialize basis points
    init_basis(B_gen_ifma, XPB, XQB, XRB);
    init_basis(A_gen_ifma, phiP->X, phiQ->X, phiR->X);
    // Only sizeof(felm_t) bytes of each Z are written; the rest of Z keeps the
    // zero from the "= {0}" initialisers above.
    memcpy(phiP->Z, One, sizeof(felm_t));
    memcpy(phiQ->Z, One, sizeof(felm_t));
    memcpy(phiR->Z, One, sizeof(felm_t));

    // Retrieve kernel point
    LADDER3PT_ifma(XPB, XQB, XRB, (uint64_t *)PrivateKeyB, BOB, R);

    // Traverse tree
    for (row = 1; row < MAX_Bob; row++)
    {
        // Walk down: remember R on the stack of pending points, then triple it
        // m times as dictated by the precomputed strategy strat_Bob.
        while (index < MAX_Bob - row)
        {
            memcpy(pts[npts]->X, R->X, sizeof(f2elm_t));
            memcpy(pts[npts]->Z, R->Z, sizeof(f2elm_t));
            pts_index[npts++] = index;
            m = strat_Bob[ii++];
            xTPLe_ifma(R, R, A24minus, A24plus, (int)m);
            index += m;
        }
        get_3_isog_ifma(R, A24minus, A24plus, coeff);

        // Push every pending point and the three basis images through the isogeny.
        for (i = 0; i < npts; i++)
        {
            eval_3_isog_ifma(pts[i], coeff);
        }
        eval_3_isog_ifma(phiP, coeff);
        eval_3_isog_ifma(phiQ, coeff);
        eval_3_isog_ifma(phiR, coeff);

        // Pop the most recently stored point; it becomes the next kernel candidate.
        memcpy(R->X, pts[npts - 1]->X, sizeof(f2elm_t));
        memcpy(R->Z, pts[npts - 1]->Z, sizeof(f2elm_t));

        index = pts_index[npts - 1];
        npts -= 1;
    }

    // Last isogeny step, applied to the basis images only.
    get_3_isog_ifma(R, A24minus, A24plus, coeff);
    eval_3_isog_ifma(phiP, coeff);
    eval_3_isog_ifma(phiQ, coeff);
    eval_3_isog_ifma(phiR, coeff);

    // One shared inversion of the three Z coordinates, then X <- X * Z^-1.
    inv_3_way_ifma(phiP->Z, phiQ->Z, phiR->Z);
    fp2mul_mont(phiP->X, phiP->Z, phiP->X);
    fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X);
    fp2mul_mont(phiR->X, phiR->Z, phiR->X);

    // Format public key
    fp2_encode(phiP->X, PublicKeyB);
    fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES);
    fp2_encode(phiR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES);

    return 0;
}

+ 34
- 0
README.md 查看文件

@@ -0,0 +1,34 @@
## PQ SIDH/SIKE implementation using AVX512IFMA instructions

Using the AVX512IFMA instructions (vpmadd52luq and vpmadd52huq), which were
specifically designed for prime field arithmetic, allows a projected speedup of
up to 4X on supporting processors, when those become available.

### Current status

* Tested for correctness with Intel SDE
* EphemeralKeyGeneration_A and EphemeralKeyGeneration_B with P751 are implemented
* Using "standins": 3X performance gain on Xeon Gold (with two FMA units)
* Optimizations are 3-fold
* Finite field *𝔽~p~* multiplication by performing a single horizontal Montgomery multiplication
* Quadratic finite field *𝔽~p²~* multiplication and squaring by performing 4 and 3 horizontal Montgomery multiplications in parallel, respectively
* A pair of quadratic finite field *𝔽~p²~* multiplications (where applicable) by performing 8 vertical Montgomery multiplications in parallel
* AVX512 add/sub are also implemented

### How to test?

The Makefile generates two executables: sidh_ifma can be run with Intel SDE to
check for correctness. sidh_standin produces incorrect results, because it
replaces the IFMA instructions with FMA instructions, but it can be executed on a
machine with AVX512 support to estimate performance.

### TODO

* EphemeralSecretAgreement_A and EphemeralSecretAgreement_B
* SIKE
* P503
* Using vertical representation throughout for greater speedups

### License

Available under the original [SIKE](https://github.com/Microsoft/PQCrypto-SIKE) license

+ 916
- 0
fp2_751_ifma.S 查看文件

@@ -0,0 +1,916 @@

// C_ABI(x) expands to the assembler-level name of the C symbol x.
// HIDDEN expands to the platform's directive for hiding a symbol; it is not
// used in the part of this file shown here.
#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
#define C_ABI(x) x
#define HIDDEN .hidden
#endif

.p2align 6
// NOTE(review): .LpermMask0 and .LshiftMask0 are not referenced by any routine
// in the visible part of this file - confirm before removing.
.LpermMask0:
.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25
.LshiftMask0:
.quad 0,4,8,12,0,4,8,12
// Low 52 bits set: the per-limb mask that the routines broadcast into AND_MASK.
.LandMask:
.quad 0xfffffffffffff

.p2align 6
// The modulus as fifteen 52-bit limbs, lowest limb first, padded with one zero
// quadword so that it can be fetched with two aligned 64-byte loads.
.Lpoly:
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0

// Every limb of .Lpoly shifted left by 8 bits (256 * modulus, limb by limb).
// The routines add it to the minuend before a limb-wise vpsubq, presumably so
// that no limb can go negative.
.LpolyX:
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00
.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000
.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0

// Pointer arguments: result, first operand, second operand.
#define felemR %rdi
#define felemA %rsi
#define felemB %rdx

// Loop counter of the multiplication/squaring loops.
#define itr %r10

// M0/M1: the two halves of the modulus (.Lpoly). ZERO: all-zero vector, used
// as shift-in value and as an all-zero index vector for vpermq.
// AND_MASK: 2^52 - 1 in every lane.
#define M0 %zmm0
#define M1 %zmm1
#define ZERO %zmm2
#define AND_MASK %zmm3

// First component of operand A (a = limbs 0..7, b = limbs 8..14).
#define A0a %zmm4
#define A0b %zmm5

// Second component of operand A.
#define A1a %zmm6
#define A1b %zmm7

// Four accumulators, each split into a low (a) and a high (b) half.
#define ACC0a %zmm8
#define ACC0b %zmm9
#define ACC1a %zmm10
#define ACC1b %zmm11
#define ACC2a %zmm12
#define ACC2b %zmm13
#define ACC3a %zmm14
#define ACC3b %zmm15

// Broadcast limb of each component of operand B for the current and the
// previous iteration.
#define B0curr %zmm16
#define B0prev %zmm17
#define B1curr %zmm18
#define B1prev %zmm19

// Broadcast lowest accumulator limb (the per-step reduction multiplier) of
// each accumulator for the current and the previous iteration.
#define Y0curr %zmm20
#define Y0prev %zmm21
#define Y1curr %zmm22
#define Y1prev %zmm23
#define Y2curr %zmm24
#define Y2prev %zmm25
#define Y3curr %zmm26
#define Y3prev %zmm27

// Per-accumulator carry out of limb 0 (only lane 0 is ever non-zero).
#define T0 %zmm28
#define T1 %zmm29
#define T2 %zmm30
#define T3 %zmm31

###############################################################################
// fp2_mul_ifma(felemR = %rdi, felemA = %rsi, felemB = %rdx)
//
// Multiplies two two-component elements. Each operand is 2 x 15 quadwords:
// component 0 at byte offset 0, component 1 at byte offset 15*8. Every
// quadword holds one 52-bit limb, lowest limb first.
//
// Four products are accumulated side by side, one limb of B per step
// (1 peeled step + 14 loop iterations = 15 steps):
//   ACC0 = A0*B0, ACC1 = A1*B1, ACC2 = A1*B0, ACC3 = A0*B1
// Each step also adds (lowest accumulator limb) * modulus and drops the lowest
// limb, i.e. a word-by-word Montgomery reduction with an implied factor of
// 2^-52 per step. Using the lowest limb directly as the multiplier relies on
// the lowest limb of .Lpoly being 2^52 - 1.
// The result is C0 = ACC0 - ACC1 (after adding .LpolyX) and C1 = ACC2 + ACC3,
// carry-normalised to 52-bit limbs and stored in the same layout at felemR.
// NOTE(review): no final conditional subtraction of the modulus is performed
// here - confirm the range callers expect.
// Clobbers: %rax, %rcx, %rdx (advanced), %r8, %r9, %r10, %k1-%k5, %zmm0-%zmm31.
.globl C_ABI(fp2_mul_ifma)
.p2align 6
C_ABI(fp2_mul_ifma):

// k1 selects lane 0 only (carry out of limb 0);
// k5 selects lanes 0..6 (limbs 8..14 of a 15-limb component).
mov $1, %eax
kmovw %eax, %k1
mov $0x7f, %eax
kmovw %eax, %k5

vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO

// Load both components of A; the masked loads zero lane 7 of the high halves.
vmovdqu64 64*0(felemA), A0a
vmovdqu64 64*1(felemA), A0b{%k5}{z}
vmovdqu64 15*8 + 64*0(felemA), A1a
vmovdqu64 15*8 + 64*1(felemA), A1b{%k5}{z}

# Load the modulus
vmovdqa64 64*0 + .Lpoly(%rip), M0
vmovdqa64 64*1 + .Lpoly(%rip), M1

# Prepare the accumulators
vpxorq ACC0a, ACC0a, ACC0a
vpxorq ACC0b, ACC0b, ACC0b
vpxorq ACC1a, ACC1a, ACC1a
vpxorq ACC1b, ACC1b, ACC1b
vpxorq ACC2a, ACC2a, ACC2a
vpxorq ACC2b, ACC2b, ACC2b
vpxorq ACC3a, ACC3a, ACC3a
vpxorq ACC3b, ACC3b, ACC3b
vpxorq T0, T0, T0
vpxorq T1, T1, T1
vpxorq T2, T2, T2
vpxorq T3, T3, T3

# First iteration
// Broadcast limb 0 of both components of B and advance felemB by one limb.
vpbroadcastq (felemB), B0curr
vpbroadcastq 15*8(felemB), B1curr
lea 8(felemB), felemB

// Low 52 bits of every A-limb * B-limb product.
vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B0curr, A0b, ACC0b
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B1curr, A1b, ACC1b
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A1b, ACC2b
vpmadd52luq B1curr, A0a, ACC3a
vpmadd52luq B1curr, A0b, ACC3b

// With an all-zero index vector, vpermq broadcasts lane 0 of the accumulator.
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr
vpermq ACC3a, ZERO, Y3curr

// Add the low halves of Y * modulus.
vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b
vpmadd52luq Y3curr, M0, ACC3a
vpmadd52luq Y3curr, M1, ACC3b

// Keep what is left of limb 0 above bit 52; lane 0 only, other lanes zeroed.
vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}
vpsrlq $52, ACC3a, T3{%k1}{z}

// 14 more limbs of B remain.
mov $14, itr

1:
# Shift the ACC in zmms right by a word
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b
valignq $1, ACC3a, ACC3b, ACC3a
valignq $1, ACC3b, ZERO, ACC3b

// The high product halves of the previous step are added after the shift, so
// the previous multipliers have to be kept.
vmovdqa64 B0curr, B0prev
vmovdqa64 B1curr, B1prev
vmovdqa64 Y0curr, Y0prev
vmovdqa64 Y1curr, Y1prev
vmovdqa64 Y2curr, Y2prev
vmovdqa64 Y3curr, Y3prev

vpbroadcastq (felemB), B0curr
vpbroadcastq 15*8(felemB), B1curr
lea 8(felemB), felemB

# High multiplications
vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0
vpmadd52huq B0prev, A0b, ACC0b
vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1
vpmadd52huq B1prev, A1b, ACC1b
vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0
vpmadd52huq B0prev, A1b, ACC2b
vpmadd52huq B1prev, A0a, ACC3a # ACC3 = A0 * B1
vpmadd52huq B1prev, A0b, ACC3b

vpmadd52huq Y0prev, M0, ACC0a
vpmadd52huq Y0prev, M1, ACC0b
vpmadd52huq Y1prev, M0, ACC1a
vpmadd52huq Y1prev, M1, ACC1b
vpmadd52huq Y2prev, M0, ACC2a
vpmadd52huq Y2prev, M1, ACC2b
vpmadd52huq Y3prev, M0, ACC3a
vpmadd52huq Y3prev, M1, ACC3b
# Low multiplications
vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B0curr, A0b, ACC0b
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B1curr, A1b, ACC1b
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A1b, ACC2b
vpmadd52luq B1curr, A0a, ACC3a
vpmadd52luq B1curr, A0b, ACC3b

// Fold in the carry saved from the limb that was just shifted out, then pick
// the new reduction multiplier from lane 0.
vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a
vpaddq T3, ACC3a, ACC3a
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr
vpermq ACC3a, ZERO, Y3curr

vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b
vpmadd52luq Y3curr, M0, ACC3a
vpmadd52luq Y3curr, M1, ACC3b

vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}
vpsrlq $52, ACC3a, T3{%k1}{z}

dec itr
jne 1b
// Final shift and carry for the last step.
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b
valignq $1, ACC3a, ACC3b, ACC3a
valignq $1, ACC3b, ZERO, ACC3b
vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a
vpaddq T3, ACC3a, ACC3a

# The last high multiplications
vpmadd52huq B0curr, A0a, ACC0a
vpmadd52huq B0curr, A0b, ACC0b
vpmadd52huq B1curr, A1a, ACC1a
vpmadd52huq B1curr, A1b, ACC1b
vpmadd52huq B0curr, A1a, ACC2a
vpmadd52huq B0curr, A1b, ACC2b
vpmadd52huq B1curr, A0a, ACC3a
vpmadd52huq B1curr, A0b, ACC3b

vpmadd52huq Y0curr, M0, ACC0a
vpmadd52huq Y0curr, M1, ACC0b
vpmadd52huq Y1curr, M0, ACC1a
vpmadd52huq Y1curr, M1, ACC1b
vpmadd52huq Y2curr, M0, ACC2a
vpmadd52huq Y2curr, M1, ACC2b
vpmadd52huq Y3curr, M0, ACC3a
vpmadd52huq Y3curr, M1, ACC3b

# C0 = A0*B0 - A1*B1
# C1 = A0*B1 + A1*B0
// .LpolyX (256 * modulus, limb by limb) is added to ACC0 before ACC1 is
// subtracted from it.
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b

vpaddq ACC3a, ACC2a, ACC2a
vpaddq ACC3b, ACC2b, ACC2b

vpsubq ACC1a, ACC0a, ACC0a
vpsubq ACC1b, ACC0b, ACC0b
# Now 'normalize' the acc to 52 bit words
// A0*/A1* are reused as scratch: they receive the bits above bit 52 of each limb.
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b

vpsrlq $52, ACC2a, A1a
vpsrlq $52, ACC2b, A1b

vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b

// Move every carry up by one limb (across the a/b boundary too); zero enters
// at limb 0.
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a
valignq $7, A1a, A1b, A1b
valignq $7, ZERO, A1a, A1a

vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b
vpaddq A1a, ACC2a, ACC2a
vpaddq A1b, ACC2b, ACC2b

// Second-level carry for C0, done on the mask bits in scalar registers:
//   k1/k2 = lanes where the sum is unsigned-less-than the carry just added,
//   k3/k4 = lanes equal to 2^52 - 1.
// add/adc shifts the 16-bit value k2:k1 left by one, the second add/adc adds
// k4:k3, and the xor with k4:k3 leaves the lanes that receive a carry.
// NOTE(review): predicate 1 is an unsigned 64-bit "less than"; confirm it is
// the intended carry-out test for limbs that were masked to 52 bits.
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4

kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d

add %al, %al
adc %cl, %cl

add %r8b, %al
adc %r9b, %cl

xor %r8b, %al
xor %r9b, %cl

kmovb %eax, %k1
kmovb %ecx, %k2

// Subtracting 2^52 - 1 and masking to 52 bits equals adding 1 modulo 2^52.
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b

// Same second-level carry for C1.
vpcmpuq $1, A1a, ACC2a, %k1
vpcmpuq $1, A1b, ACC2b, %k2
vpcmpuq $0, AND_MASK, ACC2a, %k3
vpcmpuq $0, AND_MASK, ACC2b, %k4

kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d

add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2

vpsubq AND_MASK, ACC2a, ACC2a{%k1}
vpsubq AND_MASK, ACC2b, ACC2b{%k2}
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b

// k1 is reloaded here but the stores below use k5, which still holds 0x7f.
mov $0x7f, %eax
kmovw %eax, %k1

// Store C0 then C1; the masked stores write only limbs 8..14.
vmovdqu64 ACC0a, 64*0(felemR)
vmovdqu64 ACC0b, 64*1(felemR){%k5}
vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k5}
ret

###############################################################################

// Scratch accumulators for fp2_sqr_ifma. They reuse registers that the
// three-product squaring does not otherwise need.
#define ST0 ACC3a
#define ST1 ACC3b
#define ST2 Y3curr

// fp2_sqr_ifma(felemR = %rdi, felemA = %rsi)
//
// Squares a two-component element (2 x 15 quadwords, one 52-bit limb each,
// component 1 at byte offset 15*8). The multiplier limbs are broadcast from
// felemA itself, so only three products are needed:
//   ACC0 = A0*A0, ACC1 = A1*A1, ACC2 = A1*A0
// Each of the 15 steps (1 peeled + 14 loop iterations) also adds
// (lowest accumulator limb) * modulus and drops the lowest limb.
// The result is C0 = ACC0 - ACC1 (after adding .LpolyX) and C1 = 2 * ACC2,
// carry-normalised to 52-bit limbs and stored at felemR.
// Clobbers: %rax, %rcx, %rsi (advanced), %r8, %r9, %r10, %k1-%k4 and the zmm
// registers named by the aliases used below.
.globl C_ABI(fp2_sqr_ifma)
.p2align 6
C_ABI(fp2_sqr_ifma):

// k1 selects lane 0 only; k2 selects lanes 0..6 (limbs 8..14).
mov $1, %eax
kmovw %eax, %k1
mov $0x7f, %eax
kmovw %eax, %k2

vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO

vmovdqu64 64*0(felemA), A0a
vmovdqu64 64*1(felemA), A0b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemA), A1a
vmovdqu64 15*8 + 64*1(felemA), A1b{%k2}{z}

# Load the modulus
vmovdqa64 64*0 + .Lpoly(%rip), M0
vmovdqa64 64*1 + .Lpoly(%rip), M1

# Prepare the accumulators
vpxorq ACC0a, ACC0a, ACC0a
vpxorq ACC0b, ACC0b, ACC0b
vpxorq ACC1a, ACC1a, ACC1a
vpxorq ACC1b, ACC1b, ACC1b
vpxorq ACC2a, ACC2a, ACC2a
vpxorq ACC2b, ACC2b, ACC2b
vpxorq T0, T0, T0
vpxorq T1, T1, T1
vpxorq T2, T2, T2

# First iteration
// The multiplier limbs come from A; felemA is advanced one limb per step.
vpbroadcastq (felemA), B0curr
vpbroadcastq 15*8(felemA), B1curr
lea 8(felemA), felemA

vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B0curr, A0b, ACC0b
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B1curr, A1b, ACC1b
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A1b, ACC2b

// With an all-zero index vector, vpermq broadcasts lane 0 of the accumulator.
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr

vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b

// Keep what is left of limb 0 above bit 52; lane 0 only, other lanes zeroed.
vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}

mov $14, itr

1:
# Shift the ACC in zmms right by a word
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b

// The scratch accumulators start from zero in every iteration.
vpxorq ST0, ST0, ST0
vpxorq ST1, ST1, ST1
vpxorq ST2, ST2, ST2

vmovdqa64 B0curr, B0prev
vmovdqa64 B1curr, B1prev
vmovdqa64 Y0curr, Y0prev
vmovdqa64 Y1curr, Y1prev
vmovdqa64 Y2curr, Y2prev

vpbroadcastq (felemA), B0curr
vpbroadcastq 15*8(felemA), B1curr
lea 8(felemA), felemA

# High multiplications
vpmadd52huq B0prev, A0a, ACC0a # ACC0 = A0 * B0
vpmadd52huq B1prev, A1a, ACC1a # ACC1 = A1 * B1
vpmadd52huq B0prev, A1a, ACC2a # ACC2 = A1 * B0
vpmadd52huq B0prev, A0b, ACC0b
vpmadd52huq B1prev, A1b, ACC1b
vpmadd52huq B0prev, A1b, ACC2b
# We really want to have 8 independent vpmadd instructions in the pipe
// The high half of Y * M0 is accumulated into T (which already holds the
// saved carry) and reaches ACCxa through the vpaddq below.
vpmadd52huq Y0prev, M0, T0
vpmadd52huq Y1prev, M0, T1
vpmadd52huq Y2prev, M0, T2

vpmadd52huq Y0prev, M1, ACC0b
vpmadd52huq Y1prev, M1, ACC1b
vpmadd52huq Y2prev, M1, ACC2b
# Low multiplications
vpmadd52luq B0curr, A0a, ACC0a
vpmadd52luq B1curr, A1a, ACC1a
vpmadd52luq B0curr, A1a, ACC2a
vpmadd52luq B0curr, A0b, ST0
vpmadd52luq B1curr, A1b, ST1
vpmadd52luq B0curr, A1b, ST2

vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a
vpermq ACC0a, ZERO, Y0curr
vpermq ACC1a, ZERO, Y1curr
vpermq ACC2a, ZERO, Y2curr
// Merge the separately accumulated high-half products.
vpaddq ST0, ACC0b, ACC0b
vpaddq ST1, ACC1b, ACC1b
vpaddq ST2, ACC2b, ACC2b

vpmadd52luq Y0curr, M0, ACC0a
vpmadd52luq Y0curr, M1, ACC0b
vpmadd52luq Y1curr, M0, ACC1a
vpmadd52luq Y1curr, M1, ACC1b
vpmadd52luq Y2curr, M0, ACC2a
vpmadd52luq Y2curr, M1, ACC2b

vpsrlq $52, ACC0a, T0{%k1}{z}
vpsrlq $52, ACC1a, T1{%k1}{z}
vpsrlq $52, ACC2a, T2{%k1}{z}

dec itr
jne 1b
// Final shift and carry for the last step.
valignq $1, ACC0a, ACC0b, ACC0a
valignq $1, ACC0b, ZERO, ACC0b
valignq $1, ACC1a, ACC1b, ACC1a
valignq $1, ACC1b, ZERO, ACC1b
valignq $1, ACC2a, ACC2b, ACC2a
valignq $1, ACC2b, ZERO, ACC2b
vpaddq T0, ACC0a, ACC0a
vpaddq T1, ACC1a, ACC1a
vpaddq T2, ACC2a, ACC2a

# The last high multiplications
vpmadd52huq B0curr, A0a, ACC0a
vpmadd52huq B0curr, A0b, ACC0b
vpmadd52huq B1curr, A1a, ACC1a
vpmadd52huq B1curr, A1b, ACC1b
vpmadd52huq B0curr, A1a, ACC2a
vpmadd52huq B0curr, A1b, ACC2b

vpmadd52huq Y0curr, M0, ACC0a
vpmadd52huq Y0curr, M1, ACC0b
vpmadd52huq Y1curr, M0, ACC1a
vpmadd52huq Y1curr, M1, ACC1b
vpmadd52huq Y2curr, M0, ACC2a
vpmadd52huq Y2curr, M1, ACC2b

# C0 = A0*B0 - A1*B1
# C1 = A0*B1 + A1*B0
// In this routine B is A itself, so C0 = A0*A0 - A1*A1 and C1 = 2*A0*A1:
// ACC2 is doubled instead of being added to a fourth product.
vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b

vpaddq ACC2a, ACC2a, ACC2a
vpaddq ACC2b, ACC2b, ACC2b

vpsubq ACC1a, ACC0a, ACC0a
vpsubq ACC1b, ACC0b, ACC0b

# Now 'normalize' the acc to 52 bit words
// A0*/A1* are reused as scratch: they receive the bits above bit 52 of each limb.
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b
vpsrlq $52, ACC2a, A1a
vpsrlq $52, ACC2b, A1b

vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b

// Move every carry up by one limb; zero enters at limb 0.
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a
valignq $7, A1a, A1b, A1b
valignq $7, ZERO, A1a, A1a

vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b
vpaddq A1a, ACC2a, ACC2a
vpaddq A1b, ACC2b, ACC2b

// Second-level carry for C0, done on the mask bits in scalar registers:
//   k1/k2 = lanes where the sum is unsigned-less-than the carry just added,
//   k3/k4 = lanes equal to 2^52 - 1.
// add/adc shifts the 16-bit value k2:k1 left by one, the second add/adc adds
// k4:k3, and the xor with k4:k3 leaves the lanes that receive a carry.
// NOTE(review): predicate 1 is an unsigned 64-bit "less than"; confirm it is
// the intended carry-out test for limbs that were masked to 52 bits.
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4

kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d

add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2

// Subtracting 2^52 - 1 and masking to 52 bits equals adding 1 modulo 2^52.
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b

// Same second-level carry for C1.
vpcmpuq $1, A1a, ACC2a, %k1
vpcmpuq $1, A1b, ACC2b, %k2
vpcmpuq $0, AND_MASK, ACC2a, %k3
vpcmpuq $0, AND_MASK, ACC2b, %k4

kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d

add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2

vpsubq AND_MASK, ACC2a, ACC2a{%k1}
vpsubq AND_MASK, ACC2b, ACC2b{%k2}
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b

// k1 = lanes 0..6 again, for the masked stores of limbs 8..14.
mov $0x7f, %eax
kmovw %eax, %k1

vmovdqu64 ACC0a, 64*0(felemR)
vmovdqu64 ACC0b, 64*1(felemR){%k1}
vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1}
ret

###############################################################################
// fp2_sub(felemR = %rdi, felemA = %rsi, felemB = %rdx)
//
// Limb-wise subtraction R = A - B of two two-component elements (2 x 15
// quadwords each, component 1 at byte offset 15*8). .LpolyX is added to A
// first, then B is subtracted; the carry normalisation and the store to
// felemR are done by fp2_normalize, which expects component 0 in ACC0a/ACC0b
// and component 1 in ACC2a/ACC2b.
.globl C_ABI(fp2_sub)
.p2align 6
C_ABI(fp2_sub):

// k2 selects lanes 0..6 (limbs 8..14). No other mask is needed here:
// fp2_normalize writes %k1 before it reads it.
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemA), ACC1a
vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z}

vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemB), ACC3a
vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z}

vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b
vpaddq 64*0 + .LpolyX(%rip), ACC1a, ACC1a
vpaddq 64*1 + .LpolyX(%rip), ACC1b, ACC1b

vpsubq ACC2a, ACC0a, ACC0a
vpsubq ACC2b, ACC0b, ACC0b
// The component-1 difference goes to ACC2a/ACC2b, where fp2_normalize reads it.
vpsubq ACC3a, ACC1a, ACC2a
vpsubq ACC3b, ACC1b, ACC2b

// Tail-jump into the shared normalise-and-store code. The target has to be
// spelled through C_ABI(), exactly like its label, or the symbol is undefined
// on platforms where C_ABI() adds an underscore prefix.
jmp C_ABI(fp2_normalize)
###############################################################################
// fp2_add(felemR = %rdi, felemA = %rsi, felemB = %rdx)
//
// Limb-wise addition R = A + B of two two-component elements (2 x 15
// quadwords each, component 1 at byte offset 15*8). The sums are left in
// ACC0a/ACC0b (component 0) and ACC2a/ACC2b (component 1); execution then
// falls through into fp2_normalize, which carry-normalises and stores them.
.globl C_ABI(fp2_add)
.p2align 6
C_ABI(fp2_add):

// k2 selects lanes 0..6 (limbs 8..14). No other mask is needed here:
// fp2_normalize writes %k1 before it reads it.
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemA), ACC1a
vmovdqu64 15*8 + 64*1(felemA), ACC1b{%k2}{z}

vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}
vmovdqu64 15*8 + 64*0(felemB), ACC3a
vmovdqu64 15*8 + 64*1(felemB), ACC3b{%k2}{z}

vpaddq ACC2a, ACC0a, ACC0a
vpaddq ACC2b, ACC0b, ACC0b
// The component-1 sum goes to ACC2a/ACC2b, where fp2_normalize reads it.
vpaddq ACC3a, ACC1a, ACC2a
vpaddq ACC3b, ACC1b, ACC2b

// Fallthrough
###############################################################################
// fp2_normalize: shared tail of fp2_add (fall-through) and fp2_sub (jump).
// In:  ACC0a/ACC0b = component 0, ACC2a/ACC2b = component 1, limbs possibly
//      wider than 52 bits; felemR (%rdi) = destination.
// Out: both components masked to 52-bit limbs after carry handling and stored
//      at felemR (component 1 at byte offset 15*8).
// Clobbers: %rax, %rcx, %r8, %r9, %k1-%k4, ZERO, AND_MASK, A0a/A0b/A1a/A1b.
.p2align 6
C_ABI(fp2_normalize):

vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO

# Now 'normalize' the acc to 52 bit words
// A0*/A1* receive the bits above bit 52 of each limb.
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b
vpsrlq $52, ACC2a, A1a
vpsrlq $52, ACC2b, A1b

vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b

// Move every carry up by one limb (across the a/b boundary too); zero enters
// at limb 0.
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a
valignq $7, A1a, A1b, A1b
valignq $7, ZERO, A1a, A1a

vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b
vpaddq A1a, ACC2a, ACC2a
vpaddq A1b, ACC2b, ACC2b

// Second-level carry for component 0, done on the mask bits in scalar registers:
//   k1/k2 = lanes where the sum is unsigned-less-than the carry just added,
//   k3/k4 = lanes equal to 2^52 - 1.
// add/adc shifts the 16-bit value k2:k1 left by one, the second add/adc adds
// k4:k3, and the xor with k4:k3 leaves the lanes that receive a carry.
// NOTE(review): predicate 1 is an unsigned 64-bit "less than"; confirm it is
// the intended carry-out test for limbs that were masked to 52 bits.
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4

kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d

add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2

// Subtracting 2^52 - 1 and masking to 52 bits equals adding 1 modulo 2^52.
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b

// Same second-level carry for component 1.
vpcmpuq $1, A1a, ACC2a, %k1
vpcmpuq $1, A1b, ACC2b, %k2
vpcmpuq $0, AND_MASK, ACC2a, %k3
vpcmpuq $0, AND_MASK, ACC2b, %k4

kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d

add %al, %al
adc %cl, %cl
add %r8b, %al
adc %r9b, %cl
xor %r8b, %al
xor %r9b, %cl
kmovb %eax, %k1
kmovb %ecx, %k2

vpsubq AND_MASK, ACC2a, ACC2a{%k1}
vpsubq AND_MASK, ACC2b, ACC2b{%k2}
vpandq AND_MASK, ACC2a, ACC2a
vpandq AND_MASK, ACC2b, ACC2b

// k1 = lanes 0..6, for the masked stores of limbs 8..14.
mov $0x7f, %eax
kmovw %eax, %k1

vmovdqu64 ACC0a, 64*0(felemR)
vmovdqu64 ACC0b, 64*1(felemR){%k1}
vmovdqu64 ACC2a, 15*8 + 64*0(felemR)
vmovdqu64 ACC2b, 15*8 + 64*1(felemR){%k1}

ret


###############################################################################
// Arguments of fp2_swap: two buffers and the swap selector.
#define p1ptr %rdi
#define p2ptr %rsi
#define swap %rdx
// fp2_swap(p1ptr = %rdi, p2ptr = %rsi, swap = %rdx)
//
// Branch-free conditional exchange of two buffers of 4 x 15 quadwords each
// (four 15-limb elements at byte offsets 0, 15*8, 2*15*8 and 3*15*8).
// A select mask 0 - swap is applied bit by bit, so swap = 0 leaves both
// buffers unchanged and swap = 1 (mask all ones) exchanges them; both buffers
// are always read and rewritten.
// NOTE(review): other values of swap would mix bits of the two buffers -
// confirm callers only pass 0 or 1.
// Clobbers: %rax, %k2, %zmm0-%zmm24.
.globl C_ABI(fp2_swap)
.p2align 6
C_ABI(fp2_swap):

// k2 selects lanes 0..6 (limbs 8..14 of each element).
mov $0x7f, %eax
kmovw %eax, %k2
// TODO: get rid of the masks, not needed
// zmm0-zmm7 <- the four elements at p1ptr.
vmovdqu64 64*0(p1ptr), %zmm0
vmovdqu64 64*1(p1ptr), %zmm1{%k2}{z}
vmovdqu64 15*8 + 64*0(p1ptr), %zmm2
vmovdqu64 15*8 + 64*1(p1ptr), %zmm3{%k2}{z}
vmovdqu64 2*15*8 + 64*0(p1ptr), %zmm4
vmovdqu64 2*15*8 + 64*1(p1ptr), %zmm5{%k2}{z}
vmovdqu64 3*15*8 + 64*0(p1ptr), %zmm6
vmovdqu64 3*15*8 + 64*1(p1ptr), %zmm7{%k2}{z}

// zmm8-zmm15 <- the four elements at p2ptr.
vmovdqu64 64*0(p2ptr), %zmm8
vmovdqu64 64*1(p2ptr), %zmm9{%k2}{z}
vmovdqu64 15*8 + 64*0(p2ptr), %zmm10
vmovdqu64 15*8 + 64*1(p2ptr), %zmm11{%k2}{z}
vmovdqu64 2*15*8 + 64*0(p2ptr), %zmm12
vmovdqu64 2*15*8 + 64*1(p2ptr), %zmm13{%k2}{z}
vmovdqu64 3*15*8 + 64*0(p2ptr), %zmm14
vmovdqu64 3*15*8 + 64*1(p2ptr), %zmm15{%k2}{z}

// zmm16 <- 0 - swap in every lane (the select mask).
vpxorq %zmm16, %zmm16, %zmm16
vpbroadcastq swap, %zmm17
vpsubq %zmm17, %zmm16, %zmm16

// zmm17-zmm24 <- copy of the p2 data; it becomes the new p2 contents.
vmovdqa64 %zmm8, %zmm17
vmovdqa64 %zmm9, %zmm18
vmovdqa64 %zmm10, %zmm19
vmovdqa64 %zmm11, %zmm20
vmovdqa64 %zmm12, %zmm21
vmovdqa64 %zmm13, %zmm22
vmovdqa64 %zmm14, %zmm23
vmovdqa64 %zmm15, %zmm24

// Truth table 0xd8: per bit, result = mask ? p1 bit : destination (p2) bit.
vpternlogq $0xd8, %zmm16, %zmm0, %zmm17
vpternlogq $0xd8, %zmm16, %zmm1, %zmm18
vpternlogq $0xd8, %zmm16, %zmm2, %zmm19
vpternlogq $0xd8, %zmm16, %zmm3, %zmm20
vpternlogq $0xd8, %zmm16, %zmm4, %zmm21
vpternlogq $0xd8, %zmm16, %zmm5, %zmm22
vpternlogq $0xd8, %zmm16, %zmm6, %zmm23
vpternlogq $0xd8, %zmm16, %zmm7, %zmm24

// Truth table 0xe4: per bit, result = mask ? destination (p2) bit : p1 bit.
vpternlogq $0xe4, %zmm16, %zmm0, %zmm8
vpternlogq $0xe4, %zmm16, %zmm1, %zmm9
vpternlogq $0xe4, %zmm16, %zmm2, %zmm10
vpternlogq $0xe4, %zmm16, %zmm3, %zmm11
vpternlogq $0xe4, %zmm16, %zmm4, %zmm12
vpternlogq $0xe4, %zmm16, %zmm5, %zmm13
vpternlogq $0xe4, %zmm16, %zmm6, %zmm14
vpternlogq $0xe4, %zmm16, %zmm7, %zmm15


// zmm8-zmm15 are written back to p1ptr.
vmovdqu64 %zmm8, 64*0(p1ptr)
vmovdqu64 %zmm9, 64*1(p1ptr){%k2}
vmovdqu64 %zmm10, 15*8 + 64*0(p1ptr)
vmovdqu64 %zmm11, 15*8 + 64*1(p1ptr){%k2}
vmovdqu64 %zmm12, 2*15*8 + 64*0(p1ptr)
vmovdqu64 %zmm13, 2*15*8 + 64*1(p1ptr){%k2}
vmovdqu64 %zmm14, 3*15*8 + 64*0(p1ptr)
vmovdqu64 %zmm15, 3*15*8 + 64*1(p1ptr){%k2}

// zmm17-zmm24 are written back to p2ptr.
vmovdqu64 %zmm17, 64*0(p2ptr)
vmovdqu64 %zmm18, 64*1(p2ptr){%k2}
vmovdqu64 %zmm19, 15*8 + 64*0(p2ptr)
vmovdqu64 %zmm20, 15*8 + 64*1(p2ptr){%k2}
vmovdqu64 %zmm21, 2*15*8 + 64*0(p2ptr)
vmovdqu64 %zmm22, 2*15*8 + 64*1(p2ptr){%k2}
vmovdqu64 %zmm23, 3*15*8 + 64*0(p2ptr)
vmovdqu64 %zmm24, 3*15*8 + 64*1(p2ptr){%k2}

ret
###############################################################################
// fp_add(felemR = %rdi, felemA = %rsi, felemB = %rdx)
//
// Limb-wise addition R = A + B of two 15-quadword elements. The sum is left
// in ACC0a/ACC0b and execution falls through into fp_normalize, which
// carry-normalises it and stores it at %rdi.
.globl C_ABI(fp_add)
.p2align 6
C_ABI(fp_add):

// k2 selects lanes 0..6 (limbs 8..14).
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}

vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}

vpaddq ACC2a, ACC0a, ACC0a
vpaddq ACC2b, ACC0b, ACC0b

// Fallthrough
###############################################################################
// fp_normalize: shared tail of fp_add (fall-through) and fp_sub (jump).
// In:  ACC0a/ACC0b = one 15-limb element with limbs possibly wider than
//      52 bits; %rdi = destination.
// Out: the element masked to 52-bit limbs after carry handling, stored at %rdi.
// Clobbers: %rax, %rcx, %r8, %r9, %k1-%k4, ZERO, AND_MASK, A0a, A0b.
.p2align 6
C_ABI(fp_normalize):

vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO

# Now 'normalize' the acc to 52 bit words
// A0a/A0b receive the bits above bit 52 of each limb.
vpsrlq $52, ACC0a, A0a
vpsrlq $52, ACC0b, A0b

vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b

// Move every carry up by one limb; zero enters at limb 0.
valignq $7, A0a, A0b, A0b
valignq $7, ZERO, A0a, A0a

vpaddq A0a, ACC0a, ACC0a
vpaddq A0b, ACC0b, ACC0b

// Second-level carry, done on the mask bits in scalar registers:
//   k1/k2 = lanes where the sum is unsigned-less-than the carry just added,
//   k3/k4 = lanes equal to 2^52 - 1.
// add/adc shifts the 16-bit value k2:k1 left by one, the second add/adc adds
// k4:k3, and the xor with k4:k3 leaves the lanes that receive a carry.
// NOTE(review): predicate 1 is an unsigned 64-bit "less than"; confirm it is
// the intended carry-out test for limbs that were masked to 52 bits.
vpcmpuq $1, A0a, ACC0a, %k1
vpcmpuq $1, A0b, ACC0b, %k2
vpcmpuq $0, AND_MASK, ACC0a, %k3
vpcmpuq $0, AND_MASK, ACC0b, %k4

kmovb %k1, %eax
kmovb %k2, %ecx
kmovb %k3, %r8d
kmovb %k4, %r9d

add %al, %al
adc %cl, %cl

add %r8b, %al
adc %r9b, %cl

xor %r8b, %al
xor %r9b, %cl

kmovb %eax, %k1
kmovb %ecx, %k2

// Subtracting 2^52 - 1 and masking to 52 bits equals adding 1 modulo 2^52.
vpsubq AND_MASK, ACC0a, ACC0a{%k1}
vpsubq AND_MASK, ACC0b, ACC0b{%k2}
vpandq AND_MASK, ACC0a, ACC0a
vpandq AND_MASK, ACC0b, ACC0b

// k1 = lanes 0..6, for the masked store of limbs 8..14.
mov $0x7f, %eax
kmovw %eax, %k1

vmovdqu64 ACC0a, 64*0(%rdi)
vmovdqu64 ACC0b, 64*1(%rdi){%k1}

ret

###############################################################################
// fp_sub(felemR = %rdi, felemA = %rsi, felemB = %rdx)
//
// Limb-wise subtraction R = A - B of two 15-quadword elements. .LpolyX is
// added to A first, then B is subtracted; the carry normalisation and the
// store to %rdi are done by fp_normalize, which expects the value in
// ACC0a/ACC0b.
.globl C_ABI(fp_sub)
.p2align 6
C_ABI(fp_sub):

// k2 selects lanes 0..6 (limbs 8..14).
mov $0x7f, %eax
kmovw %eax, %k2
vmovdqu64 64*0(felemA), ACC0a
vmovdqu64 64*1(felemA), ACC0b{%k2}{z}

vmovdqu64 64*0(felemB), ACC2a
vmovdqu64 64*1(felemB), ACC2b{%k2}{z}

vpaddq 64*0 + .LpolyX(%rip), ACC0a, ACC0a
vpaddq 64*1 + .LpolyX(%rip), ACC0b, ACC0b

vpsubq ACC2a, ACC0a, ACC0a
vpsubq ACC2b, ACC0b, ACC0b

// Tail-jump into the shared normalise-and-store code. The target has to be
// spelled through C_ABI(), exactly like its label, or the symbol is undefined
// on platforms where C_ABI() adds an underscore prefix.
jmp C_ABI(fp_normalize)


+ 523
- 0
fp2_packed_751_ifma.S 查看文件

@@ -0,0 +1,523 @@
// C_ABI(x) expands to the assembler-level name of the C symbol x.
// HIDDEN expands to the platform's directive for hiding a symbol; it is not
// used in the part of this file shown here.
#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
#define C_ABI(x) x
#define HIDDEN .hidden
#endif

// Accumulator registers, named by index.
// NOTE(review): presumably one register per limb position of the packed
// (vertical) layout - confirm in do_mul_x2, which is not visible here.
#define ACC0 %zmm0
#define ACC1 %zmm1
#define ACC2 %zmm2
#define ACC3 %zmm3
#define ACC4 %zmm4
#define ACC5 %zmm5
#define ACC6 %zmm6
#define ACC7 %zmm7
#define ACC8 %zmm8
#define ACC9 %zmm9
#define ACC10 %zmm10
#define ACC11 %zmm11
#define ACC12 %zmm12
#define ACC13 %zmm13
#define ACC14 %zmm14
#define ACC15 %zmm15

// Fifteen operand registers A0..A14.
#define A0 %zmm16
#define A1 %zmm17
#define A2 %zmm18
#define A3 %zmm19
#define A4 %zmm20
#define A5 %zmm21
#define A6 %zmm22
#define A7 %zmm23
#define A8 %zmm24
#define A9 %zmm25
#define A10 %zmm26
#define A11 %zmm27
#define A12 %zmm28
#define A13 %zmm29
#define A14 %zmm30

// Shares zmm31 with the permutation index vector loaded in fp2_mul_ifma_x2.
#define B %zmm31

// Result / first operand / second operand pointers (single-operation naming).
#define rptr %rdi
#define aptr %rsi
#define bptr %rdx

// Same three registers under the names used for the first operation of a pair.
#define r0ptr %rdi
#define a0ptr %rsi
#define b0ptr %rdx

// Pointers of the second operation of a pair (fourth to sixth argument registers).
#define r1ptr %rcx
#define a1ptr %r8
#define b1ptr %r9

// Scalar scratch register, used to build k-mask constants.
#define hlp %rax

.p2align 6
// .Lmask and .Lpoly label the same data: the modulus as fifteen 52-bit limbs,
// lowest limb first, padded with one zero quadword.
// NOTE(review): .Lmask is not referenced in the visible part of this file.
.Lmask:
.Lpoly:
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0

// Every limb of .Lpoly shifted left by 8 bits (256 * modulus, limb by limb).
// fp2_mul_ifma_x2 broadcasts one limb at a time and adds it before a masked
// limb-wise subtraction.
.LpolyX:
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00
.quad 0x0fffffffffffff00, 0x0fffffffffffff00, 0x0fffffffffffff00, 0x049f878a8eeaff00
.quad 0x07cc76e3ec968500, 0x076da959b1a13f00, 0x084e9867d6ebe800, 0x0b5045cb25748000
.quad 0x0f97badc66856200, 0x041f71c0e1290900, 0x000000006fe5d500, 0

// vpermq index vector: duplicates quadword pairs (0,1) and (2,3) of the source.
.Lperm0:
.quad 0,1,0,1,2,3,2,3

// vpermq index vector: takes quadword pairs (4,5) and (6,7) of the source, each
// followed by its swapped copy.
.Lperm1:
.quad 4,5,5,4,6,7,7,6

// TODO: avoid transposing every call by keeping data vertical throughout

// Performs 8 field multiplications in parallel
.globl C_ABI(fp2_mul_ifma_x2)
C_ABI(fp2_mul_ifma_x2):

push %rbp
mov %rsp, %rbp
sub $960, %rsp
and $-64, %rsp

mov $0x7f, %rax
kmovq %rax, %k5

// Load a0[0]
vmovdqu64 0*64(a0ptr), %zmm0
vmovdqu64 1*64(a0ptr), %zmm1{%k5}{z}
lea 15*8(a0ptr), a0ptr
// Load a0[1]
vmovdqu64 0*64(a0ptr), %zmm2
vmovdqu64 1*64(a0ptr), %zmm3{%k5}{z}
// Load b0[0]
vmovdqu64 0*64(b0ptr), %zmm4
vmovdqu64 1*64(b0ptr), %zmm5{%k5}{z}
lea 15*8(b0ptr), b0ptr
// Load b0[1]
vmovdqu64 0*64(b0ptr), %zmm6
vmovdqu64 1*64(b0ptr), %zmm7{%k5}{z}
// Load a1[0]
vmovdqu64 0*64(a1ptr), %zmm8
vmovdqu64 1*64(a1ptr), %zmm9{%k5}{z}
lea 15*8(a1ptr), a1ptr
// Load a1[1]
vmovdqu64 0*64(a1ptr), %zmm10
vmovdqu64 1*64(a1ptr), %zmm11{%k5}{z}
// Load b1[0]
vmovdqu64 0*64(b1ptr), %zmm12
vmovdqu64 1*64(b1ptr), %zmm13{%k5}{z}
lea 15*8(b1ptr), b1ptr
// Load b1[1]
vmovdqu64 0*64(b1ptr), %zmm14
vmovdqu64 1*64(b1ptr), %zmm15{%k5}{z}
// Transpose
vpunpcklqdq %zmm2, %zmm0, %zmm16 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm2, %zmm0, %zmm17 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm6, %zmm4, %zmm18 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm6, %zmm4, %zmm19 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm10, %zmm8, %zmm20 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm10, %zmm8, %zmm21 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm14, %zmm12, %zmm22 // 0 0 2 2 4 4 6 6
vpunpckhqdq %zmm14, %zmm12, %zmm23 // 1 1 3 3 5 5 7 7
vpunpcklqdq %zmm3, %zmm1, %zmm24 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm3, %zmm1, %zmm25 // 9 9 11 11 13 13 15 15
vpunpcklqdq %zmm7, %zmm5, %zmm26 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm7, %zmm5, %zmm27 // 9 9 11 11 13 13 15 15
vpunpcklqdq %zmm11, %zmm9, %zmm28 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm11, %zmm9, %zmm29 // 9 9 11 11 13 13 15 15
vpunpcklqdq %zmm15, %zmm13, %zmm30 // 8 8 10 10 12 12 14 14
vpunpckhqdq %zmm15, %zmm13, %zmm31 // 9 9 11 11 13 13 15 15

vshufi64x2 $0x44, %zmm20, %zmm16, %zmm0 // 0 0 2 2 0 0 2 2
vshufi64x2 $0x44, %zmm22, %zmm18, %zmm1 // 0 0 2 2 0 0 2 2
vshufi64x2 $0xee, %zmm20, %zmm16, %zmm2 // 4 4 6 6 4 4 6 6
vshufi64x2 $0xee, %zmm22, %zmm18, %zmm3 // 4 4 6 6 4 4 6 6

vshufi64x2 $0x44, %zmm21, %zmm17, %zmm4 // 1 1 3 3 1 1 3 3
vshufi64x2 $0x44, %zmm23, %zmm19, %zmm5 // 1 1 3 3 1 1 3 3
vshufi64x2 $0xee, %zmm21, %zmm17, %zmm6 // 5 5 7 7 5 5 7 7
vshufi64x2 $0xee, %zmm23, %zmm19, %zmm7 // 5 5 7 7 5 5 7 7

vshufi64x2 $0x44, %zmm28, %zmm24, %zmm8 // 8 8 10 10 8 8 10 10
vshufi64x2 $0x44, %zmm30, %zmm26, %zmm9 // 8 8 10 10 8 8 10 10
vshufi64x2 $0xee, %zmm28, %zmm24, %zmm10 // 12 12 14 14 12 12 14 14
vshufi64x2 $0xee, %zmm30, %zmm26, %zmm11 // 12 12 14 14 12 12 14 14

vshufi64x2 $0x44, %zmm29, %zmm25, %zmm12 // 9 9 11 11 9 9 11 11
vshufi64x2 $0x44, %zmm31, %zmm27, %zmm13 // 9 9 11 11 9 9 11 11
vshufi64x2 $0xee, %zmm29, %zmm25, %zmm14 // 13 13 15 15 13 13 15 15
vshufi64x2 $0xee, %zmm31, %zmm27, %zmm15 // 13 13 15 15 13 13 15 15

vshufi64x2 $0x88, %zmm1, %zmm0, %zmm16 //0
vshufi64x2 $0x88, %zmm5, %zmm4, %zmm17 //1
vshufi64x2 $0xdd, %zmm1, %zmm0, %zmm18 //
vshufi64x2 $0xdd, %zmm5, %zmm4, %zmm19
vshufi64x2 $0x88, %zmm3, %zmm2, %zmm20
vshufi64x2 $0x88, %zmm7, %zmm6, %zmm21
vshufi64x2 $0xdd, %zmm3, %zmm2, %zmm22
vshufi64x2 $0xdd, %zmm7, %zmm6, %zmm23
vshufi64x2 $0x88, %zmm9, %zmm8, %zmm24
vshufi64x2 $0x88, %zmm13, %zmm12, %zmm25
vshufi64x2 $0xdd, %zmm9, %zmm8, %zmm26
vshufi64x2 $0xdd, %zmm13, %zmm12, %zmm27
vshufi64x2 $0x88, %zmm11, %zmm10, %zmm28
vshufi64x2 $0x88, %zmm15, %zmm14, %zmm29
vshufi64x2 $0xdd, %zmm11, %zmm10, %zmm30

vmovdqa64 .Lperm0(%rip), %zmm31
vpermq %zmm16, %zmm31, %zmm0
vpermq %zmm17, %zmm31, %zmm1
vpermq %zmm18, %zmm31, %zmm2
vpermq %zmm19, %zmm31, %zmm3
vpermq %zmm20, %zmm31, %zmm4
vpermq %zmm21, %zmm31, %zmm5
vpermq %zmm22, %zmm31, %zmm6
vpermq %zmm23, %zmm31, %zmm7
vpermq %zmm24, %zmm31, %zmm8
vpermq %zmm25, %zmm31, %zmm9
vpermq %zmm26, %zmm31, %zmm10
vpermq %zmm27, %zmm31, %zmm11
vpermq %zmm28, %zmm31, %zmm12
vpermq %zmm29, %zmm31, %zmm13
vpermq %zmm30, %zmm31, %zmm14

.irp r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14
vmovdqu64 %zmm\r, \r*64(%rsp)
.endr

vmovdqa64 .Lperm1(%rip), %zmm31
vpermq %zmm16, %zmm31, A0
vpermq %zmm17, %zmm31, A1
vpermq %zmm18, %zmm31, A2
vpermq %zmm19, %zmm31, A3
vpermq %zmm20, %zmm31, A4
vpermq %zmm21, %zmm31, A5
vpermq %zmm22, %zmm31, A6
vpermq %zmm23, %zmm31, A7
vpermq %zmm24, %zmm31, A8
vpermq %zmm25, %zmm31, A9
vpermq %zmm26, %zmm31, A10
vpermq %zmm27, %zmm31, A11
vpermq %zmm28, %zmm31, A12
vpermq %zmm29, %zmm31, A13
vpermq %zmm30, %zmm31, A14

lea (%rsp), bptr
call do_mul_x2

// After parallel multiplication the layout is:
// A0[0] * B0[0], A0[1] * B0[1], A0[0] * B0[1], A0[1] * B0[0], A1[0] * B1[0], A1[1] * B1[1], A1[0] * B1[1], A1[1] * B1[0]
// We need to compute:
// A0[0] * B0[0] - A0[1] * B0[1], A0[0] * B0[1] + A0[1] * B0[0], A1[0] * B1[0] - A1[1] * B1[1], A1[0] * B1[1] + A1[1] * B1[0]
vpsrldq $8, ACC0, A0
vpsrldq $8, ACC1, A1
vpsrldq $8, ACC2, A2
vpsrldq $8, ACC3, A3
vpsrldq $8, ACC4, A4
vpsrldq $8, ACC5, A5
vpsrldq $8, ACC6, A6
vpsrldq $8, ACC7, A7
vpsrldq $8, ACC8, A8
vpsrldq $8, ACC9, A9
vpsrldq $8, ACC10, A10
vpsrldq $8, ACC11, A11
vpsrldq $8, ACC12, A12
vpsrldq $8, ACC13, A13
vpsrldq $8, ACC14, A14

mov $0x44, hlp
kmovq hlp, %k7

vpaddq A0, ACC0, ACC0{%k7}
vpaddq A1, ACC1, ACC1{%k7}
vpaddq A2, ACC2, ACC2{%k7}
vpaddq A3, ACC3, ACC3{%k7}
vpaddq A4, ACC4, ACC4{%k7}
vpaddq A5, ACC5, ACC5{%k7}
vpaddq A6, ACC6, ACC6{%k7}
vpaddq A7, ACC7, ACC7{%k7}
vpaddq A8, ACC8, ACC8{%k7}
vpaddq A9, ACC9, ACC9{%k7}
vpaddq A10, ACC10, ACC10{%k7}
vpaddq A11, ACC11, ACC11{%k7}
vpaddq A12, ACC12, ACC12{%k7}
vpaddq A13, ACC13, ACC13{%k7}
vpaddq A14, ACC14, ACC14{%k7}

mov $0x11, hlp
kmovq hlp, %k7

vpaddq 0*8+.LpolyX(%rip){1to8}, ACC0, ACC0{%k7}
vpaddq 1*8+.LpolyX(%rip){1to8}, ACC1, ACC1{%k7}
vpaddq 2*8+.LpolyX(%rip){1to8}, ACC2, ACC2{%k7}
vpaddq 3*8+.LpolyX(%rip){1to8}, ACC3, ACC3{%k7}
vpaddq 4*8+.LpolyX(%rip){1to8}, ACC4, ACC4{%k7}
vpaddq 5*8+.LpolyX(%rip){1to8}, ACC5, ACC5{%k7}
vpaddq 6*8+.LpolyX(%rip){1to8}, ACC6, ACC6{%k7}
vpaddq 7*8+.LpolyX(%rip){1to8}, ACC7, ACC7{%k7}
vpaddq 8*8+.LpolyX(%rip){1to8}, ACC8, ACC8{%k7}
vpaddq 9*8+.LpolyX(%rip){1to8}, ACC9, ACC9{%k7}
vpaddq 10*8+.LpolyX(%rip){1to8}, ACC10, ACC10{%k7}
vpaddq 11*8+.LpolyX(%rip){1to8}, ACC11, ACC11{%k7}
vpaddq 12*8+.LpolyX(%rip){1to8}, ACC12, ACC12{%k7}
vpaddq 13*8+.LpolyX(%rip){1to8}, ACC13, ACC13{%k7}
vpaddq 14*8+.LpolyX(%rip){1to8}, ACC14, ACC14{%k7}

vpsubq A0, ACC0, ACC0{%k7}
vpsubq A1, ACC1, ACC1{%k7}
vpsubq A2, ACC2, ACC2{%k7}
vpsubq A3, ACC3, ACC3{%k7}
vpsubq A4, ACC4, ACC4{%k7}
vpsubq A5, ACC5, ACC5{%k7}
vpsubq A6, ACC6, ACC6{%k7}
vpsubq A7, ACC7, ACC7{%k7}
vpsubq A8, ACC8, ACC8{%k7}
vpsubq A9, ACC9, ACC9{%k7}
vpsubq A10, ACC10, ACC10{%k7}
vpsubq A11, ACC11, ACC11{%k7}
vpsubq A12, ACC12, ACC12{%k7}
vpsubq A13, ACC13, ACC13{%k7}
vpsubq A14, ACC14, ACC14{%k7}
vpsrlq $52, ACC0, B
vpaddq B, ACC1, ACC1
vpandq .Lpoly(%rip){1to8}, ACC0, ACC0

vpsrlq $52, ACC1, B
vpaddq B, ACC2, ACC2
vpandq .Lpoly(%rip){1to8}, ACC1, ACC1

vpsrlq $52, ACC2, B
vpaddq B, ACC3, ACC3
vpandq .Lpoly(%rip){1to8}, ACC2, ACC2

vpsrlq $52, ACC3, B
vpaddq B, ACC4, ACC4
vpandq .Lpoly(%rip){1to8}, ACC3, ACC3

vpsrlq $52, ACC4, B
vpaddq B, ACC5, ACC5
vpandq .Lpoly(%rip){1to8}, ACC4, ACC4

vpsrlq $52, ACC5, B
vpaddq B, ACC6, ACC6
vpandq .Lpoly(%rip){1to8}, ACC5, ACC5

vpsrlq $52, ACC6, B
vpaddq B, ACC7, ACC7
vpandq .Lpoly(%rip){1to8}, ACC6, ACC6

vpsrlq $52, ACC7, B
vpaddq B, ACC8, ACC8
vpandq .Lpoly(%rip){1to8}, ACC7, ACC7

vpsrlq $52, ACC8, B
vpaddq B, ACC9, ACC9
vpandq .Lpoly(%rip){1to8}, ACC8, ACC8

vpsrlq $52, ACC9, B
vpaddq B, ACC10, ACC10
vpandq .Lpoly(%rip){1to8}, ACC9, ACC9

vpsrlq $52, ACC10, B
vpaddq B, ACC11, ACC11
vpandq .Lpoly(%rip){1to8}, ACC10, ACC10

vpsrlq $52, ACC11, B
vpaddq B, ACC12, ACC12
vpandq .Lpoly(%rip){1to8}, ACC11, ACC11

vpsrlq $52, ACC12, B
vpaddq B, ACC13, ACC13
vpandq .Lpoly(%rip){1to8}, ACC12, ACC12

vpsrlq $52, ACC13, B
vpaddq B, ACC14, ACC14
vpandq .Lpoly(%rip){1to8}, ACC13, ACC13

vpandq .Lpoly(%rip){1to8}, ACC14, ACC14

// Transpose to horizontal
vpunpcklqdq ACC1, ACC0, ACC0
vpunpcklqdq ACC3, ACC2, ACC1
vpunpcklqdq ACC5, ACC4, ACC2
vpunpcklqdq ACC7, ACC6, ACC3
vpunpcklqdq ACC9, ACC8, ACC4
vpunpcklqdq ACC11, ACC10, ACC5
vpunpcklqdq ACC13, ACC12, ACC6
vmovdqa64 ACC14, ACC7

vshufi64x2 $0x44, ACC1, ACC0, A0
vshufi64x2 $0x44, ACC3, ACC2, A1
vshufi64x2 $0x44, ACC5, ACC4, A2
vshufi64x2 $0x44, ACC7, ACC6, A3

vshufi64x2 $0xee, ACC1, ACC0, A4
vshufi64x2 $0xee, ACC3, ACC2, A5
vshufi64x2 $0xee, ACC5, ACC4, A6
vshufi64x2 $0xee, ACC7, ACC6, A7

vshufi64x2 $0x88, A1, A0, ACC0
vshufi64x2 $0x88, A3, A2, ACC1
vshufi64x2 $0xdd, A1, A0, ACC2
vshufi64x2 $0xdd, A3, A2, ACC3

vshufi64x2 $0x88, A5, A4, ACC4
vshufi64x2 $0x88, A7, A6, ACC5
vshufi64x2 $0xdd, A5, A4, ACC6
vshufi64x2 $0xdd, A7, A6, ACC7

vmovdqu64 ACC0, 0*64(r0ptr)
vmovdqu64 ACC1, 1*64(r0ptr){%k5}
lea 15*8(r0ptr), r0ptr
vmovdqu64 ACC2, 0*64(r0ptr)
vmovdqu64 ACC3, 1*64(r0ptr){%k5}

vmovdqu64 ACC4, 0*64(r1ptr)
vmovdqu64 ACC5, 1*64(r1ptr){%k5}
lea 15*8(r1ptr), r1ptr
vmovdqu64 ACC6, 0*64(r1ptr)
vmovdqu64 ACC7, 1*64(r1ptr){%k5}

mov %rbp, %rsp
pop %rbp
ret

// Performs 8 field multiplications in parallel
// (one per 64-bit lane of the zmm registers).
//
// amm_751_ifma_x2 loads fifteen 64-byte vectors from aptr into A0..A14 and
// then falls through into do_mul_x2. do_mul_x2 is also entered directly with
// a call by code that has already filled A0..A14 itself.
// A0..A14, ACC0..ACC15, B, aptr, bptr and hlp are macros defined outside the
// part of the file shown here.
//
// On return the result is held in ACC0..ACC14; nothing is stored to memory
// by this routine. bptr has been advanced by 15*64 bytes and hlp is zero.
.globl C_ABI(amm_751_ifma_x2)
C_ABI(amm_751_ifma_x2):

vmovdqu64 0*64(aptr), A0
vmovdqu64 1*64(aptr), A1
vmovdqu64 2*64(aptr), A2
vmovdqu64 3*64(aptr), A3
vmovdqu64 4*64(aptr), A4
vmovdqu64 5*64(aptr), A5
vmovdqu64 6*64(aptr), A6
vmovdqu64 7*64(aptr), A7
vmovdqu64 8*64(aptr), A8
vmovdqu64 9*64(aptr), A9
vmovdqu64 10*64(aptr), A10
vmovdqu64 11*64(aptr), A11
vmovdqu64 12*64(aptr), A12
vmovdqu64 13*64(aptr), A13
vmovdqu64 14*64(aptr), A14
do_mul_x2:
// Clear all sixteen accumulators
vpxorq ACC0, ACC0, ACC0
vpxorq ACC1, ACC1, ACC1
vpxorq ACC2, ACC2, ACC2
vpxorq ACC3, ACC3, ACC3
vpxorq ACC4, ACC4, ACC4
vpxorq ACC5, ACC5, ACC5
vpxorq ACC6, ACC6, ACC6
vpxorq ACC7, ACC7, ACC7
vpxorq ACC8, ACC8, ACC8
vpxorq ACC9, ACC9, ACC9
vpxorq ACC10, ACC10, ACC10
vpxorq ACC11, ACC11, ACC11
vpxorq ACC12, ACC12, ACC12
vpxorq ACC13, ACC13, ACC13
vpxorq ACC14, ACC14, ACC14
vpxorq ACC15, ACC15, ACC15

// One loop iteration per 64-byte vector of the second operand (15 in total)
mov $15, hlp

1:
vmovdqu64 (bptr), B
lea 1*64(bptr), bptr
// Low 52 bits of every product A[i] * B are added to ACC[i]
vpmadd52luq A0, B, ACC0
vpmadd52luq A1, B, ACC1
vpmadd52luq A2, B, ACC2
vpmadd52luq A3, B, ACC3
vpmadd52luq A4, B, ACC4
vpmadd52luq A5, B, ACC5
vpmadd52luq A6, B, ACC6
vpmadd52luq A7, B, ACC7
vpmadd52luq A8, B, ACC8
vpmadd52luq A9, B, ACC9
vpmadd52luq A10, B, ACC10
vpmadd52luq A11, B, ACC11
vpmadd52luq A12, B, ACC12
vpmadd52luq A13, B, ACC13
vpmadd52luq A14, B, ACC14

// High 52 bits of every product A[i] * B are added to ACC[i+1]
vpmadd52huq A0, B, ACC1
vpmadd52huq A1, B, ACC2
vpmadd52huq A2, B, ACC3
vpmadd52huq A3, B, ACC4
vpmadd52huq A4, B, ACC5
vpmadd52huq A5, B, ACC6
vpmadd52huq A6, B, ACC7
vpmadd52huq A7, B, ACC8
vpmadd52huq A8, B, ACC9
vpmadd52huq A9, B, ACC10
vpmadd52huq A10, B, ACC11
vpmadd52huq A11, B, ACC12
vpmadd52huq A12, B, ACC13
vpmadd52huq A13, B, ACC14
vpmadd52huq A14, B, ACC15

// Reduction step: B now holds a copy of ACC0 and is used as the multiplier
// for the .Lpoly digits (vpmadd52 only reads the low 52 bits of each lane).
// NOTE(review): the .Lpoly table used by this file is outside the view;
// using ACC0 directly as the multiplier presumably relies on its lowest
// digit being 2^52-1 - confirm.
vmovdqa64 ACC0, B

// Add the low halves of B * poly[i]. ACC0 is shifted right by 52 and folded
// into ACC1, and the interleaved moves slide every accumulator down by one
// register (new ACC[i] = old ACC[i+1] + low half), ending with ACC15 cleared.
vpmadd52luq 0*8 + .Lpoly(%rip){1to8}, B, ACC0
vpsrlq $52, ACC0, ACC0
vpmadd52luq 1*8 + .Lpoly(%rip){1to8}, B, ACC1
vpaddq ACC1, ACC0, ACC0
vpmadd52luq 2*8 + .Lpoly(%rip){1to8}, B, ACC2
vmovdqa64 ACC2, ACC1
vpmadd52luq 3*8 + .Lpoly(%rip){1to8}, B, ACC3
vmovdqa64 ACC3, ACC2
vpmadd52luq 4*8 + .Lpoly(%rip){1to8}, B, ACC4
vmovdqa64 ACC4, ACC3
vpmadd52luq 5*8 + .Lpoly(%rip){1to8}, B, ACC5
vmovdqa64 ACC5, ACC4
vpmadd52luq 6*8 + .Lpoly(%rip){1to8}, B, ACC6
vmovdqa64 ACC6, ACC5
vpmadd52luq 7*8 + .Lpoly(%rip){1to8}, B, ACC7
vmovdqa64 ACC7, ACC6
vpmadd52luq 8*8 + .Lpoly(%rip){1to8}, B, ACC8
vmovdqa64 ACC8, ACC7
vpmadd52luq 9*8 + .Lpoly(%rip){1to8}, B, ACC9
vmovdqa64 ACC9, ACC8
vpmadd52luq 10*8 + .Lpoly(%rip){1to8}, B, ACC10
vmovdqa64 ACC10, ACC9
vpmadd52luq 11*8 + .Lpoly(%rip){1to8}, B, ACC11
vmovdqa64 ACC11, ACC10
vpmadd52luq 12*8 + .Lpoly(%rip){1to8}, B, ACC12
vmovdqa64 ACC12, ACC11
vpmadd52luq 13*8 + .Lpoly(%rip){1to8}, B, ACC13
vmovdqa64 ACC13, ACC12
vpmadd52luq 14*8 + .Lpoly(%rip){1to8}, B, ACC14
vmovdqa64 ACC14, ACC13
vmovdqa64 ACC15, ACC14
vpxorq ACC15, ACC15, ACC15

// Add the high halves of B * poly[i]; because the accumulators were just
// slid down by one, each high half lands one digit above its low half.
vpmadd52huq 0*8 + .Lpoly(%rip){1to8}, B, ACC0
vpmadd52huq 1*8 + .Lpoly(%rip){1to8}, B, ACC1
vpmadd52huq 2*8 + .Lpoly(%rip){1to8}, B, ACC2
vpmadd52huq 3*8 + .Lpoly(%rip){1to8}, B, ACC3
vpmadd52huq 4*8 + .Lpoly(%rip){1to8}, B, ACC4
vpmadd52huq 5*8 + .Lpoly(%rip){1to8}, B, ACC5
vpmadd52huq 6*8 + .Lpoly(%rip){1to8}, B, ACC6
vpmadd52huq 7*8 + .Lpoly(%rip){1to8}, B, ACC7
vpmadd52huq 8*8 + .Lpoly(%rip){1to8}, B, ACC8
vpmadd52huq 9*8 + .Lpoly(%rip){1to8}, B, ACC9
vpmadd52huq 10*8 + .Lpoly(%rip){1to8}, B, ACC10
vpmadd52huq 11*8 + .Lpoly(%rip){1to8}, B, ACC11
vpmadd52huq 12*8 + .Lpoly(%rip){1to8}, B, ACC12
vpmadd52huq 13*8 + .Lpoly(%rip){1to8}, B, ACC13
vpmadd52huq 14*8 + .Lpoly(%rip){1to8}, B, ACC14

dec hlp
jnz 1b

ret

+ 268
- 0
fp_751_ifma.S 查看文件

@@ -0,0 +1,268 @@
// Symbol-naming helpers: C_ABI(x) yields the assembler-level name of the
// C function x, HIDDEN is the directive for restricting symbol visibility.
#if defined(__APPLE__)
/* OS X's C ABI prefixes functions with underscore. */
#define C_ABI(x) _ ## x
#define HIDDEN .private_extern
#else
// Other targets: the C name is used unchanged
#define C_ABI(x) x
#define HIDDEN .hidden
#endif

.p2align 6
// vpermw index table used by norm2red: for each of the 8 output qwords, the
// four consecutive 16-bit input words that contain that 52-bit digit.
.LpermMask0:
.word 0,1,2,3, 3,4,5,6, 6,7,8,9, 9,10,11,12, 13,14,15,16, 16,17,18,19, 19,20,21,22, 22,23,24,25
// Per-qword right-shift counts (vpsrlvq in norm2red) that bring each digit
// gathered with .LpermMask0 down to bit 0.
.LshiftMask0:
.quad 0,4,8,12,0,4,8,12
// 2^52 - 1: keeps the low 52 bits of a qword
.LandMask:
.quad 0xfffffffffffff

.p2align 6
// Modulus in radix 2^52, least significant digit first: 15 digits plus one
// zero qword of padding. The digits are the 64-bit p751 words from
// sidh_ref/P751.c regrouped into 52-bit pieces.
.Lpoly:
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff
.quad 0x000fffffffffffff, 0x000fffffffffffff, 0x000fffffffffffff, 0x00049f878a8eeaff
.quad 0x0007cc76e3ec9685, 0x00076da959b1a13f, 0x00084e9867d6ebe8, 0x000b5045cb257480
.quad 0x000f97badc668562, 0x00041f71c0e12909, 0x00000000006fe5d5, 0

// Second operand used by to_mont_ifma, 15 radix-2^52 digits plus padding.
// NOTE(review): presumably R^2 mod p for the Montgomery radix used by
// fp_mul_ifma - confirm.
.LR2:
.quad 0x000dad40589641fd, 0x000452a233046449, 0x000edb010161a696, 0x00036941472e3fd8
.quad 0x000e2082a2e7065e, 0x000904f8751f40bf, 0x0007fc814932cca8, 0x00033f174b08b2ee
.quad 0x0009814efb9f1375, 0x00099594a1afe512, 0x00043c75310de66d, 0x000197021a5b37b0
.quad 0x000cc1a272e73959, 0x000a733d7c97cd76, 0x0000000000292ee8, 0

// The integer 1 (16 qwords); second operand used by from_mont_ifma
.Lone:
.quad 1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0

// norm2red(res = %rdi, a = %rsi)
// Repacks the 96-byte little-endian integer at a (twelve 64-bit words) into
// fifteen 52-bit digits, one digit per qword, and stores them at res
// (15 qwords). Clobbers eax, k1, k2, zmm0-zmm3 and zmm10.
.globl C_ABI(norm2red)
.p2align 6
C_ABI(norm2red):
// k1: 22 words (44 bytes) for the masked second load; k2: 7 qwords for the
// masked second store
mov $0x3FFFFF, %eax
kmovd %eax, %k1
mov $0x7F, %eax
kmovd %eax, %k2

vmovdqa64 .LpermMask0(%rip), %zmm0
vmovdqa64 .LshiftMask0(%rip), %zmm1
vpbroadcastq .LandMask(%rip), %zmm10

// Digits 0..7 live in input bytes 0..51; gather the four 16-bit words that
// cover each digit into its own qword
vpermw 52*0(%rsi), %zmm0, %zmm2
// Digits 8..14 live in input bytes 52..95; the zero-masked load keeps the
// read inside the 96-byte input
vmovdqu16 52*1(%rsi), %zmm3{%k1}{z}
vpermw %zmm3, %zmm0, %zmm3

// Align every digit to bit 0 of its qword ...
vpsrlvq %zmm1, %zmm2, %zmm2
vpsrlvq %zmm1, %zmm3, %zmm3

// ... and drop the bits that belong to the next digit
vpandq %zmm10, %zmm2, %zmm2
vpandq %zmm10, %zmm3, %zmm3

vmovdqu64 %zmm2, 64*0(%rdi)
vmovdqu64 %zmm3, 64*1(%rdi){%k2}
ret


// Register aliases used by to_mont_ifma, from_mont_ifma and fp_mul_ifma.

// Function arguments as they arrive in registers
#define res %rdi // uint64_t *rp,
#define a0 %rsi // const uint64_t *ap,
#define bpi %rdx // const uint64_t *bptr,
// Lowest digit of the modulus, kept in a general-purpose register
#define m0 %rcx

// Working copy of the b pointer (rdx itself is needed as the mulx source)
#define b_ptr %rax

// Scalar accumulator for the lowest digit
#define acc0 %r9

// Loop counter and scratch registers; t1 and t2 are callee-saved registers
#define itr %r10
#define t0 %r11
#define t1 %r12
#define t2 %r13

// Digits 1..14 of operand a (digit 0 is kept in a0)
#define A0 %zmm0
#define A1 %zmm1

// Digits 1..14 of the modulus (digit 0 is kept in m0)
#define M0 %zmm2
#define M1 %zmm3

// Vector accumulators
#define ACC0 %zmm4
#define ACC0_xmm %xmm4
#define ACC1 %zmm5

// Broadcast multipliers: Y_curr is the reduction multiplier, B_curr the
// current digit of b. Y_prev and B_prev are not referenced in the code
// visible here.
#define Y_curr %zmm6
#define Y_prev %zmm7
#define B_curr %zmm8
#define B_prev %zmm9

#define TMP %zmm10
#define TMP_xmm %xmm10

// All-zero vector and broadcast 2^52-1 mask
#define ZERO %zmm11
#define AND_MASK %zmm12

// Per-iteration secondary accumulators
#define ACC0b %zmm13
#define ACC1b %zmm14

###############################################################################
// to_mont_ifma(res = %rdi, a = %rsi)
// Points the third argument register (bpi) at the .LR2 table and tail-jumps
// to fp_mul_ifma, i.e. computes fp_mul_ifma(res, a, .LR2).
.globl C_ABI(to_mont_ifma)
.p2align 6
C_ABI(to_mont_ifma):
leaq .LR2(%rip), bpi
jmp C_ABI(fp_mul_ifma)
###############################################################################
// from_mont_ifma(res = %rdi, a = %rsi)
// Points the third argument register (bpi) at the .Lone table (the value 1)
// and tail-jumps to fp_mul_ifma, i.e. computes fp_mul_ifma(res, a, 1).
.globl C_ABI(from_mont_ifma)
.p2align 6
C_ABI(from_mont_ifma):
leaq .Lone(%rip), bpi
jmp C_ABI(fp_mul_ifma)
###############################################################################
// fp_mul_ifma(res = %rdi, a = %rsi, b = %rdx)
// Word-by-word Montgomery-style multiplication in radix 2^52 modulo .Lpoly.
// a and b are read as 15 qword digits each, res receives 15 qword digits.
// Digit 0 of a and of the modulus is handled with scalar mulx, digits 1..14
// with vpmadd52 in two zmm registers.
// The reduction multiplier of each step is simply the low 52 bits of the
// running sum; that is valid because the lowest digit of .Lpoly is 2^52-1.
.globl C_ABI(fp_mul_ifma)
.p2align 6
C_ABI(fp_mul_ifma):

// rbx, r12 (t1) and r13 (t2) are callee-saved and are written below
push %rbx
push %r12
push %r13

// rdx is the implicit source operand of mulx, so the b pointer moves to rax
mov bpi, b_ptr

// k1 selects lane 0, k2 selects the 6 valid lanes of the second vector
mov $1, t0
mov $0x3f, t1
kmovq t0, %k1
kmovq t1, %k2

vpbroadcastq .LandMask(%rip), AND_MASK
vpxorq ZERO, ZERO, ZERO

# Load operands A into registers. A[0] is stored in ALU register, in order to compensate for the latency of IFMA when computing (A*B)[0] * K0
vmovdqu64 8*1+64*0(a0), A0
vmovdqu64 8*1+64*1(a0), A1{%k2}{z}
// From here on a0 holds the value of digit 0, not the pointer
mov 8*0(a0), a0

# Load the modulii
mov .Lpoly(%rip), m0
vmovdqu64 8*1+64*0+.Lpoly(%rip), M0
vmovdqu64 8*1+64*1+.Lpoly(%rip), M1{%k2}{z}

# Prepare the accumulators
vpxorq ACC0, ACC0, ACC0
vpxorq ACC1, ACC1, ACC1
vpxorq B_curr, B_curr, B_curr
vpxorq Y_curr, Y_curr, Y_curr
xor acc0, acc0

// One iteration per digit of b
mov $15, itr
1:
vpxorq ACC0b, ACC0b, ACC0b
vpxorq ACC1b, ACC1b, ACC1b

# High multiplications
// B_curr and Y_curr still hold the values of the previous iteration (zero
// on the first pass): their high halves belong one digit higher, which is
// where they land once ACC has been shifted down below.
vpmadd52huq B_curr, A0, ACC0b
vpmadd52huq B_curr, A1, ACC1b

vpmadd52huq Y_curr, M0, ACC0b
vpmadd52huq Y_curr, M1, ACC1b

# Shift the ACC in zmms right by a word
valignq $1, ACC0, ACC1, ACC0
valignq $1, ACC1, ZERO, ACC1
mov a0, %rdx

// Scalar part: t2:t0 = a[0] * b[i], added to the running low digit
mulx (b_ptr), t0, t2
add t0, acc0
adc $0, t2

// y = low 52 bits of the running sum
mov acc0, %rdx
and .LandMask(%rip), %rdx

vpbroadcastq %rdx, Y_curr
vpbroadcastq (b_ptr), B_curr

// Add y * m[0], then divide the 128-bit value t2:acc0 by 2^52
mulx m0, t0, t1
add t0, acc0
adc t1, t2

shrd $52, t2, acc0

# Low multiplications
vpmadd52luq B_curr, A0, ACC0b
vpmadd52luq B_curr, A1, ACC1b

vpmadd52luq Y_curr, M0, ACC0
vpmadd52luq Y_curr, M1, ACC1

vpaddq ACC0b, ACC0, ACC0
vpaddq ACC1b, ACC1, ACC1

// Fold lane 0 of the vector accumulator into the scalar accumulator; the
// lane itself is shifted out at the top of the next iteration
vmovq ACC0_xmm, t0
add t0, acc0

lea 8(b_ptr), b_ptr
dec itr
jne 1b

// Put the scalar accumulator back into lane 0 of ACC0
vmovq acc0, TMP_xmm

vmovdqa64 TMP, ACC0{%k1}

// Move A and M up by one lane (lane 0 becomes zero) so that the pending
// high halves of the last iteration are added one digit higher
valignq $7, A0, A1, A1
valignq $7, ZERO, A0, A0

valignq $7, M0, M1, M1
valignq $7, ZERO, M0, M0

# The last high multiplications
vpmadd52huq B_curr, A0, ACC0
vpmadd52huq B_curr, A1, ACC1

vpmadd52huq Y_curr, M0, ACC0
vpmadd52huq Y_curr, M1, ACC1

# Now 'normalize' the result to 52 bit words
// A0/A1 = bits above 52 of every lane, ACC = low 52 bits
vpsrlq $52, ACC0, A0
vpsrlq $52, ACC1, A1

vpandq AND_MASK, ACC0, ACC0
vpandq AND_MASK, ACC1, ACC1

// Move every carry to the next higher lane and add it
valignq $7, A0, A1, A1
valignq $7, ZERO, A0, A0

vpaddq A0, ACC0, ACC0
vpaddq A1, ACC1, ACC1

// Carry generate: lanes whose sum no longer fits in 52 bits (ACC > mask).
// Both addends were below 2^52, so a 64-bit wrap-around test can never
// fire here; the comparison has to be against the 52-bit mask.
vpcmpuq $6, AND_MASK, ACC0, %k1
vpcmpuq $6, AND_MASK, ACC1, %k2

kmovb %k1, %eax
kmovb %k2, %ebx

// Shift the 16-bit generate mask (bl:al) left by one lane; the carry flag
// moves the bit of lane 7 into lane 8
add %al, %al
adc %bl, %bl

// Carry propagate: lanes that are exactly 2^52-1
vpcmpuq $0, AND_MASK, ACC0, %k1
vpcmpuq $0, AND_MASK, ACC1, %k2

kmovb %k1, %r8d
kmovb %k2, %r9d

// Let the integer adder ripple the carries through the propagate lanes;
// after the xor a set bit marks a lane that receives a carry-in
add %r8b, %al
adc %r9b, %bl

xor %r8b, %al
xor %r9b, %bl

kmovb %eax, %k1
kmovb %ebx, %k2

// Subtracting 2^52-1 adds one modulo 2^52; the final mask removes the
// borrow and any overflow bit
vpsubq AND_MASK, ACC0, ACC0{%k1}
vpsubq AND_MASK, ACC1, ACC1{%k2}

vpandq AND_MASK, ACC0, ACC0
vpandq AND_MASK, ACC1, ACC1

// Store 8 + 7 = 15 digits
mov $0x7f, t0
kmovq t0, %k1

vmovdqu64 ACC0, 64*0(res)
vmovdqu64 ACC1, 64*1(res){%k1}
bail:
pop %r13
pop %r12
pop %rbx
ret

+ 218
- 0
main.c 查看文件

@@ -0,0 +1,218 @@
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#include "./sidh_ref/P751_internal.h"

#include "measurements.h"

#ifndef PRIME_BITS
#define PRIME_BITS 751
#endif

#define DIGITS_64 ((PRIME_BITS + 63) / 64)
#define DIGITS_52 ((PRIME_BITS + 51) / 52)

#define OALICE_BITS 372
#define OBOB_BITS 379
#define SECRETKEY_A_BYTES (OALICE_BITS + 7) / 8
#define SECRETKEY_B_BYTES (OBOB_BITS + 7) / 8

#define MASK_ALICE 0x0F
#define MASK_BOB 0x03

typedef uint64_t num52[DIGITS_52];
typedef num52 felem[2];

void fp2_mul_ifma(felem res, felem a, felem b);
void fp2_sqr_ifma(felem res, felem a);
void fp2_mul_ifma_x2(felem res1, const felem a1, const felem b1, felem res2, const felem a2, const felem b2);
void fp_mul_ifma(uint64_t *rp, const uint64_t *ap, const uint64_t *bp);
void to_mont_ifma(uint64_t *rp, const uint64_t *ap);
void from_mont_ifma(uint64_t *rp, const uint64_t *ap);

void red2norm(uint64_t out[12], const uint64_t in[15]);
void norm2red(uint64_t *res, const uint64_t *a);

int EphemeralKeyGeneration_A_ifma(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA);
int EphemeralKeyGeneration_B_ifma(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB);

/*
 * Executes one RDRAND instruction.
 * The 64-bit register result is written to *rand and the carry flag that the
 * instruction leaves behind is returned (1 = a random value was delivered,
 * 0 = none was available, the caller should retry).
 */
int rdrand64_step(uint64_t *rand)
{
    uint64_t value;
    unsigned char carry;

    __asm__ volatile("rdrand %0; setc %1"
                     : "=r"(value), "=qm"(carry));

    *rand = value;
    return carry;
}

/*
 * Fills out[0 .. DIGITS_64-1] with hardware random words (retrying RDRAND
 * until it succeeds) and then clears every bit of the top word at or above
 * bit position PRIME_BITS of the whole number.
 */
static void rand_750(uint64_t out[DIGITS_64])
{
    uint64_t *limb = out;
    uint64_t *const end = out + DIGITS_64;

    while (limb != end)
    {
        int got;
        do
        {
            got = rdrand64_step(limb);
        } while (!got);
        limb++;
    }

    /* Bits of the prime that fall into the most significant word. */
    const uint64_t top_mask = (1ULL << (PRIME_BITS - 64 * (DIGITS_64 - 1))) - 1;
    end[-1] &= top_mask;
}

/*
 * Fills out[0 .. out_len-1] with random bytes.
 * Each byte is the low 8 bits of a fresh RDRAND word; RDRAND is retried
 * until it reports success.
 *
 * The index is a size_t so that it has the same type as out_len: no
 * signed/unsigned comparison and no overflow for lengths above INT_MAX.
 */
static void rand_bytes(uint8_t *out, size_t out_len)
{
    for (size_t i = 0; i < out_len; i++)
    {
        uint64_t temp;
        while (!rdrand64_step(&temp))
            ;
        out[i] = (uint8_t)temp;
    }
}

/* Benchmarks one GF(p) Montgomery multiplication (IFMA vs. reference) and
 * checks that both produce the same result. */
static void bench_fp_mul(void)
{
    felm_t fa, fb, fr;
    num52 r, a, b;
    uint64_t res_ifma[DIGITS_64];

    rand_750(fa);
    rand_750(fb);

    /* IFMA path: radix-2^52 digits, Montgomery form, multiply, convert back. */
    norm2red(a, (uint64_t *)fa);
    norm2red(b, (uint64_t *)fb);
    to_mont_ifma(a, a);
    to_mont_ifma(b, b);

    MEASURE({ fp_mul_ifma(r, a, b); });

    from_mont_ifma(r, r);
    red2norm(res_ifma, r);
    printf("Mont mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk);

    /* Reference path on the same inputs. */
    to_mont(fa, fa);
    to_mont(fb, fb);
    MEASURE({ fpmul751_mont(fa, fb, fr); });
    from_mont(fr, fr);
    printf("Mont mul ref Cycles/op: %.0f\n", RDTSC_total_clk);

    printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP MUL Fail"
                                                          : "FP MUL Success");
}

/* Benchmarks the GF(p^2) multiplication, the dual multiplication and the
 * squaring (IFMA vs. reference) and checks the results against each other. */
static void bench_fp2(void)
{
    felem a, b, r, r2;
    f2elm_t fa, fb, fr;
    uint64_t res_ifma[2][DIGITS_64];

    rand_750(fa[0]);
    rand_750(fa[1]);
    rand_750(fb[0]);
    rand_750(fb[1]);
    norm2red(a[0], (uint64_t *)fa[0]);
    norm2red(a[1], (uint64_t *)fa[1]);
    norm2red(b[0], (uint64_t *)fb[0]);
    norm2red(b[1], (uint64_t *)fb[1]);
    to_mont_ifma(a[0], a[0]);
    to_mont_ifma(a[1], a[1]);
    to_mont_ifma(b[0], b[0]);
    to_mont_ifma(b[1], b[1]);

    /* Single multiplication. */
    MEASURE({ fp2_mul_ifma(r, a, b); });

    from_mont_ifma(r[0], r[0]);
    from_mont_ifma(r[1], r[1]);
    red2norm(res_ifma[0], r[0]);
    red2norm(res_ifma[1], r[1]);
    printf("Mont FP2 mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk);

    to_mont(fa[0], fa[0]);
    to_mont(fa[1], fa[1]);
    to_mont(fb[0], fb[0]);
    to_mont(fb[1], fb[1]);

    MEASURE({ fp2mul751_mont(fa, fb, fr); });

    from_mont(fr[0], fr[0]);
    from_mont(fr[1], fr[1]);
    printf("Mont FP2 mul ref Cycles/op: %.0f\n", RDTSC_total_clk);

    printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP2 MUL Fail"
                                                          : "FP2 MUL Success");

    /* Two multiplications at once; both halves get the same operands, so
     * both must match the reference product computed above. */
    MEASURE({ fp2_mul_ifma_x2(r, a, b, r2, a, b); });

    from_mont_ifma(r[0], r[0]);
    from_mont_ifma(r[1], r[1]);
    red2norm(res_ifma[0], r[0]);
    red2norm(res_ifma[1], r[1]);

    printf("Dual Mont FP2 mul IFMA Cycles/op: %.0f\n", RDTSC_total_clk);

    printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "Dual FP2 MUL 1/2 Fail"
                                                          : "Dual FP2 MUL 1/2 Success");

    from_mont_ifma(r2[0], r2[0]);
    from_mont_ifma(r2[1], r2[1]);
    red2norm(res_ifma[0], r2[0]);
    red2norm(res_ifma[1], r2[1]);

    printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "Dual FP2 MUL 2/2 Fail"
                                                          : "Dual FP2 MUL 2/2 Success");

    /* Squaring; fa is still in Montgomery form from the step above. */
    MEASURE({ fp2_sqr_ifma(r, a); });

    from_mont_ifma(r[0], r[0]);
    from_mont_ifma(r[1], r[1]);
    red2norm(res_ifma[0], r[0]);
    red2norm(res_ifma[1], r[1]);
    printf("Mont FP2 sqr IFMA Cycles/op: %.0f\n", RDTSC_total_clk);

    MEASURE({ fp2sqr751_mont(fa, fr); });

    from_mont(fr[0], fr[0]);
    from_mont(fr[1], fr[1]);
    printf("Mont FP2 sqr ref Cycles/op: %.0f\n", RDTSC_total_clk);

    printf("%s\n", memcmp(fr, res_ifma, sizeof(res_ifma)) ? "FP2 SQR Fail"
                                                          : "FP2 SQR Success");
}

/* Benchmarks ephemeral key generation for both parties (IFMA vs. reference)
 * and checks that the generated public keys are identical. */
static void bench_keygen(void)
{
    unsigned char ephemeralsk_alice[SECRETKEY_A_BYTES];
    unsigned char ephemeralsk_bob[SECRETKEY_B_BYTES];
    unsigned char ct1[564] = {0};
    unsigned char ct2[564] = {0};

    rand_bytes(ephemeralsk_alice, sizeof(ephemeralsk_alice));
    rand_bytes(ephemeralsk_bob, sizeof(ephemeralsk_bob));
    /* Clear the unused top bits of each secret key. */
    ephemeralsk_alice[SECRETKEY_A_BYTES - 1] &= MASK_ALICE;
    ephemeralsk_bob[SECRETKEY_B_BYTES - 1] &= MASK_BOB;

    MEASURE({ EphemeralKeyGeneration_A(ephemeralsk_alice, ct1); });

    printf("Ref EphemeralKeyGeneration_A Cycles/op: %.0f\n", RDTSC_total_clk);

    MEASURE({ EphemeralKeyGeneration_A_ifma(ephemeralsk_alice, ct2); });

    printf("IFMA EphemeralKeyGeneration_A Cycles/op: %.0f\n", RDTSC_total_clk);

    printf("%s\n", memcmp(ct1, ct2, sizeof(ct1)) ? "EphemeralKeyGeneration_A Fail"
                                                 : "EphemeralKeyGeneration_A Success");

    MEASURE({ EphemeralKeyGeneration_B(ephemeralsk_bob, ct1); });

    printf("Ref EphemeralKeyGeneration_B Cycles/op: %.0f\n", RDTSC_total_clk);

    MEASURE({ EphemeralKeyGeneration_B_ifma(ephemeralsk_bob, ct2); });

    printf("IFMA EphemeralKeyGeneration_B Cycles/op: %.0f\n", RDTSC_total_clk);

    printf("%s\n", memcmp(ct1, ct2, sizeof(ct1)) ? "EphemeralKeyGeneration_B Fail"
                                                 : "EphemeralKeyGeneration_B Success");
}

/* Runs the three benchmark/self-check groups in order and prints, for each
 * operation, the measured cycle counts and a Success/Fail line. */
int main(void)
{
    bench_fp_mul();
    bench_fp2();
    bench_keygen();
    return 0;
}

+ 52
- 0
measurements.h 查看文件

@@ -0,0 +1,52 @@

#ifndef MEASURE_H
#define MEASURE_H

/* Number of timed calls in one measurement batch. */
#ifndef REPEAT
#define REPEAT 100
#endif

/* Number of measurement batches; the fastest batch is the reported one. */
#ifndef OUTER_REPEAT
#define OUTER_REPEAT 10
#endif

/* Number of untimed warm-up calls made before measuring.
 * Parenthesized so the macro keeps its value inside larger expressions. */
#ifndef WARMUP
#define WARMUP (REPEAT / 4)
#endif

/* State shared by the RDTSC_MEASURE macro.
 * NOTE(review): these are definitions, not extern declarations, so this
 * header is presumably meant to be included from a single translation unit -
 * confirm before including it elsewhere. */
/* Counter values read before and after the current batch. */
unsigned long long RDTSC_start_clk, RDTSC_end_clk;
/* Result of the last RDTSC_MEASURE: lowest per-call average over all batches. */
double RDTSC_total_clk;
/* Per-call average of the batch that has just finished. */
double RDTSC_TEMP_CLK;
/* Loop counters of the inner (per call) and outer (per batch) loops. */
int RDTSC_MEASURE_ITERATOR;
int RDTSC_OUTER_ITERATOR;

/* Reads the time-stamp counter with RDTSCP and returns it as one integer:
 * EDX supplies bits 32 and up, EAX the low 32 bits. RCX, which the
 * instruction also writes, is declared as clobbered. */
inline static unsigned long get_Clks(void)
{
    unsigned tsc_lo, tsc_hi;

    __asm__ __volatile__("rdtscp\n\t"
                         : "=a"(tsc_lo), "=d"(tsc_hi)::"rcx");

    unsigned long ticks = (unsigned long)tsc_hi;
    ticks <<= 32;
    ticks ^= (unsigned long)tsc_lo;
    return ticks;
}

/*
 * RDTSC_MEASURE(x): times the statement(s) x.
 *   1. runs x WARMUP times without timing;
 *   2. runs OUTER_REPEAT batches of REPEAT calls each, reading the counter
 *      with get_Clks() around every batch;
 *   3. leaves the smallest per-call average of all batches in
 *      RDTSC_total_clk (which starts at the largest finite double).
 *
 * The expansion is wrapped in do { } while (0) so that the macro behaves as
 * a single statement, e.g. as the body of an unbraced if/else.
 */
#define RDTSC_MEASURE(x)                                                                                \
    do                                                                                                  \
    {                                                                                                   \
        for (RDTSC_MEASURE_ITERATOR = 0; RDTSC_MEASURE_ITERATOR < WARMUP; RDTSC_MEASURE_ITERATOR++)     \
        {                                                                                               \
            {x};                                                                                        \
        }                                                                                               \
        RDTSC_total_clk = 1.7976931348623157e+308;                                                      \
        for (RDTSC_OUTER_ITERATOR = 0; RDTSC_OUTER_ITERATOR < OUTER_REPEAT; RDTSC_OUTER_ITERATOR++)     \
        {                                                                                               \
            RDTSC_start_clk = get_Clks();                                                               \
            for (RDTSC_MEASURE_ITERATOR = 0; RDTSC_MEASURE_ITERATOR < REPEAT; RDTSC_MEASURE_ITERATOR++) \
            {                                                                                           \
                {x};                                                                                    \
            }                                                                                           \
            RDTSC_end_clk = get_Clks();                                                                 \
            RDTSC_TEMP_CLK = (double)(RDTSC_end_clk - RDTSC_start_clk) / REPEAT;                        \
            if (RDTSC_total_clk > RDTSC_TEMP_CLK)                                                       \
                RDTSC_total_clk = RDTSC_TEMP_CLK;                                                       \
        }                                                                                               \
    } while (0)

/* Short alias used by the benchmark code. */
#define MEASURE(x) RDTSC_MEASURE(x)

#endif

+ 122
- 0
sidh_ref/P751.c 查看文件

@@ -0,0 +1,122 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: supersingular isogeny parameters and generation of functions for P751
*********************************************************************************************/
#include "P751_internal.h"
// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points:
// --------------------------------------------------------------------------------------------------
// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format).
// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position.
// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position.
// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32.
// For example, a 751-bit field element is represented with Ceil(751 / 64) = 12 64-bit digits or Ceil(751 / 32) = 24 32-bit digits.
//
// Curve isogeny system "SIDHp751". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p751^2), where A=0, B=1, C=1 and p751 = 2^372*3^239-1
//
// The prime p751 = 2^372*3^239 - 1, least significant 64-bit word first
const uint64_t p751[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEEAFFFFFFFFFFFFF,
0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C };
// p751 + 1 (the five least significant words are zero)
const uint64_t p751p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000,
0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C };
// 2 * p751
const uint64_t p751x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xDD5FFFFFFFFFFFFF,
0xC7D92D0A93F0F151, 0xB52B363427EF98ED, 0x109D30CFADD7D0ED, 0x0AC56A08B964AE90, 0x1C25213F2F75B8CD, 0x0000DFCBAA83EE38 };
// Order of Alice's subgroup
const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 };
// Order of Bob's subgroup
const uint64_t Bob_order[NWORDS64_ORDER] = { 0xC968549F878A8EEB, 0x59B1A13F7CC76E3E, 0xE9867D6EBE876DA9, 0x2B5045CB25748084, 0x2909F97BADC66856, 0x06FE5D541F71C0E1 };
// Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p751^2), expressed in Montgomery representation
const uint64_t A_gen[5 * NWORDS64_FIELD] = { 0xC2FC08CEAB50AD8B, 0x1D7D710F55E457B1, 0xE8738D92953DCD6E, 0xBAA7EBEE8A3418AA, 0xC9A288345F03F46F, 0xC8D18D167CFE2616,
0x02043761F6B1C045, 0xAA1975E13180E7E9, 0x9E13D3FDC6690DE6, 0x3A024640A3A3BB4F, 0x4E5AD44E6ACBBDAE, 0x0000544BEB561DAD, // XPA0
0xE6CC41D21582E411, 0x07C2ECB7C5DF400A, 0xE8E34B521432AEC4, 0x50761E2AB085167D, 0x032CFBCAA6094B3C, 0x6C522F5FDF9DDD71,
0x1319217DC3A1887D, 0xDC4FB25803353A86, 0x362C8D7B63A6AB09, 0x39DCDFBCE47EA488, 0x4C27C99A2C28D409, 0x00003CB0075527C4, // XPA1
0xD56FE52627914862, 0x1FAD60DC96B5BAEA, 0x01E137D0BF07AB91, 0x404D3E9252161964, 0x3C5385E4CD09A337, 0x4476426769E4AF73,
0x9790C6DB989DFE33, 0xE06E1C04D2AA8B5E, 0x38C08185EDEA73B9, 0xAA41F678A4396CA6, 0x92B9259B2229E9A0, 0x00002F9326818BE0, // XQA0
0x0BB84441DFFD19B3, 0x84B4DEA99B48C18E, 0x692DE648AD313805, 0xE6D72761B6DFAEE0, 0x223975C672C3058D, 0xA0FDE0C3CBA26FDC,
0xA5326132A922A3CA, 0xCA5E7F5D5EA96FA4, 0x127C7EFE33FFA8C6, 0x4749B1567E2A23C4, 0x2B7DF5B4AF413BFA, 0x0000656595B9623C, // XRA0
0xED78C17F1EC71BE8, 0xF824D6DF753859B1, 0x33A10839B2A8529F, 0xFC03E9E25FDEA796, 0xC4708A8054DF1762, 0x4034F2EC034C6467,
0xABFB70FBF06ECC79, 0xDABE96636EC108B7, 0x49CBCFB090605FD3, 0x20B89711819A45A7, 0xFB8E1590B2B0F63E, 0x0000556A5F964AB2 }; // XRA1
// Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p751^2), expressed in Montgomery representation
const uint64_t B_gen[5 * NWORDS64_FIELD] = { 0xCFB6D71EF867AB0B, 0x4A5FDD76E9A45C76, 0x38B1EE69194B1F03, 0xF6E7B18A7761F3F0, 0xFCF01A486A52C84C, 0xCBE2F63F5AA75466,
0x6487BCE837B5E4D6, 0x7747F5A8C622E9B8, 0x4CBFE1E4EE6AEBBA, 0x8A8616A13FA91512, 0x53DB980E1579E0A5, 0x000058FEBFF3BE69, // XPB0
0xA492034E7C075CC3, 0x677BAF00B04AA430, 0x3AAE0C9A755C94C8, 0x1DC4B064E9EBB08B, 0x3684EDD04E826C66, 0x9BAA6CB661F01B22,
0x20285A00AD2EFE35, 0xDCE95ABD0497065F, 0x16C7FBB3778E3794, 0x26B3AC29CEF25AAF, 0xFB3C28A31A30AC1D, 0x000046ED190624EE, // XPB1
0xF1A8C9ED7B96C4AB, 0x299429DA5178486E, 0xEF4926F20CD5C2F4, 0x683B2E2858B4716A, 0xDDA2FBCC3CAC3EEB, 0xEC055F9F3A600460,
0xD5A5A17A58C3848B, 0x4652D836F42EAED5, 0x2F2E71ED78B3A3B3, 0xA771C057180ADD1D, 0xC780A5D2D835F512, 0x0000114EA3B55AC1, // XQB0
0x1C0D6733769D0F31, 0xF084C3086E2659D1, 0xE23D5DA27BCBD133, 0xF38EC9A8D5864025, 0x6426DC781B3B645B, 0x4B24E8E3C9FB03EE,
0x6432792F9D2CEA30, 0x7CC8E8B1AE76E857, 0x7F32BFB626BB8963, 0xB9F05995B48D7B74, 0x4D71200A7D67E042, 0x0000228457AF0637, // XRB0
0x4AE37E7D8F72BD95, 0xDD2D504B3E993488, 0x5D14E7FA1ECB3C3E, 0x127610CEB75D6350, 0x255B4B4CAC446B11, 0x9EA12336C1F70CAF,
0x79FA68A2147BC2F8, 0x11E895CFDADBBC49, 0xE4B9D3C4D6356C18, 0x44B25856A67F951C, 0x5851541F61308D0B, 0x00002FFD994F7E4C }; // XRB1
// Montgomery constant Montgomery_R2 = (2^768)^2 mod p751
const uint64_t Montgomery_R2[NWORDS64_FIELD] = { 0x233046449DAD4058, 0xDB010161A696452A, 0x5E36941472E3FD8E, 0xF40BFE2082A2E706, 0x4932CCA8904F8751 ,0x1F735F1F1EE7FC81,
0xA24F4D80C1048E18, 0xB56C383CCDB607C5, 0x441DD47B735F9C90, 0x5673ED2C6A6AC82A, 0x06C905261132294B, 0x000041AD830F1F35 };
// Value one in Montgomery representation
const uint64_t Montgomery_one[NWORDS64_FIELD] = { 0x00000000000249ad, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8310000000000000,
0x5527b1e4375c6c66, 0x697797bf3f4f24d0, 0xc89db7b2ac5c4e2e, 0x4ca4b439d2076956, 0x10f7926c7512c7e9, 0x00002d5b24bce5e2 };
// Value (2^384)^2 mod 3^239
const uint64_t Montgomery_Rprime[NWORDS64_ORDER] = { 0x1A55482318541298, 0x070A6370DFA12A03, 0xCB1658E0E3823A40, 0xB3B7384EB5DEF3F9, 0xCBCA952F7006EA33, 0x00569EF8EC94864C };
// Value -(3^239)^-1 mod 2^384
const uint64_t Montgomery_rprime[NWORDS64_ORDER] = { 0x48062A91D3AB563D, 0x6CE572751303C2F5, 0x5D1319F3F160EC9D, 0xE35554E8C2D5623A, 0xCA29300232BC79A5, 0x8AAD843D646D78C5 };
// Value order_Bob/3 mod p751
const uint64_t Border_div3[NWORDS_ORDER] = { 0xEDCD718A828384F9, 0x733B35BFD4427A14, 0xF88229CF94D7CF38, 0x63C56C990C7C2AD6, 0xB858A87E8F4222C7, 0x0254C9C6B525EAF5 };
// Fixed parameters for isogeny tree computation
const unsigned int strat_Alice[MAX_Alice-1] = {
80, 48, 27, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1,
1, 3, 2, 1, 1, 1, 1, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1,
1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1,
1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1,
33, 20, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1,
1, 1, 8, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1,
1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 };
const unsigned int strat_Bob[MAX_Bob-1] = {
112, 63, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1,
1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2,
1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 31, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2,
1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4,
2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 49, 31, 16, 8, 4, 2,
1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1,
15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1,
1, 1, 1, 21, 12, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 3, 2, 1, 1, 1, 1,
2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1 };
// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions
#define fpcopy fpcopy751
#define fpzero fpzero751
#define fpadd fpadd751
#define fpsub fpsub751
#define fpneg fpneg751
#define fpdiv2 fpdiv2_751
#define fpcorrection fpcorrection751
#define fpmul_mont fpmul751_mont
#define fpsqr_mont fpsqr751_mont
#define fpinv_mont fpinv751_mont
#define fpinv_chain_mont fpinv751_chain_mont
#define fpinv_mont_bingcd fpinv751_mont_bingcd
#define fp2copy fp2copy751
#define fp2zero fp2zero751
#define fp2add fp2add751
#define fp2sub fp2sub751
#define fp2neg fp2neg751
#define fp2div2 fp2div2_751
#define fp2correction fp2correction751
#define fp2mul_mont fp2mul751_mont
#define fp2sqr_mont fp2sqr751_mont
#define fp2inv_mont fp2inv751_mont
#define fp2inv_mont_bingcd fp2inv751_mont_bingcd
#define fpequal_non_constant_time fpequal751_non_constant_time
#define mp_add_asm mp_add751_asm
#define mp_addx2_asm mp_add751x2_asm
#define mp_subx2_asm mp_sub751x2_asm
#include "fpx.c"
#include "ec_isogeny.c"
#include "sidh.c"
#include "sike.c"

+ 255
- 0
sidh_ref/P751_internal.h 查看文件

@@ -0,0 +1,255 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: internal header file for P751
*********************************************************************************************/
#ifndef __P751_INTERNAL_H__
#define __P751_INTERNAL_H__
#include "api.h"
#define NWORDS_FIELD 12 // Number of words of a 751-bit field element
#define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1
// Basic constants
#define NBITS_FIELD 751
#define MAXBITS_FIELD 768
#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements
#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 751-bit field element
#define NBITS_ORDER 384
#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp.
#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 384-bit element
#define MAXBITS_ORDER NBITS_ORDER
#define MAXWORDS_ORDER ((MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB].
#define ALICE 0
#define BOB 1
#define OALICE_BITS 372
#define OBOB_BITS 379
#define OBOB_EXPON 239
#define MASK_ALICE 0x0F
#define MASK_BOB 0x03
#define PRIME p751
#define PARAM_A 0
#define PARAM_C 1
// Fixed parameters for isogeny tree computation
#define MAX_INT_POINTS_ALICE 8
#define MAX_INT_POINTS_BOB 10
#define MAX_Alice 186
#define MAX_Bob 239
#define MSG_BYTES 32
#define SECRETKEY_A_BYTES (OALICE_BITS + 7) / 8
#define SECRETKEY_B_BYTES (OBOB_BITS + 7) / 8
#define FP2_ENCODED_BYTES 2 * ((NBITS_FIELD + 7) / 8)
// SIDH's basic element definitions and point representations
typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 751-bit field elements (768-bit max.)
typedef digit_t dfelm_t[2 * NWORDS_FIELD]; // Datatype for representing double-precision 2x751-bit field elements (2x768-bit max.)
typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements GF(p751^2)
typedef f2elm_t publickey_t[3]; // Datatype for representing public keys equivalent to three GF(p751^2) elements
typedef struct
{
f2elm_t X;
f2elm_t Z;
} point_proj; // Point representation in projective XZ Montgomery coordinates.
typedef point_proj point_proj_t[1];
/**************** Function prototypes ****************/
/************* Multiprecision functions **************/
// Copy wordsize digits, c = a, where lng(a) = nwords
void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords);
// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit
unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords);
// 751-bit multiprecision addition, c = a+b
void mp_add751(const digit_t *a, const digit_t *b, digit_t *c);
void mp_add751_asm(const digit_t *a, const digit_t *b, digit_t *c);
//void mp_addmask751_asm(const digit_t* a, const digit_t mask, digit_t* c);
// 2x751-bit multiprecision addition, c = a+b
void mp_add751x2(const digit_t *a, const digit_t *b, digit_t *c);
void mp_add751x2_asm(const digit_t *a, const digit_t *b, digit_t *c);
// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit
unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords);
// NOTE(review): undocumented here; by analogy with mp_add751x2_asm this is presumably the
// 2x751-bit subtraction c = a-b, with the digit_t return value carrying the borrow -- confirm against the assembly.
digit_t mp_sub751x2_asm(const digit_t *a, const digit_t *b, digit_t *c);
// Multiprecision left shift
void mp_shiftleft(digit_t *x, unsigned int shift, const unsigned int nwords);
// Multiprecision right shift by one
void mp_shiftr1(digit_t *x, const unsigned int nwords);
// Multiprecision left shift by one
void mp_shiftl1(digit_t *x, const unsigned int nwords);
// Digit multiplication, digit * digit -> 2-digit result
void digit_x_digit(const digit_t a, const digit_t b, digit_t *c);
// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords.
void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords);
// NOTE(review): same signature as mp_mul; its exact semantics are not documented in this header.
void multiply(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords);
// Montgomery multiplication modulo the group order, mc = ma*mb*r' mod order, where ma,mb,mc in [0, order-1]
void Montgomery_multiply_mod_order(const digit_t *ma, const digit_t *mb, digit_t *mc, const digit_t *order, const digit_t *Montgomery_rprime);
// (Non-constant time) Montgomery inversion modulo the curve order using a^(-1) = a^(order-2) mod order
//void Montgomery_inversion_mod_order(const digit_t* ma, digit_t* mc, const digit_t* order, const digit_t* Montgomery_rprime);
// NOTE(review): the description above belongs to the disabled prototype; going by its name the
// function below is presumably the binary-GCD variant of the inversion -- confirm.
void Montgomery_inversion_mod_order_bingcd(const digit_t *a, digit_t *c, const digit_t *order, const digit_t *Montgomery_rprime, const digit_t *Montgomery_R2);
// Conversion of elements in Z_r to Montgomery representation, where the order r is up to 384 bits.
// NOTE(review): the last parameter differs from Montgomery_rprime only in letter case; presumably it is
// the R^2 constant (cf. Montgomery_R2 above) -- confirm.
void to_Montgomery_mod_order(const digit_t *a, digit_t *mc, const digit_t *order, const digit_t *Montgomery_rprime, const digit_t *Montgomery_Rprime);
// Conversion of elements in Z_r from Montgomery to standard representation, where the order is up to 384 bits.
void from_Montgomery_mod_order(const digit_t *ma, digit_t *c, const digit_t *order, const digit_t *Montgomery_rprime);
// Inversion modulo Alice's order 2^372.
void inv_mod_orderA(const digit_t *a, digit_t *c);
/************ Field arithmetic functions *************/
// Copy of a field element, c = a
void fpcopy751(const felm_t a, felm_t c);
// Zeroing a field element, a = 0
void fpzero751(felm_t a);
// Non constant-time comparison of two field elements. If a = b return TRUE, otherwise, return FALSE
bool fpequal751_non_constant_time(const felm_t a, const felm_t b);
// Modular addition, c = a+b mod p751
extern void fpadd751(const digit_t *a, const digit_t *b, digit_t *c);
extern void fpadd751_asm(const digit_t *a, const digit_t *b, digit_t *c);
// Modular subtraction, c = a-b mod p751
extern void fpsub751(const digit_t *a, const digit_t *b, digit_t *c);
extern void fpsub751_asm(const digit_t *a, const digit_t *b, digit_t *c);
// Modular negation, a = -a mod p751
extern void fpneg751(digit_t *a);
// Modular division by two, c = a/2 mod p751.
void fpdiv2_751(const digit_t *a, digit_t *c);
// Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1].
void fpcorrection751(digit_t *a);
// 751-bit Montgomery reduction, c = a mod p
void rdc_mont(const digit_t *a, digit_t *c);
// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p751, where R=2^768
void fpmul751_mont(const felm_t a, const felm_t b, felm_t c);
// Assembly counterparts: mul751_asm writes a double-length (dfelm_t) result c.
// NOTE(review): presumably the unreduced product and its Montgomery reduction, respectively -- confirm against the assembly.
void mul751_asm(const felm_t a, const felm_t b, dfelm_t c);
void rdc751_asm(const dfelm_t ma, dfelm_t mc);
// Field squaring using Montgomery arithmetic, mc = ma^2*R^-1 mod p751, where R=2^768
void fpsqr751_mont(const felm_t ma, felm_t mc);
// Conversion to Montgomery representation
void to_mont(const felm_t a, felm_t mc);
// Conversion from Montgomery representation to standard representation
void from_mont(const felm_t ma, felm_t c);
// Field inversion, a = a^-1 in GF(p751)
void fpinv751_mont(felm_t a);
// Field inversion, a = a^-1 in GF(p751) using the binary GCD
void fpinv751_mont_bingcd(felm_t a);
// Chain to compute (p751-3)/4 using Montgomery arithmetic
// NOTE(review): presumably an exponentiation chain raising a to the power (p751-3)/4 in place -- confirm.
void fpinv751_chain_mont(felm_t a);
/************ GF(p^2) arithmetic functions *************/
// Copy of a GF(p751^2) element, c = a
void fp2copy751(const f2elm_t a, f2elm_t c);
// Zeroing a GF(p751^2) element, a = 0
void fp2zero751(f2elm_t a);
// GF(p751^2) negation, a = -a in GF(p751^2)
void fp2neg751(f2elm_t a);
// GF(p751^2) addition, c = a+b in GF(p751^2)
extern void fp2add751(const f2elm_t a, const f2elm_t b, f2elm_t c);
// GF(p751^2) subtraction, c = a-b in GF(p751^2)
extern void fp2sub751(const f2elm_t a, const f2elm_t b, f2elm_t c);
// GF(p751^2) division by two, c = a/2 in GF(p751^2)
void fp2div2_751(const f2elm_t a, f2elm_t c);
// Modular correction, a = a in GF(p751^2)
// NOTE(review): presumably reduces both GF(p751) components to [0, p751-1], as fpcorrection751 does for one -- confirm.
void fp2correction751(f2elm_t a);
// GF(p751^2) squaring using Montgomery arithmetic, c = a^2 in GF(p751^2)
void fp2sqr751_mont(const f2elm_t a, f2elm_t c);
// GF(p751^2) multiplication using Montgomery arithmetic, c = a*b in GF(p751^2)
void fp2mul751_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
// Conversion of a GF(p751^2) element to Montgomery representation
void to_fp2mont(const f2elm_t a, f2elm_t mc);
// Conversion of a GF(p751^2) element from Montgomery representation to standard representation
void from_fp2mont(const f2elm_t ma, f2elm_t c);
// GF(p751^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
void fp2inv751_mont(f2elm_t a);
// GF(p751^2) inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p751) inversion done using the binary GCD
void fp2inv751_mont_bingcd(f2elm_t a);
// n-way Montgomery inversion
// NOTE(review): presumably inverts the n elements of vec into out sharing a single field inversion -- confirm.
void mont_n_way_inv(const f2elm_t *vec, const int n, f2elm_t *out);
/************ Elliptic curve and isogeny functions *************/
// Computes the j-invariant of a Montgomery curve with projective constant.
void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv);
// Simultaneous doubling and differential addition.
void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24);
// Doubling of a Montgomery point in projective coordinates (X:Z).
void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24);
// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e);
// Differential addition.
void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ);
// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff);
// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny.
void eval_4_isog(point_proj_t P, f2elm_t *coeff);
// Tripling of a Montgomery point in projective coordinates (X:Z).
void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus);
// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings.
void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e);
// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3.
void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff);
// Evaluates a 3-isogeny phi at the projective point Q = (X:Z), using the coefficients in coeff
// (computed from a point (X3:Z3) of order 3 on a Montgomery curve).
// NOTE(review): Q is the only non-const argument, so the result is presumably written back into Q -- confirm.
void eval_3_isog(point_proj_t Q, const f2elm_t *coeff);
// 3-way simultaneous inversion
void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3);
// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A.
void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A);
#endif

+ 214
- 0
sidh_ref/SIDH.h 查看文件

@@ -0,0 +1,214 @@
/********************************************************************************************
* SIDH: an efficient supersingular isogeny-based cryptography library for ephemeral
* Diffie-Hellman key exchange.
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
*
* Abstract: main header file
*
*********************************************************************************************/
#ifndef __SIDH_H__
#define __SIDH_H__
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
// Definition of operating system
// NOTE: this header hard-codes its configuration (OS_LINUX / COMPILER_GCC / TARGET_AMD64)
// rather than deriving it from predefined compiler macros.
#define OS_WIN 1
#define OS_LINUX 2
#define OS_TARGET OS_LINUX
#define COMPILER_VC 1
#define COMPILER_GCC 2
#define COMPILER_CLANG 3
#define COMPILER COMPILER_GCC
// Definition of the targeted architecture and basic data types
#define TARGET_AMD64 1
#define TARGET_x86 2
#define TARGET_ARM 3
#define TARGET_ARM64 4
#define TARGET TARGET_AMD64
#define RADIX 64
typedef uint64_t digit_t; // Unsigned 64-bit digit
typedef int64_t sdigit_t; // Signed 64-bit digit
typedef uint32_t hdigit_t; // Unsigned 32-bit digit
#define NWORDS_FIELD 12 // Number of words of a 751-bit field element
#define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1
#define RADIX64 64
// Selection of generic, portable implementation
// Unsupported configurations
#if (TARGET != TARGET_AMD64) && (TARGET != TARGET_ARM64) && !defined(GENERIC_IMPLEMENTATION)
#error-- "Unsupported configuration"
#endif
// Extended datatype support
// With the fixed configuration above, the AMD64/Linux/GCC branch is the one selected
// unless GENERIC_IMPLEMENTATION is defined by the build.
#if defined(GENERIC_IMPLEMENTATION)
typedef uint64_t uint128_t[2];
#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG)
#define UINT128_SUPPORT
typedef unsigned uint128_t __attribute__((mode(TI))); // 128-bit unsigned integer via the GCC/Clang machine-mode attribute
#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG)
#define UINT128_SUPPORT
typedef unsigned uint128_t __attribute__((mode(TI)));
#elif (TARGET == TARGET_AMD64) && (OS_TARGET == OS_WIN && COMPILER == COMPILER_VC)
#define SCALAR_INTRIN_SUPPORT
typedef uint64_t uint128_t[2];
#else
#error-- "Unsupported configuration"
#endif
// Basic constants
// All word counts below are ceilings: (bits + RADIX - 1) / RADIX rounds up to whole RADIX-bit words.
#define NBITS_FIELD 751
#define MAXBITS_FIELD 768
#define MAXWORDS_FIELD ((MAXBITS_FIELD + RADIX - 1) / RADIX) // Max. number of words to represent field elements
#define NWORDS64_FIELD ((NBITS_FIELD + 63) / 64) // Number of 64-bit words of a 751-bit field element
#define NBITS_ORDER 384
#define NWORDS_ORDER ((NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp.
#define NWORDS64_ORDER ((NBITS_ORDER + 63) / 64) // Number of 64-bit words of a 384-bit element
#define MAXBITS_ORDER NBITS_ORDER
#define MAXWORDS_ORDER ((MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB].
// Basic constants for elliptic curve BigMont
#define BIGMONT_NBITS_ORDER 749
#define BIGMONT_MAXBITS_ORDER 768
#define BIGMONT_NWORDS_ORDER ((BIGMONT_NBITS_ORDER + RADIX - 1) / RADIX) // Number of words of BigMont's subgroup order.
#define BIGMONT_MAXWORDS_ORDER ((BIGMONT_MAXBITS_ORDER + RADIX - 1) / RADIX) // Max. number of words to represent elements in [1, BigMont_order].
// Error-handling type: status codes reported by the library.
// Values are consecutive starting at zero; CRYPTO_ERROR_END_OF_LIST is a
// sentinel one past the last real code and doubles as the number of codes.
typedef enum {
    CRYPTO_SUCCESS = 0x00,
    CRYPTO_ERROR = 0x01,
    CRYPTO_ERROR_DURING_TEST = 0x02,
    CRYPTO_ERROR_UNKNOWN = 0x03,
    CRYPTO_ERROR_NOT_IMPLEMENTED = 0x04,
    CRYPTO_ERROR_NO_MEMORY = 0x05,
    CRYPTO_ERROR_INVALID_PARAMETER = 0x06,
    CRYPTO_ERROR_SHARED_KEY = 0x07,
    CRYPTO_ERROR_PUBLIC_KEY_VALIDATION = 0x08,
    CRYPTO_ERROR_TOO_MANY_ITERATIONS = 0x09,
    CRYPTO_ERROR_END_OF_LIST // 0x0A, sentinel
} CRYPTO_STATUS;

// Number of status codes
#define CRYPTO_STATUS_TYPE_SIZE (CRYPTO_ERROR_END_OF_LIST)

// Human-readable message for each status code.
// NOTE: one entry per enumerator above, in the same order.
#define CRYPTO_MSG_SUCCESS                     "CRYPTO_SUCCESS"
#define CRYPTO_MSG_ERROR                       "CRYPTO_ERROR"
#define CRYPTO_MSG_ERROR_DURING_TEST           "CRYPTO_ERROR_DURING_TEST"
#define CRYPTO_MSG_ERROR_UNKNOWN               "CRYPTO_ERROR_UNKNOWN"
#define CRYPTO_MSG_ERROR_NOT_IMPLEMENTED       "CRYPTO_ERROR_NOT_IMPLEMENTED"
#define CRYPTO_MSG_ERROR_NO_MEMORY             "CRYPTO_ERROR_NO_MEMORY"
#define CRYPTO_MSG_ERROR_INVALID_PARAMETER     "CRYPTO_ERROR_INVALID_PARAMETER"
#define CRYPTO_MSG_ERROR_SHARED_KEY            "CRYPTO_ERROR_SHARED_KEY"
#define CRYPTO_MSG_ERROR_PUBLIC_KEY_VALIDATION "CRYPTO_ERROR_PUBLIC_KEY_VALIDATION"
#define CRYPTO_MSG_ERROR_TOO_MANY_ITERATIONS   "CRYPTO_ERROR_TOO_MANY_ITERATIONS"
// Definition of type random_bytes to implement callback functions outputting "nbytes" random values to "random_array"
// The callback reports success or failure through its CRYPTO_STATUS return value.
typedef CRYPTO_STATUS (*RandomBytes)(unsigned int nbytes, unsigned char *random_array);
// Definition of type for curve isogeny system identifiers. Currently valid value is "SIDHp751"
// Fixed 10-byte buffer, i.e. room for at most 9 characters plus the terminating NUL.
typedef char CurveIsogeny_ID[10];
// Supersingular elliptic curve isogeny structures:
// This data struct contains the static curve isogeny data
// All multiprecision values are stored inline as fixed-size arrays of 64-bit words.
typedef struct
{
CurveIsogeny_ID CurveIsogeny; // Curve isogeny system identifier, base curve defined over GF(p^2)
unsigned int pwordbits; // Smallest multiple of 32 larger than the prime bitlength
unsigned int owordbits; // Smallest multiple of 32 larger than the order bitlength
unsigned int pbits; // Bitlength of the prime p
uint64_t prime[MAXWORDS_FIELD]; // Prime p
uint64_t A[MAXWORDS_FIELD]; // Base curve parameter "A"
uint64_t C[MAXWORDS_FIELD]; // Base curve parameter "C"
unsigned int oAbits; // Order bitlength for Alice
uint64_t Aorder[MAXWORDS_ORDER]; // Order of Alice's (sub)group
unsigned int oBbits; // Order bitlength for Bob
unsigned int eB; // Power of Bob's subgroup order (i.e., oB = 3^eB)
uint64_t Border[MAXWORDS_ORDER]; // Order of Bob's (sub)group
uint64_t PA[2 * MAXWORDS_FIELD]; // Alice's generator PA = (XPA,YPA), where XPA and YPA are defined over GF(p)
uint64_t PB[2 * MAXWORDS_FIELD]; // Bob's generator PB = (XPB,YPB), where XPB and YPB are defined over GF(p)
unsigned int BigMont_A24; // BigMont's curve parameter A24 = (A+2)/4
uint64_t BigMont_order[BIGMONT_MAXWORDS_ORDER]; // BigMont's subgroup order
uint64_t Montgomery_R2[MAXWORDS_FIELD]; // Montgomery constant (2^W)^2 mod p, using a suitable value W
uint64_t Montgomery_pp[MAXWORDS_FIELD]; // Montgomery constant -p^-1 mod 2^W, using a suitable value W
uint64_t Montgomery_one[MAXWORDS_FIELD]; // Value one in Montgomery representation
} CurveIsogenyStaticData, *PCurveIsogenyStaticData;
// This data struct is initialized with the targeted curve isogeny system during setup
// Field-for-field counterpart of CurveIsogenyStaticData in which every multiprecision value is
// held through a digit_t pointer instead of an inline array, plus the random-bytes callback.
// NOTE(review): the pointed-to buffers are presumably allocated by SIDH_curve_allocate() and released
// by SIDH_curve_free() -- confirm in the implementation.
typedef struct
{
CurveIsogeny_ID CurveIsogeny; // Curve isogeny system identifier, base curve defined over GF(p^2)
unsigned int pwordbits; // Closest multiple of 32 to prime bitlength
unsigned int owordbits; // Closest multiple of 32 to order bitlength
unsigned int pbits; // Bitlength of the prime p
digit_t *prime; // Prime p
digit_t *A; // Base curve parameter "A"
digit_t *C; // Base curve parameter "C"
unsigned int oAbits; // Order bitlength for Alice
digit_t *Aorder; // Order of Alice's (sub)group
unsigned int oBbits; // Order bitlength for Bob
unsigned int eB; // Power of Bob's subgroup order (i.e., oB = 3^eB)
digit_t *Border; // Order of Bob's (sub)group
digit_t *PA; // Alice's generator PA = (XPA,YPA), where XPA and YPA are defined over GF(p)
digit_t *PB; // Bob's generator PB = (XPB,YPB), where XPB and YPB are defined over GF(p)
unsigned int BigMont_A24; // BigMont's curve parameter A24 = (A+2)/4
digit_t *BigMont_order; // BigMont's subgroup order
digit_t *Montgomery_R2; // Montgomery constant (2^W)^2 mod p, using a suitable value W
digit_t *Montgomery_pp; // Montgomery constant -p^-1 mod 2^W, using a suitable value W
digit_t *Montgomery_one; // Value one in Montgomery representation
RandomBytes RandomBytesFunction; // Function providing random bytes to generate nonces or secret keys
} CurveIsogenyStruct, *PCurveIsogenyStruct;
// Supported curve isogeny systems:
// "SIDHp751", base curve: supersingular elliptic curve E: y^2 = x^3 + x
extern CurveIsogenyStaticData CurveIsogeny_SIDHp751;
/******************** Function prototypes ***********************/
/*************** Setup/initialization functions *****************/
// Dynamic allocation of memory for curve isogeny structure.
// Returns NULL on error.
// The returned structure must be released with SIDH_curve_free().
PCurveIsogenyStruct SIDH_curve_allocate(PCurveIsogenyStaticData CurveData);
// Initialize curve isogeny structure pCurveIsogeny with static data extracted from pCurveIsogenyData.
// This needs to be called after allocating memory for "pCurveIsogeny" using SIDH_curve_allocate().
// RandomBytesFunction is the callback used to obtain random bytes (see the RandomBytes type).
CRYPTO_STATUS SIDH_curve_initialize(PCurveIsogenyStruct pCurveIsogeny, RandomBytes RandomBytesFunction, PCurveIsogenyStaticData pCurveIsogenyData);
// Free memory for curve isogeny structure
void SIDH_curve_free(PCurveIsogenyStruct pCurveIsogeny);
// Output error/success message for a given CRYPTO_STATUS
const char *SIDH_get_error_message(CRYPTO_STATUS Status);
// Output random values in the range [1, order-1] in little endian format that can be used as private keys.
// AliceOrBob selects whose subgroup order is used.
CRYPTO_STATUS random_mod_order(digit_t *random_digits, unsigned int AliceOrBob, PCurveIsogenyStruct pCurveIsogeny);
// Output random values in the range [1, BigMont_order-1] in little endian format that can be used as private keys
// to compute scalar multiplications using the elliptic curve BigMont.
CRYPTO_STATUS random_BigMont_mod_order(digit_t *random_digits, PCurveIsogenyStruct pCurveIsogeny);
// Clear "nwords" digits from memory
void clear_words(void *mem, digit_t nwords);
#endif

+ 109
- 0
sidh_ref/api.h 查看文件

@@ -0,0 +1,109 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: API header file for P751
*********************************************************************************************/
#ifndef __P751_API_H__
#define __P751_API_H__
#include "config.h"
/*********************** Key encapsulation mechanism API ***********************/
#define CRYPTO_SECRETKEYBYTES 644 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes (32 + 48 + 564)
#define CRYPTO_PUBLICKEYBYTES 564
#define CRYPTO_BYTES 24
#define CRYPTO_CIPHERTEXTBYTES 596 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes (564 + 32)
// Algorithm name
#define CRYPTO_ALGNAME "SIKEp751"
// SIKE's key generation
// It produces a private key sk and computes the public key pk.
// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes)
// public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes)
int crypto_kem_keypair(unsigned char *pk, unsigned char *sk);
// SIKE's encapsulation
// Input: public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes)
// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes)
// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes)
int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk);
// SIKE's decapsulation
// Input: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes)
// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes)
// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes)
int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk);
// Encoding of keys for KEM-based isogeny system "SIKEp751" (wire format):
// ----------------------------------------------------------------------
// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address).
// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion.
//
// Private keys sk consist of the concatenation of a 32-byte random value, a value in the range [0, 2^378-1] and the public key pk. In the SIKE API,
// private keys are encoded in 644 octets in little endian format.
// Public keys pk consist of 3 elements in GF(p751^2). In the SIKE API, pk is encoded in 564 octets (3 elements x 2 components x 94 octets).
// Ciphertexts ct consist of the concatenation of a public key value and a 32-byte value. In the SIKE API, ct is encoded in 564 + 32 = 596 octets.
// Shared keys ss consist of a value of 24 octets.
/*********************** Ephemeral key exchange API ***********************/
#define SIDH_SECRETKEYBYTES 48
#define SIDH_PUBLICKEYBYTES 564
#define SIDH_BYTES 188
// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys.
// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016.
// Extended version available at: http://eprint.iacr.org/2016/859
// Generation of Alice's secret key
// Outputs random value in [0, 2^372 - 1] to be used as Alice's private key
void random_mod_order_A(unsigned char* random_digits);
// Generation of Bob's secret key
// Outputs random value in [0, 2^Floor(Log(2,3^239)) - 1] to be used as Bob's private key
void random_mod_order_B(unsigned char* random_digits);
// Alice's ephemeral public key generation
// Input: a private key PrivateKeyA in the range [0, 2^372 - 1], stored in 47 bytes.
// Output: the public key PublicKeyA consisting of 3 GF(p751^2) elements encoded in 564 bytes.
int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA);
// Bob's ephemeral public key generation
// Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes.
// (PrivateKeyB is a const input; it is not generated by this function.)
// Output: the public key PublicKeyB consisting of 3 GF(p751^2) elements encoded in 564 bytes.
int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB);
// Alice's ephemeral shared secret computation
// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^372 - 1], stored in 47 bytes.
// Bob's PublicKeyB consists of 3 GF(p751^2) elements encoded in 564 bytes.
// Output: a shared secret SharedSecretA that consists of one element in GF(p751^2) encoded in 188 bytes.
int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA);
// Bob's ephemeral shared secret computation
// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA
// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes.
// Alice's PublicKeyA consists of 3 GF(p751^2) elements encoded in 564 bytes.
// Output: a shared secret SharedSecretB that consists of one element in GF(p751^2) encoded in 188 bytes.
int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB);
// Encoding of keys for KEX-based isogeny system "SIDHp751" (wire format):
// ----------------------------------------------------------------------
// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address).
// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion.
//
// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^372-1] and [0, 2^378-1], resp. In the SIDH API, private keys are encoded
// in 48 octets in little endian format.
// NOTE(review): the function comments above give 47 bytes for Alice's key while SIDH_SECRETKEYBYTES is 48;
// presumably the constant is sized for the larger (Bob's) key -- confirm.
// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p751^2). In the SIDH API, they are encoded in 564 octets.
// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p751^2). In the SIDH API, they are encoded in 188 octets.
#endif

+ 128
- 0
sidh_ref/config.h 查看文件

@@ -0,0 +1,128 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: configuration file and platform-dependent macros
*********************************************************************************************/
#ifndef __CONFIG_H__
#define __CONFIG_H__
#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
// Definition of operating system
// Only Linux is supported; the build must define __LINUX__ (e.g. -D__LINUX__), otherwise compilation stops here.
#define OS_LINUX 1
#if defined(__LINUX__) // Linux OS
#define OS_TARGET OS_LINUX
#else
#error -- "Unsupported OS"
#endif
// Definition of compiler
#define COMPILER_GCC 1
#define COMPILER_CLANG 2
// Clang also defines __GNUC__ for GCC compatibility, so __clang__ must be tested first;
// with the opposite order the COMPILER_CLANG branch could never be selected.
#if defined(__clang__) // Clang compiler
#define COMPILER COMPILER_CLANG
#elif defined(__GNUC__) // GNU GCC compiler
#define COMPILER COMPILER_GCC
#else
#error -- "Unsupported COMPILER"
#endif
// Definition of the targeted architecture and basic data types
// Only AMD64 is supported; the build must define _AMD64_ (e.g. -D_AMD64_), otherwise compilation stops here.
#define TARGET_AMD64 1
#if defined(_AMD64_)
#define TARGET TARGET_AMD64
#define RADIX 64
#define LOG2RADIX 6
typedef uint64_t digit_t; // Unsigned 64-bit digit
#else
#error -- "Unsupported ARCHITECTURE"
#endif
#define RADIX64 64
// Selection of implementation: optimized_fast with x64 assembly
// Enabled only when the build defines _OPTIMIZED_FAST_.
#if defined(_OPTIMIZED_FAST_)
#define OPTIMIZED_FAST_IMPLEMENTATION
#endif
// Extended datatype support
// 128-bit unsigned integer obtained through the GCC/Clang machine-mode attribute (TI mode).
#define UINT128_SUPPORT
typedef unsigned uint128_t __attribute__((mode(TI)));
// Macro definitions
// The three conversion macros round up (ceiling division). Each evaluates its argument exactly once,
// but the two word-count macros involve sizeof and therefore yield a size_t-typed result.
#define NBITS_TO_NBYTES(nbits) (((nbits)+7)/8) // Conversion macro from number of bits to number of bytes
#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words
#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words
// Macro to avoid compiler warnings when detecting unreferenced parameters
#define UNREFERENCED_PARAMETER(PAR) ((void)(PAR))
/********************** Constant-time unsigned comparisons ***********************/
// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise
static __inline unsigned int is_digit_nonzero_ct(digit_t x)
{ // Branch-free test for x != 0: returns 1 if x is nonzero, 0 if x is zero.
    // For a nonzero x, x or its two's-complement negation has the most significant bit set;
    // for x = 0 both are zero.
    const digit_t negated = 0 - x;
    const digit_t top_bit = (x | negated) >> (RADIX - 1);

    return (unsigned int)top_bit;
}
static __inline unsigned int is_digit_zero_ct(digit_t x)
{ // Branch-free test for x == 0: returns 1 if x is zero, 0 otherwise.
    // Implemented by flipping the single result bit of is_digit_nonzero_ct().
    const unsigned int nonzero = is_digit_nonzero_ct(x);

    return (unsigned int)(1 ^ nonzero);
}
static __inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y)
{ // Branch-free unsigned comparison: returns 1 if x < y, 0 otherwise.
    // The answer is the most significant bit of x ^ ((x ^ y) | ((x - y) ^ y)).
    const digit_t differ = x ^ y;
    const digit_t diff_mix = (x - y) ^ y;
    const digit_t folded = x ^ (differ | diff_mix);

    return (unsigned int)(folded >> (RADIX - 1));
}
/********************** Macros for platform-dependent operations **********************/
// General caveats for the macros below:
//  - MUL/ADDC/SUBC expand to a brace-enclosed block that declares a local named tempReg, so no
//    argument expression may itself refer to a variable called tempReg.
//  - SHIFTR/SHIFTL expand to a single assignment statement that already ends in ';' and ignore
//    their DigitSize argument.
// Digit multiplication
// Full 128-bit product of two digits. NOTE the asymmetry: "hi" is a POINTER that is dereferenced
// to store the upper digit, whereas "lo" is an lvalue that receives the lower digit directly.
#define MUL(multiplier, multiplicand, hi, lo) \
{ uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \
*(hi) = (digit_t)(tempReg >> RADIX); \
(lo) = (digit_t)tempReg; }
// Digit addition with carry
// sumOut receives the low digit of addend1 + addend2 + carryIn; carryOut receives the bits above RADIX.
#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \
{ uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \
(carryOut) = (digit_t)(tempReg >> RADIX); \
(sumOut) = (digit_t)tempReg; }
// Digit subtraction with borrow
// differenceOut receives the low digit of minuend - subtrahend - borrowIn; borrowOut is the top
// bit of the 128-bit difference, i.e. 1 when the subtraction wrapped below zero.
#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \
{ uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \
(borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t)*8 - 1)); \
(differenceOut) = (digit_t)tempReg; }
// Digit shift right
// shiftOut = low digit of the two-digit value (highIn:lowIn) shifted right by "shift" bits.
// NOTE(review): assuming RADIX-bit operands, shift must be in [1, RADIX-1]; shift = 0 would shift highIn by RADIX bits.
#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \
(shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift)));
// Digit shift left
// shiftOut = high digit of the two-digit value (highIn:lowIn) shifted left by "shift" bits.
// NOTE(review): assuming RADIX-bit operands, shift must be in [1, RADIX-1]; shift = 0 would shift lowIn by RADIX bits.
#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \
(shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift)));
#endif

+ 330
- 0
sidh_ref/ec_isogeny.c 查看文件

@@ -0,0 +1,330 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: elliptic curve and isogeny functions
*********************************************************************************************/
#include "P751_internal.h"
#include <stdio.h>
void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24)
{ // Doubling of a Montgomery point in projective coordinates (X:Z).
// Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C.
// Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2).
// Q may be the same object as P (xDBLe calls it that way): both coordinates of P are
// consumed into t0/t1 before anything is written to Q.
f2elm_t t0, t1;
fp2sub(P->X, P->Z, t0); // t0 = X1-Z1
fp2add(P->X, P->Z, t1); // t1 = X1+Z1
fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2
fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2
fp2mul_mont(C24, t0, Q->Z); // Z2 = C24*(X1-Z1)^2
fp2mul_mont(t1, Q->Z, Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2
fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2
fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2]
fp2add(Q->Z, t0, Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2
fp2mul_mont(Q->Z, t1, Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2]
}
void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e)
{ // Repeated doubling: Q <- [2^e]P on a Montgomery curve with projective constants.
    // Input: projective Montgomery x-coordinates P = (XP:ZP), with xP = XP/ZP, and the curve constants A+2C and 4C.
    // Output: projective Montgomery x-coordinates Q = (2^e)*P. For e <= 0, Q is simply a copy of P.
    int remaining = e;

    // Q = P: a point is 2 coordinates x 2 field components x NWORDS_FIELD words.
    copy_words((const digit_t *)P, (digit_t *)Q, 2 * 2 * NWORDS_FIELD);

    while (remaining > 0)
    {
        xDBL(Q, Q, A24plus, C24); // in-place doubling
        remaining--;
    }
}
void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t *coeff)
{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4.
// Input: projective point of order four P = (X4:Z4).
// Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients
// that are used to evaluate the isogeny at a point in eval_4_isog().
// On return: coeff[0] = 4*Z4^2, coeff[1] = X4-Z4, coeff[2] = X4+Z4, A24plus = 4*X4^4, C24 = 4*Z4^4.
fp2sub(P->X, P->Z, coeff[1]); // coeff[1] = X4-Z4
fp2add(P->X, P->Z, coeff[2]); // coeff[2] = X4+Z4
fp2sqr_mont(P->Z, coeff[0]); // coeff[0] = Z4^2
fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 2*Z4^2
fp2sqr_mont(coeff[0], C24); // C24 = 4*Z4^4
fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 4*Z4^2
fp2sqr_mont(P->X, A24plus); // A24plus = X4^2
fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2
fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4
}
void eval_4_isog(point_proj_t P, f2elm_t *coeff)
{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined
// by the 3 coefficients in coeff (computed in the function get_4_isog()).
// Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z).
// Output: the projective point P = phi(P) = (X:Z) in the codomain.
// P is updated in place. In the comments below, X and Z on the right-hand side denote the INPUT coordinates.
f2elm_t t0, t1;
fp2add(P->X, P->Z, t0); // t0 = X+Z
fp2sub(P->X, P->Z, t1); // t1 = X-Z
fp2mul_mont(t0, coeff[1], P->X); // X = (X+Z)*coeff[1]
fp2mul_mont(t1, coeff[2], P->Z); // Z = (X-Z)*coeff[2]
fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z)
fp2mul_mont(t0, coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z)
fp2add(P->X, P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1]
fp2sub(P->X, P->Z, P->Z); // Z = (X+Z)*coeff[1] - (X-Z)*coeff[2] (the sign is irrelevant: only the square is used below)
fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
fp2sqr_mont(P->Z, P->Z); // Z = [(X+Z)*coeff[1] - (X-Z)*coeff[2]]^2
fp2add(t1, t0, P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2
fp2sub(P->Z, t0, t0); // t0 = [(X+Z)*coeff[1] - (X-Z)*coeff[2]]^2 - coeff[0]*(X+Z)*(X-Z)
fp2mul_mont(P->X, t1, P->X); // Xfinal
fp2mul_mont(P->Z, t0, P->Z); // Zfinal
}
void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus)
{ // Tripling of a Montgomery point in projective coordinates (X:Z).
// Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C.
// Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3).
// Q may be the same object as P (xTPLe calls it that way): P is only read in the first three
// operations, and Q is only written at the very end.
f2elm_t t0, t1, t2, t3, t4, t5, t6;
fp2sub(P->X, P->Z, t0); // t0 = X-Z
fp2sqr_mont(t0, t2); // t2 = (X-Z)^2
fp2add(P->X, P->Z, t1); // t1 = X+Z
fp2sqr_mont(t1, t3); // t3 = (X+Z)^2
fp2add(t0, t1, t4); // t4 = 2*X
fp2sub(t1, t0, t0); // t0 = 2*Z
fp2sqr_mont(t4, t1); // t1 = 4*X^2
fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2
fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2
fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2
fp2mul_mont(t3, t5, t3); // t3 = (X+Z)^2 * t5 = A24plus*(X+Z)^4
fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2
fp2mul_mont(t2, t6, t2); // t2 = (X-Z)^2 * t6 = A24minus*(X-Z)^4
fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4
fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2
fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^4 - A24plus*(X+Z)^4
fp2sqr_mont(t2, t2); // t2 = t2^2
fp2mul_mont(t4, t2, Q->X); // X3 = 2*X*t2
fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^4 - A24plus*(X+Z)^4 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2]
fp2sqr_mont(t1, t1); // t1 = t1^2
fp2mul_mont(t0, t1, Q->Z); // Z3 = 2*Z*t1
}
void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e)
{ // Repeated tripling: Q <- [3^e]P on a Montgomery curve with projective constants.
  // Inputs: P = (XP:ZP) with xP = XP/ZP, the constants A24plus = A+2C and A24minus = A-2C, and the count e.
  // Output: Q = (3^e)*P. For e <= 0 no tripling is performed and Q is simply a copy of P.
    int remaining;

    // Q starts as a word-for-word copy of P (two GF(p^2) coordinates = 2*2*NWORDS_FIELD digits).
    copy_words((digit_t *)P, (digit_t *)Q, 2 * 2 * NWORDS_FIELD);

    for (remaining = e; remaining > 0; remaining--)
    {
        xTPL(Q, Q, A24minus, A24plus);   // triple in place
    }
}
void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t *coeff)
{ // Derives a 3-isogeny from a projective point P = (X3:Z3) of order 3.
  // Input:  P = (X:Z).
  // Output: coeff[0] = X-Z and coeff[1] = X+Z (consumed by eval_3_isog()), plus the constants
  //         A24minus and A24plus of the 3-isogenous Montgomery curve.
    f2elm_t dsq, ssq, u, v, w;

    fp2sub(P->X, P->Z, coeff[0]);     // coeff0 = X-Z
    fp2add(P->X, P->Z, coeff[1]);     // coeff1 = X+Z
    fp2sqr_mont(coeff[0], dsq);       // dsq = (X-Z)^2
    fp2sqr_mont(coeff[1], ssq);       // ssq = (X+Z)^2
    fp2add(dsq, ssq, u);              // u = (X+Z)^2 + (X-Z)^2
    fp2add(coeff[0], coeff[1], v);    // v = 2*X
    fp2sqr_mont(v, v);                // v = 4*X^2
    fp2sub(v, u, v);                  // v = 4*X^2 - (X+Z)^2 - (X-Z)^2
    fp2add(ssq, v, u);                // u = 4*X^2 - (X-Z)^2
    fp2add(v, dsq, v);                // v = 4*X^2 - (X+Z)^2
    fp2add(dsq, v, w);                // w = 4*X^2 - (X+Z)^2 + (X-Z)^2
    fp2add(w, w, w);                  // w = 2*(4*X^2 - (X+Z)^2 + (X-Z)^2)
    fp2add(ssq, w, w);                // w = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2
    fp2mul_mont(u, w, A24minus);      // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2]
    fp2add(ssq, u, w);                // w = 4*X^2 + (X+Z)^2 - (X-Z)^2
    fp2add(w, w, w);                  // w = 2*(4*X^2 + (X+Z)^2 - (X-Z)^2)
    fp2add(dsq, w, w);                // w = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2
    fp2mul_mont(v, w, w);             // w = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2]
    fp2sub(w, A24minus, dsq);         // dsq = w - A24minus
    fp2add(A24minus, dsq, A24plus);   // A24plus = A24minus + (w - A24minus), i.e. the field value of w
}
void eval_3_isog(point_proj_t Q, const f2elm_t *coeff)
{ // Maps the projective point Q = (X:Z) through a 3-isogeny, in place.
  // Inputs: Q = (X:Z) and the two values coeff[0], coeff[1] computed by get_3_isog().
  // Output: Q <- phi(Q).
    f2elm_t p, m, s;

    fp2add(Q->X, Q->Z, p);            // p = X+Z
    fp2mul_mont(p, coeff[0], p);      // p = coeff0*(X+Z)
    fp2sub(Q->X, Q->Z, m);            // m = X-Z
    fp2mul_mont(m, coeff[1], m);      // m = coeff1*(X-Z)
    fp2add(p, m, s);                  // s = coeff0*(X+Z) + coeff1*(X-Z)
    fp2sub(m, p, p);                  // p = coeff1*(X-Z) - coeff0*(X+Z)
    fp2sqr_mont(s, s);                // s = [coeff0*(X+Z) + coeff1*(X-Z)]^2
    fp2sqr_mont(p, p);                // p = [coeff1*(X-Z) - coeff0*(X+Z)]^2
    fp2mul_mont(Q->X, s, Q->X);       // X <- X*[coeff0*(X+Z) + coeff1*(X-Z)]^2
    fp2mul_mont(Q->Z, p, Q->Z);       // Z <- Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2
}
void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3)
{ // Simultaneous inversion of three GF(p^2) elements using a single field inversion.
  // Input:  z1, z2, z3.
  // Output: z1 <- 1/z1, z2 <- 1/z2, z3 <- 1/z3 (the inputs are overwritten).
    f2elm_t z12, inv_all, inv12, inv1;

    fp2mul_mont(z1, z2, z12);         // z12     = z1*z2
    fp2mul_mont(z3, z12, inv_all);    // inv_all = z1*z2*z3
    fp2inv_mont(inv_all);             // inv_all = 1/(z1*z2*z3)
    fp2mul_mont(z3, inv_all, inv12);  // inv12   = 1/(z1*z2)
    fp2mul_mont(inv12, z2, inv1);     // inv1    = 1/z1 (held back: z1 is still needed below)
    fp2mul_mont(inv12, z1, z2);       // z2      = 1/z2
    fp2mul_mont(z12, inv_all, z3);    // z3      = 1/z3
    fp2copy(inv1, z1);                // z1      = 1/z1
}
void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A)
{ // Recovers the Montgomery coefficient A of E_A: y^2 = x^3 + A*x^2 + x from three x-coordinates.
  // Input:  xP, xQ and xR, the x-coordinates of points P, Q and R with R = Q-P on E_A.
  // Output: A = (xP*xQ + xR*(xP+xQ) - 1)^2 / (4*xP*xQ*xR) - (xP + xQ + xR).
    f2elm_t prod, sum, one = {0};

    fpcopy((digit_t *)&Montgomery_one, one[0]);   // one = 1 (real part set, imaginary part left at zero)

    fp2add(xP, xQ, sum);              // sum  = xP+xQ
    fp2mul_mont(xP, xQ, prod);        // prod = xP*xQ
    fp2mul_mont(xR, sum, A);          // A    = xR*(xP+xQ)
    fp2add(prod, A, A);               // A    = xP*xQ + xR*(xP+xQ)
    fp2mul_mont(prod, xR, prod);      // prod = xP*xQ*xR
    fp2sub(A, one, A);                // A    = A-1
    fp2add(sum, xR, sum);             // sum  = xP+xQ+xR
    fp2add(prod, prod, prod);         // prod = 2*xP*xQ*xR
    fp2add(prod, prod, prod);         // prod = 4*xP*xQ*xR
    fp2sqr_mont(A, A);                // A    = A^2
    fp2inv_mont(prod);                // prod = 1/(4*xP*xQ*xR)
    fp2mul_mont(A, prod, A);          // A    = A^2/(4*xP*xQ*xR)
    fp2sub(A, sum, A);                // A    = A - (xP+xQ+xR)
}
void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv)
{ // j-invariant of a Montgomery curve given by the projective constant (A:C).
  // Input:  A, C in GF(p^2).
  // Output: jinv = 256*(A^2-3*C^2)^3 / (C^4*(A^2-4*C^2)), the j-invariant of
  //         B*y^2 = x^3 + (A/C)*x^2 + x (equivalently of B'*y^2 = C*x^3 + A*x^2 + C*x).
    f2elm_t num, c2;

    fp2sqr_mont(A, jinv);             // jinv = A^2
    fp2sqr_mont(C, c2);               // c2   = C^2
    fp2add(c2, c2, num);              // num  = 2*C^2
    fp2sub(jinv, num, num);           // num  = A^2 - 2*C^2
    fp2sub(num, c2, num);             // num  = A^2 - 3*C^2
    fp2sub(num, c2, jinv);            // jinv = A^2 - 4*C^2
    fp2sqr_mont(c2, c2);              // c2   = C^4
    fp2mul_mont(jinv, c2, jinv);      // jinv = C^4*(A^2 - 4*C^2)   (the denominator)
    fp2add(num, num, num);            // num  = 2*(A^2 - 3*C^2)
    fp2add(num, num, num);            // num  = 4*(A^2 - 3*C^2)
    fp2sqr_mont(num, c2);             // c2   = 16*(A^2 - 3*C^2)^2
    fp2mul_mont(num, c2, num);        // num  = 64*(A^2 - 3*C^2)^3
    fp2add(num, num, num);            // num  = 128*(A^2 - 3*C^2)^3
    fp2add(num, num, num);            // num  = 256*(A^2 - 3*C^2)^3 (the numerator)
    fp2inv_mont(jinv);                // jinv = 1/denominator
    fp2mul_mont(jinv, num, jinv);     // jinv = numerator/denominator
}
void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24)
{ // Combined x-only doubling and differential addition, both performed in place.
  // Inputs: projective points P = (XP:ZP) and Q = (XQ:ZQ), the affine difference xPQ = x(P-Q) and the
  //         curve constant A24 = (A+2)/4.
  // Output: P <- 2*P = (X2P:Z2P) and Q <- P+Q = (XQP:ZQP).
  // The statement order matters: P and Q are overwritten piecewise and Q->X doubles as scratch space.
    f2elm_t ps, pd, aux;

    fp2add(P->X, P->Z, ps);           // ps  = XP+ZP
    fp2sub(P->X, P->Z, pd);           // pd  = XP-ZP
    fp2sqr_mont(ps, P->X);            // XP  = (XP+ZP)^2
    fp2sub(Q->X, Q->Z, aux);          // aux = XQ-ZQ
    fp2correction(aux);               // modular correction of aux (see fp2correction())
    fp2add(Q->X, Q->Z, Q->X);         // XQ  = XQ+ZQ
    fp2mul_mont(aux, ps, ps);         // ps  = (XP+ZP)*(XQ-ZQ)
    fp2sqr_mont(pd, P->Z);            // ZP  = (XP-ZP)^2
    fp2mul_mont(Q->X, pd, pd);        // pd  = (XP-ZP)*(XQ+ZQ)
    fp2sub(P->X, P->Z, aux);          // aux = (XP+ZP)^2 - (XP-ZP)^2
    fp2mul_mont(P->X, P->Z, P->X);    // XP  = (XP+ZP)^2*(XP-ZP)^2                      -> X2P
    fp2mul_mont(aux, A24, Q->X);      // XQ  = A24*[(XP+ZP)^2 - (XP-ZP)^2]              (scratch)
    fp2sub(ps, pd, Q->Z);             // ZQ  = (XP+ZP)*(XQ-ZQ) - (XP-ZP)*(XQ+ZQ)
    fp2add(Q->X, P->Z, P->Z);         // ZP  = A24*[(XP+ZP)^2 - (XP-ZP)^2] + (XP-ZP)^2
    fp2add(ps, pd, Q->X);             // XQ  = (XP+ZP)*(XQ-ZQ) + (XP-ZP)*(XQ+ZQ)
    fp2mul_mont(P->Z, aux, P->Z);     // ZP  = ZP*[(XP+ZP)^2 - (XP-ZP)^2]               -> Z2P
    fp2sqr_mont(Q->Z, Q->Z);          // ZQ  = [(XP+ZP)*(XQ-ZQ) - (XP-ZP)*(XQ+ZQ)]^2
    fp2sqr_mont(Q->X, Q->X);          // XQ  = [(XP+ZP)*(XQ-ZQ) + (XP-ZP)*(XQ+ZQ)]^2    -> XQP
    fp2mul_mont(Q->Z, xPQ, Q->Z);     // ZQ  = xPQ*ZQ                                   -> ZQP
}
static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option)
{ // Masked swap of two projective points, done with XOR masking instead of a branch on option.
  // option = 0           : P and Q are left unchanged.
  // option = 0xFF...FF   : the contents of P and Q are exchanged.
    unsigned int w, k;
    digit_t diff;

    for (w = 0; w < NWORDS_FIELD; w++)
    {
        for (k = 0; k < 2; k++)   // k selects the two GF(p) components of each GF(p^2) coordinate
        {
            diff = (P->X[k][w] ^ Q->X[k][w]) & option;   // word difference, kept only when swapping
            P->X[k][w] ^= diff;
            Q->X[k][w] ^= diff;

            diff = (P->Z[k][w] ^ Q->Z[k][w]) & option;
            P->Z[k][w] ^= diff;
            Q->Z[k][w] ^= diff;
        }
    }
}
static void LADDER3PT(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const digit_t *m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t A)
{ // Three-point ladder driven by the low bits of the scalar m.
  // Inputs: affine x-coordinates xP, xQ and xPQ, the scalar m as a digit array, the party selector
  //         AliceOrBob (ALICE -> OALICE_BITS iterations, anything else -> OBOB_BITS) and the curve
  //         coefficient A.
  // Output: the projective point R, built up by the ladder.
  // NOTE(review): presumably R ends as x(P + [m]Q) with xPQ = x(P-Q) - confirm against the callers.
    point_proj_t R0 = {0}, R2 = {0};
    f2elm_t A24 = {0};
    const int nbits = (AliceOrBob == ALICE) ? OALICE_BITS : OBOB_BITS;
    int i, prevbit = 0;

    // Curve constant: A24 = (A+2)/4
    fpcopy((digit_t *)&Montgomery_one, A24[0]);   // A24 = 1
    fp2add(A24, A24, A24);                        // A24 = 2
    fp2add(A, A24, A24);                          // A24 = A+2
    fp2div2(A24, A24);                            // A24 = (A+2)/2
    fp2div2(A24, A24);                            // A24 = (A+2)/4

    // Starting points: R0 = (xQ:1), R2 = (xPQ:1), R = (xP:1).
    // R0 and R2 were zero-initialised above, so only R needs the imaginary part of Z cleared explicitly.
    fp2copy(xQ, R0->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R0->Z);
    fp2copy(xPQ, R2->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R2->Z);
    fp2copy(xP, R->X);
    fpcopy((digit_t *)&Montgomery_one, (digit_t *)R->Z);
    fpzero((digit_t *)(R->Z)[1]);

    // One ladder step per scalar bit, least significant bit first.
    for (i = 0; i < nbits; i++)
    {
        const int bit = (m[i >> LOG2RADIX] >> (i & (RADIX - 1))) & 1;
        const digit_t mask = 0 - (digit_t)(bit ^ prevbit);   // all ones iff this bit differs from the previous one

        prevbit = bit;
        swap_points(R, R2, mask);
        xDBLADD(R0, R2, R->X, A24);
        fp2mul_mont(R2->X, R->Z, R2->X);
    }
}

+ 867
- 0
sidh_ref/fp_x64.c 查看文件

@@ -0,0 +1,867 @@
/********************************************************************************************
* SIDH: an efficient supersingular isogeny-based cryptography library for ephemeral
* Diffie-Hellman key exchange.
*
* Copyright (c) Microsoft Corporation. All rights reserved.
*
*
* Abstract: modular arithmetic optimized for x64 platforms
*
*********************************************************************************************/
#include "P751_internal.h"
// Global constants
extern const uint64_t p751[NWORDS_FIELD];
extern const uint64_t p751p1[NWORDS_FIELD];
extern const uint64_t p751x2[NWORDS_FIELD];
__inline void fpadd751(const digit_t *a, const digit_t *b, digit_t *c)
{ // Modular addition, c = a+b mod p751.
  // Inputs: a, b in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]
  // The OS_WIN branch is portable C built on the ADDC/SUBC macros; the OS_LINUX branch calls the assembly routine.
#if (OS_TARGET == OS_WIN)
    const digit_t *twice_p = (const digit_t *)p751x2;
    unsigned int k, cy = 0;
    digit_t keep;

    for (k = 0; k < NWORDS_FIELD; k++)
    {   // c = a + b
        ADDC(cy, a[k], b[k], cy, c[k]);
    }

    cy = 0;
    for (k = 0; k < NWORDS_FIELD; k++)
    {   // c = c - 2*p751; cy ends up holding the borrow
        SUBC(cy, c[k], twice_p[k], cy, c[k]);
    }

    keep = 0 - (digit_t)cy;   // all ones iff the subtraction borrowed
    cy = 0;
    for (k = 0; k < NWORDS_FIELD; k++)
    {   // add 2*p751 back only when the subtraction borrowed
        ADDC(cy, c[k], twice_p[k] & keep, cy, c[k]);
    }
#elif (OS_TARGET == OS_LINUX)
    fpadd751_asm(a, b, c);
#endif
}
__inline void fpsub751(const digit_t *a, const digit_t *b, digit_t *c)
{ // Modular subtraction, c = a-b mod p751.
  // Inputs: a, b in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]
  // The OS_WIN branch is portable C built on the ADDC/SUBC macros; the OS_LINUX branch calls the assembly routine.
#if (OS_TARGET == OS_WIN)
    const digit_t *twice_p = (const digit_t *)p751x2;
    unsigned int k, bw = 0;
    digit_t fix;

    for (k = 0; k < NWORDS_FIELD; k++)
    {   // c = a - b; bw ends up holding the borrow
        SUBC(bw, a[k], b[k], bw, c[k]);
    }

    fix = 0 - (digit_t)bw;   // all ones iff a - b borrowed
    bw = 0;
    for (k = 0; k < NWORDS_FIELD; k++)
    {   // add 2*p751 only when the subtraction borrowed
        ADDC(bw, c[k], twice_p[k] & fix, bw, c[k]);
    }
#elif (OS_TARGET == OS_LINUX)
    fpsub751_asm(a, b, c);
#endif
}
__inline void fpneg751(digit_t *a)
{ // Modular negation in place, a = -a mod p751, computed as 2*p751 - a.
  // Input/output: a in [0, 2*p751-1]
    const digit_t *twice_p = (const digit_t *)p751x2;
    unsigned int k, bw = 0;

    for (k = 0; k < NWORDS_FIELD; k++)
    {
        SUBC(bw, twice_p[k], a[k], bw, a[k]);   // a[k] = (2*p751)[k] - a[k] - borrow
    }
}
void fpdiv2_751(const digit_t *a, digit_t *c)
{ // Modular division by two, c = a/2 mod p751.
  // Input : a in [0, 2*p751-1]
  // Output: c in [0, 2*p751-1]
    const digit_t *prime = (const digit_t *)p751;
    const digit_t odd = 0 - (digit_t)(a[0] & 1);   // all ones iff a is odd
    unsigned int k, cy = 0;

    for (k = 0; k < NWORDS_FIELD; k++)
    {   // c = a + p751 when a is odd (which makes the sum even), otherwise c = a
        ADDC(cy, a[k], prime[k] & odd, cy, c[k]);
    }

    mp_shiftr1(c, NWORDS_FIELD);   // halve the result
}
void fpcorrection751(digit_t *a)
{ // Modular correction in place: reduces a field element a in [0, 2*p751-1] to [0, p751-1].
    const digit_t *prime = (const digit_t *)p751;
    unsigned int k, bw = 0;
    digit_t fix;

    for (k = 0; k < NWORDS_FIELD; k++)
    {   // a = a - p751; bw ends up holding the borrow
        SUBC(bw, a[k], prime[k], bw, a[k]);
    }

    fix = 0 - (digit_t)bw;   // all ones iff the subtraction borrowed
    bw = 0;
    for (k = 0; k < NWORDS_FIELD; k++)
    {   // add p751 back only when the subtraction borrowed
        ADDC(bw, a[k], prime[k] & fix, bw, a[k]);
    }
}
void mp_mul(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
// nwords is ignored (see UNREFERENCED_PARAMETER below): the OS_WIN branch is fully unrolled for
// 12-word operands (it reads a[0..11] and b[0..11] and writes c[0..23]); the OS_LINUX branch calls
// mul751_asm().
// The OS_WIN branch works one output word ("column") at a time: for column k it feeds every product
// a[i]*b[j] with i+j = k through MULADD128 into the accumulator uv, counts the reported carries in t,
// stores uv[0] to c[k] and then moves the accumulator down one word (uv[0] <- uv[1], uv[1] <- t).
// NOTE(review): MULADD128 is defined outside this block; presumably it computes uv += a*b and
// reports the carry-out in its fourth argument - confirm against its definition.
UNREFERENCED_PARAMETER(nwords);
#if (OS_TARGET == OS_WIN)
digit_t t = 0;
uint128_t uv = {0};
unsigned int carry = 0;
// Column 0
MULADD128(a[0], b[0], uv, carry, uv);
t += carry;
c[0] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 1
MULADD128(a[0], b[1], uv, carry, uv);
t += carry;
MULADD128(a[1], b[0], uv, carry, uv);
t += carry;
c[1] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 2
MULADD128(a[0], b[2], uv, carry, uv);
t += carry;
MULADD128(a[1], b[1], uv, carry, uv);
t += carry;
MULADD128(a[2], b[0], uv, carry, uv);
t += carry;
c[2] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 3
MULADD128(a[0], b[3], uv, carry, uv);
t += carry;
MULADD128(a[2], b[1], uv, carry, uv);
t += carry;
MULADD128(a[1], b[2], uv, carry, uv);
t += carry;
MULADD128(a[3], b[0], uv, carry, uv);
t += carry;
c[3] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 4
MULADD128(a[0], b[4], uv, carry, uv);
t += carry;
MULADD128(a[3], b[1], uv, carry, uv);
t += carry;
MULADD128(a[2], b[2], uv, carry, uv);
t += carry;
MULADD128(a[1], b[3], uv, carry, uv);
t += carry;
MULADD128(a[4], b[0], uv, carry, uv);
t += carry;
c[4] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 5
MULADD128(a[0], b[5], uv, carry, uv);
t += carry;
MULADD128(a[4], b[1], uv, carry, uv);
t += carry;
MULADD128(a[3], b[2], uv, carry, uv);
t += carry;
MULADD128(a[2], b[3], uv, carry, uv);
t += carry;
MULADD128(a[1], b[4], uv, carry, uv);
t += carry;
MULADD128(a[5], b[0], uv, carry, uv);
t += carry;
c[5] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 6
MULADD128(a[0], b[6], uv, carry, uv);
t += carry;
MULADD128(a[5], b[1], uv, carry, uv);
t += carry;
MULADD128(a[4], b[2], uv, carry, uv);
t += carry;
MULADD128(a[3], b[3], uv, carry, uv);
t += carry;
MULADD128(a[2], b[4], uv, carry, uv);
t += carry;
MULADD128(a[1], b[5], uv, carry, uv);
t += carry;
MULADD128(a[6], b[0], uv, carry, uv);
t += carry;
c[6] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 7
MULADD128(a[0], b[7], uv, carry, uv);
t += carry;
MULADD128(a[6], b[1], uv, carry, uv);
t += carry;
MULADD128(a[5], b[2], uv, carry, uv);
t += carry;
MULADD128(a[4], b[3], uv, carry, uv);
t += carry;
MULADD128(a[3], b[4], uv, carry, uv);
t += carry;
MULADD128(a[2], b[5], uv, carry, uv);
t += carry;
MULADD128(a[1], b[6], uv, carry, uv);
t += carry;
MULADD128(a[7], b[0], uv, carry, uv);
t += carry;
c[7] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 8
MULADD128(a[0], b[8], uv, carry, uv);
t += carry;
MULADD128(a[7], b[1], uv, carry, uv);
t += carry;
MULADD128(a[6], b[2], uv, carry, uv);
t += carry;
MULADD128(a[5], b[3], uv, carry, uv);
t += carry;
MULADD128(a[4], b[4], uv, carry, uv);
t += carry;
MULADD128(a[3], b[5], uv, carry, uv);
t += carry;
MULADD128(a[2], b[6], uv, carry, uv);
t += carry;
MULADD128(a[1], b[7], uv, carry, uv);
t += carry;
MULADD128(a[8], b[0], uv, carry, uv);
t += carry;
c[8] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 9
MULADD128(a[0], b[9], uv, carry, uv);
t += carry;
MULADD128(a[8], b[1], uv, carry, uv);
t += carry;
MULADD128(a[7], b[2], uv, carry, uv);
t += carry;
MULADD128(a[6], b[3], uv, carry, uv);
t += carry;
MULADD128(a[5], b[4], uv, carry, uv);
t += carry;
MULADD128(a[4], b[5], uv, carry, uv);
t += carry;
MULADD128(a[3], b[6], uv, carry, uv);
t += carry;
MULADD128(a[2], b[7], uv, carry, uv);
t += carry;
MULADD128(a[1], b[8], uv, carry, uv);
t += carry;
MULADD128(a[9], b[0], uv, carry, uv);
t += carry;
c[9] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 10
MULADD128(a[0], b[10], uv, carry, uv);
t += carry;
MULADD128(a[9], b[1], uv, carry, uv);
t += carry;
MULADD128(a[8], b[2], uv, carry, uv);
t += carry;
MULADD128(a[7], b[3], uv, carry, uv);
t += carry;
MULADD128(a[6], b[4], uv, carry, uv);
t += carry;
MULADD128(a[5], b[5], uv, carry, uv);
t += carry;
MULADD128(a[4], b[6], uv, carry, uv);
t += carry;
MULADD128(a[3], b[7], uv, carry, uv);
t += carry;
MULADD128(a[2], b[8], uv, carry, uv);
t += carry;
MULADD128(a[1], b[9], uv, carry, uv);
t += carry;
MULADD128(a[10], b[0], uv, carry, uv);
t += carry;
c[10] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 11 (the longest column: 12 products)
MULADD128(a[0], b[11], uv, carry, uv);
t += carry;
MULADD128(a[10], b[1], uv, carry, uv);
t += carry;
MULADD128(a[9], b[2], uv, carry, uv);
t += carry;
MULADD128(a[8], b[3], uv, carry, uv);
t += carry;
MULADD128(a[7], b[4], uv, carry, uv);
t += carry;
MULADD128(a[6], b[5], uv, carry, uv);
t += carry;
MULADD128(a[5], b[6], uv, carry, uv);
t += carry;
MULADD128(a[4], b[7], uv, carry, uv);
t += carry;
MULADD128(a[3], b[8], uv, carry, uv);
t += carry;
MULADD128(a[2], b[9], uv, carry, uv);
t += carry;
MULADD128(a[1], b[10], uv, carry, uv);
t += carry;
MULADD128(a[11], b[0], uv, carry, uv);
t += carry;
c[11] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 12
MULADD128(a[1], b[11], uv, carry, uv);
t += carry;
MULADD128(a[10], b[2], uv, carry, uv);
t += carry;
MULADD128(a[9], b[3], uv, carry, uv);
t += carry;
MULADD128(a[8], b[4], uv, carry, uv);
t += carry;
MULADD128(a[7], b[5], uv, carry, uv);
t += carry;
MULADD128(a[6], b[6], uv, carry, uv);
t += carry;
MULADD128(a[5], b[7], uv, carry, uv);
t += carry;
MULADD128(a[4], b[8], uv, carry, uv);
t += carry;
MULADD128(a[3], b[9], uv, carry, uv);
t += carry;
MULADD128(a[2], b[10], uv, carry, uv);
t += carry;
MULADD128(a[11], b[1], uv, carry, uv);
t += carry;
c[12] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 13
MULADD128(a[11], b[2], uv, carry, uv);
t += carry;
MULADD128(a[10], b[3], uv, carry, uv);
t += carry;
MULADD128(a[9], b[4], uv, carry, uv);
t += carry;
MULADD128(a[8], b[5], uv, carry, uv);
t += carry;
MULADD128(a[7], b[6], uv, carry, uv);
t += carry;
MULADD128(a[6], b[7], uv, carry, uv);
t += carry;
MULADD128(a[5], b[8], uv, carry, uv);
t += carry;
MULADD128(a[4], b[9], uv, carry, uv);
t += carry;
MULADD128(a[3], b[10], uv, carry, uv);
t += carry;
MULADD128(a[2], b[11], uv, carry, uv);
t += carry;
c[13] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 14
MULADD128(a[11], b[3], uv, carry, uv);
t += carry;
MULADD128(a[10], b[4], uv, carry, uv);
t += carry;
MULADD128(a[9], b[5], uv, carry, uv);
t += carry;
MULADD128(a[8], b[6], uv, carry, uv);
t += carry;
MULADD128(a[7], b[7], uv, carry, uv);
t += carry;
MULADD128(a[6], b[8], uv, carry, uv);
t += carry;
MULADD128(a[5], b[9], uv, carry, uv);
t += carry;
MULADD128(a[4], b[10], uv, carry, uv);
t += carry;
MULADD128(a[3], b[11], uv, carry, uv);
t += carry;
c[14] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 15
MULADD128(a[11], b[4], uv, carry, uv);
t += carry;
MULADD128(a[10], b[5], uv, carry, uv);
t += carry;
MULADD128(a[9], b[6], uv, carry, uv);
t += carry;
MULADD128(a[8], b[7], uv, carry, uv);
t += carry;
MULADD128(a[7], b[8], uv, carry, uv);
t += carry;
MULADD128(a[6], b[9], uv, carry, uv);
t += carry;
MULADD128(a[5], b[10], uv, carry, uv);
t += carry;
MULADD128(a[4], b[11], uv, carry, uv);
t += carry;
c[15] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 16
MULADD128(a[11], b[5], uv, carry, uv);
t += carry;
MULADD128(a[10], b[6], uv, carry, uv);
t += carry;
MULADD128(a[9], b[7], uv, carry, uv);
t += carry;
MULADD128(a[8], b[8], uv, carry, uv);
t += carry;
MULADD128(a[7], b[9], uv, carry, uv);
t += carry;
MULADD128(a[6], b[10], uv, carry, uv);
t += carry;
MULADD128(a[5], b[11], uv, carry, uv);
t += carry;
c[16] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 17
MULADD128(a[11], b[6], uv, carry, uv);
t += carry;
MULADD128(a[10], b[7], uv, carry, uv);
t += carry;
MULADD128(a[9], b[8], uv, carry, uv);
t += carry;
MULADD128(a[8], b[9], uv, carry, uv);
t += carry;
MULADD128(a[7], b[10], uv, carry, uv);
t += carry;
MULADD128(a[6], b[11], uv, carry, uv);
t += carry;
c[17] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 18
MULADD128(a[11], b[7], uv, carry, uv);
t += carry;
MULADD128(a[10], b[8], uv, carry, uv);
t += carry;
MULADD128(a[9], b[9], uv, carry, uv);
t += carry;
MULADD128(a[8], b[10], uv, carry, uv);
t += carry;
MULADD128(a[7], b[11], uv, carry, uv);
t += carry;
c[18] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 19
MULADD128(a[11], b[8], uv, carry, uv);
t += carry;
MULADD128(a[10], b[9], uv, carry, uv);
t += carry;
MULADD128(a[9], b[10], uv, carry, uv);
t += carry;
MULADD128(a[8], b[11], uv, carry, uv);
t += carry;
c[19] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 20
MULADD128(a[11], b[9], uv, carry, uv);
t += carry;
MULADD128(a[10], b[10], uv, carry, uv);
t += carry;
MULADD128(a[9], b[11], uv, carry, uv);
t += carry;
c[20] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 21
MULADD128(a[11], b[10], uv, carry, uv);
t += carry;
MULADD128(a[10], b[11], uv, carry, uv);
t += carry;
c[21] = uv[0];
uv[0] = uv[1];
uv[1] = t;
// Column 22: the single remaining product; both accumulator words become the top two words of c
// and the carry reported by MULADD128 is not used here.
MULADD128(a[11], b[11], uv, carry, uv);
c[22] = uv[0];
c[23] = uv[1];
#elif (OS_TARGET == OS_LINUX)
mul751_asm(a, b, c);
#endif
}
void rdc_mont(const dfelm_t ma, felm_t mc)
{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p751.
// mc = ma*R^-1 mod p751x2, where R = 2^768.
// If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1].
// ma is assumed to be in Montgomery representation.
//
// Structure of the OS_WIN branch (the OS_LINUX branch calls rdc751_asm()):
//  - mc[0..4] are copied straight from ma[0..4].
//  - Columns 5..11 accumulate products mc[j]*p751p1[k-j] in uv/t, add ma[k] and store the low word in
//    mc[k]. Only words 5..11 of p751p1 are ever read here.
//  - Columns 12..22 do the same but store their results back into mc[0..10], each word being
//    overwritten only after its last use as a multiplier; the final step also adds ma[23] to produce
//    mc[11].
// NOTE(review): MUL128/MULADD128/ADDC are defined outside this block; presumably MUL128 sets uv = a*b,
// MULADD128 computes uv += a*b with carry-out, and ADDC is add-with-carry - confirm.
// NOTE(review): skipping p751p1[0..4] presumably relies on those words being zero - confirm.
#if (OS_TARGET == OS_WIN)
unsigned int carry;
digit_t t = 0;
uint128_t uv = {0};
mc[0] = ma[0];
mc[1] = ma[1];
mc[2] = ma[2];
mc[3] = ma[3];
mc[4] = ma[4];
// Column 5: a single product, started with MUL128 (there is no previous accumulator content)
MUL128(mc[0], ((digit_t *)p751p1)[5], uv);
ADDC(0, uv[0], ma[5], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
mc[5] = uv[0];
uv[0] = uv[1];
uv[1] = 0;
// Column 6 (the carry of the first MULADD128 is not added to t; uv[1] was just cleared above)
MULADD128(mc[0], ((digit_t *)p751p1)[6], uv, carry, uv);
MULADD128(mc[1], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[6], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[6] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 7
MULADD128(mc[0], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[7], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[7] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 8
MULADD128(mc[0], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[8], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[8] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 9
MULADD128(mc[0], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[9], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[9] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 10
MULADD128(mc[0], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[10], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[10] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 11
MULADD128(mc[0], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[1], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[11], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[11] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 12: from here on results are written back into mc[0], mc[1], ... (mc[0] is not read again)
MULADD128(mc[1], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[2], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[12], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[0] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 13
MULADD128(mc[2], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[3], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[13], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[1] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 14
MULADD128(mc[3], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[4], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[14], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[2] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 15
MULADD128(mc[4], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[5], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[15], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[3] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 16
MULADD128(mc[5], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[6], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t *)p751p1)[5], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[16], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[4] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 17
MULADD128(mc[6], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[7], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t *)p751p1)[6], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[17], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[5] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 18
MULADD128(mc[7], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[8], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t *)p751p1)[7], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[18], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[6] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 19
MULADD128(mc[8], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[9], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t *)p751p1)[8], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[19], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[7] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 20
MULADD128(mc[9], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[10], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t *)p751p1)[9], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[20], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[8] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 21
MULADD128(mc[10], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
MULADD128(mc[11], ((digit_t *)p751p1)[10], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[21], carry, uv[0]);
ADDC(carry, uv[1], 0, carry, uv[1]);
t += carry;
mc[9] = uv[0];
uv[0] = uv[1];
uv[1] = t;
t = 0;
// Column 22 and the top word: the low word plus ma[22] goes to mc[10]; the high word plus that carry
// plus ma[23] goes to mc[11]. The value accumulated in t and the final carry are not used.
MULADD128(mc[11], ((digit_t *)p751p1)[11], uv, carry, uv);
t += carry;
ADDC(0, uv[0], ma[22], carry, mc[10]);
ADDC(carry, uv[1], 0, carry, uv[1]);
ADDC(0, uv[1], ma[23], carry, mc[11]);
#elif (OS_TARGET == OS_LINUX)
rdc751_asm(ma, mc);
#endif
}

+ 2644
- 0
sidh_ref/fp_x64_asm.S
文件差異過大導致無法顯示
查看文件


+ 474
- 0
sidh_ref/fpx.c 查看文件

@@ -0,0 +1,474 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: core functions over GF(p) and GF(p^2)
*********************************************************************************************/
#include "P751_internal.h"
__inline void fpcopy(const felm_t a, felm_t c)
{ // Copies one field element word by word, c <- a (NWORDS_FIELD digits, lowest index first).
    unsigned int w = 0;

    while (w < NWORDS_FIELD)
    {
        c[w] = a[w];
        w++;
    }
}
__inline void fpzero(felm_t a)
{ // Clears a field element, a <- 0 (all NWORDS_FIELD digits).
    unsigned int w = 0;

    while (w < NWORDS_FIELD)
    {
        a[w] = 0;
        w++;
    }
}
void to_mont(const felm_t a, felm_t mc)
{ // Conversion to Montgomery representation,
// mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1].
// The Montgomery constant R^2 mod p is the global value "Montgomery_R2".
// Implemented as a single Montgomery multiplication of a by Montgomery_R2.
fpmul_mont(a, (digit_t *)&Montgomery_R2, mc);
}
void from_mont(const felm_t ma, felm_t c)
{ // Leaves Montgomery representation: c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1].
  // A Montgomery multiplication by the plain integer 1 strips one factor of R; the final correction
  // then reduces the result with fpcorrection().
    digit_t unit[NWORDS_FIELD] = {1};   // the integer 1: lowest digit 1, all remaining digits 0

    fpmul_mont(ma, unit, c);
    fpcorrection(c);
}
void copy_words(const digit_t *a, digit_t *c, const unsigned int nwords)
{ // Copies nwords word-sized digits from a to c, lowest index first.
    unsigned int w = 0;

    while (w < nwords)
    {
        c[w] = a[w];
        w++;
    }
}
void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc)
{ // Field multiplication with Montgomery reduction, mc = ma*mb mod p.
  // The double-length product is formed with mp_mul() and then reduced with rdc_mont().
    dfelm_t product = {0};

    mp_mul(ma, mb, product, NWORDS_FIELD);
    rdc_mont(product, mc);
}
void fpsqr_mont(const felm_t ma, felm_t mc)
{ // Field squaring with Montgomery reduction, mc = ma^2 mod p.
  // The double-length square is formed with mp_mul(ma, ma) and then reduced with rdc_mont().
    dfelm_t square = {0};

    mp_mul(ma, ma, square, NWORDS_FIELD);
    rdc_mont(square, mc);
}
void fpinv_mont(felm_t a)
{ // Field inversion in place using Montgomery arithmetic, a = a^(-1)*R mod p.
  // A copy of a is run through fpinv_chain_mont(), squared twice and finally multiplied by a.
  // NOTE(review): fpinv_chain_mont() is defined elsewhere; presumably it raises its argument to a fixed
  // power chosen so that these last three steps complete the exponent of the inverse - confirm.
    felm_t acc;
    unsigned int k;

    fpcopy(a, acc);
    fpinv_chain_mont(acc);
    for (k = 0; k < 2; k++)
    {
        fpsqr_mont(acc, acc);   // acc = acc^2, two times (acc^4 overall)
    }
    fpmul_mont(a, acc, a);
}
void fp2copy(const f2elm_t a, f2elm_t c)
{ // GF(p^2) element copy, c = a. Each of the two coordinates is copied with fpcopy().
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpcopy(a[part], c[part]);
    }
}
void fp2zero(f2elm_t a)
{ // GF(p^2) element reset, a = 0. Both coordinates are cleared with fpzero().
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpzero(a[part]);
    }
}
void fp2neg(f2elm_t a)
{ // In-place GF(p^2) negation, a = -a. Each coordinate is negated with fpneg().
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpneg(a[part]);
    }
}
__inline void fp2add(const f2elm_t a, const f2elm_t b, f2elm_t c)
{ // Coordinate-wise GF(p^2) addition, c = a+b.
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpadd(a[part], b[part], c[part]);
    }
}
__inline void fp2sub(const f2elm_t a, const f2elm_t b, f2elm_t c)
{ // Coordinate-wise GF(p^2) subtraction, c = a-b.
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpsub(a[part], b[part], c[part]);
    }
}
void fp2div2(const f2elm_t a, f2elm_t c)
{ // Coordinate-wise GF(p^2) halving, c = a/2.
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpdiv2(a[part], c[part]);
    }
}
void fp2correction(f2elm_t a)
{ // Modular correction of a GF(p^2) element: fpcorrection() is applied to each coordinate in place.
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpcorrection(a[part]);
    }
}
__inline static void mp_addfast(const digit_t *a, const digit_t *b, digit_t *c)
{ // Field-length multiprecision addition, c = a+b.
  // Thin wrapper that forwards to the assembly routine mp_add_asm().
    mp_add_asm(a, b, c);
    return;
}
__inline static void mp_addfastx2(const digit_t *a, const digit_t *b, digit_t *c)
{ // Double-length multiprecision addition, c = a+b.
  // Thin wrapper that forwards to the assembly routine mp_addx2_asm().
    mp_addx2_asm(a, b, c);
    return;
}
void fp2sqr_mont(const f2elm_t a, f2elm_t c)
{ // GF(p^2) squaring with Montgomery arithmetic, c = a^2.
  // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1]
  // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
  // All three temporaries are computed before c is written, so c may alias a.
    felm_t sum, diff, twice_re;

    mp_addfast(a[0], a[1], sum);          // sum      = a0+a1
    fpsub(a[0], a[1], diff);              // diff     = a0-a1
    mp_addfast(a[0], a[0], twice_re);     // twice_re = 2*a0

    fpmul_mont(sum, diff, c[0]);          // c0 = (a0+a1)(a0-a1)
    fpmul_mont(twice_re, a[1], c[1]);     // c1 = 2*a0*a1
}
__inline unsigned int mp_sub(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords.
  // The borrow is propagated from digit 0 upwards; the final borrow bit is returned.
    unsigned int borrow = 0;
    unsigned int k = 0;

    while (k < nwords) {
        SUBC(borrow, a[k], b[k], borrow, c[k]);
        k++;
    }
    return borrow;
}
__inline static digit_t mp_subfast(const digit_t *a, const digit_t *b, digit_t *c)
{ // Double-length multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD.
  // Forwards to the assembly routine mp_subx2_asm() and passes its result through:
  // mask = 0xFF..F if c < 0, otherwise mask = 0x00..0.
    digit_t mask = mp_subx2_asm(a, b, c);

    return mask;
}
void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c)
{ // GF(p^2) multiplication with Montgomery arithmetic, c = a*b.
  // Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1]
  // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1]
  // Uses three integer multiplications; every read of a and b happens before the
  // first write to c, so c may alias either input.
    felm_t sum_a, sum_b;
    dfelm_t prod_re, prod_im, real_acc;
    digit_t neg_mask;
    unsigned int k, carry = 0;

    mp_mul(a[0], b[0], prod_re, NWORDS_FIELD);         // prod_re = a0*b0
    mp_mul(a[1], b[1], prod_im, NWORDS_FIELD);         // prod_im = a1*b1
    mp_addfast(a[0], a[1], sum_a);                     // sum_a = a0+a1
    mp_addfast(b[0], b[1], sum_b);                     // sum_b = b0+b1

    // real_acc = a0*b0 - a1*b1; neg_mask is all ones when the difference is negative
    neg_mask = mp_subfast(prod_re, prod_im, real_acc);
    // Conditionally add the prime into the upper half of real_acc (selected by neg_mask)
    for (k = 0; k < NWORDS_FIELD; k++) {
        ADDC(carry, real_acc[NWORDS_FIELD + k], ((digit_t *)PRIME)[k] & neg_mask, carry, real_acc[NWORDS_FIELD + k]);
    }
    rdc_mont(real_acc, c[0]);                          // c0 = a0*b0 - a1*b1

    mp_addfastx2(prod_re, prod_im, prod_re);           // prod_re = a0*b0 + a1*b1
    mp_mul(sum_a, sum_b, prod_im, NWORDS_FIELD);       // prod_im = (a0+a1)*(b0+b1)
    mp_subfast(prod_im, prod_re, prod_im);             // prod_im = (a0+a1)*(b0+b1) - a0*b0 - a1*b1
    rdc_mont(prod_im, c[1]);                           // c1 = a1*b0 + a0*b1
}
void fpinv_chain_mont(felm_t a)
{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic. The result overwrites a.
  // The fixed sequence of squarings and table multiplications is stored as data:
  // every step squares the accumulator "sqr" times and then multiplies it by table
  // entry t[idx]; idx = -1 selects the input a itself instead of a table entry.
    static const struct { unsigned char sqr; signed char idx; } chain[] = {
        { 6, 20}, { 6, 24}, { 6, 11}, { 6,  8}, { 8,  2}, { 6, 23}, { 6,  2}, { 9,  2},
        {10, 15}, { 8, 13}, { 8, 26}, { 8, 20}, { 6, 11}, { 6, 10}, { 6, 14}, { 6,  4},
        {10, 18}, { 6,  1}, { 7, 22}, {10,  6}, { 7, 24}, { 6,  9}, { 8, 18}, { 6, 17},
        { 8, -1}, {10, 16}, { 6,  7}, { 6,  0}, { 7, 12}, { 7, 19}, { 6, 22}, { 6, 25},
        { 7,  2}, { 6, 10}, { 7, 22}, { 8, 18}, { 6,  4}, { 6, 14}, { 7, 13}, { 6,  5},
        { 6, 23}, { 6, 21}, { 6,  2}, { 7, 23}, { 8, 12}, { 6,  9}, { 6,  3}, { 7, 13},
        { 7, 17}, { 8, 26}, { 8,  5}, { 8,  8}, { 6,  2}, { 6, 11}, { 7, 20}
    };
    const unsigned int nsteps = (unsigned int)(sizeof(chain) / sizeof(chain[0]));
    unsigned int i, j, step;
    felm_t t[27], tt;

    // Precomputed table: t[0] = a*a^2 and each following entry is the previous one times a^2;
    // entries 3, 9, 21 and 25 are multiplied by a^2 one extra time.
    fpsqr_mont(a, tt);
    fpmul_mont(a, tt, t[0]);
    for (i = 1; i < 27; i++) {
        fpmul_mont(t[i - 1], tt, t[i]);
        if (i == 3 || i == 9 || i == 21 || i == 25) {
            fpmul_mont(t[i], tt, t[i]);
        }
    }

    // Main chain, starting from tt = a
    fpcopy(a, tt);
    for (step = 0; step < nsteps; step++) {
        const digit_t *factor = (chain[step].idx < 0) ? a : t[chain[step].idx];

        for (i = 0; i < chain[step].sqr; i++) {
            fpsqr_mont(tt, tt);
        }
        fpmul_mont(factor, tt, tt);
    }

    // Tail: 61 rounds of six squarings followed by a multiplication by t[26]
    for (j = 0; j < 61; j++) {
        for (i = 0; i < 6; i++) {
            fpsqr_mont(tt, tt);
        }
        fpmul_mont(t[26], tt, tt);
    }
    fpcopy(tt, a);
}
void fp2inv_mont(f2elm_t a)
{ // In-place GF(p^2) inversion with Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2).
  // Only one base-field inversion is needed, on the sum a0^2+a1^2.
    f2elm_t sq;
    unsigned int part;

    for (part = 0; part < 2; part++) {
        fpsqr_mont(a[part], sq[part]);        // sq[part] = a_part^2
    }
    fpadd(sq[0], sq[1], sq[0]);               // sq[0] = a0^2+a1^2
    fpinv_mont(sq[0]);                        // sq[0] = (a0^2+a1^2)^-1
    fpneg(a[1]);                              // a = a0-i*a1
    for (part = 0; part < 2; part++) {
        fpmul_mont(a[part], sq[0], a[part]);  // a = (a0-i*a1)*(a0^2+a1^2)^-1
    }
}
void to_fp2mont(const f2elm_t a, f2elm_t mc)
{ // Bring a GF(p^2) element into Montgomery representation,
  // mc_i = a_i*R^2*R^(-1) = a_i*R, by applying to_mont() to each coordinate.
    unsigned int part;

    for (part = 0; part < 2; part++) {
        to_mont(a[part], mc[part]);
    }
}
void from_fp2mont(const f2elm_t ma, f2elm_t c)
{ // Bring a GF(p^2) element back from Montgomery to standard representation,
  // c_i = ma_i*R^(-1) = a_i, by applying from_mont() to each coordinate.
    unsigned int part;

    for (part = 0; part < 2; part++) {
        from_mont(ma[part], c[part]);
    }
}
__inline unsigned int mp_add(const digit_t *a, const digit_t *b, digit_t *c, const unsigned int nwords)
{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords.
  // The carry is propagated from digit 0 upwards; the final carry bit is returned.
    unsigned int carry = 0;
    unsigned int k = 0;

    while (k < nwords) {
        ADDC(carry, a[k], b[k], carry, c[k]);
        k++;
    }
    return carry;
}
void mp_shiftleft(digit_t *x, unsigned int shift, const unsigned int nwords)
{ // Multiprecision left shift in place, x = x << shift, where lng(x) = nwords.
  // The shift is split into a whole-digit move followed by a sub-digit bit shift.
  // Bits shifted beyond the most significant digit are discarded.
    unsigned int i;
    unsigned int j = shift / RADIX;     // number of whole digits to move

    // Remaining bit shift, always in [0, RADIX-1]. Reducing with ">= RADIX" semantics
    // matters: a shift that is an exact multiple of RADIX must leave 0 here, since
    // shifting a digit by its full width is undefined.
    shift %= RADIX;

    if (j >= nwords)
    { // Everything is shifted out (also covers nwords == 0)
        for (i = 0; i < nwords; i++)
            x[i] = 0;
        return;
    }

    // Move digits up by j positions, highest digit first so no source is overwritten early
    for (i = nwords; i-- > j;)
        x[i] = x[i - j];
    for (i = 0; i < j; i++)
        x[i] = 0;

    if (shift != 0)
    {
        for (i = nwords - 1; i > 0; i--)
            SHIFTL(x[i], x[i - 1], shift, x[i], RADIX);
        x[0] <<= shift;
    }
}
void mp_shiftr1(digit_t *x, const unsigned int nwords)
{ // Multiprecision right shift by one bit, in place.
  // Every digit except the top one takes its new high bit from the digit above it.
    unsigned int k = 0;

    while (k < nwords - 1)
    {
        SHIFTR(x[k + 1], x[k], 1, x[k], RADIX);
        k++;
    }
    x[nwords - 1] >>= 1;
}
void mp_shiftl1(digit_t *x, const unsigned int nwords)
{ // Multiprecision left shift by one bit, in place.
  // Digits are processed from the top down so each one still sees the unshifted digit below it.
    int k = nwords - 1;

    while (k > 0)
    {
        SHIFTL(x[k], x[k - 1], 1, x[k], RADIX);
        k--;
    }
    x[0] <<= 1;
}

+ 43
- 0
sidh_ref/random/random.c 查看文件

@@ -0,0 +1,43 @@
/********************************************************************************************
* Random number generation function that reads from the operating system's /dev/urandom device
*********************************************************************************************/
#include "random.h"
#include <stdlib.h>
#include <unistd.h>
#include <fcntl.h>
static int lock = -1;
static __inline void delay(unsigned int count)
{ // Busy-wait for "count" loop iterations; used as a crude back-off between retries.
  // The counter is volatile so that the otherwise empty loop is not removed by the optimizer.
    volatile unsigned int remaining = count;

    while (remaining--) {}
}
int randombytes(unsigned char* random_array, unsigned long long nbytes)
{ // Generation of "nbytes" of random values, read from /dev/urandom into random_array.
  // The descriptor is opened on first use and cached in the file-scope variable "lock".
  // Failed open()/read() calls are retried after a short delay. Always returns 0.
    unsigned long long filled = 0;   // full-width count: no truncation of nbytes to int
    ssize_t got;                     // read() returns ssize_t, not int

    if (lock == -1) {
        do {
            lock = open("/dev/urandom", O_RDONLY);
            if (lock == -1) {
                delay(0xFFFFF);
            }
        } while (lock == -1);
    }

    while (filled < nbytes) {
        // Bound each request so the byte count always fits in read()'s return type
        unsigned long long want = nbytes - filled;
        size_t chunk = (want > 0x40000000ULL) ? (size_t)0x40000000ULL : (size_t)want;

        got = read(lock, random_array + filled, chunk);
        if (got < 0) {
            delay(0xFFFF);
            continue;
        }
        filled += (unsigned long long)got;
    }

    return 0;
}

+ 9
- 0
sidh_ref/random/random.h 查看文件

@@ -0,0 +1,9 @@
// Include guard for the random-number interface.
#ifndef __RANDOM_H__
#define __RANDOM_H__
// Generate random bytes and output the result to random_array.
//   random_array: destination buffer; it must have room for nbytes bytes.
//   nbytes:       number of random bytes to produce.
int randombytes(unsigned char* random_array, unsigned long long nbytes);
#endif

二進制
查看文件


+ 573
- 0
sidh_ref/sha3/fips202.c 查看文件

@@ -0,0 +1,573 @@
/********************************************************************************************
* SHA3-derived functions: SHAKE and cSHAKE
*
* Based on the public domain implementation in crypto_hash/keccakc512/simple/
* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer
* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202
* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe
*
* See NIST Special Publication 800-185 for more information:
* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf
*
*********************************************************************************************/
#include <stdint.h>
#include <assert.h>
#include "fips202.h"
#define NROUNDS 24
// Rotate the 64-bit value a left by offset bits; offset must be in [1, 63].
// Both arguments are parenthesised so that expression arguments expand correctly;
// note that each argument is evaluated twice.
#define ROL(a, offset) (((a) << (offset)) ^ ((a) >> (64 - (offset))))
static uint64_t load64(const unsigned char *x)
{ // Assemble a 64-bit value from 8 bytes stored in little-endian order (x[0] is the least significant byte).
    unsigned long long value = 0;
    int pos;

    for (pos = 7; pos >= 0; --pos) {
        value = (value << 8) | (unsigned long long)x[pos];
    }
    return value;
}
static void store64(uint8_t *x, uint64_t u)
{ // Write the 64-bit value u to x as 8 bytes in little-endian order (x[0] receives the least significant byte).
    unsigned int pos;

    for (pos = 0; pos < 8; ++pos) {
        x[pos] = (uint8_t)(u >> (8 * pos));
    }
}
// Per-round constants of the permutation; entry r is XORed into lane 0 during round r.
static const uint64_t KeccakF_RoundConstants[NROUNDS] =
{
    0x0000000000000001ULL, 0x0000000000008082ULL,
    0x800000000000808aULL, 0x8000000080008000ULL,
    0x000000000000808bULL, 0x0000000080000001ULL,
    0x8000000080008081ULL, 0x8000000000008009ULL,
    0x000000000000008aULL, 0x0000000000000088ULL,
    0x0000000080008009ULL, 0x000000008000000aULL,
    0x000000008000808bULL, 0x800000000000008bULL,
    0x8000000000008089ULL, 0x8000000000008003ULL,
    0x8000000000008002ULL, 0x8000000000000080ULL,
    0x000000000000800aULL, 0x800000008000000aULL,
    0x8000000080008081ULL, 0x8000000000008080ULL,
    0x0000000080000001ULL, 0x8000000080008008ULL
};
// Apply the NROUNDS-round permutation in place to the 25-lane, 64-bit state.
// The state is copied into local lane variables (A..), processed two rounds per
// loop iteration (A -> E, then E -> A), and copied back at the end.
// Lane naming: the second letter (b, g, k, m, s) selects the group of five
// consecutive state words and the third letter (a, e, i, o, u) the word within
// that group, so Aba = state[0], Abe = state[1], ..., Asu = state[24].
void KeccakF1600_StatePermute(uint64_t * state)
{
int round;
uint64_t Aba, Abe, Abi, Abo, Abu;
uint64_t Aga, Age, Agi, Ago, Agu;
uint64_t Aka, Ake, Aki, Ako, Aku;
uint64_t Ama, Ame, Ami, Amo, Amu;
uint64_t Asa, Ase, Asi, Aso, Asu;
uint64_t BCa, BCe, BCi, BCo, BCu;
uint64_t Da, De, Di, Do, Du;
uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
uint64_t Ega, Ege, Egi, Ego, Egu;
uint64_t Eka, Eke, Eki, Eko, Eku;
uint64_t Ema, Eme, Emi, Emo, Emu;
uint64_t Esa, Ese, Esi, Eso, Esu;
// Load the state into the local A lanes
Aba = state[ 0];
Abe = state[ 1];
Abi = state[ 2];
Abo = state[ 3];
Abu = state[ 4];
Aga = state[ 5];
Age = state[ 6];
Agi = state[ 7];
Ago = state[ 8];
Agu = state[ 9];
Aka = state[10];
Ake = state[11];
Aki = state[12];
Ako = state[13];
Aku = state[14];
Ama = state[15];
Ame = state[16];
Ami = state[17];
Amo = state[18];
Amu = state[19];
Asa = state[20];
Ase = state[21];
Asi = state[22];
Aso = state[23];
Asu = state[24];
// Two rounds per iteration: the first reads A and writes E, the second reads E and writes A
for( round = 0; round < NROUNDS; round += 2 )
{
// Column parities of A (XOR of the five lanes that share a position within their group)
BCa = Aba^Aga^Aka^Ama^Asa;
BCe = Abe^Age^Ake^Ame^Ase;
BCi = Abi^Agi^Aki^Ami^Asi;
BCo = Abo^Ago^Ako^Amo^Aso;
BCu = Abu^Agu^Aku^Amu^Asu;
// Round "round": A -> E.
// D* combine each neighbouring pair of column parities (one of them rotated by 1)
Da = BCu^ROL(BCe, 1);
De = BCa^ROL(BCi, 1);
Di = BCe^ROL(BCo, 1);
Do = BCi^ROL(BCu, 1);
Du = BCo^ROL(BCa, 1);
// Each group below: XOR D into five A lanes, rotate them into BC*, then form one
// output group as BCx ^ (~BCy & BCz)
Aba ^= Da;
BCa = Aba;
Age ^= De;
BCe = ROL(Age, 44);
Aki ^= Di;
BCi = ROL(Aki, 43);
Amo ^= Do;
BCo = ROL(Amo, 21);
Asu ^= Du;
BCu = ROL(Asu, 14);
Eba = BCa ^((~BCe)& BCi );
// The round constant is folded into lane "ba" only
Eba ^= (uint64_t)KeccakF_RoundConstants[round];
Ebe = BCe ^((~BCi)& BCo );
Ebi = BCi ^((~BCo)& BCu );
Ebo = BCo ^((~BCu)& BCa );
Ebu = BCu ^((~BCa)& BCe );
Abo ^= Do;
BCa = ROL(Abo, 28);
Agu ^= Du;
BCe = ROL(Agu, 20);
Aka ^= Da;
BCi = ROL(Aka, 3);
Ame ^= De;
BCo = ROL(Ame, 45);
Asi ^= Di;
BCu = ROL(Asi, 61);
Ega = BCa ^((~BCe)& BCi );
Ege = BCe ^((~BCi)& BCo );
Egi = BCi ^((~BCo)& BCu );
Ego = BCo ^((~BCu)& BCa );
Egu = BCu ^((~BCa)& BCe );
Abe ^= De;
BCa = ROL(Abe, 1);
Agi ^= Di;
BCe = ROL(Agi, 6);
Ako ^= Do;
BCi = ROL(Ako, 25);
Amu ^= Du;
BCo = ROL(Amu, 8);
Asa ^= Da;
BCu = ROL(Asa, 18);
Eka = BCa ^((~BCe)& BCi );
Eke = BCe ^((~BCi)& BCo );
Eki = BCi ^((~BCo)& BCu );
Eko = BCo ^((~BCu)& BCa );
Eku = BCu ^((~BCa)& BCe );
Abu ^= Du;
BCa = ROL(Abu, 27);
Aga ^= Da;
BCe = ROL(Aga, 36);
Ake ^= De;
BCi = ROL(Ake, 10);
Ami ^= Di;
BCo = ROL(Ami, 15);
Aso ^= Do;
BCu = ROL(Aso, 56);
Ema = BCa ^((~BCe)& BCi );
Eme = BCe ^((~BCi)& BCo );
Emi = BCi ^((~BCo)& BCu );
Emo = BCo ^((~BCu)& BCa );
Emu = BCu ^((~BCa)& BCe );
Abi ^= Di;
BCa = ROL(Abi, 62);
Ago ^= Do;
BCe = ROL(Ago, 55);
Aku ^= Du;
BCi = ROL(Aku, 39);
Ama ^= Da;
BCo = ROL(Ama, 41);
Ase ^= De;
BCu = ROL(Ase, 2);
Esa = BCa ^((~BCe)& BCi );
Ese = BCe ^((~BCi)& BCo );
Esi = BCi ^((~BCo)& BCu );
Eso = BCo ^((~BCu)& BCa );
Esu = BCu ^((~BCa)& BCe );
// Column parities of E
BCa = Eba^Ega^Eka^Ema^Esa;
BCe = Ebe^Ege^Eke^Eme^Ese;
BCi = Ebi^Egi^Eki^Emi^Esi;
BCo = Ebo^Ego^Eko^Emo^Eso;
BCu = Ebu^Egu^Eku^Emu^Esu;
// Round "round+1": E -> A, same steps with the roles of A and E swapped
Da = BCu^ROL(BCe, 1);
De = BCa^ROL(BCi, 1);
Di = BCe^ROL(BCo, 1);
Do = BCi^ROL(BCu, 1);
Du = BCo^ROL(BCa, 1);
Eba ^= Da;
BCa = Eba;
Ege ^= De;
BCe = ROL(Ege, 44);
Eki ^= Di;
BCi = ROL(Eki, 43);
Emo ^= Do;
BCo = ROL(Emo, 21);
Esu ^= Du;
BCu = ROL(Esu, 14);
Aba = BCa ^((~BCe)& BCi );
Aba ^= (uint64_t)KeccakF_RoundConstants[round+1];
Abe = BCe ^((~BCi)& BCo );
Abi = BCi ^((~BCo)& BCu );
Abo = BCo ^((~BCu)& BCa );
Abu = BCu ^((~BCa)& BCe );
Ebo ^= Do;
BCa = ROL(Ebo, 28);
Egu ^= Du;
BCe = ROL(Egu, 20);
Eka ^= Da;
BCi = ROL(Eka, 3);
Eme ^= De;
BCo = ROL(Eme, 45);
Esi ^= Di;
BCu = ROL(Esi, 61);
Aga = BCa ^((~BCe)& BCi );
Age = BCe ^((~BCi)& BCo );
Agi = BCi ^((~BCo)& BCu );
Ago = BCo ^((~BCu)& BCa );
Agu = BCu ^((~BCa)& BCe );
Ebe ^= De;
BCa = ROL(Ebe, 1);
Egi ^= Di;
BCe = ROL(Egi, 6);
Eko ^= Do;
BCi = ROL(Eko, 25);
Emu ^= Du;
BCo = ROL(Emu, 8);
Esa ^= Da;
BCu = ROL(Esa, 18);
Aka = BCa ^((~BCe)& BCi );
Ake = BCe ^((~BCi)& BCo );
Aki = BCi ^((~BCo)& BCu );
Ako = BCo ^((~BCu)& BCa );
Aku = BCu ^((~BCa)& BCe );
Ebu ^= Du;
BCa = ROL(Ebu, 27);
Ega ^= Da;
BCe = ROL(Ega, 36);
Eke ^= De;
BCi = ROL(Eke, 10);
Emi ^= Di;
BCo = ROL(Emi, 15);
Eso ^= Do;
BCu = ROL(Eso, 56);
Ama = BCa ^((~BCe)& BCi );
Ame = BCe ^((~BCi)& BCo );
Ami = BCi ^((~BCo)& BCu );
Amo = BCo ^((~BCu)& BCa );
Amu = BCu ^((~BCa)& BCe );
Ebi ^= Di;
BCa = ROL(Ebi, 62);
Ego ^= Do;
BCe = ROL(Ego, 55);
Eku ^= Du;
BCi = ROL(Eku, 39);
Ema ^= Da;
BCo = ROL(Ema, 41);
Ese ^= De;
BCu = ROL(Ese, 2);
Asa = BCa ^((~BCe)& BCi );
Ase = BCe ^((~BCi)& BCo );
Asi = BCi ^((~BCo)& BCu );
Aso = BCo ^((~BCu)& BCa );
Asu = BCu ^((~BCa)& BCe );
}
// Store the local A lanes back into the state
state[ 0] = Aba;
state[ 1] = Abe;
state[ 2] = Abi;
state[ 3] = Abo;
state[ 4] = Abu;
state[ 5] = Aga;
state[ 6] = Age;
state[ 7] = Agi;
state[ 8] = Ago;
state[ 9] = Agu;
state[10] = Aka;
state[11] = Ake;
state[12] = Aki;
state[13] = Ako;
state[14] = Aku;
state[15] = Ama;
state[16] = Ame;
state[17] = Ami;
state[18] = Amo;
state[19] = Amu;
state[20] = Asa;
state[21] = Ase;
state[22] = Asi;
state[23] = Aso;
state[24] = Asu;
// NOTE(review): "round" is a local variable here, not a macro, so this #undef has no visible effect.
#undef round
}
#include <string.h>
// Smaller of a and b. The selected argument is evaluated twice, so avoid arguments with side effects.
#define MIN(a, b) ((a) < (b) ? (a) : (b))
static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen, unsigned char p)
{ // Absorb the mlen-byte message m into the state s with rate r (in bytes).
  // Full blocks are XORed in and permuted. The remaining bytes are then padded
  // (domain byte p right after the message, bit 0x80 set in the last byte of the
  // block) and XORed in WITHOUT a final permutation. The state is only XORed into,
  // never initialised here.
    unsigned char last[200];
    const unsigned int lanes = r / 8;
    unsigned long long k;

    // Full blocks
    for (; mlen >= r; mlen -= r, m += r)
    {
        for (k = 0; k < lanes; ++k)
            s[k] ^= load64(m + 8 * k);
        KeccakF1600_StatePermute(s);
    }

    // Final partial block: message tail followed by zero fill
    for (k = 0; k < r; ++k)
        last[k] = (k < mlen) ? m[k] : 0;
    last[mlen] = p;
    last[r - 1] |= 128;

    for (k = 0; k < lanes; ++k)
        s[k] ^= load64(last + 8 * k);
}
static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r)
{ // Squeeze nblocks full blocks of r bytes each from the state s into h.
  // The state is permuted before every block is written out.
    const unsigned int lanes = r / 8;
    unsigned int lane;

    for (; nblocks > 0; nblocks--, h += r)
    {
        KeccakF1600_StatePermute(s);
        for (lane = 0; lane < lanes; lane++)
            store64(h + 8 * lane, s[lane]);
    }
}
/********** SHAKE128 ***********/
void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen)
{ // SHAKE128 absorb phase: rate SHAKE128_RATE, domain byte 0x1F.
  // The state s is XORed into, not initialised, by this call.
    const unsigned long long len = inputByteLen;

    keccak_absorb(s, SHAKE128_RATE, input, len, 0x1F);
}
void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s)
{ // SHAKE128 squeeze phase: writes nblocks blocks of SHAKE128_RATE bytes each to output.
    keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
    return;
}
void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen)
{ // One-shot SHAKE128: absorb inlen bytes of input, then write outlen bytes to output.
    uint64_t state[25] = {0};
    unsigned char last[SHAKE128_RATE];
    const unsigned long long full = outlen / SHAKE128_RATE;           // whole output blocks
    const unsigned long long tail = outlen - full * SHAKE128_RATE;    // leftover bytes
    size_t k;

    /* Absorb input */
    keccak_absorb(state, SHAKE128_RATE, input, inlen, 0x1F);

    /* Squeeze whole blocks directly into the caller's buffer */
    keccak_squeezeblocks(output, full, state, SHAKE128_RATE);

    /* Squeeze one more block into scratch and copy only the bytes still needed */
    if (tail != 0)
    {
        unsigned char *dst = output + full * SHAKE128_RATE;

        keccak_squeezeblocks(last, 1, state, SHAKE128_RATE);
        for (k = 0; k < tail; k++)
            dst[k] = last[k];
    }
}
/********** cSHAKE128 ***********/
void cshake128_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen)
{ // cSHAKE128 ("simple" variant with a 16-bit customization value) absorb phase.
  // Resets the state, absorbs the fixed 8-byte customization prefix, permutes once,
  // then absorbs inlen bytes of input with domain byte 0x04.
    unsigned int i;

    for (i = 0; i < 25; i++)
        s[i] = 0;

    /* Absorb customization (domain-separation) string.
       The prefix bytes 01 a8 01 00 01 10 <cstm low> <cstm high> are placed in lane 0
       as a little-endian value, built arithmetically so the result does not depend
       on the byte order of the host. */
    s[0] = 0x10010001a801ULL | ((uint64_t)cstm << 48);
    KeccakF1600_StatePermute(s);

    /* Absorb input */
    keccak_absorb(s, SHAKE128_RATE, in, inlen, 0x04);
}
void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s)
{ // cSHAKE128 squeeze phase: writes nblocks blocks of SHAKE128_RATE bytes each to output.
    keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
    return;
}
void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen)
{ // One-shot cSHAKE128 with 16-bit customization value cstm: absorb inlen bytes of in,
  // then write outlen bytes to output.
    uint64_t state[25];                       // initialised by cshake128_simple_absorb()
    unsigned char last[SHAKE128_RATE];
    const unsigned long long full = outlen / SHAKE128_RATE;
    const unsigned long long tail = outlen % SHAKE128_RATE;
    unsigned int k;

    cshake128_simple_absorb(state, cstm, in, inlen);

    /* Squeeze whole blocks directly into the caller's buffer */
    keccak_squeezeblocks(output, full, state, SHAKE128_RATE);

    /* Squeeze one more block into scratch and copy only the bytes still needed */
    if (tail != 0)
    {
        unsigned char *dst = output + full * SHAKE128_RATE;

        keccak_squeezeblocks(last, 1, state, SHAKE128_RATE);
        for (k = 0; k < tail; k++)
            dst[k] = last[k];
    }
}
/********** SHAKE256 ***********/
void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen)
{ // SHAKE256 absorb phase: rate SHAKE256_RATE, domain byte 0x1F.
  // The state s is XORed into, not initialised, by this call.
    const unsigned long long len = inputByteLen;

    keccak_absorb(s, SHAKE256_RATE, input, len, 0x1F);
}
void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s)
{ // SHAKE256 squeeze phase: writes nblocks blocks of SHAKE256_RATE bytes each to output.
    keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE);
    return;
}
void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen)
{ // One-shot SHAKE256: absorb inlen bytes of input, then write outlen bytes to output.
    uint64_t state[25];
    unsigned char last[SHAKE256_RATE];
    const unsigned long long full = outlen / SHAKE256_RATE;           // whole output blocks
    const unsigned long long tail = outlen - full * SHAKE256_RATE;    // leftover bytes
    size_t k;

    for (k = 0; k < 25; ++k)
        state[k] = 0;

    /* Absorb input */
    keccak_absorb(state, SHAKE256_RATE, input, inlen, 0x1F);

    /* Squeeze whole blocks directly into the caller's buffer */
    keccak_squeezeblocks(output, full, state, SHAKE256_RATE);

    /* Squeeze one more block into scratch and copy only the bytes still needed */
    if (tail != 0)
    {
        unsigned char *dst = output + full * SHAKE256_RATE;

        keccak_squeezeblocks(last, 1, state, SHAKE256_RATE);
        for (k = 0; k < tail; k++)
            dst[k] = last[k];
    }
}
/********** cSHAKE256 ***********/
void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen)
{ // cSHAKE256 ("simple" variant with a 16-bit customization value) absorb phase.
  // Resets the state, absorbs the fixed 8-byte customization prefix, permutes once,
  // then absorbs inlen bytes of input with domain byte 0x04.
    unsigned int i;

    for (i = 0; i < 25; i++)
        s[i] = 0;

    /* Absorb customization (domain-separation) string.
       The prefix bytes 01 88 01 00 01 10 <cstm low> <cstm high> are placed in lane 0
       as a little-endian value, built arithmetically so the result does not depend
       on the byte order of the host. */
    s[0] = 0x100100018801ULL | ((uint64_t)cstm << 48);
    KeccakF1600_StatePermute(s);

    /* Absorb input */
    keccak_absorb(s, SHAKE256_RATE, in, inlen, 0x04);
}
void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s)
{ // cSHAKE256 squeeze phase: writes nblocks blocks of SHAKE256_RATE bytes each to output.
    keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE);
    return;
}
void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen)
{ // One-shot cSHAKE256 with 16-bit customization value cstm: absorb inlen bytes of in,
  // then write outlen bytes to output.
    uint64_t state[25];                       // initialised by cshake256_simple_absorb()
    unsigned char last[SHAKE256_RATE];
    const unsigned long long full = outlen / SHAKE256_RATE;
    const unsigned long long tail = outlen % SHAKE256_RATE;
    unsigned int k;

    cshake256_simple_absorb(state, cstm, in, inlen);

    /* Squeeze whole blocks directly into the caller's buffer */
    keccak_squeezeblocks(output, full, state, SHAKE256_RATE);

    /* Squeeze one more block into scratch and copy only the bytes still needed */
    if (tail != 0)
    {
        unsigned char *dst = output + full * SHAKE256_RATE;

        keccak_squeezeblocks(last, 1, state, SHAKE256_RATE);
        for (k = 0; k < tail; k++)
            dst[k] = last[k];
    }
}

+ 27
- 0
sidh_ref/sha3/fips202.h 查看文件

@@ -0,0 +1,27 @@
// Public interface of the SHAKE / cSHAKE functions implemented in fips202.c.
#ifndef FIPS202_H
#define FIPS202_H
#include <stdint.h>
// Block sizes ("rates") in bytes: the number of bytes absorbed or squeezed per permutation call.
#define SHAKE128_RATE 168
#define SHAKE256_RATE 136
// SHAKE128.
// Incremental interface: the caller owns the 25-word state s and must zero it before absorbing;
// squeezeblocks writes nblocks * SHAKE128_RATE bytes to output.
void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen);
void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s);
// One-shot interface: hashes inlen bytes of input into outlen bytes of output.
void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen);
// cSHAKE128 with a 16-bit customization value cstm.
// The absorb function initialises the state s itself.
void cshake128_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen);
void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s);
void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen);
// SHAKE256: same interface as SHAKE128, with block size SHAKE256_RATE.
void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen);
void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s);
void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen);
// cSHAKE256 with a 16-bit customization value cstm: same interface as cSHAKE128.
void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen);
void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s);
void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen);
#endif

二進制
查看文件


+ 345
- 0
sidh_ref/sidh.c 查看文件

@@ -0,0 +1,345 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH)
*********************************************************************************************/
#include "P751_internal.h"
#include "random/random.h"
#include <stdio.h>
static void clear_words(void *mem, digit_t nwords)
{ // Zero "nwords" digits starting at mem.
  // The stores go through a volatile-qualified pointer so the compiler does not optimize the clearing away.
    volatile digit_t *dst = mem;
    unsigned int k = 0;

    while (k < nwords)
    {
        dst[k] = 0;
        k++;
    }
}
static void init_basis(digit_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR)
{ // Initialization of basis points from the packed generator table "gen".
  // gen holds five consecutive field elements: XP[0], XP[1], XQ[0], XR[0], XR[1].
  // XQ[1] is not stored in the table and is set to zero.
    const digit_t *src = gen;

    fpcopy(src, XP[0]);
    src += NWORDS_FIELD;
    fpcopy(src, XP[1]);
    src += NWORDS_FIELD;
    fpcopy(src, XQ[0]);
    fpzero(XQ[1]);
    src += NWORDS_FIELD;
    fpcopy(src, XR[0]);
    src += NWORDS_FIELD;
    fpcopy(src, XR[1]);
}
static void fp2_encode(const f2elm_t x, unsigned char *enc)
{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes.
  // The first FP2_ENCODED_BYTES/2 bytes of each coordinate are written to enc back to back.
    f2elm_t std;
    const unsigned int half = FP2_ENCODED_BYTES / 2;
    const unsigned char *coord0, *coord1;
    unsigned int k;

    from_fp2mont(x, std);
    coord0 = (unsigned char *)std;
    coord1 = coord0 + MAXBITS_FIELD / 8;      // the second coordinate starts MAXBITS_FIELD/8 bytes in

    for (k = 0; k < half; k++)
    {
        enc[k] = coord0[k];
        enc[k + half] = coord1[k];
    }
}
static void fp2_decode(const unsigned char *enc, f2elm_t x)
{ // Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation.
  // x is cleared first, so bytes not covered by the encoding stay zero.
    unsigned char *raw = (unsigned char *)x;
    const unsigned int stride = MAXBITS_FIELD / 8;    // byte offset of the second coordinate
    const unsigned int half = FP2_ENCODED_BYTES / 2;
    unsigned int k;

    for (k = 0; k < 2 * stride; k++)
        raw[k] = 0;

    for (k = 0; k < half; k++)
    {
        raw[k] = enc[k];
        raw[k + stride] = enc[k + half];
    }
    to_fp2mont(x, x);
}
void random_mod_order_A(unsigned char *random_digits)
{ // Generation of Alice's secret key
  // Outputs random value in [0, 2^eA - 1]
  // The buffer is cleared over MAXWORDS_ORDER digits, filled with random bytes,
  // and the last random byte is masked with MASK_ALICE.
    const unsigned long long len = NBITS_TO_NBYTES(OALICE_BITS);

    clear_words((void *)random_digits, MAXWORDS_ORDER);
    randombytes(random_digits, len);
    random_digits[len - 1] &= MASK_ALICE;
}
void random_mod_order_B(unsigned char *random_digits)
{ // Generation of Bob's secret key
  // Outputs random value in [0, 2^Floor(Log(2, oB)) - 1]
  // The buffer is cleared over MAXWORDS_ORDER digits, filled with random bytes,
  // and the last random byte is masked with MASK_BOB.
    const unsigned long long len = NBITS_TO_NBYTES(OBOB_BITS - 1);

    clear_words((void *)random_digits, MAXWORDS_ORDER);
    randombytes(random_digits, len);
    random_digits[len - 1] &= MASK_BOB;
}
int EphemeralKeyGeneration_A(const unsigned char *PrivateKeyA, unsigned char *PublicKeyA)
{ // Alice's ephemeral public key generation
  // Input:  a private key PrivateKeyA in the range [0, 2^eA - 1].
  // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes.
  // Always returns 0.
    point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE];
    f2elm_t XPA, XQA, XRA, coeff[3], A24plus = {0}, C24 = {0}, A = {0};
    unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0;

    // Initialize basis points: Alice's own basis, plus Bob's basis whose images are published
    init_basis((digit_t *)A_gen, XPA, XQA, XRA);
    init_basis((digit_t *)B_gen, phiP->X, phiQ->X, phiR->X);
    fpcopy((digit_t *)&Montgomery_one, (phiP->Z)[0]);
    fpcopy((digit_t *)&Montgomery_one, (phiQ->Z)[0]);
    fpcopy((digit_t *)&Montgomery_one, (phiR->Z)[0]);

    // Initialize constants: A24plus = 1, C24 = 2 (Montgomery form)
    fpcopy((digit_t *)&Montgomery_one, A24plus[0]);
    fp2add(A24plus, A24plus, C24);

    // Retrieve kernel point
    LADDER3PT(XPA, XQA, XRA, (digit_t *)PrivateKeyA, ALICE, R, A);

    // Traverse tree
    index = 0;
    for (row = 1; row < MAX_Alice; row++)
    {
        while (index < MAX_Alice - row)
        { // Save the current point, then advance R by the next entry of the strategy
            fp2copy(R->X, pts[npts]->X);
            fp2copy(R->Z, pts[npts]->Z);
            pts_index[npts++] = index;
            m = strat_Alice[ii++];
            xDBLe(R, R, A24plus, C24, (int)(2 * m));
            index += m;
        }
        get_4_isog(R, A24plus, C24, coeff);

        // Push every saved point and the three basis images through the isogeny
        for (i = 0; i < npts; i++)
        {
            eval_4_isog(pts[i], coeff);
        }
        eval_4_isog(phiP, coeff);
        eval_4_isog(phiQ, coeff);
        eval_4_isog(phiR, coeff);

        // Resume from the most recently saved point
        fp2copy(pts[npts - 1]->X, R->X);
        fp2copy(pts[npts - 1]->Z, R->Z);
        index = pts_index[npts - 1];
        npts -= 1;
    }

    // Last isogeny step
    get_4_isog(R, A24plus, C24, coeff);
    eval_4_isog(phiP, coeff);
    eval_4_isog(phiQ, coeff);
    eval_4_isog(phiR, coeff);

    // Invert the three Z coordinates together and normalise X = X * Z^-1
    inv_3_way(phiP->Z, phiQ->Z, phiR->Z);
    fp2mul_mont(phiP->X, phiP->Z, phiP->X);
    fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X);
    fp2mul_mont(phiR->X, phiR->Z, phiR->X);

    // Format public key
    fp2_encode(phiP->X, PublicKeyA);
    fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES);
    fp2_encode(phiR->X, PublicKeyA + 2 * FP2_ENCODED_BYTES);

    return 0;
}
int EphemeralKeyGeneration_B(const unsigned char *PrivateKeyB, unsigned char *PublicKeyB)
{ // Bob's ephemeral public key generation
  // Input:  a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1].
  // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes.
  // Always returns 0.
    point_proj_t kernel, imgP = {0}, imgQ = {0}, imgR = {0}, saved[MAX_INT_POINTS_BOB];
    f2elm_t baseP, baseQ, baseR, coeff[3], A24plus = {0}, A24minus = {0}, A = {0};
    unsigned int saved_index[MAX_INT_POINTS_BOB];
    unsigned int k, row, step, index = 0, depth = 0, next = 0;

    // Bob's own basis, plus Alice's basis whose images are published
    init_basis((digit_t *)B_gen, baseP, baseQ, baseR);
    init_basis((digit_t *)A_gen, imgP->X, imgQ->X, imgR->X);
    fpcopy((digit_t *)&Montgomery_one, (imgP->Z)[0]);
    fpcopy((digit_t *)&Montgomery_one, (imgQ->Z)[0]);
    fpcopy((digit_t *)&Montgomery_one, (imgR->Z)[0]);

    // Constants: A24plus = 2, A24minus = -2 (Montgomery form)
    fpcopy((digit_t *)&Montgomery_one, A24plus[0]);
    fp2add(A24plus, A24plus, A24plus);
    fp2copy(A24plus, A24minus);
    fp2neg(A24minus);

    // Retrieve kernel point
    LADDER3PT(baseP, baseQ, baseR, (digit_t *)PrivateKeyB, BOB, kernel, A);

    // Traverse tree
    index = 0;
    for (row = 1; row < MAX_Bob; row++)
    {
        while (index < MAX_Bob - row)
        { // Save the current point, then advance the kernel by the next entry of the strategy
            fp2copy(kernel->X, saved[depth]->X);
            fp2copy(kernel->Z, saved[depth]->Z);
            saved_index[depth] = index;
            depth++;
            step = strat_Bob[next];
            next++;
            xTPLe(kernel, kernel, A24minus, A24plus, (int)step);
            index += step;
        }
        get_3_isog(kernel, A24minus, A24plus, coeff);

        // Push every saved point and the three basis images through the isogeny
        for (k = 0; k < depth; k++)
        {
            eval_3_isog(saved[k], coeff);
        }
        eval_3_isog(imgP, coeff);
        eval_3_isog(imgQ, coeff);
        eval_3_isog(imgR, coeff);

        // Resume from the most recently saved point
        depth--;
        fp2copy(saved[depth]->X, kernel->X);
        fp2copy(saved[depth]->Z, kernel->Z);
        index = saved_index[depth];
    }

    // Last isogeny step
    get_3_isog(kernel, A24minus, A24plus, coeff);
    eval_3_isog(imgP, coeff);
    eval_3_isog(imgQ, coeff);
    eval_3_isog(imgR, coeff);

    // Invert the three Z coordinates together and normalise X = X * Z^-1
    inv_3_way(imgP->Z, imgQ->Z, imgR->Z);
    fp2mul_mont(imgP->X, imgP->Z, imgP->X);
    fp2mul_mont(imgQ->X, imgQ->Z, imgQ->X);
    fp2mul_mont(imgR->X, imgR->Z, imgR->X);

    // Format public key
    fp2_encode(imgP->X, PublicKeyB);
    fp2_encode(imgQ->X, PublicKeyB + FP2_ENCODED_BYTES);
    fp2_encode(imgR->X, PublicKeyB + 2 * FP2_ENCODED_BYTES);

    return 0;
}
int EphemeralSecretAgreement_A(const unsigned char *PrivateKeyA, const unsigned char *PublicKeyB, unsigned char *SharedSecretA)
{ // Alice's ephemeral shared secret computation
  // It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB
  // Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1].
  //         Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes.
  // Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes.
  // Always returns 0.
    point_proj_t kernel, saved[MAX_INT_POINTS_ALICE];
    f2elm_t coeff[3], PKB[3], jinv;
    f2elm_t A24plus = {0}, C24 = {0}, A = {0};
    unsigned int saved_index[MAX_INT_POINTS_ALICE];
    unsigned int k, row, step, index = 0, depth = 0, next = 0;

    // Initialize images of Bob's basis
    for (k = 0; k < 3; k++)
    {
        fp2_decode(PublicKeyB + k * FP2_ENCODED_BYTES, PKB[k]);
    }

    // Initialize constants
    get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A?
    fpadd((digit_t *)&Montgomery_one, (digit_t *)&Montgomery_one, C24[0]);   // C24 = 2
    fp2add(A, C24, A24plus);                                                 // A24plus = A+2
    fpadd(C24[0], C24[0], C24[0]);                                           // C24 = 4

    // Retrieve kernel point
    LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t *)PrivateKeyA, ALICE, kernel, A);

    // Traverse tree
    index = 0;
    for (row = 1; row < MAX_Alice; row++)
    {
        while (index < MAX_Alice - row)
        { // Save the current point, then advance the kernel by the next entry of the strategy
            fp2copy(kernel->X, saved[depth]->X);
            fp2copy(kernel->Z, saved[depth]->Z);
            saved_index[depth] = index;
            depth++;
            step = strat_Alice[next];
            next++;
            xDBLe(kernel, kernel, A24plus, C24, (int)(2 * step));
            index += step;
        }
        get_4_isog(kernel, A24plus, C24, coeff);

        // Push every saved point through the isogeny
        for (k = 0; k < depth; k++)
        {
            eval_4_isog(saved[k], coeff);
        }

        // Resume from the most recently saved point
        depth--;
        fp2copy(saved[depth]->X, kernel->X);
        fp2copy(saved[depth]->Z, kernel->Z);
        index = saved_index[depth];
    }

    // Last isogeny step, then adjust the constants before computing the j-invariant
    get_4_isog(kernel, A24plus, C24, coeff);
    fp2div2(C24, C24);
    fp2sub(A24plus, C24, A24plus);
    fp2div2(C24, C24);
    j_inv(A24plus, C24, jinv);
    fp2_encode(jinv, SharedSecretA); // Format shared secret

    return 0;
}
int EphemeralSecretAgreement_B(const unsigned char *PrivateKeyB, const unsigned char *PublicKeyA, unsigned char *SharedSecretB)
{ // Bob's ephemeral shared secret computation.
  // Derives SharedSecretB from his secret key PrivateKeyB and Alice's public key PublicKeyA.
  // Inputs:  PrivateKeyB   - an integer in the range [0, 2^Floor(Log(2,oB)) - 1].
  //          PublicKeyA    - 3 elements in GF(p^2), encoded by removing leading 0 bytes.
  // Output:  SharedSecretB - one element in GF(p^2), encoded by removing leading 0 bytes.
  // Always returns 0.
    point_proj_t kernel, saved[MAX_INT_POINTS_BOB];
    f2elm_t isog[3], basis[3], jinv;
    f2elm_t A24plus = {0}, A24minus = {0}, A = {0};
    unsigned int saved_at[MAX_INT_POINTS_BOB];
    unsigned int k, row, step;
    unsigned int index = 0, nsaved = 0, next = 0;

    // Decode the three GF(p^2) elements that make up Alice's public key
    fp2_decode(PublicKeyA, basis[0]);
    fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, basis[1]);
    fp2_decode(PublicKeyA + 2 * FP2_ENCODED_BYTES, basis[2]);

    // Curve constants: with two = 1 + 1, A24plus = A + two and A24minus = A - two
    get_A(basis[0], basis[1], basis[2], A); // TODO: Can return projective A?
    fpadd((digit_t *)&Montgomery_one, (digit_t *)&Montgomery_one, A24minus[0]);
    fp2add(A, A24minus, A24plus);
    fp2sub(A, A24minus, A24minus);

    // Kernel point from the three-point ladder
    LADDER3PT(basis[0], basis[1], basis[2], (digit_t *)PrivateKeyB, BOB, kernel, A);

    // Tree traversal driven by the strat_Bob table
    for (row = 1; row < MAX_Bob; row++) {
        while (index < MAX_Bob - row) {
            // Push the current point, then advance it by the next strategy step
            fp2copy(kernel->X, saved[nsaved]->X);
            fp2copy(kernel->Z, saved[nsaved]->Z);
            saved_at[nsaved] = index;
            nsaved++;
            step = strat_Bob[next];
            next++;
            xTPLe(kernel, kernel, A24minus, A24plus, (int)step);
            index += step;
        }

        // Compute the isogeny at this row and push every saved point through it
        get_3_isog(kernel, A24minus, A24plus, isog);
        k = 0;
        while (k < nsaved) {
            eval_3_isog(saved[k], isog);
            k++;
        }

        // Pop the most recently saved point; it becomes the working point
        nsaved--;
        fp2copy(saved[nsaved]->X, kernel->X);
        fp2copy(saved[nsaved]->Z, kernel->Z);
        index = saved_at[nsaved];
    }

    // Last isogeny, then rework the constants into the pair handed to j_inv:
    // A <- 2*(A24plus + A24minus) and A24plus <- A24plus - A24minus
    get_3_isog(kernel, A24minus, A24plus, isog);
    fp2add(A24plus, A24minus, A);
    fp2add(A, A, A);
    fp2sub(A24plus, A24minus, A24plus);
    j_inv(A, A24plus, jinv);

    // Format shared secret
    fp2_encode(jinv, SharedSecretB);
    return 0;
}

+ 99
- 0
sidh_ref/sike.c 查看文件

@@ -0,0 +1,99 @@
/********************************************************************************************
* Supersingular Isogeny Key Encapsulation Library
*
* Abstract: supersingular isogeny key encapsulation (SIKE) protocol
*********************************************************************************************/
#include <string.h>
#include "P751_internal.h"
#include "sha3/fips202.h"
int crypto_kem_keypair(unsigned char *pk, unsigned char *sk)
{ // SIKE's key generation
  // Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes)
  //          public key pk (CRYPTO_PUBLICKEYBYTES bytes)
  // Layout of sk as written here: s (MSG_BYTES) || SK (SECRETKEY_B_BYTES) || copy of pk.
  // Always returns 0.
    unsigned char *s_part = sk;
    unsigned char *sk_b_part = sk + MSG_BYTES;
    unsigned char *pk_part = sk + MSG_BYTES + SECRETKEY_B_BYTES;

    // Lower portion of the secret key: random s followed by Bob's private key
    randombytes(s_part, MSG_BYTES);
    random_mod_order_B(sk_b_part);

    // Public key derived from Bob's private key
    EphemeralKeyGeneration_B(sk_b_part, pk);

    // Keep a copy of the public key at the end of the secret key
    memcpy(pk_part, pk, CRYPTO_PUBLICKEYBYTES);
    return 0;
}
int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk)
{ // SIKE's encapsulation
  // Input:   public key pk         (CRYPTO_PUBLICKEYBYTES bytes)
  // Outputs: shared secret ss      (CRYPTO_BYTES bytes)
  //          ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes)
  // Always returns 0.
    // Customization values passed to cshake256_simple for the three hash uses
    const uint16_t G = 0;
    const uint16_t H = 1;
    const uint16_t P = 2;
    unsigned char ephemeralsk[SECRETKEY_A_BYTES];
    unsigned char jinvariant[FP2_ENCODED_BYTES];
    unsigned char h[MSG_BYTES];
    unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES];
    unsigned char *msg = temp;                           // random message m
    unsigned char *tail = temp + MSG_BYTES;              // holds pk first, ct later
    unsigned char *c1 = ct + CRYPTO_PUBLICKEYBYTES;      // masked-message part of ct
    unsigned int i;

    // Generate ephemeralsk <- G(m||pk) mod oA
    randombytes(msg, MSG_BYTES);
    memcpy(tail, pk, CRYPTO_PUBLICKEYBYTES);
    cshake256_simple(ephemeralsk, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES);
    ephemeralsk[SECRETKEY_A_BYTES - 1] &= MASK_ALICE;

    // Encrypt: ct = (Alice's ephemeral public key) || (m XOR P(j-invariant))
    EphemeralKeyGeneration_A(ephemeralsk, ct);
    EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant);
    cshake256_simple(h, MSG_BYTES, P, jinvariant, FP2_ENCODED_BYTES);
    for (i = 0; i < MSG_BYTES; i++) {
        c1[i] = msg[i] ^ h[i];
    }

    // Generate shared secret ss <- H(m||ct)
    memcpy(tail, ct, CRYPTO_CIPHERTEXTBYTES);
    cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES);
    return 0;
}
int crypto_kem_dec(unsigned char *ss, const unsigned char *ct, const unsigned char *sk)
{ // SIKE's decapsulation
  // Input:   secret key sk         (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes)
  //          ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes)
  // Outputs: shared secret ss      (CRYPTO_BYTES bytes)
  // Always returns 0. When the re-encryption check fails, ss is derived from the
  // secret value s stored at the start of sk instead of the decrypted message.
  //
  // The re-encryption check and the choice between m and s are done without memcmp
  // and without a data-dependent branch, so that source-level control flow does not
  // depend on whether the ciphertext was valid.
    // Customization values passed to cshake256_simple for the three hash uses
    const uint16_t G = 0;
    const uint16_t H = 1;
    const uint16_t P = 2;
    unsigned char ephemeralsk_[SECRETKEY_A_BYTES];
    unsigned char jinvariant_[FP2_ENCODED_BYTES];
    unsigned char h_[MSG_BYTES];
    unsigned char c0_[CRYPTO_PUBLICKEYBYTES];
    unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES];
    unsigned char diff = 0;
    unsigned char mask;
    unsigned int i;

    // Decrypt: m' = c1 XOR P(j-invariant), stored at the front of temp
    EphemeralSecretAgreement_B(sk + MSG_BYTES, ct, jinvariant_);
    cshake256_simple(h_, MSG_BYTES, P, jinvariant_, FP2_ENCODED_BYTES);
    for (i = 0; i < MSG_BYTES; i++) {
        temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i];
    }

    // Generate ephemeralsk_ <- G(m||pk) mod oA
    memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES);
    cshake256_simple(ephemeralsk_, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES);
    ephemeralsk_[SECRETKEY_A_BYTES - 1] &= MASK_ALICE;

    // Re-encrypt and compare against the received c0 over the full length,
    // accumulating every differing bit instead of stopping at the first mismatch
    EphemeralKeyGeneration_A(ephemeralsk_, c0_);
    for (i = 0; i < CRYPTO_PUBLICKEYBYTES; i++) {
        diff |= (unsigned char)(c0_[i] ^ ct[i]);
    }

    // mask = 0xFF when any byte differed, 0x00 otherwise:
    // (diff + 0xFF) >> 8 is 1 exactly when diff != 0
    mask = (unsigned char)(0u - (((unsigned int)diff + 0xFFu) >> 8));

    // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct):
    // masked select replaces m' with s only when the check failed
    for (i = 0; i < MSG_BYTES; i++) {
        temp[i] ^= (unsigned char)(mask & (temp[i] ^ sk[i]));
    }
    memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES);
    cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES);
    return 0;
}

Loading…
取消
儲存