diff --git a/CMakeLists.txt b/CMakeLists.txt
index 4eb0d0d6..a43fec27 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -446,6 +446,7 @@ add_subdirectory(ssl/test)
 add_subdirectory(fipstools)
 add_subdirectory(tool)
 add_subdirectory(decrepit)
+add_subdirectory(third_party/sidh)
 
 if(FUZZ)
   if(LIBFUZZER_FROM_DEPS)
diff --git a/include/openssl/nid.h b/include/openssl/nid.h
index afeb2dea..e12ebf3e 100644
--- a/include/openssl/nid.h
+++ b/include/openssl/nid.h
@@ -4194,6 +4194,9 @@ extern "C" {
 #define SN_X25519 "X25519"
 #define NID_X25519 948
 
+#define SN_x25519sidh503 "x25519sidh503"
+#define NID_x25519sidh503 0x0105
+
 #define SN_ED25519 "ED25519"
 #define NID_ED25519 949
 #define OBJ_ED25519 1L, 3L, 101L, 112L
diff --git a/include/openssl/ssl.h b/include/openssl/ssl.h
index d6169816..1d93d671 100644
--- a/include/openssl/ssl.h
+++ b/include/openssl/ssl.h
@@ -2177,6 +2177,7 @@ OPENSSL_EXPORT int SSL_set1_curves_list(SSL *ssl, const char *curves);
 #define SSL_CURVE_SECP384R1 24
 #define SSL_CURVE_SECP521R1 25
 #define SSL_CURVE_X25519 29
+#define SSL_CURVE_sidh503 0x0105
 
 // SSL_get_curve_id returns the ID of the curve used by |ssl|'s most recently
 // completed handshake or 0 if not applicable.
diff --git a/ssl/CMakeLists.txt b/ssl/CMakeLists.txt
index 6881089f..c08e93b7 100644
--- a/ssl/CMakeLists.txt
+++ b/ssl/CMakeLists.txt
@@ -58,3 +58,9 @@ if(WIN32)
   target_link_libraries(ssl_test ws2_32)
 endif()
 add_dependencies(all_tests ssl_test)
+
+
+if(EXP_SIDH)
+  add_definitions(-DBORINGSSL_USE_SIDH)
+  target_link_libraries(ssl sidh503)
+endif()
\ No newline at end of file
diff --git a/ssl/handshake_client.cc b/ssl/handshake_client.cc
index cb9b6dec..4765b8d9 100644
--- a/ssl/handshake_client.cc
+++ b/ssl/handshake_client.cc
@@ -985,6 +985,7 @@ static enum ssl_hs_wait_t do_read_server_key_exchange(SSL_HANDSHAKE *hs) {
         !hs->peer_key.CopyFrom(point)) {
       return ssl_hs_error;
     }
+    hs->key_share->SetInitiator(true);
   } else if (!(alg_k & SSL_kPSK)) {
     OPENSSL_PUT_ERROR(SSL, SSL_R_UNEXPECTED_MESSAGE);
     ssl_send_alert(ssl, SSL3_AL_FATAL, SSL_AD_UNEXPECTED_MESSAGE);
diff --git a/ssl/handshake_server.cc b/ssl/handshake_server.cc
index 0159c9e9..caf8b370 100644
--- a/ssl/handshake_server.cc
+++ b/ssl/handshake_server.cc
@@ -811,7 +811,10 @@ static enum ssl_hs_wait_t do_send_server_certificate(SSL_HANDSHAKE *hs) {
     hs->new_session->group_id = group_id;
 
     // Set up ECDH, generate a key, and emit the public half.
-    hs->key_share = SSLKeyShare::Create(group_id);
+    if ((hs->key_share = SSLKeyShare::Create(group_id)) == nullptr) {
+      return ssl_hs_error;
+    }
+    hs->key_share->SetInitiator(false);
     if (!hs->key_share ||
         !CBB_add_u8(cbb.get(), NAMED_CURVE_TYPE) ||
         !CBB_add_u16(cbb.get(), group_id) ||
diff --git a/ssl/internal.h b/ssl/internal.h
index 46c52486..dff62531 100644
--- a/ssl/internal.h
+++ b/ssl/internal.h
@@ -934,12 +934,14 @@ bool ssl_public_key_verify(SSL *ssl, Span<const uint8_t> signature,
 
 // SSLKeyShare abstracts over Diffie-Hellman-like key exchanges.
 class SSLKeyShare {
  public:
+  SSLKeyShare() : isInitiator(false) {}
   virtual ~SSLKeyShare() {}
   static constexpr bool kAllowUniquePtr = true;
   HAS_VIRTUAL_DESTRUCTOR
 
   // Create returns a SSLKeyShare instance for use with group |group_id| or
-  // nullptr on error.
+  // nullptr on error. The new share is marked as a responder's; a caller
+  // that initiates the handshake must additionally call |SetInitiator|.
   static UniquePtr<SSLKeyShare> Create(uint16_t group_id);
 
   // Create deserializes an SSLKeyShare instance previously serialized by
@@ -977,6 +979,13 @@ class SSLKeyShare {
   // Deserialize initializes the state of the key exchange from |in|, returning
   // true if successful and false otherwise. It is called by |Create|.
   virtual bool Deserialize(CBS *in) { return false; }
+
+  // SetInitiator records the role of the key share's owner: true for the
+  // initiator of the handshake, false for the responder.
+  void SetInitiator(bool flag) { isInitiator = flag; }
+
+ protected:
+  bool isInitiator;
 };
 
 // ssl_nid_to_group_id looks up the group corresponding to |nid|. On success, it
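For orientation: the handshake changes above always create the share, pin the role, and only then generate keys. The order matters because the SIDH half of the hybrid group generates different key types for the two sides (the initiator works in the 2-torsion, the responder in the 3-torsion). A minimal sketch of that call order, assuming BoringSSL's internal ssl/internal.h and a build with BORINGSSL_USE_SIDH defined (illustrative, not part of the patch):

    // Sketch only: the call order the handshake paths above follow.
    #include "ssl/internal.h"  // internal-only header, not a public API

    bool offer_hybrid_share(bssl::SSL_HANDSHAKE *hs, CBB *out, bool is_client) {
      hs->key_share = bssl::SSLKeyShare::Create(SSL_CURVE_sidh503);
      if (hs->key_share == nullptr) {
        return false;  // unknown group, or SIDH support compiled out
      }
      // Must happen before Offer(): the role selects which SIDH key type
      // (A for the initiator, B for the responder) Offer() generates.
      hs->key_share->SetInitiator(is_client);
      return hs->key_share->Offer(out);
    }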
diff --git a/ssl/ssl_key_share.cc b/ssl/ssl_key_share.cc
index c7f6f88f..93a4ddfc 100644
--- a/ssl/ssl_key_share.cc
+++ b/ssl/ssl_key_share.cc
@@ -30,6 +30,25 @@
 #include "internal.h"
 #include "../crypto/internal.h"
 
+#ifdef BORINGSSL_USE_SIDH
+extern "C"
+{
+  #include <P503_api.h>
+}
+
+namespace {
+  // Definitions for SIDH/P503
+  const size_t kSIDH503_PrvAKeyBitsSz = 250;  // Bit size of an SIDH private key (type A)
+  const size_t kSIDH503_PrvBKeyBitsSz = 252;  // Bit size of an SIDH private key (type B)
+  const size_t kSIDH503_PubKeyBytesSz = 378;  // Byte size of an SIDH public key
+  const size_t kSIDH_SsByteSz = 126;          // Byte size of the SIDH shared secret
+  const size_t kX25519_SsByteSz = 32;         // X25519 keys and shared secret alike
+
+  constexpr size_t BitsToBytes(size_t bits) {
+    return (bits + 7) / 8;
+  }
+}
+#endif
 
 namespace bssl {
 
@@ -211,16 +230,123 @@ class X25519KeyShare : public SSLKeyShare {
   uint8_t private_key_[32];
 };
 
+#ifdef BORINGSSL_USE_SIDH
+class SIDH503X25519KeyShare : public SSLKeyShare {
+ public:
+  SIDH503X25519KeyShare() {}
+  ~SIDH503X25519KeyShare() override {
+    OPENSSL_cleanse(private_x25519, sizeof(private_x25519));
+    OPENSSL_cleanse(private_SIDH, sizeof(private_SIDH));
+  }
+
+  uint16_t GroupID() const override {
+    return SSL_CURVE_sidh503;
+  }
+
+  bool Offer(CBB *out) override {
+    uint8_t public_x25519[32];
+    uint8_t public_SIDH[kSIDH503_PubKeyBytesSz];
+    const size_t prvKeyBitSz =
+        isInitiator ? kSIDH503_PrvAKeyBitsSz : kSIDH503_PrvBKeyBitsSz;
+
+    // Scoped BN_CTX
+    UniquePtr<BN_CTX> bn_ctx(BN_CTX_new());
+    if (!bn_ctx) {
+      return false;
+    }
+    BN_CTXScope scope(bn_ctx.get());
+
+    // Generate the SIDH private key: a random value of |prvKeyBitSz| bits,
+    // i.e. 250 bits for a type-A (initiator) key and 252 bits for a type-B
+    // (responder) key.
+    BIGNUM *bn_sidh_prv = BN_CTX_get(bn_ctx.get());
+    if (!bn_sidh_prv) {
+      return false;
+    }
+
+    if (!BN_rand(bn_sidh_prv, prvKeyBitSz, BN_RAND_TOP_ONE, BN_RAND_BOTTOM_ANY)) {
+      return false;
+    }
+
+    // Convert to little endian
+    if (!BN_bn2le_padded(private_SIDH, sizeof(private_SIDH), bn_sidh_prv)) {
+      return false;
+    }
+
+    X25519_keypair(public_x25519, private_x25519);
+    if (isInitiator) {
+      // Always returns 0
+      (void)EphemeralKeyGeneration_A_SIDHp503(private_SIDH, public_SIDH);
+    } else {
+      // Always returns 0
+      (void)EphemeralKeyGeneration_B_SIDHp503(private_SIDH, public_SIDH);
+    }
+
+    return
+        CBB_add_bytes(out, public_x25519, sizeof(public_x25519)) &&
+        CBB_add_bytes(out, public_SIDH, sizeof(public_SIDH));
+  }
+
+  bool Finish(Array<uint8_t> *out_secret, uint8_t *out_alert,
+              Span<const uint8_t> peer_key) override {
+    *out_alert = SSL_AD_INTERNAL_ERROR;
+
+    Array<uint8_t> secret;
+    if (!secret.Init(sizeof(private_x25519) + kSIDH_SsByteSz)) {
+      OPENSSL_PUT_ERROR(SSL, ERR_R_MALLOC_FAILURE);
+      return false;
+    }
+
+    if (peer_key.size() != (kX25519_SsByteSz + kSIDH503_PubKeyBytesSz) ||
+        !X25519(secret.data(), private_x25519, peer_key.data())) {
+      *out_alert = SSL_AD_DECODE_ERROR;
+      OPENSSL_PUT_ERROR(SSL, SSL_R_BAD_ECPOINT);
+      return false;
+    }
+
+    if (isInitiator) {
+      // Always returns 0
+      (void)EphemeralSecretAgreement_A_SIDHp503(
+          private_SIDH, peer_key.data() + kX25519_SsByteSz,
+          secret.data() + sizeof(private_x25519));
+    } else {
+      // Always returns 0
+      (void)EphemeralSecretAgreement_B_SIDHp503(
+          private_SIDH, peer_key.data() + kX25519_SsByteSz,
+          secret.data() + sizeof(private_x25519));
+    }
+    *out_secret = std::move(secret);
+    return true;
+  }
+
+  bool Serialize(CBB *out) override {
+    return (CBB_add_asn1_uint64(out, GroupID()) &&
+            CBB_add_asn1_octet_string(out, private_x25519, sizeof(private_x25519)) &&
+            CBB_add_asn1_octet_string(out, private_SIDH, sizeof(private_SIDH)));
+  }
+
+  bool Deserialize(CBS *in) override {
+    // Read back the two octet strings written by |Serialize|. Note that the
+    // role flag is not part of the encoding; callers must re-apply
+    // |SetInitiator| after deserializing.
+    CBS key;
+    if (!CBS_get_asn1(in, &key, CBS_ASN1_OCTETSTRING) ||
+        CBS_len(&key) != sizeof(private_x25519) ||
+        !CBS_copy_bytes(&key, private_x25519, sizeof(private_x25519)) ||
+        !CBS_get_asn1(in, &key, CBS_ASN1_OCTETSTRING) ||
+        CBS_len(&key) != sizeof(private_SIDH) ||
+        !CBS_copy_bytes(&key, private_SIDH, sizeof(private_SIDH))) {
+      return false;
+    }
+    return true;
+  }
+
+ private:
+  uint8_t private_x25519[kX25519_SsByteSz];
+  // 32 bytes holds both the 250-bit type-A and the 252-bit type-B key.
+  uint8_t private_SIDH[BitsToBytes(kSIDH503_PrvAKeyBitsSz)];
+};
+#endif  // BORINGSSL_USE_SIDH
+
 CONSTEXPR_ARRAY struct {
   int nid;
   uint16_t group_id;
-  const char name[8], alias[11];
+  const char name[16], alias[16];
 } kNamedGroups[] = {
     {NID_secp224r1, SSL_CURVE_SECP224R1, "P-224", "secp224r1"},
     {NID_X9_62_prime256v1, SSL_CURVE_SECP256R1, "P-256", "prime256v1"},
     {NID_secp384r1, SSL_CURVE_SECP384R1, "P-384", "secp384r1"},
     {NID_secp521r1, SSL_CURVE_SECP521R1, "P-521", "secp521r1"},
     {NID_X25519, SSL_CURVE_X25519, "X25519", "x25519"},
+#ifdef BORINGSSL_USE_SIDH
+    {NID_x25519sidh503, SSL_CURVE_sidh503, "x25519sidh503", "x25519sidh503"},
+#endif
 };
 
 }  // namespace
 
@@ -241,6 +367,10 @@ UniquePtr<SSLKeyShare> SSLKeyShare::Create(uint16_t group_id) {
           New<ECKeyShare>(NID_secp521r1, SSL_CURVE_SECP521R1));
     case SSL_CURVE_X25519:
       return UniquePtr<SSLKeyShare>(New<X25519KeyShare>());
+#ifdef BORINGSSL_USE_SIDH
+    case SSL_CURVE_sidh503:
+      return UniquePtr<SSLKeyShare>(New<SIDH503X25519KeyShare>());
+#endif  // BORINGSSL_USE_SIDH
     default:
       return nullptr;
   }
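Two sizes matter when reading the class above: the share each side sends is the X25519 public key followed by the SIDH public key, and the premaster secret handed to the key schedule is the X25519 shared secret followed by the SIDH shared secret. A dependency-free sketch of both layouts, with hypothetical constant names mirroring the ones defined above:

    #include <cstddef>

    // Mirrors of the sizes used by SIDH503X25519KeyShare (assumed names).
    constexpr size_t kX25519Bytes = 32;        // X25519 key / shared secret
    constexpr size_t kSIDH503PubBytes = 378;   // SIDH/P503 public key
    constexpr size_t kSIDH503SsBytes = 126;    // SIDH/P503 shared secret

    // Offer() emits, and Finish() expects, this concatenation on the wire:
    constexpr size_t kHybridShareBytes = kX25519Bytes + kSIDH503PubBytes;
    static_assert(kHybridShareBytes == 410, "hybrid share is 410 bytes");

    // Finish() fills |out_secret| with x25519_ss || sidh_ss:
    constexpr size_t kHybridSecretBytes = kX25519Bytes + kSIDH503SsBytes;
    static_assert(kHybridSecretBytes == 158, "hybrid secret is 158 bytes");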
diff --git a/ssl/t1_lib.cc b/ssl/t1_lib.cc
index dde767e9..806523af 100644
--- a/ssl/t1_lib.cc
+++ b/ssl/t1_lib.cc
@@ -2177,7 +2177,11 @@ static bool ext_key_share_add_clienthello(SSL_HANDSHAKE *hs, CBB *out) {
     group_id = groups[0];
   }
 
-  hs->key_share = SSLKeyShare::Create(group_id);
+  if ((hs->key_share = SSLKeyShare::Create(group_id)) == nullptr) {
+    return false;
+  }
+  hs->key_share->SetInitiator(true);
+
   CBB key_exchange;
   if (!hs->key_share ||
       !CBB_add_u16(&kse_bytes, group_id) ||
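With the group registered in kNamedGroups, the hybrid is requested through the existing configuration surface rather than any new API, and nothing is offered unless the library was built with EXP_SIDH (e.g. cmake -DEXP_SIDH=1, which defines BORINGSSL_USE_SIDH and links sidh503 per the build changes above and below). A sketch against the public API, using the alias registered above:

    // Sketch: opting a client into the hybrid group by its registered alias.
    #include <openssl/ssl.h>

    bool enable_hybrid_group(SSL_CTX *ctx) {
      // "x25519sidh503" resolves through the kNamedGroups entry added above;
      // classical groups are listed after it as fallbacks.
      return SSL_CTX_set1_curves_list(ctx, "x25519sidh503:X25519:P-256") == 1;
    }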
diff --git a/third_party/sidh/CMakeLists.txt b/third_party/sidh/CMakeLists.txt
new file mode 100644
index 00000000..d7213c8e
--- /dev/null
+++ b/third_party/sidh/CMakeLists.txt
@@ -0,0 +1,54 @@
+cmake_minimum_required(VERSION 2.8.11)
+
+add_definitions(-D __LINUX__)
+set(ASM_EXT S)
+enable_language(ASM)
+
+# Compile to object files, we will link them with libssl
+add_library(
+  sidh503
+
+  STATIC
+
+  src/P503/P503.c
+  src/random/random.c
+  src/sha3/fips202.c
+)
+
+if(OPENSSL_NO_ASM)
+  target_sources(
+    sidh503
+    PRIVATE
+    src/P503/generic/fp_generic.c
+  )
+elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "x86_64")
+  add_definitions(-D _AMD64_)
+  add_definitions(-D _FAST_ -D _ADX_)
+  target_sources(
+    sidh503
+
+    PRIVATE
+
+    src/P503/AMD64/fp_x64.c
+    src/P503/AMD64/fp_x64_asm.${ASM_EXT}
+  )
+elseif(${CMAKE_SYSTEM_PROCESSOR} STREQUAL "aarch64" OR
+       ${CMAKE_SYSTEM_PROCESSOR} STREQUAL "arm64")
+  # -lrt is a linker flag, so it must not go through add_definitions().
+  link_libraries(rt)
+  add_definitions(-D _ARM64_)
+  target_sources(
+    sidh503
+
+    PRIVATE
+
+    src/P503/ARM64/fp_arm64.c
+    src/P503/ARM64/fp_arm64_asm.${ASM_EXT}
+  )
+endif()
+
+target_include_directories(sidh503 PUBLIC
+  src
+  src/P503
+)
diff --git a/third_party/sidh/Makefile b/third_party/sidh/Makefile
new file mode 100644
index 00000000..c9fe9caa
--- /dev/null
+++ b/third_party/sidh/Makefile
@@ -0,0 +1,167 @@
+#### Makefile for compilation on Linux ####
+
+OPT?=-O3
+CC?=gcc
+
+ifeq ($(CC),$(filter $(CC),gcc cc))
+OPT+= -fPIC -fPIE
+endif
+
+ARCHITECTURE=_AMD64_
+USE_OPT_LEVEL=_FAST_
+ifeq "$(ARCH)" "x64"
+	ARCHITECTURE=_AMD64_
+	USE_OPT_LEVEL=_FAST_
+else ifeq "$(ARCH)" "x86"
+	ARCHITECTURE=_X86_
+	USE_OPT_LEVEL=_GENERIC_
+else ifeq "$(ARCH)" "ARM"
+	ARCHITECTURE=_ARM_
+	USE_OPT_LEVEL=_GENERIC_
+	ARM_SETTING=-lrt
+else ifeq "$(ARCH)" "ARM64"
+	ARCHITECTURE=_ARM64_
+	USE_OPT_LEVEL=_FAST_
+	ARM_SETTING=-lrt
+endif
+
+ifeq "$(OPT_LEVEL)" "GENERIC"
+	USE_OPT_LEVEL=_GENERIC_
+endif
+
+ifeq "$(ARCHITECTURE)" "_AMD64_"
+	ifeq "$(USE_OPT_LEVEL)" "_FAST_"
+		MULX=-D _MULX_
+		ifeq "$(USE_MULX)" "FALSE"
+			MULX=
+		else
+			ADX=-D _ADX_
+			ifeq "$(USE_ADX)" "FALSE"
+				ADX=
+			endif
+		endif
+	endif
+endif
+
+ifeq "$(SET)" "EXTENDED"
+	ADDITIONAL_SETTINGS=-fwrapv -fomit-frame-pointer -march=native
+endif
+
+AR=ar rcs
+RANLIB=ranlib
+
+CFLAGS=$(OPT) $(ADDITIONAL_SETTINGS) -D $(ARCHITECTURE) -D __LINUX__ -D $(USE_OPT_LEVEL) $(MULX) $(ADX)
+LDFLAGS=-lm
+ifeq "$(USE_OPT_LEVEL)" "_GENERIC_"
+	EXTRA_OBJECTS_503=objs503/fp_generic.o
+	EXTRA_OBJECTS_751=objs751/fp_generic.o
+else ifeq "$(USE_OPT_LEVEL)" "_FAST_"
+ifeq "$(ARCHITECTURE)" "_AMD64_"
+	EXTRA_OBJECTS_503=objs503/fp_x64.o objs503/fp_x64_asm.o
+	EXTRA_OBJECTS_751=objs751/fp_x64.o objs751/fp_x64_asm.o
+else ifeq "$(ARCHITECTURE)" "_ARM64_"
+	EXTRA_OBJECTS_503=objs503/fp_arm64.o objs503/fp_arm64_asm.o
+	EXTRA_OBJECTS_751=objs751/fp_arm64.o objs751/fp_arm64_asm.o
+endif
+endif
+OBJECTS_503=objs503/P503.o $(EXTRA_OBJECTS_503) objs/random.o objs/fips202.o
+OBJECTS_751=objs751/P751.o $(EXTRA_OBJECTS_751) objs/random.o objs/fips202.o
+
+all: lib503 lib751 tests KATS
+
+objs503/%.o: src/P503/%.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) $< -o $@
+
+objs751/%.o: src/P751/%.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) $< -o $@
+
+ifeq "$(USE_OPT_LEVEL)" "_GENERIC_"
+objs503/fp_generic.o: src/P503/generic/fp_generic.c
+	$(CC) -c $(CFLAGS) src/P503/generic/fp_generic.c -o objs503/fp_generic.o
+
+objs751/fp_generic.o: src/P751/generic/fp_generic.c
+	$(CC) -c $(CFLAGS) src/P751/generic/fp_generic.c -o objs751/fp_generic.o
+else ifeq "$(USE_OPT_LEVEL)" "_FAST_"
+ifeq "$(ARCHITECTURE)" "_AMD64_"
+objs503/fp_x64.o: src/P503/AMD64/fp_x64.c
+	$(CC) -c $(CFLAGS) src/P503/AMD64/fp_x64.c -o objs503/fp_x64.o
+
+objs503/fp_x64_asm.o: src/P503/AMD64/fp_x64_asm.S
+	$(CC) -c $(CFLAGS) src/P503/AMD64/fp_x64_asm.S -o objs503/fp_x64_asm.o
+
+objs751/fp_x64.o: src/P751/AMD64/fp_x64.c
+	$(CC) -c $(CFLAGS) src/P751/AMD64/fp_x64.c -o objs751/fp_x64.o
+
+objs751/fp_x64_asm.o: src/P751/AMD64/fp_x64_asm.S
+	$(CC) -c $(CFLAGS) src/P751/AMD64/fp_x64_asm.S -o objs751/fp_x64_asm.o
+else ifeq "$(ARCHITECTURE)" "_ARM64_"
+objs503/fp_arm64.o: src/P503/ARM64/fp_arm64.c
+	$(CC) -c $(CFLAGS) src/P503/ARM64/fp_arm64.c -o objs503/fp_arm64.o
+
+objs503/fp_arm64_asm.o: src/P503/ARM64/fp_arm64_asm.S
+	$(CC) -c $(CFLAGS) src/P503/ARM64/fp_arm64_asm.S -o objs503/fp_arm64_asm.o
+
+objs751/fp_arm64.o: src/P751/ARM64/fp_arm64.c
+	$(CC) -c $(CFLAGS) src/P751/ARM64/fp_arm64.c -o objs751/fp_arm64.o
+
+objs751/fp_arm64_asm.o: src/P751/ARM64/fp_arm64_asm.S
+	$(CC) -c $(CFLAGS) src/P751/ARM64/fp_arm64_asm.S -o objs751/fp_arm64_asm.o
+endif
+endif
+
+objs/random.o: src/random/random.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) src/random/random.c -o objs/random.o
+
+objs/fips202.o: src/sha3/fips202.c
+	$(CC) -c $(CFLAGS) src/sha3/fips202.c -o objs/fips202.o
+
+lib503: $(OBJECTS_503)
+	rm -rf lib503 sike503 sidh503
+	mkdir lib503 sike503 sidh503
+	$(AR) lib503/libsidh.a $^
+	$(RANLIB) lib503/libsidh.a
+
+lib751: $(OBJECTS_751)
+	rm -rf lib751 sike751 sidh751
+	mkdir lib751 sike751 sidh751
+	$(AR) lib751/libsidh.a $^
+	$(RANLIB) lib751/libsidh.a
+
+tests: lib503 lib751
+	$(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p503 $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o arith_tests-p751 $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503 tests/test_SIDHp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh503/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/test_SIDHp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sidh751/test_SIDH $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503 tests/test_SIKEp503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/test_SIKEp751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/test_SIKE $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib503 tests/arith_tests-p503.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike503/arith_test $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/arith_tests-p751.c tests/test_extras.c -lsidh $(LDFLAGS) -o sike751/arith_test $(ARM_SETTING)
+
+# AES
+AES_OBJS=objs/aes.o objs/aes_c.o
+
+objs/%.o: tests/aes/%.c
+	@mkdir -p $(@D)
+	$(CC) -c $(CFLAGS) $< -o $@
+
+lib503_for_KATs: $(OBJECTS_503) $(AES_OBJS)
+	$(AR) lib503/libsidh_for_testing.a $^
+	$(RANLIB) lib503/libsidh_for_testing.a
+
+lib751_for_KATs: $(OBJECTS_751) $(AES_OBJS)
+	$(AR) lib751/libsidh_for_testing.a $^
+	$(RANLIB) lib751/libsidh_for_testing.a
+
+KATS: lib503_for_KATs lib751_for_KATs
+	$(CC) $(CFLAGS) -L./lib503 tests/PQCtestKAT_kem503.c tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike503/PQCtestKAT_kem $(ARM_SETTING)
+	$(CC) $(CFLAGS) -L./lib751 tests/PQCtestKAT_kem751.c
tests/rng/rng.c -lsidh_for_testing $(LDFLAGS) -o sike751/PQCtestKAT_kem $(ARM_SETTING) + +check: tests + +.PHONY: clean + +clean: + rm -rf *.req objs503 objs751 objs lib503 lib751 sidh503 sidh751 sike503 sike751 arith_tests-* diff --git a/third_party/sidh/src/P503/AMD64/fp_x64.c b/third_party/sidh/src/P503/AMD64/fp_x64.c new file mode 100644 index 00000000..8f5305ea --- /dev/null +++ b/third_party/sidh/src/P503/AMD64/fp_x64.c @@ -0,0 +1,523 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: modular arithmetic optimized for x64 platforms for P503 +*********************************************************************************************/ + +#include "../P503_internal.h" + + +// Global constants +extern const uint64_t p503[NWORDS_FIELD]; +extern const uint64_t p503p1[NWORDS_FIELD]; +extern const uint64_t p503x2[NWORDS_FIELD]; + + +__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular addition, c = a+b mod p503. + // Inputs: a, b in [0, 2*p503-1] + // Output: c in [0, 2*p503-1] + +#if (OS_TARGET == OS_WIN) + unsigned int i, carry = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(carry, c[i], ((digit_t*)p503x2)[i], carry, c[i]); + } + mask = 0 - (digit_t)carry; + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, c[i], ((digit_t*)p503x2)[i] & mask, carry, c[i]); + } + +#elif (OS_TARGET == OS_LINUX) + + fpadd503_asm(a, b, c); + +#endif +} + + +__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular subtraction, c = a-b mod p503. + // Inputs: a, b in [0, 2*p503-1] + // Output: c in [0, 2*p503-1] + +#if (OS_TARGET == OS_WIN) + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, c[i], ((digit_t*)p503x2)[i] & mask, borrow, c[i]); + } + +#elif (OS_TARGET == OS_LINUX) + + fpsub503_asm(a, b, c); + +#endif +} + + +__inline void fpneg503(digit_t* a) +{ // Modular negation, a = -a mod p503. + // Input/output: a in [0, 2*p503-1] + unsigned int i, borrow = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, ((digit_t*)p503x2)[i], a[i], borrow, a[i]); + } +} + + +void fpdiv2_503(const digit_t* a, digit_t* c) +{ // Modular division by two, c = a/2 mod p503. + // Input : a in [0, 2*p503-1] + // Output: c in [0, 2*p503-1] + unsigned int i, carry = 0; + digit_t mask; + + mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503 + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], ((digit_t*)p503)[i] & mask, carry, c[i]); + } + + mp_shiftr1(c, NWORDS_FIELD); +} + + +void fpcorrection503(digit_t* a) +{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], ((digit_t*)p503)[i], borrow, a[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, a[i], ((digit_t*)p503)[i] & mask, borrow, a[i]); + } +} + + +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. 
+ + UNREFERENCED_PARAMETER(nwords); + +#if (OS_TARGET == OS_WIN) + digit_t t = 0; + uint128_t uv = {0}; + unsigned int carry = 0; + + MULADD128(a[0], b[0], uv, carry, uv); + t += carry; + c[0] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[1], uv, carry, uv); + t += carry; + MULADD128(a[1], b[0], uv, carry, uv); + t += carry; + c[1] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[2], uv, carry, uv); + t += carry; + MULADD128(a[1], b[1], uv, carry, uv); + t += carry; + MULADD128(a[2], b[0], uv, carry, uv); + t += carry; + c[2] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[3], uv, carry, uv); + t += carry; + MULADD128(a[2], b[1], uv, carry, uv); + t += carry; + MULADD128(a[1], b[2], uv, carry, uv); + t += carry; + MULADD128(a[3], b[0], uv, carry, uv); + t += carry; + c[3] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[4], uv, carry, uv); + t += carry; + MULADD128(a[3], b[1], uv, carry, uv); + t += carry; + MULADD128(a[2], b[2], uv, carry, uv); + t += carry; + MULADD128(a[1], b[3], uv, carry, uv); + t += carry; + MULADD128(a[4], b[0], uv, carry, uv); + t += carry; + c[4] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[5], uv, carry, uv); + t += carry; + MULADD128(a[4], b[1], uv, carry, uv); + t += carry; + MULADD128(a[3], b[2], uv, carry, uv); + t += carry; + MULADD128(a[2], b[3], uv, carry, uv); + t += carry; + MULADD128(a[1], b[4], uv, carry, uv); + t += carry; + MULADD128(a[5], b[0], uv, carry, uv); + t += carry; + c[5] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[6], uv, carry, uv); + t += carry; + MULADD128(a[5], b[1], uv, carry, uv); + t += carry; + MULADD128(a[4], b[2], uv, carry, uv); + t += carry; + MULADD128(a[3], b[3], uv, carry, uv); + t += carry; + MULADD128(a[2], b[4], uv, carry, uv); + t += carry; + MULADD128(a[1], b[5], uv, carry, uv); + t += carry; + MULADD128(a[6], b[0], uv, carry, uv); + t += carry; + c[6] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[7], uv, carry, uv); + t += carry; + MULADD128(a[6], b[1], uv, carry, uv); + t += carry; + MULADD128(a[5], b[2], uv, carry, uv); + t += carry; + MULADD128(a[4], b[3], uv, carry, uv); + t += carry; + MULADD128(a[3], b[4], uv, carry, uv); + t += carry; + MULADD128(a[2], b[5], uv, carry, uv); + t += carry; + MULADD128(a[1], b[6], uv, carry, uv); + t += carry; + MULADD128(a[7], b[0], uv, carry, uv); + t += carry; + c[7] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[7], b[1], uv, carry, uv); + t += carry; + MULADD128(a[6], b[2], uv, carry, uv); + t += carry; + MULADD128(a[5], b[3], uv, carry, uv); + t += carry; + MULADD128(a[4], b[4], uv, carry, uv); + t += carry; + MULADD128(a[3], b[5], uv, carry, uv); + t += carry; + MULADD128(a[2], b[6], uv, carry, uv); + t += carry; + MULADD128(a[1], b[7], uv, carry, uv); + t += carry; + c[8] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[7], b[2], uv, carry, uv); + t += carry; + MULADD128(a[6], b[3], uv, carry, uv); + t += carry; + MULADD128(a[5], b[4], uv, carry, uv); + t += carry; + MULADD128(a[4], b[5], uv, carry, uv); + t += carry; + MULADD128(a[3], b[6], uv, carry, uv); + t += carry; + MULADD128(a[2], b[7], uv, carry, uv); + t += carry; + c[9] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[7], b[3], uv, carry, uv); + t += carry; + MULADD128(a[6], b[4], uv, carry, uv); + t += carry; + MULADD128(a[5], b[5], uv, carry, uv); + t += carry; + MULADD128(a[4], b[6], uv, carry, uv); + t 
+= carry; + MULADD128(a[3], b[7], uv, carry, uv); + t += carry; + c[10] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[7], b[4], uv, carry, uv); + t += carry; + MULADD128(a[6], b[5], uv, carry, uv); + t += carry; + MULADD128(a[5], b[6], uv, carry, uv); + t += carry; + MULADD128(a[4], b[7], uv, carry, uv); + t += carry; + c[11] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[7], b[5], uv, carry, uv); + t += carry; + MULADD128(a[6], b[6], uv, carry, uv); + t += carry; + MULADD128(a[5], b[7], uv, carry, uv); + t += carry; + c[12] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[7], b[6], uv, carry, uv); + t += carry; + MULADD128(a[6], b[7], uv, carry, uv); + t += carry; + c[13] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + + MULADD128(a[7], b[7], uv, carry, uv); + c[14] = uv[0]; + c[15] = uv[1]; + +#elif (OS_TARGET == OS_LINUX) + + mul503_asm(a, b, c); + +#endif +} + + +void rdc_mont(const digit_t* ma, digit_t* mc) +{ // Montgomery reduction exploiting special form of the prime. + // mc = ma*R^-1 mod p503x2, where R = 2^512. + // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. + // ma is assumed to be in Montgomery representation. + +#if (OS_TARGET == OS_WIN) + unsigned int carry; + digit_t t = 0; + uint128_t uv = {0}; + + mc[0] = ma[0]; + mc[1] = ma[1]; + mc[2] = ma[2]; + MUL128(mc[0], ((digit_t*)p503p1)[3], uv); + ADDC(0, uv[0], ma[3], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + mc[3] = uv[0]; + uv[0] = uv[1]; + uv[1] = 0; + + MULADD128(mc[0], ((digit_t*)p503p1)[4], uv, carry, uv); + MULADD128(mc[1], ((digit_t*)p503p1)[3], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[4], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[4] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p503p1)[4], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p503p1)[3], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[5], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[5] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p503p1)[4], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p503p1)[3], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[6], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[6] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p503p1)[4], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p503p1)[3], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[7], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[7] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[1], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p503p1)[4], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p503p1)[3], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[8], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + 
t += carry; + mc[0] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[2], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p503p1)[4], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p503p1)[3], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[9], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[1] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[3], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p503p1)[4], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p503p1)[3], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[10], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[2] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[4], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p503p1)[4], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[11], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[3] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[5], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p503p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[12], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[4] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[6], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p503p1)[6], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[13], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[5] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[7], ((digit_t*)p503p1)[7], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[14], carry, mc[6]); + ADDC(carry, uv[1], 0, carry, uv[1]); + ADDC(0, uv[1], ma[15], carry, mc[7]); + +#elif (OS_TARGET == OS_LINUX) + + rdc503_asm(ma, mc); + +#endif +} diff --git a/third_party/sidh/src/P503/AMD64/fp_x64_asm.S b/third_party/sidh/src/P503/AMD64/fp_x64_asm.S new file mode 100644 index 00000000..b698e682 --- /dev/null +++ b/third_party/sidh/src/P503/AMD64/fp_x64_asm.S @@ -0,0 +1,1684 @@ +//******************************************************************************************* +// SIDH: an efficient supersingular isogeny cryptography library +// +// Abstract: field arithmetic in x64 assembly for P503 on Linux +//******************************************************************************************* + +.intel_syntax noprefix + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx + +// p503 + 1 +#define p503p1_3 0xAC00000000000000 +#define p503p1_4 0x13085BDA2211E7A0 +#define p503p1_5 0x1B9BF6C87B7E7DAF +#define p503p1_6 0x6045C6BDDA77A4D0 +#define p503p1_7 0x004066F541811E1E +// p503 x 2 +#define p503x2_0 0xFFFFFFFFFFFFFFFE +#define p503x2_1 0xFFFFFFFFFFFFFFFF +#define p503x2_3 0x57FFFFFFFFFFFFFF +#define p503x2_4 0x2610B7B44423CF41 +#define p503x2_5 0x3737ED90F6FCFB5E 
+#define p503x2_6 0xC08B8D7BB4EF49A0 +#define p503x2_7 0x0080CDEA83023C3C + +p503p1_nz: +.quad 0xAC00000000000000 +.quad 0x13085BDA2211E7A0 +.quad 0x1B9BF6C87B7E7DAF +.quad 0x6045C6BDDA77A4D0 +.quad 0x004066F541811E1E + +// Define addition instructions +#ifdef _MULX_ +#ifdef _ADX_ + +#define ADD1 adox +#define ADC1 adox +#define ADD2 adcx +#define ADC2 adcx + +#else + +#define ADD1 add +#define ADC1 adc +#define ADD2 add +#define ADC2 adc + +#endif +#endif + + +.text +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global fpadd503_asm +fpadd503_asm: + push r12 + push r13 + push r14 + push r15 + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + + movq rcx, p503x2_0 + sub r8, rcx + movq rcx, p503x2_1 + sbb r9, rcx + sbb r10, rcx + movq rcx, p503x2_3 + sbb r11, rcx + movq rcx, p503x2_4 + sbb r12, rcx + movq rcx, p503x2_5 + sbb r13, rcx + movq rcx, p503x2_6 + sbb r14, rcx + movq rcx, p503x2_7 + sbb r15, rcx + sbb rax, 0 + + mov rdi, p503x2_0 + and rdi, rax + mov rsi, p503x2_1 + and rsi, rax + movq rcx, p503x2_3 + and rcx, rax + + add r8, rdi + adc r9, rsi + adc r10, rsi + adc r11, rcx + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + movq r8, p503x2_4 + and r8, rax + movq r9, p503x2_5 + and r9, rax + movq r10, p503x2_6 + and r10, rax + movq r11, p503x2_7 + and r11, rax + + bt rcx, 0 + adc r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global fpsub503_asm +fpsub503_asm: + push r12 + push r13 + push r14 + push r15 + + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + sbb rax, 0 + + mov rdi, p503x2_0 + and rdi, rax + mov rsi, p503x2_1 + and rsi, rax + movq rcx, p503x2_3 + and rcx, rax + + add r8, rdi + adc r9, rsi + adc r10, rsi + adc r11, rcx + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + setc cl + + movq r8, p503x2_4 + and r8, rax + movq r9, p503x2_5 + and r9, rax + movq r10, p503x2_6 + and r10, rax + movq r11, p503x2_7 + and r11, rax + + bt rcx, 0 + adc r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +#ifdef _MULX_ + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication, a full row at a time +// Inputs: memory pointers M0 and M1 +// 
Outputs: memory pointer C +// Temps: regs T0:T9 +///////////////////////////////////////////////////////////////// + +#ifdef _ADX_ +.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + xor rax, rax + adox \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adox \T1, \T3 + mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 + adox \T2, \T4 + + mov rdx, 8\M0 + mulx \T5, \T4, \M1 // T5:T4 = A1*B0 + adox \T3, rax + xor rax, rax + mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 + adox \T4, \T0 + mov 8\C, \T4 // C1_final + adcx \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 + adcx \T6, \T8 + adox \T5, \T1 + mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 + adcx \T7, \T9 + adcx \T8, rax + adox \T6, \T2 + + mov rdx, 16\M0 + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + adox \T7, \T3 + adox \T8, rax + xor rax, rax + mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 + adox \T0, \T5 + mov 16\C, \T0 // C2_final + adcx \T1, \T3 + mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 + adcx \T2, \T4 + adox \T1, \T6 + mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 + adcx \T3, \T9 + mov rdx, 24\M0 + adcx \T4, rax + + adox \T2, \T7 + adox \T3, \T8 + adox \T4, rax + + mulx \T5, \T0, \M1 // T5:T0 = A3*B0 + xor rax, rax + mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 + adcx \T5, \T7 + adox \T1, \T0 + mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 + adcx \T6, \T8 + adox \T2, \T5 + mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 + adcx \T7, \T9 + adcx \T8, rax + + adox \T3, \T6 + adox \T4, \T7 + adox \T8, rax + mov 24\C, \T1 // C3_final + mov 32\C, \T2 // C4_final + mov 40\C, \T3 // C5_final + mov 48\C, \T4 // C6_final + mov 56\C, \T8 // C7_final +.endm + +#else + +.macro MUL256_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + mov rdx, \M0 + mulx \T0, \T1, \M1 // T0:T1 = A0*B0 + mov \C, \T1 // C0_final + mulx \T1, \T2, 8\M1 // T1:T2 = A0*B1 + add \T0, \T2 + mulx \T2, \T3, 16\M1 // T2:T3 = A0*B2 + adc \T1, \T3 + mulx \T3, \T4, 24\M1 // T3:T4 = A0*B3 + adc \T2, \T4 + mov rdx, 8\M0 + adc \T3, 0 + + mulx \T5, \T4, \M1 // T5:T4 = A1*B0 + mulx \T6, \T7, 8\M1 // T6:T7 = A1*B1 + add \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A1*B2 + adc \T6, \T8 + mulx \T8, \T9, 24\M1 // T8:T9 = A1*B3 + adc \T7, \T9 + adc \T8, 0 + + add \T4, \T0 + mov 8\C, \T4 // C1_final + adc \T5, \T1 + adc \T6, \T2 + adc \T7, \T3 + mov rdx, 16\M0 + adc \T8, 0 + + mulx \T1, \T0, \M1 // T1:T0 = A2*B0 + mulx \T2, \T3, 8\M1 // T2:T3 = A2*B1 + add \T1, \T3 + mulx \T3, \T4, 16\M1 // T3:T4 = A2*B2 + adc \T2, \T4 + mulx \T4,\T9, 24\M1 // T3:T4 = A2*B3 + adc \T3, \T9 + mov rdx, 24\M0 + adc \T4, 0 + + add \T0, \T5 + mov 16\C, \T0 // C2_final + adc \T1, \T6 + adc \T2, \T7 + adc \T3, \T8 + adc \T4, 0 + + mulx \T5, \T0, \M1 // T5:T0 = A3*B0 + mulx \T6, \T7, 8\M1 // T6:T7 = A3*B1 + add \T5, \T7 + mulx \T7, \T8, 16\M1 // T7:T8 = A3*B2 + adc \T6, \T8 + mulx \T8, \T9, 24\M1 // T8:T9 = A3*B3 + adc \T7, \T9 + adc \T8, 0 + + add \T1, \T0 + mov 24\C, \T1 // C3_final + adc \T2, \T5 + mov 32\C, \T2 // C4_final + adc \T3, \T6 + mov 40\C, \T3 // C5_final + adc \T4, \T7 + mov 48\C, \T4 // C6_final + adc \T8, 0 + mov 56\C, \T8 // C7_final +.endm +#endif + + +//***************************************************************************** +// 503-bit multiplication using Karatsuba (one level), schoolbook (one level) +//***************************************************************************** +.global mul503_asm +mul503_asm: + push r12 + push r13 + push r14 + push r15 + mov rcx, reg_p3 + + // r8-r11 <- AH + AL, rax <- mask + xor rax, 
rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + push rbx + push rbp + sub rsp, 96 + add r8, [reg_p1+32] + adc r9, [reg_p1+40] + adc r10, [reg_p1+48] + adc r11, [reg_p1+56] + sbb rax, 0 + mov [rsp], r8 + mov [rsp+8], r9 + mov [rsp+16], r10 + mov [rsp+24], r11 + + // r12-r15 <- BH + BL, rbx <- mask + xor rbx, rbx + mov r12, [reg_p2] + mov r13, [reg_p2+8] + mov r14, [reg_p2+16] + mov r15, [reg_p2+24] + add r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + sbb rbx, 0 + mov [rsp+32], r12 + mov [rsp+40], r13 + mov [rsp+48], r14 + mov [rsp+56], r15 + + // r12-r15 <- masked (BH + BL) + and r12, rax + and r13, rax + and r14, rax + and r15, rax + + // r8-r11 <- masked (AH + AL) + and r8, rbx + and r9, rbx + and r10, rbx + and r11, rbx + + // r8-r11 <- masked (AH + AL) + masked (AH + AL) + add r8, r12 + adc r9, r13 + adc r10, r14 + adc r11, r15 + mov [rsp+64], r8 + mov [rsp+72], r9 + mov [rsp+80], r10 + mov [rsp+88], r11 + + // [rcx+64] <- (AH+AL) x (BH+BL), low part + MUL256_SCHOOL [rsp], [rsp+32], [rcx+64], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp + + // [rcx] <- AL x BL + MUL256_SCHOOL [reg_p1], [reg_p2], [rcx], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp // Result C0-C3 + + // [rsp] <- AH x BH + MUL256_SCHOOL [reg_p1+32], [reg_p2+32], [rsp], r8, r9, r10, r11, r12, r13, r14, r15, rbx, rbp + + // r8-r11 <- (AH+AL) x (BH+BL), final step + mov r8, [rsp+64] + mov r9, [rsp+72] + mov r10, [rsp+80] + mov r11, [rsp+88] + mov rax, [rcx+96] + add r8, rax + mov rax, [rcx+104] + adc r9, rax + mov rax, [rcx+112] + adc r10, rax + mov rax, [rcx+120] + adc r11, rax + + // [rcx+64], x3-x5 <- (AH+AL) x (BH+BL) - ALxBL + mov r12, [rcx+64] + mov r13, [rcx+72] + mov r14, [rcx+80] + mov r15, [rcx+88] + sub r12, [rcx] + sbb r13, [rcx+8] + sbb r14, [rcx+16] + sbb r15, [rcx+24] + sbb r8, [rcx+32] + sbb r9, [rcx+40] + sbb r10, [rcx+48] + sbb r11, [rcx+56] + + // r8-r15 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + sub r12, [rsp] + sbb r13, [rsp+8] + sbb r14, [rsp+16] + sbb r15, [rsp+24] + sbb r8, [rsp+32] + sbb r9, [rsp+40] + sbb r10, [rsp+48] + sbb r11, [rsp+56] + + add r12, [rcx+32] + mov [rcx+32], r12 // Result C4-C7 + adc r13, [rcx+40] + mov [rcx+40], r13 + adc r14, [rcx+48] + mov [rcx+48], r14 + adc r15, [rcx+56] + mov [rcx+56], r15 + mov rax, [rsp] + adc r8, rax + mov [rcx+64], r8 // Result C8-C15 + mov rax, [rsp+8] + adc r9, rax + mov [rcx+72], r9 + mov rax, [rsp+16] + adc r10, rax + mov [rcx+80], r10 + mov rax, [rsp+24] + adc r11, rax + mov [rcx+88], r11 + mov r12, [rsp+32] + adc r12, 0 + mov [rcx+96], r12 + mov r13, [rsp+40] + adc r13, 0 + mov [rcx+104], r13 + mov r14, [rsp+48] + adc r14, 0 + mov [rcx+112], r14 + mov r15, [rsp+56] + adc r15, 0 + mov [rcx+120], r15 + + add rsp, 96 + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#else + +//*********************************************************************** +// Integer multiplication +// Based on Karatsuba method +// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] +// NOTE: a=c or b=c are not allowed +//*********************************************************************** +.global mul503_asm +mul503_asm: + push r12 + push r13 + push r14 + mov rcx, reg_p3 + + // rcx[0-3] <- AH+AL + xor rax, rax + mov r8, [reg_p1+32] + mov r9, [reg_p1+40] + mov r10, [reg_p1+48] + mov r11, [reg_p1+56] + add r8, [reg_p1] + adc r9, [reg_p1+8] + adc r10, [reg_p1+16] + adc r11, [reg_p1+24] + push r15 + mov [rcx], r8 + mov [rcx+8], r9 + mov [rcx+16], r10 + mov [rcx+24], r11 + sbb 
rax, 0 + sub rsp, 80 // Allocating space in stack + + // r12-r15 <- BH+BL + xor rdx, rdx + mov r12, [reg_p2+32] + mov r13, [reg_p2+40] + mov r14, [reg_p2+48] + mov r15, [reg_p2+56] + add r12, [reg_p2] + adc r13, [reg_p2+8] + adc r14, [reg_p2+16] + adc r15, [reg_p2+24] + sbb rdx, 0 + mov [rsp+64], rax + mov [rsp+72], rdx + + // (rsp[0-3],r8,r9,r10,r11) <- (AH+AL)*(BH+BL) + mov rax, [rcx] + mul r12 + mov [rsp], rax // c0 + mov r8, rdx + + xor r9, r9 + mov rax, [rcx] + mul r13 + add r8, rax + adc r9, rdx + + xor r10, r10 + mov rax, [rcx+8] + mul r12 + add r8, rax + mov [rsp+8], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [rcx] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+16] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+8] + mul r13 + add r9, rax + mov [rsp+16], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+24] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+8] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+16] + mul r13 + add r10, rax + mov [rsp+24], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [rcx+8] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+24] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+16] + mul r14 + add r8, rax + mov [rsp+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r11, r11 + mov rax, [rcx+16] + mul r15 + add r9, rax + adc r10, rdx + adc r11, 0 + + mov rax, [rcx+24] + mul r14 + add r9, rax // c5 + adc r10, rdx + adc r11, 0 + + mov rax, [rcx+24] + mul r15 + add r10, rax // c6 + adc r11, rdx // c7 + + mov rax, [rsp+64] + and r12, rax + and r13, rax + and r14, rax + and r15, rax + add r12, r8 + adc r13, r9 + adc r14, r10 + adc r15, r11 + + mov rax, [rsp+72] + mov r8, [rcx] + mov r9, [rcx+8] + mov r10, [rcx+16] + mov r11, [rcx+24] + and r8, rax + and r9, rax + and r10, rax + and r11, rax + add r8, r12 + adc r9, r13 + adc r10, r14 + adc r11, r15 + mov [rsp+32], r8 + mov [rsp+40], r9 + mov [rsp+48], r10 + mov [rsp+56], r11 + + // rcx[0-7] <- AL*BL + mov r11, [reg_p1] + mov rax, [reg_p2] + mul r11 + xor r9, r9 + mov [rcx], rax // c0 + mov r8, rdx + + mov r14, [reg_p1+16] + mov rax, [reg_p2+8] + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+8] + mov rax, [reg_p2] + mul r12 + add r8, rax + mov [rcx+8], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+16] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2] + mov rax, r14 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+8] + mul r12 + add r9, rax + mov [rcx+16], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+24] + mul r11 + mov r15, [reg_p1+24] + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, r15 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+16] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+8] + mul r14 + add r10, rax + mov [rcx+24], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+24] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+8] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+16] + mul r14 + add r8, rax + mov [rcx+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+24] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+16] + mul r15 + add r9, rax + mov [rcx+40], r9 // c5 + adc r10, rdx + 
adc r8, 0 + + mov rax, [reg_p2+24] + mul r15 + add r10, rax + mov [rcx+48], r10 // c6 + adc r8, rdx + mov [rcx+56], r8 // c7 + + // rcx[8-15] <- AH*BH + mov r11, [reg_p1+32] + mov rax, [reg_p2+32] + mul r11 + xor r9, r9 + mov [rcx+64], rax // c0 + mov r8, rdx + + mov r14, [reg_p1+48] + mov rax, [reg_p2+40] + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+40] + mov rax, [reg_p2+32] + mul r12 + add r8, rax + mov [rcx+72], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+48] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2+32] + mov rax, r14 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+40] + mul r12 + add r9, rax + mov [rcx+80], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+56] + mul r11 + mov r15, [reg_p1+56] + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, r15 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+48] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+40] + mul r14 + add r10, rax + mov [rcx+88], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+56] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+40] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+48] + mul r14 + add r8, rax + mov [rcx+96], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+56] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+48] + mul r15 + add r9, rax + mov [rcx+104], r9 // c5 + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+56] + mul r15 + add r10, rax + mov [rcx+112], r10 // c6 + adc r8, rdx + mov [rcx+120], r8 // c7 + + // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL + mov r8, [rsp] + sub r8, [rcx] + mov r9, [rsp+8] + sbb r9, [rcx+8] + mov r10, [rsp+16] + sbb r10, [rcx+16] + mov r11, [rsp+24] + sbb r11, [rcx+24] + mov r12, [rsp+32] + sbb r12, [rcx+32] + mov r13, [rsp+40] + sbb r13, [rcx+40] + mov r14, [rsp+48] + sbb r14, [rcx+48] + mov r15, [rsp+56] + sbb r15, [rcx+56] + + // [r8-r15] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH + mov rax, [rcx+64] + sub r8, rax + mov rax, [rcx+72] + sbb r9, rax + mov rax, [rcx+80] + sbb r10, rax + mov rax, [rcx+88] + sbb r11, rax + mov rax, [rcx+96] + sbb r12, rax + mov rdx, [rcx+104] + sbb r13, rdx + mov rdi, [rcx+112] + sbb r14, rdi + mov rsi, [rcx+120] + sbb r15, rsi + + // Final result + add r8, [rcx+32] + mov [rcx+32], r8 + adc r9, [rcx+40] + mov [rcx+40], r9 + adc r10, [rcx+48] + mov [rcx+48], r10 + adc r11, [rcx+56] + mov [rcx+56], r11 + adc r12, [rcx+64] + mov [rcx+64], r12 + adc r13, [rcx+72] + mov [rcx+72], r13 + adc r14, [rcx+80] + mov [rcx+80], r14 + adc r15, [rcx+88] + mov [rcx+88], r15 + adc rax, 0 + mov [rcx+96], rax + adc rdx, 0 + mov [rcx+104], rdx + adc rdi, 0 + mov [rcx+112], rdi + adc rsi, 0 + mov [rcx+120], rsi + + add rsp, 80 // Restoring space in stack + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#endif + + +#ifdef _MULX_ + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: regs T0:T6 +// Temps: regs T7:T9 +///////////////////////////////////////////////////////////////// +.macro MUL128x320_SCHOOL M0, M1, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9 + mov rdx, \M0 + mulx \T1, \T0, \M1 // T0 <- C0_final + mulx \T2, \T4, 8\M1 + xor rax, rax + mulx \T3, \T5, 16\M1 + ADD1 \T1, \T4 + ADC1 \T2, \T5 + mulx \T4, \T7, 24\M1 + ADC1 \T3, \T7 + mulx \T5, \T6, 32\M1 + ADC1 \T4, \T6 + ADC1 \T5, rax + + 
mov rdx, 8\M0 + mulx \T7, \T6, \M1 + ADD2 \T1, \T6 // T1 <- C1_final + ADC2 \T2, \T7 + mulx \T6, \T8, 8\M1 + ADC2 \T3, \T6 + mulx \T9, \T7, 16\M1 + ADC2 \T4, \T9 + mulx \T6, \T9, 24\M1 + ADC2 \T5, \T6 + mulx \T6, rdx, 32\M1 + ADC2 \T6, rax + + xor rax, rax + ADD1 \T2, \T8 + ADC1 \T3, \T7 + ADC1 \T4, \T9 + ADC1 \T5, rdx + ADC1 \T6, rax +.endm + + +//************************************************************************************** +// Montgomery reduction +// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 +// Operation: c [reg_p2] = a [reg_p1] +// NOTE: a=c is not allowed +//************************************************************************************** +.global rdc503_asm +rdc503_asm: + push rbx + push r12 + push r13 + push r14 + push r15 + + // a[0-1] x p503p1_nz --> result: r8:r14 + MUL128x320_SCHOOL [reg_p1], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 + + xor r15, r15 + add r8, [reg_p1+24] + adc r9, [reg_p1+32] + adc r10, [reg_p1+40] + adc r11, [reg_p1+48] + adc r12, [reg_p1+56] + adc r13, [reg_p1+64] + adc r14, [reg_p1+72] + adc r15, [reg_p1+80] + mov [reg_p1+24], r8 + mov [reg_p1+32], r9 + mov [reg_p1+40], r10 + mov [reg_p1+48], r11 + mov [reg_p1+56], r12 + mov [reg_p1+64], r13 + mov [reg_p1+72], r14 + mov [reg_p1+80], r15 + mov r8, [reg_p1+88] + mov r9, [reg_p1+96] + mov r10, [reg_p1+104] + mov r11, [reg_p1+112] + mov r12, [reg_p1+120] + adc r8, 0 + adc r9, 0 + adc r10, 0 + adc r11, 0 + adc r12, 0 + mov [reg_p1+88], r8 + mov [reg_p1+96], r9 + mov [reg_p1+104], r10 + mov [reg_p1+112], r11 + mov [reg_p1+120], r12 + + // a[2-3] x p503p1_nz --> result: r8:r14 + MUL128x320_SCHOOL [reg_p1+16], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 + + xor r15, r15 + add r8, [reg_p1+40] + adc r9, [reg_p1+48] + adc r10, [reg_p1+56] + adc r11, [reg_p1+64] + adc r12, [reg_p1+72] + adc r13, [reg_p1+80] + adc r14, [reg_p1+88] + adc r15, [reg_p1+96] + mov [reg_p1+40], r8 + mov [reg_p1+48], r9 + mov [reg_p1+56], r10 + mov [reg_p1+64], r11 + mov [reg_p1+72], r12 + mov [reg_p1+80], r13 + mov [reg_p1+88], r14 + mov [reg_p1+96], r15 + mov r8, [reg_p1+104] + mov r9, [reg_p1+112] + mov r10, [reg_p1+120] + adc r8, 0 + adc r9, 0 + adc r10, 0 + mov [reg_p1+104], r8 + mov [reg_p1+112], r9 + mov [reg_p1+120], r10 + + // a[4-5] x p503p1_nz --> result: r8:r14 + MUL128x320_SCHOOL [reg_p1+32], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 + + xor r15, r15 + xor rbx, rbx + add r8, [reg_p1+56] + adc r9, [reg_p1+64] + adc r10, [reg_p1+72] + adc r11, [reg_p1+80] + adc r12, [reg_p1+88] + adc r13, [reg_p1+96] + adc r14, [reg_p1+104] + adc r15, [reg_p1+112] + adc rbx, [reg_p1+120] + mov [reg_p1+56], r8 + mov [reg_p2], r9 // Final result c0 + mov [reg_p1+72], r10 + mov [reg_p1+80], r11 + mov [reg_p1+88], r12 + mov [reg_p1+96], r13 + mov [reg_p1+104], r14 + mov [reg_p1+112], r15 + mov [reg_p1+120], rbx + + // a[6-7] x p503p1_nz --> result: r8:r14 + MUL128x320_SCHOOL [reg_p1+48], [p503p1_nz], r8, r9, r10, r11, r12, r13, r14, rbx, rcx, r15 + + // Final result c1:c7 + add r8, [reg_p1+72] + adc r9, [reg_p1+80] + adc r10, [reg_p1+88] + adc r11, [reg_p1+96] + adc r12, [reg_p1+104] + adc r13, [reg_p1+112] + adc r14, [reg_p1+120] + mov [reg_p2+8], r8 + mov [reg_p2+16], r9 + mov [reg_p2+24], r10 + mov [reg_p2+32], r11 + mov [reg_p2+40], r12 + mov [reg_p2+48], r13 + mov [reg_p2+56], r14 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbx + ret + + #else + +//*********************************************************************** +// Montgomery 
reduction +// Based on comba method +// Operation: c [reg_p2] = a [reg_p1] +// NOTE: a=c is not allowed +//*********************************************************************** +.global rdc503_asm +rdc503_asm: + push r12 + push r13 + push r14 + push r15 + + mov r11, [reg_p1] + movq rax, p503p1_3 + mul r11 + xor r8, r8 + add rax, [reg_p1+24] + mov [reg_p2+24], rax // z3 + adc r8, rdx + + xor r9, r9 + movq rax, p503p1_4 + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+8] + movq rax, p503p1_3 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+32] + mov [reg_p2+32], r8 // z4 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p503p1_5 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p503p1_4 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p1+16] + movq rax, p503p1_3 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+40] + mov [reg_p2+40], r9 // z5 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p503p1_6 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p503p1_5 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p503p1_4 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p2+24] + movq rax, p503p1_3 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+48] + mov [reg_p2+48], r10 // z6 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p503p1_7 + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p503p1_6 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p503p1_5 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p503p1_4 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p2+32] + movq rax, p503p1_3 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+56] + mov [reg_p2+56], r8 // z7 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p503p1_7 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p503p1_6 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p503p1_5 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p503p1_4 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rcx, [reg_p2+40] + movq rax, p503p1_3 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+64] + mov [reg_p2], r9 // z0 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p503p1_7 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p503p1_6 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p503p1_5 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p503p1_4 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r13, [reg_p2+48] + movq rax, p503p1_3 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+72] + mov [reg_p2+8], r10 // z1 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p503p1_7 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p503p1_6 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p503p1_5 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p503p1_4 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r14, [reg_p2+56] + movq rax, p503p1_3 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+80] + mov [reg_p2+16], r8 // z2 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p503p1_7 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p503p1_6 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p503p1_5 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + 
movq rax, p503p1_4 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+88] + mov [reg_p2+24], r9 // z3 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p503p1_7 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p503p1_6 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p503p1_5 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+96] + mov [reg_p2+32], r10 // z4 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p503p1_7 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p503p1_6 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+104] // z5 + mov [reg_p2+40], r8 // z5 + adc r9, 0 + adc r10, 0 + + movq rax, p503p1_7 + mul r14 + add r9, rax + adc r10, rdx + add r9, [reg_p1+112] // z6 + mov [reg_p2+48], r9 // z6 + adc r10, 0 + add r10, [reg_p1+120] // z7 + mov [reg_p2+56], r10 // z7 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + #endif + + +//*********************************************************************** +// 503-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global mp_add503_asm +mp_add503_asm: + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + + mov r8, [reg_p1+32] + mov r9, [reg_p1+40] + mov r10, [reg_p1+48] + mov r11, [reg_p1+56] + adc r8, [reg_p2+32] + adc r9, [reg_p2+40] + adc r10, [reg_p2+48] + adc r11, [reg_p2+56] + mov [reg_p3+32], r8 + mov [reg_p3+40], r9 + mov [reg_p3+48], r10 + mov [reg_p3+56], r11 + ret + + +//*********************************************************************** +// 2x503-bit multiprecision subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
Returns borrow mask +//*********************************************************************** +.global mp_sub503x2_asm +mp_sub503x2_asm: + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov rcx, [reg_p1+32] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb rcx, [reg_p2+32] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], rcx + + mov r8, [reg_p1+40] + mov r9, [reg_p1+48] + mov r10, [reg_p1+56] + mov r11, [reg_p1+64] + mov rcx, [reg_p1+72] + sbb r8, [reg_p2+40] + sbb r9, [reg_p2+48] + sbb r10, [reg_p2+56] + sbb r11, [reg_p2+64] + sbb rcx, [reg_p2+72] + mov [reg_p3+40], r8 + mov [reg_p3+48], r9 + mov [reg_p3+56], r10 + mov [reg_p3+64], r11 + mov [reg_p3+72], rcx + + mov r8, [reg_p1+80] + mov r9, [reg_p1+88] + mov r10, [reg_p1+96] + mov r11, [reg_p1+104] + mov rcx, [reg_p1+112] + sbb r8, [reg_p2+80] + sbb r9, [reg_p2+88] + sbb r10, [reg_p2+96] + sbb r11, [reg_p2+104] + sbb rcx, [reg_p2+112] + mov [reg_p3+80], r8 + mov [reg_p3+88], r9 + mov [reg_p3+96], r10 + mov [reg_p3+104], r11 + mov [reg_p3+112], rcx + + mov r8, [reg_p1+120] + sbb r8, [reg_p2+120] + sbb rax, 0 + mov [reg_p3+120], r8 + ret + + +//*********************************************************************** +// Double 2x503-bit multiprecision subtraction +// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global mp_dblsub503x2_asm +mp_dblsub503x2_asm: + push r12 + push r13 + push r14 + + xor rax, rax + mov r8, [reg_p3] + mov r9, [reg_p3+8] + mov r10, [reg_p3+16] + mov r11, [reg_p3+24] + mov r12, [reg_p3+32] + mov r13, [reg_p3+40] + mov r14, [reg_p3+48] + mov rcx, [reg_p3+56] + sub r8, [reg_p1] + sbb r9, [reg_p1+8] + sbb r10, [reg_p1+16] + sbb r11, [reg_p1+24] + sbb r12, [reg_p1+32] + sbb r13, [reg_p1+40] + sbb r14, [reg_p1+48] + sbb rcx, [reg_p1+56] + adc rax, 0 + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb rcx, [reg_p2+56] + adc rax, 0 + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], rcx + + mov r8, [reg_p3+64] + mov r9, [reg_p3+72] + mov r10, [reg_p3+80] + mov r11, [reg_p3+88] + mov r12, [reg_p3+96] + mov r13, [reg_p3+104] + mov r14, [reg_p3+112] + mov rcx, [reg_p3+120] + sub r8, rax + sbb r8, [reg_p1+64] + sbb r9, [reg_p1+72] + sbb r10, [reg_p1+80] + sbb r11, [reg_p1+88] + sbb r12, [reg_p1+96] + sbb r13, [reg_p1+104] + sbb r14, [reg_p1+112] + sbb rcx, [reg_p1+120] + sub r8, [reg_p2+64] + sbb r9, [reg_p2+72] + sbb r10, [reg_p2+80] + sbb r11, [reg_p2+88] + sbb r12, [reg_p2+96] + sbb r13, [reg_p2+104] + sbb r14, [reg_p2+112] + sbb rcx, [reg_p2+120] + mov [reg_p3+64], r8 + mov [reg_p3+72], r9 + mov [reg_p3+80], r10 + mov [reg_p3+88], r11 + mov [reg_p3+96], r12 + mov [reg_p3+104], r13 + mov [reg_p3+112], r14 + mov [reg_p3+120], rcx + + pop r14 + pop r13 + pop r12 + ret diff --git a/third_party/sidh/src/P503/ARM64/fp_arm64.c b/third_party/sidh/src/P503/ARM64/fp_arm64.c new file mode 100644 index 00000000..e92c40d6 --- /dev/null +++ b/third_party/sidh/src/P503/ARM64/fp_arm64.c @@ -0,0 +1,93 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* 
Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P503
+*********************************************************************************************/
+
+#include "../P503_internal.h"
+
+// Global constants
+extern const uint64_t p503[NWORDS_FIELD];
+extern const uint64_t p503x2[NWORDS_FIELD];
+
+
+__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Modular addition, c = a+b mod p503.
+  // Inputs: a, b in [0, 2*p503-1]
+  // Output: c in [0, 2*p503-1]
+
+    fpadd503_asm(a, b, c);
+}
+
+
+__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c)
+{ // Modular subtraction, c = a-b mod p503.
+  // Inputs: a, b in [0, 2*p503-1]
+  // Output: c in [0, 2*p503-1]
+
+    fpsub503_asm(a, b, c);
+}
+
+
+__inline void fpneg503(digit_t* a)
+{ // Modular negation, a = -a mod p503.
+  // Input/output: a in [0, 2*p503-1]
+    unsigned int i, borrow = 0;
+
+    for (i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(borrow, ((digit_t*)p503x2)[i], a[i], borrow, a[i]);
+    }
+}
+
+
+void fpdiv2_503(const digit_t* a, digit_t* c)
+{ // Modular division by two, c = a/2 mod p503.
+  // Input : a in [0, 2*p503-1]
+  // Output: c in [0, 2*p503-1]
+    unsigned int i, carry = 0;
+    digit_t mask;
+
+    mask = 0 - (digit_t)(a[0] & 1);    // If a is odd compute a+p503
+    for (i = 0; i < NWORDS_FIELD; i++) {
+        ADDC(carry, a[i], ((digit_t*)p503)[i] & mask, carry, c[i]);
+    }
+
+    mp_shiftr1(c, NWORDS_FIELD);
+}
+
+
+void fpcorrection503(digit_t* a)
+{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1].
+    unsigned int i, borrow = 0;
+    digit_t mask;
+
+    for (i = 0; i < NWORDS_FIELD; i++) {
+        SUBC(borrow, a[i], ((digit_t*)p503)[i], borrow, a[i]);
+    }
+    mask = 0 - (digit_t)borrow;
+
+    borrow = 0;
+    for (i = 0; i < NWORDS_FIELD; i++) {
+        ADDC(borrow, a[i], ((digit_t*)p503)[i] & mask, borrow, a[i]);
+    }
+}
+
+
+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
+{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
+
+    UNREFERENCED_PARAMETER(nwords);
+
+    mul503_asm(a, b, c);
+}
+
+
+void rdc_mont(const digit_t* ma, digit_t* mc)
+{ // Montgomery reduction exploiting the special form of the prime.
+  // mc = ma*R^-1 mod p503x2, where R = 2^512.
+  // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1].
+  // ma is assumed to be in Montgomery representation.
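+  // Editor's sketch (not upstream code): a full Montgomery multiplication in
+  // GF(p503) is the multiply/reduce pair that fpmul503_mont() in fpx.c wraps:
+  //
+  //     digit_t t[2*NWORDS_FIELD];
+  //     mp_mul(a, b, t, NWORDS_FIELD);    // t = a*b, double-precision product
+  //     rdc_mont(t, c);                   // c = t*2^-512 mod p503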
+ + rdc503_asm(ma, mc); +} diff --git a/third_party/sidh/src/P503/ARM64/fp_arm64_asm.S b/third_party/sidh/src/P503/ARM64/fp_arm64_asm.S new file mode 100644 index 00000000..ada3a40f --- /dev/null +++ b/third_party/sidh/src/P503/ARM64/fp_arm64_asm.S @@ -0,0 +1,829 @@ +//******************************************************************************************* +// SIDH: an efficient supersingular isogeny cryptography library +// +// Abstract: field arithmetic in 64-bit ARMv8 assembly for P503 on Linux +//******************************************************************************************* + +.data + +// p503 + 1 +p503p1: +.quad 0xAC00000000000000 +.quad 0x13085BDA2211E7A0 +.quad 0x1B9BF6C87B7E7DAF +.quad 0x6045C6BDDA77A4D0 +.quad 0x004066F541811E1E + +// 2 * p503 +p503x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0x57FFFFFFFFFFFFFF +.quad 0x2610B7B44423CF41 +.quad 0x3737ED90F6FCFB5E +.quad 0xC08B8D7BB4EF49A0 +.quad 0x0080CDEA83023C3C + +p503p1_nz_s8: +.quad 0x85BDA2211E7A0AC +.quad 0x9BF6C87B7E7DAF13 +.quad 0x45C6BDDA77A4D01B +.quad 0x4066F541811E1E60 + + +.text +//*********************************************************************** +// Field addition +// Operation: c [x2] = a [x0] + b [x1] +//*********************************************************************** +.global fpadd503_asm +fpadd503_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + + // Add a + b + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adc x10, x10, x18 + + // Subtract 2xp503 + ldr x11, p503x2 + ldr x12, p503x2 + 8 + ldr x13, p503x2 + 16 + ldr x14, p503x2 + 24 + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x12 + sbcs x6, x6, x13 + sbcs x7, x7, x14 + ldr x15, p503x2 + 32 + ldr x16, p503x2 + 40 + ldr x17, p503x2 + 48 + sbcs x8, x8, x15 + sbcs x9, x9, x16 + sbcs x10, x10, x17 + sbc x18, xzr, xzr + + // Add 2xp503 anded with the mask in x18 + and x11, x11, x18 + and x12, x12, x18 + and x13, x13, x18 + and x14, x14, x18 + and x15, x15, x18 + and x16, x16, x18 + and x17, x17, x18 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + ret + + +//*********************************************************************** +// Field subtraction +// Operation: c [x2] = a [x0] - b [x1] +//*********************************************************************** +.global fpsub503_asm +fpsub503_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + + // Subtract a - b + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbcs x10, x10, x18 + sbc x18, xzr, xzr + + // Add 2xp503 anded with the mask in x18 + ldr x11, p503x2 + ldr x12, p503x2 + 8 + ldr x13, p503x2 + 16 + ldr x14, p503x2 + 24 + and x11, x11, x18 + and x12, x12, x18 + and x13, x13, x18 + and x14, x14, x18 + ldr x15, p503x2 + 32 + ldr x16, p503x2 + 40 + ldr x17, p503x2 + 48 + and x15, x15, x18 + and x16, x16, x18 + and x17, x17, x18 + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x12 + adcs x6, x6, 
x13 + adcs x7, x7, x14 + adcs x8, x8, x15 + adcs x9, x9, x16 + adc x10, x10, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + ret + + +//////////////////////////////////////////// MACRO +.macro MUL128_COMBA_CUT A0, A1, B0, B1, C0, C1, C2, C3, T0 + mul \A0, \A1, \B0 + umulh \B0, \A1, \B0 + adds \C1, \C1, \C3 + adc \C2, \C2, xzr + + mul \T0, \A1, \B1 + umulh \B1, \A1, \B1 + adds \C1, \C1, \A0 + adcs \C2, \C2, \B0 + adc \C3, xzr, xzr + + adds \C2, \C2, \T0 + adc \C3, \C3, \B1 +.endm + + +//////////////////////////////////////////// MACRO +.macro MUL256_KARATSUBA_COMBA M,A0,A1,A2,A3,B0,B1,B2,B3,C0,C1,C2,C3,C4,C5,C6,C7,T0,T1 + + // A0-A1 <- AH + AL, T0 <- mask + adds \A0, \A0, \A2 + adcs \A1, \A1, \A3 + adc \T0, xzr, xzr + + // C6, T1 <- BH + BL, C7 <- mask + adds \C6, \B0, \B2 + adcs \T1, \B1, \B3 + adc \C7, xzr, xzr + + // C0-C1 <- masked (BH + BL) + sub \C2, xzr, \T0 + sub \C3, xzr, \C7 + and \C0, \C6, \C2 + and \C1, \T1, \C2 + + // C4-C5 <- masked (AH + AL), T0 <- combined carry + and \C4, \A0, \C3 + and \C5, \A1, \C3 + mul \C2, \A0, \C6 + mul \C3, \A0, \T1 + and \T0, \T0, \C7 + + // C0-C1, T0 <- (AH+AL) x (BH+BL), part 1 + adds \C0, \C4, \C0 + umulh \C4, \A0, \T1 + adcs \C1, \C5, \C1 + umulh \C5, \A0, \C6 + adc \T0, \T0, xzr + + // C2-C5 <- (AH+AL) x (BH+BL), low part + MUL128_COMBA_CUT \A0, \A1, \C6, \T1, \C2, \C3, \C4, \C5, \C7 + ldp \A0, \A1, [\M,#0] + + // C2-C5, T0 <- (AH+AL) x (BH+BL), final part + adds \C4, \C0, \C4 + umulh \C7, \A0, \B0 + umulh \T1, \A0, \B1 + adcs \C5, \C1, \C5 + mul \C0, \A0, \B0 + mul \C1, \A0, \B1 + adc \T0, \T0, xzr + + // C0-C1, T1, C7 <- AL x BL + MUL128_COMBA_CUT \A0, \A1, \B0, \B1, \C0, \C1, \T1, \C7, \C6 + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL + mul \A0, \A2, \B2 + umulh \B0, \A2, \B2 + subs \C2, \C2, \C0 + sbcs \C3, \C3, \C1 + sbcs \C4, \C4, \T1 + mul \A1, \A2, \B3 + umulh \C6, \A2, \B3 + sbcs \C5, \C5, \C7 + sbc \T0, \T0, xzr + + // A0, A1, C6, B0 <- AH x BH + MUL128_COMBA_CUT \A2, \A3, \B2, \B3, \A0, \A1, \C6, \B0, \B1 + + // C2-C5, T0 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs \C2, \C2, \A0 + sbcs \C3, \C3, \A1 + sbcs \C4, \C4, \C6 + sbcs \C5, \C5, \B0 + sbc \T0, \T0, xzr + + adds \C2, \C2, \T1 + adcs \C3, \C3, \C7 + adcs \C4, \C4, \A0 + adcs \C5, \C5, \A1 + adcs \C6, \T0, \C6 + adc \C7, \B0, xzr +.endm + + +//*********************************************************************************** +// 512-bit integer multiplication using Karatsuba (two levels), Comba (lower level) +// Operation: c [x2] = a [x0] * b [x1] +//*********************************************************************************** +.global mul503_asm +mul503_asm: + sub sp, sp, #96 + stp x19, x20, [sp,#0] + stp x21, x22, [sp,#16] + stp x23, x24, [sp,#32] + stp x25, x26, [sp,#48] + stp x27, x28, [sp,#64] + str x29, [sp, #80] + + ldp x3, x4, [x0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + + // x26-x29 <- AH + AL, x7 <- mask + adds x26, x3, x7 + adcs x27, x4, x8 + adcs x28, x5, x9 + adcs x29, x6, x10 + adc x7, xzr, xzr + + // x11-x14 <- BH + BL, x8 <- mask + adds x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, x18 + adc x8, xzr, xzr + + // x15-x18 <- masked (BH + BL) + sub x9, xzr, x7 + sub x10, xzr, x8 + and x15, x11, x9 + and x16, x12, x9 + and x17, x13, x9 + and x18, x14, x9 + + // x19-x22 <- masked (AH + AL), x7 <- combined carry + and x19, x26, x10 + and x20, x27, x10 + and 
x21, x28, x10 + and x22, x29, x10 + and x7, x7, x8 + + // x15-x18, x7 <- masked (AH+AL) + masked (BH+BL), step 1 + adds x15, x15, x19 + adcs x16, x16, x20 + adcs x17, x17, x21 + adcs x18, x18, x22 + adc x7, x7, xzr + + // x8-x10,x19-x23 <- (AH+AL) x (BH+BL), low part + stp x26, x27, [x2,#0] + MUL256_KARATSUBA_COMBA x2, x26, x27, x28, x29, x11, x12, x13, x14, x8, x9, x10, x19, x20, x21, x22, x23, x24, x25 + + // x15-x18, x7 <- (AH+AL) x (BH+BL), final step + adds x15, x15, x20 + adcs x16, x16, x21 + adcs x17, x17, x22 + adcs x18, x18, x23 + adc x7, x7, xzr + + // x20-x27 <- AL x BL + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + MUL256_KARATSUBA_COMBA x0, x3, x4, x5, x6, x11, x12, x13, x14, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29 + + // x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL + subs x8, x8, x20 + sbcs x9, x9, x21 + sbcs x10, x10, x22 + sbcs x19, x19, x23 + sbcs x15, x15, x24 + sbcs x16, x16, x25 + sbcs x17, x17, x26 + sbcs x18, x18, x27 + sbc x7, x7, xzr + + stp x20, x21, [x2] + stp x22, x23, [x2,#16] + + ldp x3, x4, [x0,#32] + ldp x5, x6, [x0,#48] + ldp x11, x12, [x1,#32] + ldp x13, x14, [x1,#48] + + adds x8, x8, x24 + adcs x9, x9, x25 + adcs x10, x10, x26 + adcs x19, x19, x27 + adc x1, xzr, xzr + + // x20-x27 <- AH x BH + add x0, x0, #32 + MUL256_KARATSUBA_COMBA x0, x3, x4, x5, x6, x11, x12, x13, x14, x20, x21, x22, x23, x24, x25, x26, x27, x28, x29 + neg x1, x1 + + // x13-x14, x3-x5 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + subs x8, x8, x20 + sbcs x9, x9, x21 + sbcs x10, x10, x22 + sbcs x19, x19, x23 + sbcs x15, x15, x24 + sbcs x16, x16, x25 + sbcs x17, x17, x26 + sbcs x18, x18, x27 + sbc x7, x7, xzr + + stp x8, x9, [x2,#32] + stp x10, x19, [x2,#48] + + adds x1, x1, #1 + adcs x15, x15, x20 + adcs x16, x16, x21 + adcs x17, x17, x22 + adcs x18, x18, x23 + adcs x24, x7, x24 + adcs x25, x25, xzr + adcs x26, x26, xzr + adc x27, x27, xzr + + stp x15, x16, [x2,#64] + stp x17, x18, [x2,#80] + stp x24, x25, [x2,#96] + stp x26, x27, [x2,#112] + + ldp x19, x20, [sp,#0] + ldp x21, x22, [sp,#16] + ldp x23, x24, [sp,#32] + ldp x25, x26, [sp,#48] + ldp x27, x28, [sp,#64] + ldr x29, [sp,#80] + add sp, sp, #96 + ret + + +//////////////////////////////////////////// MACRO +.macro MUL128x256_COMBA_CUT A0, A1, B0, B1, B2, B3, C0, C1, C2, C3, C4, C5, T0, T1, T2, T3 + mul \T0, \A1, \B0 + umulh \T1, \A1, \B0 + adds \C1, \C1, \C3 + adc \C2, \C2, xzr + + mul \T2, \A0, \B2 + umulh \T3, \A0, \B2 + adds \C1, \C1, \T0 + adcs \C2, \C2, \T1 + adc \C3, xzr, xzr + + mul \T0, \A1, \B1 + umulh \T1, \A1, \B1 + adds \C2, \C2, \T2 + adcs \C3, \C3, \T3 + adc \C4, xzr, xzr + + mul \T2, \A0, \B3 + umulh \T3, \A0, \B3 + adds \C2, \C2, \T0 + adcs \C3, \C3, \T1 + adc \C4, \C4, xzr + + mul \T0, \A1, \B2 + umulh \T1, \A1, \B2 + adds \C3, \C3, \T2 + adcs \C4, \C4, \T3 + adc \C5, xzr, xzr + + mul \T2, \A1, \B3 + umulh \T3, \A1, \B3 + adds \C3, \C3, \T0 + adcs \C4, \C4, \T1 + adc \C5, \C5, xzr + adds \C4, \C4, \T2 + adc \C5, \C5, \T3 +.endm + + +//************************************************************************************** +// Montgomery reduction +// Based on method described in Faz-Hernandez et al. 
https://eprint.iacr.org/2017/1015 +// Operation: mc [x1] = ma [x0] +// NOTE: ma=mc is not allowed +//************************************************************************************** +.global rdc503_asm +rdc503_asm: + sub sp, sp, #96 + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + + ldp x2, x3, [x0,#0] // a[0-1] + + // Load the prime constant + ldr x24, p503p1_nz_s8 + 0 + ldr x25, p503p1_nz_s8 + 8 + ldr x26, p503p1_nz_s8 + 16 + ldr x27, p503p1_nz_s8 + 24 + + // a[0-1] x p503p1_nz_s8 --> result: x4:x9 + mul x4, x2, x24 // a[0] x p503p1_nz_s8[0] + umulh x7, x2, x24 + mul x5, x2, x25 // a[0] x p503p1_nz_s8[1] + umulh x6, x2, x25 + MUL128x256_COMBA_CUT x2, x3, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 + + ldp x3, x11, [x0,#16] // a[2] + ldp x12, x13, [x0,#32] + ldp x14, x15, [x0,#48] + + orr x10, xzr, x9, lsr #8 + lsl x9, x9, #56 + orr x9, x9, x8, lsr #8 + lsl x8, x8, #56 + orr x8, x8, x7, lsr #8 + lsl x7, x7, #56 + orr x7, x7, x6, lsr #8 + lsl x6, x6, #56 + orr x6, x6, x5, lsr #8 + lsl x5, x5, #56 + orr x5, x5, x4, lsr #8 + lsl x4, x4, #56 + + adds x11, x4, x11 // a[3] + adcs x12, x5, x12 // a[4] + adcs x13, x6, x13 + adcs x14, x7, x14 + adcs x15, x8, x15 + ldp x16, x17, [x0,#64] + ldp x18, x19, [x0,#80] + mul x4, x3, x24 // a[2] x p503p1_nz_s8[0] + umulh x7, x3, x24 + adcs x16, x9, x16 + adcs x17, x10, x17 + adcs x18, xzr, x18 + adcs x19, xzr, x19 + ldp x20, x21, [x0,#96] + ldp x22, x23, [x0,#112] + mul x5, x3, x25 // a[2] x p503p1_nz_s8[1] + umulh x6, x3, x25 + adcs x20, xzr, x20 + adcs x21, xzr, x21 + adcs x22, xzr, x22 + adc x23, xzr, x23 + + // a[2-3] x p503p1_nz_s8 --> result: x4:x9 + MUL128x256_COMBA_CUT x3, x11, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 + + orr x10, xzr, x9, lsr #8 + lsl x9, x9, #56 + orr x9, x9, x8, lsr #8 + lsl x8, x8, #56 + orr x8, x8, x7, lsr #8 + lsl x7, x7, #56 + orr x7, x7, x6, lsr #8 + lsl x6, x6, #56 + orr x6, x6, x5, lsr #8 + lsl x5, x5, #56 + orr x5, x5, x4, lsr #8 + lsl x4, x4, #56 + + adds x13, x4, x13 // a[5] + adcs x14, x5, x14 // a[6] + adcs x15, x6, x15 + adcs x16, x7, x16 + mul x4, x12, x24 // a[4] x p503p1_nz_s8[0] + umulh x7, x12, x24 + adcs x17, x8, x17 + adcs x18, x9, x18 + adcs x19, x10, x19 + adcs x20, xzr, x20 + mul x5, x12, x25 // a[4] x p503p1_nz_s8[1] + umulh x6, x12, x25 + adcs x21, xzr, x21 + adcs x22, xzr, x22 + adc x23, xzr, x23 + + // a[4-5] x p503p1_nz_s8 --> result: x4:x9 + MUL128x256_COMBA_CUT x12, x13, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 + + orr x10, xzr, x9, lsr #8 + lsl x9, x9, #56 + orr x9, x9, x8, lsr #8 + lsl x8, x8, #56 + orr x8, x8, x7, lsr #8 + lsl x7, x7, #56 + orr x7, x7, x6, lsr #8 + lsl x6, x6, #56 + orr x6, x6, x5, lsr #8 + lsl x5, x5, #56 + orr x5, x5, x4, lsr #8 + lsl x4, x4, #56 + + adds x15, x4, x15 // a[7] + adcs x16, x5, x16 // a[8] + adcs x17, x6, x17 + adcs x18, x7, x18 + mul x4, x14, x24 // a[6] x p503p1_nz_s8[0] + umulh x7, x14, x24 + adcs x19, x8, x19 + adcs x20, x9, x20 + adcs x21, x10, x21 + mul x5, x14, x25 // a[6] x p503p1_nz_s8[1] + umulh x6, x14, x25 + adcs x22, xzr, x22 + adc x23, xzr, x23 + + // a[6-7] x p503p1_nz_s8 --> result: x4:x9 + MUL128x256_COMBA_CUT x14, x15, x24, x25, x26, x27, x4, x5, x6, x7, x8, x9, x28, x29, x30, x10 + + orr x10, xzr, x9, lsr #8 + lsl x9, x9, #56 + orr x9, x9, x8, lsr #8 + lsl x8, x8, #56 + orr x8, x8, x7, lsr #8 + lsl x7, x7, #56 + orr x7, x7, x6, lsr #8 + lsl x6, x6, #56 + orr x6, x6, x5, lsr 
#8 + lsl x5, x5, #56 + orr x5, x5, x4, lsr #8 + lsl x4, x4, #56 + + adds x17, x4, x17 + adcs x18, x5, x18 + adcs x19, x6, x19 + adcs x20, x7, x20 + stp x16, x17, [x1,#0] // Final result + stp x18, x19, [x1,#16] + adcs x21, x8, x21 + adcs x22, x9, x22 + adc x23, x10, x23 + stp x20, x21, [x1,#32] + stp x22, x23, [x1,#48] + + ldp x19, x20, [sp] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp x25, x26, [sp, #48] + ldp x27, x28, [sp, #64] + ldp x29, x30, [sp, #80] + add sp, sp, #96 + ret + + +//*********************************************************************** +// 503-bit multiprecision addition +// Operation: c [x2] = a [x0] + b [x1] +//*********************************************************************** +.global mp_add503_asm +mp_add503_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adc x10, x10, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + ret + + +//*********************************************************************** +// 2x503-bit multiprecision addition +// Operation: c [x2] = a [x0] + b [x1] +//*********************************************************************** +.global mp_add503x2_asm +mp_add503x2_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + adds x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adcs x10, x10, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + adcs x3, x3, x11 + adcs x4, x4, x12 + adcs x5, x5, x13 + adcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x9, x10, [x0,#112] + ldp x15, x16, [x1,#96] + ldp x17, x18, [x1,#112] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adc x10, x10, x18 + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + stp x9, x10, [x2,#112] + ret + + +//*********************************************************************** +// 2x503-bit multiprecision subtraction +// Operation: c [x2] = a [x0] - b [x1]. 
Returns borrow mask +//*********************************************************************** +.global mp_sub503x2_asm +mp_sub503x2_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x11, x12, [x1,#0] + ldp x13, x14, [x1,#16] + subs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbcs x10, x10, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + + ldp x3, x4, [x0,#64] + ldp x5, x6, [x0,#80] + ldp x11, x12, [x1,#64] + ldp x13, x14, [x1,#80] + sbcs x3, x3, x11 + sbcs x4, x4, x12 + sbcs x5, x5, x13 + sbcs x6, x6, x14 + ldp x7, x8, [x0,#96] + ldp x9, x10, [x0,#112] + ldp x15, x16, [x1,#96] + ldp x17, x18, [x1,#112] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbcs x10, x10, x18 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#64] + stp x5, x6, [x2,#80] + stp x7, x8, [x2,#96] + stp x9, x10, [x2,#112] + ret + + +//*********************************************************************** +// Double 2x503-bit multiprecision subtraction +// Operation: c [x2] = c [x2] - a [x0] - b [x1] +//*********************************************************************** +.global mp_dblsub503x2_asm +mp_dblsub503x2_asm: + sub sp, sp, #32 + stp x27, x28, [sp, #0] + stp x29, x30, [sp, #16] + ldp x3, x4, [x2,#0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + ldp x9, x10, [x2,#48] + ldp x11, x12, [x2,#64] + ldp x13, x14, [x2,#80] + ldp x15, x16, [x2,#96] + ldp x17, x18, [x2,#112] + + ldp x27, x28, [x0,#0] + ldp x29, x30, [x0,#16] + subs x3, x3, x27 + sbcs x4, x4, x28 + sbcs x5, x5, x29 + sbcs x6, x6, x30 + ldp x27, x28, [x0,#32] + ldp x29, x30, [x0,#48] + sbcs x7, x7, x27 + sbcs x8, x8, x28 + sbcs x9, x9, x29 + sbcs x10, x10, x30 + ldp x27, x28, [x0,#64] + ldp x29, x30, [x0,#80] + sbcs x11, x11, x27 + sbcs x12, x12, x28 + sbcs x13, x13, x29 + sbcs x14, x14, x30 + ldp x27, x28, [x0,#96] + ldp x29, x30, [x0,#112] + sbcs x15, x15, x27 + sbcs x16, x16, x28 + sbcs x17, x17, x29 + sbc x18, x18, x30 + + ldp x27, x28, [x1,#0] + ldp x29, x30, [x1,#16] + subs x3, x3, x27 + sbcs x4, x4, x28 + sbcs x5, x5, x29 + sbcs x6, x6, x30 + ldp x27, x28, [x1,#32] + ldp x29, x30, [x1,#48] + sbcs x7, x7, x27 + sbcs x8, x8, x28 + sbcs x9, x9, x29 + sbcs x10, x10, x30 + ldp x27, x28, [x1,#64] + ldp x29, x30, [x1,#80] + sbcs x11, x11, x27 + sbcs x12, x12, x28 + sbcs x13, x13, x29 + sbcs x14, x14, x30 + ldp x27, x28, [x1,#96] + ldp x29, x30, [x1,#112] + sbcs x15, x15, x27 + sbcs x16, x16, x28 + sbcs x17, x17, x29 + sbc x18, x18, x30 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp x13, x14, [x2,#80] + stp x15, x16, [x2,#96] + stp x17, x18, [x2,#112] + + ldp x27, x28, [sp, #0] + ldp x29, x30, [sp, #16] + add sp, sp, #32 + ret diff --git a/third_party/sidh/src/P503/P503.c b/third_party/sidh/src/P503/P503.c new file mode 100644 index 00000000..dcd7a84c --- /dev/null +++ b/third_party/sidh/src/P503/P503.c @@ -0,0 +1,126 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: supersingular isogeny parameters and generation of functions for P503 +*********************************************************************************************/ + +#include "P503_api.h" +#include "P503_internal.h" + + +// Encoding of field 
elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: +// -------------------------------------------------------------------------------------------------- +// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). +// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. +// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. +// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. +// For example, a 503-bit field element is represented with Ceil(503 / 64) = 8 64-bit digits or Ceil(503 / 32) = 16 32-bit digits. + +// +// Curve isogeny system "SIDHp503". Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p503^2), where A=0, B=1, C=1 and p503 = 2^250*3^159-1 +// + +const uint64_t p503[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xABFFFFFFFFFFFFFF, + 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E }; +const uint64_t p503p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xAC00000000000000, + 0x13085BDA2211E7A0, 0x1B9BF6C87B7E7DAF, 0x6045C6BDDA77A4D0, 0x004066F541811E1E }; +const uint64_t p503x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0x57FFFFFFFFFFFFFF, + 0x2610B7B44423CF41, 0x3737ED90F6FCFB5E, 0xC08B8D7BB4EF49A0, 0x0080CDEA83023C3C }; +// Order of Alice's subgroup +const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0400000000000000 }; +// Order of Bob's subgroup +const uint64_t Bob_order[NWORDS64_ORDER] = { 0xC216F6888479E82B, 0xE6FDB21EDF9F6BC4, 0x1171AF769DE93406, 0x1019BD5060478798 }; +// Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p503^2), expressed in Montgomery representation +const uint64_t A_gen[5*NWORDS64_FIELD] = { 0xE7EF4AA786D855AF, 0xED5758F03EB34D3B, 0x09AE172535A86AA9, 0x237B9CC07D622723, + 0xE3A284CBA4E7932D, 0x27481D9176C5E63F, 0x6A323FF55C6E71BF, 0x002ECC31A6FB8773, // XPA0 + 0x64D02E4E90A620B8, 0xDAB8128537D4B9F1, 0x4BADF77B8A228F98, 0x0F5DBDF9D1FB7D1B, + 0xBEC4DB288E1A0DCC, 0xE76A8665E80675DB, 0x6D6F252E12929463, 0x003188BD1463FACC, // XPA1 + 0xB79D41025DE85D56, 0x0B867DA9DF169686, 0x740E5368021C827D, 0x20615D72157BF25C, + 0xFF1590013C9B9F5B, 0xC884DCADE8C16CEA, 0xEBD05E53BF724E01, 0x0032FEF8FDA5748C, // XQA0 + 0x12E2E849AA0A8006, 0x41CF47008635A1E8, 0x9CD720A70798AED7, 0x42A820B42FCF04CF, + 0x7BF9BAD32AAE88B1, 0xF619127A54090BBE, 0x1CB10D8F56408EAA, 0x001D6B54C3C0EDEB, // XRA0 + 0x34DB54931CBAAC36, 0x420A18CB8DD5F0C4, 0x32008C1A48C0F44D, 0x3B3BA772B1CFD44D, + 0xA74B058FDAF13515, 0x095FC9CA7EEC17B4, 0x448E829D28F120F8, 0x00261EC3ED16A489 }; // XRA1 +// Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p503^2), expressed in Montgomery representation +const uint64_t B_gen[5*NWORDS64_FIELD] = { 0x7EDE37F4FA0BC727, 0xF7F8EC5C8598941C, 0xD15519B516B5F5C8, 0xF6D5AC9B87A36282, + 0x7B19F105B30E952E, 0x13BD8B2025B4EBEE, 0x7B96D27F4EC579A2, 0x00140850CAB7E5DE, // XPB0 + 0x7764909DAE7B7B2D, 0x578ABB16284911AB, 0x76E2BFD146A6BF4D, 0x4824044B23AA02F0, + 0x1105048912A321F3, 0xB8A2E482CF0F10C1, 0x42FF7D0BE2152085, 0x0018E599C5223352, // XPB1 + 0x4256C520FB388820, 0x744FD7C3BAAF0A13, 
0x4B6A2DDDB12CBCB8, 0xE46826E27F427DF8, + 0xFE4A663CD505A61B, 0xD6B3A1BAF025C695, 0x7C3BB62B8FCC00BD, 0x003AFDDE4A35746C, // XQB0 + 0x75601CD1E6C0DFCB, 0x1A9007239B58F93E, 0xC1F1BE80C62107AC, 0x7F513B898F29FF08, + 0xEA0BEDFF43E1F7B2, 0x2C6D94018CBAE6D0, 0x3A430D31BCD84672, 0x000D26892ECCFE83, // XRB0 + 0x1119D62AEA3007A1, 0xE3702AA4E04BAE1B, 0x9AB96F7D59F990E7, 0xF58440E8B43319C0, + 0xAF8134BEE1489775, 0xE7F7774E905192AA, 0xF54AE09308E98039, 0x001EF7A041A86112 }; // XRB1 +// Montgomery constant Montgomery_R2 = (2^512)^2 mod p503 +const uint64_t Montgomery_R2[NWORDS64_FIELD] = { 0x5289A0CF641D011F, 0x9B88257189FED2B9, 0xA3B365D58DC8F17A, 0x5BC57AB6EFF168EC, + 0x9E51998BD84D4423, 0xBF8999CBAC3B5695, 0x46E9127BCE14CDB6, 0x003F6CFCE8B81771 }; +// Value one in Montgomery representation +const uint64_t Montgomery_one[NWORDS64_FIELD] = { 0x00000000000003F9, 0x0000000000000000, 0x0000000000000000, 0xB400000000000000, + 0x63CB1A6EA6DED2B4, 0x51689D8D667EB37D, 0x8ACD77C71AB24142, 0x0026FBAEC60F5953 }; +// Value (2^256)^2 mod 3^159 +const uint64_t Montgomery_Rprime[NWORDS64_ORDER] = { 0x0C2615CA3C5BAA99, 0x5A4FF3072AB6AA6A, 0xA6AFD4B039AD6AA2, 0x010DA06A26DD05CB }; +// Value -(3^159)^-1 mod 2^256 +const uint64_t Montgomery_rprime[NWORDS64_ORDER] = { 0x49C8A87190C0697D, 0x2EB7968EA0F0A558, 0x944257B696777FA2, 0xBAA4DDCD6139D2B3 }; +// Value order_Bob/3 mod p503 +const uint64_t Border_div3[NWORDS_ORDER] = { 0xEB5CFCD82C28A2B9, 0x4CFF3B5F9FDFCE96, 0xB07B3A7CDF4DBC02, 0x055DE9C5756D2D32 }; + + +// Fixed parameters for isogeny tree computation +const unsigned int strat_Alice[MAX_Alice-1] = { +61, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, +4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, +1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 29, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, +1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, +1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, 1, 2, 1, 1, 1 }; + +const unsigned int strat_Bob[MAX_Bob-1] = { +71, 38, 21, 13, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 4, 2, 1, 1, 2, 1, +1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 17, 9, +5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, +1, 4, 2, 1, 1, 2, 1, 1, 33, 17, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, +2, 1, 1, 8, 4, 2, 1, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 1, 2, +1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 }; + +// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex functions +#define fpcopy fpcopy503 +#define fpzero fpzero503 +#define fpadd fpadd503 +#define fpsub fpsub503 +#define fpneg fpneg503 +#define fpdiv2 fpdiv2_503 +#define fpcorrection fpcorrection503 +#define fpmul_mont fpmul503_mont +#define fpsqr_mont fpsqr503_mont +#define fpinv_mont fpinv503_mont +#define fpinv_chain_mont fpinv503_chain_mont +#define fpinv_mont_bingcd fpinv503_mont_bingcd +#define fp2copy fp2copy503 +#define fp2zero fp2zero503 +#define fp2add fp2add503 +#define fp2sub fp2sub503 +#define fp2neg fp2neg503 +#define fp2div2 fp2div2_503 +#define fp2correction fp2correction503 +#define fp2mul_mont fp2mul503_mont +#define fp2sqr_mont fp2sqr503_mont +#define fp2inv_mont fp2inv503_mont +#define fp2inv_mont_bingcd fp2inv503_mont_bingcd +#define fpequal_non_constant_time fpequal503_non_constant_time +#define mp_add_asm mp_add503_asm +#define mp_subx2_asm mp_sub503x2_asm +#define mp_dblsubx2_asm mp_dblsub503x2_asm +#define 
crypto_kem_keypair crypto_kem_keypair_SIKEp503 +#define crypto_kem_enc crypto_kem_enc_SIKEp503 +#define crypto_kem_dec crypto_kem_dec_SIKEp503 +#define random_mod_order_A random_mod_order_A_SIDHp503 +#define random_mod_order_B random_mod_order_B_SIDHp503 +#define EphemeralKeyGeneration_A EphemeralKeyGeneration_A_SIDHp503 +#define EphemeralKeyGeneration_B EphemeralKeyGeneration_B_SIDHp503 +#define EphemeralSecretAgreement_A EphemeralSecretAgreement_A_SIDHp503 +#define EphemeralSecretAgreement_B EphemeralSecretAgreement_B_SIDHp503 + +#include "../fpx.c" +#include "../ec_isogeny.c" +#include "../sidh.c" +#include "../sike.c" diff --git a/third_party/sidh/src/P503/P503_api.h b/third_party/sidh/src/P503/P503_api.h new file mode 100644 index 00000000..b595cf40 --- /dev/null +++ b/third_party/sidh/src/P503/P503_api.h @@ -0,0 +1,107 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: API header file for P503 +*********************************************************************************************/ + +#ifndef __P503_API_H__ +#define __P503_API_H__ + + +/*********************** Key encapsulation mechanism API ***********************/ + +#define CRYPTO_SECRETKEYBYTES 434 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes +#define CRYPTO_PUBLICKEYBYTES 378 +#define CRYPTO_BYTES 16 +#define CRYPTO_CIPHERTEXTBYTES 402 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes + +// Algorithm name +#define CRYPTO_ALGNAME "SIKEp503" + +// SIKE's key generation +// It produces a private key sk and computes the public key pk. +// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = 434 bytes) +// public key pk (CRYPTO_PUBLICKEYBYTES = 378 bytes) +int crypto_kem_keypair_SIKEp503(unsigned char *pk, unsigned char *sk); + +// SIKE's encapsulation +// Input: public key pk (CRYPTO_PUBLICKEYBYTES = 378 bytes) +// Outputs: shared secret ss (CRYPTO_BYTES = 16 bytes) +// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 402 bytes) +int crypto_kem_enc_SIKEp503(unsigned char *ct, unsigned char *ss, const unsigned char *pk); + +// SIKE's decapsulation +// Input: secret key sk (CRYPTO_SECRETKEYBYTES = 434 bytes) +// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 402 bytes) +// Outputs: shared secret ss (CRYPTO_BYTES = 16 bytes) +int crypto_kem_dec_SIKEp503(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + + +// Encoding of keys for KEM-based isogeny system "SIKEp503" (wire format): +// ---------------------------------------------------------------------- +// Elements over GF(p503) are encoded in 63 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). +// Elements (a+b*i) over GF(p503^2), where a and b are defined over GF(p503), are encoded as {a, b}, with a in the lowest memory portion. +// +// Private keys sk consist of the concatenation of a 24-byte random value, a value in the range [0, 2^252-1] and the public key pk. In the SIKE API, +// private keys are encoded in 434 octets in little endian format. +// Public keys pk consist of 3 elements in GF(p503^2). In the SIKE API, pk is encoded in 378 octets. +// Ciphertexts ct consist of the concatenation of a public key value and a 24-byte value. In the SIKE API, ct is encoded in 378 + 24 = 402 octets. +// Shared keys ss consist of a value of 16 octets. 
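+// Editor's usage sketch (not upstream text): a KEM round trip with the
+// functions and sizes declared above; error handling omitted.
+//
+//     unsigned char pk[CRYPTO_PUBLICKEYBYTES], sk[CRYPTO_SECRETKEYBYTES];
+//     unsigned char ct[CRYPTO_CIPHERTEXTBYTES];
+//     unsigned char ss_e[CRYPTO_BYTES], ss_d[CRYPTO_BYTES];
+//
+//     crypto_kem_keypair_SIKEp503(pk, sk);    // generate (pk, sk)
+//     crypto_kem_enc_SIKEp503(ct, ss_e, pk);  // encapsulate to pk
+//     crypto_kem_dec_SIKEp503(ss_d, ct, sk);  // decapsulate with sk
+//     // On success, ss_e and ss_d hold the same 16-byte shared secret.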
+ + +/*********************** Ephemeral key exchange API ***********************/ + +#define SIDH_SECRETKEYBYTES 32 +#define SIDH_PUBLICKEYBYTES 378 +#define SIDH_BYTES 126 + +// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys. +// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. +// Extended version available at: http://eprint.iacr.org/2016/859 + +// Generation of Alice's secret key +// Outputs random value in [0, 2^250 - 1] to be used as Alice's private key +void random_mod_order_A_SIDHp503(unsigned char* random_digits); + +// Generation of Bob's secret key +// Outputs random value in [0, 2^Floor(Log(2,3^159)) - 1] to be used as Bob's private key +void random_mod_order_B_SIDHp503(unsigned char* random_digits); + +// Alice's ephemeral public key generation +// Input: a private key PrivateKeyA in the range [0, 2^250 - 1], stored in 32 bytes. +// Output: the public key PublicKeyA consisting of 3 GF(p503^2) elements encoded in 378 bytes. +int EphemeralKeyGeneration_A_SIDHp503(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); + +// Bob's ephemeral key-pair generation +// It produces a private key PrivateKeyB and computes the public key PublicKeyB. +// The private key is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. +// The public key consists of 3 GF(p503^2) elements encoded in 378 bytes. +int EphemeralKeyGeneration_B_SIDHp503(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); + +// Alice's ephemeral shared secret computation +// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB +// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^250 - 1], stored in 32 bytes. +// Bob's PublicKeyB consists of 3 GF(p503^2) elements encoded in 378 bytes. +// Output: a shared secret SharedSecretA that consists of one element in GF(p503^2) encoded in 126 bytes. +int EphemeralSecretAgreement_A_SIDHp503(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); + +// Bob's ephemeral shared secret computation +// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA +// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^159)) - 1], stored in 32 bytes. +// Alice's PublicKeyA consists of 3 GF(p503^2) elements encoded in 378 bytes. +// Output: a shared secret SharedSecretB that consists of one element in GF(p503^2) encoded in 126 bytes. +int EphemeralSecretAgreement_B_SIDHp503(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); + + +// Encoding of keys for KEX-based isogeny system "SIDHp503" (wire format): +// ---------------------------------------------------------------------- +// Elements over GF(p503) are encoded in 63 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). +// Elements (a+b*i) over GF(p503^2), where a and b are defined over GF(p503), are encoded as {a, b}, with a in the lowest memory portion. +// +// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^250-1] and [0, 2^252-1], resp. In the SIDH API, private keys are encoded +// in 32 octets in little endian format. +// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p503^2). 
In the SIDH API, they are encoded in 378 octets.
+// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p503^2). In the SIDH API, they are encoded in 126 octets.
+
+
+#endif
\ No newline at end of file
diff --git a/third_party/sidh/src/P503/P503_internal.h b/third_party/sidh/src/P503/P503_internal.h
new file mode 100644
index 00000000..33dadaa1
--- /dev/null
+++ b/third_party/sidh/src/P503/P503_internal.h
@@ -0,0 +1,246 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: internal header file for P503
+*********************************************************************************************/
+
+#ifndef __P503_INTERNAL_H__
+#define __P503_INTERNAL_H__
+
+#include "../config.h"
+
+
+#if (TARGET == TARGET_AMD64)
+    #define NWORDS_FIELD    8       // Number of words of a 503-bit field element
+    #define p503_ZERO_WORDS 3       // Number of "0" digits in the least significant part of p503 + 1
+#elif (TARGET == TARGET_x86)
+    #define NWORDS_FIELD    16
+    #define p503_ZERO_WORDS 7
+#elif (TARGET == TARGET_ARM)
+    #define NWORDS_FIELD    16
+    #define p503_ZERO_WORDS 7
+#elif (TARGET == TARGET_ARM64)
+    #define NWORDS_FIELD    8
+    #define p503_ZERO_WORDS 3
+#endif
+
+
+// Basic constants
+
+#define NBITS_FIELD     503
+#define MAXBITS_FIELD   512
+#define MAXWORDS_FIELD  ((MAXBITS_FIELD+RADIX-1)/RADIX)   // Max. number of words to represent field elements
+#define NWORDS64_FIELD  ((NBITS_FIELD+63)/64)             // Number of 64-bit words of a 503-bit field element
+#define NBITS_ORDER     256
+#define NWORDS_ORDER    ((NBITS_ORDER+RADIX-1)/RADIX)     // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp.
+#define NWORDS64_ORDER  ((NBITS_ORDER+63)/64)             // Number of 64-bit words of a 256-bit element
+#define MAXBITS_ORDER   NBITS_ORDER
+#define MAXWORDS_ORDER  ((MAXBITS_ORDER+RADIX-1)/RADIX)   // Max. number of words to represent elements in [1, oA-1] or [1, oB].
+#define ALICE           0
+#define BOB             1
+#define OALICE_BITS     250
+#define OBOB_BITS       253
+#define OBOB_EXPON      159
+#define MASK_ALICE      0x03
+#define MASK_BOB        0x0F
+#define PRIME           p503
+#define PARAM_A         0
+#define PARAM_C         1
+// Fixed parameters for isogeny tree computation
+#define MAX_INT_POINTS_ALICE    7
+#define MAX_INT_POINTS_BOB      8
+#define MAX_Alice               125
+#define MAX_Bob                 159
+#define MSG_BYTES               24
+#define SECRETKEY_A_BYTES       (OALICE_BITS + 7) / 8
+#define SECRETKEY_B_BYTES       (OBOB_BITS + 7) / 8
+#define FP2_ENCODED_BYTES       2*((NBITS_FIELD + 7) / 8)
+
+
+// SIDH's basic element definitions and point representations
+
+typedef digit_t felm_t[NWORDS_FIELD];       // Datatype for representing 503-bit field elements (512-bit max.)
+typedef digit_t dfelm_t[2*NWORDS_FIELD];    // Datatype for representing double-precision 2x503-bit field elements (1024-bit max.)
+typedef felm_t  f2elm_t[2];                 // Datatype for representing quadratic extension field elements GF(p503^2)
+
+typedef struct { f2elm_t X; f2elm_t Z; } point_proj;    // Point representation in projective XZ Montgomery coordinates.
+typedef point_proj point_proj_t[1];
+
+
+
+/**************** Function prototypes ****************/
+/************* Multiprecision functions **************/
+
+// Copy wordsize digits, c = a, where lng(a) = nwords
+void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords);
+
+// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit
+unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+
+// 503-bit multiprecision addition, c = a+b
+void mp_add503(const digit_t* a, const digit_t* b, digit_t* c);
+void mp_add503_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit
+unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+digit_t mp_sub503x2_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Double 2x503-bit multiprecision subtraction, c = c-a-b, where c > a and c > b
+void mp_dblsub503x2_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Multiprecision left shift
+void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords);
+
+// Multiprecision right shift by one
+void mp_shiftr1(digit_t* x, const unsigned int nwords);
+
+// Multiprecision left shift by one
+void mp_shiftl1(digit_t* x, const unsigned int nwords);
+
+// Digit multiplication, digit * digit -> 2-digit result
+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c);
+
+// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords.
+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+
+/************ Field arithmetic functions *************/
+
+// Copy of a field element, c = a
+void fpcopy503(const digit_t* a, digit_t* c);
+
+// Zeroing a field element, a = 0
+void fpzero503(digit_t* a);
+
+// Non constant-time comparison of two field elements. If a = b return TRUE, otherwise return FALSE
+bool fpequal503_non_constant_time(const digit_t* a, const digit_t* b);
+
+// Modular addition, c = a+b mod p503
+extern void fpadd503(const digit_t* a, const digit_t* b, digit_t* c);
+extern void fpadd503_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Modular subtraction, c = a-b mod p503
+extern void fpsub503(const digit_t* a, const digit_t* b, digit_t* c);
+extern void fpsub503_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Modular negation, a = -a mod p503
+extern void fpneg503(digit_t* a);
+
+// Modular division by two, c = a/2 mod p503.
+void fpdiv2_503(const digit_t* a, digit_t* c);
+
+// Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1].
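+// (Editor's note: implemented branch-free as a conditional subtraction:
+// subtract p503, then add p503 back masked by the resulting borrow; see the
+// portable version in generic/fp_generic.c.)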
+void fpcorrection503(digit_t* a);
+
+// 503-bit Montgomery reduction, c = a*R^-1 mod p503, where R = 2^512
+void rdc_mont(const digit_t* a, digit_t* c);
+
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p503, where R=2^512
+void fpmul503_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void mul503_asm(const digit_t* a, const digit_t* b, digit_t* c);
+void rdc503_asm(const digit_t* ma, digit_t* mc);
+
+// Field squaring using Montgomery arithmetic, c = a^2*R^-1 mod p503, where R=2^512
+void fpsqr503_mont(const digit_t* ma, digit_t* mc);
+
+// Conversion to Montgomery representation
+void to_mont(const digit_t* a, digit_t* mc);
+
+// Conversion from Montgomery representation to standard representation
+void from_mont(const digit_t* ma, digit_t* c);
+
+// Field inversion, a = a^-1 in GF(p503)
+void fpinv503_mont(digit_t* a);
+
+// Field inversion, a = a^-1 in GF(p503) using the binary GCD
+void fpinv503_mont_bingcd(digit_t* a);
+
+// Chain to compute (p503-3)/4 using Montgomery arithmetic
+void fpinv503_chain_mont(digit_t* a);
+
+/************ GF(p^2) arithmetic functions *************/
+
+// Copy of a GF(p503^2) element, c = a
+void fp2copy503(const f2elm_t a, f2elm_t c);
+
+// Zeroing a GF(p503^2) element, a = 0
+void fp2zero503(f2elm_t a);
+
+// GF(p503^2) negation, a = -a in GF(p503^2)
+void fp2neg503(f2elm_t a);
+
+// GF(p503^2) addition, c = a+b in GF(p503^2)
+extern void fp2add503(const f2elm_t a, const f2elm_t b, f2elm_t c);
+
+// GF(p503^2) subtraction, c = a-b in GF(p503^2)
+extern void fp2sub503(const f2elm_t a, const f2elm_t b, f2elm_t c);
+
+// GF(p503^2) division by two, c = a/2 in GF(p503^2)
+void fp2div2_503(const f2elm_t a, f2elm_t c);
+
+// Modular correction, a = a in GF(p503^2)
+void fp2correction503(f2elm_t a);
+
+// GF(p503^2) squaring using Montgomery arithmetic, c = a^2 in GF(p503^2)
+void fp2sqr503_mont(const f2elm_t a, f2elm_t c);
+
+// GF(p503^2) multiplication using Montgomery arithmetic, c = a*b in GF(p503^2)
+void fp2mul503_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+
+// Conversion of a GF(p503^2) element to Montgomery representation
+void to_fp2mont(const f2elm_t a, f2elm_t mc);
+
+// Conversion of a GF(p503^2) element from Montgomery representation to standard representation
+void from_fp2mont(const f2elm_t ma, f2elm_t c);
+
+// GF(p503^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
+void fp2inv503_mont(f2elm_t a);
+
+// GF(p503^2) inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p503) inversion done using the binary GCD
+void fp2inv503_mont_bingcd(f2elm_t a);
+
+// n-way Montgomery inversion
+void mont_n_way_inv(const f2elm_t* vec, const int n, f2elm_t* out);
+
+/************ Elliptic curve and isogeny functions *************/
+
+// Computes the j-invariant of a Montgomery curve with projective constant.
+void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv);
+
+// Simultaneous doubling and differential addition.
+void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24);
+
+// Doubling of a Montgomery point in projective coordinates (X:Z).
+void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24);
+
+// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings.
+void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e);
+
+// Differential addition.
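+// (Editor's note: x-only Montgomery arithmetic cannot add two arbitrary
+// points, so xADD computes x(P+Q) from x(P), x(Q) and the fixed difference
+// x(P-Q) supplied in xPQ.)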
+void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ); + +// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. +void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff); + +// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. +void eval_4_isog(point_proj_t P, f2elm_t* coeff); + +// Tripling of a Montgomery point in projective coordinates (X:Z). +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus); + +// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. +void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e); + +// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. +void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff); + +// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. +void eval_3_isog(point_proj_t Q, const f2elm_t* coeff); + +// 3-way simultaneous inversion +void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3); + +// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. +void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); + + +#endif diff --git a/third_party/sidh/src/P503/generic/fp_generic.c b/third_party/sidh/src/P503/generic/fp_generic.c new file mode 100644 index 00000000..d8dab8ac --- /dev/null +++ b/third_party/sidh/src/P503/generic/fp_generic.c @@ -0,0 +1,224 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: portable modular arithmetic for P503 +*********************************************************************************************/ + +#include "../P503_internal.h" + + +// Global constants +extern const uint64_t p503[NWORDS_FIELD]; +extern const uint64_t p503p1[NWORDS_FIELD]; +extern const uint64_t p503x2[NWORDS_FIELD]; + + +__inline void fpadd503(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular addition, c = a+b mod p503. + // Inputs: a, b in [0, 2*p503-1] + // Output: c in [0, 2*p503-1] + unsigned int i, carry = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(carry, c[i], ((digit_t*)p503x2)[i], carry, c[i]); + } + mask = 0 - (digit_t)carry; + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, c[i], ((digit_t*)p503x2)[i] & mask, carry, c[i]); + } +} + + +__inline void fpsub503(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular subtraction, c = a-b mod p503. + // Inputs: a, b in [0, 2*p503-1] + // Output: c in [0, 2*p503-1] + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, c[i], ((digit_t*)p503x2)[i] & mask, borrow, c[i]); + } +} + + +__inline void fpneg503(digit_t* a) +{ // Modular negation, a = -a mod p503. 
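+    // (Editor's note: computed as a = 2*p503 - a, which needs no branch and
+    // keeps the result in the same unreduced range.)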
+ // Input/output: a in [0, 2*p503-1] + unsigned int i, borrow = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, ((digit_t*)p503x2)[i], a[i], borrow, a[i]); + } +} + + +void fpdiv2_503(const digit_t* a, digit_t* c) +{ // Modular division by two, c = a/2 mod p503. + // Input : a in [0, 2*p503-1] + // Output: c in [0, 2*p503-1] + unsigned int i, carry = 0; + digit_t mask; + + mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p503 + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], ((digit_t*)p503)[i] & mask, carry, c[i]); + } + + mp_shiftr1(c, NWORDS_FIELD); +} + + +void fpcorrection503(digit_t* a) +{ // Modular correction to reduce field element a in [0, 2*p503-1] to [0, p503-1]. + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], ((digit_t*)p503)[i], borrow, a[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, a[i], ((digit_t*)p503)[i] & mask, borrow, a[i]); + } +} + + +void digit_x_digit(const digit_t a, const digit_t b, digit_t* c) +{ // Digit multiplication, digit * digit -> 2-digit result + register digit_t al, ah, bl, bh, temp; + digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; + digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); + + al = a & mask_low; // Low part + ah = a >> (sizeof(digit_t) * 4); // High part + bl = b & mask_low; + bh = b >> (sizeof(digit_t) * 4); + + albl = al*bl; + albh = al*bh; + ahbl = ah*bl; + ahbh = ah*bh; + c[0] = albl & mask_low; // C00 + + res1 = albl >> (sizeof(digit_t) * 4); + res2 = ahbl & mask_low; + res3 = albh & mask_low; + temp = res1 + res2 + res3; + carry = temp >> (sizeof(digit_t) * 4); + c[0] ^= temp << (sizeof(digit_t) * 4); // C01 + + res1 = ahbl >> (sizeof(digit_t) * 4); + res2 = albh >> (sizeof(digit_t) * 4); + res3 = ahbh & mask_low; + temp = res1 + res2 + res3 + carry; + c[1] = temp & mask_low; // C10 + carry = temp & mask_high; + c[1] ^= (ahbh & mask_high) + carry; // C11 +} + + +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. + unsigned int i, j; + digit_t t = 0, u = 0, v = 0, UV[2]; + unsigned int carry = 0; + + for (i = 0; i < nwords; i++) { + for (j = 0; j <= i; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = nwords; i < 2*nwords-1; i++) { + for (j = i-nwords+1; j < nwords; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + c[2*nwords-1] = v; +} + + +void rdc_mont(const digit_t* ma, digit_t* mc) +{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p503. + // mc = ma*R^-1 mod p503x2, where R = 2^512. + // If ma < 2^512*p503, the output mc is in the range [0, 2*p503-1]. + // ma is assumed to be in Montgomery representation. 
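+    // (Editor's note: p503+1 = 2^250*3^159, so its p503_ZERO_WORDS = 3 least
+    // significant 64-bit words are zero; the guards on j below skip exactly
+    // the column products mc[j]*(p503+1)[i-j] that would multiply by those
+    // zero words.)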
+ unsigned int i, j, carry, count = p503_ZERO_WORDS; + digit_t UV[2], t = 0, u = 0, v = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + mc[i] = 0; + } + + for (i = 0; i < NWORDS_FIELD; i++) { + for (j = 0; j < i; j++) { + if (j < (i-p503_ZERO_WORDS+1)) { + MUL(mc[j], ((digit_t*)p503p1)[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { + if (count > 0) { + count -= 1; + } + for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { + if (j < (NWORDS_FIELD-count)) { + MUL(mc[j], ((digit_t*)p503p1)[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i-NWORDS_FIELD] = v; + v = u; + u = t; + t = 0; + } + ADDC(0, v, ma[2*NWORDS_FIELD-1], carry, v); + mc[NWORDS_FIELD-1] = v; +} \ No newline at end of file diff --git a/third_party/sidh/src/P751/AMD64/fp_x64.c b/third_party/sidh/src/P751/AMD64/fp_x64.c new file mode 100644 index 00000000..63ff177d --- /dev/null +++ b/third_party/sidh/src/P751/AMD64/fp_x64.c @@ -0,0 +1,861 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: modular arithmetic optimized for x64 platforms for P751 +*********************************************************************************************/ + +#include "../P751_internal.h" + + +// Global constants +extern const uint64_t p751[NWORDS_FIELD]; +extern const uint64_t p751p1[NWORDS_FIELD]; +extern const uint64_t p751x2[NWORDS_FIELD]; + + +__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular addition, c = a+b mod p751. + // Inputs: a, b in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + +#if (OS_TARGET == OS_WIN) + unsigned int i, carry = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(carry, c[i], ((digit_t*)p751x2)[i], carry, c[i]); + } + mask = 0 - (digit_t)carry; + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, c[i], ((digit_t*)p751x2)[i] & mask, carry, c[i]); + } + +#elif (OS_TARGET == OS_LINUX) + + fpadd751_asm(a, b, c); + +#endif +} + + +__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular subtraction, c = a-b mod p751. + // Inputs: a, b in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + +#if (OS_TARGET == OS_WIN) + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, c[i], ((digit_t*)p751x2)[i] & mask, borrow, c[i]); + } + +#elif (OS_TARGET == OS_LINUX) + + fpsub751_asm(a, b, c); + +#endif +} + + +__inline void fpneg751(digit_t* a) +{ // Modular negation, a = -a mod p751. + // Input/output: a in [0, 2*p751-1] + unsigned int i, borrow = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, ((digit_t*)p751x2)[i], a[i], borrow, a[i]); + } +} + + +void fpdiv2_751(const digit_t* a, digit_t* c) +{ // Modular division by two, c = a/2 mod p751. 
+ // Input : a in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + unsigned int i, carry = 0; + digit_t mask; + + mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p751 + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], ((digit_t*)p751)[i] & mask, carry, c[i]); + } + + mp_shiftr1(c, NWORDS_FIELD); +} + + +void fpcorrection751(digit_t* a) +{ // Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], ((digit_t*)p751)[i], borrow, a[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, a[i], ((digit_t*)p751)[i] & mask, borrow, a[i]); + } +} + + +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords. + + UNREFERENCED_PARAMETER(nwords); + +#if (OS_TARGET == OS_WIN) + digit_t t = 0; + uint128_t uv = {0}; + unsigned int carry = 0; + + MULADD128(a[0], b[0], uv, carry, uv); + t += carry; + c[0] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[1], uv, carry, uv); + t += carry; + MULADD128(a[1], b[0], uv, carry, uv); + t += carry; + c[1] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[2], uv, carry, uv); + t += carry; + MULADD128(a[1], b[1], uv, carry, uv); + t += carry; + MULADD128(a[2], b[0], uv, carry, uv); + t += carry; + c[2] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[3], uv, carry, uv); + t += carry; + MULADD128(a[2], b[1], uv, carry, uv); + t += carry; + MULADD128(a[1], b[2], uv, carry, uv); + t += carry; + MULADD128(a[3], b[0], uv, carry, uv); + t += carry; + c[3] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[4], uv, carry, uv); + t += carry; + MULADD128(a[3], b[1], uv, carry, uv); + t += carry; + MULADD128(a[2], b[2], uv, carry, uv); + t += carry; + MULADD128(a[1], b[3], uv, carry, uv); + t += carry; + MULADD128(a[4], b[0], uv, carry, uv); + t += carry; + c[4] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[5], uv, carry, uv); + t += carry; + MULADD128(a[4], b[1], uv, carry, uv); + t += carry; + MULADD128(a[3], b[2], uv, carry, uv); + t += carry; + MULADD128(a[2], b[3], uv, carry, uv); + t += carry; + MULADD128(a[1], b[4], uv, carry, uv); + t += carry; + MULADD128(a[5], b[0], uv, carry, uv); + t += carry; + c[5] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[6], uv, carry, uv); + t += carry; + MULADD128(a[5], b[1], uv, carry, uv); + t += carry; + MULADD128(a[4], b[2], uv, carry, uv); + t += carry; + MULADD128(a[3], b[3], uv, carry, uv); + t += carry; + MULADD128(a[2], b[4], uv, carry, uv); + t += carry; + MULADD128(a[1], b[5], uv, carry, uv); + t += carry; + MULADD128(a[6], b[0], uv, carry, uv); + t += carry; + c[6] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[7], uv, carry, uv); + t += carry; + MULADD128(a[6], b[1], uv, carry, uv); + t += carry; + MULADD128(a[5], b[2], uv, carry, uv); + t += carry; + MULADD128(a[4], b[3], uv, carry, uv); + t += carry; + MULADD128(a[3], b[4], uv, carry, uv); + t += carry; + MULADD128(a[2], b[5], uv, carry, uv); + t += carry; + MULADD128(a[1], b[6], uv, carry, uv); + t += carry; + MULADD128(a[7], b[0], uv, carry, uv); + t += carry; + c[7] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[8], uv, carry, uv); + t += carry; + MULADD128(a[7], b[1], uv, carry, uv); + t += carry; + 
MULADD128(a[6], b[2], uv, carry, uv); + t += carry; + MULADD128(a[5], b[3], uv, carry, uv); + t += carry; + MULADD128(a[4], b[4], uv, carry, uv); + t += carry; + MULADD128(a[3], b[5], uv, carry, uv); + t += carry; + MULADD128(a[2], b[6], uv, carry, uv); + t += carry; + MULADD128(a[1], b[7], uv, carry, uv); + t += carry; + MULADD128(a[8], b[0], uv, carry, uv); + t += carry; + c[8] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[9], uv, carry, uv); + t += carry; + MULADD128(a[8], b[1], uv, carry, uv); + t += carry; + MULADD128(a[7], b[2], uv, carry, uv); + t += carry; + MULADD128(a[6], b[3], uv, carry, uv); + t += carry; + MULADD128(a[5], b[4], uv, carry, uv); + t += carry; + MULADD128(a[4], b[5], uv, carry, uv); + t += carry; + MULADD128(a[3], b[6], uv, carry, uv); + t += carry; + MULADD128(a[2], b[7], uv, carry, uv); + t += carry; + MULADD128(a[1], b[8], uv, carry, uv); + t += carry; + MULADD128(a[9], b[0], uv, carry, uv); + t += carry; + c[9] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[10], uv, carry, uv); + t += carry; + MULADD128(a[9], b[1], uv, carry, uv); + t += carry; + MULADD128(a[8], b[2], uv, carry, uv); + t += carry; + MULADD128(a[7], b[3], uv, carry, uv); + t += carry; + MULADD128(a[6], b[4], uv, carry, uv); + t += carry; + MULADD128(a[5], b[5], uv, carry, uv); + t += carry; + MULADD128(a[4], b[6], uv, carry, uv); + t += carry; + MULADD128(a[3], b[7], uv, carry, uv); + t += carry; + MULADD128(a[2], b[8], uv, carry, uv); + t += carry; + MULADD128(a[1], b[9], uv, carry, uv); + t += carry; + MULADD128(a[10], b[0], uv, carry, uv); + t += carry; + c[10] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[0], b[11], uv, carry, uv); + t += carry; + MULADD128(a[10], b[1], uv, carry, uv); + t += carry; + MULADD128(a[9], b[2], uv, carry, uv); + t += carry; + MULADD128(a[8], b[3], uv, carry, uv); + t += carry; + MULADD128(a[7], b[4], uv, carry, uv); + t += carry; + MULADD128(a[6], b[5], uv, carry, uv); + t += carry; + MULADD128(a[5], b[6], uv, carry, uv); + t += carry; + MULADD128(a[4], b[7], uv, carry, uv); + t += carry; + MULADD128(a[3], b[8], uv, carry, uv); + t += carry; + MULADD128(a[2], b[9], uv, carry, uv); + t += carry; + MULADD128(a[1], b[10], uv, carry, uv); + t += carry; + MULADD128(a[11], b[0], uv, carry, uv); + t += carry; + c[11] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[1], b[11], uv, carry, uv); + t += carry; + MULADD128(a[10], b[2], uv, carry, uv); + t += carry; + MULADD128(a[9], b[3], uv, carry, uv); + t += carry; + MULADD128(a[8], b[4], uv, carry, uv); + t += carry; + MULADD128(a[7], b[5], uv, carry, uv); + t += carry; + MULADD128(a[6], b[6], uv, carry, uv); + t += carry; + MULADD128(a[5], b[7], uv, carry, uv); + t += carry; + MULADD128(a[4], b[8], uv, carry, uv); + t += carry; + MULADD128(a[3], b[9], uv, carry, uv); + t += carry; + MULADD128(a[2], b[10], uv, carry, uv); + t += carry; + MULADD128(a[11], b[1], uv, carry, uv); + t += carry; + c[12] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[2], uv, carry, uv); + t += carry; + MULADD128(a[10], b[3], uv, carry, uv); + t += carry; + MULADD128(a[9], b[4], uv, carry, uv); + t += carry; + MULADD128(a[8], b[5], uv, carry, uv); + t += carry; + MULADD128(a[7], b[6], uv, carry, uv); + t += carry; + MULADD128(a[6], b[7], uv, carry, uv); + t += carry; + MULADD128(a[5], b[8], uv, carry, uv); + t += carry; + MULADD128(a[4], b[9], uv, carry, uv); + t += carry; + MULADD128(a[3], b[10], uv, carry, uv); + t += carry; + 
MULADD128(a[2], b[11], uv, carry, uv); + t += carry; + c[13] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[3], uv, carry, uv); + t += carry; + MULADD128(a[10], b[4], uv, carry, uv); + t += carry; + MULADD128(a[9], b[5], uv, carry, uv); + t += carry; + MULADD128(a[8], b[6], uv, carry, uv); + t += carry; + MULADD128(a[7], b[7], uv, carry, uv); + t += carry; + MULADD128(a[6], b[8], uv, carry, uv); + t += carry; + MULADD128(a[5], b[9], uv, carry, uv); + t += carry; + MULADD128(a[4], b[10], uv, carry, uv); + t += carry; + MULADD128(a[3], b[11], uv, carry, uv); + t += carry; + c[14] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[4], uv, carry, uv); + t += carry; + MULADD128(a[10], b[5], uv, carry, uv); + t += carry; + MULADD128(a[9], b[6], uv, carry, uv); + t += carry; + MULADD128(a[8], b[7], uv, carry, uv); + t += carry; + MULADD128(a[7], b[8], uv, carry, uv); + t += carry; + MULADD128(a[6], b[9], uv, carry, uv); + t += carry; + MULADD128(a[5], b[10], uv, carry, uv); + t += carry; + MULADD128(a[4], b[11], uv, carry, uv); + t += carry; + c[15] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[5], uv, carry, uv); + t += carry; + MULADD128(a[10], b[6], uv, carry, uv); + t += carry; + MULADD128(a[9], b[7], uv, carry, uv); + t += carry; + MULADD128(a[8], b[8], uv, carry, uv); + t += carry; + MULADD128(a[7], b[9], uv, carry, uv); + t += carry; + MULADD128(a[6], b[10], uv, carry, uv); + t += carry; + MULADD128(a[5], b[11], uv, carry, uv); + t += carry; + c[16] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[6], uv, carry, uv); + t += carry; + MULADD128(a[10], b[7], uv, carry, uv); + t += carry; + MULADD128(a[9], b[8], uv, carry, uv); + t += carry; + MULADD128(a[8], b[9], uv, carry, uv); + t += carry; + MULADD128(a[7], b[10], uv, carry, uv); + t += carry; + MULADD128(a[6], b[11], uv, carry, uv); + t += carry; + c[17] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[7], uv, carry, uv); + t += carry; + MULADD128(a[10], b[8], uv, carry, uv); + t += carry; + MULADD128(a[9], b[9], uv, carry, uv); + t += carry; + MULADD128(a[8], b[10], uv, carry, uv); + t += carry; + MULADD128(a[7], b[11], uv, carry, uv); + t += carry; + c[18] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[8], uv, carry, uv); + t += carry; + MULADD128(a[10], b[9], uv, carry, uv); + t += carry; + MULADD128(a[9], b[10], uv, carry, uv); + t += carry; + MULADD128(a[8], b[11], uv, carry, uv); + t += carry; + c[19] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[9], uv, carry, uv); + t += carry; + MULADD128(a[10], b[10], uv, carry, uv); + t += carry; + MULADD128(a[9], b[11], uv, carry, uv); + t += carry; + c[20] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(a[11], b[10], uv, carry, uv); + t += carry; + MULADD128(a[10], b[11], uv, carry, uv); + t += carry; + c[21] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + + MULADD128(a[11], b[11], uv, carry, uv); + c[22] = uv[0]; + c[23] = uv[1]; + +#elif (OS_TARGET == OS_LINUX) + + mul751_asm(a, b, c); + +#endif +} + + +void rdc_mont(const digit_t* ma, digit_t* mc) +{ // Montgomery reduction exploiting special form of the prime. + // mc = ma*R^-1 mod p751x2, where R = 2^768. + // If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1]. + // ma is assumed to be in Montgomery representation. 
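+ //
+ // Note: p751+1 = 2^372*3^239, so the five low 64-bit words of p751p1 are
+ // zero; ma[0..4] is copied through unchanged and only p751p1[5..11] is
+ // multiplied, as in the p503 reduction above.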
+ +#if (OS_TARGET == OS_WIN) + unsigned int carry; + digit_t t = 0; + uint128_t uv = {0}; + + mc[0] = ma[0]; + mc[1] = ma[1]; + mc[2] = ma[2]; + mc[3] = ma[3]; + mc[4] = ma[4]; + MUL128(mc[0], ((digit_t*)p751p1)[5], uv); + ADDC(0, uv[0], ma[5], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + mc[5] = uv[0]; + uv[0] = uv[1]; + uv[1] = 0; + + MULADD128(mc[0], ((digit_t*)p751p1)[6], uv, carry, uv); + MULADD128(mc[1], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[6], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[6] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[7], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[7] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[8], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[8] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[9], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[9] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[10], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[10] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[0], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[1], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[11], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[11] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[1], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[2], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + 
MULADD128(mc[6], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[12], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[0] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[2], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[3], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[8], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[13], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[1] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[3], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[4], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[8], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[9], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[14], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[2] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[4], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[5], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[8], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[9], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[10], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[15], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[3] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[5], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[6], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[8], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[9], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[10], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + MULADD128(mc[11], ((digit_t*)p751p1)[5], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[16], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[4] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[6], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[7], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[8], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[9], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[10], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + MULADD128(mc[11], ((digit_t*)p751p1)[6], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[17], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[5] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[7], ((digit_t*)p751p1)[11], 
uv, carry, uv); + t += carry; + MULADD128(mc[8], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[9], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[10], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + MULADD128(mc[11], ((digit_t*)p751p1)[7], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[18], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[6] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[8], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[9], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[10], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + MULADD128(mc[11], ((digit_t*)p751p1)[8], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[19], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[7] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[9], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[10], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + MULADD128(mc[11], ((digit_t*)p751p1)[9], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[20], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[8] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[10], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + MULADD128(mc[11], ((digit_t*)p751p1)[10], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[21], carry, uv[0]); + ADDC(carry, uv[1], 0, carry, uv[1]); + t += carry; + mc[9] = uv[0]; + uv[0] = uv[1]; + uv[1] = t; + t = 0; + + MULADD128(mc[11], ((digit_t*)p751p1)[11], uv, carry, uv); + t += carry; + ADDC(0, uv[0], ma[22], carry, mc[10]); + ADDC(carry, uv[1], 0, carry, uv[1]); + ADDC(0, uv[1], ma[23], carry, mc[11]); + +#elif (OS_TARGET == OS_LINUX) + + rdc751_asm(ma, mc); + +#endif +} diff --git a/third_party/sidh/src/P751/AMD64/fp_x64_asm.S b/third_party/sidh/src/P751/AMD64/fp_x64_asm.S new file mode 100644 index 00000000..b76c415d --- /dev/null +++ b/third_party/sidh/src/P751/AMD64/fp_x64_asm.S @@ -0,0 +1,3009 @@ +//******************************************************************************************* +// SIDH: an efficient supersingular isogeny cryptography library +// +// Abstract: field arithmetic in x64 assembly for P751 on Linux +//******************************************************************************************* + +.intel_syntax noprefix + +// Registers that are used for parameter passing: +#define reg_p1 rdi +#define reg_p2 rsi +#define reg_p3 rdx + +// p751 + 1 +#define p751p1_5 0xEEB0000000000000 +#define p751p1_6 0xE3EC968549F878A8 +#define p751p1_7 0xDA959B1A13F7CC76 +#define p751p1_8 0x084E9867D6EBE876 +#define p751p1_9 0x8562B5045CB25748 +#define p751p1_10 0x0E12909F97BADC66 +#define p751p1_11 0x00006FE5D541F71C +// p751 x 2 +#define p751x2_0 0xFFFFFFFFFFFFFFFE +#define p751x2_1 0xFFFFFFFFFFFFFFFF +#define p751x2_5 0xDD5FFFFFFFFFFFFF +#define p751x2_6 0xC7D92D0A93F0F151 +#define p751x2_7 0xB52B363427EF98ED +#define p751x2_8 0x109D30CFADD7D0ED +#define p751x2_9 0x0AC56A08B964AE90 +#define p751x2_10 0x1C25213F2F75B8CD +#define p751x2_11 0x0000DFCBAA83EE38 + +p751p1_nz: +.quad 0xEEB0000000000000 +.quad 0xE3EC968549F878A8 +.quad 0xDA959B1A13F7CC76 +.quad 0x084E9867D6EBE876 +.quad 0x8562B5045CB25748 +.quad 0x0E12909F97BADC66 +.quad 0x00006FE5D541F71C + + +.text +//*********************************************************************** +// Field addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] 
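+// Inputs: a, b in [0, 2*p751-1]
+// Output: c in [0, 2*p751-1]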
+//*********************************************************************** +.global fpadd751_asm +fpadd751_asm: + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rcx, [reg_p1+64] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + adc r14, [reg_p2+48] + adc r15, [reg_p2+56] + adc rcx, [reg_p2+64] + mov rax, [reg_p1+72] + adc rax, [reg_p2+72] + mov [reg_p3+72], rax + mov rax, [reg_p1+80] + adc rax, [reg_p2+80] + mov [reg_p3+80], rax + mov rax, [reg_p1+88] + adc rax, [reg_p2+88] + mov [reg_p3+88], rax + + movq rax, p751x2_0 + sub r8, rax + movq rax, p751x2_1 + sbb r9, rax + sbb r10, rax + sbb r11, rax + sbb r12, rax + movq rax, p751x2_5 + sbb r13, rax + movq rax, p751x2_6 + sbb r14, rax + movq rax, p751x2_7 + sbb r15, rax + movq rax, p751x2_8 + sbb rcx, rax + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rcx + mov r8, [reg_p3+72] + mov r9, [reg_p3+80] + mov r10, [reg_p3+88] + movq rax, p751x2_9 + sbb r8, rax + movq rax, p751x2_10 + sbb r9, rax + movq rax, p751x2_11 + sbb r10, rax + mov [reg_p3+72], r8 + mov [reg_p3+80], r9 + mov [reg_p3+88], r10 + movq rax, 0 + sbb rax, 0 + + mov rsi, p751x2_0 + and rsi, rax + mov r8, p751x2_1 + and r8, rax + movq r9, p751x2_5 + and r9, rax + movq r10, p751x2_6 + and r10, rax + movq r11, p751x2_7 + and r11, rax + movq r12, p751x2_8 + and r12, rax + movq r13, p751x2_9 + and r13, rax + movq r14, p751x2_10 + and r14, rax + movq r15, p751x2_11 + and r15, rax + + add rsi, [reg_p3] + mov [reg_p3], rsi + mov rax, [reg_p3+8] + adc rax, r8 + mov [reg_p3+8], rax + mov rax, [reg_p3+16] + adc rax, r8 + mov [reg_p3+16], rax + mov rax, [reg_p3+24] + adc rax, r8 + mov [reg_p3+24], rax + mov rax, [reg_p3+32] + adc rax, r8 + mov [reg_p3+32], rax + adc r9, [reg_p3+40] + adc r10, [reg_p3+48] + adc r11, [reg_p3+56] + adc r12, [reg_p3+64] + adc r13, [reg_p3+72] + adc r14, [reg_p3+80] + adc r15, [reg_p3+88] + mov [reg_p3+40], r9 + mov [reg_p3+48], r10 + mov [reg_p3+56], r11 + mov [reg_p3+64], r12 + mov [reg_p3+72], r13 + mov [reg_p3+80], r14 + mov [reg_p3+88], r15 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +//*********************************************************************** +// Field subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global fpsub751_asm +fpsub751_asm: + push r12 + push r13 + push r14 + push r15 + + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov r13, [reg_p1+40] + mov r14, [reg_p1+48] + mov r15, [reg_p1+56] + mov rcx, [reg_p1+64] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + sbb rcx, [reg_p2+64] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + mov [reg_p3+64], rcx + mov rax, [reg_p1+72] + sbb rax, [reg_p2+72] + mov [reg_p3+72], rax + mov rax, [reg_p1+80] + sbb rax, [reg_p2+80] + mov [reg_p3+80], rax + mov rax, [reg_p1+88] + sbb rax, 
[reg_p2+88] + mov [reg_p3+88], rax + movq rax, 0 + sbb rax, 0 + + mov rsi, p751x2_0 + and rsi, rax + mov r8, p751x2_1 + and r8, rax + movq r9, p751x2_5 + and r9, rax + movq r10, p751x2_6 + and r10, rax + movq r11, p751x2_7 + and r11, rax + movq r12, p751x2_8 + and r12, rax + movq r13, p751x2_9 + and r13, rax + movq r14, p751x2_10 + and r14, rax + movq r15, p751x2_11 + and r15, rax + + mov rax, [reg_p3] + add rax, rsi + mov [reg_p3], rax + mov rax, [reg_p3+8] + adc rax, r8 + mov [reg_p3+8], rax + mov rax, [reg_p3+16] + adc rax, r8 + mov [reg_p3+16], rax + mov rax, [reg_p3+24] + adc rax, r8 + mov [reg_p3+24], rax + mov rax, [reg_p3+32] + adc rax, r8 + mov [reg_p3+32], rax + adc r9, [reg_p3+40] + adc r10, [reg_p3+48] + adc r11, [reg_p3+56] + adc r12, [reg_p3+64] + adc r13, [reg_p3+72] + adc r14, [reg_p3+80] + adc r15, [reg_p3+88] + mov [reg_p3+40], r9 + mov [reg_p3+48], r10 + mov [reg_p3+56], r11 + mov [reg_p3+64], r12 + mov [reg_p3+72], r13 + mov [reg_p3+80], r14 + mov [reg_p3+88], r15 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + +#ifdef _MULX_ + +/////////////////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: memory pointer C +// Temps: stack space for two 64-bit values (case w/o _ADX_), regs T0:T7 +/////////////////////////////////////////////////////////////////////////// +#ifdef _ADX_ + +.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 + mov rdx, \M0 + mulx \T0, \T1, \M1 + mulx \T2, \T3, 8\M1 + mov \C, \T1 // C0_final + xor rax, rax + mulx \T4, \T5, 16\M1 + adox \T0, \T3 + adox \T2, \T5 + mulx \T1, \T3, 24\M1 + adox \T4, \T3 + mulx \T5, \T6, 32\M1 + adox \T1, \T6 + mulx \T3, \T7, 40\M1 + adox \T5, \T7 + adox \T3, rax + + mov rdx, 8\M0 + mulx \T6, \T7, \M1 + xor rax, rax + adcx \T0, \T7 + mov 8\C, \T0 // C1_final + adcx \T2, \T6 + mulx \T6, \T7, 8\M1 + adox \T2, \T7 + adcx \T4, \T6 + mulx \T0, \T6, 16\M1 + adox \T4, \T6 + adcx \T0, \T1 + mulx \T1, \T7, 24\M1 + adcx \T1, \T5 + mulx \T5, \T6, 32\M1 + adcx \T3, \T5 + mulx \T5, rdx, 40\M1 + adcx \T5, rax + + adox \T0, \T7 + adox \T1, \T6 + adox \T3, rdx + adox \T5, rax + + mov rdx, 16\M0 + mulx \T6, \T7, \M1 + xor rax, rax + adcx \T2, \T7 + mov 16\C, \T2 // C2_final + adcx \T4, \T6 + mulx \T6, \T7, 8\M1 + adox \T4, \T7 + adcx \T0, \T6 + mulx \T2, \T6, 16\M1 + adox \T0, \T6 + adcx \T1, \T2 + mulx \T2, \T7, 24\M1 + adcx \T3, \T2 + mulx \T2, \T6, 32\M1 + adcx \T5, \T2 + mulx \T2, rdx, 40\M1 + adcx \T2, rax + + adox \T1, \T7 + adox \T3, \T6 + adox \T5, rdx + adox \T2, rax + + mov rdx, 24\M0 + mulx \T6, \T7, \M1 + xor rax, rax + adcx \T4, \T7 + mov 24\C, \T4 // C3_final + adcx \T0, \T6 + mulx \T6, \T7, 8\M1 + adox \T0, \T7 + adcx \T1, \T6 + mulx \T4, \T6, 16\M1 + adox \T1, \T6 + adcx \T3, \T4 + mulx \T4, \T7, 24\M1 + adcx \T5, \T4 + mulx \T4, \T6, 32\M1 + adcx \T2, \T4 + mulx \T4, rdx, 40\M1 + adcx \T4, rax + + adox \T3, \T7 + adox \T5, \T6 + adox \T2, rdx + adox \T4, rax + + mov rdx, 32\M0 + mulx \T6, \T7, \M1 + xor rax, rax + adcx \T0, \T7 + mov 32\C, \T0 // C4_final + adcx \T1, \T6 + mulx \T6, \T7, 8\M1 + adox \T1, \T7 + adcx \T3, \T6 + mulx \T0, \T6, 16\M1 + adox \T3, \T6 + adcx \T5, \T0 + mulx \T0, \T7, 24\M1 + adcx \T2, \T0 + mulx \T0, \T6, 32\M1 + adcx \T4, \T0 + mulx \T0, rdx, 40\M1 + adcx \T0, rax + + adox \T5, \T7 + adox \T2, \T6 + adox \T4, rdx + adox \T0, rax + + mov rdx, 40\M0 + mulx \T6, \T7, \M1 + xor rax, rax + adcx \T1, \T7 + mov 40\C, \T1 // C5_final + adcx \T3, \T6 + mulx \T6, \T7, 8\M1 + adox \T3, 
\T7 + adcx \T5, \T6 + mulx \T1, \T6, 16\M1 + adox \T5, \T6 + adcx \T2, \T1 + mulx \T1, \T7, 24\M1 + adcx \T4, \T1 + mulx \T1, \T6, 32\M1 + adcx \T0, \T1 + mulx \T1, rdx, 40\M1 + adcx \T1, rax + + adox \T2, \T7 + adox \T4, \T6 + adox \T0, rdx + adox \T1, rax + mov 48\C, \T3 + mov 56\C, \T5 + mov 64\C, \T2 + mov 72\C, \T4 + mov 80\C, \T0 + mov 88\C, \T1 +.endm + +#else + +.macro MUL384_SCHOOL M0, M1, C, S, T0, T1, T2, T3, T4, T5, T6, T7 + mov rdx, \M0 + mulx \T0, \T1, \M1 + mulx \T2, \T3, 8\M1 + mov \C, \T1 // C0_final + xor rax, rax + mulx \T4, \T5, 16\M1 + add \T0, \T3 + adc \T2, \T5 + mulx \T1, \T3, 24\M1 + adc \T4, \T3 + mulx \T5, \T6, 32\M1 + adc \T1, \T6 + mulx \T3, \T7, 40\M1 + adc \T5, \T7 + adc \T3, rax + + mov rdx, 8\M0 + mulx \T6, \T7, \M1 + add \T0, \T7 + mov 8\C, \T0 // C1_final + adc \T2, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T4, \T6 + mulx \T0, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T0, \T1 + mulx \T1, \T7, 24\M1 + adc \T1, \T5 + mulx \T5, \T6, 32\M1 + adc \T3, \T5 + mulx \T5, rdx, 40\M1 + adc \T5, rax + + xor rax, rax + add \T2, \S + adc \T4, 8\S + adc \T0, \T7 + adc \T1, \T6 + adc \T3, rdx + adc \T5, rax + + mov rdx, 16\M0 + mulx \T6, \T7, \M1 + add \T2, \T7 + mov 16\C, \T2 // C2_final + adc \T4, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T0, \T6 + mulx \T2, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T1, \T2 + mulx \T2, \T7, 24\M1 + adc \T3, \T2 + mulx \T2, \T6, 32\M1 + adc \T5, \T2 + mulx \T2, rdx, 40\M1 + adc \T2, rax + + xor rax, rax + add \T4, \S + adc \T0, 8\S + adc \T1, \T7 + adc \T3, \T6 + adc \T5, rdx + adc \T2, rax + + mov rdx, 24\M0 + mulx \T6, \T7, \M1 + add \T4, \T7 + mov 24\C, \T4 // C3_final + adc \T0, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T1, \T6 + mulx \T4, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T3, \T4 + mulx \T4, \T7, 24\M1 + adc \T5, \T4 + mulx \T4, \T6, 32\M1 + adc \T2, \T4 + mulx \T4, rdx, 40\M1 + adc \T4, rax + + xor rax, rax + add \T0, \S + adc \T1, 8\S + adc \T3, \T7 + adc \T5, \T6 + adc \T2, rdx + adc \T4, rax + + mov rdx, 32\M0 + mulx \T6, \T7, \M1 + add \T0, \T7 + mov 32\C, \T0 // C4_final + adc \T1, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T3, \T6 + mulx \T0, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T5, \T0 + mulx \T0, \T7, 24\M1 + adc \T2, \T0 + mulx \T0, \T6, 32\M1 + adc \T4, \T0 + mulx \T0, rdx, 40\M1 + adc \T0, rax + + xor rax, rax + add \T1, \S + adc \T3, 8\S + adc \T5, \T7 + adc \T2, \T6 + adc \T4, rdx + adc \T0, rax + + mov rdx, 40\M0 + mulx \T6, \T7, \M1 + add \T1, \T7 + mov 40\C, \T1 // C5_final + adc \T3, \T6 + mulx \T6, \T7, 8\M1 + mov \S, \T7 // store T7 + adc \T5, \T6 + mulx \T1, \T6, 16\M1 + mov 8\S, \T6 // store T6 + adc \T2, \T1 + mulx \T1, \T7, 24\M1 + adc \T4, \T1 + mulx \T1, \T6, 32\M1 + adc \T0, \T1 + mulx \T1, rdx, 40\M1 + adc \T1, rax + + add \T3, \S + adc \T5, 8\S + adc \T2, \T7 + adc \T4, \T6 + adc \T0, rdx + adc \T1, 0 + mov 48\C, \T3 + mov 56\C, \T5 + mov 64\C, \T2 + mov 72\C, \T4 + mov 80\C, \T0 + mov 88\C, \T1 +.endm + +#endif + + +//***************************************************************************** +// 751-bit multiplication using Karatsuba (one level), schoolbook (two levels) +//***************************************************************************** +.global mul751_asm +mul751_asm: + push r12 + push r13 + push r14 + push r15 + mov rcx, reg_p3 + + // [rsp] <- AH + AL, rax <- mask + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov r12, [reg_p1+32] + mov 
r13, [reg_p1+40] + push rbx + push rbp + sub rsp, 152 + add r8, [reg_p1+48] + adc r9, [reg_p1+56] + adc r10, [reg_p1+64] + adc r11, [reg_p1+72] + adc r12, [reg_p1+80] + adc r13, [reg_p1+88] + sbb rax, 0 + mov [rsp], r8 + mov [rsp+8], r9 + mov [rsp+16], r10 + mov [rsp+24], r11 + mov [rsp+32], r12 + mov [rsp+40], r13 + + // [rsp+48] <- BH + BL, rdx <- mask + xor rdx, rdx + mov r8, [reg_p2] + mov r9, [reg_p2+8] + mov rbx, [reg_p2+16] + mov rbp, [reg_p2+24] + mov r14, [reg_p2+32] + mov r15, [reg_p2+40] + add r8, [reg_p2+48] + adc r9, [reg_p2+56] + adc rbx, [reg_p2+64] + adc rbp, [reg_p2+72] + adc r14, [reg_p2+80] + adc r15, [reg_p2+88] + sbb rdx, 0 + mov [rsp+48], r8 + mov [rsp+56], r9 + mov [rsp+64], rbx + mov [rsp+72], rbp + mov [rsp+80], r14 + mov [rsp+88], r15 + + // [rcx] <- masked (BH + BL) + and r8, rax + and r9, rax + and rbx, rax + and rbp, rax + and r14, rax + and r15, rax + mov [rcx], r8 + mov [rcx+8], r9 + mov [rcx+16], rbx ///// + mov [rcx+24], rbp ///// + + // r8-r13 <- masked (AH + AL) + mov r8, [rsp] + mov r9, [rsp+8] + and r8, rdx + and r9, rdx + and r10, rdx + and r11, rdx + and r12, rdx + and r13, rdx + + // [rsp+96] <- masked (AH + AL) + masked (AH + AL) + mov rax, [rcx] + mov rdx, [rcx+8] + add r8, rax + adc r9, rdx + adc r10, rbx + adc r11, rbp + adc r12, r14 + adc r13, r15 + mov [rsp+96], r8 + mov [rsp+104], r9 + mov [rsp+112], r10 + mov [rsp+120], r11 + + // [rcx] <- AL x BL + MUL384_SCHOOL [reg_p1], [reg_p2], [rcx], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 // Result C0-C5 + + // [rcx+96] <- (AH+AL) x (BH+BL), low part + MUL384_SCHOOL [rsp], [rsp+48], [rcx+96], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 + + // [rsp] <- AH x BH + MUL384_SCHOOL [reg_p1+48], [reg_p2+48], [rsp], [rsp+128], r8, r9, r10, r11, rbx, rbp, r14, r15 + + // r8-r13 <- (AH+AL) x (BH+BL), final step + mov r8, [rsp+96] + mov r9, [rsp+104] + mov r10, [rsp+112] + mov r11, [rsp+120] + mov rax, [rcx+144] + add r8, rax + mov rax, [rcx+152] + adc r9, rax + mov rax, [rcx+160] + adc r10, rax + mov rax, [rcx+168] + adc r11, rax + mov rax, [rcx+176] + adc r12, rax + mov rax, [rcx+184] + adc r13, rax + + // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL + mov rdi, [rcx+96] + sub rdi, [rcx] + mov rdx, [rcx+104] + sbb rdx, [rcx+8] + mov rbx, [rcx+112] + sbb rbx, [rcx+16] + mov rbp, [rcx+120] + sbb rbp, [rcx+24] + mov r14, [rcx+128] + sbb r14, [rcx+32] + mov r15, [rcx+136] + sbb r15, [rcx+40] + sbb r8, [rcx+48] + sbb r9, [rcx+56] + sbb r10, [rcx+64] + sbb r11, [rcx+72] + sbb r12, [rcx+80] + sbb r13, [rcx+88] + + // rdi,rdx,rbx,rbp,r14,r15,r8-r13 <- (AH+AL) x (BH+BL) - ALxBL - AHxBH + sub rdi, [rsp] + sbb rdx, [rsp+8] + sbb rbx, [rsp+16] + sbb rbp, [rsp+24] + sbb r14, [rsp+32] + sbb r15, [rsp+40] + sbb r8, [rsp+48] + sbb r9, [rsp+56] + sbb r10, [rsp+64] + sbb r11, [rsp+72] + sbb r12, [rsp+80] + sbb r13, [rsp+88] + + mov rax, [rcx+48] + add rax, rdi + mov [rcx+48], rax // Result C6-C11 + mov rax, [rcx+56] + adc rax, rdx + mov [rcx+56], rax + mov rax, [rcx+64] + adc rax, rbx + mov [rcx+64], rax + mov rax, [rcx+72] + adc rax, rbp + mov [rcx+72], rax + mov rax, [rcx+80] + adc rax, r14 + mov [rcx+80], rax + mov rax, [rcx+88] + adc rax, r15 + mov [rcx+88], rax + mov rax, [rsp] + adc r8, rax + mov [rcx+96], r8 // Result C8-C15 + mov rax, [rsp+8] + adc r9, rax + mov [rcx+104], r9 + mov rax, [rsp+16] + adc r10, rax + mov [rcx+112], r10 + mov rax, [rsp+24] + adc r11, rax + mov [rcx+120], r11 + mov rax, [rsp+32] + adc r12, rax + mov [rcx+128], r12 + mov rax, [rsp+40] + adc r13, rax + mov [rcx+136], r13 
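+ // r8-r13 <- high half of AH x BH; absorb the final carry and store the
+ // top six words of the product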
+ mov r8, [rsp+48] + mov r9, [rsp+56] + mov r10, [rsp+64] + mov r11, [rsp+72] + mov r12, [rsp+80] + mov r13, [rsp+88] + adc r8, 0 + adc r9, 0 + adc r10, 0 + adc r11, 0 + adc r12, 0 + adc r13, 0 + add rsp, 152 + mov [rcx+144], r8 + mov [rcx+152], r9 + mov [rcx+160], r10 + mov [rcx+168], r11 + mov [rcx+176], r12 + mov [rcx+184], r13 + + pop rbp + pop rbx + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#else + +//*********************************************************************** +// Integer multiplication +// Based on Karatsuba method +// Operation: c [reg_p3] = a [reg_p1] * b [reg_p2] +// NOTE: a=c or b=c are not allowed +//*********************************************************************** +.global mul751_asm +mul751_asm: + push r12 + push r13 + push r14 + mov rcx, reg_p3 + + // rcx[0-5] <- AH+AL + xor rax, rax + mov r8, [reg_p1+48] + mov r9, [reg_p1+56] + mov r10, [reg_p1+64] + mov r11, [reg_p1+72] + mov r12, [reg_p1+80] + mov r13, [reg_p1+88] + add r8, [reg_p1] + adc r9, [reg_p1+8] + adc r10, [reg_p1+16] + adc r11, [reg_p1+24] + adc r12, [reg_p1+32] + adc r13, [reg_p1+40] + push r15 + mov [rcx], r8 + mov [rcx+8], r9 + mov [rcx+16], r10 + mov [rcx+24], r11 + mov [rcx+32], r12 + mov [rcx+40], r13 + sbb rax, 0 + sub rsp, 96 // Allocating space in stack + + // rcx[6-11] <- BH+BL + xor rdx, rdx + mov r8, [reg_p2+48] + mov r9, [reg_p2+56] + mov r10, [reg_p2+64] + mov r11, [reg_p2+72] + mov r12, [reg_p2+80] + mov r13, [reg_p2+88] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc r12, [reg_p2+32] + adc r13, [reg_p2+40] + mov [rcx+48], r8 + mov [rcx+56], r9 + mov [rcx+64], r10 + mov [rcx+72], r11 + mov [rcx+80], r12 + mov [rcx+88], r13 + sbb rdx, 0 + mov [rsp+80], rax + mov [rsp+88], rdx + + // (rsp[0-8],r10,r8,r9) <- (AH+AL)*(BH+BL) + mov r11, [rcx] + mov rax, r8 + mul r11 + mov [rsp], rax // c0 + mov r14, rdx + + xor r15, r15 + mov rax, r9 + mul r11 + xor r9, r9 + add r14, rax + adc r9, rdx + + mov r12, [rcx+8] + mov rax, r8 + mul r12 + add r14, rax + mov [rsp+8], r14 // c1 + adc r9, rdx + adc r15, 0 + + xor r8, r8 + mov rax, r10 + mul r11 + add r9, rax + mov r13, [rcx+48] + adc r15, rdx + adc r8, 0 + + mov rax, [rcx+16] + mul r13 + add r9, rax + adc r15, rdx + mov rax, [rcx+56] + adc r8, 0 + + mul r12 + add r9, rax + mov [rsp+16], r9 // c2 + adc r15, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx+72] + mul r11 + add r15, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+24] + mul r13 + add r15, rax + adc r8, rdx + adc r9, 0 + + mov rax, r10 + mul r12 + add r15, rax + adc r8, rdx + adc r9, 0 + + mov r14, [rcx+16] + mov rax, [rcx+56] + mul r14 + add r15, rax + mov [rsp+24], r15 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [rcx+80] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+64] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [rcx+48] + mov rax, [rcx+32] + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+72] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r13, [rcx+24] + mov rax, [rcx+56] + mul r13 + add r8, rax + mov [rsp+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [rcx+88] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+64] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+72] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+40] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+80] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r15, [rcx+32] + mov 
rax, [rcx+56] + mul r15 + add r9, rax + mov [rsp+40], r9 // c5 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx+64] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+88] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+80] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r11, [rcx+40] + mov rax, [rcx+56] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+72] + mul r13 + add r10, rax + mov [rsp+48], r10 // c6 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [rcx+88] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+64] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+80] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [rcx+72] + mul r15 + add r8, rax + mov [rsp+56], r8 // c7 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [rcx+72] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+80] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [rcx+88] + mul r13 + add r9, rax + mov [rsp+64], r9 // c8 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [rcx+88] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+80] + mul r11 + add r10, rax // c9 + adc r8, rdx + adc r9, 0 + + mov rax, [rcx+88] + mul r11 + add r8, rax // c10 + adc r9, rdx // c11 + + mov rax, [rsp+88] + mov rdx, [rcx] + and r12, rax + and r14, rax + and rdx, rax + and r13, rax + and r15, rax + and r11, rax + mov rax, [rsp+48] + add rdx, rax + mov rax, [rsp+56] + adc r12, rax + mov rax, [rsp+64] + adc r14, rax + adc r13, r10 + adc r15, r8 + adc r11, r9 + mov rax, [rsp+80] + mov [rsp+48], rdx + mov [rsp+56], r12 + mov [rsp+64], r14 + mov [rsp+72], r13 + mov [rsp+80], r15 + mov [rsp+88], r11 + + mov r8, [rcx+48] + mov r9, [rcx+56] + mov r10, [rcx+64] + mov r11, [rcx+72] + mov r12, [rcx+80] + mov r13, [rcx+88] + and r8, rax + and r9, rax + and r10, rax + and r11, rax + and r12, rax + and r13, rax + mov rax, [rsp+48] + add r8, rax + mov rax, [rsp+56] + adc r9, rax + mov rax, [rsp+64] + adc r10, rax + mov rax, [rsp+72] + adc r11, rax + mov rax, [rsp+80] + adc r12, rax + mov rax, [rsp+88] + adc r13, rax + mov [rsp+48], r8 + mov [rsp+56], r9 + mov [rsp+72], r11 + + // rcx[0-11] <- AL*BL + mov r11, [reg_p1] + mov rax, [reg_p2] + mul r11 + xor r9, r9 + mov [rcx], rax // c0 + mov [rsp+64], r10 + mov r8, rdx + + mov rax, [reg_p2+8] + mul r11 + xor r10, r10 + add r8, rax + mov [rsp+80], r12 + adc r9, rdx + + mov r12, [reg_p1+8] + mov rax, [reg_p2] + mul r12 + add r8, rax + mov [rcx+8], r8 // c1 + adc r9, rdx + mov [rsp+88], r13 + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+16] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2] + mov rax, [reg_p1+16] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+8] + mul r12 + add r9, rax + mov [rcx+16], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+24] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p1+24] + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+16] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p1+16] + mov rax, [reg_p2+8] + mul r14 + add r10, rax + mov [rcx+24], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+32] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+16] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p1+32] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+24] + mul r12 + add r8, 
rax + adc r9, rdx + adc r10, 0 + + mov r13, [reg_p1+24] + mov rax, [reg_p2+8] + mul r13 + add r8, rax + mov [rcx+32], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+40] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+16] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+24] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r11, [reg_p1+40] + mov rax, [reg_p2] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+32] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r15, [reg_p1+32] + mov rax, [reg_p2+8] + mul r15 + add r9, rax + mov [rcx+40], r9 // c5 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+16] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+40] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+32] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+8] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+24] + mul r13 + add r10, rax + mov [rcx+48], r10 // c6 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+40] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+16] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+32] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+24] + mul r15 + add r8, rax + mov [rcx+56], r8 // c7 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+24] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+32] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+40] + mul r13 + add r9, rax + mov [rcx+64], r9 // c8 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+40] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+32] + mul r11 + add r10, rax + mov [rcx+72], r10 // c9 + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+40] + mul r11 + add r8, rax + mov [rcx+80], r8 // c10 + adc r9, rdx + mov [rcx+88], r9 // c11 + + // rcx[12-23] <- AH*BH + mov r11, [reg_p1+48] + mov rax, [reg_p2+48] + mul r11 + xor r9, r9 + mov [rcx+96], rax // c0 + mov r8, rdx + + mov rax, [reg_p2+56] + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+56] + mov rax, [reg_p2+48] + mul r12 + add r8, rax + mov [rcx+104], r8 // c1 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+64] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2+48] + mov rax, [reg_p1+64] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+56] + mul r12 + add r9, rax + mov [rcx+112], r9 // c2 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+72] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p1+72] + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+64] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p1+64] + mov rax, [reg_p2+56] + mul r14 + add r10, rax + mov [rcx+120], r10 // c3 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+80] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+64] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p1+80] + mov rax, r13 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+72] + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r13, [reg_p1+72] + mov rax, [reg_p2+56] + mul r13 + add r8, rax + mov [rcx+128], r8 // c4 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+88] + mul r11 + add r9, rax + adc r10, 
rdx + adc r8, 0 + + mov rax, [reg_p2+64] + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+72] + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r11, [reg_p1+88] + mov rax, [reg_p2+48] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+80] + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+56] + mul r15 + add r9, rax + mov [rcx+136], r9 // c5 + adc r10, rdx + adc r8, 0 + + xor r9, r9 + mov rax, [reg_p2+64] + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+88] + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+80] + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+56] + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov rax, [reg_p2+72] + mul r13 + add r10, rax + mov [rcx+144], r10 // c6 + adc r8, rdx + adc r9, 0 + + xor r10, r10 + mov rax, [reg_p2+88] + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+64] + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+80] + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov rax, [reg_p2+72] + mul r15 + add r8, rax + mov [rcx+152], r8 // c7 + adc r9, rdx + adc r10, 0 + + xor r8, r8 + mov rax, [reg_p2+72] + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+80] + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+88] + mul r13 + add r9, rax + mov [rcx+160], r9 // c8 + adc r10, rdx + adc r8, 0 + + mov rax, [reg_p2+88] + mul r15 + add r10, rax + adc r8, rdx + + mov rax, [reg_p2+80] + mul r11 + add r10, rax + mov [rcx+168], r10 // c9 + adc r8, rdx + + mov rax, [reg_p2+88] + mul r11 + add r8, rax + mov [rcx+176], r8 // c10 + adc rdx, 0 + mov [rcx+184], rdx // c11 + + // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL + mov r8, [rsp] + sub r8, [rcx] + mov r9, [rsp+8] + sbb r9, [rcx+8] + mov r10, [rsp+16] + sbb r10, [rcx+16] + mov r11, [rsp+24] + sbb r11, [rcx+24] + mov r12, [rsp+32] + sbb r12, [rcx+32] + mov r13, [rsp+40] + sbb r13, [rcx+40] + mov r14, [rsp+48] + sbb r14, [rcx+48] + mov r15, [rsp+56] + sbb r15, [rcx+56] + mov rax, [rsp+64] + sbb rax, [rcx+64] + mov rdx, [rsp+72] + sbb rdx, [rcx+72] + mov rdi, [rsp+80] + sbb rdi, [rcx+80] + mov rsi, [rsp+88] + sbb rsi, [rcx+88] + mov [rsp], rsi + + // [r8-r15,rax,rdx,rdi,[rsp]] <- (AH+AL)*(BH+BL) - AL*BL - AH*BH + mov rsi, [rcx+96] + sub r8, rsi + mov rsi, [rcx+104] + sbb r9, rsi + mov rsi, [rcx+112] + sbb r10, rsi + mov rsi, [rcx+120] + sbb r11, rsi + mov rsi, [rcx+128] + sbb r12, rsi + mov rsi, [rcx+136] + sbb r13, rsi + mov rsi, [rcx+144] + sbb r14, rsi + mov rsi, [rcx+152] + sbb r15, rsi + mov rsi, [rcx+160] + sbb rax, rsi + mov rsi, [rcx+168] + sbb rdx, rsi + mov rsi, [rcx+176] + sbb rdi, rsi + mov rsi, [rsp] + sbb rsi, [rcx+184] + + // Final result + add r8, [rcx+48] + mov [rcx+48], r8 + adc r9, [rcx+56] + mov [rcx+56], r9 + adc r10, [rcx+64] + mov [rcx+64], r10 + adc r11, [rcx+72] + mov [rcx+72], r11 + adc r12, [rcx+80] + mov [rcx+80], r12 + adc r13, [rcx+88] + mov [rcx+88], r13 + adc r14, [rcx+96] + mov [rcx+96], r14 + adc r15, [rcx+104] + mov [rcx+104], r15 + adc rax, [rcx+112] + mov [rcx+112], rax + adc rdx, [rcx+120] + mov [rcx+120], rdx + adc rdi, [rcx+128] + mov [rcx+128], rdi + adc rsi, [rcx+136] + mov [rcx+136], rsi + mov rax, [rcx+144] + adc rax, 0 + mov [rcx+144], rax + mov rax, [rcx+152] + adc rax, 0 + mov [rcx+152], rax + mov rax, [rcx+160] + adc rax, 0 + mov [rcx+160], rax + mov rax, [rcx+168] + adc rax, 0 + mov [rcx+168], rax + mov rax, [rcx+176] + adc rax, 0 
+ mov [rcx+176], rax + mov rax, [rcx+184] + adc rax, 0 + mov [rcx+184], rax + + add rsp, 96 // Restoring space in stack + pop r15 + pop r14 + pop r13 + pop r12 + ret + +#endif + + +#ifdef _MULX_ + +///////////////////////////////////////////////////////////////// MACRO +// Schoolbook integer multiplication +// Inputs: memory pointers M0 and M1 +// Outputs: memory locations C, C+8, C+16, and regs T0:T7 +// Temps: memory locations regs T7:T9 +///////////////////////////////////////////////////////////////// +#ifdef _ADX_ + +.macro MUL256x448_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10 + mov rdx, \M0 + mulx \T0, \T1, \M1 + mulx \T2, \T3, 8\M1 + mov \C, \T1 // C0_final + xor rax, rax + mulx \T4, \T5, 16\M1 + adox \T0, \T3 + adox \T2, \T5 + mulx \T1, \T3, 24\M1 + adox \T4, \T3 + mulx \T5, \T6, 32\M1 + adox \T1, \T6 + mulx \T3, \T7, 40\M1 + adox \T5, \T7 + mulx \T6, \T8, 48\M1 + adox \T3, \T8 + adox \T6, rax + + mov rdx, 8\M0 + mulx \T8, \T7, \M1 + xor rax, rax + adcx \T0, \T7 + mov 8\C, \T0 // C1_final + adcx \T2, \T8 + mulx \T7, \T8, 8\M1 + adox \T2, \T8 + adcx \T4, \T7 + mulx \T0, \T8, 16\M1 + adox \T4, \T8 + adcx \T0, \T1 + mulx \T1, \T7, 24\M1 + adcx \T1, \T5 + mulx \T5, \T8, 32\M1 + adcx \T3, \T5 + mulx \T5, \T9, 40\M1 + adcx \T6, \T5 + mulx \T5, rdx, 48\M1 + adcx \T5, rax + + adox \T0, \T7 + adox \T1, \T8 + adox \T3, \T9 + adox \T6, rdx + adox \T5, rax + + mov rdx, 16\M0 + mulx \T8, \T7, \M1 + xor rax, rax + adcx \T2, \T7 + mov 16\C, \T2 // C2_final + adcx \T4, \T8 + mulx \T8, \T7, 8\M1 + adox \T4, \T7 + adcx \T0, \T8 + mulx \T2, \T8, 16\M1 + adox \T0, \T8 + adcx \T1, \T2 + mulx \T2, \T7, 24\M1 + adcx \T3, \T2 + mulx \T2, \T8, 32\M1 + adcx \T6, \T2 + mulx \T2, \T9, 40\M1 + adcx \T5, \T2 + mulx \T2, rdx, 48\M1 + adcx \T2, rax + + adox \T1, \T7 + adox \T3, \T8 + adox \T6, \T9 + adox \T5, rdx + adox \T2, rax + + mov rdx, 24\M0 + mulx \T8, \T7, \M1 + xor rax, rax + adcx \T7, \T4 + adcx \T0, \T8 + mulx \T8, \T10, 8\M1 + adox \T0, \T10 + adcx \T1, \T8 + mulx \T4, \T8, 16\M1 + adox \T1, \T8 + adcx \T3, \T4 + mulx \T4, \T10, 24\M1 + adcx \T6, \T4 + mulx \T4, \T8, 32\M1 + adcx \T5, \T4 + mulx \T4, \T9, 40\M1 + adcx \T2, \T4 + mulx \T4, rdx, 48\M1 + adcx \T4, rax + + adox \T3, \T10 + adox \T6, \T8 + adox \T5, \T9 + adox \T2, rdx + adox \T4, rax +.endm + +#else + +.macro MUL256x448_SCHOOL M0, M1, C, T0, T1, T2, T3, T4, T5, T6, T7, T8, T9, T10 + mov rdx, \M0 + mulx \T0, \T1, \M1 + mulx \T2, \T3, 8\M1 + mov \C, \T1 // C0_final + xor rax, rax + mulx \T4, \T5, 16\M1 + add \T0, \T3 + adc \T2, \T5 + mulx \T1, \T3, 24\M1 + adc \T4, \T3 + mulx \T5, \T6, 32\M1 + adc \T1, \T6 + mulx \T3, \T7, 40\M1 + adc \T5, \T7 + mulx \T6, \T8, 48\M1 + adc \T3, \T8 + adc \T6, rax + + mov rdx, 8\M0 + mulx \T8, \T7, \M1 + add \T0, \T7 + mov 8\C, \T0 // C1_final + adc \T2, \T8 + mulx \T7, \T8, 8\M1 + mov 32\C, \T8 // store + adc \T4, \T7 + mulx \T0, \T8, 16\M1 + mov 40\C, \T8 // store + adc \T0, \T1 + mulx \T1, \T7, 24\M1 + adc \T1, \T5 + mulx \T5, \T8, 32\M1 + adc \T3, \T5 + mulx \T5, \T9, 40\M1 + adc \T6, \T5 + mulx \T5, rdx, 48\M1 + adc \T5, rax + + xor rax, rax + add \T2, 32\C + adc \T4, 40\C + adc \T0, \T7 + adc \T1, \T8 + adc \T3, \T9 + adc \T6, rdx + adc \T5, rax + + mov rdx, 16\M0 + mulx \T8, \T7, \M1 + add \T2, \T7 + mov 16\C, \T2 // C2_final + adc \T4, \T8 + mulx \T8, \T7, 8\M1 + mov 32\C, \T7 // store + adc \T0, \T8 + mulx \T2, \T8, 16\M1 + mov 40\C, \T8 // store + adc \T1, \T2 + mulx \T2, \T7, 24\M1 + adc \T3, \T2 + mulx \T2, \T8, 32\M1 + adc \T6, \T2 + mulx \T2, \T9, 40\M1 + adc \T5, \T2 + mulx 
\T2, rdx, 48\M1 + adc \T2, rax + + xor rax, rax + add \T4, 32\C + adc \T0, 40\C + adc \T1, \T7 + adc \T3, \T8 + adc \T6, \T9 + adc \T5, rdx + adc \T2, rax + + mov rdx, 24\M0 + mulx \T8, \T7, \M1 + add \T7, \T4 + adc \T0, \T8 + mulx \T8, \T10, 8\M1 + mov 32\C, \T10 // store + adc \T1, \T8 + mulx \T4, \T8, 16\M1 + mov 40\C, \T8 // store + adc \T3, \T4 + mulx \T4, \T10, 24\M1 + adc \T6, \T4 + mulx \T4, \T8, 32\M1 + adc \T5, \T4 + mulx \T4, \T9, 40\M1 + adc \T2, \T4 + mulx \T4, rdx, 48\M1 + adc \T4, rax + + xor rax, rax + add \T0, 32\C + adc \T1, 40\C + adc \T3, \T10 + adc \T6, \T8 + adc \T5, \T9 + adc \T2, rdx + adc \T4, rax +.endm + +#endif + + +//************************************************************************************** +// Montgomery reduction +// Based on method described in Faz-Hernandez et al. https://eprint.iacr.org/2017/1015 +// Operation: c [reg_p2] = a [reg_p1] +// NOTE: a=c is not allowed +//************************************************************************************** +.global rdc751_asm +rdc751_asm: + push rbx + push rbp + push r12 + push r13 + push r14 + push r15 + + // a[0-3] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + MUL256x448_SCHOOL [reg_p1], [p751p1_nz], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 + + xor r15, r15 + mov rax, [reg_p2+48] + mov rdx, [reg_p2+56] + mov rbx, [reg_p2+64] + add rax, [reg_p1+40] + adc rdx, [reg_p1+48] + adc rbx, [reg_p1+56] + mov [reg_p1+40], rax + mov [reg_p1+48], rdx + mov [reg_p1+56], rbx + adc rbp, [reg_p1+64] + adc r8, [reg_p1+72] + adc r9, [reg_p1+80] + adc r10, [reg_p1+88] + adc r11, [reg_p1+96] + adc r12, [reg_p1+104] + adc r13, [reg_p1+112] + adc r14, [reg_p1+120] + adc r15, [reg_p1+128] + mov [reg_p1+64], rbp + mov [reg_p1+72], r8 + mov [reg_p1+80], r9 + mov [reg_p1+88], r10 + mov [reg_p1+96], r11 + mov [reg_p1+104], r12 + mov [reg_p1+112], r13 + mov [reg_p1+120], r14 + mov [reg_p1+128], r15 + mov r8, [reg_p1+136] + mov r9, [reg_p1+144] + mov r10, [reg_p1+152] + mov r11, [reg_p1+160] + mov r12, [reg_p1+168] + mov r13, [reg_p1+176] + mov r14, [reg_p1+184] + adc r8, 0 + adc r9, 0 + adc r10, 0 + adc r11, 0 + adc r12, 0 + adc r13, 0 + adc r14, 0 + mov [reg_p1+136], r8 + mov [reg_p1+144], r9 + mov [reg_p1+152], r10 + mov [reg_p1+160], r11 + mov [reg_p1+168], r12 + mov [reg_p1+176], r13 + mov [reg_p1+184], r14 + + // a[4-7] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + MUL256x448_SCHOOL [reg_p1+32], [p751p1_nz], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 + + xor r15, r15 + mov rax, [reg_p2+48] + mov rdx, [reg_p2+56] + mov rbx, [reg_p2+64] + add rax, [reg_p1+72] + adc rdx, [reg_p1+80] + adc rbx, [reg_p1+88] + mov [reg_p1+72], rax + mov [reg_p1+80], rdx + mov [reg_p1+88], rbx + adc rbp, [reg_p1+96] + adc r8, [reg_p1+104] + adc r9, [reg_p1+112] + adc r10, [reg_p1+120] + adc r11, [reg_p1+128] + adc r12, [reg_p1+136] + adc r13, [reg_p1+144] + adc r14, [reg_p1+152] + adc r15, [reg_p1+160] + mov [reg_p2], rbp // Final result c0 + mov [reg_p1+104], r8 + mov [reg_p1+112], r9 + mov [reg_p1+120], r10 + mov [reg_p1+128], r11 + mov [reg_p1+136], r12 + mov [reg_p1+144], r13 + mov [reg_p1+152], r14 + mov [reg_p1+160], r15 + mov r12, [reg_p1+168] + mov r13, [reg_p1+176] + mov r14, [reg_p1+184] + adc r12, 0 + adc r13, 0 + adc r14, 0 + mov [reg_p1+168], r12 + mov [reg_p1+176], r13 + mov [reg_p1+184], r14 + + // a[8-11] x p751p1_nz --> result: [reg_p2+48], [reg_p2+56], [reg_p2+64], and rbp, r8:r14 + MUL256x448_SCHOOL [reg_p1+64], 
[p751p1_nz], [reg_p2+48], r8, r9, r13, r10, r14, r12, r11, rbp, rbx, rcx, r15 + + // Final result c1:c11 + mov rax, [reg_p2+48] + mov rdx, [reg_p2+56] + mov rbx, [reg_p2+64] + add rax, [reg_p1+104] + adc rdx, [reg_p1+112] + adc rbx, [reg_p1+120] + mov [reg_p2+8], rax + mov [reg_p2+16], rdx + mov [reg_p2+24], rbx + adc rbp, [reg_p1+128] + adc r8, [reg_p1+136] + adc r9, [reg_p1+144] + adc r10, [reg_p1+152] + adc r11, [reg_p1+160] + adc r12, [reg_p1+168] + adc r13, [reg_p1+176] + adc r14, [reg_p1+184] + mov [reg_p2+32], rbp + mov [reg_p2+40], r8 + mov [reg_p2+48], r9 + mov [reg_p2+56], r10 + mov [reg_p2+64], r11 + mov [reg_p2+72], r12 + mov [reg_p2+80], r13 + mov [reg_p2+88], r14 + + pop r15 + pop r14 + pop r13 + pop r12 + pop rbp + pop rbx + ret + + #else + +//*********************************************************************** +// Montgomery reduction +// Based on comba method +// Operation: c [reg_p2] = a [reg_p1] +// NOTE: a=c is not allowed +//*********************************************************************** +.global rdc751_asm +rdc751_asm: + push r12 + push r13 + push r14 + push r15 + + mov r11, [reg_p1] + movq rax, p751p1_5 + mul r11 + xor r8, r8 + add rax, [reg_p1+40] + mov [reg_p2+40], rax // z5 + adc r8, rdx + + xor r9, r9 + movq rax, p751p1_6 + mul r11 + xor r10, r10 + add r8, rax + adc r9, rdx + + mov r12, [reg_p1+8] + movq rax, p751p1_5 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+48] + mov [reg_p2+48], r8 // z6 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_7 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p1+16] + movq rax, p751p1_5 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+56] + mov [reg_p2+56], r9 // z7 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_8 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p1+24] + movq rax, p751p1_5 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+64] + mov [reg_p2+64], r10 // z8 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_9 + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_6 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p1+32] + movq rax, p751p1_5 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+72] + mov [reg_p2+72], r8 // z9 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_10 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_7 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rcx, [reg_p2+40] + movq rax, p751p1_5 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+80] + mov [reg_p2+80], r9 // z10 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_11 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_9 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_8 + mul r14 
+ add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r11, [reg_p2+48] + movq rax, p751p1_5 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+88] + mov [reg_p2+88], r10 // z11 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_9 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_6 + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r12, [reg_p2+56] + movq rax, p751p1_5 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+96] + mov [reg_p2], r8 // z0 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_11 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_10 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_7 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov r13, [reg_p2+64] + movq rax, p751p1_5 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+104] + mov [reg_p2+8], r9 // z1 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_11 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_9 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_8 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + mov r14, [reg_p2+72] + movq rax, p751p1_5 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+112] + mov [reg_p2+16], r10 // z2 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_9 + mul r11 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_6 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + mov r15, [reg_p2+80] + movq rax, p751p1_5 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+120] + mov [reg_p2+24], r8 // z3 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_11 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_10 + mul r11 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r12 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_7 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_6 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + mov rcx, [reg_p2+88] + movq rax, p751p1_5 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+128] + mov [reg_p2+32], r9 // z4 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, 
p751p1_11 + mul r11 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r12 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_9 + mul r13 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_8 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_7 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_6 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+136] + mov [reg_p2+40], r10 // z5 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r12 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul r13 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_9 + mul r14 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_8 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_7 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+144] + mov [reg_p2+48], r8 // z6 + adc r9, 0 + adc r10, 0 + + xor r8, r8 + movq rax, p751p1_11 + mul r13 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_10 + mul r14 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_9 + mul r15 + add r9, rax + adc r10, rdx + adc r8, 0 + + movq rax, p751p1_8 + mul rcx + add r9, rax + adc r10, rdx + adc r8, 0 + add r9, [reg_p1+152] + mov [reg_p2+56], r9 // z7 + adc r10, 0 + adc r8, 0 + + xor r9, r9 + movq rax, p751p1_11 + mul r14 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_10 + mul r15 + add r10, rax + adc r8, rdx + adc r9, 0 + + movq rax, p751p1_9 + mul rcx + add r10, rax + adc r8, rdx + adc r9, 0 + add r10, [reg_p1+160] + mov [reg_p2+64], r10 // z8 + adc r8, 0 + adc r9, 0 + + xor r10, r10 + movq rax, p751p1_11 + mul r15 + add r8, rax + adc r9, rdx + adc r10, 0 + + movq rax, p751p1_10 + mul rcx + add r8, rax + adc r9, rdx + adc r10, 0 + add r8, [reg_p1+168] // z9 + mov [reg_p2+72], r8 // z9 + adc r9, 0 + adc r10, 0 + + movq rax, p751p1_11 + mul rcx + add r9, rax + adc r10, rdx + add r9, [reg_p1+176] // z10 + mov [reg_p2+80], r9 // z10 + adc r10, 0 + add r10, [reg_p1+184] // z11 + mov [reg_p2+88], r10 // z11 + + pop r15 + pop r14 + pop r13 + pop r12 + ret + + #endif + + +//*********************************************************************** +// 751-bit multiprecision addition +// Operation: c [reg_p3] = a [reg_p1] + b [reg_p2] +//*********************************************************************** +.global mp_add751_asm +mp_add751_asm: + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov rax, [reg_p1+32] + mov rcx, [reg_p1+40] + add r8, [reg_p2] + adc r9, [reg_p2+8] + adc r10, [reg_p2+16] + adc r11, [reg_p2+24] + adc rax, [reg_p2+32] + adc rcx, [reg_p2+40] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], rax + mov [reg_p3+40], rcx + + mov r8, [reg_p1+48] + mov r9, [reg_p1+56] + mov r10, [reg_p1+64] + mov r11, [reg_p1+72] + mov rax, [reg_p1+80] + mov rcx, [reg_p1+88] + adc r8, [reg_p2+48] + adc r9, [reg_p2+56] + adc r10, [reg_p2+64] + adc r11, [reg_p2+72] + adc rax, [reg_p2+80] + adc rcx, [reg_p2+88] + mov [reg_p3+48], r8 + mov [reg_p3+56], r9 + mov [reg_p3+64], r10 + mov [reg_p3+72], r11 + mov [reg_p3+80], rax + mov [reg_p3+88], rcx + ret + + +//*********************************************************************** +// 2x751-bit multiprecision subtraction +// Operation: c [reg_p3] = a [reg_p1] - b [reg_p2]. 
Returns borrow mask +//*********************************************************************** +.global mp_sub751x2_asm +mp_sub751x2_asm: + xor rax, rax + mov r8, [reg_p1] + mov r9, [reg_p1+8] + mov r10, [reg_p1+16] + mov r11, [reg_p1+24] + mov rcx, [reg_p1+32] + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb rcx, [reg_p2+32] + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], rcx + + mov r8, [reg_p1+40] + mov r9, [reg_p1+48] + mov r10, [reg_p1+56] + mov r11, [reg_p1+64] + mov rcx, [reg_p1+72] + sbb r8, [reg_p2+40] + sbb r9, [reg_p2+48] + sbb r10, [reg_p2+56] + sbb r11, [reg_p2+64] + sbb rcx, [reg_p2+72] + mov [reg_p3+40], r8 + mov [reg_p3+48], r9 + mov [reg_p3+56], r10 + mov [reg_p3+64], r11 + mov [reg_p3+72], rcx + + mov r8, [reg_p1+80] + mov r9, [reg_p1+88] + mov r10, [reg_p1+96] + mov r11, [reg_p1+104] + mov rcx, [reg_p1+112] + sbb r8, [reg_p2+80] + sbb r9, [reg_p2+88] + sbb r10, [reg_p2+96] + sbb r11, [reg_p2+104] + sbb rcx, [reg_p2+112] + mov [reg_p3+80], r8 + mov [reg_p3+88], r9 + mov [reg_p3+96], r10 + mov [reg_p3+104], r11 + mov [reg_p3+112], rcx + + mov r8, [reg_p1+120] + mov r9, [reg_p1+128] + mov r10, [reg_p1+136] + mov r11, [reg_p1+144] + mov rcx, [reg_p1+152] + sbb r8, [reg_p2+120] + sbb r9, [reg_p2+128] + sbb r10, [reg_p2+136] + sbb r11, [reg_p2+144] + sbb rcx, [reg_p2+152] + mov [reg_p3+120], r8 + mov [reg_p3+128], r9 + mov [reg_p3+136], r10 + mov [reg_p3+144], r11 + mov [reg_p3+152], rcx + + mov r8, [reg_p1+160] + mov r9, [reg_p1+168] + mov r10, [reg_p1+176] + mov r11, [reg_p1+184] + sbb r8, [reg_p2+160] + sbb r9, [reg_p2+168] + sbb r10, [reg_p2+176] + sbb r11, [reg_p2+184] + sbb rax, 0 + mov [reg_p3+160], r8 + mov [reg_p3+168], r9 + mov [reg_p3+176], r10 + mov [reg_p3+184], r11 + ret + + +//*********************************************************************** +// Double 2x751-bit multiprecision subtraction +// Operation: c [reg_p3] = c [reg_p3] - a [reg_p1] - b [reg_p2] +//*********************************************************************** +.global mp_dblsub751x2_asm +mp_dblsub751x2_asm: + push r12 + push r13 + push r14 + push r15 + + xor rax, rax + mov r8, [reg_p3] + mov r9, [reg_p3+8] + mov r10, [reg_p3+16] + mov r11, [reg_p3+24] + mov r12, [reg_p3+32] + mov r13, [reg_p3+40] + mov r14, [reg_p3+48] + mov r15, [reg_p3+56] + sub r8, [reg_p1] + sbb r9, [reg_p1+8] + sbb r10, [reg_p1+16] + sbb r11, [reg_p1+24] + sbb r12, [reg_p1+32] + sbb r13, [reg_p1+40] + sbb r14, [reg_p1+48] + sbb r15, [reg_p1+56] + adc rax, 0 + sub r8, [reg_p2] + sbb r9, [reg_p2+8] + sbb r10, [reg_p2+16] + sbb r11, [reg_p2+24] + sbb r12, [reg_p2+32] + sbb r13, [reg_p2+40] + sbb r14, [reg_p2+48] + sbb r15, [reg_p2+56] + adc rax, 0 + mov [reg_p3], r8 + mov [reg_p3+8], r9 + mov [reg_p3+16], r10 + mov [reg_p3+24], r11 + mov [reg_p3+32], r12 + mov [reg_p3+40], r13 + mov [reg_p3+48], r14 + mov [reg_p3+56], r15 + + xor rcx, rcx + mov r8, [reg_p3+64] + mov r9, [reg_p3+72] + mov r10, [reg_p3+80] + mov r11, [reg_p3+88] + mov r12, [reg_p3+96] + mov r13, [reg_p3+104] + mov r14, [reg_p3+112] + mov r15, [reg_p3+120] + sub r8, rax + sbb r8, [reg_p1+64] + sbb r9, [reg_p1+72] + sbb r10, [reg_p1+80] + sbb r11, [reg_p1+88] + sbb r12, [reg_p1+96] + sbb r13, [reg_p1+104] + sbb r14, [reg_p1+112] + sbb r15, [reg_p1+120] + adc rcx, 0 + sub r8, [reg_p2+64] + sbb r9, [reg_p2+72] + sbb r10, [reg_p2+80] + sbb r11, [reg_p2+88] + sbb r12, [reg_p2+96] + sbb r13, [reg_p2+104] + sbb r14, [reg_p2+112] + sbb r15, [reg_p2+120] + adc rcx, 0 
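+ // write back the middle limbs c[8..15]; borrows from this segment were
+ // accumulated in rcx and are applied to the top segment below via "sub r8, rcx"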
+ mov [reg_p3+64], r8 + mov [reg_p3+72], r9 + mov [reg_p3+80], r10 + mov [reg_p3+88], r11 + mov [reg_p3+96], r12 + mov [reg_p3+104], r13 + mov [reg_p3+112], r14 + mov [reg_p3+120], r15 + + mov r8, [reg_p3+128] + mov r9, [reg_p3+136] + mov r10, [reg_p3+144] + mov r11, [reg_p3+152] + mov r12, [reg_p3+160] + mov r13, [reg_p3+168] + mov r14, [reg_p3+176] + mov r15, [reg_p3+184] + sub r8, rcx + sbb r8, [reg_p1+128] + sbb r9, [reg_p1+136] + sbb r10, [reg_p1+144] + sbb r11, [reg_p1+152] + sbb r12, [reg_p1+160] + sbb r13, [reg_p1+168] + sbb r14, [reg_p1+176] + sbb r15, [reg_p1+184] + sub r8, [reg_p2+128] + sbb r9, [reg_p2+136] + sbb r10, [reg_p2+144] + sbb r11, [reg_p2+152] + sbb r12, [reg_p2+160] + sbb r13, [reg_p2+168] + sbb r14, [reg_p2+176] + sbb r15, [reg_p2+184] + mov [reg_p3+128], r8 + mov [reg_p3+136], r9 + mov [reg_p3+144], r10 + mov [reg_p3+152], r11 + mov [reg_p3+160], r12 + mov [reg_p3+168], r13 + mov [reg_p3+176], r14 + mov [reg_p3+184], r15 + + pop r15 + pop r14 + pop r13 + pop r12 + ret diff --git a/third_party/sidh/src/P751/ARM64/fp_arm64.c b/third_party/sidh/src/P751/ARM64/fp_arm64.c new file mode 100644 index 00000000..096e11ed --- /dev/null +++ b/third_party/sidh/src/P751/ARM64/fp_arm64.c @@ -0,0 +1,93 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: modular arithmetic optimized for 64-bit ARMv8 platforms for P751 +*********************************************************************************************/ + +#include "../P751_internal.h" + +// Global constants +extern const uint64_t p751[NWORDS_FIELD]; +extern const uint64_t p751x2[NWORDS_FIELD]; + + +__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular addition, c = a+b mod p751. + // Inputs: a, b in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + + fpadd751_asm(a, b, c); +} + + +__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular subtraction, c = a-b mod p751. + // Inputs: a, b in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + + fpsub751_asm(a, b, c); +} + + +__inline void fpneg751(digit_t* a) +{ // Modular negation, a = -a mod p751. + // Input/output: a in [0, 2*p751-1] + unsigned int i, borrow = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, ((digit_t*)p751x2)[i], a[i], borrow, a[i]); + } +} + + +void fpdiv2_751(const digit_t* a, digit_t* c) +{ // Modular division by two, c = a/2 mod p751. + // Input : a in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + unsigned int i, carry = 0; + digit_t mask; + + mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p751 + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], ((digit_t*)p751)[i] & mask, carry, c[i]); + } + + mp_shiftr1(c, NWORDS_FIELD); +} + + +void fpcorrection751(digit_t* a) +{ // Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], ((digit_t*)p751)[i], borrow, a[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, a[i], ((digit_t*)p751)[i] & mask, borrow, a[i]); + } +} + + +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ // Multiprecision multiply, c = a*b, where lng(a) = lng(b) = nwords.
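+ // For reference, the operation is plain schoolbook multiplication; an
+ // equivalent portable sketch (illustrative only -- this build always calls
+ // the Comba-style assembly below, and the sketch assumes a 64-bit digit_t
+ // and the compiler extension unsigned __int128):
+ //
+ //   for (unsigned int i = 0; i < 2*nwords; i++) c[i] = 0;
+ //   for (unsigned int i = 0; i < nwords; i++) {
+ //       digit_t carry = 0;
+ //       for (unsigned int j = 0; j < nwords; j++) {
+ //           unsigned __int128 t = (unsigned __int128)a[i]*b[j] + c[i+j] + carry;
+ //           c[i+j] = (digit_t)t;
+ //           carry  = (digit_t)(t >> 64);
+ //       }
+ //       c[i+nwords] = carry;
+ //   }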
+ + UNREFERENCED_PARAMETER(nwords); + + mul751_asm(a, b, c); +} + + + +void rdc_mont(const digit_t* ma, digit_t* mc) +{ // Montgomery reduction exploiting special form of the prime. + // mc = ma*R^-1 mod p751x2, where R = 2^768. + // If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1]. + // ma is assumed to be in Montgomery representation. + + rdc751_asm(ma, mc); +} diff --git a/third_party/sidh/src/P751/ARM64/fp_arm64_asm.S b/third_party/sidh/src/P751/ARM64/fp_arm64_asm.S new file mode 100644 index 00000000..995cb45a --- /dev/null +++ b/third_party/sidh/src/P751/ARM64/fp_arm64_asm.S @@ -0,0 +1,2511 @@ +//******************************************************************************************* +// SIDH: an efficient supersingular isogeny cryptography library +// +// Author: David Urbanik; dburbani@uwaterloo.ca +// +// Abstract: Assembly optimizations for finite field arithmetic over P751 on 64-bit ARM. +// +// File was modified to allow inputs in [0, 2*p751-1]. +//******************************************************************************************* + +.data + +// p751 + 1 +p751p1: +.quad 0xEEB0000000000000 +.quad 0xE3EC968549F878A8 +.quad 0xDA959B1A13F7CC76 +.quad 0x084E9867D6EBE876 +.quad 0x8562B5045CB25748 +.quad 0x0E12909F97BADC66 +.quad 0x00006FE5D541F71C + +// p751 +p751: +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xEEAFFFFFFFFFFFFF +.quad 0xE3EC968549F878A8 +.quad 0xDA959B1A13F7CC76 +.quad 0x084E9867D6EBE876 +.quad 0x8562B5045CB25748 +.quad 0x0E12909F97BADC66 +.quad 0x00006FE5D541F71C + +// 2 * p751 +p751x2: +.quad 0xFFFFFFFFFFFFFFFE +.quad 0xFFFFFFFFFFFFFFFF +.quad 0xDD5FFFFFFFFFFFFF +.quad 0xC7D92D0A93F0F151 +.quad 0xB52B363427EF98ED +.quad 0x109D30CFADD7D0ED +.quad 0x0AC56A08B964AE90 +.quad 0x1C25213F2F75B8CD +.quad 0x0000DFCBAA83EE38 + + +.text +//*********************************************************************** +// Field addition +// Operation: c [x2] = a [x0] + b [x1] +//*********************************************************************** +.global fpadd751_asm +fpadd751_asm: + // Arguments are 3 pointers of type digit_t*, where the first two arguments are summands and the third is the result register. + // These arguments are stored in x0, x1, and x2 respectively. 
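+ // The computation below is c = a + b - 2*p751, followed by a branch-free
+ // correction: the final borrow of the subtraction is expanded into a mask
+ // (x15 = borrow ? all-ones : 0) and (2*p751 & mask) is added back, so the
+ // result stays in [0, 2*p751-1] without data-dependent branches.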
+ + // load first summand into x3 - x14 + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x0,#64] + ldp x13, x14, [x0,#80] + + // add first summand and second summand and store result in x3 - x14 + ldp x15, x16, [x1,#0] + ldp x17, x18, [x1,#16] + adds x3, x3, x15 + adcs x4, x4, x16 + adcs x5, x5, x17 + adcs x6, x6, x18 + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adcs x10, x10, x18 + ldp x15, x16, [x1,#64] + ldp x17, x18, [x1,#80] + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, x18 + + // subtract 2*p751 from the result in x3 - x14 + ldr x16, p751x2 + subs x3, x3, x16 + ldr x15, p751x2 + 8 + sbcs x4, x4, x15 + sbcs x5, x5, x15 + sbcs x6, x6, x15 + sbcs x7, x7, x15 + ldr x16, p751x2 + 16 + ldr x17, p751x2 + 24 + sbcs x8, x8, x16 + ldr x18, p751x2 + 32 + sbcs x9, x9, x17 + ldr x16, p751x2 + 40 + sbcs x10, x10, x18 + ldr x17, p751x2 + 48 + sbcs x11, x11, x16 + ldr x18, p751x2 + 56 + sbcs x12, x12, x17 + ldr x15, p751x2 + 64 + sbcs x13, x13, x18 + sbcs x14, x14, x15 + sbc x15, xzr, xzr + + // add 2*p751 back, ANDed with the mask in x15 + ldr x16, p751x2 + and x16, x16, x15 + ldr x17, p751x2 + 8 + and x17, x17, x15 + ldr x18, p751x2 + 16 + and x18, x18, x15 + + adds x3, x3, x16 + adcs x4, x4, x17 + adcs x5, x5, x17 + adcs x6, x6, x17 + adcs x7, x7, x17 + adcs x8, x8, x18 + + ldr x16, p751x2 + 24 + and x16, x16, x15 + adcs x9, x9, x16 + + ldr x16, p751x2 + 32 + and x16, x16, x15 + ldr x17, p751x2 + 40 + and x17, x17, x15 + ldr x18, p751x2 + 48 + and x18, x18, x15 + + adcs x10, x10, x16 + adcs x11, x11, x17 + adcs x12, x12, x18 + + ldr x16, p751x2 + 56 + and x16, x16, x15 + ldr x17, p751x2 + 64 + and x17, x17, x15 + + adcs x13, x13, x16 + adcs x14, x14, x17 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp x13, x14, [x2,#80] + ret + + +//*********************************************************************** +// Field subtraction +// Operation: c [x2] = a [x0] - b [x1] +//*********************************************************************** +.global fpsub751_asm +fpsub751_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x0,#64] + ldp x13, x14, [x0,#80] + + ldp x15, x16, [x1, #0] + subs x3, x3, x15 + sbcs x4, x4, x16 + ldp x15, x16, [x1, #16] + sbcs x5, x5, x15 + sbcs x6, x6, x16 + ldp x15, x16, [x1, #32] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + ldp x15, x16, [x1, #48] + sbcs x9, x9, x15 + sbcs x10, x10, x16 + ldp x15, x16, [x1, #64] + sbcs x11, x11, x15 + sbcs x12, x12, x16 + ldp x15, x16, [x1, #80] + sbcs x13, x13, x15 + sbcs x14, x14, x16 + sbc x17, xzr, xzr + + ldr x15, p751x2 + and x15, x15, x17 + ldr x16, p751x2 + 8 + and x16, x16, x17 + ldr x18, p751x2 + 16 + and x18, x18, x17 + + adds x3, x3, x15 + adcs x4, x4, x16 + adcs x5, x5, x16 + adcs x6, x6, x16 + adcs x7, x7, x16 + adcs x8, x8, x18 + + ldr x15, p751x2 + 24 + and x15, x15, x17 + ldr x16, p751x2 + 32 + and x16, x16, x17 + + adcs x9, x9, x15 + adcs x10, x10, x16 + + ldr x15, p751x2 + 40 + and x15, x15, x17 + ldr x16, p751x2 + 48 + and x16, x16, x17 + + adcs x11, x11, x15 + adcs x12, x12, x16 + + ldr x15, p751x2 + 56 + and x15, x15, x17 + ldr x16, p751x2 + 64 + and x16, x16, x17 + + adcs x13, x13, x15 + adcs x14, x14, x16 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp
x13, x14, [x2,#80] + ret + + +//*********************************************************************** +// Integer multiplication using Comba method +// Operation: c [x2] = a [x0] * b [x1] +//*********************************************************************** +.global mul751_asm +mul751_asm: + sub sp, sp, #80 + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + + ldp x3, x4, [x0, #0] + ldp x5, x6, [x1, #0] + mul x18, x3, x5 + umulh x17, x3, x5 + // c0 is now in x18 + + // a0 * b1 + mul x13, x3, x6 + umulh x14, x3, x6 + + adds x17, x17, x13 + adcs x16, x14, xzr + adcs x15, xzr, xzr + + // b0 * a1 + mul x13, x4, x5 + umulh x14, x4, x5 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // store c0 and c1 + stp x18, x17, [x2, #0] + + // load a2, a3, b2, b3 + ldp x7, x8, [x0, #16] + ldp x9, x10, [x1, #16] + + // a0 * b2 + mul x13, x3, x9 + umulh x14, x3, x9 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, xzr, xzr + + // a1 * b1 + mul x13, x4, x6 + umulh x14, x4, x6 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a2 * b0 + mul x13, x7, x5 + umulh x14, x7, x5 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // c2 is now in x16 + + // a0 * b3 + mul x13, x3, x10 + umulh x14, x3, x10 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, xzr, xzr + + // a1 * b2 + mul x13, x4, x9 + umulh x14, x4, x9 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a2 * b1 + mul x13, x7, x6 + umulh x14, x7, x6 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a3 * b0 + mul x13, x8, x5 + umulh x14, x8, x5 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // store c2 and c3 + stp x16, x15, [x2, #16] + + // a1 * b3 + mul x13, x4, x10 + umulh x14, x4, x10 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, xzr, xzr + + // a2 * b2 + mul x13, x7, x9 + umulh x14, x7, x9 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a3 * b1 + mul x13, x8, x6 + umulh x14, x8, x6 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // load a4, a5 + ldp x11, x12, [x0, #32] + + // a4 * b0 + mul x13, x11, x5 + umulh x14, x11, x5 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // load b4, b5 + ldp x19, x20, [x1, #32] + + // a0 * b4 + mul x13, x3, x19 + umulh x14, x3, x19 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // c4 is now in x18 + + // a0 * b5 + mul x13, x3, x20 + umulh x14, x3, x20 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, xzr, xzr + + // a1 * b4 + mul x13, x4, x19 + umulh x14, x4, x19 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a2 * b3 + mul x13, x7, x10 + umulh x14, x7, x10 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a3 * b2 + mul x13, x8, x9 + umulh x14, x8, x9 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a4 * b1 + mul x13, x11, x6 + umulh x14, x11, x6 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a5 * b0 + mul x13, x12, x5 + umulh x14, x12, x5 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // store c4 and c5 + stp x18, x17, [x2, #32] + + // load a6, a7 + ldp x21, x22, [x0, #48] + + // a6 * b0 + mul x13, x21, x5 + umulh x14, x21, x5 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, xzr, xzr + + // a5 * b1 + mul x13, x12, x6 + umulh x14, x12, x6 + + adds x16, x16, x13 + adcs x15, x15, x14 + 
adcs x18, x18, xzr + + // a4 * b2 + mul x13, x11, x9 + umulh x14, x11, x9 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a3 * b3 + mul x13, x8, x10 + umulh x14, x8, x10 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a2 * b4 + mul x13, x7, x19 + umulh x14, x7, x19 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a1 * b5 + mul x13, x4, x20 + umulh x14, x4, x20 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // load b6, b7 + ldp x23, x24, [x1, #48] + + // a0 * b6 + mul x13, x3, x23 + umulh x14, x3, x23 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // c6 is now in x16 + + // a0 * b7 + mul x13, x3, x24 + umulh x14, x3, x24 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, xzr, xzr + + // a1 * b6 + mul x13, x4, x23 + umulh x14, x4, x23 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a2 * b5 + mul x13, x7, x20 + umulh x14, x7, x20 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a3 * b4 + mul x13, x8, x19 + umulh x14, x8, x19 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a4 * b3 + mul x13, x11, x10 + umulh x14, x11, x10 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a5 * b2 + mul x13, x12, x9 + umulh x14, x12, x9 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a6 * b1 + mul x13, x21, x6 + umulh x14, x21, x6 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a7 * b0 + mul x13, x22, x5 + umulh x14, x22, x5 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // store c6 and c7 + stp x16, x15, [x2, #48] + + // load a8, a9 + ldp x25, x26, [x0, #64] + + // a8 * b0 + mul x13, x25, x5 + umulh x14, x25, x5 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, xzr, xzr + + // a7 * b1 + mul x13, x22, x6 + umulh x14, x22, x6 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a6 * b2 + mul x13, x21, x9 + umulh x14, x21, x9 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a5 * b3 + mul x13, x12, x10 + umulh x14, x12, x10 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a4 * b4 + mul x13, x11, x19 + umulh x14, x11, x19 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a3 * b5 + mul x13, x8, x20 + umulh x14, x8, x20 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a2 * b6 + mul x13, x7, x23 + umulh x14, x7, x23 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a1 * b7 + mul x13, x4, x24 + umulh x14, x4, x24 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // load b8, b9 + ldp x27, x28, [x1, #64] + + // a0 * b8 + mul x13, x3, x27 + umulh x14, x3, x27 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // c8 is now in x18 + + // a0 * b9 + mul x13, x3, x28 + umulh x14, x3, x28 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, xzr, xzr + + // a1 * b8 + mul x13, x4, x27 + umulh x14, x4, x27 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a2 * b7 + mul x13, x7, x24 + umulh x14, x7, x24 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a3 * b6 + mul x13, x8, x23 + umulh x14, x8, x23 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a4 * b5 + mul x13, x11, x20 + umulh x14, x11, x20 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a5 * b4 + mul x13, x12, x19 + umulh x14, x12, 
x19 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a6 * b3 + mul x13, x21, x10 + umulh x14, x21, x10 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a7 * b2 + mul x13, x22, x9 + umulh x14, x22, x9 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a8 * b1 + mul x13, x25, x6 + umulh x14, x25, x6 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a9 * b0 + mul x13, x26, x5 + umulh x14, x26, x5 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // store c8 and c9 + stp x18, x17, [x2, #64] + + // load a10, a11; a0 and a1 unloaded + ldp x3, x4, [x0, #80] + + // a10 * b0 + mul x13, x3, x5 + umulh x14, x3, x5 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, xzr, xzr + + // a9 * b1 + mul x13, x26, x6 + umulh x14, x26, x6 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a8 * b2 + mul x13, x25, x9 + umulh x14, x25, x9 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a7 * b3 + mul x13, x22, x10 + umulh x14, x22, x10 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a6 * b4 + mul x13, x21, x19 + umulh x14, x21, x19 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a5 * b5 + mul x13, x12, x20 + umulh x14, x12, x20 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a4 * b6 + mul x13, x11, x23 + umulh x14, x11, x23 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a3 * b7 + mul x13, x8, x24 + umulh x14, x8, x24 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a2 * b8 + mul x13, x7, x27 + umulh x14, x7, x27 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // load a0, a1; b0 and b1 unloaded + ldp x5, x6, [x0, #0] + + // a1 * b9 + mul x13, x6, x28 + umulh x14, x6, x28 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // load b10, b11; a10 and a11 unloaded + ldp x3, x4, [x1, #80] + + // a0 * b10 + mul x13, x3, x5 + umulh x14, x3, x5 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // c10 now in x16 + + // a0 * b11 + mul x13, x4, x5 + umulh x14, x4, x5 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, xzr, xzr + + // a1 * b10 + mul x13, x3, x6 + umulh x14, x3, x6 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a2 * b9 + mul x13, x7, x28 + umulh x14, x7, x28 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a3 * b8 + mul x13, x8, x27 + umulh x14, x8, x27 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a4 * b7 + mul x13, x11, x24 + umulh x14, x11, x24 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a5 * b6 + mul x13, x12, x23 + umulh x14, x12, x23 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a6 * b5 + mul x13, x21, x20 + umulh x14, x21, x20 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a7 * b4 + mul x13, x22, x19 + umulh x14, x22, x19 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a8 * b3 + mul x13, x25, x10 + umulh x14, x25, x10 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a9 * b2 + mul x13, x26, x9 + umulh x14, x26, x9 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // load a10, a11; b10 and b11 unloaded + ldp x3, x4, [x0, #80] + // load b0, b1; a0 and a1 unloaded + ldp x5, x6, [x1, #0] + + // a10 * b1 + mul x13, x3, x6 + umulh x14, x3, x6 + + adds x15, x15, x13 
+ adcs x18, x18, x14 + adcs x17, x17, xzr + + // a11 * b0 + mul x13, x4, x5 + umulh x14, x4, x5 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // store c10 and c11 + stp x16, x15, [x2, #80] + + // a11 * b1 + mul x13, x4, x6 + umulh x14, x4, x6 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, xzr, xzr + + // a10 * b2 + mul x13, x9, x3 + umulh x14, x9, x3 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a9 * b3 + mul x13, x26, x10 + umulh x14, x26, x10 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a8 * b4 + mul x13, x25, x19 + umulh x14, x25, x19 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a7 * b5 + mul x13, x22, x20 + umulh x14, x22, x20 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a6 * b6 + mul x13, x21, x23 + umulh x14, x21, x23 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a5 * b7 + mul x13, x12, x24 + umulh x14, x12, x24 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a4 * b8 + mul x13, x11, x27 + umulh x14, x11, x27 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a3 * b9 + mul x13, x8, x28 + umulh x14, x8, x28 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // load b10, b11; a10 and a11 unloaded + ldp x3, x4, [x1, #80] + // load a0, a1; b0 and b1 unloaded + ldp x5, x6, [x0, #0] + + // a2 * b10 + mul x13, x7, x3 + umulh x14, x7, x3 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a1 * b11 + mul x13, x6, x4 + umulh x14, x6, x4 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // c12 now in x18 + + // a2 * b11 + mul x13, x7, x4 + umulh x14, x7, x4 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, xzr, xzr + + // a3 * b10 + mul x13, x8, x3 + umulh x14, x8, x3 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a4 * b9 + mul x13, x11, x28 + umulh x14, x11, x28 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a5 * b8 + mul x13, x12, x27 + umulh x14, x12, x27 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a6 * b7 + mul x13, x21, x24 + umulh x14, x21, x24 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a7 * b6 + mul x13, x22, x23 + umulh x14, x22, x23 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a8 * b5 + mul x13, x25, x20 + umulh x14, x25, x20 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a9 * b4 + mul x13, x26, x19 + umulh x14, x26, x19 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // load a10, a11; a0 and a1 unloaded + ldp x5, x6, [x0, #80] + + // a10 * b3 + mul x13, x5, x10 + umulh x14, x5, x10 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a11 * b2 + mul x13, x6, x9 + umulh x14, x6, x9 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // store c12 and c13 + stp x18, x17, [x2, #96] + + // a11 * b3 + mul x13, x6, x10 + umulh x14, x6, x10 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, xzr, xzr + + // a10 * b4 + mul x13, x5, x19 + umulh x14, x5, x19 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a9 * b5 + mul x13, x26, x20 + umulh x14, x26, x20 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a8 * b6 + mul x13, x25, x23 + umulh x14, x25, x23 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a7 * b7 + mul x13, x22, x24 + umulh x14, x22, 
x24 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a6 * b8 + mul x13, x21, x27 + umulh x14, x21, x27 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a5 * b9 + mul x13, x12, x28 + umulh x14, x12, x28 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a4 * b10 + mul x13, x11, x3 + umulh x14, x11, x3 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a3 * b11 + mul x13, x8, x4 + umulh x14, x8, x4 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // c14 is now in x16 + + // a4 * b11 + mul x13, x11, x4 + umulh x14, x11, x4 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, xzr, xzr + + // a5 * b10 + mul x13, x12, x3 + umulh x14, x12, x3 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a6 * b9 + mul x13, x21, x28 + umulh x14, x21, x28 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a7 * b8 + mul x13, x22, x27 + umulh x14, x22, x27 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a8 * b7 + mul x13, x25, x24 + umulh x14, x25, x24 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a9 * b6 + mul x13, x26, x23 + umulh x14, x26, x23 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a10 * b5 + mul x13, x5, x20 + umulh x14, x5, x20 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a11 * b4 + mul x13, x6, x19 + umulh x14, x6, x19 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // c15 is now in x15 + + // store c14 and c15 + stp x16, x15, [x2, #112] + + // a11 * b5 + mul x13, x6, x20 + umulh x14, x6, x20 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, xzr, xzr + + // a10 * b6 + mul x13, x5, x23 + umulh x14, x5, x23 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a9 * b7 + mul x13, x26, x24 + umulh x14, x26, x24 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a8 * b8 + mul x13, x25, x27 + umulh x14, x25, x27 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a7 * b9 + mul x13, x22, x28 + umulh x14, x22, x28 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a6 * b10 + mul x13, x21, x3 + umulh x14, x21, x3 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a5 * b11 + mul x13, x12, x4 + umulh x14, x12, x4 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // c16 is now in x18 + + // a6 * b11 + mul x13, x21, x4 + umulh x14, x21, x4 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, xzr, xzr + + // a7 * b10 + mul x13, x22, x3 + umulh x14, x22, x3 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a8 * b9 + mul x13, x25, x28 + umulh x14, x25, x28 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a9 * b8 + mul x13, x26, x27 + umulh x14, x26, x27 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a10 * b7 + mul x13, x5, x24 + umulh x14, x5, x24 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // a11 * b6 + mul x13, x6, x23 + umulh x14, x6, x23 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // store c16 and c17 + stp x18, x17, [x2, #128] + + // a11 * b7 + mul x13, x6, x24 + umulh x14, x6, x24 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, xzr, xzr + + // a10 * b8 + mul x13, x5, x27 + umulh x14, x5, x27 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a9 * b9 + mul x13, x26, 
x28 + umulh x14, x26, x28 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a8 * b10 + mul x13, x25, x3 + umulh x14, x25, x3 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // a7 * b11 + mul x13, x22, x4 + umulh x14, x22, x4 + + adds x16, x16, x13 + adcs x15, x15, x14 + adcs x18, x18, xzr + + // c18 is now in x16 + + // a8 * b11 + mul x13, x25, x4 + umulh x14, x25, x4 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, xzr, xzr + + // a9 * b10 + mul x13, x26, x3 + umulh x14, x26, x3 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a10 * b9 + mul x13, x5, x28 + umulh x14, x5, x28 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // a11 * b8 + mul x13, x6, x27 + umulh x14, x6, x27 + + adds x15, x15, x13 + adcs x18, x18, x14 + adcs x17, x17, xzr + + // store c18 and c19 + stp x16, x15, [x2, #144] + + // a11 * b9 + mul x13, x6, x28 + umulh x14, x6, x28 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, xzr, xzr + + // a10 * b10 + mul x13, x5, x3 + umulh x14, x5, x3 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // a9 * b11 + mul x13, x26, x4 + umulh x14, x26, x4 + + adds x18, x18, x13 + adcs x17, x17, x14 + adcs x16, x16, xzr + + // c20 is now in x18 + + // a10 * b11 + mul x13, x5, x4 + umulh x14, x5, x4 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, xzr, xzr + + // a11 * b10 + mul x13, x6, x3 + umulh x14, x6, x3 + + adds x17, x17, x13 + adcs x16, x16, x14 + adcs x15, x15, xzr + + // store c20 and c21 + stp x18, x17, [x2, #160] + + // a11 * b11 + mul x13, x4, x6 + umulh x14, x4, x6 + + adds x16, x16, x13 + adcs x15, x15, x14 + + // store c22 and c23 + stp x16, x15, [x2, #176] + + ldp x19, x20, [sp] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp x25, x26, [sp, #48] + ldp x27, x28, [sp, #64] + add sp, sp, #80 + ret + + +//*********************************************************************** +// Montgomery reduction +// Based on comba method +// Operation: mc [x1] = ma [x0] +// NOTE: ma=mc is not allowed +//*********************************************************************** +.global rdc751_asm +rdc751_asm: + // ma is in x0 + // mc is in x1 + + sub sp, sp, #80 + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + + // load the prime values into x14 through x20 + ldr x14, p751p1 + 0 + ldr x15, p751p1 + 8 + ldr x16, p751p1 + 16 + ldr x17, p751p1 + 24 + ldr x18, p751p1 + 32 + ldr x19, p751p1 + 40 + ldr x20, p751p1 + 48 + + // the values mc[0] through mc[11] will be held in x2 through x13 + // until the very end when they will be stored + + // load mc[0] through mc[4] and ma[5] + ldp x2, x3, [x0, #0] + ldp x4, x5, [x0, #16] + ldp x6, x21, [x0, #32] + + // ma[5] iteration + mul x22, x2, x14 + umulh x23, x2, x14 + adds x24, x22, x21 + adcs x25, x23, xzr + add x7, x24, xzr // set mc[5] + + // ma[6] iteration + + ldr x21, [x0, #48] + + mul x22, x2, x15 + umulh x23, x2, x15 + adds x25, x25, x22 + adcs x26, x23, xzr + + mul x22, x3, x14 + umulh x23, x3, x14 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, xzr, xzr + + adds x25, x25, x21 + adcs x26, x26, xzr + adcs x24, x24, xzr + add x8, x25, xzr // set mc[6] + + // ma[7] iteration + + ldr x21, [x0, #56] + mul x22, x2, x16 + umulh x23, x2, x16 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, xzr, xzr + + mul x22, x3, x15 + umulh x23, x3, x15 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x4, x14 + umulh x23, 
x4, x14 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + adds x26, x26, x21 + adcs x24, x24, xzr + adcs x25, x25, xzr + add x9, x26, xzr // set mc[7] + + // ma[8] iteration + + ldr x21, [x0, #64] + mul x22, x2, x17 + umulh x23, x2, x17 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, xzr, xzr + + mul x22, x3, x16 + umulh x23, x3, x16 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x4, x15 + umulh x23, x4, x15 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x5, x14 + umulh x23, x5, x14 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + adds x24, x24, x21 + adcs x25, x25, xzr + adcs x26, x26, xzr + add x10, x24, xzr // set mc[8] + + // ma[9] iteration + + ldr x21, [x0, #72] + mul x22, x2, x18 + umulh x23, x2, x18 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, xzr, xzr + + mul x22, x3, x17 + umulh x23, x3, x17 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x4, x16 + umulh x23, x4, x16 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x5, x15 + umulh x23, x5, x15 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x6, x14 + umulh x23, x6, x14 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + adds x25, x25, x21 + adcs x26, x26, xzr + adcs x24, x24, xzr + add x11, x25, xzr // set mc[9] + + // ma[10] iteration + + ldr x21, [x0, #80] + mul x22, x2, x19 + umulh x23, x2, x19 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, xzr, xzr + + mul x22, x3, x18 + umulh x23, x3, x18 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x4, x17 + umulh x23, x4, x17 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x5, x16 + umulh x23, x5, x16 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x6, x15 + umulh x23, x6, x15 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x7, x14 + umulh x23, x7, x14 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + adds x26, x26, x21 + adcs x24, x24, xzr + adcs x25, x25, xzr + add x12, x26, xzr // set mc[10] + + // ma[11] iteration + ldr x21, [x0, #88] + + mul x22, x2, x20 + umulh x23, x2, x20 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, xzr, xzr + + mul x22, x3, x19 + umulh x23, x3, x19 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x4, x18 + umulh x23, x4, x18 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x5, x17 + umulh x23, x5, x17 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x6, x16 + umulh x23, x6, x16 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x7, x15 + umulh x23, x7, x15 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x8, x14 + umulh x23, x8, x14 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + adds x24, x24, x21 + adcs x25, x25, xzr + adcs x26, x26, xzr + add x13, x24, xzr // set mc[11] + + // ma[12] iteration + + ldr x21, [x0, #96] + mul x22, x3, x20 + umulh x23, x3, x20 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, xzr, xzr + + mul x22, x4, x19 + umulh x23, x4, x19 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x5, x18 + umulh x23, x5, x18 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x6, x17 + umulh x23, x6, x17 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x7, x16 + umulh x23, x7, 
x16 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x8, x15 + umulh x23, x8, x15 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x9, x14 + umulh x23, x9, x14 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + adds x25, x25, x21 + adcs x26, x26, xzr + adcs x24, x24, xzr + add x2, x25, xzr // set mc[0] + + // ma[13] iteration + + ldr x21, [x0, #104] + mul x22, x4, x20 + umulh x23, x4, x20 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, xzr, xzr + + mul x22, x5, x19 + umulh x23, x5, x19 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x6, x18 + umulh x23, x6, x18 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x7, x17 + umulh x23, x7, x17 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x8, x16 + umulh x23, x8, x16 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x9, x15 + umulh x23, x9, x15 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x10, x14 + umulh x23, x10, x14 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + adds x26, x26, x21 + adcs x24, x24, xzr + adcs x25, x25, xzr + add x3, x26, xzr // set mc[1] + + // ma[14] iteration + + ldr x21, [x0, #112] + mul x22, x5, x20 + umulh x23, x5, x20 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, xzr, xzr + + mul x22, x6, x19 + umulh x23, x6, x19 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x7, x18 + umulh x23, x7, x18 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x8, x17 + umulh x23, x8, x17 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x9, x16 + umulh x23, x9, x16 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x10, x15 + umulh x23, x10, x15 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x11, x14 + umulh x23, x11, x14 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + adds x24, x24, x21 + adcs x25, x25, xzr + adcs x26, x26, xzr + add x4, x24, xzr // set mc[2] + + // ma[15] iteration + + ldr x21, [x0, #120] + mul x22, x6, x20 + umulh x23, x6, x20 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, xzr, xzr + + mul x22, x7, x19 + umulh x23, x7, x19 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x8, x18 + umulh x23, x8, x18 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x9, x17 + umulh x23, x9, x17 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x10, x16 + umulh x23, x10, x16 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x11, x15 + umulh x23, x11, x15 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x12, x14 + umulh x23, x12, x14 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + adds x25, x25, x21 + adcs x26, x26, xzr + adcs x24, x24, xzr + add x5, x25, xzr // set mc[3] + + // ma[16] iteration + + ldr x21, [x0, #128] + mul x22, x7, x20 + umulh x23, x7, x20 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, xzr, xzr + + mul x22, x8, x19 + umulh x23, x8, x19 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x9, x18 + umulh x23, x9, x18 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x10, x17 + umulh x23, x10, x17 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x11, x16 + umulh x23, x11, x16 + adds x26, x26, x22 + 
adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x12, x15 + umulh x23, x12, x15 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x13, x14 + umulh x23, x13, x14 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + adds x26, x26, x21 + adcs x24, x24, xzr + adcs x25, x25, xzr + add x6, x26, xzr // set mc[4] + + // ma[17] iteration + + ldr x21, [x0, #136] + mul x22, x8, x20 + umulh x23, x8, x20 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, xzr, xzr + + mul x22, x9, x19 + umulh x23, x9, x19 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x10, x18 + umulh x23, x10, x18 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x11, x17 + umulh x23, x11, x17 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x12, x16 + umulh x23, x12, x16 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x13, x15 + umulh x23, x13, x15 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + adds x24, x24, x21 + adcs x25, x25, xzr + adcs x26, x26, xzr + add x7, x24, xzr // set mc[5] + + // ma[18] iteration + + ldr x21, [x0, #144] + mul x22, x9, x20 + umulh x23, x9, x20 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, xzr, xzr + + mul x22, x10, x19 + umulh x23, x10, x19 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x11, x18 + umulh x23, x11, x18 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x12, x17 + umulh x23, x12, x17 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + mul x22, x13, x16 + umulh x23, x13, x16 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + adds x25, x25, x21 + adcs x26, x26, xzr + adcs x24, x24, xzr + add x8, x25, xzr // set mc[6] + + // ma[19] iteration + + ldr x21, [x0, #152] + mul x22, x10, x20 + umulh x23, x10, x20 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, xzr, xzr + + mul x22, x11, x19 + umulh x23, x11, x19 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x12, x18 + umulh x23, x12, x18 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + mul x22, x13, x17 + umulh x23, x13, x17 + adds x26, x26, x22 + adcs x24, x24, x23 + adcs x25, x25, xzr + + adds x26, x26, x21 + adcs x24, x24, xzr + adcs x25, x25, xzr + add x9, x26, xzr // set mc[7] + + // ma[20] iteration + ldr x21, [x0, #160] + + mul x22, x11, x20 + umulh x23, x11, x20 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, xzr, xzr + + mul x22, x12, x19 + umulh x23, x12, x19 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + mul x22, x13, x18 + umulh x23, x13, x18 + adds x24, x24, x22 + adcs x25, x25, x23 + adcs x26, x26, xzr + + adds x24, x24, x21 + adcs x25, x25, xzr + adcs x26, x26, xzr + add x10, x24, xzr // set mc[8] + + // ma[21] iteration + + ldr x21, [x0, #168] + mul x22, x12, x20 + umulh x23, x12, x20 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, xzr, xzr + + mul x22, x13, x19 + umulh x23, x13, x19 + adds x25, x25, x22 + adcs x26, x26, x23 + adcs x24, x24, xzr + + adds x25, x25, x21 + adcs x26, x26, xzr + adcs x24, x24, xzr + add x11, x25, xzr // set mc[9] + + // ma[22] iteration + + ldr x21, [x0, #176] + mul x22, x13, x20 + umulh x23, x13, x20 + adds x26, x26, x22 + adcs x24, x24, x23 + adds x26, x26, x21 + + ldr x21, [x0, #184] + adcs x24, x24, x21 + add x12, x26, xzr // set mc[10] + add x13, x24, xzr // set mc[11] + + stp x2, x3, [x1, #0] + stp x4, x5, [x1, #16] + stp x6, x7, [x1, #32] + stp x8, x9, 
[x1, #48] + stp x10, x11, [x1, #64] + stp x12, x13, [x1, #80] + + ldp x19, x20, [sp] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp x25, x26, [sp, #48] + ldp x27, x28, [sp, #64] + add sp, sp, #80 + ret + + +//*********************************************************************** +// 751-bit multiprecision addition +// Operation: c [x2] = a [x0] + b [x1] +//*********************************************************************** +.global mp_add751_asm +mp_add751_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x0,#64] + ldp x13, x14, [x0,#80] + + ldp x15, x16, [x1,#0] + ldp x17, x18, [x1,#16] + adds x3, x3, x15 + adcs x4, x4, x16 + adcs x5, x5, x17 + adcs x6, x6, x18 + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adcs x10, x10, x18 + ldp x15, x16, [x1,#64] + ldp x17, x18, [x1,#80] + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp x13, x14, [x2,#80] + ret + + +//*********************************************************************** +// 2x751-bit multiprecision addition +// Operation: c [x2] = a [x0] + b [x1] +//*********************************************************************** +.global mp_add751x2_asm +mp_add751x2_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x0,#64] + ldp x13, x14, [x0,#80] + + ldp x15, x16, [x1,#0] + ldp x17, x18, [x1,#16] + adds x3, x3, x15 + adcs x4, x4, x16 + adcs x5, x5, x17 + adcs x6, x6, x18 + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adcs x10, x10, x18 + ldp x15, x16, [x1,#64] + ldp x17, x18, [x1,#80] + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, x17 + adcs x14, x14, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp x13, x14, [x2,#80] + + ldp x3, x4, [x0,#96] + ldp x5, x6, [x0,#112] + ldp x7, x8, [x0,#128] + ldp x9, x10, [x0,#144] + ldp x11, x12, [x0,#160] + ldp x13, x14, [x0,#176] + + ldp x15, x16, [x1,#96] + ldp x17, x18, [x1,#112] + adcs x3, x3, x15 + adcs x4, x4, x16 + adcs x5, x5, x17 + adcs x6, x6, x18 + ldp x15, x16, [x1,#128] + ldp x17, x18, [x1,#144] + adcs x7, x7, x15 + adcs x8, x8, x16 + adcs x9, x9, x17 + adcs x10, x10, x18 + ldp x15, x16, [x1,#160] + ldp x17, x18, [x1,#176] + adcs x11, x11, x15 + adcs x12, x12, x16 + adcs x13, x13, x17 + adc x14, x14, x18 + + stp x3, x4, [x2,#96] + stp x5, x6, [x2,#112] + stp x7, x8, [x2,#128] + stp x9, x10, [x2,#144] + stp x11, x12, [x2,#160] + stp x13, x14, [x2,#176] + ret + + +//*********************************************************************** +// 2x751-bit multiprecision subtraction +// Operation: c [x2] = a [x0] - b [x1]. 
Returns borrow mask +//*********************************************************************** +.global mp_sub751x2_asm +mp_sub751x2_asm: + ldp x3, x4, [x0,#0] + ldp x5, x6, [x0,#16] + ldp x7, x8, [x0,#32] + ldp x9, x10, [x0,#48] + ldp x11, x12, [x0,#64] + ldp x13, x14, [x0,#80] + + ldp x15, x16, [x1,#0] + ldp x17, x18, [x1,#16] + subs x3, x3, x15 + sbcs x4, x4, x16 + sbcs x5, x5, x17 + sbcs x6, x6, x18 + ldp x15, x16, [x1,#32] + ldp x17, x18, [x1,#48] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbcs x10, x10, x18 + ldp x15, x16, [x1,#64] + ldp x17, x18, [x1,#80] + sbcs x11, x11, x15 + sbcs x12, x12, x16 + sbcs x13, x13, x17 + sbcs x14, x14, x18 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp x13, x14, [x2,#80] + + ldp x3, x4, [x0,#96] + ldp x5, x6, [x0,#112] + ldp x7, x8, [x0,#128] + ldp x9, x10, [x0,#144] + ldp x11, x12, [x0,#160] + ldp x13, x14, [x0,#176] + + ldp x15, x16, [x1,#96] + ldp x17, x18, [x1,#112] + sbcs x3, x3, x15 + sbcs x4, x4, x16 + sbcs x5, x5, x17 + sbcs x6, x6, x18 + ldp x15, x16, [x1,#128] + ldp x17, x18, [x1,#144] + sbcs x7, x7, x15 + sbcs x8, x8, x16 + sbcs x9, x9, x17 + sbcs x10, x10, x18 + ldp x15, x16, [x1,#160] + ldp x17, x18, [x1,#176] + sbcs x11, x11, x15 + sbcs x12, x12, x16 + sbcs x13, x13, x17 + sbcs x14, x14, x18 + sbc x0, xzr, xzr + + stp x3, x4, [x2,#96] + stp x5, x6, [x2,#112] + stp x7, x8, [x2,#128] + stp x9, x10, [x2,#144] + stp x11, x12, [x2,#160] + stp x13, x14, [x2,#176] + ret + + +//*********************************************************************** +// Double 2x751-bit multiprecision subtraction +// Operation: c [x2] = c [x2] - a [x0] - b [x1] +//*********************************************************************** +.global mp_dblsub751x2_asm +mp_dblsub751x2_asm: + sub sp, sp, #96 + stp x19, x20, [sp] + stp x21, x22, [sp, #16] + stp x23, x24, [sp, #32] + stp x25, x26, [sp, #48] + stp x27, x28, [sp, #64] + stp x29, x30, [sp, #80] + ldp x3, x4, [x2,#0] + ldp x5, x6, [x2,#16] + ldp x7, x8, [x2,#32] + ldp x9, x10, [x2,#48] + ldp x11, x12, [x2,#64] + ldp x13, x14, [x2,#80] + ldp x15, x16, [x2,#96] + ldp x17, x18, [x2,#112] + ldp x19, x20, [x2,#128] + ldp x21, x22, [x2,#144] + ldp x23, x24, [x2,#160] + ldp x25, x26, [x2,#176] + + ldp x27, x28, [x0,#0] + ldp x29, x30, [x0,#16] + subs x3, x3, x27 + sbcs x4, x4, x28 + sbcs x5, x5, x29 + sbcs x6, x6, x30 + ldp x27, x28, [x0,#32] + ldp x29, x30, [x0,#48] + sbcs x7, x7, x27 + sbcs x8, x8, x28 + sbcs x9, x9, x29 + sbcs x10, x10, x30 + ldp x27, x28, [x0,#64] + ldp x29, x30, [x0,#80] + sbcs x11, x11, x27 + sbcs x12, x12, x28 + sbcs x13, x13, x29 + sbcs x14, x14, x30 + ldp x27, x28, [x0,#96] + ldp x29, x30, [x0,#112] + sbcs x15, x15, x27 + sbcs x16, x16, x28 + sbcs x17, x17, x29 + sbcs x18, x18, x30 + ldp x27, x28, [x0,#128] + ldp x29, x30, [x0,#144] + sbcs x19, x19, x27 + sbcs x20, x20, x28 + sbcs x21, x21, x29 + sbcs x22, x22, x30 + ldp x27, x28, [x0,#160] + ldp x29, x30, [x0,#176] + sbcs x23, x23, x27 + sbcs x24, x24, x28 + sbcs x25, x25, x29 + sbc x26, x26, x30 + + ldp x27, x28, [x1,#0] + ldp x29, x30, [x1,#16] + subs x3, x3, x27 + sbcs x4, x4, x28 + sbcs x5, x5, x29 + sbcs x6, x6, x30 + ldp x27, x28, [x1,#32] + ldp x29, x30, [x1,#48] + sbcs x7, x7, x27 + sbcs x8, x8, x28 + sbcs x9, x9, x29 + sbcs x10, x10, x30 + ldp x27, x28, [x1,#64] + ldp x29, x30, [x1,#80] + sbcs x11, x11, x27 + sbcs x12, x12, x28 + sbcs x13, x13, x29 + sbcs x14, x14, x30 + ldp x27, x28, [x1,#96] + ldp x29, x30, [x1,#112] + sbcs x15, x15, x27 
+ sbcs x16, x16, x28 + sbcs x17, x17, x29 + sbcs x18, x18, x30 + ldp x27, x28, [x1,#128] + ldp x29, x30, [x1,#144] + sbcs x19, x19, x27 + sbcs x20, x20, x28 + sbcs x21, x21, x29 + sbcs x22, x22, x30 + ldp x27, x28, [x1,#160] + ldp x29, x30, [x1,#176] + sbcs x23, x23, x27 + sbcs x24, x24, x28 + sbcs x25, x25, x29 + sbc x26, x26, x30 + + stp x3, x4, [x2,#0] + stp x5, x6, [x2,#16] + stp x7, x8, [x2,#32] + stp x9, x10, [x2,#48] + stp x11, x12, [x2,#64] + stp x13, x14, [x2,#80] + stp x15, x16, [x2,#96] + stp x17, x18, [x2,#112] + stp x19, x20, [x2,#128] + stp x21, x22, [x2,#144] + stp x23, x24, [x2,#160] + stp x25, x26, [x2,#176] + + ldp x19, x20, [sp] + ldp x21, x22, [sp, #16] + ldp x23, x24, [sp, #32] + ldp x25, x26, [sp, #48] + ldp x27, x28, [sp, #64] + ldp x29, x30, [sp, #80] + add sp, sp, #96 + ret diff --git a/third_party/sidh/src/P751/P751.c b/third_party/sidh/src/P751/P751.c new file mode 100644 index 00000000..ea7bcb78 --- /dev/null +++ b/third_party/sidh/src/P751/P751.c @@ -0,0 +1,131 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: supersingular isogeny parameters and generation of functions for P751 +*********************************************************************************************/ + +#include "P751_api.h" +#include "P751_internal.h" + + +// Encoding of field elements, elements over Z_order, elements over GF(p^2) and elliptic curve points: +// -------------------------------------------------------------------------------------------------- +// Elements over GF(p) and Z_order are encoded with the least significant octet (and digit) located at the leftmost position (i.e., little endian format). +// Elements (a+b*i) over GF(p^2), where a and b are defined over GF(p), are encoded as {a, b}, with a in the least significant position. +// Elliptic curve points P = (x,y) are encoded as {x, y}, with x in the least significant position. +// Internally, the number of digits used to represent all these elements is obtained by approximating the number of bits to the immediately greater multiple of 32. +// For example, a 751-bit field element is represented with Ceil(751 / 64) = 12 64-bit digits or Ceil(751 / 32) = 24 32-bit digits. + +// +// Curve isogeny system "SIDHp751". 
Base curve: Montgomery curve By^2 = Cx^3 + Ax^2 + Cx defined over GF(p751^2), where A=0, B=1, C=1 and p751 = 2^372*3^239-1 +// + +const uint64_t p751[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xEEAFFFFFFFFFFFFF, + 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; +const uint64_t p751p1[NWORDS64_FIELD] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0xEEB0000000000000, + 0xE3EC968549F878A8, 0xDA959B1A13F7CC76, 0x084E9867D6EBE876, 0x8562B5045CB25748, 0x0E12909F97BADC66, 0x00006FE5D541F71C }; +const uint64_t p751x2[NWORDS64_FIELD] = { 0xFFFFFFFFFFFFFFFE, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xFFFFFFFFFFFFFFFF, 0xDD5FFFFFFFFFFFFF, + 0xC7D92D0A93F0F151, 0xB52B363427EF98ED, 0x109D30CFADD7D0ED, 0x0AC56A08B964AE90, 0x1C25213F2F75B8CD, 0x0000DFCBAA83EE38 }; +// Order of Alice's subgroup +const uint64_t Alice_order[NWORDS64_ORDER] = { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0010000000000000 }; +// Order of Bob's subgroup +const uint64_t Bob_order[NWORDS64_ORDER] = { 0xC968549F878A8EEB, 0x59B1A13F7CC76E3E, 0xE9867D6EBE876DA9, 0x2B5045CB25748084, 0x2909F97BADC66856, 0x06FE5D541F71C0E1 }; +// Alice's generator values {XPA0 + XPA1*i, XQA0, XRA0 + XRA1*i} in GF(p751^2), expressed in Montgomery representation +const uint64_t A_gen[5 * NWORDS64_FIELD] = { 0xC2FC08CEAB50AD8B, 0x1D7D710F55E457B1, 0xE8738D92953DCD6E, 0xBAA7EBEE8A3418AA, 0xC9A288345F03F46F, 0xC8D18D167CFE2616, + 0x02043761F6B1C045, 0xAA1975E13180E7E9, 0x9E13D3FDC6690DE6, 0x3A024640A3A3BB4F, 0x4E5AD44E6ACBBDAE, 0x0000544BEB561DAD, // XPA0 + 0xE6CC41D21582E411, 0x07C2ECB7C5DF400A, 0xE8E34B521432AEC4, 0x50761E2AB085167D, 0x032CFBCAA6094B3C, 0x6C522F5FDF9DDD71, + 0x1319217DC3A1887D, 0xDC4FB25803353A86, 0x362C8D7B63A6AB09, 0x39DCDFBCE47EA488, 0x4C27C99A2C28D409, 0x00003CB0075527C4, // XPA1 + 0xD56FE52627914862, 0x1FAD60DC96B5BAEA, 0x01E137D0BF07AB91, 0x404D3E9252161964, 0x3C5385E4CD09A337, 0x4476426769E4AF73, + 0x9790C6DB989DFE33, 0xE06E1C04D2AA8B5E, 0x38C08185EDEA73B9, 0xAA41F678A4396CA6, 0x92B9259B2229E9A0, 0x00002F9326818BE0, // XQA0 + 0x0BB84441DFFD19B3, 0x84B4DEA99B48C18E, 0x692DE648AD313805, 0xE6D72761B6DFAEE0, 0x223975C672C3058D, 0xA0FDE0C3CBA26FDC, + 0xA5326132A922A3CA, 0xCA5E7F5D5EA96FA4, 0x127C7EFE33FFA8C6, 0x4749B1567E2A23C4, 0x2B7DF5B4AF413BFA, 0x0000656595B9623C, // XRA0 + 0xED78C17F1EC71BE8, 0xF824D6DF753859B1, 0x33A10839B2A8529F, 0xFC03E9E25FDEA796, 0xC4708A8054DF1762, 0x4034F2EC034C6467, + 0xABFB70FBF06ECC79, 0xDABE96636EC108B7, 0x49CBCFB090605FD3, 0x20B89711819A45A7, 0xFB8E1590B2B0F63E, 0x0000556A5F964AB2 }; // XRA1 +// Bob's generator values {XPB0 + XPB1*i, XQB0, XRB0 + XRB1*i} in GF(p751^2), expressed in Montgomery representation +const uint64_t B_gen[5 * NWORDS64_FIELD] = { 0xCFB6D71EF867AB0B, 0x4A5FDD76E9A45C76, 0x38B1EE69194B1F03, 0xF6E7B18A7761F3F0, 0xFCF01A486A52C84C, 0xCBE2F63F5AA75466, + 0x6487BCE837B5E4D6, 0x7747F5A8C622E9B8, 0x4CBFE1E4EE6AEBBA, 0x8A8616A13FA91512, 0x53DB980E1579E0A5, 0x000058FEBFF3BE69, // XPB0 + 0xA492034E7C075CC3, 0x677BAF00B04AA430, 0x3AAE0C9A755C94C8, 0x1DC4B064E9EBB08B, 0x3684EDD04E826C66, 0x9BAA6CB661F01B22, + 0x20285A00AD2EFE35, 0xDCE95ABD0497065F, 0x16C7FBB3778E3794, 0x26B3AC29CEF25AAF, 0xFB3C28A31A30AC1D, 0x000046ED190624EE, // XPB1 + 0xF1A8C9ED7B96C4AB, 0x299429DA5178486E, 0xEF4926F20CD5C2F4, 
0x683B2E2858B4716A, 0xDDA2FBCC3CAC3EEB, 0xEC055F9F3A600460, + 0xD5A5A17A58C3848B, 0x4652D836F42EAED5, 0x2F2E71ED78B3A3B3, 0xA771C057180ADD1D, 0xC780A5D2D835F512, 0x0000114EA3B55AC1, // XQB0 + 0x1C0D6733769D0F31, 0xF084C3086E2659D1, 0xE23D5DA27BCBD133, 0xF38EC9A8D5864025, 0x6426DC781B3B645B, 0x4B24E8E3C9FB03EE, + 0x6432792F9D2CEA30, 0x7CC8E8B1AE76E857, 0x7F32BFB626BB8963, 0xB9F05995B48D7B74, 0x4D71200A7D67E042, 0x0000228457AF0637, // XRB0 + 0x4AE37E7D8F72BD95, 0xDD2D504B3E993488, 0x5D14E7FA1ECB3C3E, 0x127610CEB75D6350, 0x255B4B4CAC446B11, 0x9EA12336C1F70CAF, + 0x79FA68A2147BC2F8, 0x11E895CFDADBBC49, 0xE4B9D3C4D6356C18, 0x44B25856A67F951C, 0x5851541F61308D0B, 0x00002FFD994F7E4C }; // XRB1 +// Montgomery constant Montgomery_R2 = (2^768)^2 mod p751 +const uint64_t Montgomery_R2[NWORDS64_FIELD] = { 0x233046449DAD4058, 0xDB010161A696452A, 0x5E36941472E3FD8E, 0xF40BFE2082A2E706, 0x4932CCA8904F8751 ,0x1F735F1F1EE7FC81, + 0xA24F4D80C1048E18, 0xB56C383CCDB607C5, 0x441DD47B735F9C90, 0x5673ED2C6A6AC82A, 0x06C905261132294B, 0x000041AD830F1F35 }; +// Value one in Montgomery representation +const uint64_t Montgomery_one[NWORDS64_FIELD] = { 0x00000000000249ad, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x0000000000000000, 0x8310000000000000, + 0x5527b1e4375c6c66, 0x697797bf3f4f24d0, 0xc89db7b2ac5c4e2e, 0x4ca4b439d2076956, 0x10f7926c7512c7e9, 0x00002d5b24bce5e2 }; +// Value (2^384)^2 mod 3^239 +const uint64_t Montgomery_Rprime[NWORDS64_ORDER] = { 0x1A55482318541298, 0x070A6370DFA12A03, 0xCB1658E0E3823A40, 0xB3B7384EB5DEF3F9, 0xCBCA952F7006EA33, 0x00569EF8EC94864C }; +// Value -(3^239)^-1 mod 2^384 +const uint64_t Montgomery_rprime[NWORDS64_ORDER] = { 0x48062A91D3AB563D, 0x6CE572751303C2F5, 0x5D1319F3F160EC9D, 0xE35554E8C2D5623A, 0xCA29300232BC79A5, 0x8AAD843D646D78C5 }; +// Value order_Bob/3 mod p751 +const uint64_t Border_div3[NWORDS_ORDER] = { 0xEDCD718A828384F9, 0x733B35BFD4427A14, 0xF88229CF94D7CF38, 0x63C56C990C7C2AD6, 0xB858A87E8F4222C7, 0x0254C9C6B525EAF5 }; + + +// Fixed parameters for isogeny tree computation +const unsigned int strat_Alice[MAX_Alice-1] = { +80, 48, 27, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, +1, 3, 2, 1, 1, 1, 1, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, +1, 1, 2, 1, 1, 1, 21, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, +1, 1, 1, 2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1, +33, 20, 12, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 5, 3, 2, 1, 1, 1, 1, 2, 1, +1, 1, 8, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, +1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1 }; + +const unsigned int strat_Bob[MAX_Bob-1] = { +112, 63, 32, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, +1, 4, 2, 1, 1, 2, 1, 1, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, +1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 31, 16, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, +1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 15, 8, 4, 2, 1, 1, 2, 1, 1, 4, +2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, 1, 1, 1, 49, 31, 16, 8, 4, 2, +1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, +15, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 7, 4, 2, 1, 1, 2, 1, 1, 3, 2, 1, +1, 1, 1, 21, 12, 8, 4, 2, 1, 1, 2, 1, 1, 4, 2, 1, 1, 2, 1, 1, 5, 3, 2, 1, 1, 1, 1, +2, 1, 1, 1, 9, 5, 3, 2, 1, 1, 1, 1, 2, 1, 1, 1, 4, 2, 1, 1, 1, 2, 1, 1 }; + +// Setting up macro defines and including GF(p), GF(p^2), curve, isogeny and kex 
functions +#define fpcopy fpcopy751 +#define fpzero fpzero751 +#define fpadd fpadd751 +#define fpsub fpsub751 +#define fpneg fpneg751 +#define fpdiv2 fpdiv2_751 +#define fpcorrection fpcorrection751 +#define fpmul_mont fpmul751_mont +#define fpsqr_mont fpsqr751_mont +#define fpinv_mont fpinv751_mont +#define fpinv_chain_mont fpinv751_chain_mont +#define fpinv_mont_bingcd fpinv751_mont_bingcd +#define fp2copy fp2copy751 +#define fp2zero fp2zero751 +#define fp2add fp2add751 +#define fp2sub fp2sub751 +#define fp2neg fp2neg751 +#define fp2div2 fp2div2_751 +#define fp2correction fp2correction751 +#define fp2mul_mont fp2mul751_mont +#define fp2sqr_mont fp2sqr751_mont +#define fp2inv_mont fp2inv751_mont +#define fp2inv_mont_bingcd fp2inv751_mont_bingcd +#define fpequal_non_constant_time fpequal751_non_constant_time +#define mp_add_asm mp_add751_asm +#define mp_subx2_asm mp_sub751x2_asm +#define mp_dblsubx2_asm mp_dblsub751x2_asm +#define crypto_kem_keypair crypto_kem_keypair_SIKEp751 +#define crypto_kem_enc crypto_kem_enc_SIKEp751 +#define crypto_kem_dec crypto_kem_dec_SIKEp751 +#define random_mod_order_A random_mod_order_A_SIDHp751 +#define random_mod_order_B random_mod_order_B_SIDHp751 +#define EphemeralKeyGeneration_A EphemeralKeyGeneration_A_SIDHp751 +#define EphemeralKeyGeneration_B EphemeralKeyGeneration_B_SIDHp751 +#define EphemeralSecretAgreement_A EphemeralSecretAgreement_A_SIDHp751 +#define EphemeralSecretAgreement_B EphemeralSecretAgreement_B_SIDHp751 + +#include "../fpx.c" +#include "../ec_isogeny.c" +#include "../sidh.c" +#include "../sike.c" diff --git a/third_party/sidh/src/P751/P751_api.h b/third_party/sidh/src/P751/P751_api.h new file mode 100644 index 00000000..269decda --- /dev/null +++ b/third_party/sidh/src/P751/P751_api.h @@ -0,0 +1,107 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: API header file for P751 +*********************************************************************************************/ + +#ifndef __P751_API_H__ +#define __P751_API_H__ + + +/*********************** Key encapsulation mechanism API ***********************/ + +#define CRYPTO_SECRETKEYBYTES 644 // MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes +#define CRYPTO_PUBLICKEYBYTES 564 +#define CRYPTO_BYTES 24 +#define CRYPTO_CIPHERTEXTBYTES 596 // CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes + +// Algorithm name +#define CRYPTO_ALGNAME "SIKEp751" + +// SIKE's key generation +// It produces a private key sk and computes the public key pk. 
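+// (A minimal round-trip of the three KEM calls declared below, as a hedged
+// sketch with sizes taken from the CRYPTO_* constants above and error checks
+// elided:
+//   unsigned char pk[CRYPTO_PUBLICKEYBYTES], sk[CRYPTO_SECRETKEYBYTES];
+//   unsigned char ct[CRYPTO_CIPHERTEXTBYTES], ss_e[CRYPTO_BYTES], ss_d[CRYPTO_BYTES];
+//   crypto_kem_keypair_SIKEp751(pk, sk);
+//   crypto_kem_enc_SIKEp751(ct, ss_e, pk);  // encapsulate to pk
+//   crypto_kem_dec_SIKEp751(ss_d, ct, sk);  // ss_d matches ss_e on success
+// )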
+// Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes) +// public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes) +int crypto_kem_keypair_SIKEp751(unsigned char *pk, unsigned char *sk); + +// SIKE's encapsulation +// Input: public key pk (CRYPTO_PUBLICKEYBYTES = 564 bytes) +// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes) +// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes) +int crypto_kem_enc_SIKEp751(unsigned char *ct, unsigned char *ss, const unsigned char *pk); + +// SIKE's decapsulation +// Input: secret key sk (CRYPTO_SECRETKEYBYTES = 644 bytes) +// ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = 596 bytes) +// Outputs: shared secret ss (CRYPTO_BYTES = 24 bytes) +int crypto_kem_dec_SIKEp751(unsigned char *ss, const unsigned char *ct, const unsigned char *sk); + + +// Encoding of keys for KEM-based isogeny system "SIKEp751" (wire format): +// ---------------------------------------------------------------------- +// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). +// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. +// +// Private keys sk consist of the concatenation of a 32-byte random value, a value in the range [0, 2^378-1] and the public key pk. In the SIKE API, +// private keys are encoded in 644 octets in little endian format. +// Public keys pk consist of 3 elements in GF(p751^2). In the SIKE API, pk is encoded in 564 octets. +// Ciphertexts ct consist of the concatenation of a public key value and a 32-byte value. In the SIKE API, ct is encoded in 564 + 32 = 596 octets. +// Shared keys ss consist of a value of 24 octets. + + +/*********************** Ephemeral key exchange API ***********************/ + +#define SIDH_SECRETKEYBYTES 48 +#define SIDH_PUBLICKEYBYTES 564 +#define SIDH_BYTES 188 + +// SECURITY NOTE: SIDH supports ephemeral Diffie-Hellman key exchange. It is NOT secure to use it with static keys. +// See "On the Security of Supersingular Isogeny Cryptosystems", S.D. Galbraith, C. Petit, B. Shani and Y.B. Ti, in ASIACRYPT 2016, 2016. +// Extended version available at: http://eprint.iacr.org/2016/859 + +// Generation of Alice's secret key +// Outputs random value in [0, 2^372 - 1] to be used as Alice's private key +void random_mod_order_A_SIDHp751(unsigned char* random_digits); + +// Generation of Bob's secret key +// Outputs random value in [0, 2^Floor(Log(2,3^239)) - 1] to be used as Bob's private key +void random_mod_order_B_SIDHp751(unsigned char* random_digits); + +// Alice's ephemeral public key generation +// Input: a private key PrivateKeyA in the range [0, 2^372 - 1], stored in 47 bytes. +// Output: the public key PublicKeyA consisting of 3 GF(p751^2) elements encoded in 564 bytes. +int EphemeralKeyGeneration_A_SIDHp751(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA); + +// Bob's ephemeral key-pair generation +// It produces a private key PrivateKeyB and computes the public key PublicKeyB. +// The private key is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes. +// The public key consists of 3 GF(p751^2) elements encoded in 564 bytes. 
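+// (For orientation, a hedged end-to-end sketch of the ephemeral exchange using
+// the functions declared in this section; return values should be checked:
+//   unsigned char skA[SIDH_SECRETKEYBYTES] = {0}, skB[SIDH_SECRETKEYBYTES] = {0};
+//   unsigned char pkA[SIDH_PUBLICKEYBYTES], pkB[SIDH_PUBLICKEYBYTES];
+//   unsigned char ssA[SIDH_BYTES], ssB[SIDH_BYTES];
+//   random_mod_order_A_SIDHp751(skA); EphemeralKeyGeneration_A_SIDHp751(skA, pkA);
+//   random_mod_order_B_SIDHp751(skB); EphemeralKeyGeneration_B_SIDHp751(skB, pkB);
+//   EphemeralSecretAgreement_A_SIDHp751(skA, pkB, ssA);
+//   EphemeralSecretAgreement_B_SIDHp751(skB, pkA, ssB);  // ssA == ssB
+// )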
+int EphemeralKeyGeneration_B_SIDHp751(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB); + +// Alice's ephemeral shared secret computation +// It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB +// Inputs: Alice's PrivateKeyA is an integer in the range [0, 2^372 - 1], stored in 47 bytes. +// Bob's PublicKeyB consists of 3 GF(p751^2) elements encoded in 564 bytes. +// Output: a shared secret SharedSecretA that consists of one element in GF(p751^2) encoded in 188 bytes. +int EphemeralSecretAgreement_A_SIDHp751(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA); + +// Bob's ephemeral shared secret computation +// It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA +// Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,3^239)) - 1], stored in 48 bytes. +// Alice's PublicKeyA consists of 3 GF(p751^2) elements encoded in 564 bytes. +// Output: a shared secret SharedSecretB that consists of one element in GF(p751^2) encoded in 188 bytes. +int EphemeralSecretAgreement_B_SIDHp751(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB); + + +// Encoding of keys for KEX-based isogeny system "SIDHp751" (wire format): +// ---------------------------------------------------------------------- +// Elements over GF(p751) are encoded in 94 octets in little endian format (i.e., the least significant octet is located in the lowest memory address). +// Elements (a+b*i) over GF(p751^2), where a and b are defined over GF(p751), are encoded as {a, b}, with a in the lowest memory portion. +// +// Private keys PrivateKeyA and PrivateKeyB can have values in the range [0, 2^372-1] and [0, 2^378-1], resp. In the SIDH API, private keys are encoded +// in 48 octets in little endian format. +// Public keys PublicKeyA and PublicKeyB consist of 3 elements in GF(p751^2). In the SIDH API, they are encoded in 564 octets. +// Shared keys SharedSecretA and SharedSecretB consist of one element in GF(p751^2). In the SIDH API, they are encoded in 188 octets. + + +#endif \ No newline at end of file diff --git a/third_party/sidh/src/P751/P751_internal.h b/third_party/sidh/src/P751/P751_internal.h new file mode 100644 index 00000000..ffa52530 --- /dev/null +++ b/third_party/sidh/src/P751/P751_internal.h @@ -0,0 +1,245 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: internal header file for P751 +*********************************************************************************************/ + +#ifndef __P751_INTERNAL_H__ +#define __P751_INTERNAL_H__ + +#include "../config.h" + + +#if (TARGET == TARGET_AMD64) + #define NWORDS_FIELD 12 // Number of words of a 751-bit field element + #define p751_ZERO_WORDS 5 // Number of "0" digits in the least significant part of p751 + 1 +#elif (TARGET == TARGET_x86) + #define NWORDS_FIELD 24 + #define p751_ZERO_WORDS 11 +#elif (TARGET == TARGET_ARM) + #define NWORDS_FIELD 24 + #define p751_ZERO_WORDS 11 +#elif (TARGET == TARGET_ARM64) + #define NWORDS_FIELD 12 + #define p751_ZERO_WORDS 5 +#endif + + +// Basic constants + +#define NBITS_FIELD 751 +#define MAXBITS_FIELD 768 +#define MAXWORDS_FIELD ((MAXBITS_FIELD+RADIX-1)/RADIX) // Max. 
number of words to represent field elements
+#define NWORDS64_FIELD ((NBITS_FIELD+63)/64) // Number of 64-bit words of a 751-bit field element
+#define NBITS_ORDER 384
+#define NWORDS_ORDER ((NBITS_ORDER+RADIX-1)/RADIX) // Number of words of oA and oB, where oA and oB are the subgroup orders of Alice and Bob, resp.
+#define NWORDS64_ORDER ((NBITS_ORDER+63)/64) // Number of 64-bit words of a 384-bit element
+#define MAXBITS_ORDER NBITS_ORDER
+#define MAXWORDS_ORDER ((MAXBITS_ORDER+RADIX-1)/RADIX) // Max. number of words to represent elements in [1, oA-1] or [1, oB].
+#define ALICE 0
+#define BOB 1
+#define OALICE_BITS 372
+#define OBOB_BITS 379
+#define OBOB_EXPON 239
+#define MASK_ALICE 0x0F
+#define MASK_BOB 0x03
+#define PRIME p751
+#define PARAM_A 0
+#define PARAM_C 1
+// Fixed parameters for isogeny tree computation
+#define MAX_INT_POINTS_ALICE 8
+#define MAX_INT_POINTS_BOB 10
+#define MAX_Alice 186
+#define MAX_Bob 239
+#define MSG_BYTES 32
+#define SECRETKEY_A_BYTES ((OALICE_BITS + 7) / 8)
+#define SECRETKEY_B_BYTES ((OBOB_BITS + 7) / 8)
+#define FP2_ENCODED_BYTES (2*((NBITS_FIELD + 7) / 8))
+
+// SIDH's basic element definitions and point representations
+
+typedef digit_t felm_t[NWORDS_FIELD]; // Datatype for representing 751-bit field elements (768-bit max.)
+typedef digit_t dfelm_t[2*NWORDS_FIELD]; // Datatype for representing double-precision 2x751-bit field elements (2x768-bit max.)
+typedef felm_t f2elm_t[2]; // Datatype for representing quadratic extension field elements GF(p751^2)
+
+typedef struct { f2elm_t X; f2elm_t Z; } point_proj; // Point representation in projective XZ Montgomery coordinates.
+typedef point_proj point_proj_t[1];
+
+
+/**************** Function prototypes ****************/
+/************* Multiprecision functions **************/
+
+// Copy wordsize digits, c = a, where lng(a) = nwords
+void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords);
+
+// Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit
+unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+
+// 751-bit multiprecision addition, c = a+b
+void mp_add751(const digit_t* a, const digit_t* b, digit_t* c);
+void mp_add751_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit
+unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+digit_t mp_sub751x2_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Double 2x751-bit multiprecision subtraction, c = c-a-b, where c > a and c > b
+void mp_dblsub751x2_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Multiprecision left shift
+void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords);
+
+// Multiprecision right shift by one
+void mp_shiftr1(digit_t* x, const unsigned int nwords);
+
+// Multiprecision left shift by one
+void mp_shiftl1(digit_t* x, const unsigned int nwords);
+
+// Digit multiplication, digit * digit -> 2-digit result
+void digit_x_digit(const digit_t a, const digit_t b, digit_t* c);
+
+// Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords.
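+// (The "comba", or product-scanning, method accumulates one output column at a
+// time in a small multi-digit accumulator instead of propagating a carry after
+// each partial product; the portable loop is in generic/fp_generic.c.)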
+void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords);
+
+/************ Field arithmetic functions *************/
+
+// Copy of a field element, c = a
+void fpcopy751(const digit_t* a, digit_t* c);
+
+// Zeroing a field element, a = 0
+void fpzero751(digit_t* a);
+
+// Non constant-time comparison of two field elements. If a = b return TRUE, otherwise return FALSE
+bool fpequal751_non_constant_time(const digit_t* a, const digit_t* b);
+
+// Modular addition, c = a+b mod p751
+extern void fpadd751(const digit_t* a, const digit_t* b, digit_t* c);
+extern void fpadd751_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Modular subtraction, c = a-b mod p751
+extern void fpsub751(const digit_t* a, const digit_t* b, digit_t* c);
+extern void fpsub751_asm(const digit_t* a, const digit_t* b, digit_t* c);
+
+// Modular negation, a = -a mod p751
+extern void fpneg751(digit_t* a);
+
+// Modular division by two, c = a/2 mod p751.
+void fpdiv2_751(const digit_t* a, digit_t* c);
+
+// Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1].
+void fpcorrection751(digit_t* a);
+
+// 751-bit Montgomery reduction, c = a mod p
+void rdc_mont(const digit_t* a, digit_t* c);
+
+// Field multiplication using Montgomery arithmetic, c = a*b*R^-1 mod p751, where R=2^768
+void fpmul751_mont(const digit_t* a, const digit_t* b, digit_t* c);
+void mul751_asm(const digit_t* a, const digit_t* b, digit_t* c);
+void rdc751_asm(const digit_t* ma, digit_t* mc);
+
+// Field squaring using Montgomery arithmetic, c = ma^2*R^-1 mod p751, where R=2^768
+void fpsqr751_mont(const digit_t* ma, digit_t* mc);
+
+// Conversion to Montgomery representation
+void to_mont(const digit_t* a, digit_t* mc);
+
+// Conversion from Montgomery representation to standard representation
+void from_mont(const digit_t* ma, digit_t* c);
+
+// Field inversion, a = a^-1 in GF(p751)
+void fpinv751_mont(digit_t* a);
+
+// Field inversion, a = a^-1 in GF(p751) using the binary GCD
+void fpinv751_mont_bingcd(digit_t* a);
+
+// Chain to compute a^((p751-3)/4) using Montgomery arithmetic
+void fpinv751_chain_mont(digit_t* a);
+
+/************ GF(p^2) arithmetic functions *************/
+
+// Copy of a GF(p751^2) element, c = a
+void fp2copy751(const f2elm_t a, f2elm_t c);
+
+// Zeroing a GF(p751^2) element, a = 0
+void fp2zero751(f2elm_t a);
+
+// GF(p751^2) negation, a = -a in GF(p751^2)
+void fp2neg751(f2elm_t a);
+
+// GF(p751^2) addition, c = a+b in GF(p751^2)
+extern void fp2add751(const f2elm_t a, const f2elm_t b, f2elm_t c);
+
+// GF(p751^2) subtraction, c = a-b in GF(p751^2)
+extern void fp2sub751(const f2elm_t a, const f2elm_t b, f2elm_t c);
+
+// GF(p751^2) division by two, c = a/2 in GF(p751^2)
+void fp2div2_751(const f2elm_t a, f2elm_t c);
+
+// Modular correction, a = a in GF(p751^2)
+void fp2correction751(f2elm_t a);
+
+// GF(p751^2) squaring using Montgomery arithmetic, c = a^2 in GF(p751^2)
+void fp2sqr751_mont(const f2elm_t a, f2elm_t c);
+
+// GF(p751^2) multiplication using Montgomery arithmetic, c = a*b in GF(p751^2)
+void fp2mul751_mont(const f2elm_t a, const f2elm_t b, f2elm_t c);
+
+// Conversion of a GF(p751^2) element to Montgomery representation
+void to_fp2mont(const f2elm_t a, f2elm_t mc);
+
+// Conversion of a GF(p751^2) element from Montgomery representation to standard representation
+void from_fp2mont(const f2elm_t ma, f2elm_t c);
+
+// GF(p751^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2)
+void fp2inv751_mont(f2elm_t a);
+
+// GF(p751^2)
inversion, a = (a0-i*a1)/(a0^2+a1^2), GF(p751) inversion done using the binary GCD +void fp2inv751_mont_bingcd(f2elm_t a); + +// n-way Montgomery inversion +void mont_n_way_inv(const f2elm_t* vec, const int n, f2elm_t* out); + +/************ Elliptic curve and isogeny functions *************/ + +// Computes the j-invariant of a Montgomery curve with projective constant. +void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv); + +// Simultaneous doubling and differential addition. +void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24); + +// Doubling of a Montgomery point in projective coordinates (X:Z). +void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24); + +// Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. +void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e); + +// Differential addition. +void xADD(point_proj_t P, const point_proj_t Q, const f2elm_t xPQ); + +// Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. +void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff); + +// Evaluates the isogeny at the point (X:Z) in the domain of the isogeny. +void eval_4_isog(point_proj_t P, f2elm_t* coeff); + +// Tripling of a Montgomery point in projective coordinates (X:Z). +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus); + +// Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. +void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e); + +// Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. +void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff); + +// Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and a point P with coefficients given in coeff. +void eval_3_isog(point_proj_t Q, const f2elm_t* coeff); + +// 3-way simultaneous inversion +void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3); + +// Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. +void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A); + + +#endif diff --git a/third_party/sidh/src/P751/generic/fp_generic.c b/third_party/sidh/src/P751/generic/fp_generic.c new file mode 100644 index 00000000..ec47384a --- /dev/null +++ b/third_party/sidh/src/P751/generic/fp_generic.c @@ -0,0 +1,224 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: portable modular arithmetic for P751 +*********************************************************************************************/ + +#include "../P751_internal.h" + + +// Global constants +extern const uint64_t p751[NWORDS_FIELD]; +extern const uint64_t p751p1[NWORDS_FIELD]; +extern const uint64_t p751x2[NWORDS_FIELD]; + + +__inline void fpadd751(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular addition, c = a+b mod p751. 
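+  // (Constant-time lazy reduction: always subtract 2*p751, turn the final
+  // borrow into an all-zeros/all-ones mask, then conditionally add 2*p751
+  // back, so the result stays in [0, 2*p751-1] with no data-dependent branch.)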
+ // Inputs: a, b in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + unsigned int i, carry = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], b[i], carry, c[i]); + } + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(carry, c[i], ((digit_t*)p751x2)[i], carry, c[i]); + } + mask = 0 - (digit_t)carry; + + carry = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, c[i], ((digit_t*)p751x2)[i] & mask, carry, c[i]); + } +} + + +__inline void fpsub751(const digit_t* a, const digit_t* b, digit_t* c) +{ // Modular subtraction, c = a-b mod p751. + // Inputs: a, b in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, c[i], ((digit_t*)p751x2)[i] & mask, borrow, c[i]); + } +} + + +__inline void fpneg751(digit_t* a) +{ // Modular negation, a = -a mod p751. + // Input/output: a in [0, 2*p751-1] + unsigned int i, borrow = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, ((digit_t*)p751x2)[i], a[i], borrow, a[i]); + } +} + + +void fpdiv2_751(const digit_t* a, digit_t* c) +{ // Modular division by two, c = a/2 mod p751. + // Input : a in [0, 2*p751-1] + // Output: c in [0, 2*p751-1] + unsigned int i, carry = 0; + digit_t mask; + + mask = 0 - (digit_t)(a[0] & 1); // If a is odd compute a+p751 + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(carry, a[i], ((digit_t*)p751)[i] & mask, carry, c[i]); + } + + mp_shiftr1(c, NWORDS_FIELD); +} + + +void fpcorrection751(digit_t* a) +{ // Modular correction to reduce field element a in [0, 2*p751-1] to [0, p751-1]. + unsigned int i, borrow = 0; + digit_t mask; + + for (i = 0; i < NWORDS_FIELD; i++) { + SUBC(borrow, a[i], ((digit_t*)p751)[i], borrow, a[i]); + } + mask = 0 - (digit_t)borrow; + + borrow = 0; + for (i = 0; i < NWORDS_FIELD; i++) { + ADDC(borrow, a[i], ((digit_t*)p751)[i] & mask, borrow, a[i]); + } +} + + +void digit_x_digit(const digit_t a, const digit_t b, digit_t* c) +{ // Digit multiplication, digit * digit -> 2-digit result + register digit_t al, ah, bl, bh, temp; + digit_t albl, albh, ahbl, ahbh, res1, res2, res3, carry; + digit_t mask_low = (digit_t)(-1) >> (sizeof(digit_t)*4), mask_high = (digit_t)(-1) << (sizeof(digit_t)*4); + + al = a & mask_low; // Low part + ah = a >> (sizeof(digit_t) * 4); // High part + bl = b & mask_low; + bh = b >> (sizeof(digit_t) * 4); + + albl = al*bl; + albh = al*bh; + ahbl = ah*bl; + ahbh = ah*bh; + c[0] = albl & mask_low; // C00 + + res1 = albl >> (sizeof(digit_t) * 4); + res2 = ahbl & mask_low; + res3 = albh & mask_low; + temp = res1 + res2 + res3; + carry = temp >> (sizeof(digit_t) * 4); + c[0] ^= temp << (sizeof(digit_t) * 4); // C01 + + res1 = ahbl >> (sizeof(digit_t) * 4); + res2 = albh >> (sizeof(digit_t) * 4); + res3 = ahbh & mask_low; + temp = res1 + res2 + res3 + carry; + c[1] = temp & mask_low; // C10 + carry = temp & mask_high; + c[1] ^= (ahbh & mask_high) + carry; // C11 +} + + +void mp_mul(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ // Multiprecision comba multiply, c = a*b, where lng(a) = lng(b) = nwords. 
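+  // ((v, u, t) form a three-digit column accumulator: v gathers the output
+  // digit of the current column, u the carries into the next column, and t
+  // the carries into the column after that.)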
+ unsigned int i, j; + digit_t t = 0, u = 0, v = 0, UV[2]; + unsigned int carry = 0; + + for (i = 0; i < nwords; i++) { + for (j = 0; j <= i; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = nwords; i < 2*nwords-1; i++) { + for (j = i-nwords+1; j < nwords; j++) { + MUL(a[j], b[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + c[i] = v; + v = u; + u = t; + t = 0; + } + c[2*nwords-1] = v; +} + + +void rdc_mont(const digit_t* ma, digit_t* mc) +{ // Efficient Montgomery reduction using comba and exploiting the special form of the prime p751. + // mc = ma*R^-1 mod p751x2, where R = 2^768. + // If ma < 2^768*p751, the output mc is in the range [0, 2*p751-1]. + // ma is assumed to be in Montgomery representation. + unsigned int i, j, carry, count = p751_ZERO_WORDS; + digit_t UV[2], t = 0, u = 0, v = 0; + + for (i = 0; i < NWORDS_FIELD; i++) { + mc[i] = 0; + } + + for (i = 0; i < NWORDS_FIELD; i++) { + for (j = 0; j < i; j++) { + if (j < (i-p751_ZERO_WORDS+1)) { + MUL(mc[j], ((digit_t*)p751p1)[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i] = v; + v = u; + u = t; + t = 0; + } + + for (i = NWORDS_FIELD; i < 2*NWORDS_FIELD-1; i++) { + if (count > 0) { + count -= 1; + } + for (j = i-NWORDS_FIELD+1; j < NWORDS_FIELD; j++) { + if (j < (NWORDS_FIELD-count)) { + MUL(mc[j], ((digit_t*)p751p1)[i-j], UV+1, UV[0]); + ADDC(0, UV[0], v, carry, v); + ADDC(carry, UV[1], u, carry, u); + t += carry; + } + } + ADDC(0, v, ma[i], carry, v); + ADDC(carry, u, 0, carry, u); + t += carry; + mc[i-NWORDS_FIELD] = v; + v = u; + u = t; + t = 0; + } + ADDC(0, v, ma[2*NWORDS_FIELD-1], carry, v); + mc[NWORDS_FIELD-1] = v; +} \ No newline at end of file diff --git a/third_party/sidh/src/config.h b/third_party/sidh/src/config.h new file mode 100644 index 00000000..08dd9295 --- /dev/null +++ b/third_party/sidh/src/config.h @@ -0,0 +1,265 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: configuration file and platform-dependent macros +*********************************************************************************************/ + +#ifndef __CONFIG_H__ +#define __CONFIG_H__ + +#include +#include +#include + + +// Definition of operating system + +#define OS_WIN 1 +#define OS_LINUX 2 + +#if defined(__WINDOWS__) // Microsoft Windows OS + #define OS_TARGET OS_WIN +#elif defined(__LINUX__) // Linux OS + #define OS_TARGET OS_LINUX +#else + #error -- "Unsupported OS" +#endif + + +// Definition of compiler + +#define COMPILER_VC 1 +#define COMPILER_GCC 2 +#define COMPILER_CLANG 3 + +#if defined(_MSC_VER) // Microsoft Visual C compiler + #define COMPILER COMPILER_VC +#elif defined(__GNUC__) // GNU GCC compiler + #define COMPILER COMPILER_GCC +#elif defined(__clang__) // Clang compiler + #define COMPILER COMPILER_CLANG +#else + #error -- "Unsupported COMPILER" +#endif + + +// Definition of the targeted architecture and basic data types + +#define TARGET_AMD64 1 +#define TARGET_x86 2 +#define TARGET_ARM 3 +#define TARGET_ARM64 4 + +#if defined(_AMD64_) + #define TARGET TARGET_AMD64 + #define RADIX 64 + #define LOG2RADIX 6 + typedef uint64_t digit_t; // Unsigned 64-bit digit +#elif 
defined(_X86_) + #define TARGET TARGET_x86 + #define RADIX 32 + #define LOG2RADIX 5 + typedef uint32_t digit_t; // Unsigned 32-bit digit +#elif defined(_ARM_) + #define TARGET TARGET_ARM + #define RADIX 32 + #define LOG2RADIX 5 + typedef uint32_t digit_t; // Unsigned 32-bit digit +#elif defined(_ARM64_) + #define TARGET TARGET_ARM64 + #define RADIX 64 + #define LOG2RADIX 6 + typedef uint64_t digit_t; // Unsigned 64-bit digit +#else + #error -- "Unsupported ARCHITECTURE" +#endif + +#define RADIX64 64 + + +// Selection of generic, portable implementation + +#if defined(_GENERIC_) + #define GENERIC_IMPLEMENTATION +#elif defined(_FAST_) + #define FAST_IMPLEMENTATION +#endif + + +// Extended datatype support + +#if defined(GENERIC_IMPLEMENTATION) + typedef uint64_t uint128_t[2]; +#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) + #define UINT128_SUPPORT + typedef unsigned uint128_t __attribute__((mode(TI))); +#elif (TARGET == TARGET_ARM64 && OS_TARGET == OS_LINUX) && (COMPILER == COMPILER_GCC || COMPILER == COMPILER_CLANG) + #define UINT128_SUPPORT + typedef unsigned uint128_t __attribute__((mode(TI))); +#elif (TARGET == TARGET_AMD64) && (OS_TARGET == OS_WIN && COMPILER == COMPILER_VC) + #define SCALAR_INTRIN_SUPPORT + typedef uint64_t uint128_t[2]; +#else + #error -- "Unsupported configuration" +#endif + + +// Macro definitions + +#define NBITS_TO_NBYTES(nbits) (((nbits)+7)/8) // Conversion macro from number of bits to number of bytes +#define NBITS_TO_NWORDS(nbits) (((nbits)+(sizeof(digit_t)*8)-1)/(sizeof(digit_t)*8)) // Conversion macro from number of bits to number of computer words +#define NBYTES_TO_NWORDS(nbytes) (((nbytes)+sizeof(digit_t)-1)/sizeof(digit_t)) // Conversion macro from number of bytes to number of computer words + +// Macro to avoid compiler warnings when detecting unreferenced parameters +#define UNREFERENCED_PARAMETER(PAR) ((void)(PAR)) + + +/********************** Constant-time unsigned comparisons ***********************/ + +// The following functions return 1 (TRUE) if condition is true, 0 (FALSE) otherwise + +static __inline unsigned int is_digit_nonzero_ct(digit_t x) +{ // Is x != 0? + return (unsigned int)((x | (0-x)) >> (RADIX-1)); +} + +static __inline unsigned int is_digit_zero_ct(digit_t x) +{ // Is x = 0? + return (unsigned int)(1 ^ is_digit_nonzero_ct(x)); +} + +static __inline unsigned int is_digit_lessthan_ct(digit_t x, digit_t y) +{ // Is x < y? 
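+  // (Branch-free: the expression reconstructs the borrow bit of x - y using
+  // only word operations, so the top bit is 1 exactly when x < y and no
+  // data-dependent branch or flag read is needed.)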
+ return (unsigned int)((x ^ ((x ^ y) | ((x - y) ^ y))) >> (RADIX-1)); +} + + +/********************** Macros for platform-dependent operations **********************/ + +#if defined(GENERIC_IMPLEMENTATION) + +// Digit multiplication +#define MUL(multiplier, multiplicand, hi, lo) \ + digit_x_digit((multiplier), (multiplicand), &(lo)); + +// Digit addition with carry +#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ + { digit_t tempReg = (addend1) + (digit_t)(carryIn); \ + (sumOut) = (addend2) + tempReg; \ + (carryOut) = (is_digit_lessthan_ct(tempReg, (digit_t)(carryIn)) | is_digit_lessthan_ct((sumOut), tempReg)); } + +// Digit subtraction with borrow +#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ + { digit_t tempReg = (minuend) - (subtrahend); \ + unsigned int borrowReg = (is_digit_lessthan_ct((minuend), (subtrahend)) | ((borrowIn) & is_digit_zero_ct(tempReg))); \ + (differenceOut) = tempReg - (digit_t)(borrowIn); \ + (borrowOut) = borrowReg; } + +// Shift right with flexible datatype +#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (DigitSize - (shift))); + +// Shift left with flexible datatype +#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (DigitSize - (shift))); + +// 64x64-bit multiplication +#define MUL128(multiplier, multiplicand, product) \ + mp_mul((digit_t*)&(multiplier), (digit_t*)&(multiplicand), (digit_t*)&(product), NWORDS_FIELD/2); + +// 128-bit addition, inputs < 2^127 +#define ADD128(addend1, addend2, addition) \ + mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); + +// 128-bit addition with output carry +#define ADC128(addend1, addend2, carry, addition) \ + (carry) = mp_add((digit_t*)(addend1), (digit_t*)(addend2), (digit_t*)(addition), NWORDS_FIELD); + +#elif (TARGET == TARGET_AMD64 && OS_TARGET == OS_WIN) + +// Digit multiplication +#define MUL(multiplier, multiplicand, hi, lo) \ + (lo) = _umul128((multiplier), (multiplicand), (hi)); + +// Digit addition with carry +#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ + (carryOut) = _addcarry_u64((carryIn), (addend1), (addend2), &(sumOut)); + +// Digit subtraction with borrow +#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ + (borrowOut) = _subborrow_u64((borrowIn), (minuend), (subtrahend), &(differenceOut)); + +// Digit shift right +#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = __shiftright128((lowIn), (highIn), (shift)); + +// Digit shift left +#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = __shiftleft128((lowIn), (highIn), (shift)); + +// 64x64-bit multiplication +#define MUL128(multiplier, multiplicand, product) \ + (product)[0] = _umul128((multiplier), (multiplicand), &(product)[1]); + +// 128-bit addition, inputs < 2^127 +#define ADD128(addend1, addend2, addition) \ + { unsigned char carry = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ + _addcarry_u64(carry, (addend1)[1], (addend2)[1], &(addition)[1]); } + +// 128-bit addition with output carry +#define ADC128(addend1, addend2, carry, addition) \ + (carry) = _addcarry_u64(0, (addend1)[0], (addend2)[0], &(addition)[0]); \ + (carry) = _addcarry_u64((carry), (addend1)[1], (addend2)[1], &(addition)[1]); + +// 128-bit subtraction, subtrahend < 2^127 +#define SUB128(minuend, subtrahend, difference) \ + { unsigned char borrow = _subborrow_u64(0, (minuend)[0], 
(subtrahend)[0], &(difference)[0]); \ + _subborrow_u64(borrow, (minuend)[1], (subtrahend)[1], &(difference)[1]); } + +// 128-bit right shift, max. shift value is 64 +#define SHIFTR128(Input, shift, shiftOut) \ + (shiftOut)[0] = __shiftright128((Input)[0], (Input)[1], (shift)); \ + (shiftOut)[1] = (Input)[1] >> (shift); + +// 128-bit left shift, max. shift value is 64 +#define SHIFTL128(Input, shift, shiftOut) \ + (shiftOut)[1] = __shiftleft128((Input)[0], (Input)[1], (shift)); \ + (shiftOut)[0] = (Input)[0] << (shift); + +#define MULADD128(multiplier, multiplicand, addend, carry, result); \ + { uint128_t product; \ + MUL128(multiplier, multiplicand, product); \ + ADC128(addend, product, carry, result); } + +#elif ((TARGET == TARGET_AMD64 || TARGET == TARGET_ARM64) && OS_TARGET == OS_LINUX) + +// Digit multiplication +#define MUL(multiplier, multiplicand, hi, lo) \ + { uint128_t tempReg = (uint128_t)(multiplier) * (uint128_t)(multiplicand); \ + *(hi) = (digit_t)(tempReg >> RADIX); \ + (lo) = (digit_t)tempReg; } + +// Digit addition with carry +#define ADDC(carryIn, addend1, addend2, carryOut, sumOut) \ + { uint128_t tempReg = (uint128_t)(addend1) + (uint128_t)(addend2) + (uint128_t)(carryIn); \ + (carryOut) = (digit_t)(tempReg >> RADIX); \ + (sumOut) = (digit_t)tempReg; } + +// Digit subtraction with borrow +#define SUBC(borrowIn, minuend, subtrahend, borrowOut, differenceOut) \ + { uint128_t tempReg = (uint128_t)(minuend) - (uint128_t)(subtrahend) - (uint128_t)(borrowIn); \ + (borrowOut) = (digit_t)(tempReg >> (sizeof(uint128_t)*8 - 1)); \ + (differenceOut) = (digit_t)tempReg; } + +// Digit shift right +#define SHIFTR(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((lowIn) >> (shift)) ^ ((highIn) << (RADIX - (shift))); + +// Digit shift left +#define SHIFTL(highIn, lowIn, shift, shiftOut, DigitSize) \ + (shiftOut) = ((highIn) << (shift)) ^ ((lowIn) >> (RADIX - (shift))); + +#endif + + +#endif diff --git a/third_party/sidh/src/ec_isogeny.c b/third_party/sidh/src/ec_isogeny.c new file mode 100644 index 00000000..fefbaaa7 --- /dev/null +++ b/third_party/sidh/src/ec_isogeny.c @@ -0,0 +1,333 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: elliptic curve and isogeny functions +*********************************************************************************************/ + + +void xDBL(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24) +{ // Doubling of a Montgomery point in projective coordinates (X:Z). + // Input: projective Montgomery x-coordinates P = (X1:Z1), where x1=X1/Z1 and Montgomery curve constants A+2C and 4C. + // Output: projective Montgomery x-coordinates Q = 2*P = (X2:Z2). 
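+  // (x-only arithmetic never touches the y-coordinate; counting the calls
+  // below, one doubling costs 4 GF(p751^2) multiplications and 2 squarings,
+  // plus additions.)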
+ f2elm_t t0, t1; + + fp2sub(P->X, P->Z, t0); // t0 = X1-Z1 + fp2add(P->X, P->Z, t1); // t1 = X1+Z1 + fp2sqr_mont(t0, t0); // t0 = (X1-Z1)^2 + fp2sqr_mont(t1, t1); // t1 = (X1+Z1)^2 + fp2mul_mont(C24, t0, Q->Z); // Z2 = C24*(X1-Z1)^2 + fp2mul_mont(t1, Q->Z, Q->X); // X2 = C24*(X1-Z1)^2*(X1+Z1)^2 + fp2sub(t1, t0, t1); // t1 = (X1+Z1)^2-(X1-Z1)^2 + fp2mul_mont(A24plus, t1, t0); // t0 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + fp2add(Q->Z, t0, Q->Z); // Z2 = A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2 + fp2mul_mont(Q->Z, t1, Q->Z); // Z2 = [A24plus*[(X1+Z1)^2-(X1-Z1)^2] + C24*(X1-Z1)^2]*[(X1+Z1)^2-(X1-Z1)^2] +} + + +void xDBLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24plus, const f2elm_t C24, const int e) +{ // Computes [2^e](X:Z) on Montgomery curve with projective constant via e repeated doublings. + // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A+2C and 4C. + // Output: projective Montgomery x-coordinates Q <- (2^e)*P. + int i; + + copy_words((digit_t*)P, (digit_t*)Q, 2*2*NWORDS_FIELD); + + for (i = 0; i < e; i++) { + xDBL(Q, Q, A24plus, C24); + } +} + + +void get_4_isog(const point_proj_t P, f2elm_t A24plus, f2elm_t C24, f2elm_t* coeff) +{ // Computes the corresponding 4-isogeny of a projective Montgomery point (X4:Z4) of order 4. + // Input: projective point of order four P = (X4:Z4). + // Output: the 4-isogenous Montgomery curve with projective coefficients A+2C/4C and the 3 coefficients + // that are used to evaluate the isogeny at a point in eval_4_isog(). + + fp2sub(P->X, P->Z, coeff[1]); // coeff[1] = X4-Z4 + fp2add(P->X, P->Z, coeff[2]); // coeff[2] = X4+Z4 + fp2sqr_mont(P->Z, coeff[0]); // coeff[0] = Z4^2 + fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 2*Z4^2 + fp2sqr_mont(coeff[0], C24); // C24 = 4*Z4^4 + fp2add(coeff[0], coeff[0], coeff[0]); // coeff[0] = 4*Z4^2 + fp2sqr_mont(P->X, A24plus); // A24plus = X4^2 + fp2add(A24plus, A24plus, A24plus); // A24plus = 2*X4^2 + fp2sqr_mont(A24plus, A24plus); // A24plus = 4*X4^4 +} + + +void eval_4_isog(point_proj_t P, f2elm_t* coeff) +{ // Evaluates the isogeny at the point (X:Z) in the domain of the isogeny, given a 4-isogeny phi defined + // by the 3 coefficients in coeff (computed in the function get_4_isog()). + // Inputs: the coefficients defining the isogeny, and the projective point P = (X:Z). + // Output: the projective point P = phi(P) = (X:Z) in the codomain. + f2elm_t t0, t1; + + fp2add(P->X, P->Z, t0); // t0 = X+Z + fp2sub(P->X, P->Z, t1); // t1 = X-Z + fp2mul_mont(t0, coeff[1], P->X); // X = (X+Z)*coeff[1] + fp2mul_mont(t1, coeff[2], P->Z); // Z = (X-Z)*coeff[2] + fp2mul_mont(t0, t1, t0); // t0 = (X+Z)*(X-Z) + fp2mul_mont(t0, coeff[0], t0); // t0 = coeff[0]*(X+Z)*(X-Z) + fp2add(P->X, P->Z, t1); // t1 = (X-Z)*coeff[2] + (X+Z)*coeff[1] + fp2sub(P->X, P->Z, P->Z); // Z = (X-Z)*coeff[2] - (X+Z)*coeff[1] + fp2sqr_mont(t1, t1); // t1 = [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 + fp2sqr_mont(P->Z, P->Z); // Z = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 + fp2add(t1, t0, P->X); // X = coeff[0]*(X+Z)*(X-Z) + [(X-Z)*coeff[2] + (X+Z)*coeff[1]]^2 + fp2sub(P->Z, t0, t0); // t0 = [(X-Z)*coeff[2] - (X+Z)*coeff[1]]^2 - coeff[0]*(X+Z)*(X-Z) + fp2mul_mont(P->X, t1, P->X); // Xfinal + fp2mul_mont(P->Z, t0, P->Z); // Zfinal +} + + +void xTPL(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus) +{ // Tripling of a Montgomery point in projective coordinates (X:Z). 
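+  // (Tripling is computed directly rather than as a double followed by an add;
+  // a straight count of the fp2 calls below gives 7 multiplications and 5
+  // squarings in GF(p751^2).)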
+ // Input: projective Montgomery x-coordinates P = (X:Z), where x=X/Z and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + // Output: projective Montgomery x-coordinates Q = 3*P = (X3:Z3). + f2elm_t t0, t1, t2, t3, t4, t5, t6; + + fp2sub(P->X, P->Z, t0); // t0 = X-Z + fp2sqr_mont(t0, t2); // t2 = (X-Z)^2 + fp2add(P->X, P->Z, t1); // t1 = X+Z + fp2sqr_mont(t1, t3); // t3 = (X+Z)^2 + fp2add(t0, t1, t4); // t4 = 2*X + fp2sub(t1, t0, t0); // t0 = 2*Z + fp2sqr_mont(t4, t1); // t1 = 4*X^2 + fp2sub(t1, t3, t1); // t1 = 4*X^2 - (X+Z)^2 + fp2sub(t1, t2, t1); // t1 = 4*X^2 - (X+Z)^2 - (X-Z)^2 + fp2mul_mont(t3, A24plus, t5); // t5 = A24plus*(X+Z)^2 + fp2mul_mont(t3, t5, t3); // t3 = A24plus*(X+Z)^3 + fp2mul_mont(A24minus, t2, t6); // t6 = A24minus*(X-Z)^2 + fp2mul_mont(t2, t6, t2); // t2 = A24minus*(X-Z)^3 + fp2sub(t2, t3, t3); // t3 = A24minus*(X-Z)^3 - coeff*(X+Z)^3 + fp2sub(t5, t6, t2); // t2 = A24plus*(X+Z)^2 - A24minus*(X-Z)^2 + fp2mul_mont(t1, t2, t1); // t1 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + fp2add(t3, t1, t2); // t2 = [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + A24minus*(X-Z)^3 - coeff*(X+Z)^3 + fp2sqr_mont(t2, t2); // t2 = t2^2 + fp2mul_mont(t4, t2, Q->X); // X3 = 2*X*t2 + fp2sub(t3, t1, t1); // t1 = A24minus*(X-Z)^3 - A24plus*(X+Z)^3 - [4*X^2 - (X+Z)^2 - (X-Z)^2]*[A24plus*(X+Z)^2 - A24minus*(X-Z)^2] + fp2sqr_mont(t1, t1); // t1 = t1^2 + fp2mul_mont(t0, t1, Q->Z); // Z3 = 2*Z*t1 +} + + +void xTPLe(const point_proj_t P, point_proj_t Q, const f2elm_t A24minus, const f2elm_t A24plus, const int e) +{ // Computes [3^e](X:Z) on Montgomery curve with projective constant via e repeated triplings. + // Input: projective Montgomery x-coordinates P = (XP:ZP), such that xP=XP/ZP and Montgomery curve constants A24plus = A+2C and A24minus = A-2C. + // Output: projective Montgomery x-coordinates Q <- (3^e)*P. + int i; + + copy_words((digit_t*)P, (digit_t*)Q, 2*2*NWORDS_FIELD); + + for (i = 0; i < e; i++) { + xTPL(Q, Q, A24minus, A24plus); + } +} + + +void get_3_isog(const point_proj_t P, f2elm_t A24minus, f2elm_t A24plus, f2elm_t* coeff) +{ // Computes the corresponding 3-isogeny of a projective Montgomery point (X3:Z3) of order 3. + // Input: projective point of order three P = (X3:Z3). + // Output: the 3-isogenous Montgomery curve with projective coefficient A/C. 
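+  // (Carrying the curve as the projective pair (A24minus, A24plus) = (A-2C, A+2C)
+  // lets every tripling and 3-isogeny step run inversion-free; a single
+  // inversion at the end, e.g. in j_inv(), recovers affine values.)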
+ f2elm_t t0, t1, t2, t3, t4; + + fp2sub(P->X, P->Z, coeff[0]); // coeff0 = X-Z + fp2sqr_mont(coeff[0], t0); // t0 = (X-Z)^2 + fp2add(P->X, P->Z, coeff[1]); // coeff1 = X+Z + fp2sqr_mont(coeff[1], t1); // t1 = (X+Z)^2 + fp2add(t0, t1, t2); // t2 = (X+Z)^2 + (X-Z)^2 + fp2add(coeff[0], coeff[1], t3); // t3 = 2*X + fp2sqr_mont(t3, t3); // t3 = 4*X^2 + fp2sub(t3, t2, t3); // t3 = 4*X^2 - (X+Z)^2 - (X-Z)^2 + fp2add(t1, t3, t2); // t2 = 4*X^2 - (X-Z)^2 + fp2add(t3, t0, t3); // t3 = 4*X^2 - (X+Z)^2 + fp2add(t0, t3, t4); // t4 = 4*X^2 - (X+Z)^2 + (X-Z)^2 + fp2add(t4, t4, t4); // t4 = 2(4*X^2 - (X+Z)^2 + (X-Z)^2) + fp2add(t1, t4, t4); // t4 = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 + fp2mul_mont(t2, t4, A24minus); // A24minus = [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] + fp2add(t1, t2, t4); // t4 = 4*X^2 + (X+Z)^2 - (X-Z)^2 + fp2add(t4, t4, t4); // t4 = 2(4*X^2 + (X+Z)^2 - (X-Z)^2) + fp2add(t0, t4, t4); // t4 = 8*X^2 + 2*(X+Z)^2 - (X-Z)^2 + fp2mul_mont(t3, t4, t4); // t4 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] + fp2sub(t4, A24minus, t0); // t0 = [4*X^2 - (X+Z)^2]*[8*X^2 + 2*(X+Z)^2 - (X-Z)^2] - [4*X^2 - (X-Z)^2]*[8*X^2 - (X+Z)^2 + 2*(X-Z)^2] + fp2add(A24minus, t0, A24plus); // A24plus = 8*X^2 - (X+Z)^2 + 2*(X-Z)^2 +} + + +void eval_3_isog(point_proj_t Q, const f2elm_t* coeff) +{ // Computes the 3-isogeny R=phi(X:Z), given projective point (X3:Z3) of order 3 on a Montgomery curve and + // a point P with 2 coefficients in coeff (computed in the function get_3_isog()). + // Inputs: projective points P = (X3:Z3) and Q = (X:Z). + // Output: the projective point Q <- phi(Q) = (X3:Z3). + f2elm_t t0, t1, t2; + + fp2add(Q->X, Q->Z, t0); // t0 = X+Z + fp2sub(Q->X, Q->Z, t1); // t1 = X-Z + fp2mul_mont(t0, coeff[0], t0); // t0 = coeff0*(X+Z) + fp2mul_mont(t1, coeff[1], t1); // t1 = coeff1*(X-Z) + fp2add(t0, t1, t2); // t2 = coeff0*(X+Z) + coeff1*(X-Z) + fp2sub(t1, t0, t0); // t0 = coeff1*(X-Z) - coeff0*(X+Z) + fp2sqr_mont(t2, t2); // t2 = [coeff0*(X+Z) + coeff1*(X-Z)]^2 + fp2sqr_mont(t0, t0); // t0 = [coeff1*(X-Z) - coeff0*(X+Z)]^2 + fp2mul_mont(Q->X, t2, Q->X); // X3final = X*[coeff0*(X+Z) + coeff1*(X-Z)]^2 + fp2mul_mont(Q->Z, t0, Q->Z); // Z3final = Z*[coeff1*(X-Z) - coeff0*(X+Z)]^2 +} + + +void inv_3_way(f2elm_t z1, f2elm_t z2, f2elm_t z3) +{ // 3-way simultaneous inversion + // Input: z1,z2,z3 + // Output: 1/z1,1/z2,1/z3 (override inputs). + f2elm_t t0, t1, t2, t3; + + fp2mul_mont(z1, z2, t0); // t0 = z1*z2 + fp2mul_mont(z3, t0, t1); // t1 = z1*z2*z3 + fp2inv_mont(t1); // t1 = 1/(z1*z2*z3) + fp2mul_mont(z3, t1, t2); // t2 = 1/(z1*z2) + fp2mul_mont(t2, z2, t3); // t3 = 1/z1 + fp2mul_mont(t2, z1, z2); // z2 = 1/z2 + fp2mul_mont(t0, t1, z3); // z3 = 1/z3 + fp2copy(t3, z1); // z1 = 1/z1 +} + + +void get_A(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xR, f2elm_t A) +{ // Given the x-coordinates of P, Q, and R, returns the value A corresponding to the Montgomery curve E_A: y^2=x^3+A*x^2+x such that R=Q-P on E_A. + // Input: the x-coordinates xP, xQ, and xR of the points P, Q and R. + // Output: the coefficient A corresponding to the curve E_A: y^2=x^3+A*x^2+x. 
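+  // (The sequence below evaluates the closed form
+  //    A = (xP*xQ + xP*xR + xQ*xR - 1)^2 / (4*xP*xQ*xR) - xP - xQ - xR,
+  // which the step-by-step comments trace term by term.)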
+ f2elm_t t0, t1, one = {0}; + + fpcopy((digit_t*)&Montgomery_one, one[0]); + fp2add(xP, xQ, t1); // t1 = xP+xQ + fp2mul_mont(xP, xQ, t0); // t0 = xP*xQ + fp2mul_mont(xR, t1, A); // A = xR*t1 + fp2add(t0, A, A); // A = A+t0 + fp2mul_mont(t0, xR, t0); // t0 = t0*xR + fp2sub(A, one, A); // A = A-1 + fp2add(t0, t0, t0); // t0 = t0+t0 + fp2add(t1, xR, t1); // t1 = t1+xR + fp2add(t0, t0, t0); // t0 = t0+t0 + fp2sqr_mont(A, A); // A = A^2 + fp2inv_mont(t0); // t0 = 1/t0 + fp2mul_mont(A, t0, A); // A = A*t0 + fp2sub(A, t1, A); // Afinal = A-t1 +} + + +void j_inv(const f2elm_t A, const f2elm_t C, f2elm_t jinv) +{ // Computes the j-invariant of a Montgomery curve with projective constant. + // Input: A,C in GF(p^2). + // Output: j=256*(A^2-3*C^2)^3/(C^4*(A^2-4*C^2)), which is the j-invariant of the Montgomery curve B*y^2=x^3+(A/C)*x^2+x or (equivalently) j-invariant of B'*y^2=C*x^3+A*x^2+C*x. + f2elm_t t0, t1; + + fp2sqr_mont(A, jinv); // jinv = A^2 + fp2sqr_mont(C, t1); // t1 = C^2 + fp2add(t1, t1, t0); // t0 = t1+t1 + fp2sub(jinv, t0, t0); // t0 = jinv-t0 + fp2sub(t0, t1, t0); // t0 = t0-t1 + fp2sub(t0, t1, jinv); // jinv = t0-t1 + fp2sqr_mont(t1, t1); // t1 = t1^2 + fp2mul_mont(jinv, t1, jinv); // jinv = jinv*t1 + fp2add(t0, t0, t0); // t0 = t0+t0 + fp2add(t0, t0, t0); // t0 = t0+t0 + fp2sqr_mont(t0, t1); // t1 = t0^2 + fp2mul_mont(t0, t1, t0); // t0 = t0*t1 + fp2add(t0, t0, t0); // t0 = t0+t0 + fp2add(t0, t0, t0); // t0 = t0+t0 + fp2inv_mont(jinv); // jinv = 1/jinv + fp2mul_mont(jinv, t0, jinv); // jinv = t0*jinv +} + + +void xDBLADD(point_proj_t P, point_proj_t Q, const f2elm_t xPQ, const f2elm_t A24) +{ // Simultaneous doubling and differential addition. + // Input: projective Montgomery points P=(XP:ZP) and Q=(XQ:ZQ) such that xP=XP/ZP and xQ=XQ/ZQ, affine difference xPQ=x(P-Q) and Montgomery curve constant A24=(A+2)/4. + // Output: projective Montgomery points P <- 2*P = (X2P:Z2P) such that x(2P)=X2P/Z2P, and Q <- P+Q = (XQP:ZQP) such that = x(Q+P)=XQP/ZQP. + f2elm_t t0, t1, t2; + + fp2add(P->X, P->Z, t0); // t0 = XP+ZP + fp2sub(P->X, P->Z, t1); // t1 = XP-ZP + fp2sqr_mont(t0, P->X); // XP = (XP+ZP)^2 + fp2sub(Q->X, Q->Z, t2); // t2 = XQ-ZQ + fp2correction(t2); + fp2add(Q->X, Q->Z, Q->X); // XQ = XQ+ZQ + fp2mul_mont(t0, t2, t0); // t0 = (XP+ZP)*(XQ-ZQ) + fp2sqr_mont(t1, P->Z); // ZP = (XP-ZP)^2 + fp2mul_mont(t1, Q->X, t1); // t1 = (XP-ZP)*(XQ+ZQ) + fp2sub(P->X, P->Z, t2); // t2 = (XP+ZP)^2-(XP-ZP)^2 + fp2mul_mont(P->X, P->Z, P->X); // XP = (XP+ZP)^2*(XP-ZP)^2 + fp2mul_mont(t2, A24, Q->X); // XQ = A24*[(XP+ZP)^2-(XP-ZP)^2] + fp2sub(t0, t1, Q->Z); // ZQ = (XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ) + fp2add(Q->X, P->Z, P->Z); // ZP = A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2 + fp2add(t0, t1, Q->X); // XQ = (XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ) + fp2mul_mont(P->Z, t2, P->Z); // ZP = [A24*[(XP+ZP)^2-(XP-ZP)^2]+(XP-ZP)^2]*[(XP+ZP)^2-(XP-ZP)^2] + fp2sqr_mont(Q->Z, Q->Z); // ZQ = [(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 + fp2sqr_mont(Q->X, Q->X); // XQ = [(XP+ZP)*(XQ-ZQ)+(XP-ZP)*(XQ+ZQ)]^2 + fp2mul_mont(Q->Z, xPQ, Q->Z); // ZQ = xPQ*[(XP+ZP)*(XQ-ZQ)-(XP-ZP)*(XQ+ZQ)]^2 +} + + +static void swap_points(point_proj_t P, point_proj_t Q, const digit_t option) +{ // Swap points. 
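+  // (Constant-time XOR swap: with mask m in {0, ~0}, t = m & (a ^ b); a ^= t;
+  // b ^= t swaps exactly when m is all-ones, and performs identical memory
+  // accesses either way, so the ladder leaks no key bits through branching.)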
+ // If option = 0 then P <- P and Q <- Q, else if option = 0xFF...FF then P <- Q and Q <- P + digit_t temp; + unsigned int i; + + for (i = 0; i < NWORDS_FIELD; i++) { + temp = option & (P->X[0][i] ^ Q->X[0][i]); + P->X[0][i] = temp ^ P->X[0][i]; + Q->X[0][i] = temp ^ Q->X[0][i]; + temp = option & (P->Z[0][i] ^ Q->Z[0][i]); + P->Z[0][i] = temp ^ P->Z[0][i]; + Q->Z[0][i] = temp ^ Q->Z[0][i]; + temp = option & (P->X[1][i] ^ Q->X[1][i]); + P->X[1][i] = temp ^ P->X[1][i]; + Q->X[1][i] = temp ^ Q->X[1][i]; + temp = option & (P->Z[1][i] ^ Q->Z[1][i]); + P->Z[1][i] = temp ^ P->Z[1][i]; + Q->Z[1][i] = temp ^ Q->Z[1][i]; + } +} + + +static void LADDER3PT(const f2elm_t xP, const f2elm_t xQ, const f2elm_t xPQ, const digit_t* m, const unsigned int AliceOrBob, point_proj_t R, const f2elm_t A) +{ + point_proj_t R0 = {0}, R2 = {0}; + f2elm_t A24 = {0}; + digit_t mask; + int i, nbits, bit, swap, prevbit = 0; + + if (AliceOrBob == ALICE) { + nbits = OALICE_BITS; + } else { + nbits = OBOB_BITS; + } + + // Initializing constant + fpcopy((digit_t*)&Montgomery_one, A24[0]); + fp2add(A24, A24, A24); + fp2add(A, A24, A24); + fp2div2(A24, A24); + fp2div2(A24, A24); // A24 = (A+2)/4 + + // Initializing points + fp2copy(xQ, R0->X); + fpcopy((digit_t*)&Montgomery_one, (digit_t*)R0->Z); + fp2copy(xPQ, R2->X); + fpcopy((digit_t*)&Montgomery_one, (digit_t*)R2->Z); + fp2copy(xP, R->X); + fpcopy((digit_t*)&Montgomery_one, (digit_t*)R->Z); + fpzero((digit_t*)(R->Z)[1]); + + // Main loop + for (i = 0; i < nbits; i++) { + bit = (m[i >> LOG2RADIX] >> (i & (RADIX-1))) & 1; + swap = bit ^ prevbit; + prevbit = bit; + mask = 0 - (digit_t)swap; + + swap_points(R, R2, mask); + xDBLADD(R0, R2, R->X, A24); + fp2mul_mont(R2->X, R->Z, R2->X); + } +} diff --git a/third_party/sidh/src/fpx.c b/third_party/sidh/src/fpx.c new file mode 100644 index 00000000..6e5e33a5 --- /dev/null +++ b/third_party/sidh/src/fpx.c @@ -0,0 +1,558 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: core functions over GF(p) and GF(p^2) +*********************************************************************************************/ + + +__inline void fpcopy(const felm_t a, felm_t c) +{ // Copy a field element, c = a. + unsigned int i; + + for (i = 0; i < NWORDS_FIELD; i++) + c[i] = a[i]; +} + + +__inline void fpzero(felm_t a) +{ // Zero a field element, a = 0. + unsigned int i; + + for (i = 0; i < NWORDS_FIELD; i++) + a[i] = 0; +} + + +void to_mont(const felm_t a, felm_t mc) +{ // Conversion to Montgomery representation, + // mc = a*R^2*R^(-1) mod p = a*R mod p, where a in [0, p-1]. + // The Montgomery constant R^2 mod p is the global value "Montgomery_R2". + + fpmul_mont(a, (digit_t*)&Montgomery_R2, mc); +} + + +void from_mont(const felm_t ma, felm_t c) +{ // Conversion from Montgomery representation to standard representation, + // c = ma*R^(-1) mod p = a mod p, where ma in [0, p-1]. + digit_t one[NWORDS_FIELD] = {0}; + + one[0] = 1; + fpmul_mont(ma, one, c); + fpcorrection(c); +} + + +void copy_words(const digit_t* a, digit_t* c, const unsigned int nwords) +{ // Copy wordsize digits, c = a, where lng(a) = nwords. + unsigned int i; + + for (i = 0; i < nwords; i++) { + c[i] = a[i]; + } +} + + +void fpmul_mont(const felm_t ma, const felm_t mb, felm_t mc) +{ // Multiprecision multiplication, c = a*b mod p. 
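+  // (Montgomery multiplication in two steps: mp_mul() forms the full 2x751-bit
+  // product, then rdc_mont() reduces it, so Montgomery-form inputs yield
+  // c = a*b*R^-1 mod p with R = 2^768.)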
+ dfelm_t temp = {0}; + + mp_mul(ma, mb, temp, NWORDS_FIELD); + rdc_mont(temp, mc); +} + + +void fpsqr_mont(const felm_t ma, felm_t mc) +{ // Multiprecision squaring, c = a^2 mod p. + dfelm_t temp = {0}; + + mp_mul(ma, ma, temp, NWORDS_FIELD); + rdc_mont(temp, mc); +} + + +void fpinv_mont(felm_t a) +{ // Field inversion using Montgomery arithmetic, a = a^(-1)*R mod p. + felm_t tt; + + fpcopy(a, tt); + fpinv_chain_mont(tt); + fpsqr_mont(tt, tt); + fpsqr_mont(tt, tt); + fpmul_mont(a, tt, a); +} + + +void fp2copy(const f2elm_t a, f2elm_t c) +{ // Copy a GF(p^2) element, c = a. + fpcopy(a[0], c[0]); + fpcopy(a[1], c[1]); +} + + +void fp2zero(f2elm_t a) +{ // Zero a GF(p^2) element, a = 0. + fpzero(a[0]); + fpzero(a[1]); +} + + +void fp2neg(f2elm_t a) +{ // GF(p^2) negation, a = -a in GF(p^2). + fpneg(a[0]); + fpneg(a[1]); +} + + +__inline void fp2add(const f2elm_t a, const f2elm_t b, f2elm_t c) +{ // GF(p^2) addition, c = a+b in GF(p^2). + fpadd(a[0], b[0], c[0]); + fpadd(a[1], b[1], c[1]); +} + + +__inline void fp2sub(const f2elm_t a, const f2elm_t b, f2elm_t c) +{ // GF(p^2) subtraction, c = a-b in GF(p^2). + fpsub(a[0], b[0], c[0]); + fpsub(a[1], b[1], c[1]); +} + + +void fp2div2(const f2elm_t a, f2elm_t c) +{ // GF(p^2) division by two, c = a/2 in GF(p^2). + fpdiv2(a[0], c[0]); + fpdiv2(a[1], c[1]); +} + + +void fp2correction(f2elm_t a) +{ // Modular correction, a = a in GF(p^2). + fpcorrection(a[0]); + fpcorrection(a[1]); +} + + +__inline static void mp_addfast(const digit_t* a, const digit_t* b, digit_t* c) +{ // Multiprecision addition, c = a+b. +#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) + + mp_add(a, b, c, NWORDS_FIELD); + +#elif (OS_TARGET == OS_LINUX) + + mp_add_asm(a, b, c); + +#endif +} + + +void fp2sqr_mont(const f2elm_t a, f2elm_t c) +{ // GF(p^2) squaring using Montgomery arithmetic, c = a^2 in GF(p^2). + // Inputs: a = a0+a1*i, where a0, a1 are in [0, 2*p-1] + // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] + felm_t t1, t2, t3; + + mp_addfast(a[0], a[1], t1); // t1 = a0+a1 + fpsub(a[0], a[1], t2); // t2 = a0-a1 + mp_addfast(a[0], a[0], t3); // t3 = 2a0 + fpmul_mont(t1, t2, c[0]); // c0 = (a0+a1)(a0-a1) + fpmul_mont(t3, a[1], c[1]); // c1 = 2a0*a1 +} + + +__inline unsigned int mp_sub(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords) +{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = nwords. Returns the borrow bit. + unsigned int i, borrow = 0; + + for (i = 0; i < nwords; i++) { + SUBC(borrow, a[i], b[i], borrow, c[i]); + } + + return borrow; +} + + +__inline static digit_t mp_subfast(const digit_t* a, const digit_t* b, digit_t* c) +{ // Multiprecision subtraction, c = a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. + // If c < 0 then returns mask = 0xFF..F, else mask = 0x00..0 +#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) + + return (0 - (digit_t)mp_sub(a, b, c, 2*NWORDS_FIELD)); + +#elif (OS_TARGET == OS_LINUX) + + return mp_subx2_asm(a, b, c); + +#endif +} + + +__inline static void mp_dblsubfast(const digit_t* a, const digit_t* b, digit_t* c) +{ // Multiprecision subtraction, c = c-a-b, where lng(a) = lng(b) = 2*NWORDS_FIELD. + // Inputs should be s.t. 
c > a and c > b +#if (OS_TARGET == OS_WIN) || defined(GENERIC_IMPLEMENTATION) + + mp_sub(c, a, c, 2*NWORDS_FIELD); + mp_sub(c, b, c, 2*NWORDS_FIELD); + +#elif (OS_TARGET == OS_LINUX) + + mp_dblsubx2_asm(a, b, c); + +#endif +} + + +void fp2mul_mont(const f2elm_t a, const f2elm_t b, f2elm_t c) +{ // GF(p^2) multiplication using Montgomery arithmetic, c = a*b in GF(p^2). + // Inputs: a = a0+a1*i and b = b0+b1*i, where a0, a1, b0, b1 are in [0, 2*p-1] + // Output: c = c0+c1*i, where c0, c1 are in [0, 2*p-1] + felm_t t1, t2; + dfelm_t tt1, tt2, tt3; + digit_t mask; + unsigned int i; + + mp_addfast(a[0], a[1], t1); // t1 = a0+a1 + mp_addfast(b[0], b[1], t2); // t2 = b0+b1 + mp_mul(a[0], b[0], tt1, NWORDS_FIELD); // tt1 = a0*b0 + mp_mul(a[1], b[1], tt2, NWORDS_FIELD); // tt2 = a1*b1 + mp_mul(t1, t2, tt3, NWORDS_FIELD); // tt3 = (a0+a1)*(b0+b1) + mp_dblsubfast(tt1, tt2, tt3); // tt3 = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 + mask = mp_subfast(tt1, tt2, tt1); // tt1 = a0*b0 - a1*b1. If tt1 < 0 then mask = 0xFF..F, else if tt1 >= 0 then mask = 0x00..0 + + for (i = 0; i < NWORDS_FIELD; i++) { + t1[i] = ((digit_t*)PRIME)[i] & mask; + } + + rdc_mont(tt3, c[1]); // c[1] = (a0+a1)*(b0+b1) - a0*b0 - a1*b1 + mp_addfast((digit_t*)&tt1[NWORDS_FIELD], t1, (digit_t*)&tt1[NWORDS_FIELD]); + rdc_mont(tt1, c[0]); // c[0] = a0*b0 - a1*b1 +} + + +void fpinv_chain_mont(felm_t a) +{ // Chain to compute a^(p-3)/4 using Montgomery arithmetic. + unsigned int i, j; + +#if (NBITS_FIELD == 503) + felm_t t[15], tt; + + // Precomputed table + fpsqr_mont(a, tt); + fpmul_mont(a, tt, t[0]); + for (i = 0; i <= 13; i++) fpmul_mont(t[i], tt, t[i+1]); + + fpcopy(a, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[3], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 12; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + 
fpmul_mont(t[5], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[4], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[7], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (j = 0; j < 49; j++) { + for (i = 0; i < 5; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + } + fpcopy(tt, a); + +#elif (NBITS_FIELD == 751) + felm_t t[27], tt; + + // Precomputed table + fpsqr_mont(a, tt); + fpmul_mont(a, tt, t[0]); + fpmul_mont(t[0], tt, t[1]); + fpmul_mont(t[1], tt, t[2]); + fpmul_mont(t[2], tt, t[3]); + fpmul_mont(t[3], tt, t[3]); + for (i = 3; i <= 8; i++) fpmul_mont(t[i], tt, t[i+1]); + fpmul_mont(t[9], tt, t[9]); + for (i = 9; i <= 20; i++) fpmul_mont(t[i], tt, t[i+1]); + fpmul_mont(t[21], tt, t[21]); + for (i = 21; i <= 24; i++) fpmul_mont(t[i], tt, t[i+1]); + fpmul_mont(t[25], tt, t[25]); + fpmul_mont(t[25], tt, t[26]); + + fpcopy(a, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 9; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[15], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[4], tt, tt); + for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[1], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[6], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[24], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[9], tt, tt); + for (i = 0; 
i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[17], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(a, tt, tt); + for (i = 0; i < 10; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[16], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[7], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[0], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[19], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[25], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[10], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[22], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[18], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[4], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[14], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[21], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[23], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[12], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[9], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[3], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[13], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[17], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[5], tt, tt); + for (i = 0; i < 8; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[8], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[2], tt, tt); + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[11], tt, tt); + for (i = 0; i < 7; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[20], tt, tt); + for (j = 0; j < 61; j++) { + for (i = 0; i < 6; i++) fpsqr_mont(tt, tt); + fpmul_mont(t[26], tt, tt); + } + fpcopy(tt, a); +#endif +} + + +void fp2inv_mont(f2elm_t a) +{// GF(p^2) inversion using Montgomery arithmetic, a = (a0-i*a1)/(a0^2+a1^2). + f2elm_t t1; + + fpsqr_mont(a[0], t1[0]); // t10 = a0^2 + fpsqr_mont(a[1], t1[1]); // t11 = a1^2 + fpadd(t1[0], t1[1], t1[0]); // t10 = a0^2+a1^2 + fpinv_mont(t1[0]); // t10 = (a0^2+a1^2)^-1 + fpneg(a[1]); // a = a0-i*a1 + fpmul_mont(a[0], t1[0], a[0]); + fpmul_mont(a[1], t1[0], a[1]); // a = (a0-i*a1)*(a0^2+a1^2)^-1 +} + + +void to_fp2mont(const f2elm_t a, f2elm_t mc) +{ // Conversion of a GF(p^2) element to Montgomery representation, + // mc_i = a_i*R^2*R^(-1) = a_i*R in GF(p^2). + + to_mont(a[0], mc[0]); + to_mont(a[1], mc[1]); +} + + +void from_fp2mont(const f2elm_t ma, f2elm_t c) +{ // Conversion of a GF(p^2) element from Montgomery representation to standard representation, + // c_i = ma_i*R^(-1) = a_i in GF(p^2). 
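+  // Applies the GF(p) conversion from_mont to each of the two coefficients.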
+
+    from_mont(ma[0], c[0]);
+    from_mont(ma[1], c[1]);
+}
+
+
+__inline unsigned int mp_add(const digit_t* a, const digit_t* b, digit_t* c, const unsigned int nwords)
+{ // Multiprecision addition, c = a+b, where lng(a) = lng(b) = nwords. Returns the carry bit.
+    unsigned int i, carry = 0;
+
+    for (i = 0; i < nwords; i++) {
+        ADDC(carry, a[i], b[i], carry, c[i]);
+    }
+
+    return carry;
+}
+
+
+void mp_shiftleft(digit_t* x, unsigned int shift, const unsigned int nwords)
+{
+    unsigned int i, j = 0;
+
+    while (shift > RADIX) {
+        j += 1;
+        shift -= RADIX;
+    }
+
+    for (i = 0; i < nwords-j; i++)
+        x[nwords-1-i] = x[nwords-1-i-j];
+    for (i = nwords-j; i < nwords; i++)
+        x[nwords-1-i] = 0;
+    if (shift != 0) {
+        for (j = nwords-1; j > 0; j--)
+            SHIFTL(x[j], x[j-1], shift, x[j], RADIX);
+        x[0] <<= shift;
+    }
+}
+
+
+void mp_shiftr1(digit_t* x, const unsigned int nwords)
+{ // Multiprecision right shift by one.
+    unsigned int i;
+
+    for (i = 0; i < nwords-1; i++) {
+        SHIFTR(x[i+1], x[i], 1, x[i], RADIX);
+    }
+    x[nwords-1] >>= 1;
+}
+
+
+void mp_shiftl1(digit_t* x, const unsigned int nwords)
+{ // Multiprecision left shift by one.
+    int i;
+
+    for (i = nwords-1; i > 0; i--) {
+        SHIFTL(x[i], x[i-1], 1, x[i], RADIX);
+    }
+    x[0] <<= 1;
+}
diff --git a/third_party/sidh/src/random/random.c b/third_party/sidh/src/random/random.c
new file mode 100644
index 00000000..7f445b81
--- /dev/null
+++ b/third_party/sidh/src/random/random.c
@@ -0,0 +1,61 @@
+/********************************************************************************************
+* Hardware-based random number generation function
+*
+* It uses /dev/urandom in Linux and CNG's BCryptGenRandom function in Windows
+*********************************************************************************************/
+
+#include "random.h"
+#include <stdlib.h>
+#if defined(__WINDOWS__)
+    #include <windows.h>
+    #include <bcrypt.h>
+#elif defined(__LINUX__)
+    #include <unistd.h>
+    #include <fcntl.h>
+    static int lock = -1;
+#endif
+
+#define passed 0
+#define failed 1
+
+
+static __inline void delay(unsigned int count)
+{
+    while (count--) {}
+}
+
+
+int randombytes(unsigned char* random_array, unsigned long long nbytes)
+{ // Generation of "nbytes" of random values
+
+#if defined(__WINDOWS__)
+    if (!BCRYPT_SUCCESS(BCryptGenRandom(NULL, random_array, (unsigned long)nbytes, BCRYPT_USE_SYSTEM_PREFERRED_RNG))) {
+        return failed;
+    }
+
+#elif defined(__LINUX__)
+    int r, n = (int)nbytes, count = 0;
+
+    if (lock == -1) {
+        do {
+            lock = open("/dev/urandom", O_RDONLY);
+            if (lock == -1) {
+                delay(0xFFFFF);
+            }
+        } while (lock == -1);
+    }
+
+    while (n > 0) {
+        do {
+            r = read(lock, random_array+count, n);
+            if (r == -1) {
+                delay(0xFFFF);
+            }
+        } while (r == -1);
+        count += r;
+        n -= r;
+    }
+#endif
+
+    return passed;
+}
\ No newline at end of file
diff --git a/third_party/sidh/src/random/random.h b/third_party/sidh/src/random/random.h
new file mode 100644
index 00000000..fbed5f82
--- /dev/null
+++ b/third_party/sidh/src/random/random.h
@@ -0,0 +1,9 @@
+#ifndef __RANDOM_H__
+#define __RANDOM_H__
+
+
+// Generate random bytes and output the result to random_array
+int randombytes(unsigned char* random_array, unsigned long long nbytes);
+
+
+#endif
\ No newline at end of file
diff --git a/third_party/sidh/src/sha3/fips202.c b/third_party/sidh/src/sha3/fips202.c
new file mode 100644
index 00000000..f21926d0
--- /dev/null
+++ b/third_party/sidh/src/sha3/fips202.c
@@ -0,0 +1,572 @@
+/********************************************************************************************
+* SHA3-derived functions: SHAKE and cSHAKE
+*
+* Based on the public domain implementation in crypto_hash/keccakc512/simple/
+* from http://bench.cr.yp.to/supercop.html by Ronny Van Keer
+* and the public domain "TweetFips202" implementation from https://twitter.com/tweetfips202
+* by Gilles Van Assche, Daniel J. Bernstein, and Peter Schwabe
+*
+* See NIST Special Publication 800-185 for more information:
+* http://nvlpubs.nist.gov/nistpubs/SpecialPublications/NIST.SP.800-185.pdf
+*
+*********************************************************************************************/
+
+#include <stdint.h>
+#include <assert.h>
+#include "fips202.h"
+
+#define NROUNDS 24
+#define ROL(a, offset) ((a << offset) ^ (a >> (64-offset)))
+
+
+static uint64_t load64(const unsigned char *x)
+{
+    unsigned long long r = 0, i;
+
+    for (i = 0; i < 8; ++i) {
+        r |= (unsigned long long)x[i] << 8 * i;
+    }
+    return r;
+}
+
+
+static void store64(uint8_t *x, uint64_t u)
+{
+    unsigned int i;
+
+    for (i = 0; i < 8; ++i) {
+        x[i] = (uint8_t)u;
+        u >>= 8;
+    }
+}
+
+
+static const uint64_t KeccakF_RoundConstants[NROUNDS] =
+{
+    (uint64_t)0x0000000000000001ULL,
+    (uint64_t)0x0000000000008082ULL,
+    (uint64_t)0x800000000000808aULL,
+    (uint64_t)0x8000000080008000ULL,
+    (uint64_t)0x000000000000808bULL,
+    (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008081ULL,
+    (uint64_t)0x8000000000008009ULL,
+    (uint64_t)0x000000000000008aULL,
+    (uint64_t)0x0000000000000088ULL,
+    (uint64_t)0x0000000080008009ULL,
+    (uint64_t)0x000000008000000aULL,
+    (uint64_t)0x000000008000808bULL,
+    (uint64_t)0x800000000000008bULL,
+    (uint64_t)0x8000000000008089ULL,
+    (uint64_t)0x8000000000008003ULL,
+    (uint64_t)0x8000000000008002ULL,
+    (uint64_t)0x8000000000000080ULL,
+    (uint64_t)0x000000000000800aULL,
+    (uint64_t)0x800000008000000aULL,
+    (uint64_t)0x8000000080008081ULL,
+    (uint64_t)0x8000000000008080ULL,
+    (uint64_t)0x0000000080000001ULL,
+    (uint64_t)0x8000000080008008ULL
+};
+
+
+static void KeccakF1600_StatePermute(uint64_t * state) {
+    int round;
+
+    uint64_t Aba, Abe, Abi, Abo, Abu;
+    uint64_t Aga, Age, Agi, Ago, Agu;
+    uint64_t Aka, Ake, Aki, Ako, Aku;
+    uint64_t Ama, Ame, Ami, Amo, Amu;
+    uint64_t Asa, Ase, Asi, Aso, Asu;
+    uint64_t BCa, BCe, BCi, BCo, BCu;
+    uint64_t Da, De, Di, Do, Du;
+    uint64_t Eba, Ebe, Ebi, Ebo, Ebu;
+    uint64_t Ega, Ege, Egi, Ego, Egu;
+    uint64_t Eka, Eke, Eki, Eko, Eku;
+    uint64_t Ema, Eme, Emi, Emo, Emu;
+    uint64_t Esa, Ese, Esi, Eso, Esu;
+
+    //copyFromState(A, state)
+    Aba = state[ 0];
+    Abe = state[ 1];
+    Abi = state[ 2];
+    Abo = state[ 3];
+    Abu = state[ 4];
+    Aga = state[ 5];
+    Age = state[ 6];
+    Agi = state[ 7];
+    Ago = state[ 8];
+    Agu = state[ 9];
+    Aka = state[10];
+    Ake = state[11];
+    Aki = state[12];
+    Ako = state[13];
+    Aku = state[14];
+    Ama = state[15];
+    Ame = state[16];
+    Ami = state[17];
+    Amo = state[18];
+    Amu = state[19];
+    Asa = state[20];
+    Ase = state[21];
+    Asi = state[22];
+    Aso = state[23];
+    Asu = state[24];
+
+    for( round = 0; round < NROUNDS; round += 2 )
+    {
+        // prepareTheta
+        BCa = Aba^Aga^Aka^Ama^Asa;
+        BCe = Abe^Age^Ake^Ame^Ase;
+        BCi = Abi^Agi^Aki^Ami^Asi;
+        BCo = Abo^Ago^Ako^Amo^Aso;
+        BCu = Abu^Agu^Aku^Amu^Asu;
+
+        //thetaRhoPiChiIotaPrepareTheta(round , A, E)
+        Da = BCu^ROL(BCe, 1);
+        De = BCa^ROL(BCi, 1);
+        Di = BCe^ROL(BCo, 1);
+        Do = BCi^ROL(BCu, 1);
+        Du = BCo^ROL(BCa, 1);
+
+        Aba ^= Da;
+        BCa = Aba;
+        Age ^= De;
+        BCe = ROL(Age, 44);
+        Aki ^= Di;
+        BCi = ROL(Aki, 43);
+        Amo ^= Do;
+        BCo = ROL(Amo, 21);
+        Asu ^= Du;
+        BCu = ROL(Asu, 14);
+        Eba = BCa ^((~BCe)& BCi );
+        Eba ^= (uint64_t)KeccakF_RoundConstants[round];
+        Ebe = BCe ^((~BCi)& BCo );
Ebi = BCi ^((~BCo)& BCu ); + Ebo = BCo ^((~BCu)& BCa ); + Ebu = BCu ^((~BCa)& BCe ); + + Abo ^= Do; + BCa = ROL(Abo, 28); + Agu ^= Du; + BCe = ROL(Agu, 20); + Aka ^= Da; + BCi = ROL(Aka, 3); + Ame ^= De; + BCo = ROL(Ame, 45); + Asi ^= Di; + BCu = ROL(Asi, 61); + Ega = BCa ^((~BCe)& BCi ); + Ege = BCe ^((~BCi)& BCo ); + Egi = BCi ^((~BCo)& BCu ); + Ego = BCo ^((~BCu)& BCa ); + Egu = BCu ^((~BCa)& BCe ); + + Abe ^= De; + BCa = ROL(Abe, 1); + Agi ^= Di; + BCe = ROL(Agi, 6); + Ako ^= Do; + BCi = ROL(Ako, 25); + Amu ^= Du; + BCo = ROL(Amu, 8); + Asa ^= Da; + BCu = ROL(Asa, 18); + Eka = BCa ^((~BCe)& BCi ); + Eke = BCe ^((~BCi)& BCo ); + Eki = BCi ^((~BCo)& BCu ); + Eko = BCo ^((~BCu)& BCa ); + Eku = BCu ^((~BCa)& BCe ); + + Abu ^= Du; + BCa = ROL(Abu, 27); + Aga ^= Da; + BCe = ROL(Aga, 36); + Ake ^= De; + BCi = ROL(Ake, 10); + Ami ^= Di; + BCo = ROL(Ami, 15); + Aso ^= Do; + BCu = ROL(Aso, 56); + Ema = BCa ^((~BCe)& BCi ); + Eme = BCe ^((~BCi)& BCo ); + Emi = BCi ^((~BCo)& BCu ); + Emo = BCo ^((~BCu)& BCa ); + Emu = BCu ^((~BCa)& BCe ); + + Abi ^= Di; + BCa = ROL(Abi, 62); + Ago ^= Do; + BCe = ROL(Ago, 55); + Aku ^= Du; + BCi = ROL(Aku, 39); + Ama ^= Da; + BCo = ROL(Ama, 41); + Ase ^= De; + BCu = ROL(Ase, 2); + Esa = BCa ^((~BCe)& BCi ); + Ese = BCe ^((~BCi)& BCo ); + Esi = BCi ^((~BCo)& BCu ); + Eso = BCo ^((~BCu)& BCa ); + Esu = BCu ^((~BCa)& BCe ); + + // prepareTheta + BCa = Eba^Ega^Eka^Ema^Esa; + BCe = Ebe^Ege^Eke^Eme^Ese; + BCi = Ebi^Egi^Eki^Emi^Esi; + BCo = Ebo^Ego^Eko^Emo^Eso; + BCu = Ebu^Egu^Eku^Emu^Esu; + + //thetaRhoPiChiIotaPrepareTheta(round+1, E, A) + Da = BCu^ROL(BCe, 1); + De = BCa^ROL(BCi, 1); + Di = BCe^ROL(BCo, 1); + Do = BCi^ROL(BCu, 1); + Du = BCo^ROL(BCa, 1); + + Eba ^= Da; + BCa = Eba; + Ege ^= De; + BCe = ROL(Ege, 44); + Eki ^= Di; + BCi = ROL(Eki, 43); + Emo ^= Do; + BCo = ROL(Emo, 21); + Esu ^= Du; + BCu = ROL(Esu, 14); + Aba = BCa ^((~BCe)& BCi ); + Aba ^= (uint64_t)KeccakF_RoundConstants[round+1]; + Abe = BCe ^((~BCi)& BCo ); + Abi = BCi ^((~BCo)& BCu ); + Abo = BCo ^((~BCu)& BCa ); + Abu = BCu ^((~BCa)& BCe ); + + Ebo ^= Do; + BCa = ROL(Ebo, 28); + Egu ^= Du; + BCe = ROL(Egu, 20); + Eka ^= Da; + BCi = ROL(Eka, 3); + Eme ^= De; + BCo = ROL(Eme, 45); + Esi ^= Di; + BCu = ROL(Esi, 61); + Aga = BCa ^((~BCe)& BCi ); + Age = BCe ^((~BCi)& BCo ); + Agi = BCi ^((~BCo)& BCu ); + Ago = BCo ^((~BCu)& BCa ); + Agu = BCu ^((~BCa)& BCe ); + + Ebe ^= De; + BCa = ROL(Ebe, 1); + Egi ^= Di; + BCe = ROL(Egi, 6); + Eko ^= Do; + BCi = ROL(Eko, 25); + Emu ^= Du; + BCo = ROL(Emu, 8); + Esa ^= Da; + BCu = ROL(Esa, 18); + Aka = BCa ^((~BCe)& BCi ); + Ake = BCe ^((~BCi)& BCo ); + Aki = BCi ^((~BCo)& BCu ); + Ako = BCo ^((~BCu)& BCa ); + Aku = BCu ^((~BCa)& BCe ); + + Ebu ^= Du; + BCa = ROL(Ebu, 27); + Ega ^= Da; + BCe = ROL(Ega, 36); + Eke ^= De; + BCi = ROL(Eke, 10); + Emi ^= Di; + BCo = ROL(Emi, 15); + Eso ^= Do; + BCu = ROL(Eso, 56); + Ama = BCa ^((~BCe)& BCi ); + Ame = BCe ^((~BCi)& BCo ); + Ami = BCi ^((~BCo)& BCu ); + Amo = BCo ^((~BCu)& BCa ); + Amu = BCu ^((~BCa)& BCe ); + + Ebi ^= Di; + BCa = ROL(Ebi, 62); + Ego ^= Do; + BCe = ROL(Ego, 55); + Eku ^= Du; + BCi = ROL(Eku, 39); + Ema ^= Da; + BCo = ROL(Ema, 41); + Ese ^= De; + BCu = ROL(Ese, 2); + Asa = BCa ^((~BCe)& BCi ); + Ase = BCe ^((~BCi)& BCo ); + Asi = BCi ^((~BCo)& BCu ); + Aso = BCo ^((~BCu)& BCa ); + Asu = BCu ^((~BCa)& BCe ); + } + + //copyToState(state, A) + state[ 0] = Aba; + state[ 1] = Abe; + state[ 2] = Abi; + state[ 3] = Abo; + state[ 4] = Abu; + state[ 5] = Aga; + state[ 6] = Age; + state[ 7] = Agi; + state[ 8] = Ago; 
+    state[ 9] = Agu;
+    state[10] = Aka;
+    state[11] = Ake;
+    state[12] = Aki;
+    state[13] = Ako;
+    state[14] = Aku;
+    state[15] = Ama;
+    state[16] = Ame;
+    state[17] = Ami;
+    state[18] = Amo;
+    state[19] = Amu;
+    state[20] = Asa;
+    state[21] = Ase;
+    state[22] = Asi;
+    state[23] = Aso;
+    state[24] = Asu;
+
+    #undef round
+}
+
+#include <string.h>
+#define MIN(a, b) ((a) < (b) ? (a) : (b))
+
+
+static void keccak_absorb(uint64_t *s, unsigned int r, const unsigned char *m, unsigned long long int mlen, unsigned char p)
+{
+    unsigned long long i;
+    unsigned char t[200];
+
+    while (mlen >= r)
+    {
+        for (i = 0; i < r / 8; ++i)
+            s[i] ^= load64(m + 8 * i);
+
+        KeccakF1600_StatePermute(s);
+        mlen -= r;
+        m += r;
+    }
+
+    for (i = 0; i < r; ++i)
+        t[i] = 0;
+    for (i = 0; i < mlen; ++i)
+        t[i] = m[i];
+    t[i] = p;
+    t[r - 1] |= 128;
+    for (i = 0; i < r / 8; ++i)
+        s[i] ^= load64(t + 8 * i);
+}
+
+
+static void keccak_squeezeblocks(unsigned char *h, unsigned long long int nblocks, uint64_t *s, unsigned int r)
+{
+    unsigned int i;
+
+    while(nblocks > 0)
+    {
+        KeccakF1600_StatePermute(s);
+        for (i = 0; i < (r>>3); i++)
+        {
+            store64(h+8*i, s[i]);
+        }
+        h += r;
+        nblocks--;
+    }
+}
+
+
+/********** SHAKE128 ***********/
+
+void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen)
+{
+    keccak_absorb(s, SHAKE128_RATE, input, inputByteLen, 0x1F);
+}
+
+
+void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s)
+{
+    keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
+}
+
+
+void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen)
+{
+    uint64_t s[25] = {0};
+    unsigned char t[SHAKE128_RATE];
+    unsigned long long nblocks = outlen/SHAKE128_RATE;
+    size_t i;
+
+    /* Absorb input */
+    keccak_absorb(s, SHAKE128_RATE, input, inlen, 0x1F);
+
+    /* Squeeze output */
+    keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
+
+    output += nblocks*SHAKE128_RATE;
+    outlen -= nblocks*SHAKE128_RATE;
+
+    if (outlen)
+    {
+        keccak_squeezeblocks(t, 1, s, SHAKE128_RATE);
+        for (i = 0; i < outlen; i++)
+            output[i] = t[i];
+    }
+}
+
+
+/********** cSHAKE128 ***********/
+
+void cshake128_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen)
+{
+    unsigned char *sep = (unsigned char*)s;
+    unsigned int i;
+
+    for (i = 0; i < 25; i++)
+        s[i] = 0;
+
+    /* Absorb customization (domain-separation) string */
+    sep[0] = 0x01;
+    sep[1] = 0xa8;
+    sep[2] = 0x01;
+    sep[3] = 0x00;
+    sep[4] = 0x01;
+    sep[5] = 16; // fixed bitlen of cstm
+    sep[6] = cstm & 0xff;
+    sep[7] = cstm >> 8;
+
+    KeccakF1600_StatePermute(s);
+
+    /* Absorb input */
+    keccak_absorb(s, SHAKE128_RATE, in, inlen, 0x04);
+}
+
+
+void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s)
+{
+    keccak_squeezeblocks(output, nblocks, s, SHAKE128_RATE);
+}
+
+
+void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen)
+{
+    uint64_t s[25];
+    unsigned char t[SHAKE128_RATE];
+    unsigned int i;
+
+    cshake128_simple_absorb(s, cstm, in, inlen);
+
+    /* Squeeze output */
+    keccak_squeezeblocks(output, outlen/SHAKE128_RATE, s, SHAKE128_RATE);
+    output += (outlen/SHAKE128_RATE)*SHAKE128_RATE;
+
+    if (outlen%SHAKE128_RATE)
+    {
+        keccak_squeezeblocks(t, 1, s, SHAKE128_RATE);
+        for (i = 0; i < outlen%SHAKE128_RATE; i++)
+            output[i] = t[i];
+    }
+}
+
+
+/********** SHAKE256 ***********/
+
+void shake256_absorb(uint64_t *s,
const unsigned char *input, unsigned int inputByteLen) +{ + keccak_absorb(s, SHAKE256_RATE, input, inputByteLen, 0x1F); +} + + +void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) +{ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); +} + + +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE256_RATE]; + unsigned long long nblocks = outlen/SHAKE256_RATE; + size_t i; + + for (i = 0; i < 25; ++i) + s[i] = 0; + + /* Absorb input */ + keccak_absorb(s, SHAKE256_RATE, input, inlen, 0x1F); + + /* Squeeze output */ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); + + output += nblocks*SHAKE256_RATE; + outlen -= nblocks*SHAKE256_RATE; + + if (outlen) + { + keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); + for (i = 0; i < outlen; i++) + output[i] = t[i]; + } +} + + +/********** cSHAKE256 ***********/ + +void cshake256_simple_absorb(uint64_t s[25], uint16_t cstm, const unsigned char *in, unsigned long long inlen) +{ + unsigned char *sep = (unsigned char*)s; + unsigned int i; + + for (i = 0; i < 25; i++) + s[i] = 0; + + /* Absorb customization (domain-separation) string */ + sep[0] = 0x01; + sep[1] = 0x88; + sep[2] = 0x01; + sep[3] = 0x00; + sep[4] = 0x01; + sep[5] = 16; // fixed bitlen of cstm + sep[6] = cstm & 0xff; + sep[7] = cstm >> 8; + + KeccakF1600_StatePermute(s); + + /* Absorb input */ + keccak_absorb(s, SHAKE256_RATE, in, inlen, 0x04); +} + + +void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s) +{ + keccak_squeezeblocks(output, nblocks, s, SHAKE256_RATE); +} + + +void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen) +{ + uint64_t s[25]; + unsigned char t[SHAKE256_RATE]; + unsigned int i; + + cshake256_simple_absorb(s, cstm, in, inlen); + + /* Squeeze output */ + keccak_squeezeblocks(output, outlen/SHAKE256_RATE, s, SHAKE256_RATE); + output += (outlen/SHAKE256_RATE)*SHAKE256_RATE; + + if(outlen%SHAKE256_RATE) + { + keccak_squeezeblocks(t, 1, s, SHAKE256_RATE); + for (i = 0; i < outlen%SHAKE256_RATE; i++) + output[i] = t[i]; + } +} \ No newline at end of file diff --git a/third_party/sidh/src/sha3/fips202.h b/third_party/sidh/src/sha3/fips202.h new file mode 100644 index 00000000..55b400ae --- /dev/null +++ b/third_party/sidh/src/sha3/fips202.h @@ -0,0 +1,27 @@ +#ifndef FIPS202_H +#define FIPS202_H + +#include + + +#define SHAKE128_RATE 168 +#define SHAKE256_RATE 136 + +void shake128_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); +void shake128_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void shake128(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned long long inlen); + +void cshake128_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); +void cshake128_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void cshake128_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); + +void shake256_absorb(uint64_t *s, const unsigned char *input, unsigned int inputByteLen); +void shake256_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void shake256(unsigned char *output, unsigned long long outlen, const unsigned char *input, unsigned 
long long inlen); + +void cshake256_simple_absorb(uint64_t *s, uint16_t cstm, const unsigned char *in, unsigned long long inlen); +void cshake256_simple_squeezeblocks(unsigned char *output, unsigned long long nblocks, uint64_t *s); +void cshake256_simple(unsigned char *output, unsigned long long outlen, uint16_t cstm, const unsigned char *in, unsigned long long inlen); + + +#endif diff --git a/third_party/sidh/src/sidh.c b/third_party/sidh/src/sidh.c new file mode 100644 index 00000000..d8aff37d --- /dev/null +++ b/third_party/sidh/src/sidh.c @@ -0,0 +1,333 @@ +/******************************************************************************************** +* SIDH: an efficient supersingular isogeny cryptography library +* +* Abstract: ephemeral supersingular isogeny Diffie-Hellman key exchange (SIDH) +*********************************************************************************************/ + +#include "random/random.h" + + +static void clear_words(void* mem, digit_t nwords) +{ // Clear digits from memory. "nwords" indicates the number of digits to be zeroed. + // This function uses the volatile type qualifier to inform the compiler not to optimize out the memory clearing. + unsigned int i; + volatile digit_t *v = mem; + + for (i = 0; i < nwords; i++) { + v[i] = 0; + } +} + + +static void init_basis(digit_t *gen, f2elm_t XP, f2elm_t XQ, f2elm_t XR) +{ // Initialization of basis points + + fpcopy(gen, XP[0]); + fpcopy(gen + NWORDS_FIELD, XP[1]); + fpcopy(gen + 2*NWORDS_FIELD, XQ[0]); + fpzero(XQ[1]); + fpcopy(gen + 3*NWORDS_FIELD, XR[0]); + fpcopy(gen + 4*NWORDS_FIELD, XR[1]); +} + + +static void fp2_encode(const f2elm_t x, unsigned char *enc) +{ // Conversion of GF(p^2) element from Montgomery to standard representation, and encoding by removing leading 0 bytes + unsigned int i; + f2elm_t t; + + from_fp2mont(x, t); + for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { + enc[i] = ((unsigned char*)t)[i]; + enc[i + FP2_ENCODED_BYTES / 2] = ((unsigned char*)t)[i + MAXBITS_FIELD / 8]; + } +} + + +static void fp2_decode(const unsigned char *enc, f2elm_t x) +{ // Parse byte sequence back into GF(p^2) element, and conversion to Montgomery representation + unsigned int i; + + for (i = 0; i < 2*(MAXBITS_FIELD / 8); i++) ((unsigned char *)x)[i] = 0; + for (i = 0; i < FP2_ENCODED_BYTES / 2; i++) { + ((unsigned char*)x)[i] = enc[i]; + ((unsigned char*)x)[i + MAXBITS_FIELD / 8] = enc[i + FP2_ENCODED_BYTES / 2]; + } + to_fp2mont(x, x); +} + + +void random_mod_order_A(unsigned char* random_digits) +{ // Generation of Alice's secret key + // Outputs random value in [0, 2^eA - 1] + unsigned long long nbytes = NBITS_TO_NBYTES(OALICE_BITS); + + clear_words((void*)random_digits, MAXWORDS_ORDER); + randombytes(random_digits, nbytes); + random_digits[nbytes-1] &= MASK_ALICE; // Masking last byte +} + + +void random_mod_order_B(unsigned char* random_digits) +{ // Generation of Bob's secret key + // Outputs random value in [0, 2^Floor(Log(2, oB)) - 1] + unsigned long long nbytes = NBITS_TO_NBYTES(OBOB_BITS-1); + + clear_words((void*)random_digits, MAXWORDS_ORDER); + randombytes(random_digits, nbytes); + random_digits[nbytes-1] &= MASK_BOB; // Masking last byte +} + + +int EphemeralKeyGeneration_A(const unsigned char* PrivateKeyA, unsigned char* PublicKeyA) +{ // Alice's ephemeral public key generation + // Input: a private key PrivateKeyA in the range [0, 2^eA - 1]. + // Output: the public key PublicKeyA consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. 
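+  // Note: for SIDH/P503, FP2_ENCODED_BYTES is 126, so the encoded public key is
+  // 3*126 = 378 bytes (kSIDH503_PubKeyBytesSz in ssl_key_share.cc).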
+ point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_ALICE]; + f2elm_t XPA, XQA, XRA, coeff[3], A24plus = {0}, C24 = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize basis points + init_basis((digit_t*)A_gen, XPA, XQA, XRA); + init_basis((digit_t*)B_gen, phiP->X, phiQ->X, phiR->X); + fpcopy((digit_t*)&Montgomery_one, (phiP->Z)[0]); + fpcopy((digit_t*)&Montgomery_one, (phiQ->Z)[0]); + fpcopy((digit_t*)&Montgomery_one, (phiR->Z)[0]); + + // Initialize constants + fpcopy((digit_t*)&Montgomery_one, A24plus[0]); + fp2add(A24plus, A24plus, C24); + + // Retrieve kernel point + LADDER3PT(XPA, XQA, XRA, (digit_t*)PrivateKeyA, ALICE, R, A); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Alice; row++) { + while (index < MAX_Alice-row) { + fp2copy(R->X, pts[npts]->X); + fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = strat_Alice[ii++]; + xDBLe(R, R, A24plus, C24, (int)(2*m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (i = 0; i < npts; i++) { + eval_4_isog(pts[i], coeff); + } + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + fp2copy(pts[npts-1]->X, R->X); + fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + eval_4_isog(phiP, coeff); + eval_4_isog(phiQ, coeff); + eval_4_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + fp2mul_mont(phiP->X, phiP->Z, phiP->X); + fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + fp2_encode(phiP->X, PublicKeyA); + fp2_encode(phiQ->X, PublicKeyA + FP2_ENCODED_BYTES); + fp2_encode(phiR->X, PublicKeyA + 2*FP2_ENCODED_BYTES); + + return 0; +} + + +int EphemeralKeyGeneration_B(const unsigned char* PrivateKeyB, unsigned char* PublicKeyB) +{ // Bob's ephemeral public key generation + // Input: a private key PrivateKeyB in the range [0, 2^Floor(Log(2,oB)) - 1]. + // Output: the public key PublicKeyB consisting of 3 elements in GF(p^2) which are encoded by removing leading 0 bytes. 
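+  // Note: for SIDH/P503, Bob's private key is at most 252 bits (cf.
+  // kSIDH503_PrvBKeyBitsSz in ssl_key_share.cc); the public key is encoded
+  // exactly as Alice's, as 3 GF(p^2) elements of FP2_ENCODED_BYTES each.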
+ point_proj_t R, phiP = {0}, phiQ = {0}, phiR = {0}, pts[MAX_INT_POINTS_BOB]; + f2elm_t XPB, XQB, XRB, coeff[3], A24plus = {0}, A24minus = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize basis points + init_basis((digit_t*)B_gen, XPB, XQB, XRB); + init_basis((digit_t*)A_gen, phiP->X, phiQ->X, phiR->X); + fpcopy((digit_t*)&Montgomery_one, (phiP->Z)[0]); + fpcopy((digit_t*)&Montgomery_one, (phiQ->Z)[0]); + fpcopy((digit_t*)&Montgomery_one, (phiR->Z)[0]); + + // Initialize constants + fpcopy((digit_t*)&Montgomery_one, A24plus[0]); + fp2add(A24plus, A24plus, A24plus); + fp2copy(A24plus, A24minus); + fp2neg(A24minus); + + // Retrieve kernel point + LADDER3PT(XPB, XQB, XRB, (digit_t*)PrivateKeyB, BOB, R, A); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Bob; row++) { + while (index < MAX_Bob-row) { + fp2copy(R->X, pts[npts]->X); + fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = strat_Bob[ii++]; + xTPLe(R, R, A24minus, A24plus, (int)m); + index += m; + } + get_3_isog(R, A24minus, A24plus, coeff); + + for (i = 0; i < npts; i++) { + eval_3_isog(pts[i], coeff); + } + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + fp2copy(pts[npts-1]->X, R->X); + fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_3_isog(R, A24minus, A24plus, coeff); + eval_3_isog(phiP, coeff); + eval_3_isog(phiQ, coeff); + eval_3_isog(phiR, coeff); + + inv_3_way(phiP->Z, phiQ->Z, phiR->Z); + fp2mul_mont(phiP->X, phiP->Z, phiP->X); + fp2mul_mont(phiQ->X, phiQ->Z, phiQ->X); + fp2mul_mont(phiR->X, phiR->Z, phiR->X); + + // Format public key + fp2_encode(phiP->X, PublicKeyB); + fp2_encode(phiQ->X, PublicKeyB + FP2_ENCODED_BYTES); + fp2_encode(phiR->X, PublicKeyB + 2*FP2_ENCODED_BYTES); + + return 0; +} + + +int EphemeralSecretAgreement_A(const unsigned char* PrivateKeyA, const unsigned char* PublicKeyB, unsigned char* SharedSecretA) +{ // Alice's ephemeral shared secret computation + // It produces a shared secret key SharedSecretA using her secret key PrivateKeyA and Bob's public key PublicKeyB + // Inputs: Alice's PrivateKeyA is an integer in the range [0, oA-1]. + // Bob's PublicKeyB consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. + // Output: a shared secret SharedSecretA that consists of one element in GF(p^2) encoded by removing leading 0 bytes. + point_proj_t R, pts[MAX_INT_POINTS_ALICE]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = {0}, C24 = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_ALICE], npts = 0, ii = 0; + + // Initialize images of Bob's basis + fp2_decode(PublicKeyB, PKB[0]); + fp2_decode(PublicKeyB + FP2_ENCODED_BYTES, PKB[1]); + fp2_decode(PublicKeyB + 2*FP2_ENCODED_BYTES, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? 
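+    // Build the projective 4-isogeny constants from the recovered A:
+    // C24 = 2, A24plus = A+2, then C24 = 4, so that A24plus/C24 = (A+2)/4.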
+ fpadd((digit_t*)&Montgomery_one, (digit_t*)&Montgomery_one, C24[0]); + fp2add(A, C24, A24plus); + fpadd(C24[0], C24[0], C24[0]); + + // Retrieve kernel point + LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t*)PrivateKeyA, ALICE, R, A); + + // Traverse tree + index = 0; + for (row = 1; row < MAX_Alice; row++) { + while (index < MAX_Alice-row) { + fp2copy(R->X, pts[npts]->X); + fp2copy(R->Z, pts[npts]->Z); + pts_index[npts++] = index; + m = strat_Alice[ii++]; + xDBLe(R, R, A24plus, C24, (int)(2*m)); + index += m; + } + get_4_isog(R, A24plus, C24, coeff); + + for (i = 0; i < npts; i++) { + eval_4_isog(pts[i], coeff); + } + + fp2copy(pts[npts-1]->X, R->X); + fp2copy(pts[npts-1]->Z, R->Z); + index = pts_index[npts-1]; + npts -= 1; + } + + get_4_isog(R, A24plus, C24, coeff); + fp2div2(C24, C24); + fp2sub(A24plus, C24, A24plus); + fp2div2(C24, C24); + j_inv(A24plus, C24, jinv); + fp2_encode(jinv, SharedSecretA); // Format shared secret + + return 0; +} + + +int EphemeralSecretAgreement_B(const unsigned char* PrivateKeyB, const unsigned char* PublicKeyA, unsigned char* SharedSecretB) +{ // Bob's ephemeral shared secret computation + // It produces a shared secret key SharedSecretB using his secret key PrivateKeyB and Alice's public key PublicKeyA + // Inputs: Bob's PrivateKeyB is an integer in the range [0, 2^Floor(Log(2,oB)) - 1]. + // Alice's PublicKeyA consists of 3 elements in GF(p^2) encoded by removing leading 0 bytes. + // Output: a shared secret SharedSecretB that consists of one element in GF(p^2) encoded by removing leading 0 bytes. + point_proj_t R, pts[MAX_INT_POINTS_BOB]; + f2elm_t coeff[3], PKB[3], jinv; + f2elm_t A24plus = {0}, A24minus = {0}, A = {0}; + unsigned int i, row, m, index = 0, pts_index[MAX_INT_POINTS_BOB], npts = 0, ii = 0; + + // Initialize images of Alice's basis + fp2_decode(PublicKeyA, PKB[0]); + fp2_decode(PublicKeyA + FP2_ENCODED_BYTES, PKB[1]); + fp2_decode(PublicKeyA + 2*FP2_ENCODED_BYTES, PKB[2]); + + // Initialize constants + get_A(PKB[0], PKB[1], PKB[2], A); // TODO: Can return projective A? 
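+    // Build the projective 3-isogeny constants from the recovered A:
+    // A24minus = 2, A24plus = A+2, then A24minus = A-2.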
+    fpadd((digit_t*)&Montgomery_one, (digit_t*)&Montgomery_one, A24minus[0]);
+    fp2add(A, A24minus, A24plus);
+    fp2sub(A, A24minus, A24minus);
+
+    // Retrieve kernel point
+    LADDER3PT(PKB[0], PKB[1], PKB[2], (digit_t*)PrivateKeyB, BOB, R, A);
+
+    // Traverse tree
+    index = 0;
+    for (row = 1; row < MAX_Bob; row++) {
+        while (index < MAX_Bob-row) {
+            fp2copy(R->X, pts[npts]->X);
+            fp2copy(R->Z, pts[npts]->Z);
+            pts_index[npts++] = index;
+            m = strat_Bob[ii++];
+            xTPLe(R, R, A24minus, A24plus, (int)m);
+            index += m;
+        }
+        get_3_isog(R, A24minus, A24plus, coeff);
+
+        for (i = 0; i < npts; i++) {
+            eval_3_isog(pts[i], coeff);
+        }
+
+        fp2copy(pts[npts-1]->X, R->X);
+        fp2copy(pts[npts-1]->Z, R->Z);
+        index = pts_index[npts-1];
+        npts -= 1;
+    }
+
+    get_3_isog(R, A24minus, A24plus, coeff);
+    fp2add(A24plus, A24minus, A);
+    fp2add(A, A, A);
+    fp2sub(A24plus, A24minus, A24plus);
+    j_inv(A, A24plus, jinv);
+    fp2_encode(jinv, SharedSecretB);    // Format shared secret
+
+    return 0;
+}
\ No newline at end of file
diff --git a/third_party/sidh/src/sike.c b/third_party/sidh/src/sike.c
new file mode 100644
index 00000000..013b16c3
--- /dev/null
+++ b/third_party/sidh/src/sike.c
@@ -0,0 +1,98 @@
+/********************************************************************************************
+* SIDH: an efficient supersingular isogeny cryptography library
+*
+* Abstract: supersingular isogeny key encapsulation (SIKE) protocol
+*********************************************************************************************/
+
+#include <string.h>
+#include "sha3/fips202.h"
+
+
+int crypto_kem_keypair(unsigned char *pk, unsigned char *sk)
+{ // SIKE's key generation
+  // Outputs: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes)
+  //          public key pk (CRYPTO_PUBLICKEYBYTES bytes)
+
+    // Generate lower portion of secret key sk <- s||SK
+    randombytes(sk, MSG_BYTES);
+    random_mod_order_B(sk + MSG_BYTES);
+
+    // Generate public key pk
+    EphemeralKeyGeneration_B(sk + MSG_BYTES, pk);
+
+    // Append public key pk to secret key sk
+    memcpy(&sk[MSG_BYTES + SECRETKEY_B_BYTES], pk, CRYPTO_PUBLICKEYBYTES);
+
+    return 0;
+}
+
+
+int crypto_kem_enc(unsigned char *ct, unsigned char *ss, const unsigned char *pk)
+{ // SIKE's encapsulation
+  // Input:   public key pk (CRYPTO_PUBLICKEYBYTES bytes)
+  // Outputs: shared secret ss (CRYPTO_BYTES bytes)
+  //          ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes)
+    const uint16_t G = 0;
+    const uint16_t H = 1;
+    const uint16_t P = 2;
+    unsigned char ephemeralsk[SECRETKEY_A_BYTES];
+    unsigned char jinvariant[FP2_ENCODED_BYTES];
+    unsigned char h[MSG_BYTES];
+    unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES];
+    unsigned int i;
+
+    // Generate ephemeralsk <- G(m||pk) mod oA
+    randombytes(temp, MSG_BYTES);
+    memcpy(&temp[MSG_BYTES], pk, CRYPTO_PUBLICKEYBYTES);
+    cshake256_simple(ephemeralsk, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES);
+    ephemeralsk[SECRETKEY_A_BYTES - 1] &= MASK_ALICE;
+
+    // Encrypt
+    EphemeralKeyGeneration_A(ephemeralsk, ct);
+    EphemeralSecretAgreement_A(ephemeralsk, pk, jinvariant);
+    cshake256_simple(h, MSG_BYTES, P, jinvariant, FP2_ENCODED_BYTES);
+    for (i = 0; i < MSG_BYTES; i++) ct[i + CRYPTO_PUBLICKEYBYTES] = temp[i] ^ h[i];
+
+    // Generate shared secret ss <- H(m||ct)
+    memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES);
+    cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES);
+
+    return 0;
+}
+
+
+int crypto_kem_dec(unsigned char *ss, const unsigned char
*ct, const unsigned char *sk) +{ // SIKE's decapsulation + // Input: secret key sk (CRYPTO_SECRETKEYBYTES = MSG_BYTES + SECRETKEY_B_BYTES + CRYPTO_PUBLICKEYBYTES bytes) + // ciphertext message ct (CRYPTO_CIPHERTEXTBYTES = CRYPTO_PUBLICKEYBYTES + MSG_BYTES bytes) + // Outputs: shared secret ss (CRYPTO_BYTES bytes) + const uint16_t G = 0; + const uint16_t H = 1; + const uint16_t P = 2; + unsigned char ephemeralsk_[SECRETKEY_A_BYTES]; + unsigned char jinvariant_[FP2_ENCODED_BYTES]; + unsigned char h_[MSG_BYTES]; + unsigned char c0_[CRYPTO_PUBLICKEYBYTES]; + unsigned char temp[CRYPTO_CIPHERTEXTBYTES+MSG_BYTES]; + unsigned int i; + + // Decrypt + EphemeralSecretAgreement_B(sk + MSG_BYTES, ct, jinvariant_); + cshake256_simple(h_, MSG_BYTES, P, jinvariant_, FP2_ENCODED_BYTES); + for (i = 0; i < MSG_BYTES; i++) temp[i] = ct[i + CRYPTO_PUBLICKEYBYTES] ^ h_[i]; + + // Generate ephemeralsk_ <- G(m||pk) mod oA + memcpy(&temp[MSG_BYTES], &sk[MSG_BYTES + SECRETKEY_B_BYTES], CRYPTO_PUBLICKEYBYTES); + cshake256_simple(ephemeralsk_, SECRETKEY_A_BYTES, G, temp, CRYPTO_PUBLICKEYBYTES+MSG_BYTES); + ephemeralsk_[SECRETKEY_A_BYTES - 1] &= MASK_ALICE; + + // Generate shared secret ss <- H(m||ct) or output ss <- H(s||ct) + EphemeralKeyGeneration_A(ephemeralsk_, c0_); + if (memcmp(c0_, ct, CRYPTO_PUBLICKEYBYTES) != 0) { + memcpy(temp, sk, MSG_BYTES); + } + memcpy(&temp[MSG_BYTES], ct, CRYPTO_CIPHERTEXTBYTES); + cshake256_simple(ss, CRYPTO_BYTES, H, temp, CRYPTO_CIPHERTEXTBYTES+MSG_BYTES); + + return 0; +} \ No newline at end of file
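
For reference, a minimal round-trip sketch of the SIDH primitives added above (not part of the patch; it assumes the P503 API constants SECRETKEY_A_BYTES, SECRETKEY_B_BYTES, CRYPTO_PUBLICKEYBYTES, and FP2_ENCODED_BYTES are in scope via the library's headers):

#include <string.h>

// Returns 1 if both sides derive the same encoded j-invariant, 0 otherwise.
static int sidh_roundtrip(void) {
    unsigned char skA[SECRETKEY_A_BYTES], skB[SECRETKEY_B_BYTES];
    unsigned char pkA[CRYPTO_PUBLICKEYBYTES], pkB[CRYPTO_PUBLICKEYBYTES];
    unsigned char ssA[FP2_ENCODED_BYTES], ssB[FP2_ENCODED_BYTES];

    random_mod_order_A(skA);                     // Alice's ephemeral secret
    random_mod_order_B(skB);                     // Bob's ephemeral secret
    EphemeralKeyGeneration_A(skA, pkA);          // pkA: 3 encoded GF(p^2) elements
    EphemeralKeyGeneration_B(skB, pkB);
    EphemeralSecretAgreement_A(skA, pkB, ssA);   // shared j-invariant, encoded
    EphemeralSecretAgreement_B(skB, pkA, ssB);

    return memcmp(ssA, ssB, sizeof(ssA)) == 0;
}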